Refactored it a bit

This commit is contained in:
2026-04-02 13:05:56 +02:00
parent ba550f8dc5
commit 280450fc0f
8 changed files with 196 additions and 89 deletions

5
.gitignore vendored
View File

@@ -1,6 +1,5 @@
.venv
build
hulud_check.egg-info
*.egg-info
patternfile
*.egg-info

View File

@@ -1,20 +1,10 @@
FROM python:3.13-slim-trixie
WORKDIR /opt/hulud_check
WORKDIR /opt/glchecka
RUN apt-get update && apt-get install -y ripgrep git curl
ADD check_gitlab.py .
ADD pyproject.toml .
ADD entrypoint.sh .
RUN pip3 install .
COPY --chmod=755 <<EOF /opt/hulud_check/entrypoint.sh
#!/bin/bash
echo "Get most recent defintions"
curl -s https://raw.githubusercontent.com/wiz-sec-public/wiz-research-iocs/refs/heads/main/reports/shai-hulud-2-packages.csv > sha1-hulud-2-packages.csv
echo "Format patterns"
tail -n +2 sha1-hulud-2-packages.csv | awk -F ',' '{print \$1}' > patternfile
echo "Running check"
python3 -u check_gitlab.py
EOF
ENTRYPOINT ["/opt/hulud_check/entrypoint.sh"]
ENTRYPOINT ["python3", "-u", "check_gitlab.py"]

42
README.md Normal file
View File

@@ -0,0 +1,42 @@
# Gitlab Repository Checker
This tool acts as a small helper to find specific strings in files (e.g. axios 1.4.1 in package.json).
It queries the GitLab API for groups and projects, clones the repositories locally via HTTPS, and searches them with ripgrep.
At the end, a CSV report containing the findings is generated.
It can also be used for any other kind of automated repository search.
## Configuration
You can configure the tool via environment variables:
```shell
GITLAB_URL # Required - Defines the Gitlab URL (for Self Hosted instances or Managed)
GITLAB_PAT # Required - A GitLab personal access token with permission to read groups and projects
GITLAB_GROUP_ID # Optional - The ID of a specific group - Default - all groups are checked for repositories
GLOB_FILE # Optional - A file containing glob patterns or filenames that restrict which files are searched (applied recursively in all subdirectories) - Default - globfile
GIT_TMP_ROOT # Optional - The path where repositories are cloned to for scanning - Default - /tmp/repo_check
REPORT_PATH # Optional - The path where reports are stored - Default - /tmp/check_reports
REPORT_FILE # Optional - The Filename for the report - Default - report.csv
PATTERN_FILE # Optional - The file containing the regex patterns which ripgrep will use to match strings - Default - patternfile
```
## Execution
```shell
python -m venv .venv
source .venv/bin/activate
pip install .
python -u check_gitlab.py
```
You can also try to use the Dockerfile
## Important
> *This product is 100% AI Free.
> **It is not nice code. Made in a hurry and with a low attention span.
> ***Product is not vegan. Might contain bugs and nuts.

View File

@@ -1,12 +1,5 @@
#!/usr/bin/env python3
# Very hacky quick check for sha1-hulud for gitlab repos
# Need to set the GITLAB_URL and GITLAB_PAT
# Also use the CSV provided by https://github.com/wiz-sec-public/wiz-research-iocs/blob/main/reports/shai-hulud-2-packages.csv and create a patternfile
# Patternfile creation:
# curl https://raw.githubusercontent.com/wiz-sec-public/wiz-research-iocs/refs/heads/main/reports/shai-hulud-2-packages.csv > sha1-hulud-2-packages.csv
# tail -n +2 sha1-hulud-2-packages.csv | awk -F ',' '{print $1}' > patternfile
# pip install GitPython requests
# You need to have ripgrep installed too
# apt-get install ripgrep
@@ -53,59 +46,138 @@ class Report():
for row in self.findings:
writer.writerow(row)
def get_all_projects(next_link=None, group_id=None, prev_result=[]):
base_path = '/api/v4'
url_params = ["include_subgroups=true", "per_page=50", "search_namespaces=true", "owned=false", "order_by=id", "sort=asc"]
if group_id:
base_path += f"/groups/{group_id}"
else:
url_params.append("pagination=keyset")
class GitlabRepositories():
def __init__(self):
if not next_link:
result = session.query(f"{base_path}/projects?{'&'.join(url_params)}")
else:
result = session.get(next_link)
if not os.environ.get('GITLAB_URL'):
print("Environment variable GITLAB_URL not specified")
exit(1)
if not os.environ.get('GITLAB_PAT'):
print("Environment variable GITLAB_PAT not specified")
exit(1)
self.projects = []
self.groups = []
self.base_path = '/api/v4'
self.session = GitlabConnector()
if os.environ.get('GITLAB_GROUP_ID'):
self.groups.append(os.environ.get('GITLAB_GROUP_ID'))
def parse_pagination(self, result_headers):
ret_val = False
if not result_headers.get('Link'):
return ret_val
links = result_headers['Link'].split(', ')
if result.headers.get('Link'):
links = result.headers['Link'].split(', ')
for link in links:
parts = link.split('; ')
rel = parts[1].split('=')[1]
if rel == '"next"':
link = parts[0].replace('<', '').replace('>', '')
ret_val = parts[0].replace('<', '').replace('>', '')
break
prev_result += [{'id': i['id'], 'http_url_to_repo': i['http_url_to_repo'], 'ssh_url_to_repo': i['ssh_url_to_repo'], 'web_url': i['web_url']} for i in result.json()]
return ret_val
# I know, not nice.. but im in a hurry
try:
if rel == "\"next\"":
get_all_projects(next_link=link, group_id=group_id, prev_result=prev_result)
except:
pass
return prev_result
def clone_repo_with_http(repo_url=None):
repo_host_path = repo_url.split('://')[1]
repo_http_scheme = repo_url.split('://')[0]
repo_credentials = f"token:{session.pat}"
repo_remote = f"{repo_http_scheme}://{repo_credentials}@{repo_host_path}"
repo_name = repo_host_path.replace('/', '_').rstrip('.git')
repo_path = f"{git_tmp_root}/{repo_name}"
def get_groups(self, next_link=None):
    """Collect the ids of all groups returned by the API into ``self.groups``.

    The first page is requested through ``self.session.query`` on
    ``<base_path>/groups``; every following page is fetched with
    ``self.session.get`` using the link that ``self.parse_pagination``
    extracts from the previous response headers.

    Pages are followed in a loop instead of one recursive call per page, so
    the number of pages is not limited by the interpreter's recursion depth.

    Args:
        next_link: Optional URL of a page to start from. When falsy, the
            listing starts at the first page.

    Returns:
        None. Group ids are appended to ``self.groups``.
    """
    while True:
        if next_link:
            result = self.session.get(next_link)
        else:
            result = self.session.query(f"{self.base_path}/groups")

        # A falsy response is treated as "nothing to read" (see message below).
        if not result:
            print("No groups found or permissions not sufficient.")
            return

        self.groups += [i['id'] for i in result.json()]

        # parse_pagination returns a falsy value once there is no next page.
        next_link = self.parse_pagination(result.headers)
        if not next_link:
            return
def get_projects_by_group(self, next_link=None, group_id=None):
    """Append the projects of one group (including subgroups) to ``self.projects``.

    The first page is requested through ``self.session.query``; following
    pages are fetched with ``self.session.get`` using the link that
    ``self.parse_pagination`` extracts from the response headers. Pages are
    followed in a loop rather than by recursion, so the page count is not
    bounded by the recursion limit.

    Because the request asks for ``include_subgroups=true``, the same project
    can be returned for a parent group and again for its subgroup. Projects
    whose ``id`` is already present in ``self.projects`` are skipped so each
    repository is only recorded once.

    Args:
        next_link: Optional URL of a page to start from. When falsy, the
            listing starts at the first page.
        group_id: Id of the group whose projects are listed.

    Returns:
        None. Each new project is stored as a dict with the keys ``id``,
        ``http_url_to_repo``, ``ssh_url_to_repo`` and ``web_url``.
    """
    url_params = ["include_subgroups=true", "per_page=20", "search_namespaces=true", "owned=false", "order_by=id", "sort=asc"]
    group_path = f"{self.base_path}/groups/{group_id}"
    # Ids already collected (possibly by earlier calls for other groups).
    seen_ids = {project['id'] for project in self.projects}

    while True:
        if next_link:
            result = self.session.get(next_link)
        else:
            result = self.session.query(f"{group_path}/projects?{'&'.join(url_params)}")

        # A falsy response is treated as "nothing to read" (see message below).
        if not result:
            print(f"No projects in group {group_id} found or permissions not sufficient.")
            return

        for i in result.json():
            if i['id'] in seen_ids:
                continue
            seen_ids.add(i['id'])
            self.projects.append({'id': i['id'], 'http_url_to_repo': i['http_url_to_repo'], 'ssh_url_to_repo': i['ssh_url_to_repo'], 'web_url': i['web_url']})

        # parse_pagination returns a falsy value once there is no next page.
        next_link = self.parse_pagination(result.headers)
        if not next_link:
            return
# Fill self.projects by calling get_projects_by_group for every id in self.groups.
# Takes no arguments besides self and returns nothing; progress is printed to stdout.
def get_projects(self):
print("Getting GitLab Projects")
# self.groups is only pre-filled in __init__ when GITLAB_GROUP_ID is set;
# when it is empty, discover the groups through get_groups() first.
if not self.groups:
self.get_groups()
for group_id in self.groups:
print(f"Getting Projects for Group {group_id}")
self.get_projects_by_group(group_id=group_id)
# Prints the number of projects collected so far.
# NOTE(review): indentation is lost in this view - confirm whether this runs per group or once after the loop.
print(len(self.projects))
def clone_repo(self, repo_url=None):
repo_host_path = repo_url.split('://')[1]
repo_http_scheme = repo_url.split('://')[0]
repo_credentials = f"token:{self.session.pat}"
repo_remote = f"{repo_http_scheme}://{repo_credentials}@{repo_host_path}"
repo_name = repo_host_path.replace('/', '_').rstrip('.git')
repo_path = f"{git_tmp_root}/{repo_name}"
if os.path.isdir(repo_path) and os.listdir(repo_path):
return repo_path
if os.path.isdir(repo_path) and os.listdir(repo_path):
return repo_path
print(f"Processing Repository {repo_host_path}")
try:
repo = Repo.clone_from(repo_remote, repo_path)
repo.close()
except:
print(f"Cant clone {repo_url}")
print(f"Processing Repository {repo_host_path}")
try:
repo = Repo.clone_from(repo_remote, repo_path)
repo.close()
except:
print(f"Cant clone {repo_url}")
return repo_path
return repo_path
return repo_path
def parse_fileglob():
    """Build the ripgrep ``--glob`` arguments from the configured glob file.

    The file path is taken from the ``GLOB_FILE`` environment variable and
    falls back to ``globfile``. Every non-blank line of that file becomes one
    ``--glob <pattern>`` pair. Blank lines are skipped; previously they were
    passed on as an empty ``--glob`` value.

    Returns:
        list[str]: A flat argument list such as
        ``["--glob", "package.json", "--glob", "yarn.lock"]``, or an empty
        list when the glob file does not exist.
    """
    glob_file = Path(os.environ.get('GLOB_FILE', 'globfile'))
    if not glob_file.is_file():
        print(f"Specified glob file {glob_file} not found. Searching all files.")
        return []

    ret_val = []
    with open(glob_file, 'r', encoding='utf-8') as file:
        for line in file:
            pattern = line.rstrip()
            # Whitespace-only lines strip down to "" and carry no pattern.
            if not pattern:
                continue
            ret_val += ["--glob", pattern]
    print(ret_val)
    return ret_val
def scan_repo(path=None, repo=None):
scan_result = None
scan_result_lines = []
scan_matches = []
@@ -114,10 +186,10 @@ def scan_repo(path=None, repo=None):
"--json",
"-i",
"-f",
"patternfile",
path
pattern_file
]
ripgrep_cmd = ripgrep_cmd + fileglob
ripgrep_cmd.append(path)
try:
scan_result = subprocess.run(ripgrep_cmd, capture_output=True, text=True)
except:
@@ -125,7 +197,6 @@ def scan_repo(path=None, repo=None):
return []
scan_out_lines = list(filter(None, scan_result.stdout.split('\n')))
for line in scan_out_lines:
line_data = json.loads(line)
if line_data.get("type") == "match":
@@ -139,13 +210,15 @@ def scan_repo(path=None, repo=None):
return scan_matches
def evaluate_findings(findings=[]):
if not findings:
return []
finding_results = []
for finding in findings:
filename = finding['full_path'].split('/')[-1]
if filename.startswith("package"):
print(f"Found potential match - {finding['path']} - {finding['matches'][0]['match']['text']}")
detail = check_line_in_file(file=finding['full_path'], line_number=finding['line_number'])
finding_results += [[finding['repo'], finding['path'], finding['line_number'], detail.lstrip(),finding['matches'][0]['match']['text']]]
print(f"Found potential match - {finding['path']} - {finding['matches'][0]['match']['text']}")
detail = check_line_in_file(file=finding['full_path'], line_number=finding['line_number'])
finding_results += [[finding['repo'], finding['path'], finding['line_number'], detail.lstrip(),finding['matches'][0]['match']['text']]]
return finding_results
def check_line_in_file(file=None, line_number=None):
@@ -155,26 +228,31 @@ def check_line_in_file(file=None, line_number=None):
print(line)
return line.rstrip().replace(',', '')
def check_repos():
repos = get_all_projects(group_id=os.environ.get('GITLAB_GROUP'))
print(f"Found {len(repos)} Repositories..")
for repo in repos:
scan_path = clone_repo_with_http(repo['http_url_to_repo'])
gl = GitlabRepositories()
gl.get_projects()
print(f"Found {len(gl.projects)} Repositories..")
for repo in gl.projects:
scan_path = gl.clone_repo(repo['http_url_to_repo'])
findings = scan_repo(scan_path, repo['web_url'])
if findings:
print("Evaluating matches")
finding_results = evaluate_findings(findings=findings)
if finding_results:
report.findings += finding_results
print("Evaluating matches")
finding_results = evaluate_findings(findings=findings)
if finding_results:
report.findings += finding_results
subprocess.run(["rm", "-rf", scan_path])
git_tmp_root = os.environ.get('GIT_TMP_ROOT', '/tmp/hulud_check')
report_path = os.environ.get('REPORT_PATH', '/tmp/hulud_check_reports')
git_tmp_root = os.environ.get('GIT_TMP_ROOT', '/tmp/repo_check')
report_path = os.environ.get('REPORT_PATH', '/tmp/check_reports')
report_file = os.environ.get('REPORT_FILE', 'report.csv')
pattern_file = os.environ.get('PATTERN_FILE', 'patternfile')
fileglob = parse_fileglob()
Path(git_tmp_root).mkdir(parents=True, exist_ok=True)
Path(report_path).mkdir(parents=True, exist_ok=True)
session = GitlabConnector()
report = Report()
check_repos()
report.results()

View File

@@ -1,7 +0,0 @@
#!/bin/bash
# Refreshes the package list, derives the ripgrep pattern file from it and then starts the scan.
# NOTE(review): presumably this is the entrypoint.sh referenced by the Dockerfile - confirm.
echo "Get most recent defintions"
# Download the published CSV (curl -s: no progress output) into the working directory.
curl -s https://raw.githubusercontent.com/wiz-sec-public/wiz-research-iocs/refs/heads/main/reports/shai-hulud-2-packages.csv > sha1-hulud-2-packages.csv
echo "Format patterns"
# Drop the first line (presumably the CSV header) and keep only the first comma-separated column.
tail -n +2 sha1-hulud-2-packages.csv | awk -F ',' '{print $1}' > patternfile
echo "Running check"
# -u keeps Python's stdout/stderr unbuffered so output appears immediately.
python3 -u check_gitlab.py

3
globfile Normal file
View File

@@ -0,0 +1,3 @@
package-lock.json
package.json
yarn.lock

2
patternfile Normal file
View File

@@ -0,0 +1,2 @@
axios.*1\.4\.1
axios.*0\.3\.0

View File

@@ -1,6 +1,6 @@
[project]
name = "hulud_check"
description = "Quick hacky check for sha1-hulud"
name = "gitlab_scanner"
description = "Quick hacky Gitlab Repository scanner for searching patterns in files"
version = "2025.0.0"
requires-python = ">=3.13"
dependencies = [