From 280450fc0fe9192c4d7615ea910c641f6f5722cb Mon Sep 17 00:00:00 2001 From: Joe Jabs Date: Thu, 2 Apr 2026 13:05:56 +0200 Subject: [PATCH] Refactored it a bit --- .gitignore | 5 +- Containerfile | 14 +--- README.md | 42 ++++++++++ check_gitlab.py | 208 +++++++++++++++++++++++++++++++++--------------- entrypoint.sh | 7 -- globfile | 3 + patternfile | 2 + pyproject.toml | 4 +- 8 files changed, 196 insertions(+), 89 deletions(-) create mode 100644 README.md delete mode 100755 entrypoint.sh create mode 100644 globfile create mode 100644 patternfile diff --git a/.gitignore b/.gitignore index 3e39fa2..ec92604 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ .venv build -hulud_check.egg-info -*.egg-info -patternfile + +*.egg-info diff --git a/Containerfile b/Containerfile index 722ff43..ea3bfa0 100644 --- a/Containerfile +++ b/Containerfile @@ -1,20 +1,10 @@ FROM python:3.13-slim-trixie -WORKDIR /opt/hulud_check +WORKDIR /opt/glchecka RUN apt-get update && apt-get install -y ripgrep git curl ADD check_gitlab.py . ADD pyproject.toml . ADD entrypoint.sh . RUN pip3 install . -COPY --chmod=755 < sha1-hulud-2-packages.csv -echo "Format patterns" -tail -n +2 sha1-hulud-2-packages.csv | awk -F ',' '{print \$1}' > patternfile -echo "Running check" -python3 -u check_gitlab.py -EOF - -ENTRYPOINT ["/opt/hulud_check/entrypoint.sh"] +ENTRYPOINT ["python3", "-u", "check_gitlab.py"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..420e396 --- /dev/null +++ b/README.md @@ -0,0 +1,42 @@ +# Gitlab Repository Checker + +This tool acts as a small helper to find specific strings in files (e.g. axios 1.4.1 in package.json). +It will query the Gitlab API for groups and projects and pull repos locally via HTTPS and search them via ripgrep. + +In the end a CSV report will be generated with the findings. + +It can also be used for all kinds of purposes for automated repository searches. 
+ +## Configuration + +You can configure the tool via environment variables: + +```shell + +GITLAB_URL # Required - Defines the Gitlab URL (for Self Hosted instances or Managed) +GITLAB_PAT # Required - A Gitlab personal access token with permissions to read groups and projects +GITLAB_GROUP_ID # Optional - An ID of a specific group; by default all groups will be checked for repositories +GLOB_FILE # Optional - A file containing glob patterns or filenames to search patterns for (It's recursive in all subdirs) +GIT_TMP_ROOT # Optional - The path where repositories are cloned to for scanning - Default - /tmp/repo_check +REPORT_PATH # Optional - The path where reports are stored - Default - /tmp/check_reports +REPORT_FILE # Optional - The filename for the report - Default - report.csv +PATTERN_FILE # Optional - The file containing the regex patterns which ripgrep will use to match strings - Default - patternfile + +``` + +## Execution + +```shell +python -m venv .venv +source .venv/bin/activate +pip install . +python -u check_gitlab.py +``` + +You can also try to use the Containerfile + +## Important + +> *This product is 100% AI Free. +> **It is not nice code. Made in a hurry and with low attention span. +> ***Product is not vegan. Might contain bugs and nuts. 
diff --git a/check_gitlab.py b/check_gitlab.py index 82425a1..2b30df5 100755 --- a/check_gitlab.py +++ b/check_gitlab.py @@ -1,12 +1,5 @@ #!/usr/bin/env python3 -# Very hacky quick check for sha1-hulud for gitlab repos -# Need to set the GITLAB_URL and GITLAB_PAT -# Also use the CSV provided by https://github.com/wiz-sec-public/wiz-research-iocs/blob/main/reports/shai-hulud-2-packages.csv and create a patternfile -# Patternfile creation: -# curl https://raw.githubusercontent.com/wiz-sec-public/wiz-research-iocs/refs/heads/main/reports/shai-hulud-2-packages.csv > sha1-hulud-2-packages.csv -# tail -n +2 sha1-hulud-2-packages.csv | awk -F ',' '{print $1}' > patternfile - # pip install GitPython requests # You need to have ripgrep installed too # apt-get install ripgrep @@ -53,59 +46,138 @@ class Report(): for row in self.findings: writer.writerow(row) -def get_all_projects(next_link=None, group_id=None, prev_result=[]): - base_path = '/api/v4' - url_params = ["include_subgroups=true", "per_page=50", "search_namespaces=true", "owned=false", "order_by=id", "sort=asc"] - if group_id: - base_path += f"/groups/{group_id}" - else: - url_params.append("pagination=keyset") +class GitlabRepositories(): + + def __init__(self): - if not next_link: - result = session.query(f"{base_path}/projects?{'&'.join(url_params)}") - else: - result = session.get(next_link) + if not os.environ.get('GITLAB_URL'): + print("Environment variable GITLAB_URL not specified") + exit(1) + + if not os.environ.get('GITLAB_PAT'): + print("Environment variable GITLAB_PAT not specified") + exit(1) + + self.projects = [] + self.groups = [] + self.base_path = '/api/v4' + self.session = GitlabConnector() + + if os.environ.get('GITLAB_GROUP_ID'): + self.groups.append(os.environ.get('GITLAB_GROUP_ID')) + + def parse_pagination(self, result_headers): + + ret_val = False + if not result_headers.get('Link'): + return ret_val + + links = result_headers['Link'].split(', ') - if result.headers.get('Link'): - links = 
result.headers['Link'].split(', ') for link in links: parts = link.split('; ') rel = parts[1].split('=')[1] if rel == '"next"': - link = parts[0].replace('<', '').replace('>', '') + ret_val = parts[0].replace('<', '').replace('>', '') break - prev_result += [{'id': i['id'], 'http_url_to_repo': i['http_url_to_repo'], 'ssh_url_to_repo': i['ssh_url_to_repo'], 'web_url': i['web_url']} for i in result.json()] + return ret_val - # I know, not nice.. but im in a hurry - try: - if rel == "\"next\"": - get_all_projects(next_link=link, group_id=group_id, prev_result=prev_result) - except: - pass - return prev_result -def clone_repo_with_http(repo_url=None): - repo_host_path = repo_url.split('://')[1] - repo_http_scheme = repo_url.split('://')[0] - repo_credentials = f"token:{session.pat}" - repo_remote = f"{repo_http_scheme}://{repo_credentials}@{repo_host_path}" - repo_name = repo_host_path.replace('/', '_').rstrip('.git') - repo_path = f"{git_tmp_root}/{repo_name}" + def get_groups(self, next_link=None): + + result = None + if not next_link: + result = self.session.query(f"{self.base_path}/groups") + else: + result = self.session.get(next_link) + + if not result: + print("No groups found or permissions not sufficient.") + return + + self.groups += [i['id'] for i in result.json()] + next_link = self.parse_pagination(result.headers) + + if next_link: + self.get_groups(next_link=next_link) + + def get_projects_by_group(self, next_link=None, group_id=None): + + result = None + url_params = ["include_subgroups=true", "per_page=20", "search_namespaces=true", "owned=false", "order_by=id", "sort=asc"] + group_path = f"{self.base_path}/groups/{group_id}" + + if not next_link: + result = self.session.query(f"{group_path}/projects?{'&'.join(url_params)}") + else: + result = self.session.get(next_link) + + if not result: + print(f"No projects in group {group_id} found or permissions not sufficient.") + return + + self.projects += [{'id': i['id'], 'http_url_to_repo': 
i['http_url_to_repo'], 'ssh_url_to_repo': i['ssh_url_to_repo'], 'web_url': i['web_url']} for i in result.json()] + next_link = self.parse_pagination(result.headers) + + if next_link: + self.get_projects_by_group(next_link=next_link, group_id=group_id) + + #print(result.json()) + + def get_projects(self): + print("Getting GitLab Projects") + # When groups not empty or specified, parse groups + if not self.groups: + self.get_groups() + + for group_id in self.groups: + print(f"Getting Projects for Group {group_id}") + self.get_projects_by_group(group_id=group_id) + + print(len(self.projects)) + + + def clone_repo(self, repo_url=None): + repo_host_path = repo_url.split('://')[1] + repo_http_scheme = repo_url.split('://')[0] + repo_credentials = f"token:{self.session.pat}" + repo_remote = f"{repo_http_scheme}://{repo_credentials}@{repo_host_path}" + repo_name = repo_host_path.replace('/', '_').rstrip('.git') + repo_path = f"{git_tmp_root}/{repo_name}" - if os.path.isdir(repo_path) and os.listdir(repo_path): - return repo_path + if os.path.isdir(repo_path) and os.listdir(repo_path): + return repo_path - print(f"Processing Repository {repo_host_path}") - try: - repo = Repo.clone_from(repo_remote, repo_path) - repo.close() - except: - print(f"Cant clone {repo_url}") + print(f"Processing Repository {repo_host_path}") + try: + repo = Repo.clone_from(repo_remote, repo_path) + repo.close() + except: + print(f"Cant clone {repo_url}") + return repo_path return repo_path - return repo_path + +def parse_fileglob(): + + glob_file = Path(os.environ.get('GLOB_FILE', 'globfile')) + ret_val = [] + if not glob_file.is_file(): + print(f"Specified glob file {glob_file} not found. 
Searching all files.") + return [] + + with open(glob_file, 'r') as file: + lines = file.readlines() + + for line in lines: + ret_val.append("--glob") + ret_val.append(line.rstrip()) + + print(ret_val) + return ret_val def scan_repo(path=None, repo=None): + scan_result = None scan_result_lines = [] scan_matches = [] @@ -114,10 +186,10 @@ def scan_repo(path=None, repo=None): "--json", "-i", "-f", - "patternfile", - path + pattern_file ] - + ripgrep_cmd = ripgrep_cmd + fileglob + ripgrep_cmd.append(path) try: scan_result = subprocess.run(ripgrep_cmd, capture_output=True, text=True) except: @@ -125,7 +197,6 @@ def scan_repo(path=None, repo=None): return [] scan_out_lines = list(filter(None, scan_result.stdout.split('\n'))) - for line in scan_out_lines: line_data = json.loads(line) if line_data.get("type") == "match": @@ -139,13 +210,15 @@ def scan_repo(path=None, repo=None): return scan_matches def evaluate_findings(findings=[]): + if not findings: + return [] + finding_results = [] for finding in findings: filename = finding['full_path'].split('/')[-1] - if filename.startswith("package"): - print(f"Found potential match - {finding['path']} - {finding['matches'][0]['match']['text']}") - detail = check_line_in_file(file=finding['full_path'], line_number=finding['line_number']) - finding_results += [[finding['repo'], finding['path'], finding['line_number'], detail.lstrip(),finding['matches'][0]['match']['text']]] + print(f"Found potential match - {finding['path']} - {finding['matches'][0]['match']['text']}") + detail = check_line_in_file(file=finding['full_path'], line_number=finding['line_number']) + finding_results += [[finding['repo'], finding['path'], finding['line_number'], detail.lstrip(),finding['matches'][0]['match']['text']]] return finding_results def check_line_in_file(file=None, line_number=None): @@ -155,26 +228,31 @@ def check_line_in_file(file=None, line_number=None): print(line) return line.rstrip().replace(',', '') - def check_repos(): - repos = 
get_all_projects(group_id=os.environ.get('GITLAB_GROUP')) - print(f"Found {len(repos)} Repositories..") - for repo in repos: - scan_path = clone_repo_with_http(repo['http_url_to_repo']) + gl = GitlabRepositories() + gl.get_projects() + print(f"Found {len(gl.projects)} Repositories..") + + for repo in gl.projects: + scan_path = gl.clone_repo(repo['http_url_to_repo']) findings = scan_repo(scan_path, repo['web_url']) if findings: - print("Evaluating matches") - finding_results = evaluate_findings(findings=findings) - if finding_results: - report.findings += finding_results + print("Evaluating matches") + finding_results = evaluate_findings(findings=findings) + if finding_results: + report.findings += finding_results subprocess.run(["rm", "-rf", scan_path]) -git_tmp_root = os.environ.get('GIT_TMP_ROOT', '/tmp/hulud_check') -report_path = os.environ.get('REPORT_PATH', '/tmp/hulud_check_reports') + +git_tmp_root = os.environ.get('GIT_TMP_ROOT', '/tmp/repo_check') +report_path = os.environ.get('REPORT_PATH', '/tmp/check_reports') report_file = os.environ.get('REPORT_FILE', 'report.csv') +pattern_file = os.environ.get('PATTERN_FILE', 'patternfile') + +fileglob = parse_fileglob() + Path(git_tmp_root).mkdir(parents=True, exist_ok=True) Path(report_path).mkdir(parents=True, exist_ok=True) -session = GitlabConnector() report = Report() check_repos() report.results() diff --git a/entrypoint.sh b/entrypoint.sh deleted file mode 100755 index 2b55829..0000000 --- a/entrypoint.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -echo "Get most recent defintions" -curl -s https://raw.githubusercontent.com/wiz-sec-public/wiz-research-iocs/refs/heads/main/reports/shai-hulud-2-packages.csv > sha1-hulud-2-packages.csv -echo "Format patterns" -tail -n +2 sha1-hulud-2-packages.csv | awk -F ',' '{print $1}' > patternfile -echo "Running check" -python3 -u check_gitlab.py diff --git a/globfile b/globfile new file mode 100644 index 0000000..e88e949 --- /dev/null +++ b/globfile @@ -0,0 +1,3 @@ 
+package-lock.json +package.json +yarn.lock diff --git a/patternfile b/patternfile new file mode 100644 index 0000000..7c0b5cb --- /dev/null +++ b/patternfile @@ -0,0 +1,2 @@ +axios.*1\.4\.1 +axios.*0\.3\.0 diff --git a/pyproject.toml b/pyproject.toml index e464f41..ba3124b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] -name = "hulud_check" -description = "Quick hacky check for sha1-hulud" +name = "gitlab_scanner" +description = "Quick hacky Gitlab Repository scanner for searching patterns in files" version = "2025.0.0" requires-python = ">=3.13" dependencies = [