Refactored it a bit

2026-04-02 13:05:56 +02:00
parent ba550f8dc5
commit 280450fc0f
8 changed files with 196 additions and 89 deletions
@@ -1,6 +1,5 @@
 .venv
 build
-hulud_check.egg-info
-*.egg-info
-patternfile
+
+*.egg-info

@@ -1,20 +1,10 @@
 FROM python:3.13-slim-trixie

-WORKDIR /opt/hulud_check
+WORKDIR /opt/glchecka
 RUN apt-get update && apt-get install -y ripgrep git curl
 ADD check_gitlab.py .
 ADD pyproject.toml .
 ADD entrypoint.sh .
 RUN pip3 install .

-COPY --chmod=755 <<EOF /opt/hulud_check/entrypoint.sh
-#!/bin/bash
-echo "Get most recent defintions"
-curl -s https://raw.githubusercontent.com/wiz-sec-public/wiz-research-iocs/refs/heads/main/reports/shai-hulud-2-packages.csv > sha1-hulud-2-packages.csv
-echo "Format patterns"
-tail -n +2 sha1-hulud-2-packages.csv | awk -F ',' '{print \$1}' > patternfile
-echo "Running check"
-python3 -u check_gitlab.py
-EOF
-
-ENTRYPOINT ["/opt/hulud_check/entrypoint.sh"]
+ENTRYPOINT ["python3", "-u", "check_gitlab.py"]
@@ -0,0 +1,42 @@
+# Gitlab Repository Checker
+
+This tool acts as a small helper to find specific strings in files (e.g. axios 1.4.1 in package.json).
+It queries the Gitlab API for groups and projects, clones the repositories locally via HTTPS, and searches them with ripgrep.
+
+In the end a CSV report will be generated with the findings.
+
+It can also be used for any other kind of automated repository search.
+
+## Configuration
+
+You can configure the tool via environment variables:
+
+```shell
+
+GITLAB_URL          # Required - Defines the Gitlab URL (for Self Hosted instances or Managed)
+GITLAB_PAT          # Required - A Gitlab personal access token with permissions to read groups and projects
+GITLAB_GROUP_ID     # Optional - The ID of a specific group - Default - all groups are checked for repositories
+GLOB_FILE           # Optional - A file containing glob patterns or filenames that limit which files are searched (applied recursively in all subdirectories) - Default - globfile
+GIT_TMP_ROOT        # Optional - The path where repositories are cloned to for scanning - Default - /tmp/repo_check
+REPORT_PATH         # Optional - The path where reports are stored - Default - /tmp/check_reports
+REPORT_FILE         # Optional - The Filename for the report - Default - report.csv
+PATTERN_FILE        # Optional - The file containing the regex patterns which ripgrep will use to match strings - Default - patternfile 
+
+```
+
+## Execution
+
+```shell
+python -m venv .venv
+source .venv/bin/activate
+pip install .
+python -u check_gitlab.py
+```
+
+You can also try to use the Dockerfile
+
+## Important
+
+> *This product is 100% AI Free.
+> **It is not nice code.  Made in a hurry and with a low attention span.  
+> ***Product is not vegan. Might contain bugs and nuts.
@@ -1,12 +1,5 @@
 #!/usr/bin/env python3

-# Very hacky quick check for sha1-hulud for gitlab repos
-# Need to set the GITLAB_URL and GITLAB_PAT
-# Also use the CSV provided by https://github.com/wiz-sec-public/wiz-research-iocs/blob/main/reports/shai-hulud-2-packages.csv and create a patternfile
-# Patternfile creation:
-# curl https://raw.githubusercontent.com/wiz-sec-public/wiz-research-iocs/refs/heads/main/reports/shai-hulud-2-packages.csv > sha1-hulud-2-packages.csv
-# tail -n +2 sha1-hulud-2-packages.csv | awk -F ',' '{print $1}' > patternfile
-
 # pip install GitPython requests
 # You need to have ripgrep installed too
 # apt-get install ripgrep
@@ -53,59 +46,138 @@ class Report():
            for row in self.findings:
                writer.writerow(row)

-def get_all_projects(next_link=None, group_id=None, prev_result=[]):
-    base_path = '/api/v4'
-    url_params = ["include_subgroups=true", "per_page=50", "search_namespaces=true", "owned=false", "order_by=id", "sort=asc"]
-    if group_id:
-        base_path += f"/groups/{group_id}"
-    else:
-        url_params.append("pagination=keyset")
+class GitlabRepositories():
+    
+    def __init__(self):

-    if not next_link:
-        result = session.query(f"{base_path}/projects?{'&'.join(url_params)}")
-    else:
-        result = session.get(next_link)
+        if not os.environ.get('GITLAB_URL'):
+            print("Environment variable GITLAB_URL not specified")
+            exit(1)
+
+        if not os.environ.get('GITLAB_PAT'):
+            print("Environment variable GITLAB_PAT not specified")
+            exit(1)
+
+        self.projects = []
+        self.groups = []
+        self.base_path = '/api/v4'
+        self.session = GitlabConnector()
+        
+        if os.environ.get('GITLAB_GROUP_ID'):
+            self.groups.append(os.environ.get('GITLAB_GROUP_ID'))
+
+    def parse_pagination(self, result_headers):
+    
+        ret_val = False
+        if not result_headers.get('Link'):
+            return ret_val   
+        
+        links = result_headers['Link'].split(', ')

-    if result.headers.get('Link'):
-        links = result.headers['Link'].split(', ')
        for link in links:
            parts = link.split('; ')
            rel = parts[1].split('=')[1]
            if rel == '"next"':
-                link = parts[0].replace('<', '').replace('>', '')
+                ret_val = parts[0].replace('<', '').replace('>', '')
                break

-    prev_result += [{'id': i['id'], 'http_url_to_repo': i['http_url_to_repo'], 'ssh_url_to_repo': i['ssh_url_to_repo'], 'web_url': i['web_url']} for i in result.json()]
+        return ret_val

-    # I know, not nice.. but im in a hurry
-    try:
-        if rel == "\"next\"":
-            get_all_projects(next_link=link, group_id=group_id, prev_result=prev_result)
-    except:
-        pass
-    return prev_result

-def clone_repo_with_http(repo_url=None):
-    repo_host_path = repo_url.split('://')[1]
-    repo_http_scheme = repo_url.split('://')[0]
-    repo_credentials = f"token:{session.pat}"
-    repo_remote = f"{repo_http_scheme}://{repo_credentials}@{repo_host_path}"
-    repo_name = repo_host_path.replace('/', '_').rstrip('.git')
-    repo_path = f"{git_tmp_root}/{repo_name}"
+    def get_groups(self, next_link=None):
+        
+        result = None
+        if not next_link:
+            result = self.session.query(f"{self.base_path}/groups")
+        else:
+            result = self.session.get(next_link)
+
+        if not result:
+            print("No groups found or permissions not sufficient.")
+            return
+
+        self.groups += [i['id'] for i in result.json()]
+        next_link = self.parse_pagination(result.headers)
+
+        if next_link:
+            self.get_groups(next_link=next_link)
+
+    def get_projects_by_group(self, next_link=None, group_id=None):
+        
+        result = None
+        url_params = ["include_subgroups=true", "per_page=20", "search_namespaces=true", "owned=false", "order_by=id", "sort=asc"]
+        group_path = f"{self.base_path}/groups/{group_id}"
+        
+        if not next_link:
+            result = self.session.query(f"{group_path}/projects?{'&'.join(url_params)}")
+        else:
+            result = self.session.get(next_link)
+        
+        if not result:
+            print(f"No projects in group {group_id} found or permissions not sufficient.")
+            return
+
+        self.projects += [{'id': i['id'], 'http_url_to_repo': i['http_url_to_repo'], 'ssh_url_to_repo': i['ssh_url_to_repo'], 'web_url': i['web_url']} for i in result.json()]
+        next_link = self.parse_pagination(result.headers)
+
+        if next_link:
+            self.get_projects_by_group(next_link=next_link, group_id=group_id)
+
+        #print(result.json())
+
+    def get_projects(self):
+        print("Getting GitLab Projects")
+        # When groups not empty or specified, parse groups
+        if not self.groups:
+            self.get_groups()
+        
+        for group_id in self.groups:
+            print(f"Getting Projects for Group {group_id}")
+            self.get_projects_by_group(group_id=group_id)
+
+        print(len(self.projects))
+
+
+    def clone_repo(self, repo_url=None):
+        repo_host_path = repo_url.split('://')[1]
+        repo_http_scheme = repo_url.split('://')[0]
+        repo_credentials = f"token:{self.session.pat}"
+        repo_remote = f"{repo_http_scheme}://{repo_credentials}@{repo_host_path}"
+        repo_name = repo_host_path.replace('/', '_').rstrip('.git')
+        repo_path = f"{git_tmp_root}/{repo_name}"
    
-    if os.path.isdir(repo_path) and os.listdir(repo_path):
-        return repo_path
+        if os.path.isdir(repo_path) and os.listdir(repo_path):
+            return repo_path
    
-    print(f"Processing Repository {repo_host_path}")
-    try:
-        repo = Repo.clone_from(repo_remote, repo_path)
-        repo.close()
-    except:
-        print(f"Cant clone {repo_url}")
+        print(f"Processing Repository {repo_host_path}")
+        try:
+            repo = Repo.clone_from(repo_remote, repo_path)
+            repo.close()
+        except:
+            print(f"Cant clone {repo_url}")
+            return repo_path
        return repo_path
-    return repo_path
+
+def parse_fileglob():
+
+    glob_file = Path(os.environ.get('GLOB_FILE', 'globfile'))
+    ret_val = []
+    if not glob_file.is_file():
+        print(f"Specified glob file {glob_file} not found. Searching all files.")
+        return []
+    
+    with open(glob_file, 'r') as file:
+        lines = file.readlines()
+    
+    for line in lines:
+        ret_val.append("--glob")
+        ret_val.append(line.rstrip())
+
+    print(ret_val)
+    return ret_val

 def scan_repo(path=None, repo=None):
+
    scan_result = None
    scan_result_lines = []
    scan_matches = []
@@ -114,10 +186,10 @@ def scan_repo(path=None, repo=None):
        "--json",
        "-i",
        "-f",
-        "patternfile",
-        path
+        pattern_file
    ]
-
+    ripgrep_cmd = ripgrep_cmd + fileglob
+    ripgrep_cmd.append(path)
    try:
        scan_result = subprocess.run(ripgrep_cmd, capture_output=True, text=True)
    except:
@@ -125,7 +197,6 @@ def scan_repo(path=None, repo=None):
        return []

    scan_out_lines = list(filter(None, scan_result.stdout.split('\n')))
-
    for line in scan_out_lines:
        line_data = json.loads(line)
        if line_data.get("type") == "match":
@@ -139,13 +210,15 @@ def scan_repo(path=None, repo=None):
    return scan_matches

 def evaluate_findings(findings=[]):
+    if not findings:
+        return []
+    
    finding_results = []
    for finding in findings:
        filename = finding['full_path'].split('/')[-1]
-        if filename.startswith("package"):
-            print(f"Found potential match - {finding['path']} - {finding['matches'][0]['match']['text']}")
-            detail = check_line_in_file(file=finding['full_path'], line_number=finding['line_number'])
-            finding_results += [[finding['repo'], finding['path'], finding['line_number'], detail.lstrip(),finding['matches'][0]['match']['text']]]
+        print(f"Found potential match - {finding['path']} - {finding['matches'][0]['match']['text']}")
+        detail = check_line_in_file(file=finding['full_path'], line_number=finding['line_number'])
+        finding_results += [[finding['repo'], finding['path'], finding['line_number'], detail.lstrip(),finding['matches'][0]['match']['text']]]
    return finding_results

 def check_line_in_file(file=None, line_number=None):
@@ -155,26 +228,31 @@ def check_line_in_file(file=None, line_number=None):
                print(line)
                return line.rstrip().replace(',', '')

-
 def check_repos():
-    repos = get_all_projects(group_id=os.environ.get('GITLAB_GROUP'))
-    print(f"Found {len(repos)} Repositories..")
-    for repo in repos:
-        scan_path = clone_repo_with_http(repo['http_url_to_repo'])
+    gl = GitlabRepositories()
+    gl.get_projects()
+    print(f"Found {len(gl.projects)} Repositories..")
+
+    for repo in gl.projects:
+        scan_path = gl.clone_repo(repo['http_url_to_repo'])
        findings = scan_repo(scan_path, repo['web_url'])
        if findings:
-            print("Evaluating matches")
-            finding_results = evaluate_findings(findings=findings)
-            if finding_results:
-                report.findings += finding_results
+           print("Evaluating matches")
+           finding_results = evaluate_findings(findings=findings)
+           if finding_results:
+               report.findings += finding_results
        subprocess.run(["rm", "-rf", scan_path])

-git_tmp_root = os.environ.get('GIT_TMP_ROOT', '/tmp/hulud_check')
-report_path = os.environ.get('REPORT_PATH', '/tmp/hulud_check_reports')
+
+git_tmp_root = os.environ.get('GIT_TMP_ROOT', '/tmp/repo_check')
+report_path = os.environ.get('REPORT_PATH', '/tmp/check_reports')
 report_file = os.environ.get('REPORT_FILE', 'report.csv')
+pattern_file = os.environ.get('PATTERN_FILE', 'patternfile')
+
+fileglob = parse_fileglob()
+
 Path(git_tmp_root).mkdir(parents=True, exist_ok=True)
 Path(report_path).mkdir(parents=True, exist_ok=True)
-session = GitlabConnector()
 report = Report()
 check_repos()
 report.results()
@@ -1,7 +0,0 @@
-#!/bin/bash
-echo "Get most recent defintions"
-curl -s https://raw.githubusercontent.com/wiz-sec-public/wiz-research-iocs/refs/heads/main/reports/shai-hulud-2-packages.csv > sha1-hulud-2-packages.csv
-echo "Format patterns"
-tail -n +2 sha1-hulud-2-packages.csv | awk -F ',' '{print $1}' > patternfile
-echo "Running check"
-python3 -u check_gitlab.py
@@ -0,0 +1,3 @@
+package-lock.json
+package.json
+yarn.lock
@@ -0,0 +1,2 @@
+axios.*1\.4\.1
+axios.*0\.3\.0
@@ -1,6 +1,6 @@
 [project]
-name = "hulud_check"
-description = "Quick hacky check for sha1-hulud"
+name = "gitlab_scanner"
+description = "Quick hacky Gitlab Repository scanner for searching patterns in files"
 version = "2025.0.0"
 requires-python = ">=3.13"
 dependencies = [