semgrep pro parser (#12848)

valentijnscholten · web-flow · commit 069d8709a524 · 2025-09-05T20:45:11.000+02:00
* semgrep pro: parse sast finding

* update docs
diff --git a/docs/content/en/connecting_your_tools/parsers/file/semgrep_pro.md b/docs/content/en/connecting_your_tools/parsers/file/semgrep_pro.md
@@ -0,0 +1,60 @@
+---
+title: "Semgrep Pro JSON Report"
+toc_hide: true
+---
+Import Semgrep Pro findings in JSON format.
+
+### Sample Scan Data
+Sample Semgrep Pro JSON Report scans can be found [here](https://github.com/DefectDojo/django-DefectDojo/tree/master/unittests/scans/semgrep_pro).
+
+### Default Deduplication
+By default, DefectDojo uses the `match_based_id` from Semgrep Pro for deduplication. If this is not available, it falls back to using a combination of:
+- title
+- file path
+- line number
+
+### Fields Mapped
+The following fields are mapped from the Semgrep Pro JSON report:
+
+#### Basic Information
+- title: Mapped from `rule_name`
+- severity: Mapped from Semgrep Pro severity levels (CRITICAL → Critical, ERROR/HIGH → High, WARNING/MEDIUM → Medium, INFO/LOW → Low; any other value → Info)
+- file_path: Path to the affected file from `location.file_path`
+- line: Line number from `location.line`
+- unique_id_from_tool: Mapped from `match_based_id`
+
+#### Status Fields
+- active: Set to false if status is "fixed" or "removed"
+- verified: Set to true if triage_state is not "untriaged"
+
+#### Rich Content Fields
+- description: Includes:
+  - Rule message and details
+  - CWE references
+  - OWASP references
+  - Categories
+  - Triage information
+- impact: Includes:
+  - Vulnerability classes
+  - Confidence level
+  - Repository information
+- mitigation: Includes:
+  - Guidance summary
+  - Detailed instructions
+  - Auto-fix suggestions
+  - Auto-triage information
+  - Component details and risk level
+- references: Includes:
+  - Line of code URL
+  - CWE references
+  - OWASP references
+  - External ticket information
+
+#### Component Information
+- component_name: Mapped from `assistant.component.tag`
+
+#### Additional Fields
+- static_finding: Always set to true
+- dynamic_finding: Always set to false
+- cwe: Extracted from first CWE reference if available
+- date: Mapped from `created_at`
diff --git a/dojo/tools/semgrep_pro/__init__.py b/dojo/tools/semgrep_pro/__init__.py
diff --git a/dojo/tools/semgrep_pro/parser.py b/dojo/tools/semgrep_pro/parser.py
@@ -0,0 +1,195 @@
+import contextlib
+import json
+from datetime import datetime
+
+from dojo.models import Finding
+
+
+class SemgrepProParser:
+
+    """Parser for Semgrep Pro JSON reports (a top-level object holding a ``findings`` list)."""
+
+    # Upper-cased Semgrep Pro severity -> DefectDojo severity.
+    # Values not listed here are reported as "Info" by convert_severity().
+    SEVERITY_MAP = {
+        "CRITICAL": "Critical",
+        "ERROR": "High",
+        "HIGH": "High",
+        "WARNING": "Medium",
+        "MEDIUM": "Medium",
+        "INFO": "Low",
+        "LOW": "Low",
+    }
+
+    def get_scan_types(self):
+        """Return the scan type names handled by this parser."""
+        return ["Semgrep Pro JSON Report"]
+
+    def get_label_for_scan_types(self, scan_type):
+        """Return the label for ``scan_type``; the scan type name is used unchanged."""
+        return scan_type
+
+    def get_description_for_scan_types(self, scan_type):
+        """Return a short description of the supported report format."""
+        return "Import Semgrep Pro findings in JSON format"
+
+    def get_findings(self, filename, test):
+        """
+        Build one ``Finding`` per distinct entry of the report's ``findings`` list.
+
+        Entries sharing a ``match_based_id`` (or, when that is missing, the same
+        title + file path + line) are collapsed into a single finding whose
+        ``nb_occurences`` counts the duplicates.
+
+        Args:
+            filename: Open file-like object containing the JSON report; it is
+                handed directly to ``json.load``.
+            test: Object assigned to each finding's ``test`` attribute.
+
+        Returns:
+            list: The de-duplicated ``Finding`` objects in first-seen order.
+
+        """
+        data = json.load(filename)
+        dupes = {}
+
+        for item in data.get("findings") or []:
+            # JSON ``null`` is treated like a missing key so that sparse
+            # exports do not crash the import.
+            location = item.get("location") or {}
+            rule = item.get("rule") or {}
+            assistant = item.get("assistant") or {}
+
+            title = item.get("rule_name") or "No title"
+            file_path = location.get("file_path", "")
+            line = location.get("line", 0)
+
+            # Map status to active/verified
+            status = str(item.get("status") or "new").lower()
+            triage_state = str(item.get("triage_state") or "untriaged").lower()
+
+            finding = Finding(
+                test=test,
+                title=title,
+                severity=self.convert_severity(item.get("severity") or "INFO"),
+                description=self.get_description(item),
+                file_path=file_path,
+                line=line,
+                static_finding=True,
+                dynamic_finding=False,
+                vuln_id_from_tool=item.get("rule_name"),
+                nb_occurences=1,
+                active=status not in {"fixed", "removed"},
+                verified=triage_state != "untriaged",
+            )
+
+            # CWE is only touched when the rule carries a ``cwe_names`` key;
+            # an unparsable value results in ``None``.
+            if "cwe_names" in rule:
+                finding.cwe = self._parse_cwe(rule["cwe_names"])
+
+            references = self._build_references(item, rule)
+            if references:
+                finding.references = "\n".join(map(str, references))
+
+            mitigation_parts = self._build_mitigation(assistant)
+            finding.mitigation = "\n\n".join(mitigation_parts) if mitigation_parts else None
+
+            # Add unique identifier
+            finding.unique_id_from_tool = item.get("match_based_id")
+
+            # Add component name if available
+            component = assistant.get("component")
+            if component:
+                finding.component_name = component.get("tag")
+
+            # Keep the model's own date when ``created_at`` is absent or unparsable
+            created_at = self._parse_created_at(item.get("created_at"))
+            if created_at is not None:
+                finding.date = created_at
+
+            finding.impact = "\n".join(map(str, self._build_impact(item, rule)))
+
+            # Use match_based_id for deduplication if available, otherwise use file location
+            dupe_key = finding.unique_id_from_tool or f"{title}{file_path}{line}"
+
+            if dupe_key in dupes:
+                dupes[dupe_key].nb_occurences += 1
+            else:
+                dupes[dupe_key] = finding
+
+        return list(dupes.values())
+
+    @staticmethod
+    def _parse_cwe(cwe_names):
+        """Return the CWE number of the first entry (``"CWE-319: ..."`` -> 319), or None."""
+        try:
+            return int(cwe_names[0].split("-")[1].split(":")[0])
+        except (ValueError, IndexError, KeyError, TypeError, AttributeError):
+            # Empty list, null value, non-string entry or unexpected format.
+            return None
+
+    @staticmethod
+    def _parse_created_at(value):
+        """Return ``value`` parsed as a naive datetime with second precision, or None."""
+        with contextlib.suppress(ValueError, TypeError, AttributeError):
+            # Drop fractional seconds first, then cut at 19 characters
+            # (the length of YYYY-MM-DDTHH:MM:SS) so a trailing "Z" or UTC
+            # offset on timestamps without fractional seconds is removed too.
+            return datetime.strptime(value.split(".")[0][:19], "%Y-%m-%dT%H:%M:%S")
+        return None
+
+    @staticmethod
+    def _build_references(item, rule):
+        """Return the reference lines: code URL, OWASP names, CWE names, external ticket."""
+        references = []
+        line_of_code_url = item.get("line_of_code_url")
+        if line_of_code_url is not None:
+            references.append(f"Line of Code: {line_of_code_url}")
+        references.extend(rule.get("owasp_names") or [])
+        references.extend(rule.get("cwe_names") or [])
+        external_ticket = item.get("external_ticket")
+        if external_ticket is not None:
+            references.append(f"External Ticket: \n {external_ticket}")
+        return references
+
+    @staticmethod
+    def _build_mitigation(assistant):
+        """Return the mitigation paragraphs derived from the ``assistant`` object."""
+        guidance = assistant.get("guidance") or {}
+        autofix = assistant.get("autofix") or {}
+        autotriage = assistant.get("autotriage") or {}
+        component = assistant.get("component") or {}
+
+        candidates = (
+            (guidance.get("summary"), "**Guidance Summary:**\n{}"),
+            (guidance.get("instructions"), "**Instructions:**\n{}"),
+            (autofix.get("fix_code"), "**Suggested Fix:**\n```\n{}\n```"),
+            # An empty explanation is dropped just like a missing one.
+            (autofix.get("explanation") or None, "**Fix Explanation:**\n{}"),
+            (autotriage.get("verdict"), "**Auto-triage Verdict:** {}"),
+            (autotriage.get("reason"), "**Auto-triage Reason:** {}"),
+            (component.get("tag"), "**Component:** {}"),
+            (component.get("risk"), "**Risk Level:** {}"),
+        )
+        return [template.format(value) for value, template in candidates if value is not None]
+
+    @staticmethod
+    def _build_impact(item, rule):
+        """Return the impact lines: vulnerability classes, confidence, repository."""
+        impact_parts = list(rule.get("vulnerability_classes") or [])
+        confidence = item.get("confidence")
+        if confidence is not None:
+            impact_parts.append(f"Confidence: {str(confidence).capitalize()}")
+        repository = item.get("repository")
+        if isinstance(repository, dict):
+            impact_parts.append(f"Repository: {repository.get('name', '')} ({repository.get('url', '')})")
+        return impact_parts
+
+    def convert_severity(self, val):
+        """
+        Translate a Semgrep Pro severity into a DefectDojo severity.
+
+        The lookup ignores case and surrounding whitespace. ``None`` and any
+        value missing from ``SEVERITY_MAP`` yield ``"Info"``.
+        """
+        return self.SEVERITY_MAP.get(str(val or "").strip().upper(), "Info")
+
+    def get_description(self, item):
+        """
+        Render the Markdown description for one report entry.
+
+        Sections appear in a fixed order: message, rule message, category,
+        confidence, vulnerability classes, CWE references, OWASP references,
+        categories and triage information. Keys that are missing or ``null``
+        are skipped. Triage comment and reason are only rendered when a
+        triage state is present.
+
+        Args:
+            item: Dict describing a single entry of the ``findings`` list.
+
+        Returns:
+            str: The description text; empty when no section applies.
+
+        """
+        rule = item.get("rule") or {}
+        chunks = []
+
+        scalar_sections = (
+            (item.get("rule_message"), "**Message:** {}\n\n"),
+            (rule.get("message"), "**Rule Message:** {}\n\n"),
+            (rule.get("category"), "**Category:** {}\n\n"),
+            (rule.get("confidence"), "**Confidence:** {}\n\n"),
+        )
+        chunks.extend(template.format(value) for value, template in scalar_sections if value is not None)
+
+        list_sections = (
+            (rule.get("vulnerability_classes"), "**Vulnerability Classes:**\n"),
+            (rule.get("cwe_names"), "**CWE References:**\n"),
+            (rule.get("owasp_names"), "**OWASP References:**\n"),
+            (item.get("categories"), "**Categories:**\n"),
+        )
+        for values, heading in list_sections:
+            if values is not None:
+                bullets = "".join(f"- {value}\n" for value in values)
+                chunks.append(f"{heading}{bullets}\n")
+
+        # Add triage information
+        triage_state = item.get("triage_state")
+        if triage_state is not None:
+            chunks.append(f"**Triage State:** {triage_state}\n")
+            triage_comment = item.get("triage_comment")
+            if triage_comment is not None:
+                chunks.append(f"**Triage Comment:** {triage_comment}\n")
+            triage_reason = item.get("triage_reason")
+            if triage_reason is not None:
+                chunks.append(f"**Triage Reason:** {triage_reason}\n\n")
+
+        return "".join(chunks)
diff --git a/unittests/scans/semgrep_pro/no_vuln.json b/unittests/scans/semgrep_pro/no_vuln.json
@@ -0,0 +1,4 @@
+{
+  "findings": [
+  ]
+}
diff --git a/unittests/scans/semgrep_pro/one_vuln.json b/unittests/scans/semgrep_pro/one_vuln.json
@@ -0,0 +1,95 @@
+{
+  "findings": [
+    {
+      "id": 1234567,
+      "ref": "refs/pull/1234/merge",
+      "first_seen_scan_id": 1234,
+      "syntactic_id": "440eeface888e78afceac3dc7d4cc2cf",
+      "match_based_id": "0f8c79a6f7e0ff2f908ff5bc366ae1548465069bae8892088051e1c3b4b12c6b8df37d5bcbb181eb868aa79f81f239d14bf2336d552786ab8ccdc7279adf07a6_1",
+      "external_ticket": {
+        "external_slug": "OPS-158",
+        "url": "string",
+        "id": 0,
+        "linked_issue_ids": [
+          0
+        ]
+      },
+      "review_comments": [
+        {
+          "external_discussion_id": "af04762b69acfb74c8f9",
+          "external_note_id": 123523
+        }
+      ],
+      "repository": {
+        "name": "semgrep",
+        "url": "https://github.com/semgrep/semgrep"
+      },
+      "line_of_code_url": "https://github.com/semgrep/semgrep/blob/39f95450a7d4d70e54c9edbd109bed8210a36889/src/core_cli/Core_CLI.ml#L1",
+      "triage_state": "untriaged",
+      "state": "unresolved",
+      "status": "open",
+      "severity": "medium",
+      "confidence": "medium",
+      "categories": [
+        "security"
+      ],
+      "created_at": "2020-11-18T23:28:12.391807Z",
+      "relevant_since": "2020-11-18T23:28:12.391807Z",
+      "rule_name": "typescript.react.security.audit.react-no-refs.react-no-refs",
+      "rule_message": "`ref` usage found. refs give direct DOM access and may create a possibility for XSS, which could cause\nsensitive information such as user cookies to be retrieved by an attacker. Instead, avoid direct DOM\nmanipulation or use DOMPurify to sanitize HTML before writing it into the page.\n",
+      "location": {
+        "file_path": "frontend/src/corpComponents/Code.tsx",
+        "line": 120,
+        "column": 8,
+        "end_line": 124,
+        "end_column": 16
+      },
+      "sourcing_policy": {
+        "id": 120,
+        "name": "Default Policy",
+        "slug": "default-policy"
+      },
+      "triaged_at": "2020-11-19T23:28:12.391807Z",
+      "triage_comment": "This finding is from the test repo",
+      "triage_reason": "acceptable_risk",
+      "state_updated_at": "2020-11-19T23:28:12.391807Z",
+      "rule": {
+        "name": "html.security.plaintext-http-link.plaintext-http-link",
+        "message": "This link points to a plaintext HTTP URL. Prefer an encrypted HTTPS URL if possible.",
+        "confidence": "high",
+        "category": "security",
+        "subcategories": [
+          "vuln"
+        ],
+        "vulnerability_classes": [
+          "Mishandled Sensitive Information"
+        ],
+        "cwe_names": [
+          "CWE-319: Cleartext Transmission of Sensitive Information"
+        ],
+        "owasp_names": [
+          "A03:2017 - Sensitive Data Exposure",
+          "A02:2021 - Cryptographic Failures"
+        ]
+      },
+      "assistant": {
+        "autofix": {
+          "fix_code": "cookie.setHttpOnly(true);\nresponse.addCookie(cookie);",
+          "explanation": ""
+        },
+        "guidance": {
+          "summary": "Use a template rendering engine such as EJS instead of string concatenation.",
+          "instructions": "1. Check if your project has any template engines installed such as EJS, Pug, or Mustache.\n    If not, install EJS, with a command such as `$ npm install ejs`.\n2. Create an EJS template: `const template = '<h2><%= user.id %></h2>'`\n3. <... example trimmed in API docs ...>"
+        },
+        "autotriage": {
+          "verdict": "false_positive",
+          "reason": "The matched code is used for a non-security related feature."
+        },
+        "component": {
+          "tag": "user data",
+          "risk": "high"
+        }
+      }
+    }
+  ]
+}
diff --git a/unittests/tools/test_semgrep_pro_parser.py b/unittests/tools/test_semgrep_pro_parser.py

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +{
 +  "findings": [
 +  ]
 +}