Skip to content

Commit 069d870

Browse files
semgrep pro parser (#12848)
* semgrep pro: parse sast finding * update docs
1 parent 3f0a777 commit 069d870

6 files changed

Lines changed: 427 additions & 0 deletions

File tree

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
---
2+
title: "Semgrep Pro JSON Report"
3+
toc_hide: true
4+
---
5+
Import Semgrep Pro findings in JSON format.
6+
7+
### Sample Scan Data
8+
Sample Semgrep Pro JSON Report scans can be found [here](https://github.com/DefectDojo/django-DefectDojo/tree/master/unittests/scans/semgrep_pro).
9+
10+
### Default Deduplication
11+
By default, DefectDojo uses the `match_based_id` from Semgrep Pro for deduplication. If this is not available, it falls back to using a combination of:
12+
- title
13+
- file path
14+
- line number
15+
16+
### Fields Mapped
17+
The following fields are mapped from the Semgrep Pro JSON report:
18+
19+
#### Basic Information
20+
- title: Mapped from `rule_name`
21+
- severity: Mapped from Semgrep Pro severity levels (CRITICAL → Critical, ERROR/HIGH → High, WARNING/MEDIUM → Medium, INFO/LOW → Low; any other value → Info)
22+
- file_path: Path to the affected file from `location.file_path`
23+
- line: Line number from `location.line`
24+
- unique_id_from_tool: Mapped from `match_based_id`
- vuln_id_from_tool: Mapped from `rule_name`
25+
26+
#### Status Fields
27+
- active: Set to false if status is "fixed" or "removed"
28+
- verified: Set to true if triage_state is not "untriaged"
29+
30+
#### Rich Content Fields
31+
- description: Includes:
32+
- Rule message and details
33+
- CWE references
34+
- OWASP references
35+
- Categories
36+
- Triage information
37+
- impact: Includes:
38+
- Vulnerability classes
39+
- Confidence level
40+
- Repository information
41+
- mitigation: Includes:
42+
- Guidance summary
43+
- Detailed instructions
44+
- Auto-fix suggestions
45+
- Auto-triage information
46+
- Component details and risk level
47+
- references: Includes:
48+
- Line of code URL
49+
- CWE references
50+
- OWASP references
51+
- External ticket information
52+
53+
#### Component Information
54+
- component_name: Mapped from `assistant.component.tag`
55+
56+
#### Additional Fields
57+
- static_finding: Always set to true
58+
- dynamic_finding: Always set to false
59+
- cwe: Extracted from first CWE reference if available
60+
- date: Mapped from `created_at`

dojo/tools/semgrep_pro/__init__.py

Whitespace-only changes.

dojo/tools/semgrep_pro/parser.py

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
import contextlib
2+
import json
3+
from datetime import datetime
4+
5+
from dojo.models import Finding
6+
7+
8+
class SemgrepProParser:

    """
    Parser for "Semgrep Pro JSON Report" scans.

    The report is read as a JSON object whose ``findings`` key holds a list of
    finding objects. Every optional field is treated defensively: a key that
    is missing *or* explicitly ``null`` is skipped instead of crashing the
    import.
    """

    # Upper-cased Semgrep severity -> DefectDojo severity.
    # Anything not listed here falls back to "Info" (see convert_severity).
    SEVERITY_MAP = {
        "CRITICAL": "Critical",
        "ERROR": "High",
        "HIGH": "High",
        "WARNING": "Medium",
        "MEDIUM": "Medium",
        "INFO": "Low",
        "LOW": "Low",
    }

    # Finding statuses that make the imported finding inactive.
    _INACTIVE_STATUSES = frozenset({"fixed", "removed"})

    # Timestamp layout once fractional seconds / timezone suffix are removed.
    _DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"

    def get_scan_types(self):
        """Return the list of scan type names handled by this parser."""
        return ["Semgrep Pro JSON Report"]

    def get_label_for_scan_types(self, scan_type):
        """Return the label for ``scan_type`` (the scan type name itself)."""
        return scan_type

    def get_description_for_scan_types(self, scan_type):
        """Return a short description of the accepted report format."""
        return "Import Semgrep Pro findings in JSON format"

    def get_findings(self, filename, test):
        """
        Parse a Semgrep Pro JSON report into a list of ``Finding`` objects.

        Args:
            filename: Open file-like object containing the JSON report
                (passed straight to ``json.load``).
            test: Object stored on every created finding as ``test``.

        Returns:
            A list of findings. Entries sharing the same ``match_based_id``
            (or, when that is absent, the same title + file path + line) are
            collapsed into one finding whose ``nb_occurences`` is incremented.

        """
        data = json.load(filename)
        dupes = {}

        # "findings" may be missing or null; both mean "nothing to import".
        for item in data.get("findings") or []:
            if not isinstance(item, dict):
                # A non-object entry carries no usable fields; skip it.
                continue

            finding = self._build_finding(item, test)

            # Use match_based_id for deduplication if available, otherwise
            # fall back to the file location.
            dupe_key = finding.unique_id_from_tool or (
                f"{finding.title}{finding.file_path}{finding.line}"
            )

            if dupe_key in dupes:
                dupes[dupe_key].nb_occurences += 1
            else:
                dupes[dupe_key] = finding

        return list(dupes.values())

    def _build_finding(self, item, test):
        """Create one ``Finding`` from a single report entry ``item``."""
        location = self._as_dict(item.get("location"))
        rule = self._as_dict(item.get("rule"))
        assistant = self._as_dict(item.get("assistant"))

        # `or` covers both a missing key and an explicit JSON null.
        status = str(item.get("status") or "new").lower()
        triage_state = str(item.get("triage_state") or "untriaged").lower()
        line = location.get("line")

        finding = Finding(
            test=test,
            title=item.get("rule_name") or "No title",
            severity=self.convert_severity(item.get("severity") or "INFO"),
            description=self.get_description(item),
            file_path=location.get("file_path") or "",
            line=0 if line is None else line,
            static_finding=True,
            dynamic_finding=False,
            vuln_id_from_tool=item.get("rule_name"),
            nb_occurences=1,
            active=status not in self._INACTIVE_STATUSES,
            verified=triage_state != "untriaged",
        )

        # CWE is only touched when the rule declares CWE names at all; an
        # unparsable first entry yields None.
        if "cwe_names" in rule:
            finding.cwe = self._parse_cwe(rule["cwe_names"])

        references = self._build_references(item)
        if references:
            finding.references = references

        finding.mitigation = self._build_mitigation(item)
        finding.unique_id_from_tool = item.get("match_based_id")

        # Only the component name (its tag) is available in the report.
        if "component" in assistant:
            finding.component_name = self._as_dict(assistant["component"]).get("tag")

        # Leave the date untouched when the timestamp is absent or unparsable.
        created_at = self._parse_created_at(item.get("created_at"))
        if created_at is not None:
            finding.date = created_at

        finding.impact = self._build_impact(item)
        return finding

    @staticmethod
    def _as_dict(value):
        """Return ``value`` if it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _as_list(value):
        """Return ``value`` as a list (``None`` -> ``[]``, scalar -> ``[value]``)."""
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    @staticmethod
    def _parse_cwe(cwe_names):
        """
        Extract the numeric CWE id from the first entry of ``cwe_names``.

        Entries look like ``"CWE-319: Cleartext Transmission ..."``. Returns
        ``None`` when the list is empty, null, or the first entry does not
        follow that pattern.
        """
        try:
            first = cwe_names[0]  # Take first CWE
            return int(first.split("-")[1].split(":")[0])
        except (ValueError, IndexError, KeyError, TypeError, AttributeError):
            return None

    @classmethod
    def _parse_created_at(cls, value):
        """
        Parse an ISO-like timestamp into a naive ``datetime``.

        Fractional seconds and any trailing ``Z``/UTC offset are discarded
        (no timezone conversion is performed). Returns ``None`` for
        non-string or unparsable input.
        """
        if not isinstance(value, str):
            return None
        base = value.split(".")[0]
        # Second candidate handles timestamps without fractional seconds,
        # e.g. "2020-11-18T23:28:12Z", where the suffix survives the split.
        for candidate in (base, base[:19]):
            with contextlib.suppress(ValueError):
                return datetime.strptime(candidate, cls._DATE_FORMAT)
        return None

    def _build_references(self, item):
        """Return the newline-joined references text for ``item`` ("" if none)."""
        rule = self._as_dict(item.get("rule"))
        references = []

        if item.get("line_of_code_url") is not None:
            references.append(f"Line of Code: {item['line_of_code_url']}")
        references.extend(str(name) for name in self._as_list(rule.get("owasp_names")))
        references.extend(str(name) for name in self._as_list(rule.get("cwe_names")))
        if item.get("external_ticket") is not None:
            references.append(f"External Ticket: \n {item['external_ticket']}")

        return "\n".join(references)

    def _build_mitigation(self, item):
        """Return the mitigation text built from ``item["assistant"]``, or ``None``."""
        assistant = self._as_dict(item.get("assistant"))
        guidance = self._as_dict(assistant.get("guidance"))
        autofix = self._as_dict(assistant.get("autofix"))
        autotriage = self._as_dict(assistant.get("autotriage"))
        component = self._as_dict(assistant.get("component"))
        parts = []

        if guidance.get("summary") is not None:
            parts.append(f"**Guidance Summary:**\n{guidance['summary']}")
        if guidance.get("instructions") is not None:
            parts.append(f"**Instructions:**\n{guidance['instructions']}")

        if autofix.get("fix_code") is not None:
            parts.append(f"**Suggested Fix:**\n```\n{autofix['fix_code']}\n```")
        # The explanation is skipped when empty.
        if autofix.get("explanation"):
            parts.append(f"**Fix Explanation:**\n{autofix['explanation']}")

        if autotriage.get("verdict") is not None:
            parts.append(f"**Auto-triage Verdict:** {autotriage['verdict']}")
        if autotriage.get("reason") is not None:
            parts.append(f"**Auto-triage Reason:** {autotriage['reason']}")

        if component.get("tag") is not None:
            parts.append(f"**Component:** {component['tag']}")
        if component.get("risk") is not None:
            parts.append(f"**Risk Level:** {component['risk']}")

        return "\n\n".join(parts) if parts else None

    def _build_impact(self, item):
        """Return the newline-joined impact text for ``item`` ("" if none)."""
        rule = self._as_dict(item.get("rule"))
        parts = [str(name) for name in self._as_list(rule.get("vulnerability_classes"))]

        if item.get("confidence") is not None:
            parts.append(f"Confidence: {str(item['confidence']).capitalize()}")
        if isinstance(item.get("repository"), dict):
            repo = item["repository"]
            parts.append(f"Repository: {repo.get('name', '')} ({repo.get('url', '')})")

        return "\n".join(parts)

    def convert_severity(self, val):
        """
        Map a Semgrep severity string (case-insensitive) to a DefectDojo one.

        Returns "Info" for unknown values and for non-string input.
        """
        if not isinstance(val, str):
            return "Info"
        return self.SEVERITY_MAP.get(val.upper(), "Info")

    @classmethod
    def _bullet_section(cls, title, values):
        """Format ``values`` as a Markdown bullet list under a bold ``title``."""
        bullets = "".join(f"- {value}\n" for value in cls._as_list(values))
        return f"**{title}:**\n{bullets}\n"

    def get_description(self, item):
        """
        Build the Markdown description for one report entry.

        Sections are emitted in a fixed order (message, rule details,
        categories, triage information); fields that are missing or null are
        omitted. Returns an empty string when nothing is available.
        """
        rule = self._as_dict(item.get("rule"))
        parts = []

        if item.get("rule_message") is not None:
            parts.append(f"**Message:** {item['rule_message']}\n\n")

        for key, label in (
            ("message", "Rule Message"),
            ("category", "Category"),
            ("confidence", "Confidence"),
        ):
            if rule.get(key) is not None:
                parts.append(f"**{label}:** {rule[key]}\n\n")

        for key, label in (
            ("vulnerability_classes", "Vulnerability Classes"),
            ("cwe_names", "CWE References"),
            ("owasp_names", "OWASP References"),
        ):
            if rule.get(key) is not None:
                parts.append(self._bullet_section(label, rule[key]))

        # Add categories
        if item.get("categories") is not None:
            parts.append(self._bullet_section("Categories", item["categories"]))

        # Add triage information
        if item.get("triage_state") is not None:
            parts.append(f"**Triage State:** {item['triage_state']}\n")
        if item.get("triage_comment") is not None:
            parts.append(f"**Triage Comment:** {item['triage_comment']}\n")
        if item.get("triage_reason") is not None:
            parts.append(f"**Triage Reason:** {item['triage_reason']}\n\n")

        return "".join(parts)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"findings": [
3+
]
4+
}
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
{
2+
"findings": [
3+
{
4+
"id": 1234567,
5+
"ref": "refs/pull/1234/merge",
6+
"first_seen_scan_id": 1234,
7+
"syntactic_id": "440eeface888e78afceac3dc7d4cc2cf",
8+
"match_based_id": "0f8c79a6f7e0ff2f908ff5bc366ae1548465069bae8892088051e1c3b4b12c6b8df37d5bcbb181eb868aa79f81f239d14bf2336d552786ab8ccdc7279adf07a6_1",
9+
"external_ticket": {
10+
"external_slug": "OPS-158",
11+
"url": "string",
12+
"id": 0,
13+
"linked_issue_ids": [
14+
0
15+
]
16+
},
17+
"review_comments": [
18+
{
19+
"external_discussion_id": "af04762b69acfb74c8f9",
20+
"external_note_id": 123523
21+
}
22+
],
23+
"repository": {
24+
"name": "semgrep",
25+
"url": "https://github.com/semgrep/semgrep"
26+
},
27+
"line_of_code_url": "https://github.com/semgrep/semgrep/blob/39f95450a7d4d70e54c9edbd109bed8210a36889/src/core_cli/Core_CLI.ml#L1",
28+
"triage_state": "untriaged",
29+
"state": "unresolved",
30+
"status": "open",
31+
"severity": "medium",
32+
"confidence": "medium",
33+
"categories": [
34+
"security"
35+
],
36+
"created_at": "2020-11-18T23:28:12.391807Z",
37+
"relevant_since": "2020-11-18T23:28:12.391807Z",
38+
"rule_name": "typescript.react.security.audit.react-no-refs.react-no-refs",
39+
"rule_message": "`ref` usage found. refs give direct DOM access and may create a possibility for XSS, which could cause\nsensitive information such as user cookies to be retrieved by an attacker. Instead, avoid direct DOM\nmanipulation or use DOMPurify to sanitize HTML before writing it into the page.\n",
40+
"location": {
41+
"file_path": "frontend/src/corpComponents/Code.tsx",
42+
"line": 120,
43+
"column": 8,
44+
"end_line": 124,
45+
"end_column": 16
46+
},
47+
"sourcing_policy": {
48+
"id": 120,
49+
"name": "Default Policy",
50+
"slug": "default-policy"
51+
},
52+
"triaged_at": "2020-11-19T23:28:12.391807Z",
53+
"triage_comment": "This finding is from the test repo",
54+
"triage_reason": "acceptable_risk",
55+
"state_updated_at": "2020-11-19T23:28:12.391807Z",
56+
"rule": {
57+
"name": "html.security.plaintext-http-link.plaintext-http-link",
58+
"message": "This link points to a plaintext HTTP URL. Prefer an encrypted HTTPS URL if possible.",
59+
"confidence": "high",
60+
"category": "security",
61+
"subcategories": [
62+
"vuln"
63+
],
64+
"vulnerability_classes": [
65+
"Mishandled Sensitive Information"
66+
],
67+
"cwe_names": [
68+
"CWE-319: Cleartext Transmission of Sensitive Information"
69+
],
70+
"owasp_names": [
71+
"A03:2017 - Sensitive Data Exposure",
72+
"A02:2021 - Cryptographic Failures"
73+
]
74+
},
75+
"assistant": {
76+
"autofix": {
77+
"fix_code": "cookie.setHttpOnly(true);\nresponse.addCookie(cookie);",
78+
"explanation": ""
79+
},
80+
"guidance": {
81+
"summary": "Use a template rendering engine such as EJS instead of string concatenation.",
82+
"instructions": "1. Check if your project has any template engines installed such as EJS, Pug, or Mustache.\n If not, install EJS, with a command such as `$ npm install ejs`.\n2. Create an EJS template: `const template = '<h2><%= user.id %></h2>'`\n3. <... example trimmed in API docs ...>"
83+
},
84+
"autotriage": {
85+
"verdict": "false_positive",
86+
"reason": "The matched code is used for a non-security related feature."
87+
},
88+
"component": {
89+
"tag": "user data",
90+
"risk": "high"
91+
}
92+
}
93+
}
94+
]
95+
}

0 commit comments

Comments
 (0)