Skip to content

Commit e8100cb

Browse files
Reimport: batch-refresh finding status fields in close_old_findings (#14638)
* Batch-refresh close_old_findings status fields to avoid N refresh_from_db queries. Replace per-finding refresh_from_db(false_p, risk_accepted, out_of_scope) with one values() query for all PKs and assign onto instances, falling back to refresh_from_db when a row is missing. * docs: cite #12291 for close_old_findings status refresh origin * perf: chunk close_old_findings status sync queries (1000 PKs per SELECT) * tests: add 4th reimport step (empty scan, close_old_findings) to performance tests - Add step 4 to _import_reimport_performance: reimport with an empty StackHawk scan and close_old_findings=True, verifying findings are actually closed (assertGreater on len_closed_findings). - Fix all reimport steps to pass service="Secured Application" so the reimporter's service filter matches findings produced by the StackHawk parser (which sets service from the scan's application field). Without this, original_items was always empty and no matching/closing occurred. - Add stackhawk_empty.json scan fixture. - Fix update_performance_test_counts.py to handle reimport3 (step 4) by adding reimport3_queries/reimport3_async_tasks to param_map. - Update all expected query/task counts for both Small and Locations test classes to reflect the new step and the batch status-sync fix.
1 parent f3d692a commit e8100cb

4 files changed

Lines changed: 147 additions & 10 deletions

File tree

dojo/importers/default_reimporter.py

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from itertools import batched
23

34
from django.conf import settings
45
from django.core.files.uploadedfile import TemporaryUploadedFile
@@ -30,6 +31,9 @@
3031
logger = logging.getLogger(__name__)
3132
deduplicationLogger = logging.getLogger("dojo.specific-loggers.deduplication")
3233

34+
# Bound IN-list size when bulk-loading status fields for close_old_findings.
35+
_CLOSE_OLD_FINDINGS_STATUS_FIELDS_CHUNK = 1000
36+
3337

3438
class DefaultReImporterOptions(ImporterOptions):
3539
def validate_test(
@@ -465,6 +469,44 @@ def process_findings(
465469

466470
return self.new_items, self.reactivated_items, self.to_mitigate, self.untouched
467471

472+
def _sync_close_old_finding_status_fields(self, findings: list[Finding]) -> None:
    """
    Refresh false_p, risk_accepted, and out_of_scope from the DB for each finding.

    These status fields can change earlier during reimport (e.g. a finding being
    marked false positive) while the in-memory instances handled here are stale.
    Per-finding refresh_from_db in close_old_findings was added in
    https://github.com/DefectDojo/django-DefectDojo/pull/12291; a naive refresh
    issues one SELECT per finding, so we batch one values() query per chunk of
    primary keys and fall back to refresh_from_db only when a row is missing.

    This really should be fixed differently, but for now we at least optimize it
    to be done in bulk.
    """
    status_fields = ("false_p", "risk_accepted", "out_of_scope")

    # Unsaved instances (pk is None) have no DB row, so refresh_from_db would
    # always raise Finding.DoesNotExist for them; there is nothing fresher than
    # their in-memory values, so they are skipped rather than refreshed.
    findings_with_pk = [finding for finding in findings if finding.pk is not None]

    # Chunk the pk list to bound the size of each IN (...) clause.
    for chunk in batched(findings_with_pk, _CLOSE_OLD_FINDINGS_STATUS_FIELDS_CHUNK, strict=False):
        ids = [finding.pk for finding in chunk]
        fresh_by_id = {
            row["id"]: row
            for row in Finding.objects.filter(pk__in=ids).values("id", *status_fields)
        }
        for finding in chunk:
            row = fresh_by_id.get(finding.pk)
            if row is not None:
                for field in status_fields:
                    setattr(finding, field, row[field])
            else:
                # Row disappeared between queries; fall back to the original
                # per-instance refresh to preserve existing error semantics.
                finding.refresh_from_db(fields=list(status_fields))
509+
468510
def close_old_findings(
469511
self,
470512
findings: list[Finding],
@@ -478,17 +520,17 @@ def close_old_findings(
478520
if self.close_old_findings_toggle is False:
479521
return []
480522
logger.debug("REIMPORT_SCAN: Closing findings no longer present in scan report")
523+
# Get any status changes that could have occurred earlier in the process
524+
# for special statuses only.
525+
# An example of such is a finding being reported as false positive, and
526+
# reimport makes this change in the database. However, the findings here
527+
# are calculated based from the original values before the reimport, so
528+
# any updates made during reimport are discarded without first getting the
529+
# state of the finding as it stands at this moment (django-DefectDojo #12291).
530+
self._sync_close_old_finding_status_fields(findings)
481531
# Determine if pushing to jira or if the finding groups are enabled
482532
mitigated_findings = []
483533
for finding in findings:
484-
# Get any status changes that could have occurred earlier in the process
485-
# for special statuses only.
486-
# An example of such is a finding being reported as false positive, and
487-
# reimport makes this change in the database. However, the findings here
488-
# are calculated based from the original values before the reimport, so
489-
# any updates made during reimport are discarded without first getting the
490-
# state of the finding as it stands at this moment
491-
finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"])
492534
# Ensure the finding is not already closed
493535
if not finding.mitigated or not finding.is_mitigated:
494536
logger.debug("mitigating finding: %i:%s", finding.id, finding)

scripts/update_performance_test_counts.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,8 @@ def _extract_call_span(method_content: str, call_name: str) -> tuple[int, int] |
422422
"reimport1_async_tasks": "expected_num_async_tasks2",
423423
"reimport2_queries": "expected_num_queries3",
424424
"reimport2_async_tasks": "expected_num_async_tasks3",
425+
"reimport3_queries": "expected_num_queries4",
426+
"reimport3_async_tasks": "expected_num_async_tasks4",
425427
}
426428
param_map_deduplication = {
427429
"first_import_queries": "expected_num_queries1",
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"service": "StackHawk",
3+
"scanCompleted": {
4+
"scan": {
5+
"comment defect dojo team": "This is an empty StackHawk scan results",
6+
"id": "e2ff5651-7eef-47e9-b743-0c2f7d861e27",
7+
"hawkscanVersion": "2.1.1",
8+
"env": "Development",
9+
"status": "COMPLETED",
10+
"application": "Secured Application",
11+
"startedTimestamp": "2022-02-16T23:07:19.575Z",
12+
"scanURL": "https://app.stackhawk.com/scans/e2ff5651-7eef-47e9-b743-0c2f7d861e27"
13+
},
14+
"scanDuration": "21",
15+
"spiderDuration": "45",
16+
"completedScanStats": {
17+
"urlsCount": "31",
18+
"duration": "66",
19+
"scanResultsStats": {
20+
"totalCount": "0",
21+
"lowCount": "0",
22+
"mediumCount": "0",
23+
"highCount": "0",
24+
"lowTriagedCount": "0",
25+
"mediumTriagedCount": "0",
26+
"highTriagedCount": "0"
27+
}
28+
},
29+
"findings": [
30+
]
31+
}
32+
}

unittests/test_importers_performance.py

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151

5252
STACK_HAWK_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings.json"
5353
STACK_HAWK_SUBSET_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings_subset.json"
54+
STACK_HAWK_EMPTY = get_unit_tests_scans_path("stackhawk") / "stackhawk_empty.json"
5455
STACK_HAWK_SCAN_TYPE = "StackHawk HawkScan"
5556

5657

@@ -126,12 +127,17 @@ def _import_reimport_performance(
126127
expected_num_async_tasks2,
127128
expected_num_queries3,
128129
expected_num_async_tasks3,
130+
expected_num_queries4,
131+
expected_num_async_tasks4,
129132
scan_file1,
130133
scan_file2,
131134
scan_file3,
135+
scan_file4,
132136
scan_type,
133137
product_name,
134138
engagement_name,
139+
*,
140+
close_old_findings4=False,
135141
):
136142
"""
137143
Test import/reimport/reimport performance with specified scan files and scan type.
@@ -195,6 +201,7 @@ def _import_reimport_performance(
195201
"verified": True,
196202
"sync": True,
197203
"scan_type": scan_type,
204+
"service": "Secured Application",
198205
"tags": ["performance-test-reimport", "reimport-tag-in-param", "reimport-go-faster"],
199206
"apply_tags_to_findings": True,
200207
}
@@ -224,18 +231,52 @@ def _import_reimport_performance(
224231
"verified": True,
225232
"sync": True,
226233
"scan_type": scan_type,
234+
"service": "Secured Application",
227235
}
228236
reimporter = DefaultReImporter(**reimport_options)
229237
test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan)
230238

239+
# Fourth import (reimport again, empty report)
240+
# Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run.
241+
# This allows us to see all count mismatches in a single test run, making it easier to fix
242+
# all incorrect expected values at once rather than fixing them one at a time.
243+
# Nested with statements are intentional - each assertion needs its own subTest wrapper.
244+
with ( # noqa: SIM117
245+
self.subTest("reimport3"), impersonate(Dojo_User.objects.get(username="admin")),
246+
scan_file4.open(encoding="utf-8") as scan,
247+
):
248+
with self.subTest(step="reimport3", metric="queries"):
249+
with self.assertNumQueries(expected_num_queries4):
250+
with self.subTest(step="reimport3", metric="async_tasks"):
251+
with self._assertNumAsyncTask(expected_num_async_tasks4):
252+
reimport_options = {
253+
"test": test,
254+
"user": lead,
255+
"lead": lead,
256+
"scan_date": None,
257+
"minimum_severity": "Info",
258+
"active": True,
259+
"verified": True,
260+
"sync": True,
261+
"scan_type": scan_type,
262+
# StackHawk parser sets the service field causing close old findings to fail if we do not specify the service field
263+
# This is a big problem that needs fixing. Parsers should not set the service field.
264+
"service": "Secured Application",
265+
"close_old_findings": close_old_findings4,
266+
}
267+
reimporter = DefaultReImporter(**reimport_options)
268+
test, _, len_new_findings4, len_closed_findings4, _, _, _ = reimporter.process_scan(scan)
269+
logger.info("Step 4: new=%s closed=%s", len_new_findings4, len_closed_findings4)
270+
self.assertGreater(len_closed_findings4, 0, "Step 4 (empty reimport with close_old_findings=True) should close findings")
271+
231272

232273
@tag("performance")
233274
@skip_unless_v2
234275
class TestDojoImporterPerformanceSmall(TestDojoImporterPerformanceBase):
235276

236277
"""Performance tests using small sample files (StackHawk, ~6 findings)."""
237278

238-
def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3):
279+
def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, expected_num_queries4, expected_num_async_tasks4):
239280
"""
240281
Log output can be quite large as when the assertNumQueries fails, all queries are printed.
241282
It could be useful to capture the output in `less`:
@@ -251,12 +292,16 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async
251292
expected_num_async_tasks2,
252293
expected_num_queries3,
253294
expected_num_async_tasks3,
295+
expected_num_queries4,
296+
expected_num_async_tasks4,
254297
scan_file1=STACK_HAWK_SUBSET_FILENAME,
255298
scan_file2=STACK_HAWK_FILENAME,
256299
scan_file3=STACK_HAWK_SUBSET_FILENAME,
300+
scan_file4=STACK_HAWK_EMPTY,
257301
scan_type=STACK_HAWK_SCAN_TYPE,
258302
product_name="TestDojoDefaultImporter",
259303
engagement_name="Test Create Engagement",
304+
close_old_findings4=True,
260305
)
261306

262307
@override_settings(ENABLE_AUDITLOG=True)
@@ -275,6 +320,8 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
275320
expected_num_async_tasks2=17,
276321
expected_num_queries3=108,
277322
expected_num_async_tasks3=16,
323+
expected_num_queries4=155,
324+
expected_num_async_tasks4=6,
278325
)
279326

280327
@override_settings(ENABLE_AUDITLOG=True)
@@ -297,6 +344,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
297344
expected_num_async_tasks2=17,
298345
expected_num_queries3=115,
299346
expected_num_async_tasks3=16,
347+
expected_num_queries4=155,
348+
expected_num_async_tasks4=6,
300349
)
301350

302351
@override_settings(ENABLE_AUDITLOG=True)
@@ -320,6 +369,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
320369
expected_num_async_tasks2=19,
321370
expected_num_queries3=119,
322371
expected_num_async_tasks3=18,
372+
expected_num_queries4=162,
373+
expected_num_async_tasks4=8,
323374
)
324375

325376
# Deduplication is enabled in the tests above, but to properly test it we must run the same import twice and capture the results.
@@ -486,7 +537,7 @@ def setUp(self):
486537
for model in [Location, LocationFindingReference]:
487538
ContentType.objects.get_for_model(model)
488539

489-
def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3):
540+
def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, expected_num_queries4, expected_num_async_tasks4):
490541
r"""
491542
Log output can be quite large as when the assertNumQueries fails, all queries are printed.
492543
It could be useful to capture the output in `less`:
@@ -502,12 +553,16 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async
502553
expected_num_async_tasks2,
503554
expected_num_queries3,
504555
expected_num_async_tasks3,
556+
expected_num_queries4,
557+
expected_num_async_tasks4,
505558
scan_file1=STACK_HAWK_SUBSET_FILENAME,
506559
scan_file2=STACK_HAWK_FILENAME,
507560
scan_file3=STACK_HAWK_SUBSET_FILENAME,
561+
scan_file4=STACK_HAWK_EMPTY,
508562
scan_type=STACK_HAWK_SCAN_TYPE,
509563
product_name="TestDojoDefaultImporterLocations",
510564
engagement_name="Test Create Engagement Locations",
565+
close_old_findings4=True,
511566
)
512567

513568
@override_settings(ENABLE_AUDITLOG=True)
@@ -526,6 +581,8 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
526581
expected_num_async_tasks2=17,
527582
expected_num_queries3=346,
528583
expected_num_async_tasks3=16,
584+
expected_num_queries4=212,
585+
expected_num_async_tasks4=6,
529586
)
530587

531588
@override_settings(ENABLE_AUDITLOG=True)
@@ -548,6 +605,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
548605
expected_num_async_tasks2=17,
549606
expected_num_queries3=355,
550607
expected_num_async_tasks3=16,
608+
expected_num_queries4=212,
609+
expected_num_async_tasks4=6,
551610
)
552611

553612
@override_settings(ENABLE_AUDITLOG=True)
@@ -571,6 +630,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
571630
expected_num_async_tasks2=19,
572631
expected_num_queries3=359,
573632
expected_num_async_tasks3=18,
633+
expected_num_queries4=222,
634+
expected_num_async_tasks4=8,
574635
)
575636

576637
def _deduplication_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, *, check_duplicates=True):

0 commit comments

Comments
 (0)