Skip to content

Commit e8100cb

Browse files
Reimport: batch-refresh finding status fields in close_old_findings (#14638)
* Batch-refresh close_old_findings status fields to avoid N refresh_from_db queries. Replace per-finding refresh_from_db(false_p, risk_accepted, out_of_scope) with one values() query for all PKs and assign onto instances, falling back to refresh_from_db when a row is missing. * docs: cite #12291 for close_old_findings status refresh origin * perf: chunk close_old_findings status sync queries (1000 PKs per SELECT) * tests: add 4th reimport step (empty scan, close_old_findings) to performance tests - Add step 4 to _import_reimport_performance: reimport with an empty StackHawk scan and close_old_findings=True, verifying findings are actually closed (assertGreater on len_closed_findings). - Fix all reimport steps to pass service="Secured Application" so the reimporter's service filter matches findings produced by the StackHawk parser (which sets service from the scan's application field). Without this, original_items was always empty and no matching/closing occurred. - Add stackhawk_empty.json scan fixture. - Fix update_performance_test_counts.py to handle reimport3 (step 4) by adding reimport3_queries/reimport3_async_tasks to param_map. - Update all expected query/task counts for both Small and Locations test classes to reflect the new step and the batch status-sync fix.
1 parent f3d692a commit e8100cb

4 files changed

Lines changed: 147 additions & 10 deletions

File tree

dojo/importers/default_reimporter.py

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from itertools import batched
23

34
from django.conf import settings
45
from django.core.files.uploadedfile import TemporaryUploadedFile
@@ -30,6 +31,9 @@
3031
logger = logging.getLogger(__name__)
3132
deduplicationLogger = logging.getLogger("dojo.specific-loggers.deduplication")
3233

34+
# Bound IN-list size when bulk-loading status fields for close_old_findings.
35+
_CLOSE_OLD_FINDINGS_STATUS_FIELDS_CHUNK = 1000
36+
3337

3438
class DefaultReImporterOptions(ImporterOptions):
3539
def validate_test(
@@ -465,6 +469,44 @@ def process_findings(
465469

466470
return self.new_items, self.reactivated_items, self.to_mitigate, self.untouched
467471

472+
def _sync_close_old_finding_status_fields(self, findings: list[Finding]) -> None:
    """
    Refresh false_p, risk_accepted, and out_of_scope from the DB for each finding.

    These status fields can change earlier during reimport (e.g. a finding being
    marked false positive) while the in-memory instances handled here are stale.
    Per-finding refresh_from_db in close_old_findings was added in
    https://github.com/DefectDojo/django-DefectDojo/pull/12291; a naive refresh
    issues one SELECT per finding, so we batch one values() query per chunk of
    primary keys and fall back to refresh_from_db only when a row is missing.

    This really should be fixed differently, but for now we at least optimize it
    to be done in bulk.
    """
    status_fields = ("false_p", "risk_accepted", "out_of_scope")

    # Unsaved instances (pk is None) have no DB row, so refresh_from_db would
    # always raise Finding.DoesNotExist for them; there is nothing fresher than
    # their in-memory values, so they are skipped rather than refreshed.
    findings_with_pk = [finding for finding in findings if finding.pk is not None]

    # Chunk the pk list to bound the size of each IN (...) clause.
    for chunk in batched(findings_with_pk, _CLOSE_OLD_FINDINGS_STATUS_FIELDS_CHUNK, strict=False):
        ids = [finding.pk for finding in chunk]
        fresh_by_id = {
            row["id"]: row
            for row in Finding.objects.filter(pk__in=ids).values("id", *status_fields)
        }
        for finding in chunk:
            row = fresh_by_id.get(finding.pk)
            if row is not None:
                for field in status_fields:
                    setattr(finding, field, row[field])
            else:
                # Row disappeared between queries; fall back to the original
                # per-instance refresh to preserve existing error semantics.
                finding.refresh_from_db(fields=list(status_fields))
509+
468510
def close_old_findings(
469511
self,
470512
findings: list[Finding],
@@ -478,17 +520,17 @@ def close_old_findings(
478520
if self.close_old_findings_toggle is False:
479521
return []
480522
logger.debug("REIMPORT_SCAN: Closing findings no longer present in scan report")
523+
# Get any status changes that could have occurred earlier in the process
524+
# for special statuses only.
525+
# An example of such is a finding being reported as false positive, and
526+
# reimport makes this change in the database. However, the findings here
527+
# are calculated based from the original values before the reimport, so
528+
# any updates made during reimport are discarded without first getting the
529+
# state of the finding as it stands at this moment (django-DefectDojo #12291).
530+
self._sync_close_old_finding_status_fields(findings)
481531
# Determine if pushing to jira or if the finding groups are enabled
482532
mitigated_findings = []
483533
for finding in findings:
484-
# Get any status changes that could have occurred earlier in the process
485-
# for special statuses only.
486-
# An example of such is a finding being reported as false positive, and
487-
# reimport makes this change in the database. However, the findings here
488-
# are calculated based from the original values before the reimport, so
489-
# any updates made during reimport are discarded without first getting the
490-
# state of the finding as it stands at this moment
491-
finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"])
492534
# Ensure the finding is not already closed
493535
if not finding.mitigated or not finding.is_mitigated:
494536
logger.debug("mitigating finding: %i:%s", finding.id, finding)

scripts/update_performance_test_counts.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,8 @@ def _extract_call_span(method_content: str, call_name: str) -> tuple[int, int] |
422422
"reimport1_async_tasks": "expected_num_async_tasks2",
423423
"reimport2_queries": "expected_num_queries3",
424424
"reimport2_async_tasks": "expected_num_async_tasks3",
425+
"reimport3_queries": "expected_num_queries4",
426+
"reimport3_async_tasks": "expected_num_async_tasks4",
425427
}
426428
param_map_deduplication = {
427429
"first_import_queries": "expected_num_queries1",
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"service": "StackHawk",
3+
"scanCompleted": {
4+
"scan": {
5+
"comment defect dojo team": "This is an empty StackHawk scan results",
6+
"id": "e2ff5651-7eef-47e9-b743-0c2f7d861e27",
7+
"hawkscanVersion": "2.1.1",
8+
"env": "Development",
9+
"status": "COMPLETED",
10+
"application": "Secured Application",
11+
"startedTimestamp": "2022-02-16T23:07:19.575Z",
12+
"scanURL": "https://app.stackhawk.com/scans/e2ff5651-7eef-47e9-b743-0c2f7d861e27"
13+
},
14+
"scanDuration": "21",
15+
"spiderDuration": "45",
16+
"completedScanStats": {
17+
"urlsCount": "31",
18+
"duration": "66",
19+
"scanResultsStats": {
20+
"totalCount": "0",
21+
"lowCount": "0",
22+
"mediumCount": "0",
23+
"highCount": "0",
24+
"lowTriagedCount": "0",
25+
"mediumTriagedCount": "0",
26+
"highTriagedCount": "0"
27+
}
28+
},
29+
"findings": [
30+
]
31+
}
32+
}

unittests/test_importers_performance.py

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151

5252
STACK_HAWK_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings.json"
5353
STACK_HAWK_SUBSET_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings_subset.json"
54+
STACK_HAWK_EMPTY = get_unit_tests_scans_path("stackhawk") / "stackhawk_empty.json"
5455
STACK_HAWK_SCAN_TYPE = "StackHawk HawkScan"
5556

5657

@@ -126,12 +127,17 @@ def _import_reimport_performance(
126127
expected_num_async_tasks2,
127128
expected_num_queries3,
128129
expected_num_async_tasks3,
130+
expected_num_queries4,
131+
expected_num_async_tasks4,
129132
scan_file1,
130133
scan_file2,
131134
scan_file3,
135+
scan_file4,
132136
scan_type,
133137
product_name,
134138
engagement_name,
139+
*,
140+
close_old_findings4=False,
135141
):
136142
"""
137143
Test import/reimport/reimport performance with specified scan files and scan type.
@@ -195,6 +201,7 @@ def _import_reimport_performance(
195201
"verified": True,
196202
"sync": True,
197203
"scan_type": scan_type,
204+
"service": "Secured Application",
198205
"tags": ["performance-test-reimport", "reimport-tag-in-param", "reimport-go-faster"],
199206
"apply_tags_to_findings": True,
200207
}
@@ -224,18 +231,52 @@ def _import_reimport_performance(
224231
"verified": True,
225232
"sync": True,
226233
"scan_type": scan_type,
234+
"service": "Secured Application",
227235
}
228236
reimporter = DefaultReImporter(**reimport_options)
229237
test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan)
230238

239+
# Fourth import (reimport again, empty report)
240+
# Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run.
241+
# This allows us to see all count mismatches in a single test run, making it easier to fix
242+
# all incorrect expected values at once rather than fixing them one at a time.
243+
# Nested with statements are intentional - each assertion needs its own subTest wrapper.
244+
with ( # noqa: SIM117
245+
self.subTest("reimport3"), impersonate(Dojo_User.objects.get(username="admin")),
246+
scan_file4.open(encoding="utf-8") as scan,
247+
):
248+
with self.subTest(step="reimport3", metric="queries"):
249+
with self.assertNumQueries(expected_num_queries4):
250+
with self.subTest(step="reimport3", metric="async_tasks"):
251+
with self._assertNumAsyncTask(expected_num_async_tasks4):
252+
reimport_options = {
253+
"test": test,
254+
"user": lead,
255+
"lead": lead,
256+
"scan_date": None,
257+
"minimum_severity": "Info",
258+
"active": True,
259+
"verified": True,
260+
"sync": True,
261+
"scan_type": scan_type,
262+
# StackHawk parser sets the service field causing close old findings to fail if we do not specify the service field
263+
# This is a big problem that needs fixing. Parsers should not set the service field.
264+
"service": "Secured Application",
265+
"close_old_findings": close_old_findings4,
266+
}
267+
reimporter = DefaultReImporter(**reimport_options)
268+
test, _, len_new_findings4, len_closed_findings4, _, _, _ = reimporter.process_scan(scan)
269+
logger.info("Step 4: new=%s closed=%s", len_new_findings4, len_closed_findings4)
270+
self.assertGreater(len_closed_findings4, 0, "Step 4 (empty reimport with close_old_findings=True) should close findings")
271+
231272

232273
@tag("performance")
233274
@skip_unless_v2
234275
class TestDojoImporterPerformanceSmall(TestDojoImporterPerformanceBase):
235276

236277
"""Performance tests using small sample files (StackHawk, ~6 findings)."""
237278

238-
def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3):
279+
def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, expected_num_queries4, expected_num_async_tasks4):
239280
"""
240281
Log output can be quite large as when the assertNumQueries fails, all queries are printed.
241282
It could be useful to capture the output in `less`:
@@ -251,12 +292,16 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async
251292
expected_num_async_tasks2,
252293
expected_num_queries3,
253294
expected_num_async_tasks3,
295+
expected_num_queries4,
296+
expected_num_async_tasks4,
254297
scan_file1=STACK_HAWK_SUBSET_FILENAME,
255298
scan_file2=STACK_HAWK_FILENAME,
256299
scan_file3=STACK_HAWK_SUBSET_FILENAME,
300+
scan_file4=STACK_HAWK_EMPTY,
257301
scan_type=STACK_HAWK_SCAN_TYPE,
258302
product_name="TestDojoDefaultImporter",
259303
engagement_name="Test Create Engagement",
304+
close_old_findings4=True,
260305
)
261306

262307
@override_settings(ENABLE_AUDITLOG=True)
@@ -275,6 +320,8 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
275320
expected_num_async_tasks2=17,
276321
expected_num_queries3=108,
277322
expected_num_async_tasks3=16,
323+
expected_num_queries4=155,
324+
expected_num_async_tasks4=6,
278325
)
279326

280327
@override_settings(ENABLE_AUDITLOG=True)
@@ -297,6 +344,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
297344
expected_num_async_tasks2=17,
298345
expected_num_queries3=115,
299346
expected_num_async_tasks3=16,
347+
expected_num_queries4=155,
348+
expected_num_async_tasks4=6,
300349
)
301350

302351
@override_settings(ENABLE_AUDITLOG=True)
@@ -320,6 +369,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
320369
expected_num_async_tasks2=19,
321370
expected_num_queries3=119,
322371
expected_num_async_tasks3=18,
372+
expected_num_queries4=162,
373+
expected_num_async_tasks4=8,
323374
)
324375

325376
# Deduplication is enabled in the tests above, but to properly test it we must run the same import twice and capture the results.
@@ -486,7 +537,7 @@ def setUp(self):
486537
for model in [Location, LocationFindingReference]:
487538
ContentType.objects.get_for_model(model)
488539

489-
def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3):
540+
def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, expected_num_queries4, expected_num_async_tasks4):
490541
r"""
491542
Log output can be quite large as when the assertNumQueries fails, all queries are printed.
492543
It could be useful to capture the output in `less`:
@@ -502,12 +553,16 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async
502553
expected_num_async_tasks2,
503554
expected_num_queries3,
504555
expected_num_async_tasks3,
556+
expected_num_queries4,
557+
expected_num_async_tasks4,
505558
scan_file1=STACK_HAWK_SUBSET_FILENAME,
506559
scan_file2=STACK_HAWK_FILENAME,
507560
scan_file3=STACK_HAWK_SUBSET_FILENAME,
561+
scan_file4=STACK_HAWK_EMPTY,
508562
scan_type=STACK_HAWK_SCAN_TYPE,
509563
product_name="TestDojoDefaultImporterLocations",
510564
engagement_name="Test Create Engagement Locations",
565+
close_old_findings4=True,
511566
)
512567

513568
@override_settings(ENABLE_AUDITLOG=True)
@@ -526,6 +581,8 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
526581
expected_num_async_tasks2=17,
527582
expected_num_queries3=346,
528583
expected_num_async_tasks3=16,
584+
expected_num_queries4=212,
585+
expected_num_async_tasks4=6,
529586
)
530587

531588
@override_settings(ENABLE_AUDITLOG=True)
@@ -548,6 +605,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
548605
expected_num_async_tasks2=17,
549606
expected_num_queries3=355,
550607
expected_num_async_tasks3=16,
608+
expected_num_queries4=212,
609+
expected_num_async_tasks4=6,
551610
)
552611

553612
@override_settings(ENABLE_AUDITLOG=True)
@@ -571,6 +630,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
571630
expected_num_async_tasks2=19,
572631
expected_num_queries3=359,
573632
expected_num_async_tasks3=18,
633+
expected_num_queries4=222,
634+
expected_num_async_tasks4=8,
574635
)
575636

576637
def _deduplication_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, *, check_duplicates=True):

0 commit comments

Comments
 (0)