fix(dedupe): prevent duplicate test processing in batch dedupe command

valentijnscholten · valentijnscholten · commit fa5de85fc22c · 2026-03-28T10:20:09.000+01:00
Finding.Meta.ordering includes multiple columns (numerical_severity, date,
title, epss_score, epss_percentile). When Django generates
SELECT DISTINCT test_id ... ORDER BY those columns, PostgreSQL requires
them in the SELECT list, so Django silently adds them. The DISTINCT then
operates on the full tuple instead of test_id alone, causing the same test
to appear multiple times in the iterator and be processed repeatedly.

Fix by calling .order_by("test_id") before .values_list().distinct() to
override the model-level ordering, so the generated query is
SELECT DISTINCT test_id ... ORDER BY test_id.
diff --git a/dojo/management/commands/dedupe.py b/dojo/management/commands/dedupe.py
@@ -171,7 +171,11 @@ def _dedupe_batch_mode(self, findings_queryset, *, dedupe_sync: bool = True):
         logger.info(f"Processing {total_findings} findings in batches of max {batch_max_size} per test ({mode_str})")
 
         # Group findings by test_id to process them in batches per test
-        test_ids = findings_queryset.values_list("test_id", flat=True).distinct()
+        # Use order_by("test_id") to override the Finding model's default ordering
+        # (numerical_severity, date, title, ...). Without this, Django adds those
+        # ordering columns to the SELECT, so DISTINCT applies to the whole tuple and
+        # the same test_id can be returned (and processed) multiple times.
+        test_ids = findings_queryset.order_by("test_id").values_list("test_id", flat=True).distinct()
         total_tests = len(test_ids)
         total_processed = 0