Skip to content

Commit 0d69df6

Browse files
foundy_by: optimize for dedupe (#13888)
* dedupe: optimize found_by * fix: update expected query counts in performance tests * fix: update expected query counts in performance tests
1 parent ad34f65 commit 0d69df6

3 files changed

Lines changed: 31 additions & 20 deletions

File tree

dojo/finding/deduplication.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,14 @@ def set_duplicate(new_finding, existing_finding):
147147
for find in new_finding.original_finding.all():
148148
new_finding.original_finding.remove(find)
149149
set_duplicate(find, existing_finding)
150-
existing_finding.found_by.add(new_finding.test.test_type)
150+
# Only add test type to found_by if it is not already present.
151+
# This is efficient because `found_by` is prefetched for candidates via `build_dedupe_scope_queryset()`.
152+
test_type = getattr(getattr(new_finding, "test", None), "test_type", None)
153+
if test_type is not None and test_type not in existing_finding.found_by.all():
154+
existing_finding.found_by.add(test_type)
155+
156+
# existing_finding.found_by.add(new_finding.test.test_type)
157+
151158
logger.debug("saving new finding: %d", new_finding.id)
152159
super(Finding, new_finding).save()
153160
logger.debug("saving existing finding: %d", existing_finding.id)
@@ -239,7 +246,7 @@ def build_dedupe_scope_queryset(test):
239246
return (
240247
Finding.objects.filter(scope_q)
241248
.select_related("test", "test__engagement", "test__test_type")
242-
.prefetch_related("endpoints")
249+
.prefetch_related("endpoints", "found_by")
243250
)
244251

245252

dojo/models.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2766,6 +2766,8 @@ def save(self, dedupe_option=True, rules_option=True, product_grading_option=Tru
27662766
logger.debug("Start saving finding of id " + str(self.id) + " dedupe_option:" + str(dedupe_option) + " (self.pk is %s)", "None" if self.pk is None else "not None")
27672767
from dojo.finding import helper as finding_helper # noqa: PLC0415 circular import
27682768

2769+
is_new_finding = self.pk is None
2770+
27692771
# if not isinstance(self.date, (datetime, date)):
27702772
# raise ValidationError(_("The 'date' field must be a valid date or datetime object."))
27712773

@@ -2809,7 +2811,7 @@ def save(self, dedupe_option=True, rules_option=True, product_grading_option=Tru
28092811

28102812
self.set_hash_code(dedupe_option)
28112813

2812-
if self.pk is None:
2814+
if is_new_finding:
28132815
# We enter here during the first call from serializers.py
28142816
from dojo.utils import apply_cwe_to_template # noqa: PLC0415 circular import
28152817
# No need to use the returned variable since `self` Is updated in memory
@@ -2838,7 +2840,9 @@ def save(self, dedupe_option=True, rules_option=True, product_grading_option=Tru
28382840
logger.debug("Saving finding of id " + str(self.id) + " dedupe_option:" + str(dedupe_option) + " (self.pk is %s)", "None" if self.pk is None else "not None")
28392841
super().save(*args, **kwargs)
28402842

2841-
self.found_by.add(self.test.test_type)
2843+
# Only add to found_by for newly-created findings (avoid doing this on every update)
2844+
if is_new_finding:
2845+
self.found_by.add(self.test.test_type)
28422846

28432847
# only perform post processing (in celery task) if needed. this check avoids submitting 1000s of tasks to celery that will do nothing
28442848
system_settings = System_Settings.objects.get()

unittests/test_importers_performance.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -217,11 +217,11 @@ def test_import_reimport_reimport_performance_no_async(self):
217217
testuser.usercontactinfo.block_execution = True
218218
testuser.usercontactinfo.save()
219219
self._import_reimport_performance(
220-
expected_num_queries1=345,
220+
expected_num_queries1=346,
221221
expected_num_async_tasks1=6,
222-
expected_num_queries2=293,
222+
expected_num_queries2=294,
223223
expected_num_async_tasks2=17,
224-
expected_num_queries3=180,
224+
expected_num_queries3=181,
225225
expected_num_async_tasks3=16,
226226
)
227227

@@ -239,11 +239,11 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
239239
testuser.usercontactinfo.save()
240240

241241
self._import_reimport_performance(
242-
expected_num_queries1=311,
242+
expected_num_queries1=312,
243243
expected_num_async_tasks1=6,
244-
expected_num_queries2=286,
244+
expected_num_queries2=287,
245245
expected_num_async_tasks2=17,
246-
expected_num_queries3=175,
246+
expected_num_queries3=176,
247247
expected_num_async_tasks3=16,
248248
)
249249

@@ -265,11 +265,11 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self
265265
self.system_settings(enable_product_grade=True)
266266

267267
self._import_reimport_performance(
268-
expected_num_queries1=347,
268+
expected_num_queries1=348,
269269
expected_num_async_tasks1=8,
270-
expected_num_queries2=295,
270+
expected_num_queries2=296,
271271
expected_num_async_tasks2=19,
272-
expected_num_queries3=182,
272+
expected_num_queries3=183,
273273
expected_num_async_tasks3=18,
274274
)
275275

@@ -288,11 +288,11 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
288288
self.system_settings(enable_product_grade=True)
289289

290290
self._import_reimport_performance(
291-
expected_num_queries1=313,
291+
expected_num_queries1=314,
292292
expected_num_async_tasks1=8,
293-
expected_num_queries2=288,
293+
expected_num_queries2=289,
294294
expected_num_async_tasks2=19,
295-
expected_num_queries3=177,
295+
expected_num_queries3=178,
296296
expected_num_async_tasks3=18,
297297
)
298298

@@ -449,9 +449,9 @@ def test_deduplication_performance_no_async(self):
449449
testuser.usercontactinfo.save()
450450

451451
self._deduplication_performance(
452-
expected_num_queries1=316,
452+
expected_num_queries1=317,
453453
expected_num_async_tasks1=7,
454-
expected_num_queries2=287,
454+
expected_num_queries2=282,
455455
expected_num_async_tasks2=7,
456456
)
457457

@@ -469,8 +469,8 @@ def test_deduplication_performance_pghistory_no_async(self):
469469
testuser.usercontactinfo.save()
470470

471471
self._deduplication_performance(
472-
expected_num_queries1=280,
472+
expected_num_queries1=281,
473473
expected_num_async_tasks1=7,
474-
expected_num_queries2=250,
474+
expected_num_queries2=245,
475475
expected_num_async_tasks2=7,
476476
)

0 commit comments

Comments
 (0)