Skip to content

Commit f38cb19

Browse files
capture dedupe performance
1 parent 080d4c6 commit f38cb19

1 file changed

Lines changed: 192 additions & 12 deletions

File tree

unittests/test_importers_performance.py

Lines changed: 192 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ def setUp(self):
4646
self.system_settings(enable_webhooks_notifications=False)
4747
self.system_settings(enable_product_grade=False)
4848
self.system_settings(enable_github=False)
49+
self.system_settings(enable_deduplication=True)
4950

5051
# Warm up ContentType cache for relevant models. This is needed if we want to be able to run the test in isolation
5152
# As part of the test suite, the ContentType ids will already be cached and won't affect the query count.
@@ -219,11 +220,11 @@ def test_import_reimport_reimport_performance_no_async(self):
219220
testuser.usercontactinfo.block_execution = True
220221
testuser.usercontactinfo.save()
221222
self._import_reimport_performance(
222-
expected_num_queries1=593,
223+
expected_num_queries1=603,
223224
expected_num_async_tasks1=10,
224-
expected_num_queries2=503,
225+
expected_num_queries2=515,
225226
expected_num_async_tasks2=22,
226-
expected_num_queries3=294,
227+
expected_num_queries3=304,
227228
expected_num_async_tasks3=20,
228229
)
229230

@@ -241,11 +242,11 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
241242
testuser.usercontactinfo.save()
242243

243244
self._import_reimport_performance(
244-
expected_num_queries1=559,
245+
expected_num_queries1=569,
245246
expected_num_async_tasks1=10,
246-
expected_num_queries2=496,
247+
expected_num_queries2=508,
247248
expected_num_async_tasks2=22,
248-
expected_num_queries3=289,
249+
expected_num_queries3=299,
249250
expected_num_async_tasks3=20,
250251
)
251252

@@ -267,11 +268,11 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self
267268
self.system_settings(enable_product_grade=True)
268269

269270
self._import_reimport_performance(
270-
expected_num_queries1=594,
271+
expected_num_queries1=604,
271272
expected_num_async_tasks1=11,
272-
expected_num_queries2=504,
273+
expected_num_queries2=516,
273274
expected_num_async_tasks2=23,
274-
expected_num_queries3=295,
275+
expected_num_queries3=305,
275276
expected_num_async_tasks3=21,
276277
)
277278

@@ -290,10 +291,189 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
290291
self.system_settings(enable_product_grade=True)
291292

292293
self._import_reimport_performance(
293-
expected_num_queries1=560,
294+
expected_num_queries1=570,
294295
expected_num_async_tasks1=11,
295-
expected_num_queries2=497,
296+
expected_num_queries2=509,
296297
expected_num_async_tasks2=23,
297-
expected_num_queries3=290,
298+
expected_num_queries3=300,
298299
expected_num_async_tasks3=21,
299300
)
301+
302+
# Deduplication is enabled in the tests above, but to properly test it we must run the same import twice and capture the results.
303+
def _deduplication_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, *, check_duplicates=True):
    """
    Measure deduplication cost by importing the same scan twice into one engagement.

    This differs from a reimport: every import creates a new test, so the
    findings of the second import are expected to end up as duplicates of
    the findings created by the first import.

    Args:
        expected_num_queries1: query count passed to assertNumQueries for the first import.
        expected_num_async_tasks1: task count passed to _assertNumAsyncTask for the first import.
        expected_num_queries2: query count passed to assertNumQueries for the second import.
        expected_num_async_tasks2: task count passed to _assertNumAsyncTask for the second import.
        check_duplicates: when True, also assert the active/duplicate split stored
            in the database. Callers pass False when deduplication runs
            asynchronously and may not be finished when the imports return.

    """
    product_type, _created = Product_Type.objects.get_or_create(name="test")
    product, _created = Product.objects.get_or_create(
        name="TestDojoDeduplicationPerformance",
        prod_type=product_type,
    )
    # The dates belong in ``defaults``: they are only needed when the engagement
    # is created. As plain keyword arguments they would be part of the lookup,
    # and a value computed at call time is not a stable lookup key.
    now = timezone.now()
    engagement, _created = Engagement.objects.get_or_create(
        name="Test Deduplication Performance Engagement",
        product=product,
        defaults={"target_start": now, "target_end": now},
    )
    lead, _ = User.objects.get_or_create(username="admin")
    environment, _ = Development_Environment.objects.get_or_create(name="Development")

    def run_import(label, expected_num_queries, expected_num_async_tasks):
        """Import the StackHawk scan once and return (new findings, closed findings)."""
        # The context managers are entered left to right, so the admin lookup
        # for impersonate() happens before assertNumQueries starts counting.
        with (
            self.subTest(label), impersonate(Dojo_User.objects.get(username="admin")),
            self.assertNumQueries(expected_num_queries),
            self._assertNumAsyncTask(expected_num_async_tasks),
            STACK_HAWK_FILENAME.open(encoding="utf-8") as scan,
        ):
            import_options = {
                "user": lead,
                "lead": lead,
                "scan_date": None,
                "environment": environment,
                "minimum_severity": "Info",
                "active": True,
                "verified": True,
                "scan_type": STACK_HAWK_SCAN_TYPE,
                "engagement": engagement,
            }
            importer = DefaultImporter(**import_options)
            _, _, len_new_findings, len_closed_findings, _, _, _ = importer.process_scan(scan)
        # Return after the block so the counts are still available when an exit
        # check of one of the context managers fails inside the sub-test.
        return len_new_findings, len_closed_findings

    # First import - all findings should be new
    len_new_findings1, len_closed_findings1 = run_import(
        "first_import", expected_num_queries1, expected_num_async_tasks1,
    )

    # Second import - all findings should be duplicates
    len_new_findings2, len_closed_findings2 = run_import(
        "second_import", expected_num_queries2, expected_num_async_tasks2,
    )

    # Log the results for analysis
    logger.debug(f"First import: {len_new_findings1} new findings, {len_closed_findings1} closed findings")
    logger.debug(f"Second import: {len_new_findings2} new findings, {len_closed_findings2} closed findings")

    # The importer statistics do not reflect deduplication: the second import
    # still reports 6 new findings even though they are expected to be duplicates.
    self.assertEqual(len_new_findings1, 6, "First import should create 6 new findings")
    self.assertEqual(len_closed_findings1, 0, "First import should not close any findings")
    self.assertEqual(len_new_findings2, 6, "Second import should report 6 new findings initially (before deduplication)")
    self.assertEqual(len_closed_findings2, 0, "Second import should not close any findings")

    # The duplicate split is only checked when the caller asks for it
    # (check_duplicates=False is used for the async variants).
    if check_duplicates:
        # Count active findings (non-duplicates) in the engagement
        active_findings = Finding.objects.filter(
            test__engagement=engagement,
            active=True,
            duplicate=False,
        ).count()

        # Count duplicate findings in the engagement
        duplicate_findings = Finding.objects.filter(
            test__engagement=engagement,
            duplicate=True,
        ).count()

        # We should have 6 active findings (from first import) and 6 duplicate findings (from second import)
        self.assertEqual(active_findings, 6, f"Expected 6 active findings, got {active_findings}")
        self.assertEqual(duplicate_findings, 6, f"Expected 6 duplicate findings, got {duplicate_findings}")

    # In every mode both imports must have stored their findings: 12 in total.
    total_findings = Finding.objects.filter(test__engagement=engagement).count()
    self.assertEqual(total_findings, 12, f"Expected 12 total findings, got {total_findings}")
403+
404+
@override_settings(ENABLE_AUDITLOG=True, AUDITLOG_TYPE="django-auditlog")
def test_deduplication_performance_async(self):
    """
    Pin the query and async-task counts of a double import with django-auditlog.

    The same scan is imported twice with async execution left enabled, so the
    duplicate split in the database is not asserted (check_duplicates=False).
    """
    configure_audit_system()
    configure_pghistory_triggers()

    # Make sure deduplication is switched on for this run.
    self.system_settings(enable_deduplication=True)

    self._deduplication_performance(
        # first import
        expected_num_queries1=660,
        expected_num_async_tasks1=12,
        # second import
        expected_num_queries2=519,
        expected_num_async_tasks2=12,
        # Async mode - deduplication happens later
        check_duplicates=False,
    )
423+
424+
@override_settings(ENABLE_AUDITLOG=True, AUDITLOG_TYPE="django-pghistory")
def test_deduplication_performance_pghistory_async(self):
    """
    Pin the query and async-task counts of a double import with django-pghistory.

    The same scan is imported twice with async execution left enabled, so the
    duplicate split in the database is not asserted (check_duplicates=False).
    """
    configure_audit_system()
    configure_pghistory_triggers()

    # Make sure deduplication is switched on for this run.
    self.system_settings(enable_deduplication=True)

    self._deduplication_performance(
        # first import
        expected_num_queries1=624,
        expected_num_async_tasks1=12,
        # second import
        expected_num_queries2=500,
        expected_num_async_tasks2=12,
        # Async mode - deduplication happens later
        check_duplicates=False,
    )
440+
441+
@override_settings(ENABLE_AUDITLOG=True, AUDITLOG_TYPE="django-auditlog")
def test_deduplication_performance_no_async(self):
    """
    Pin the query and async-task counts of a double import with django-auditlog.

    block_execution is set on the admin user's contact info before the
    imports, and the duplicate split in the database is asserted as well
    (check_duplicates keeps its default).
    """
    configure_audit_system()
    configure_pghistory_triggers()

    # Make sure deduplication is switched on for this run.
    self.system_settings(enable_deduplication=True)

    # Flag the admin user for blocking execution and persist the flag.
    contact_info = User.objects.get(username="admin").usercontactinfo
    contact_info.block_execution = True
    contact_info.save()

    self._deduplication_performance(
        # first import
        expected_num_queries1=672,
        expected_num_async_tasks1=12,
        # second import
        expected_num_queries2=633,
        expected_num_async_tasks2=12,
    )
460+
461+
@override_settings(ENABLE_AUDITLOG=True, AUDITLOG_TYPE="django-pghistory")
def test_deduplication_performance_pghistory_no_async(self):
    """
    Pin the query and async-task counts of a double import with django-pghistory.

    block_execution is set on the admin user's contact info before the
    imports, and the duplicate split in the database is asserted as well
    (check_duplicates keeps its default).
    """
    configure_audit_system()
    configure_pghistory_triggers()

    # Make sure deduplication is switched on for this run.
    self.system_settings(enable_deduplication=True)

    # Flag the admin user for blocking execution and persist the flag.
    contact_info = User.objects.get(username="admin").usercontactinfo
    contact_info.block_execution = True
    contact_info.save()

    self._deduplication_performance(
        # first import
        expected_num_queries1=636,
        expected_num_async_tasks1=12,
        # second import
        expected_num_queries2=596,
        expected_num_async_tasks2=12,
    )

0 commit comments

Comments
 (0)