@@ -46,6 +46,7 @@ def setUp(self):
4646 self .system_settings (enable_webhooks_notifications = False )
4747 self .system_settings (enable_product_grade = False )
4848 self .system_settings (enable_github = False )
49+ self .system_settings (enable_deduplication = True )
4950
5051 # Warm up ContentType cache for relevant models. This is needed if we want to be able to run the test in isolation
5152 # As part of the test suite the ContentType ids will already be cached and won't affect the query count.
@@ -219,11 +220,11 @@ def test_import_reimport_reimport_performance_no_async(self):
219220 testuser .usercontactinfo .block_execution = True
220221 testuser .usercontactinfo .save ()
221222 self ._import_reimport_performance (
222- expected_num_queries1 = 593 ,
223+ expected_num_queries1 = 603 ,
223224 expected_num_async_tasks1 = 10 ,
224- expected_num_queries2 = 503 ,
225+ expected_num_queries2 = 515 ,
225226 expected_num_async_tasks2 = 22 ,
226- expected_num_queries3 = 294 ,
227+ expected_num_queries3 = 304 ,
227228 expected_num_async_tasks3 = 20 ,
228229 )
229230
@@ -241,11 +242,11 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
241242 testuser .usercontactinfo .save ()
242243
243244 self ._import_reimport_performance (
244- expected_num_queries1 = 559 ,
245+ expected_num_queries1 = 569 ,
245246 expected_num_async_tasks1 = 10 ,
246- expected_num_queries2 = 496 ,
247+ expected_num_queries2 = 508 ,
247248 expected_num_async_tasks2 = 22 ,
248- expected_num_queries3 = 289 ,
249+ expected_num_queries3 = 299 ,
249250 expected_num_async_tasks3 = 20 ,
250251 )
251252
@@ -267,11 +268,11 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self
267268 self .system_settings (enable_product_grade = True )
268269
269270 self ._import_reimport_performance (
270- expected_num_queries1 = 594 ,
271+ expected_num_queries1 = 604 ,
271272 expected_num_async_tasks1 = 11 ,
272- expected_num_queries2 = 504 ,
273+ expected_num_queries2 = 516 ,
273274 expected_num_async_tasks2 = 23 ,
274- expected_num_queries3 = 295 ,
275+ expected_num_queries3 = 305 ,
275276 expected_num_async_tasks3 = 21 ,
276277 )
277278
@@ -290,10 +291,189 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
290291 self .system_settings (enable_product_grade = True )
291292
292293 self ._import_reimport_performance (
293- expected_num_queries1 = 560 ,
294+ expected_num_queries1 = 570 ,
294295 expected_num_async_tasks1 = 11 ,
295- expected_num_queries2 = 497 ,
296+ expected_num_queries2 = 509 ,
296297 expected_num_async_tasks2 = 23 ,
297- expected_num_queries3 = 290 ,
298+ expected_num_queries3 = 300 ,
298299 expected_num_async_tasks3 = 21 ,
299300 )
301+
302+ # Deduplication is enabled in the tests above, but to properly test it we must run the same import twice and capture the results.
def _deduplication_performance(
    self,
    expected_num_queries1,
    expected_num_async_tasks1,
    expected_num_queries2,
    expected_num_async_tasks2,
    *,
    check_duplicates=True,
):
    """
    Import the StackHawk sample scan twice into one engagement and check the cost of each import.

    Every import goes through a fresh DefaultImporter (not a reimport) and is wrapped in
    assertNumQueries / _assertNumAsyncTask using the matching expected_* argument.
    Afterwards the statistics returned by process_scan and the findings stored for the
    engagement are verified.

    expected_num_queries1 / expected_num_async_tasks1: budget for the first import.
    expected_num_queries2 / expected_num_async_tasks2: budget for the second import.
    check_duplicates: when True, also assert that 6 findings are active non-duplicates and
        6 are flagged as duplicates; when False only the total of 12 findings is asserted.
    """
    prod_type = Product_Type.objects.get_or_create(name="test")[0]
    dedupe_product = Product.objects.get_or_create(
        name="TestDojoDeduplicationPerformance",
        prod_type=prod_type,
    )[0]
    engagement = Engagement.objects.get_or_create(
        name="Test Deduplication Performance Engagement",
        product=dedupe_product,
        target_start=timezone.now(),
        target_end=timezone.now(),
    )[0]
    admin_user = User.objects.get_or_create(username="admin")[0]
    dev_environment = Development_Environment.objects.get_or_create(name="Development")[0]

    # Both imports use exactly the same options; only the importer instance is new each time.
    import_options = {
        "user": admin_user,
        "lead": admin_user,
        "scan_date": None,
        "environment": dev_environment,
        "minimum_severity": "Info",
        "active": True,
        "verified": True,
        "scan_type": STACK_HAWK_SCAN_TYPE,
        "engagement": engagement,
    }

    # First pass: nothing exists yet, so every finding is expected to be new.
    # The importer is constructed inside the measured region, so its queries are counted too.
    with (
        self.subTest("first_import"), impersonate(Dojo_User.objects.get(username="admin")),
        self.assertNumQueries(expected_num_queries1),
        self._assertNumAsyncTask(expected_num_async_tasks1),
        STACK_HAWK_FILENAME.open(encoding="utf-8") as report_file,
    ):
        _, _, len_new_findings1, len_closed_findings1, _, _, _ = DefaultImporter(**import_options).process_scan(report_file)

    # Second pass: same report, new test, so the findings should end up as duplicates.
    with (
        self.subTest("second_import"), impersonate(Dojo_User.objects.get(username="admin")),
        self.assertNumQueries(expected_num_queries2),
        self._assertNumAsyncTask(expected_num_async_tasks2),
        STACK_HAWK_FILENAME.open(encoding="utf-8") as report_file,
    ):
        _, _, len_new_findings2, len_closed_findings2, _, _, _ = DefaultImporter(**import_options).process_scan(report_file)

    # Keep the raw numbers in the debug log for later analysis.
    logger.debug(f"First import: { len_new_findings1 } new findings, { len_closed_findings1 } closed findings")
    logger.debug(f"Second import: { len_new_findings2 } new findings, { len_closed_findings2 } closed findings")

    # The importer statistics do not reflect deduplication, so the second import
    # still reports 6 new findings.
    self.assertEqual(len_new_findings1, 6, "First import should create 6 new findings")
    self.assertEqual(len_closed_findings1, 0, "First import should not close any findings")
    self.assertEqual(len_new_findings2, 6, "Second import should report 6 new findings initially (before deduplication)")
    self.assertEqual(len_closed_findings2, 0, "Second import should not close any findings")

    engagement_findings = Finding.objects.filter(test__engagement=engagement)

    # Duplicate flags are only inspected when the caller asks for it; async callers pass
    # check_duplicates=False because the flags may not be set yet at this point.
    if check_duplicates:
        # 6 originals from the first import, 6 duplicates from the second one.
        active_findings = engagement_findings.filter(active=True, duplicate=False).count()
        duplicate_findings = engagement_findings.filter(duplicate=True).count()
        self.assertEqual(active_findings, 6, f"Expected 6 active findings, got { active_findings } ")
        self.assertEqual(duplicate_findings, 6, f"Expected 6 duplicate findings, got { duplicate_findings } ")

    # In either mode both imports together must have stored 12 findings.
    total_findings = engagement_findings.count()
    self.assertEqual(total_findings, 12, f"Expected 12 total findings, got { total_findings } ")
403+
@override_settings(ENABLE_AUDITLOG=True, AUDITLOG_TYPE="django-auditlog")
def test_deduplication_performance_async(self):
    """
    Import the same scan twice with AUDITLOG_TYPE "django-auditlog" and check query and task counts.

    block_execution is not set in this test, so duplicate flags are not inspected
    (check_duplicates=False); only the counts and the total number of findings are asserted.
    """
    configure_audit_system()
    configure_pghistory_triggers()

    # Deduplication has to be switched on for the double import to be meaningful.
    self.system_settings(enable_deduplication=True)

    expected_counts = {
        "expected_num_queries1": 660,
        "expected_num_async_tasks1": 12,
        "expected_num_queries2": 519,
        "expected_num_async_tasks2": 12,
    }
    # Async mode - deduplication happens later, so skip the duplicate checks.
    self._deduplication_performance(**expected_counts, check_duplicates=False)
423+
@override_settings(ENABLE_AUDITLOG=True, AUDITLOG_TYPE="django-pghistory")
def test_deduplication_performance_pghistory_async(self):
    """
    Import the same scan twice with AUDITLOG_TYPE "django-pghistory" and check query and task counts.

    Duplicate flags are not inspected here (check_duplicates=False).
    """
    configure_audit_system()
    configure_pghistory_triggers()

    # Deduplication has to be switched on for the double import to be meaningful.
    self.system_settings(enable_deduplication=True)

    expected_counts = {
        "expected_num_queries1": 624,
        "expected_num_async_tasks1": 12,
        "expected_num_queries2": 500,
        "expected_num_async_tasks2": 12,
    }
    # Async mode - deduplication happens later, so skip the duplicate checks.
    self._deduplication_performance(**expected_counts, check_duplicates=False)
440+
@override_settings(ENABLE_AUDITLOG=True, AUDITLOG_TYPE="django-auditlog")
def test_deduplication_performance_no_async(self):
    """
    Import the same scan twice with AUDITLOG_TYPE "django-auditlog" and block_execution set on the admin user.

    check_duplicates keeps its default (True), so the duplicate flags in the database are verified as well.
    """
    configure_audit_system()
    configure_pghistory_triggers()

    # Deduplication has to be switched on for the double import to be meaningful.
    self.system_settings(enable_deduplication=True)

    # Set block_execution for the admin user the imports run as.
    admin_contact = User.objects.get(username="admin").usercontactinfo
    admin_contact.block_execution = True
    admin_contact.save()

    expected_counts = {
        "expected_num_queries1": 672,
        "expected_num_async_tasks1": 12,
        "expected_num_queries2": 633,
        "expected_num_async_tasks2": 12,
    }
    self._deduplication_performance(**expected_counts)
460+
@override_settings(ENABLE_AUDITLOG=True, AUDITLOG_TYPE="django-pghistory")
def test_deduplication_performance_pghistory_no_async(self):
    """
    Import the same scan twice with AUDITLOG_TYPE "django-pghistory" and block_execution set on the admin user.

    check_duplicates keeps its default (True), so the duplicate flags in the database are verified as well.
    """
    configure_audit_system()
    configure_pghistory_triggers()

    # Deduplication has to be switched on for the double import to be meaningful.
    self.system_settings(enable_deduplication=True)

    # Set block_execution for the admin user the imports run as.
    admin_contact = User.objects.get(username="admin").usercontactinfo
    admin_contact.block_execution = True
    admin_contact.save()

    expected_counts = {
        "expected_num_queries1": 636,
        "expected_num_async_tasks1": 12,
        "expected_num_queries2": 596,
        "expected_num_async_tasks2": 12,
    }
    self._deduplication_performance(**expected_counts)
0 commit comments