1111results into a single Elasticsearch index. Each result document includes:
1212- All original SARIF result fields (ruleId, message, locations, etc.)
1313- Derived fields (ruleGroup, ruleLanguage) parsed from ruleId
14- - Run-level metadata (tool info, version control provenance)
14+ - ONLY versionControlProvenance from run (minimal enrichment)
1515- Source file tracking metadata
1616
17+ This approach keeps documents minimal by indexing ONLY the result objects to avoid
18+ Elasticsearch size limits. Tool info and automation details are NOT included.
19+
1720Usage:
1821 python index-sarif-results-in-elasticsearch.py <sarif_files_list.txt> <elasticsearch_index_name>
1922
@@ -398,7 +401,7 @@ def sarif_results_generator(sarif_files, index_name):
398401
399402def index_sarif_files (sarif_files ,index_name ,host ,api_key = None ,username = None ,password = None ):
400403"""
401- Connect to Elasticsearch and bulk index all SARIF results.
404+ Connect to Elasticsearch and bulk index all SARIF results with progress logging.
402405 """
403406es_client = create_elasticsearch_client (host ,api_key ,username ,password )
404407
@@ -415,37 +418,58 @@ def index_sarif_files(sarif_files, index_name, host, api_key=None, username=None
415418return False
416419
417420print (f"Indexing results from{ len (sarif_files )} SARIF files..." )
421+ print ()
418422
419423try :
420- # Use bulk helper to index all documents
421- success_count ,failed_docs = helpers .bulk (
424+ # Track progress during bulk indexing
425+ documents_indexed = 0
426+ last_progress_update = 0
427+ progress_interval = 100 # Update every 100 documents
428+
429+ def progress_callback (success ,info ):
430+ """Callback to track progress during bulk indexing."""
431+ nonlocal documents_indexed ,last_progress_update
432+ documents_indexed += 1
433+
434+ # Print progress updates periodically
435+ if documents_indexed - last_progress_update >= progress_interval :
436+ print (f" → Indexed{ documents_indexed } documents so far..." )
437+ last_progress_update = documents_indexed
438+
439+ if not success :
440+ print (f" ✗ Failed to index document:{ info } " )
441+
442+ # Use bulk helper to index all documents with progress tracking
443+ print ("Starting bulk indexing..." )
444+ for success ,info in helpers .streaming_bulk (
422445es_client ,
423446sarif_results_generator (sarif_files ,index_name ),
424447chunk_size = 500 ,
425448request_timeout = 60 ,
426- )
449+ raise_on_error = False ,
450+ ):
451+ progress_callback (success ,info )
427452
453+ print (f" → Indexed{ documents_indexed } documents (final)" )
454+ print ()
428455print ("-" * 50 )
429456print (f"✓ Bulk indexing complete" )
430- print (f"✓ Successfully indexed:{ success_count } documents" )
431- print (f"✗ Failed to index:{ len (failed_docs )} documents" )
432-
433- if failed_docs :
434- print ("\n Failed documents:" )
435- for doc in failed_docs [:5 ]:# Show first 5 failures
436- print (f" -{ doc } " )
437- if len (failed_docs )> 5 :
438- print (f" ... and{ len (failed_docs )- 5 } more" )
457+ print (f"✓ Total documents indexed:{ documents_indexed } " )
439458
440- # Get final index stats
459+ # Get final index stats to verify
441460stats = es_client .indices .stats (index = index_name )
442461doc_count = stats ["indices" ][index_name ]["total" ]["docs" ]["count" ]
443462print (f"✓ Final document count in index:{ doc_count } " )
463+
464+ if doc_count != documents_indexed :
465+ print (f"⚠ Warning: Document count mismatch (indexed:{ documents_indexed } , in index:{ doc_count } )" )
444466
445467return True
446468
447469except Exception as e :
448470print (f"Error during bulk indexing:{ e } " )
471+ import traceback
472+ traceback .print_exc ()
449473return False
450474
451475