glam/scripts/fix_sbm_duplicate_stream.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

105 lines
3.6 KiB
Python

#!/usr/bin/env python3
"""
Fix Sistema Brasileiro de Museus duplicate by streaming YAML processing.
Removes first occurrence, keeps enriched second occurrence with name update.
"""
import sys
from pathlib import Path
# One-off maintenance script: paths are hard-coded to the specific batch file
# that contains the duplicated SBM record. The result is written to a separate
# "-fixed" file so it can be diffed against the original before replacing it.
INPUT_FILE = Path("data/instances/all/globalglam-20251111-batch16.yaml")
OUTPUT_FILE = Path("data/instances/all/globalglam-20251111-batch16-fixed.yaml")
def _record_end(lines, start):
    """Return the exclusive end index of the record beginning at *start*.

    A record runs until the next top-level "- id:" line or end of file.
    """
    end = start + 1
    while end < len(lines) and not lines[end].startswith("- id:"):
        end += 1
    return end


def _find_key(lines, start, key, limit):
    """Return the index of the first line, within *limit* lines of *start*,
    whose stripped text begins with *key*; None if not found."""
    for i in range(start, min(start + limit, len(lines))):
        if lines[i].strip().startswith(key):
            return i
    return None


def main(input_file=None, output_file=None):
    """Remove the first (un-enriched) SBM record and retag the second.

    Streams the batch YAML line-by-line instead of parsing it, so every
    untouched record is preserved byte-for-byte.

    Args:
        input_file: path to the batch YAML; defaults to INPUT_FILE.
        output_file: path for the fixed YAML; defaults to OUTPUT_FILE.

    Returns:
        0 on success, 1 if the file does not hold exactly two SBM records.
    """
    src = Path(input_file) if input_file is not None else INPUT_FILE
    dst = Path(output_file) if output_file is not None else OUTPUT_FILE
    print("Fixing SBM duplicate...")
    print(f"Input: {src}")
    print(f"Output: {dst}")

    lines = src.read_text(encoding="utf-8").splitlines(keepends=True)
    print(f"Total lines: {len(lines)}")

    # Locate both occurrences of the duplicated record id.
    sbm_id = "- id: https://w3id.org/heritage/custodian/br/sistema-brasileiro-de-museus-sbm"
    sbm_starts = [i for i, line in enumerate(lines) if line.strip() == sbm_id]
    print(f"Found {len(sbm_starts)} SBM records at lines: {sbm_starts}")
    if len(sbm_starts) != 2:
        print(f"ERROR: Expected 2 records, found {len(sbm_starts)}")
        return 1

    first_start, second_start = sbm_starts
    first_end = _record_end(lines, first_start)
    print(f"First record: lines {first_start} to {first_end-1} ({first_end-first_start} lines)")

    # Retag the kept (second) record so its display name carries the acronym.
    # `is not None` (not truthiness): a match at index 0 would otherwise be skipped.
    name_idx = _find_key(lines, second_start, "name:", limit=10)
    if name_idx is not None:
        print(f"Second record name at line {name_idx}: {lines[name_idx].strip()}")
        lines[name_idx] = "  name: Sistema Brasileiro de Museus (SBM)\n"
        print(f"Updated to: {lines[name_idx].strip()}")

    # Record the merge in the second record's provenance block, just above
    # its enrichment_history entry (skipped best-effort if either is absent).
    prov_idx = _find_key(lines, second_start, "provenance:", limit=100)
    if prov_idx is not None:
        hist_idx = _find_key(lines, prov_idx, "enrichment_history:", limit=30)
        if hist_idx is not None:
            note = (
                f"    notes: 'Duplicate fixed 2025-11-11: Merged with original record (line {first_start}), "
                "keeping enriched metadata with Wikidata identifier.'\n"
            )
            # The insertion point is after the first record's span, so
            # first_start/first_end remain valid for the deletion below.
            lines.insert(hist_idx, note)
            print(f"Added provenance note at line {hist_idx}")

    print(f"Removing first record (lines {first_start} to {first_end-1})...")
    del lines[first_start:first_end]

    print(f"Writing to {dst}...")
    dst.write_text("".join(lines), encoding="utf-8")
    print(f"✓ Fixed! Removed {first_end - first_start} lines")
    print(f"  New total: {len(lines)} lines")
    return 0
if __name__ == "__main__":
sys.exit(main())