- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
105 lines
3.6 KiB
Python
#!/usr/bin/env python3
"""
Fix Sistema Brasileiro de Museus duplicate by streaming YAML processing.

Removes first occurrence, keeps enriched second occurrence with name update.
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
INPUT_FILE = Path("data/instances/all/globalglam-20251111-batch16.yaml")
|
|
OUTPUT_FILE = Path("data/instances/all/globalglam-20251111-batch16-fixed.yaml")
|
|
|
|
def main():
|
|
print("Fixing SBM duplicate...")
|
|
print(f"Input: {INPUT_FILE}")
|
|
print(f"Output: {OUTPUT_FILE}")
|
|
|
|
# Read file
|
|
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
|
|
lines = f.readlines()
|
|
|
|
print(f"Total lines: {len(lines)}")
|
|
|
|
# Find both SBM record starts
|
|
sbm_starts = []
|
|
for i, line in enumerate(lines):
|
|
if line.strip() == "- id: https://w3id.org/heritage/custodian/br/sistema-brasileiro-de-museus-sbm":
|
|
sbm_starts.append(i)
|
|
|
|
print(f"Found {len(sbm_starts)} SBM records at lines: {sbm_starts}")
|
|
|
|
if len(sbm_starts) != 2:
|
|
print(f"ERROR: Expected 2 records, found {len(sbm_starts)}")
|
|
return 1
|
|
|
|
# Find end of first record (start of next record or end of file)
|
|
first_start = sbm_starts[0]
|
|
first_end = first_start + 1
|
|
|
|
# Next record starts with "- id:" at column 0
|
|
while first_end < len(lines):
|
|
if lines[first_end].startswith("- id:"):
|
|
break
|
|
first_end += 1
|
|
|
|
print(f"First record: lines {first_start} to {first_end-1} ({first_end-first_start} lines)")
|
|
|
|
# Find second record name line and update it
|
|
second_start = sbm_starts[1]
|
|
second_name_line = None
|
|
|
|
for i in range(second_start, min(second_start + 10, len(lines))):
|
|
if lines[i].strip().startswith("name:"):
|
|
second_name_line = i
|
|
break
|
|
|
|
if second_name_line:
|
|
print(f"Second record name at line {second_name_line}: {lines[second_name_line].strip()}")
|
|
# Update name to include (SBM)
|
|
lines[second_name_line] = " name: Sistema Brasileiro de Museus (SBM)\n"
|
|
print(f"Updated to: {lines[second_name_line].strip()}")
|
|
|
|
# Find provenance section in second record and add notes
|
|
second_prov_line = None
|
|
for i in range(second_start, min(second_start + 100, len(lines))):
|
|
if lines[i].strip().startswith("provenance:"):
|
|
second_prov_line = i
|
|
break
|
|
|
|
if second_prov_line:
|
|
# Look for notes field or enrichment_history
|
|
notes_line = None
|
|
enrichment_history_line = None
|
|
|
|
for i in range(second_prov_line, min(second_prov_line + 30, len(lines))):
|
|
if lines[i].strip().startswith("enrichment_history:"):
|
|
enrichment_history_line = i
|
|
break
|
|
|
|
# Add notes before enrichment_history if found
|
|
if enrichment_history_line:
|
|
indent = " "
|
|
note_text = (
|
|
f"{indent}notes: 'Duplicate fixed 2025-11-11: Merged with original record (line {first_start}), "
|
|
"keeping enriched metadata with Wikidata identifier.'\n"
|
|
)
|
|
lines.insert(enrichment_history_line, note_text)
|
|
print(f"Added provenance note at line {enrichment_history_line}")
|
|
|
|
# Remove first record
|
|
print(f"Removing first record (lines {first_start} to {first_end-1})...")
|
|
del lines[first_start:first_end]
|
|
|
|
# Write output
|
|
print(f"Writing to {OUTPUT_FILE}...")
|
|
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
|
|
f.writelines(lines)
|
|
|
|
print(f"✓ Fixed! Removed {first_end - first_start} lines")
|
|
print(f" New total: {len(lines)} lines")
|
|
|
|
return 0
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|