#!/usr/bin/env python3
"""
Fix the Sistema Brasileiro de Museus (SBM) duplicate in a batch YAML file.

The file is edited as a list of text lines (read fully into memory; it is not
parsed as YAML). The first SBM record is removed, and the second, enriched
record is kept with an updated name and an added provenance note.
"""

import sys
from pathlib import Path

INPUT_FILE = Path("data/instances/all/globalglam-20251111-batch16.yaml")
OUTPUT_FILE = Path("data/instances/all/globalglam-20251111-batch16-fixed.yaml")

# Stripped text of the line that opens each copy of the duplicated record.
_SBM_RECORD_START = (
    "- id: https://w3id.org/heritage/custodian/br/sistema-brasileiro-de-museus-sbm"
)
# A new record is recognised by this prefix at column 0.
_RECORD_PREFIX = "- id:"


def _record_end(lines, start):
    """Return the index just past the record that begins at ``lines[start]``."""
    end = start + 1
    while end < len(lines) and not lines[end].startswith(_RECORD_PREFIX):
        end += 1
    return end


def _find_key(lines, key, start, stop):
    """Return the first index in ``[start, stop)`` whose stripped line starts with *key*, or None."""
    return next(
        (i for i in range(start, stop) if lines[i].strip().startswith(key)),
        None,
    )


def _leading_whitespace(line):
    """Return the run of whitespace that *line* starts with."""
    return line[: len(line) - len(line.lstrip())]


def main() -> int:
    """Remove the first SBM record and annotate the second one.

    Reads ``INPUT_FILE``, writes the result to ``OUTPUT_FILE`` and prints a
    progress log. Line numbers in the log and in the provenance note are
    0-based indices into the input file.

    Returns:
        0 on success, 1 when the input does not contain exactly two SBM
        records (in which case nothing is written).
    """
    print("Fixing SBM duplicate...")
    print(f"Input: {INPUT_FILE}")
    print(f"Output: {OUTPUT_FILE}")

    # Read file
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    print(f"Total lines: {len(lines)}")

    # Find both SBM record starts
    sbm_starts = [
        i for i, line in enumerate(lines) if line.strip() == _SBM_RECORD_START
    ]

    print(f"Found {len(sbm_starts)} SBM records at lines: {sbm_starts}")

    if len(sbm_starts) != 2:
        print(f"ERROR: Expected 2 records, found {len(sbm_starts)}")
        return 1

    first_start, second_start = sbm_starts

    # The first record ends at the next column-0 record start. Clamp to the
    # second SBM record so the record being kept can never be deleted, even
    # if the SBM lines themselves are indented.
    first_end = min(_record_end(lines, first_start), second_start)

    print(f"First record: lines {first_start} to {first_end-1} ({first_end-first_start} lines)")

    # Every search below is limited to the second record, so a short record
    # cannot cause a neighbouring record to be modified by mistake.
    second_end = _record_end(lines, second_start)

    # Find second record name line and update it
    second_name_line = _find_key(
        lines, "name:", second_start, min(second_start + 10, second_end)
    )

    if second_name_line is not None:
        print(f"Second record name at line {second_name_line}: {lines[second_name_line].strip()}")
        # Keep the indentation the file already uses for this key.
        name_indent = _leading_whitespace(lines[second_name_line])
        lines[second_name_line] = f"{name_indent}name: Sistema Brasileiro de Museus (SBM)\n"
        print(f"Updated to: {lines[second_name_line].strip()}")

    # Find provenance section in second record and add notes
    second_prov_line = _find_key(
        lines, "provenance:", second_start, min(second_start + 100, second_end)
    )

    if second_prov_line is not None:
        enrichment_history_line = _find_key(
            lines,
            "enrichment_history:",
            second_prov_line,
            min(second_prov_line + 30, second_end),
        )

        # Add notes before enrichment_history if found
        if enrichment_history_line is not None:
            # The note is inserted as a sibling of enrichment_history, so it
            # must share that line's indentation to remain valid YAML.
            indent = _leading_whitespace(lines[enrichment_history_line])
            note_text = (
                f"{indent}notes: 'Duplicate fixed 2025-11-11: Merged with original record (line {first_start}), "
                "keeping enriched metadata with Wikidata identifier.'\n"
            )
            lines.insert(enrichment_history_line, note_text)
            print(f"Added provenance note at line {enrichment_history_line}")

    # Remove first record. The note was inserted after first_end, so these
    # indices are still valid.
    print(f"Removing first record (lines {first_start} to {first_end-1})...")
    del lines[first_start:first_end]

    # Write output
    print(f"Writing to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.writelines(lines)

    print(f"✓ Fixed! Removed {first_end - first_start} lines")
    print(f" New total: {len(lines)} lines")

    return 0


if __name__ == "__main__":
    sys.exit(main())