#!/usr/bin/env python3
"""Fix Sistema Brasileiro de Museus duplicate in globalglam-20251111-batch16.yaml.

Two records with same ID but slightly different names:

1. "Sistema Brasileiro de Museus (SBM)" - original
2. "Sistema Brasileiro de Museus" - batch16 (enriched)

Strategy: Merge into single record, keep enriched metadata.
"""

import copy
import shutil
from datetime import datetime
from pathlib import Path

import yaml

# Paths
BASE_DIR = Path(__file__).parent.parent
INPUT_FILE = BASE_DIR / "data" / "instances" / "all" / "globalglam-20251111-batch16.yaml"
OUTPUT_FILE = BASE_DIR / "data" / "instances" / "all" / "globalglam-20251111-batch16-fixed.yaml"
# Timestamped so repeated runs never clobber an earlier backup.
BACKUP_FILE = BASE_DIR / "data" / "instances" / "all" / f"globalglam-20251111-batch16-pre-fix-{datetime.now().strftime('%Y%m%d-%H%M%S')}.yaml"


def load_yaml(filepath):
    """Load YAML file.

    Args:
        filepath: Path to the YAML file.

    Returns:
        The parsed YAML content (here: a list of institution dicts).
    """
    # safe_load: never execute arbitrary tags from the (potentially edited) file.
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def save_yaml(data, filepath):
    """Save YAML file.

    Args:
        data: Serializable data to write.
        filepath: Destination path.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        # sort_keys=False keeps the original field order of each record.
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                  sort_keys=False)


def fix_sbm_duplicate(institutions):
    """Fix Sistema Brasileiro de Museus duplicate by merging records.

    Args:
        institutions: List of institution dictionaries.

    Returns:
        Updated list with duplicate merged (unchanged list if exactly two
        matching records are not found).
    """
    # Find both SBM records (same canonical ID, different metadata).
    target_id = "https://w3id.org/heritage/custodian/br/sistema-brasileiro-de-museus-sbm"
    sbm_records = []
    sbm_indices = []
    for idx, inst in enumerate(institutions):
        if inst.get('id') == target_id:
            sbm_records.append(inst)
            sbm_indices.append(idx)

    print(f"Found {len(sbm_records)} Sistema Brasileiro de Museus records")

    if len(sbm_records) != 2:
        print(f"ERROR: Expected 2 SBM records, found {len(sbm_records)}")
        return institutions

    # Display both records
    for i, sbm in enumerate(sbm_records, 1):
        wikidata_ids = [
            ident.get('identifier_value')
            for ident in sbm.get('identifiers', [])
            if ident.get('identifier_scheme') == 'Wikidata'
        ]
        print(f"\nRecord {i}:")
        print(f"  Name: {sbm.get('name')}")
        print(f"  Wikidata: {wikidata_ids}")
        print(f"  Description length: {len(sbm.get('description', ''))}")
        print(f"  Identifiers: {len(sbm.get('identifiers', []))}")

    # Determine which is enriched (has more identifiers).
    # Index-based selection: the previous `original == sbm_records[0]`
    # dict-equality test could alias both names to the same record if the
    # two dicts ever compared equal. Ties keep the original behavior
    # (record[1] treated as original, record[0] as enriched).
    counts = [len(r.get('identifiers', [])) for r in sbm_records]
    if counts[0] < counts[1]:
        original, enriched = sbm_records[0], sbm_records[1]
    else:
        original, enriched = sbm_records[1], sbm_records[0]

    print(f"\nOriginal record: '{original.get('name')}' ({len(original.get('identifiers', []))} identifiers)")
    print(f"Enriched record: '{enriched.get('name')}' ({len(enriched.get('identifiers', []))} identifiers)")

    # Merge: Keep enriched record, use name with abbreviation.
    # deepcopy (not dict.copy): the provenance update below mutates a nested
    # dict, and a shallow copy would silently edit `enriched` too.
    merged = copy.deepcopy(enriched)
    merged['name'] = "Sistema Brasileiro de Museus (SBM)"  # Keep abbreviation for clarity

    # Update provenance notes (append if notes already exist).
    if 'provenance' not in merged:
        merged['provenance'] = {}
    if merged['provenance'].get('notes'):
        merged['provenance']['notes'] += "\n\nDuplicate fixed 2025-11-11: Merged with original record, keeping enriched metadata."
    else:
        merged['provenance']['notes'] = "Duplicate fixed 2025-11-11: Merged with original record, keeping enriched metadata."

    print(f"\nMerged record: '{merged.get('name')}'")
    desc = merged.get('description', '')
    print(f"  Description: {desc[:100] if desc else 'None'}...")
    print(f"  Identifiers: {len(merged.get('identifiers', []))}")

    # Remove both originals (highest index first so positions stay valid).
    for idx in sorted(sbm_indices, reverse=True):
        institutions.pop(idx)

    # Add merged record
    institutions.append(merged)

    print(f"\nDeduplication complete:")
    print(f"  Original count: {len(sbm_records) + len(institutions) - 1}")
    print(f"  New count: {len(institutions)}")
    print(f"  Reduction: -1 institution")

    return institutions


def main():
    print("=" * 80)
    print("FIX SISTEMA BRASILEIRO DE MUSEUS DUPLICATE")
    print("=" * 80)

    # Create backup before touching anything.
    print(f"\n1. Creating backup: {BACKUP_FILE.name}")
    shutil.copy(INPUT_FILE, BACKUP_FILE)
    print("   āœ“ Backup created")

    # Load data
    print(f"\n2. Loading institutions from {INPUT_FILE.name}...")
    institutions = load_yaml(INPUT_FILE)
    print(f"   Loaded {len(institutions)} institutions")

    # Fix duplicate
    print(f"\n3. Fixing Sistema Brasileiro de Museus duplicate...")
    institutions = fix_sbm_duplicate(institutions)

    # Save deduplicated dataset
    print(f"\n4. Saving deduplicated dataset to {OUTPUT_FILE.name}...")
    save_yaml(institutions, OUTPUT_FILE)
    print("   āœ“ Saved successfully")

    # Statistics: Brazilian institutions and their Wikidata coverage.
    brazil_institutions = [
        inst for inst in institutions
        if inst.get('locations') and inst['locations'][0].get('country') == 'BR'
    ]
    brazil_with_wikidata = [
        inst for inst in brazil_institutions
        if inst.get('identifiers') and any(
            ident.get('identifier_scheme') == 'Wikidata'
            for ident in inst['identifiers']
        )
    ]

    print(f"\n{'='*80}")
    print("FINAL STATISTICS")
    print(f"{'='*80}")
    print(f"Total institutions: {len(institutions)}")
    print(f"Brazilian institutions: {len(brazil_institutions)}")
    print(f"Brazilian with Wikidata: {len(brazil_with_wikidata)}")
    # Guard against ZeroDivisionError when no Brazilian records remain.
    if brazil_institutions:
        print(f"Coverage: {len(brazil_with_wikidata)/len(brazil_institutions)*100:.1f}%")
    else:
        print("Coverage: n/a (no Brazilian institutions)")

    print(f"\nāœ“ Duplicate fixed successfully!")
    print(f"  Input: {INPUT_FILE}")
    print(f"  Output: {OUTPUT_FILE}")
    print(f"  Backup: {BACKUP_FILE}")


if __name__ == "__main__":
    main()