#!/usr/bin/env python3
"""
Mexican Wikidata Enrichment - Batch 1

Add Wikidata identifiers to 6 national priority institutions.

Institutions:
1. Museo Nacional de Antropología (Q524249, VIAF: 139462066)
2. Museo Nacional de Arte (Q1138147, VIAF: 137951343)
3. Biblioteca Nacional de México (Q5495070, ISIL: MX-MXBN, VIAF: 147873206)
4. Cineteca Nacional (Q1092492)
5. Fototeca Nacional (Q66432183)
6. Instituto Nacional de Antropología e Historia (Q901361, VIAF: 139735572)

The script rewrites the YAML file in place. Enrichment is idempotent: an
identifier that is already present on a record (same scheme and value) is
not appended a second time, so the script can safely be re-run.
"""

from datetime import datetime, timezone
from pathlib import Path

# Default location of the geocoded Mexican institutions file.
DEFAULT_YAML_PATH = Path(
    '/Users/kempersc/apps/glam/data/instances/mexico/mexican_institutions_geocoded.yaml'
)

# Define enrichment data, keyed by the exact institution name used in the YAML.
BATCH_1_ENRICHMENTS = {
    "Museo Nacional de Antropología": {
        "wikidata": "Q524249",
        "viaf": "139462066",
        "confidence": 0.98,
        "notes": "Verified via SPARQL query and VIAF match",
    },
    "Museo Nacional de Arte (MUNAL)": {
        "wikidata": "Q1138147",
        "viaf": "137951343",
        "confidence": 0.98,
        "notes": "Verified via SPARQL query and VIAF match",
    },
    "Biblioteca Nacional de México": {
        "wikidata": "Q5495070",
        "viaf": "147873206",
        "isil": "MX-MXBN",
        "confidence": 0.98,
        "notes": "Verified via SPARQL query, VIAF, and ISIL registry match",
    },
    "Cineteca Nacional": {
        "wikidata": "Q1092492",
        "confidence": 0.95,
        "notes": "Verified via SPARQL query",
    },
    "Fototeca Nacional": {
        "wikidata": "Q66432183",
        "confidence": 0.95,
        "notes": "Verified via SPARQL query",
    },
    "Instituto Nacional de Antropología e Historia": {
        "wikidata": "Q901361",
        "viaf": "139735572",
        "confidence": 0.98,
        "notes": "Verified via SPARQL query and VIAF match",
    },
}

# (enrichment key, identifier_scheme label, URL template or None).
# ISIL codes don't have a universal URL, so no identifier_url is emitted.
_IDENTIFIER_SCHEMES = (
    ('wikidata', 'Wikidata', 'https://www.wikidata.org/wiki/{}'),
    ('viaf', 'VIAF', 'https://viaf.org/viaf/{}'),
    ('isil', 'ISIL', None),
)


def _has_identifier(identifiers: list, scheme: str, value) -> bool:
    """Return True if *identifiers* already holds this scheme/value pair."""
    return any(
        isinstance(item, dict)
        and item.get('identifier_scheme') == scheme
        and item.get('identifier_value') == value
        for item in identifiers
    )


def _has_wikidata(institution) -> bool:
    """Return True if the record carries at least one Wikidata identifier."""
    if not isinstance(institution, dict):
        return False
    # `or []` covers both a missing key and an explicit YAML null.
    return any(
        isinstance(item, dict) and item.get('identifier_scheme') == 'Wikidata'
        for item in (institution.get('identifiers') or [])
    )


def add_enrichment_identifiers(institution: dict, enrichment: dict) -> dict:
    """Add Wikidata, VIAF, and ISIL identifiers to institution record.

    The record is modified in place and also returned. Identifiers that the
    record already holds (same scheme and value) are skipped. A provenance
    ``enrichment_history`` entry is appended only when at least one new
    identifier was added, and it lists exactly the identifiers added.

    Args:
        institution: Institution record; ``identifiers``, ``provenance`` and
            ``provenance.enrichment_history`` may be missing or ``None``.
        enrichment: Mapping with optional ``wikidata``/``viaf``/``isil`` keys
            and required ``confidence`` and ``notes`` keys.

    Returns:
        The same ``institution`` object.

    Raises:
        KeyError: If new identifiers were added but ``enrichment`` lacks
            ``confidence`` or ``notes``.
    """
    # Ensure identifiers list exists (key may be absent or an explicit null).
    identifiers = institution.get('identifiers')
    if identifiers is None:
        identifiers = institution['identifiers'] = []

    added = []
    for key, scheme, url_template in _IDENTIFIER_SCHEMES:
        if key not in enrichment:
            continue
        value = enrichment[key]
        if _has_identifier(identifiers, scheme, value):
            # Already enriched on a previous run; do not duplicate.
            continue
        entry = {
            'identifier_scheme': scheme,
            'identifier_value': value,
        }
        if url_template is not None:
            entry['identifier_url'] = url_template.format(value)
        identifiers.append(entry)
        added.append(f"{scheme}:{value}")

    if not added:
        # Nothing changed, so there is nothing to record in the history.
        return institution

    # Update provenance metadata, tolerating null values like for identifiers.
    provenance = institution.get('provenance')
    if provenance is None:
        provenance = institution['provenance'] = {}
    history = provenance.get('enrichment_history')
    if history is None:
        history = provenance['enrichment_history'] = []

    history.append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Wikidata SPARQL query + VIAF cross-reference',
        'identifiers_added': added,
        'confidence_score': enrichment['confidence'],
        'notes': enrichment['notes'],
    })

    return institution


def main(yaml_path: Path = DEFAULT_YAML_PATH) -> None:
    """Main enrichment workflow.

    Loads the institutions YAML, enriches every record whose ``name`` is a
    key of ``BATCH_1_ENRICHMENTS``, writes the result back to the same file
    and prints Wikidata coverage statistics.

    Args:
        yaml_path: File to enrich in place. Defaults to the project's
            geocoded Mexican institutions file.

    Raises:
        SystemExit: If the YAML document is not a list of records.
    """
    # Imported here because only this function needs PyYAML; the pure helper
    # above then stays importable in environments without it.
    import yaml

    yaml_path = Path(yaml_path)

    # Load YAML file
    print("Loading Mexican institutions YAML file...")
    with open(yaml_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # An empty file loads as None; refuse to continue (and to overwrite the
    # file) when the document is not the expected list.
    if not isinstance(institutions, list):
        raise SystemExit(
            f"Expected a YAML list of institutions in {yaml_path}, "
            f"got {type(institutions).__name__}"
        )

    print(f"Loaded {len(institutions)} institutions")

    # Track enrichment statistics
    enriched_count = 0
    matched_names = set()

    # Enrich each institution
    for institution in institutions:
        if not isinstance(institution, dict):
            continue
        institution_name = institution.get('name', '')
        if institution_name not in BATCH_1_ENRICHMENTS:
            continue
        enrichment = BATCH_1_ENRICHMENTS[institution_name]
        matched_names.add(institution_name)

        print(f"\nEnriching: {institution_name}")
        print(f" - Wikidata: {enrichment.get('wikidata', 'N/A')}")
        print(f" - VIAF: {enrichment.get('viaf', 'N/A')}")
        print(f" - ISIL: {enrichment.get('isil', 'N/A')}")

        before = len(institution.get('identifiers') or [])
        add_enrichment_identifiers(institution, enrichment)
        if len(institution['identifiers']) > before:
            enriched_count += 1
        else:
            print(" - Identifiers already present; nothing added")

    # Check for missing institutions (order follows BATCH_1_ENRICHMENTS).
    not_found = [name for name in BATCH_1_ENRICHMENTS if name not in matched_names]

    if not_found:
        print("\n⚠️ Warning: Could not find these institutions in YAML:")
        for name in not_found:
            print(f" - {name}")

    # Save enriched YAML
    print(f"\n✅ Enriched {enriched_count} institutions")
    print(f"Writing to {yaml_path}...")

    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            institutions,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=1000,
        )

    print("✅ Batch 1 enrichment complete!")

    # Generate statistics
    total_wikidata = sum(1 for inst in institutions if _has_wikidata(inst))
    # Guard against an empty list to avoid ZeroDivisionError.
    coverage = (total_wikidata / len(institutions)) * 100 if institutions else 0.0

    print("\n📊 Current Wikidata Coverage:")
    print(f" {total_wikidata}/{len(institutions)} institutions ({coverage:.1f}%)")


if __name__ == '__main__':
    main()