#!/usr/bin/env python3
"""
Mexican Wikidata Enrichment - Batch 2

Add Wikidata identifiers to 4 institutions with perfect name matches (100% score).

Institutions:
1. Archivo General de la Nación (Q2860534, VIAF: 159570855)
2. Museo Frida Kahlo (Q2663377, VIAF: 144233695)
3. Museo Soumaya (Q2097646, VIAF: 135048064)
4. Museo de Antropología de Xalapa (Q1841655, VIAF: 138582541)
"""

from datetime import datetime, timezone
from pathlib import Path

# Define enrichment data - Only 100% perfect matches from Wikidata SPARQL query.
# Keyed by the exact institution 'name' field expected in the YAML dataset.
BATCH_2_ENRICHMENTS = {
    "Archivo General de la Nación": {
        "wikidata": "Q2860534",
        "viaf": "159570855",
        "confidence": 0.98,
        "notes": "Perfect name match (100% score) via SPARQL query and VIAF cross-reference"
    },
    "Museo Frida Kahlo": {
        "wikidata": "Q2663377",
        "viaf": "144233695",
        "confidence": 0.98,
        "notes": "Perfect name match (100% score) via SPARQL query and VIAF cross-reference"
    },
    "Museo Soumaya": {
        "wikidata": "Q2097646",
        "viaf": "135048064",
        "confidence": 0.98,
        "notes": "Perfect name match (100% score) via SPARQL query and VIAF cross-reference"
    },
    "Museo de Antropología de Xalapa": {
        "wikidata": "Q1841655",
        "viaf": "138582541",
        "confidence": 0.98,
        "notes": "Perfect name match (100% score) via SPARQL query and VIAF cross-reference"
    }
}


def add_enrichment_identifiers(institution: dict, enrichment: dict) -> dict:
    """Add Wikidata and VIAF identifiers to an institution record, in place.

    Mutates (and also returns) *institution*:
    - appends missing 'Wikidata'/'VIAF' entries to institution['identifiers'],
      each with scheme, value, and a derived URL;
    - appends one provenance 'enrichment_history' entry, but only when at
      least one new identifier was actually added (idempotent on re-run).

    Args:
        institution: Institution record; 'identifiers' may be absent or None.
        enrichment: Dict with optional 'wikidata'/'viaf' ids plus required
            'confidence' and 'notes' fields.

    Returns:
        The same *institution* dict, for call-chaining convenience.
    """
    # Ensure identifiers list exists (may be absent or explicitly None in YAML)
    if 'identifiers' not in institution or institution['identifiers'] is None:
        institution['identifiers'] = []

    # Snapshot existing schemes once, BEFORE appending, so both the identifier
    # additions and the provenance entry agree on what was already present.
    existing_schemes = {i.get('identifier_scheme') for i in institution['identifiers']}

    # Add Wikidata identifier if not present
    if 'wikidata' in enrichment and 'Wikidata' not in existing_schemes:
        q_number = enrichment['wikidata']
        institution['identifiers'].append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': q_number,
            'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
        })

    # Add VIAF identifier if not present
    if 'viaf' in enrichment and 'VIAF' not in existing_schemes:
        viaf_id = enrichment['viaf']
        institution['identifiers'].append({
            'identifier_scheme': 'VIAF',
            'identifier_value': viaf_id,
            'identifier_url': f'https://viaf.org/viaf/{viaf_id}'
        })

    # Update provenance metadata with a timezone-aware UTC timestamp
    current_time = datetime.now(timezone.utc).isoformat()

    if 'provenance' not in institution:
        institution['provenance'] = {}

    if 'enrichment_history' not in institution['provenance']:
        institution['provenance']['enrichment_history'] = []

    enrichment_entry = {
        'enrichment_date': current_time,
        'enrichment_method': 'Wikidata SPARQL query + VIAF cross-reference (Batch 2)',
        'identifiers_added': [],
        'confidence_score': enrichment['confidence'],
        'notes': enrichment['notes']
    }

    if 'wikidata' in enrichment and 'Wikidata' not in existing_schemes:
        enrichment_entry['identifiers_added'].append(f"Wikidata:{enrichment['wikidata']}")
    if 'viaf' in enrichment and 'VIAF' not in existing_schemes:
        enrichment_entry['identifiers_added'].append(f"VIAF:{enrichment['viaf']}")

    # Only add enrichment entry if we actually added new identifiers
    if enrichment_entry['identifiers_added']:
        institution['provenance']['enrichment_history'].append(enrichment_entry)

    return institution


def main():
    """Main enrichment workflow: load YAML, enrich matches, report, save."""
    # Local import: PyYAML is only needed for the file I/O workflow, so the
    # pure transformation function above stays importable without it.
    import yaml

    # Load YAML file
    yaml_path = Path('/Users/kempersc/apps/glam/data/instances/mexico/mexican_institutions_geocoded.yaml')

    print("Loading Mexican institutions YAML file...")
    with open(yaml_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"Loaded {len(institutions)} institutions")

    # Track enrichment statistics
    enriched_count = 0
    skipped_count = 0
    not_found = []

    # Enrich each institution
    for institution in institutions:
        institution_name = institution.get('name', '')

        if institution_name in BATCH_2_ENRICHMENTS:
            enrichment = BATCH_2_ENRICHMENTS[institution_name]

            # Skip only when BOTH schemes are already present; a partial record
            # (e.g. Wikidata without VIAF) still gets the missing identifier.
            existing_identifiers = institution.get('identifiers', [])
            has_wikidata = any(i.get('identifier_scheme') == 'Wikidata' for i in existing_identifiers)
            has_viaf = any(i.get('identifier_scheme') == 'VIAF' for i in existing_identifiers)

            if has_wikidata and has_viaf:
                print(f"\n⏭️ Skipping (already enriched): {institution_name}")
                skipped_count += 1
                continue

            print(f"\n✅ Enriching: {institution_name}")
            print(f" - Wikidata: {enrichment.get('wikidata', 'N/A')}")
            print(f" - VIAF: {enrichment.get('viaf', 'N/A')}")

            # Mutates the record in place (the return value is the same dict)
            add_enrichment_identifiers(institution, enrichment)
            enriched_count += 1

    # Check for missing institutions
    for name in BATCH_2_ENRICHMENTS.keys():
        found = any(inst.get('name') == name for inst in institutions)
        if not found:
            not_found.append(name)

    if not_found:
        print("\n⚠️ Warning: Could not find these institutions in YAML:")
        for name in not_found:
            print(f" - {name}")

    # Save enriched YAML
    print(f"\n📝 Summary:")
    print(f" - Newly enriched: {enriched_count}")
    print(f" - Already enriched (skipped): {skipped_count}")
    print(f" - Not found in dataset: {len(not_found)}")

    print(f"\nWriting to {yaml_path}...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=1000)

    print("✅ Batch 2 enrichment complete!")

    # Generate statistics
    total_wikidata = sum(1 for inst in institutions
                         if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', [])))
    coverage = (total_wikidata / len(institutions)) * 100

    print(f"\n📊 Current Wikidata Coverage:")
    print(f" {total_wikidata}/{len(institutions)} institutions ({coverage:.1f}%)")


if __name__ == '__main__':
    main()