#!/usr/bin/env python3
"""
Enrich Chilean heritage institutions with Batch 11 Wikidata identifiers.

Applies 5 validated Wikidata matches from batch11_final_validation.json:
- Museo Histórico-Arqueológico (Quillota) → Q12184920
- Museo Mapuche de Purén → Q86282614
- Museo Pleistocénico (Osorno) → Q112044601
- Red de Museos Aysén → Q53877849
- Museo Territorial Yagan Usi → Q6775581

Updates coverage from 55/90 (61.1%) → 60/90 (66.7%)
"""

import json
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Any
import shutil

# File paths
INPUT_FILE = Path("data/instances/chile/chilean_institutions_batch10_enriched.yaml")
VALIDATION_FILE = Path("scripts/batch11_final_validation.json")
OUTPUT_FILE = Path("data/instances/chile/chilean_institutions_batch11_enriched.yaml")
BACKUP_FILE = INPUT_FILE.with_suffix('.yaml.batch10.backup')


def load_yaml(filepath: Path) -> List[Dict[str, Any]]:
    """Load a YAML file and return its parsed content (a list of institution dicts)."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def load_json(filepath: Path) -> Dict[str, Any]:
    """Load a JSON file and return its parsed content."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_yaml(data: List[Dict[str, Any]], filepath: Path) -> None:
    """Save *data* as YAML with human-friendly formatting (unicode preserved,
    block style, insertion order kept)."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, indent=2)


def normalize_name(name: str) -> str:
    """Normalize an institution name for matching.

    Lowercases, trims, and collapses every internal run of whitespace
    (including non-breaking spaces) to a single ASCII space.

    BUGFIX: the previous implementation used ``.replace(" ", " ")`` — a
    no-op that replaced a space with a space. ``str.split()`` with no
    argument splits on any Unicode whitespace run, so joining on a single
    space both collapses doubled spaces and subsumes ``strip()``.
    """
    return " ".join(name.lower().split())


def find_institution(institutions: List[Dict], name: str, city: str) -> tuple:
    """Find an institution by name and city.

    Matching is done on normalized strings, first by exact name + exact
    city, then by substring name + exact city (needed for entries such as
    "Museo Territorial Yagan Usi" whose dataset name carries extra words).
    Institutions without a ``locations`` list are skipped; only the FIRST
    location's city is compared.

    Returns:
        (index, institution) on a match, otherwise (None, None).
    """
    norm_name = normalize_name(name)
    norm_city = normalize_name(city)

    for idx, inst in enumerate(institutions):
        inst_name = normalize_name(inst.get('name', ''))

        # Check if institution has locations
        locations = inst.get('locations', [])
        if not locations:
            continue

        inst_city = normalize_name(locations[0].get('city', ''))

        # Match by name and city
        if norm_name == inst_name and norm_city == inst_city:
            return idx, inst

        # Also try partial name match with exact city (for "Museo Territorial Yagan Usi")
        if norm_name in inst_name and norm_city == inst_city:
            return idx, inst

    return None, None


def has_wikidata_identifier(institution: Dict) -> bool:
    """Return True if *institution* already carries a Wikidata identifier."""
    identifiers = institution.get('identifiers', [])
    return any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers)


def add_wikidata_identifier(institution: Dict, q_number: str, wikidata_name: str,
                            confidence: str, reason: str) -> Dict:
    """Add a Wikidata identifier to *institution* (mutated in place).

    Appends an identifier record with scheme/value/URL and stamps the
    institution's provenance with the batch number, match confidence,
    match reason, matched Wikidata label, and a UTC last-updated timestamp.

    Returns the (mutated) institution dict for convenience.
    """
    # Initialize identifiers list if it doesn't exist
    if 'identifiers' not in institution:
        institution['identifiers'] = []

    # Add Wikidata identifier
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    }
    institution['identifiers'].append(wikidata_id)

    # Update provenance
    if 'provenance' not in institution:
        institution['provenance'] = {}
    institution['provenance']['last_updated'] = datetime.now(timezone.utc).isoformat()
    institution['provenance']['enrichment_batch'] = 11
    institution['provenance']['wikidata_match_confidence'] = confidence
    institution['provenance']['wikidata_match_reason'] = reason
    institution['provenance']['wikidata_name'] = wikidata_name

    return institution


def enrich_batch11() -> None:
    """Main enrichment function.

    Backs up the input dataset, applies each validated Wikidata match
    (skipping institutions that already have one or cannot be found),
    reports summary statistics and coverage, then writes the enriched
    dataset to OUTPUT_FILE.
    """
    print("=" * 80)
    print("Chilean GLAM Wikidata Enrichment - Batch 11")
    print("=" * 80)
    print()

    # Create backup
    print(f"Creating backup: {BACKUP_FILE}")
    shutil.copy2(INPUT_FILE, BACKUP_FILE)
    print()

    # Load data
    print(f"Loading dataset: {INPUT_FILE}")
    institutions = load_yaml(INPUT_FILE)
    print(f"  Loaded {len(institutions)} institutions")
    print()

    print(f"Loading validation results: {VALIDATION_FILE}")
    validation = load_json(VALIDATION_FILE)
    validated_matches = validation['validated_matches']
    print(f"  Loaded {len(validated_matches)} validated matches")
    print()

    # Statistics
    enriched_count = 0
    already_enriched = 0
    not_found = 0

    # Process each validated match
    print("Processing validated matches:")
    print("-" * 80)

    for match in validated_matches:
        museum_name = match['museum_name']
        city = match['city']
        q_number = match['q_number']
        wikidata_name = match['wikidata_name']
        confidence = match['confidence']
        reason = match['reason']

        print(f"\n{museum_name} ({city})")
        print(f"  → {q_number}: {wikidata_name}")

        # Find institution in dataset
        idx, institution = find_institution(institutions, museum_name, city)

        if institution is None:
            print(f"  ❌ NOT FOUND in dataset")
            not_found += 1
            continue

        # Check if already has Wikidata ID
        if has_wikidata_identifier(institution):
            print(f"  ⚠️  Already has Wikidata identifier (skipping)")
            already_enriched += 1
            continue

        # Add Wikidata identifier
        add_wikidata_identifier(institution, q_number, wikidata_name, confidence, reason)
        institutions[idx] = institution
        enriched_count += 1
        print(f"  ✅ Enriched with {q_number}")

    print()
    print("-" * 80)
    print(f"\nEnrichment Summary:")
    print(f"  Enriched: {enriched_count}")
    print(f"  Already enriched: {already_enriched}")
    print(f"  Not found: {not_found}")
    print()

    # Calculate coverage (guard against an empty dataset to avoid ZeroDivisionError)
    total_institutions = len(institutions)
    institutions_with_wikidata = sum(1 for inst in institutions if has_wikidata_identifier(inst))
    coverage_pct = (institutions_with_wikidata / total_institutions) * 100 if total_institutions else 0.0

    print(f"Coverage:")
    print(f"  Institutions with Wikidata: {institutions_with_wikidata}/{total_institutions} ({coverage_pct:.1f}%)")
    print()

    # Save enriched dataset
    print(f"Saving enriched dataset: {OUTPUT_FILE}")
    save_yaml(institutions, OUTPUT_FILE)
    print()

    print("=" * 80)
    print("Batch 11 Enrichment Complete!")
    print("=" * 80)
    print()
    print(f"✅ Updated: {OUTPUT_FILE}")
    print(f"📦 Backup: {BACKUP_FILE}")
    print()
    print(f"Progress: {institutions_with_wikidata}/{total_institutions} institutions ({coverage_pct:.1f}%)")
    print(f"Target: 63/90 (70.0%) - Need {63 - institutions_with_wikidata} more institutions")


if __name__ == "__main__":
    enrich_batch11()