#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 8 Wikidata Enrichment (Libraries)

Uses bulk SPARQL matches from query_wikidata_chilean_libraries.py
2 libraries with verified Q-numbers from Wikidata Query Service
Target: 54/90 institutions (60% coverage)
"""

from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Placeholder city value; treated the same as a missing city.
UNKNOWN_CITY = "Unknown"

# Batch 8: 2 libraries from SPARQL bulk query
BATCH_8_ENRICHMENTS = [
    {
        "name": "Biblioteca Nacional Digital",
        "city": "Iquique",
        "q_number": "Q18924152",
        "wikidata_name": "Biblioteca Nacional Digital de Chile",
        "confidence": "partial",
        "notes": (
            "SPARQL match - partial name (full official title in Wikidata). "
            "Note: City may be incorrect in our data - this is a national "
            "digital library, not specific to Iquique."
        ),
    },
    {
        "name": "William Mulloy Library",
        "city": "Isla de Pascua",  # Updated from Unknown
        "q_number": "Q8015912",
        "wikidata_name": "Biblioteca William Mulloy",
        "confidence": "partial",
        "founded": "2002",
        "notes": (
            "SPARQL match - partial name (Spanish vs English). "
            "Easter Island archaeological library."
        ),
    },
]


def load_yaml(file_path: Path) -> list:
    """Load a YAML file and return its top-level content.

    An empty file (which ``yaml.safe_load`` parses as ``None``) is returned
    as an empty list so callers can always take ``len()`` of the result.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or []


def save_yaml(data: list, file_path: Path) -> None:
    """Write ``data`` to ``file_path`` as block-style, unicode-preserving YAML."""
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            data,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,  # keep the key order of the loaded records
            width=120,
            indent=2,
        )


def _has_wikidata(inst: dict) -> bool:
    """Return True if ``inst`` already carries a Wikidata identifier."""
    # ``or []`` also covers an explicit ``identifiers: null`` in the YAML.
    return any(
        isinstance(id_obj, dict)
        and id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in inst.get('identifiers') or []
    )


def _percent(part: int, whole: int) -> float:
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0.0


def find_institution(institutions: list, name: str, city: str) -> dict:
    """Find an institution record by exact name and, optionally, city.

    Args:
        institutions: Institution records (dicts) to search.
        name: Exact value expected under the record's ``name`` key.
        city: Expected city of the record's first location. An empty value
            or ``"Unknown"`` disables the city check.

    Returns:
        The first matching record (the same dict object, not a copy).

    Raises:
        ValueError: If no record matches.
    """
    for inst in institutions:
        if inst.get('name') != name:
            continue
        if not city or city == UNKNOWN_CITY:
            return inst
        # A missing, null or empty ``locations`` list yields an empty city
        # instead of raising IndexError/TypeError.
        locations = inst.get('locations') or [{}]
        inst_city = (locations[0] or {}).get('city', '')
        # "Isla de Pascua" is accepted regardless of the stored city
        # (Easter Island match).
        if inst_city == city or city == "Isla de Pascua":
            return inst
    raise ValueError(f"Institution not found: {name} ({city})")


def enrich_institution(inst: dict, enrichment: dict) -> bool:
    """Add a Wikidata identifier and provenance details to ``inst`` in place.

    Args:
        inst: Institution record to modify.
        enrichment: Entry with ``q_number``, ``wikidata_name``,
            ``confidence`` and ``notes`` keys, and optionally ``city``.

    Returns:
        True if the record was enriched; False if it already had a Wikidata
        identifier and was left untouched.
    """
    if _has_wikidata(inst):
        print(f" ⚠️ {inst['name']} already has Wikidata identifier")
        return False

    # Add Wikidata identifier
    q_number = enrichment['q_number']
    identifiers = inst.get('identifiers')
    if identifiers is None:  # key missing or explicitly null
        identifiers = inst['identifiers'] = []
    identifiers.append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f"https://www.wikidata.org/wiki/{q_number}",
    })

    # Update city if it was missing or the "Unknown" placeholder
    new_city = enrichment.get('city')
    if new_city and new_city != UNKNOWN_CITY:
        locations = inst.get('locations') or []
        if locations and isinstance(locations[0], dict):
            current_city = locations[0].get('city')
            if not current_city or current_city == UNKNOWN_CITY:
                locations[0]['city'] = new_city
                print(f" 📍 Updated city to: {new_city}")

    # Update provenance
    provenance = inst.get('provenance')
    if provenance is None:  # key missing or explicitly null
        provenance = inst['provenance'] = {}
    provenance['enrichment_method'] = (
        'Wikidata SPARQL bulk query (Batch 8 - Libraries)'
    )
    provenance['enrichment_date'] = datetime.now(timezone.utc).isoformat()
    provenance['wikidata_match_confidence'] = enrichment['confidence']

    # Add notes, normalising a missing/null or scalar value to a list first
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif not isinstance(notes, list):
        notes = [notes]
    notes.append(f"Batch 8: {enrichment['notes']}")
    provenance['notes'] = notes

    print(f" ✅ Added Wikidata: {q_number} ({enrichment['wikidata_name']})")
    return True


def main():
    """Apply the Batch 8 enrichments and print a coverage summary.

    Reads the batch 7 YAML file, writes a backup next to it, applies every
    entry of ``BATCH_8_ENRICHMENTS``, and saves the result as the batch 8
    file. Paths are relative to the current working directory.
    """
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 8 ENRICHMENT (LIBRARIES)")
    print("=" * 80)
    print()

    # Load data
    input_file = Path(
        'data/instances/chile/chilean_institutions_batch7_enriched.yaml'
    )
    print(f"📖 Loading: {input_file}")
    institutions = load_yaml(input_file)
    print(f" Loaded {len(institutions)} institutions")
    print()

    # Create backup
    backup_file = input_file.with_suffix('.yaml.batch8_backup')
    print(f"💾 Creating backup: {backup_file}")
    save_yaml(institutions, backup_file)
    print()

    # Apply enrichments
    print(f"🔧 Applying {len(BATCH_8_ENRICHMENTS)} enrichments...")
    print()

    enriched_count = 0
    for i, enrichment in enumerate(BATCH_8_ENRICHMENTS, 1):
        print(f"{i}. {enrichment['name']} ({enrichment['city']})")
        # Best effort: one failing entry must not abort the whole batch.
        try:
            inst = find_institution(
                institutions, enrichment['name'], enrichment['city']
            )
            # Count only records that were actually modified, not ones that
            # were skipped because they already had a Wikidata identifier.
            if enrich_institution(inst, enrichment):
                enriched_count += 1
        except ValueError as e:
            print(f" ❌ {e}")
        except Exception as e:
            print(f" ❌ Error: {e}")
        print()

    # Save enriched data
    output_file = Path(
        'data/instances/chile/chilean_institutions_batch8_enriched.yaml'
    )
    print(f"💾 Saving enriched data: {output_file}")
    save_yaml(institutions, output_file)
    print()

    # Statistics
    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print()

    total = len(institutions)
    with_wikidata = sum(1 for inst in institutions if _has_wikidata(inst))
    coverage = _percent(with_wikidata, total)

    print(f"Total institutions: {total}")
    print(f"With Wikidata: {with_wikidata} ({coverage:.1f}%)")
    print(f"Batch 8 enrichments: {enriched_count}")
    print()

    # By type
    by_type = defaultdict(lambda: {'total': 0, 'with_wd': 0})
    for inst in institutions:
        inst_type = inst.get('institution_type', 'UNKNOWN')
        by_type[inst_type]['total'] += 1
        if _has_wikidata(inst):
            by_type[inst_type]['with_wd'] += 1

    print("Coverage by type:")
    for inst_type in sorted(by_type.keys()):
        stats = by_type[inst_type]
        pct = _percent(stats['with_wd'], stats['total'])
        status = "✅" if pct == 100 else "⭐" if pct >= 50 else ""
        print(
            f" {status} {inst_type}: "
            f"{stats['with_wd']}/{stats['total']} ({pct:.1f}%)"
        )

    print()
    print("🎉 Batch 8 enrichment complete!")
    print(f"📊 New coverage: {with_wikidata}/{total} ({coverage:.1f}%)")


if __name__ == '__main__':
    main()