#!/usr/bin/env python3
"""
Chilean Heritage Institutions - Batch 1 Wikidata Enrichment (Manual Verification)

Adds VERIFIED Wikidata Q-numbers to 13 Chilean institutions (diverse sample).

BATCH 1 TARGET INSTITUTIONS (13):

Museums (4):
 1. Museo Universidad de Tarapacá San Miguel de Azapa (MASMA) - Arica
 2. Museo de Historia Natural de Atacama - Atacama
 3. Museo Indígena Atacameño - Antofagasta
 4. Museo de Tocopilla - Antofagasta

Archives (3):
 5. Archivo Central Andrés Bello, Universidad de Chile - Santiago
 6. Archivo Central USACH - Santiago
 7. Archivo Histórico del Arzobispado de Santiago - Santiago

Libraries (3):
 8. Biblioteca Nacional Digital de Chile - Santiago
 9. Biblioteca Federico Varela, Universidad de Atacama - Atacama
10. CRA Escuela El Olivar - Arica

Education Providers (3):
11. Universidad de Tarapacá - Arica
12. Universidad Arturo Prat - Iquique
13. Universidad Católica del Norte, Sede San Pedro de Atacama - Antofagasta

STRATEGY:
- Query the Wikidata SPARQL endpoint for each institution
- Fuzzy-match candidates, with manual verification checkpoints (auto-accept at >= 85%)
- Export enriched YAML with provenance tracking
- Keep output LinkML schema-compliant

Coverage Goal: 0/90 (0%) → 13/90 (14.4%)
"""

import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import requests
import yaml
from rapidfuzz import fuzz

# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Batch 1: 13 institutions selected as a diverse sample.
# NOTE: name_pattern uses simplified strings that match the dataset's actual names.
BATCH_1_TARGETS = [
    # Museums (4)
    {
        'name_pattern': 'Museo Universidad de Tarapacá',
        'region': 'Arica',
        'city': 'Arica',
        'inst_type': 'MUSEUM',
        'wikidata_class': 'Q33506',  # museum
        'notes': 'MASMA - Archaeological museum with pre-Columbian collections'
    },
    {
        'name_pattern': 'Museo de Historia Natural de Atacama',
        'region': 'Atacama',
        'city': None,  # No city in dataset
        'inst_type': 'MUSEUM',
        'wikidata_class': 'Q33506',
        'notes': 'Natural history museum in Atacama region'
    },
    {
        'name_pattern': 'Museo Indígena Atacameño',
        'region': 'Antofagasta',
        'city': None,
        'inst_type': 'MUSEUM',
        'wikidata_class': 'Q33506',
        'notes': 'Indigenous cultural museum'
    },
    {
        'name_pattern': 'Museo de Tocopilla',
        'region': 'Antofagasta',
        'city': 'Tocopilla',
        'inst_type': 'MUSEUM',
        'wikidata_class': 'Q33506',
        'notes': 'Local museum in Tocopilla'
    },
    # Archives (3)
    {
        'name_pattern': 'Archivo Central Andrés Bello',
        'region': 'Metropolitana',
        'city': 'Santiago',
        'inst_type': 'ARCHIVE',
        'wikidata_class': 'Q166118',  # archive
        'notes': 'Universidad de Chile central archive'
    },
    {
        'name_pattern': 'Archivo Central USACH',
        'region': 'Metropolitana',
        'city': 'Santiago',
        'inst_type': 'ARCHIVE',
        'wikidata_class': 'Q166118',
        'notes': 'Universidad de Santiago de Chile archive'
    },
    {
        'name_pattern': 'Archivo Histórico del Arzobispado',
        'region': 'Metropolitana',
        'city': 'Santiago',
        'inst_type': 'ARCHIVE',
        'wikidata_class': 'Q166118',
        'notes': 'Archdiocese of Santiago historical archive'
    },
    # Libraries (3)
    {
        'name_pattern': 'Biblioteca Nacional Digital',
        'region': 'Metropolitana',
        'city': 'Santiago',
        'inst_type': 'LIBRARY',
        'wikidata_class': 'Q7075',  # library
        'notes': 'Digital platform of Biblioteca Nacional de Chile'
    },
    {
        'name_pattern': 'Biblioteca Federico Varela',
        'region': 'Atacama',
        'city': None,
        'inst_type': 'LIBRARY',
        'wikidata_class': 'Q7075',
        'notes': 'Universidad de Atacama library'
    },
    {
        'name_pattern': 'CRA Escuela El Olivar',
        'region': 'Arica',
        'city': 'Arica',
        'inst_type': 'LIBRARY',
        'wikidata_class': 'Q7075',
        'notes': 'School learning resource center'
    },
    # Education Providers (3)
    {
        'name_pattern': 'Universidad de Tarapacá',
        'region': 'Arica',
        'city': 'Arica',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',  # university
        'notes': 'Public university in Arica'
    },
    {
        'name_pattern': 'Universidad Arturo Prat',
        'region': 'Tarapacá',
        'city': 'Iquique',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Public university in Iquique'
    },
    {
        'name_pattern': 'Universidad Católica del Norte',
        'region': 'Antofagasta',
        'city': 'San Pedro de Atacama',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Catholic university campus in San Pedro de Atacama'
    },
]
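
# Illustrative shape of one record in chilean_institutions_geocoded_v2.yaml,
# as assumed by the matching and enrichment helpers below. Field names are
# inferred from the accesses in this script; the example values are
# placeholders, and real records may carry additional LinkML slots.
#
#   - name: Museo de Tocopilla
#     institution_type: MUSEUM
#     locations:
#       - region: Antofagasta
#         city: Tocopilla
#     identifiers: []
#     provenance:
#       extraction_method: manual curation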


def matches_institution(inst: Dict[str, Any], target: Dict[str, Any]) -> bool:
    """Check whether a dataset institution matches the target criteria."""
    # Name pattern check (case-insensitive fuzzy match)
    name = inst.get('name', '').lower()
    pattern = target['name_pattern'].lower()
    name_score = fuzz.partial_ratio(pattern, name)
    if name_score < 70:  # Low threshold for the initial dataset match
        return False

    # Institution type check
    if inst.get('institution_type') != target['inst_type']:
        return False

    # Region check (first location entry)
    locations = inst.get('locations', [])
    if not locations:
        return False
    region = locations[0].get('region', '')
    if target['region'] and region != target['region']:
        return False

    # City check, only if the target specifies one
    if target.get('city'):
        city = locations[0].get('city', '')
        if city and city != target['city']:
            return False

    return True


def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Check whether the institution already has a Wikidata identifier."""
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in inst.get('identifiers', [])
    )


def query_wikidata(name: str, region: str, inst_class: str) -> List[Dict[str, Any]]:
    """Query Wikidata for candidate institutions of the given class in Chile.

    NOTE: `name` and `region` are not embedded in the SPARQL query; the query
    deliberately fetches all Chilean items of `inst_class`, and candidates are
    narrowed afterwards by fuzzy matching against the institution name.
    """
    # Simplified query: all institutions of the requested class located in Chile
    query = f"""
    SELECT ?item ?itemLabel ?itemDescription ?viaf ?isil WHERE {{
      ?item wdt:P31/wdt:P279* wd:{inst_class} .
      ?item wdt:P17 wd:Q298 .  # Country: Chile
      OPTIONAL {{ ?item wdt:P214 ?viaf }}
      OPTIONAL {{ ?item wdt:P791 ?isil }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 100
    """

    headers = {
        'User-Agent': 'GLAM-Data-Extractor/0.1 (heritage-data-project)',
        'Accept': 'application/json'
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL,
            params={'query': query, 'format': 'json'},
            headers=headers,
            timeout=30
        )
        response.raise_for_status()

        results = response.json()
        bindings = results.get('results', {}).get('bindings', [])

        # Extract the relevant fields from each binding
        matches = []
        for binding in bindings:
            item_uri = binding.get('item', {}).get('value', '')
            q_number = item_uri.split('/')[-1] if item_uri else None
            if q_number:
                matches.append({
                    'q_number': q_number,
                    'label': binding.get('itemLabel', {}).get('value', ''),
                    'description': binding.get('itemDescription', {}).get('value', ''),
                    'viaf': binding.get('viaf', {}).get('value', None),
                    'isil': binding.get('isil', {}).get('value', None)
                })

        return matches

    except Exception as e:
        print(f"   ⚠️  Wikidata query error: {e}")
        return []
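
# For reference, each binding parsed above follows the standard WDQS JSON
# results format, roughly as sketched below. Values are illustrative
# placeholders, not results verified for any particular institution.
#
#   {
#       "item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q123456"},
#       "itemLabel": {"type": "literal", "xml:lang": "es", "value": "Museo ..."},
#       "itemDescription": {"type": "literal", "xml:lang": "es", "value": "museo en Chile"},
#       "viaf": {"type": "literal", "value": "123456789"}
#   }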


def fuzzy_match_wikidata(
    inst_name: str,
    wd_results: List[Dict[str, Any]]
) -> Tuple[Optional[Dict[str, Any]], float]:
    """Fuzzy-match an institution name against Wikidata candidate labels.

    Returns the best-scoring candidate (or None if no candidates were given)
    and its score; thresholding is left to the caller.
    """
    best_match = None
    best_score = 0

    for result in wd_results:
        wd_label = result['label']

        # Try several fuzzy matching strategies and keep the best score
        scores = [
            fuzz.ratio(inst_name.lower(), wd_label.lower()),
            fuzz.partial_ratio(inst_name.lower(), wd_label.lower()),
            fuzz.token_sort_ratio(inst_name.lower(), wd_label.lower())
        ]
        score = max(scores)

        if score > best_score:
            best_score = score
            best_match = result

    return best_match, best_score


def add_wikidata_identifier(inst: Dict[str, Any], q_number: str,
                            confidence: float, notes: str) -> Dict[str, Any]:
    """Add a Wikidata identifier to an institution record (in place).

    `notes` is accepted for symmetry with BATCH_1_TARGETS but is not currently
    written to the record.
    """
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    }

    if 'identifiers' not in inst:
        inst['identifiers'] = []
    inst['identifiers'].append(wikidata_id)

    # Record the enrichment in the provenance trail
    if 'provenance' in inst:
        old_method = inst['provenance'].get('extraction_method', '')
        enrichment_note = (
            " + Wikidata enrichment (Batch 1 manual verification, "
            f"confidence={confidence:.2f})"
        )
        inst['provenance']['extraction_method'] = f"{old_method}{enrichment_note}"

    return inst
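
# After enrichment, the appended identifier serializes to YAML roughly as
# sketched below (the Q-number is a placeholder, not a verified match):
#
#   identifiers:
#     - identifier_scheme: Wikidata
#       identifier_value: Q123456
#       identifier_url: https://www.wikidata.org/wiki/Q123456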


def main():
    data_file = (Path(__file__).parent.parent / 'data' / 'instances' / 'chile'
                 / 'chilean_institutions_geocoded_v2.yaml')
    backup_file = data_file.with_suffix('.batch1_backup')
    output_file = data_file.with_name('chilean_institutions_batch1_enriched.yaml')

    print("=" * 80)
    print("Chilean Heritage Institutions - Batch 1 Wikidata Enrichment")
    print("Manual Verification Session - November 9, 2025")
    print("Target: 13 institutions (diverse sample)")
    print("=" * 80)
    print()

    # Load data
    print(f"📂 Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"   Total institutions: {len(institutions)}")
    print()

    # Create backup
    print(f"💾 Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print()

    print("Enrichment Process:")
    print("-" * 80)

    # Process each target
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0
    manual_review_count = 0

    for i, target in enumerate(BATCH_1_TARGETS, 1):
        print(f"\n[{i}/{len(BATCH_1_TARGETS)}] 🔍 Searching: "
              f"{target['name_pattern']} ({target['region']})")

        # Find the matching institution in the dataset
        matched = None
        for inst in institutions:
            if matches_institution(inst, target):
                matched = inst
                break

        if not matched:
            print("   ❌ NOT FOUND in dataset")
            not_found_count += 1
            continue

        print(f"   ✓ Found: {matched.get('name')}")

        # Skip institutions that already have a Wikidata identifier
        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value']
                 for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f"   ⏭️  Already enriched with {existing_q}")
            skipped_count += 1
            continue

        # Query Wikidata
        print(f"   🌐 Querying Wikidata for {target['wikidata_class']} in Chile...")
        time.sleep(1)  # Rate limiting
        wd_results = query_wikidata(
            target['name_pattern'],
            target['region'],
            target['wikidata_class']
        )

        if not wd_results:
            print("   ⚠️  No Wikidata results found")
            manual_review_count += 1
            continue

        print(f"   📊 Found {len(wd_results)} Wikidata candidates")

        # Fuzzy match
        best_match, match_score = fuzzy_match_wikidata(matched['name'], wd_results)

        if not best_match:
            print("   ⚠️  No match candidate returned")
            manual_review_count += 1
            continue

        print(f"   🎯 Best match: {best_match['label']} ({best_match['q_number']})")
        print(f"      Similarity: {match_score:.1f}%")
        if best_match.get('description'):
            print(f"      Description: {best_match['description']}")

        # Manual verification checkpoint
        if match_score >= 85:
            print("   ✅ HIGH CONFIDENCE - Auto-accepting")
            add_wikidata_identifier(matched, best_match['q_number'],
                                    match_score / 100, target['notes'])
            enriched_count += 1
        elif match_score >= 70:
            print("   ⚠️  MEDIUM CONFIDENCE - Flagged for manual review")
            print("      To accept, manually verify: "
                  f"https://www.wikidata.org/wiki/{best_match['q_number']}")
            manual_review_count += 1
        else:
            print("   ❌ LOW CONFIDENCE - Skipping")
            manual_review_count += 1

    print()
    print("=" * 80)
    print("Batch 1 Summary:")
    print("-" * 80)
    print(f"✅ Auto-enriched: {enriched_count}")
    print(f"⚠️  Manual review: {manual_review_count}")
    print(f"⏭️  Already enriched: {skipped_count}")
    print(f"❌ Not found: {not_found_count}")

    # Calculate coverage
    with_wikidata = sum(1 for inst in institutions if has_wikidata(inst))
    print()
    print("Chilean Institution Coverage:")
    print(f"  Total: {len(institutions)}")
    print(f"  With Wikidata: {with_wikidata} ({with_wikidata / len(institutions) * 100:.1f}%)")
    print(f"  Without: {len(institutions) - with_wikidata}")

    # Save only if something was enriched
    if enriched_count > 0:
        print()
        print(f"💾 Saving enriched data to: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False,
                      default_flow_style=False)

        print()
        print("✅ Batch 1 enrichment complete!")
        print()
        print("NEXT STEPS:")
        print("1. Review manual verification candidates")
        print("2. Create Batch 2 with remaining high-priority institutions")
        print("3. Continue iterating until 80%+ coverage")
    else:
        print()
        print("⚠️  No automatic enrichments - all candidates require manual review")
        print("   Review the candidates above and add Q-numbers manually for confirmed matches")


if __name__ == '__main__':
    main()