#!/usr/bin/env python3
"""
Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (Universities Focus)

Improved strategy based on Batch 1 learnings:
- Focus on universities (excellent Wikidata coverage)
- Better name normalization (strip possessives, handle word order)
- Geographic filtering in SPARQL queries
- Higher success rate expected (universities have standardized names)

BATCH 2 TARGET INSTITUTIONS (5 major universities):
1. Universidad de Chile - Santiago
2. Universidad de Santiago de Chile (USACH) - Santiago
3. Universidad de Concepción - Concepción
4. Universidad Austral de Chile - Valdivia
5. Pontificia Universidad Católica de Chile - Santiago

SUCCESS CRITERIA:
- Batch 1: 2/90 with Wikidata (2.2%)
- Goal: 7/90 with Wikidata (7.8%)
- Expected success rate: 100% for universities

IMPROVEMENTS FROM BATCH 1:
1. Name normalization: Remove "'s", "Universidad's" -> "Universidad"
2. Geographic filtering: Add city/region to SPARQL query
3. Multiple name variants: Try both full and abbreviated names
4. Better fuzzy matching: Use token_set_ratio for word order variations
"""

import re
import time
from datetime import datetime, timezone  # noqa: F401 -- currently unused; kept for parity with sibling batch scripts
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import requests
import yaml
from rapidfuzz import fuzz

# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Batch 2: Major Chilean universities (high success probability).
# Each target carries the dataset-matching criteria (name variants, type,
# location) plus the Wikidata class used to constrain the SPARQL query.
BATCH_2_TARGETS = [
    {
        'name_pattern': 'Universidad de Chile',
        'name_variants': ['Universidad de Chile', 'U. de Chile', 'UChile'],
        'region': 'Santiago',
        'city': 'Santiago',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',  # university
        'notes': 'Oldest and most prestigious public university in Chile (founded 1842)'
    },
    {
        'name_pattern': 'Universidad de Santiago de Chile',
        'name_variants': ['Universidad de Santiago de Chile', 'USACH', 'U. de Santiago'],
        'region': 'Santiago',
        'city': 'Santiago',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Major public university in Santiago (founded 1849 as Escuela de Artes y Oficios)'
    },
    {
        'name_pattern': 'Universidad de Concepción',
        'name_variants': ['Universidad de Concepción', 'UdeC', 'U. de Concepción'],
        'region': 'Concepción',
        'city': 'Concepción',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Third oldest university in Chile (founded 1919)'
    },
    {
        'name_pattern': 'Universidad Austral de Chile',
        'name_variants': ['Universidad Austral de Chile', 'UACh', 'U. Austral'],
        'region': 'Valdivia',
        'city': 'Valdivia',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Public university in southern Chile (founded 1954)'
    },
    {
        'name_pattern': 'Pontificia Universidad Católica de Chile',
        'name_variants': [
            'Pontificia Universidad Católica de Chile',
            'UC Chile',
            'PUC',
            'Universidad Católica de Chile'
        ],
        'region': 'Santiago',
        'city': 'Santiago',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Leading private Catholic university (founded 1888)'
    }
]


def normalize_name(name: str) -> str:
    """Normalize institution name for better matching.

    Strips English possessive markers ("'s"), trims surrounding
    whitespace, and collapses internal whitespace runs to single spaces.
    """
    # Remove possessive markers
    name = re.sub(r"'s\b", "", name)
    # Remove leading/trailing whitespace
    name = name.strip()
    # Normalize whitespace
    name = re.sub(r'\s+', ' ', name)
    return name


def matches_institution(inst: Dict[str, Any], target: Dict[str, Any]) -> bool:
    """Check if institution matches target criteria with improved name matching.

    An institution matches when all three hold:
    1. Its normalized name fuzzily matches any target name variant
       (best of ratio / partial_ratio / token_set_ratio >= 75).
    2. Its ``institution_type`` equals the target's ``inst_type``.
    3. Its first location's city or region matches the target's city or
       region (either field may match either, to tolerate swapped data).
    """
    # Normalize institution name
    inst_name = normalize_name(inst.get('name', '')).lower()

    # Check against all name variants
    name_variants = target.get('name_variants', [target['name_pattern']])
    matched_name = False
    for variant in name_variants:
        normalized_variant = normalize_name(variant).lower()
        # Try multiple fuzzy matching strategies
        scores = [
            fuzz.ratio(inst_name, normalized_variant),
            fuzz.partial_ratio(inst_name, normalized_variant),
            fuzz.token_set_ratio(inst_name, normalized_variant)
        ]
        max_score = max(scores)
        if max_score >= 75:  # Lower threshold to catch variations
            matched_name = True
            break

    if not matched_name:
        return False

    # Check institution type
    if inst.get('institution_type') != target['inst_type']:
        return False

    # Check location (region or city) -- only the first location is considered
    locations = inst.get('locations', [])
    if not locations:
        return False

    location = locations[0]
    region = location.get('region', '')
    city = location.get('city', '')

    # Match by region or city (cross-matched, since source data sometimes
    # stores the city name in the region field and vice versa)
    target_region = target.get('region', '')
    target_city = target.get('city', '')

    location_match = False
    if target_region and (region == target_region or city == target_region):
        location_match = True
    if target_city and (city == target_city or region == target_city):
        location_match = True

    return location_match


def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Check if institution already has Wikidata identifier."""
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in inst.get('identifiers', [])
    )


def query_wikidata_with_location(
    name_variants: List[str],
    city: Optional[str],
    inst_class: str
) -> List[Dict[str, Any]]:
    """Query Wikidata with geographic filtering for better precision.

    Fetches up to 100 entities of class ``inst_class`` located in Chile
    (optionally restricted to a known city via P131*), with optional VIAF
    (P214) and ISIL (P791) identifiers.

    Note: ``name_variants`` is accepted for interface symmetry with the
    matching helpers but is NOT used in the SPARQL query itself -- name
    filtering happens client-side via fuzzy matching on the results.

    Returns a list of dicts with keys: q_number, label, description,
    viaf, isil. Returns an empty list on any request/parse error.
    """
    # Build filter for city if provided
    city_filter = ""
    if city:
        # Map Chilean city names to Wikidata Q-numbers (add as needed)
        city_mapping = {
            'Santiago': 'Q2887',
            'Concepción': 'Q5775',
            'Valdivia': 'Q3883'
        }
        if city in city_mapping:
            city_q = city_mapping[city]
            city_filter = f"""
            ?item wdt:P131* wd:{city_q} .  # Located in or subdivision of city
            """

    query = f"""
    SELECT ?item ?itemLabel ?itemDescription ?viaf ?isil WHERE {{
        ?item wdt:P31/wdt:P279* wd:{inst_class} .
        ?item wdt:P17 wd:Q298 .  # Country: Chile
        {city_filter}
        OPTIONAL {{ ?item wdt:P214 ?viaf }}
        OPTIONAL {{ ?item wdt:P791 ?isil }}
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 100
    """

    headers = {
        'User-Agent': 'GLAM-Data-Extractor/0.2 (heritage-data-project; batch2-universities)',
        'Accept': 'application/json'
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL,
            params={'query': query, 'format': 'json'},
            headers=headers,
            timeout=30
        )
        response.raise_for_status()

        results = response.json()
        bindings = results.get('results', {}).get('bindings', [])

        # Extract relevant fields
        matches = []
        for binding in bindings:
            item_uri = binding.get('item', {}).get('value', '')
            # URI looks like http://www.wikidata.org/entity/Q123 -- take the Q-number
            q_number = item_uri.split('/')[-1] if item_uri else None
            if q_number:
                matches.append({
                    'q_number': q_number,
                    'label': binding.get('itemLabel', {}).get('value', ''),
                    'description': binding.get('itemDescription', {}).get('value', ''),
                    'viaf': binding.get('viaf', {}).get('value', None),
                    'isil': binding.get('isil', {}).get('value', None)
                })

        return matches

    except Exception as e:
        # Best-effort: report and return no candidates rather than abort the batch
        print(f"   ⚠️  Wikidata query error: {e}")
        return []


def fuzzy_match_wikidata_improved(
    inst_name: str,
    name_variants: List[str],
    wd_results: List[Dict[str, Any]]
) -> Tuple[Optional[Dict[str, Any]], float]:
    """Improved fuzzy matching with multiple strategies.

    Scores every Wikidata candidate label against both the dataset
    institution name and all target name variants, using four RapidFuzz
    scorers; the candidate with the highest single score wins.

    Returns ``(best_match, best_score)`` where ``best_match`` is None only
    when ``wd_results`` is empty. Thresholding is the caller's job.
    """
    best_match = None
    best_score = 0

    # Normalize institution name
    inst_name_norm = normalize_name(inst_name).lower()

    for result in wd_results:
        wd_label = normalize_name(result['label']).lower()

        # Try matching against institution name
        scores = [
            fuzz.ratio(inst_name_norm, wd_label),
            fuzz.partial_ratio(inst_name_norm, wd_label),
            fuzz.token_set_ratio(inst_name_norm, wd_label),
            fuzz.token_sort_ratio(inst_name_norm, wd_label)
        ]

        # Also try matching against target name variants
        for variant in name_variants:
            variant_norm = normalize_name(variant).lower()
            scores.extend([
                fuzz.ratio(variant_norm, wd_label),
                fuzz.token_set_ratio(variant_norm, wd_label),
                fuzz.token_sort_ratio(variant_norm, wd_label)
            ])

        score = max(scores)
        if score > best_score:
            best_score = score
            best_match = result

    return best_match, best_score


def add_wikidata_identifier(
    inst: Dict[str, Any],
    q_number: str,
    confidence: float,
    notes: str
) -> Dict[str, Any]:
    """Add Wikidata identifier to institution with provenance tracking.

    Mutates ``inst`` in place (appends to ``identifiers``, amends
    ``provenance.extraction_method``) and also returns it for chaining.

    ``notes`` is accepted for future provenance use but is currently
    not recorded.
    """
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    }

    if 'identifiers' not in inst:
        inst['identifiers'] = []
    inst['identifiers'].append(wikidata_id)

    # Update provenance
    if 'provenance' in inst:
        old_method = inst['provenance'].get('extraction_method', '')
        enrichment_note = (
            f" + Wikidata enrichment (Batch 2 universities, confidence={confidence:.2f})"
        )
        inst['provenance']['extraction_method'] = f"{old_method}{enrichment_note}"

    return inst


def main():
    """Run the Batch 2 enrichment pass end-to-end.

    Loads the Batch 1 enriched YAML, backs it up, attempts to match and
    enrich each BATCH_2_TARGETS entry via Wikidata, then writes the
    result to a new Batch 2 output file (only if anything was enriched).
    """
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'chile' / 'chilean_institutions_batch1_enriched.yaml'
    backup_file = data_file.with_suffix('.batch2_backup')
    output_file = data_file.with_name('chilean_institutions_batch2_enriched.yaml')

    print("=" * 80)
    print("Chilean Heritage Institutions - Batch 2 Wikidata Enrichment")
    print("Universities Focus - Improved Matching Strategy")
    print("Session: November 9, 2025")
    print("Target: 5 major universities")
    print("=" * 80)
    print()

    # Load data
    print(f"📂 Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"   Total institutions: {len(institutions)}")

    # Check existing Wikidata coverage
    with_wikidata_before = sum(1 for inst in institutions if has_wikidata(inst))
    print(f"   Current Wikidata coverage: {with_wikidata_before}/{len(institutions)} ({with_wikidata_before/len(institutions)*100:.1f}%)")
    print()

    # Create backup
    print(f"💾 Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print()

    print("Batch 2 Enrichment Process:")
    print("-" * 80)

    # Process each target
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0
    manual_review_count = 0

    for i, target in enumerate(BATCH_2_TARGETS, 1):
        print(f"\n[{i}/{len(BATCH_2_TARGETS)}] 🎓 Searching: {target['name_pattern']}")
        print(f"   Location: {target['city']}, {target['region']}")
        print(f"   Name variants: {', '.join(target['name_variants'][:3])}")

        # Find matching institution in dataset
        matched = None
        for inst in institutions:
            if matches_institution(inst, target):
                matched = inst
                break

        if not matched:
            print(f"   ❌ NOT FOUND in dataset")
            print(f"   (Check if institution name matches any variant)")
            not_found_count += 1
            continue

        print(f"   ✓ Found: {matched.get('name')}")

        # Check if already has Wikidata
        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f"   ⏭️  Already enriched with {existing_q}")
            skipped_count += 1
            continue

        # Query Wikidata with location filtering
        print(f"   🌐 Querying Wikidata (universities in {target['city']})...")
        time.sleep(1.5)  # Rate limiting

        wd_results = query_wikidata_with_location(
            target['name_variants'],
            target.get('city'),
            target['wikidata_class']
        )

        if not wd_results:
            print(f"   ⚠️  No Wikidata results found")
            manual_review_count += 1
            continue

        print(f"   📊 Found {len(wd_results)} Wikidata candidates")

        # Improved fuzzy matching
        best_match, match_score = fuzzy_match_wikidata_improved(
            matched['name'],
            target['name_variants'],
            wd_results
        )

        if not best_match:
            print(f"   ⚠️  No good match found (threshold < 70)")
            manual_review_count += 1
            continue

        print(f"   🎯 Best match: {best_match['label']} ({best_match['q_number']})")
        print(f"      Similarity: {match_score:.1f}%")
        if best_match.get('description'):
            print(f"      Description: {best_match['description']}")
        if best_match.get('viaf'):
            print(f"      VIAF: {best_match['viaf']}")

        # Confidence-based decision
        if match_score >= 85:
            print(f"   ✅ HIGH CONFIDENCE - Auto-accepting")
            add_wikidata_identifier(
                matched,
                best_match['q_number'],
                match_score / 100,
                target['notes']
            )
            enriched_count += 1
        elif match_score >= 75:
            print(f"   ⚠️  MEDIUM CONFIDENCE - Needs manual verification")
            print(f"      Verify at: https://www.wikidata.org/wiki/{best_match['q_number']}")
            manual_review_count += 1
        else:
            print(f"   ❌ LOW CONFIDENCE - Skipping")
            manual_review_count += 1

    print()
    print("=" * 80)
    print("Batch 2 Summary:")
    print("-" * 80)
    print(f"✅ Auto-enriched: {enriched_count}")
    print(f"⚠️  Manual review: {manual_review_count}")
    print(f"⏭️  Already enriched: {skipped_count}")
    print(f"❌ Not found: {not_found_count}")

    # Calculate updated coverage
    with_wikidata_after = sum(1 for inst in institutions if has_wikidata(inst))
    print()
    print("Chilean Institution Coverage:")
    print(f"   Total: {len(institutions)}")
    print(f"   Before Batch 2: {with_wikidata_before} ({with_wikidata_before/len(institutions)*100:.1f}%)")
    print(f"   After Batch 2: {with_wikidata_after} ({with_wikidata_after/len(institutions)*100:.1f}%)")
    print(f"   Improvement: +{with_wikidata_after - with_wikidata_before} institutions")

    # Save if any enrichments
    if enriched_count > 0:
        print()
        print(f"💾 Saving enriched data to: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        print()
        print("✅ Batch 2 enrichment complete!")
        print()
        print("NEXT STEPS:")
        print("1. Review medium-confidence candidates")
        print("2. Create Batch 3 targeting major museums:")
        print("   - Museo Histórico y Antropológico (Valdivia)")
        print("   - Museo Colchagua (Santa Cruz)")
        print("   - Museo Gabriela Mistral (Vicuña)")
        print("   - Museo Antropológico Padre Sebastián Englert (Easter Island)")
        print("   - Casa Museo Isla Negra (Pablo Neruda)")
        print("3. Continue until 20+ institutions enriched (22% coverage)")
    else:
        print()
        print("⚠️  No automatic enrichments - all require manual review")
        print()
        print("DEBUGGING TIPS:")
        print("1. Check if institution names in dataset match target name_variants")
        print("2. Verify institution_type field matches target")
        print("3. Check if location (city/region) matches target")
        print("4. Review name normalization logic in matches_institution()")


if __name__ == '__main__':
    main()