#!/usr/bin/env python3
"""
Query Wikidata for Chilean Archives using SPARQL

Uses Wikidata Query Service to find archives in Chile with their Q-numbers,
matches them against a local YAML list of institutions, and exports candidate
matches to JSON for review.
"""
import json
from pathlib import Path
from typing import Dict, List

import yaml
from SPARQLWrapper import SPARQLWrapper, JSON

# Words too generic to count as evidence of a name match (Strategy 3).
_INSIGNIFICANT_WORDS = {
    'de', 'del', 'la', 'el', 'archivo', 'archives',
    'historico', 'histórico', 'national', 'nacional', 's',
}


def query_chilean_archives() -> List[Dict]:
    """Query Wikidata for all archives in Chile.

    Returns:
        A list of dicts with keys ``q_number``, ``name``, ``city``,
        ``founded`` (4-char year or empty string) and ``wikidata_url``.
        Returns an empty list if the query fails.
    """
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    # SPARQL query for archives in Chile
    # P31 = instance of, P279 = subclass of, P17 = country, Q298 = Chile
    # Q166118 = archive institution
    query = """
    SELECT DISTINCT ?archive ?archiveLabel ?cityLabel ?coords ?founded WHERE {
      # Archive types (including subclasses)
      ?archive wdt:P31/wdt:P279* wd:Q166118 .
      # Located in Chile
      ?archive wdt:P17 wd:Q298 .
      # Get city/location
      OPTIONAL { ?archive wdt:P131 ?city . }
      # Get coordinates
      OPTIONAL { ?archive wdt:P625 ?coords . }
      # Get founding date
      OPTIONAL { ?archive wdt:P571 ?founded . }
      # Get labels in Spanish and English
      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" . }
    }
    ORDER BY ?archiveLabel
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    print("🔍 Querying Wikidata for Chilean archives...")
    print(" Endpoint: https://query.wikidata.org/sparql")
    print()

    try:
        results = sparql.query().convert()  # type: ignore
        archives = []
        for result in results["results"]["bindings"]:  # type: ignore
            archive_uri = result["archive"]["value"]  # type: ignore
            # URI looks like http://www.wikidata.org/entity/Q12345 — keep the Q-id.
            q_number = archive_uri.split("/")[-1]
            archives.append({
                "q_number": q_number,
                "name": result.get("archiveLabel", {}).get("value", ""),  # type: ignore
                "city": result.get("cityLabel", {}).get("value", ""),  # type: ignore
                # Keep only the year from an ISO timestamp like "1927-01-01T00:00:00Z".
                "founded": result.get("founded", {}).get("value", "")[:4] if "founded" in result else "",  # type: ignore
                "wikidata_url": f"https://www.wikidata.org/wiki/{q_number}",
            })
        return archives
    except Exception as e:
        # Best-effort script: report the failure and let the caller bail out.
        print(f"❌ Error querying Wikidata: {e}")
        return []


def load_chilean_institutions(file_path: Path) -> List[Dict]:
    """Load Chilean institutions from YAML file.

    Returns an empty list for an empty file (``yaml.safe_load`` yields
    ``None`` in that case, which would break ``len()`` downstream).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or []


def normalize_name(name: str) -> str:
    """Normalize institution name for matching.

    Lowercases, drops apostrophes, and collapses whitespace runs.

    BUG FIX: the original did ``.replace(" ", " ")`` — a no-op replacing a
    space with a space — so names differing only in double spaces never
    matched. Split/join collapses all whitespace runs (and strips ends).
    """
    return " ".join(name.lower().replace("'", "").split())


def _has_wikidata_id(inst: Dict) -> bool:
    """Return True if the institution already carries a Wikidata identifier."""
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in inst.get('identifiers', [])
    )


def _names_match(inst_name: str, wd_name: str) -> bool:
    """Decide whether two *normalized* names plausibly denote the same archive.

    Applies the matching strategies in order; any hit wins (pure OR, so the
    early returns are equivalent to the original if/elif cascade).
    """
    # Strategy 1: Exact match
    if inst_name == wd_name:
        return True
    # Strategy 2: Partial match (institution name contains Wikidata name or vice versa)
    if inst_name in wd_name or wd_name in inst_name:
        return True
    # Strategy 3: Key words match (archivo/archive + significant word)
    if (('archivo' in inst_name or 'archive' in inst_name)
            and ('archivo' in wd_name or 'archive' in wd_name)):
        common_words = set(inst_name.split()) & set(wd_name.split())
        # Must share at least 1 significant word beyond the stop-word list.
        if len(common_words - _INSIGNIFICANT_WORDS) >= 1:
            return True
    # Strategy 4: "Archivo Nacional" special case (high-value institution)
    if 'nacional' in inst_name and ('archivo' in inst_name or 'archive' in inst_name):
        if 'nacional' in wd_name or 'national' in wd_name:
            return True
    # Strategy 5: University archives (USACH, Universidad de Chile)
    if 'universidad' in inst_name or 'university' in inst_name:
        if 'usach' in inst_name and 'usach' in wd_name:
            return True
        if 'chile' in inst_name and 'chile' in wd_name:
            return True
    # Strategy 6: Diocese/Church archives (Arzobispado, Diócesis)
    if 'diocesis' in inst_name or 'arzobispado' in inst_name:
        if 'diocese' in wd_name or 'diocesis' in wd_name or 'arzobispado' in wd_name:
            return True
    return False


def find_matches(institutions: List[Dict], wikidata_archives: List[Dict]) -> List[Dict]:
    """Find matches between our institutions and Wikidata archives.

    Only ARCHIVE institutions lacking a Wikidata identifier are considered;
    at most one (the first) Wikidata candidate is kept per institution.

    Returns:
        A list of match dicts with keys ``institution``, ``wikidata``,
        ``name_confidence`` ('exact'/'partial') and ``city_match``.
    """
    matches = []

    # Filter institutions without Wikidata
    archives_without_wd = [
        inst for inst in institutions
        if inst.get('institution_type') == 'ARCHIVE' and not _has_wikidata_id(inst)
    ]

    print(f"📊 Matching {len(archives_without_wd)} institutions against {len(wikidata_archives)} Wikidata entries...")
    print()

    for inst in archives_without_wd:
        inst_name = normalize_name(inst['name'])
        inst_city = inst.get('locations', [{}])[0].get('city', '').lower()

        for wd_archive in wikidata_archives:
            wd_name = normalize_name(wd_archive['name'])
            wd_city = wd_archive['city'].lower()

            name_match = _names_match(inst_name, wd_name)

            # City match (flexible - allows partial matches)
            city_match = bool(
                inst_city and wd_city
                and (inst_city in wd_city or wd_city in inst_city)
            )

            # Accept match if name matches and either city matches or no city info
            # Allow special exceptions for national/well-known institutions
            allow_match = (
                city_match
                or not wd_city
                or 'nacional' in inst_name
                or 'national' in wd_name
            )

            if name_match and allow_match:
                matches.append({
                    'institution': inst,
                    'wikidata': wd_archive,
                    'name_confidence': 'exact' if inst_name == wd_name else 'partial',
                    'city_match': city_match,
                })
                break  # Only take first match per institution

    return matches


def main():
    """Run the query, match against the local YAML batch, and export JSON."""
    print("=" * 80)
    print("WIKIDATA SPARQL QUERY - CHILEAN ARCHIVES")
    print("=" * 80)
    print()

    # Query Wikidata
    wikidata_archives = query_chilean_archives()

    if not wikidata_archives:
        print("❌ No results from Wikidata")
        return

    print(f"✅ Found {len(wikidata_archives)} archives in Wikidata")
    print()

    # Show sample
    print("Sample results (first 10):")
    for i, archive in enumerate(wikidata_archives[:10], 1):
        print(f" {i}. {archive['name']} ({archive['city']}) → {archive['q_number']}")
    print()

    # Load our institutions
    input_file = Path('data/instances/chile/chilean_institutions_batch8_enriched.yaml')
    institutions = load_chilean_institutions(input_file)

    print(f"📖 Loaded {len(institutions)} Chilean institutions")
    archives_count = sum(1 for i in institutions if i.get('institution_type') == 'ARCHIVE')
    print(f" {archives_count} are archives")

    with_wikidata = sum(
        1 for inst in institutions
        if inst.get('institution_type') == 'ARCHIVE' and _has_wikidata_id(inst)
    )
    print(f" {with_wikidata} already have Wikidata")
    print(f" {archives_count - with_wikidata} need enrichment")
    print()

    # Find matches
    matches = find_matches(institutions, wikidata_archives)

    print("=" * 80)
    print(f"MATCHING RESULTS: {len(matches)} potential matches found")
    print("=" * 80)
    print()

    # Display matches
    for i, match in enumerate(matches, 1):
        inst = match['institution']
        wd = match['wikidata']
        print(f"{i}. {inst['name']}")
        print(f" Our city: {inst.get('locations', [{}])[0].get('city', 'Unknown')}")
        print(f" ↓ MATCH ({match['name_confidence']} name, city: {match['city_match']})")
        print(f" Wikidata: {wd['name']} ({wd['city']})")
        print(f" Q-number: {wd['q_number']}")
        if wd['founded']:
            print(f" Founded: {wd['founded']}")
        print()

    # Export matches to JSON for batch processing
    output_file = Path('data/instances/chile/wikidata_matches_batch9_archives.json')
    match_data = [
        {
            'institution_name': match['institution']['name'],
            'institution_city': match['institution'].get('locations', [{}])[0].get('city', ''),
            'q_number': match['wikidata']['q_number'],
            'wikidata_name': match['wikidata']['name'],
            'wikidata_city': match['wikidata']['city'],
            'founded': match['wikidata']['founded'],
            'confidence': match['name_confidence'],
            'city_match': match['city_match'],
        }
        for match in matches
    ]
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(match_data, f, indent=2, ensure_ascii=False)

    print(f"💾 Saved {len(matches)} matches to: {output_file}")
    print()
    print("🎯 Next step: Review matches and create Batch 9 enrichment script")


if __name__ == '__main__':
    main()