#!/usr/bin/env python3
"""
Wikidata enrichment for Argentine CONABIP libraries using fuzzy search.

Searches Wikidata for libraries in Argentina, then uses fuzzy matching to
verify results against CONABIP data.

GLAM Data Extraction Project
Schema: LinkML v0.2.1
Country: Argentina (AR)
Source: CONABIP (Comisión Nacional de Bibliotecas Populares)
"""

import json
import time
import requests
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List

from rapidfuzz import fuzz

SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Argentina-Wikidata-Enrichment/1.0"

# The query is independent of the institution being matched: it pulls every
# library/archive in Argentina once, and each CONABIP record is then fuzzy
# matched locally against the cached result.
_AR_LIBRARIES_QUERY = """
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website
                ?coords ?inception ?itemAltLabel ?cityLabel ?provinceLabel
WHERE {
  # Must be in Argentina
  ?item wdt:P17 wd:Q414 .
  # Must be library or archive type
  ?item wdt:P31 ?type .
  VALUES ?type {
    wd:Q7075      # Library
    wd:Q28564     # Public library
    wd:Q2668072   # National library
    wd:Q856234    # Community library
    wd:Q166118    # Archive
    wd:Q1622062   # Popular library (biblioteca popular)
  }
  OPTIONAL { ?item wdt:P214 ?viaf . }
  OPTIONAL { ?item wdt:P791 ?isil . }
  OPTIONAL { ?item wdt:P856 ?website . }
  OPTIONAL { ?item wdt:P625 ?coords . }
  OPTIONAL { ?item wdt:P571 ?inception . }
  OPTIONAL { ?item wdt:P131 ?city . }
  OPTIONAL { ?item wdt:P131/wdt:P131 ?province . }
  OPTIONAL {
    ?item skos:altLabel ?itemAltLabel .
    FILTER(LANG(?itemAltLabel) IN ("es", "en"))
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" . }
}
LIMIT 200
"""

# Module-level cache of SPARQL bindings so the (identical) query is sent to
# Wikidata only once per process instead of once per institution.
_bindings_cache: Optional[List[Dict[str, Any]]] = None


def _fetch_argentina_library_bindings(timeout: int) -> List[Dict[str, Any]]:
    """Fetch (and memoize) all Argentine library bindings from Wikidata.

    Sleeps 1 s before the HTTP request for endpoint rate limiting; subsequent
    calls return the cached list with no network traffic or delay.

    Raises:
        requests.RequestException / ValueError on HTTP or JSON failure
        (handled by the caller).
    """
    global _bindings_cache
    if _bindings_cache is None:
        time.sleep(1.0)  # Rate limiting — only paid on the single real fetch
        response = requests.get(
            SPARQL_ENDPOINT,
            params={'query': _AR_LIBRARIES_QUERY, 'format': 'json'},
            headers={'User-Agent': USER_AGENT},
            timeout=timeout,
        )
        response.raise_for_status()
        _bindings_cache = (
            response.json().get("results", {}).get("bindings", [])
        )
    return _bindings_cache


def _score_binding(binding: Dict[str, Any], name_lower: str,
                   city_lower: Optional[str], province_lower: Optional[str],
                   city: Optional[str], province: Optional[str],
                   verbose: bool) -> float:
    """Fuzzy-score one SPARQL binding against a CONABIP name/location.

    Combines three fuzzy strategies on the label, then boosts/penalizes by
    city agreement and boosts by province agreement. Returns a 0–100 score.
    """
    item_label = binding.get("itemLabel", {}).get("value", "").lower()
    wd_city = binding.get("cityLabel", {}).get("value", "").lower()
    wd_province = binding.get("provinceLabel", {}).get("value", "").lower()

    # Best of three fuzzy match strategies on the item label.
    label_score = fuzz.ratio(name_lower, item_label)
    partial_score = fuzz.partial_ratio(name_lower, item_label)
    token_score = fuzz.token_set_ratio(name_lower, item_label)
    score = max(label_score, partial_score, token_score)

    # City verification: boost a good match, penalize a clear mismatch.
    if city_lower and wd_city:
        city_match = fuzz.ratio(city_lower, wd_city)
        if city_match > 80:  # Cities match well
            if verbose:
                print(f" ✓ City match: {city} ≈ {wd_city} (boost +5)")
            score = min(100, score + 5)  # Boost for city match
        elif city_match < 60:
            # Only log when the mismatch is severe.
            if verbose and city_match < 40:
                print(f" ⚠️ City mismatch: {city} vs {wd_city}")
            score *= 0.7  # Penalize (reduced from 0.6)

    # Province verification (Argentina has 23 provinces + CABA).
    if province_lower and wd_province:
        prov_match = fuzz.partial_ratio(province_lower, wd_province)
        if prov_match > 80:
            if verbose:
                print(f" ✓ Province match: {province} ≈ {wd_province} (boost +3)")
            score = min(100, score + 3)

    return score


def _binding_to_result(binding: Dict[str, Any],
                       score: float) -> Optional[Dict[str, Any]]:
    """Convert the best-matching SPARQL binding to a flat result dict.

    Returns None when the item URI does not yield a Q-number.
    """
    item_uri = binding.get("item", {}).get("value", "")
    qid = item_uri.split("/")[-1] if item_uri else None
    if not qid or not qid.startswith("Q"):
        return None

    result: Dict[str, Any] = {
        "qid": qid,
        "name": binding.get("itemLabel", {}).get("value", ""),
        "description": binding.get("itemDescription", {}).get("value", ""),
        "match_score": score,
    }
    if "viaf" in binding:
        result["viaf"] = binding["viaf"]["value"]
    if "isil" in binding:
        result["isil"] = binding["isil"]["value"]
    if "website" in binding:
        result["website"] = binding["website"]["value"]
    if "inception" in binding:
        # Keep only the date part of the xsd:dateTime value.
        result["founded_date"] = binding["inception"]["value"].split("T")[0]
    if "coords" in binding:
        coords_str = binding["coords"]["value"]
        if coords_str.startswith("Point("):
            # WKT stores "Point(lon lat)" — note the lon-first order.
            lon, lat = coords_str[6:-1].split()
            result["latitude"] = float(lat)
            result["longitude"] = float(lon)
    return result


def search_wikidata_fuzzy(name: str, city: Optional[str] = None,
                          province: Optional[str] = None,
                          timeout: int = 60,
                          verbose: bool = False) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for Argentine libraries using broad criteria.
    Returns best fuzzy match from results with 85% threshold.
    Includes city and province verification for better accuracy.

    The underlying SPARQL result is fetched once and cached for all calls.
    Returns None on no match, low score, or any query error.
    """
    try:
        bindings = _fetch_argentina_library_bindings(timeout)
        if not bindings:
            return None

        name_lower = name.lower()
        city_lower = city.lower() if city else None
        province_lower = province.lower() if province else None

        # Normalize CABA (Ciudad Autónoma de Buenos Aires).
        if city_lower and "ciudad autónoma" in city_lower:
            city_lower = "buenos aires"  # Treat CABA as Buenos Aires for matching

        best_match: Optional[Dict[str, Any]] = None
        best_score: float = 0
        for binding in bindings:
            score = _score_binding(binding, name_lower, city_lower,
                                   province_lower, city, province, verbose)
            if score > best_score:
                best_score = score
                best_match = binding

        # Require minimum 85% match.
        if best_match is None or best_score < 85:
            return None
        return _binding_to_result(best_match, best_score)

    except Exception as e:
        # Best-effort enrichment: report and skip rather than abort the run.
        print(f" ❌ Error querying Wikidata: {e}")
        return None


def enrich_conabip_with_wikidata(input_file: Path, output_file: Path):
    """
    Enrich CONABIP libraries with Wikidata Q-numbers.

    Reads the CONABIP JSON at input_file, adds Wikidata (and derived
    VIAF/ISIL/website/founded_date) identifiers to each institution that
    lacks one, and writes the enriched JSON to output_file.
    """
    print("=" * 80)
    print("ARGENTINA CONABIP WIKIDATA ENRICHMENT")
    print("=" * 80)
    print(f"Input: {input_file}")
    print(f"Output: {output_file}")
    print()

    # Load CONABIP data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = data.get("institutions", [])
    total = len(institutions)
    print(f"📚 Loaded {total} institutions from CONABIP")
    print()

    # Statistics
    stats = {
        "total": total,
        "enriched": 0,
        "skipped_existing": 0,
        "no_match": 0,
        "errors": 0,
    }

    # Enrich each institution
    for idx, inst in enumerate(institutions, 1):
        name = inst.get("name", "Unknown")
        city = inst.get("city")
        province = inst.get("province")

        print(f"[{idx}/{total}] {name}")
        print(f" 📍 {city}, {province}")

        # Skip if already has Wikidata Q-number
        identifiers = inst.get("identifiers", [])
        has_wikidata = any(
            id_obj.get("identifier_scheme") == "Wikidata"
            for id_obj in identifiers
        )
        if has_wikidata:
            print(" ⏭️ Already has Wikidata")
            stats["skipped_existing"] += 1
            continue

        # Search Wikidata
        print(" 🔍 Searching...", end=" ", flush=True)
        wd_result = search_wikidata_fuzzy(name, city, province, verbose=False)

        if wd_result:
            qid = wd_result["qid"]
            match_score = wd_result["match_score"]
            wd_name = wd_result["name"]
            print(f"✅ {qid} ({match_score:.0f}%)")

            # Add Wikidata identifier
            if "identifiers" not in inst:
                inst["identifiers"] = []
            inst["identifiers"].append({
                "identifier_scheme": "Wikidata",
                "identifier_value": qid,
                "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
                "match_score": match_score,
                "enrichment_date": datetime.now(timezone.utc).isoformat(),
            })

            # Add additional identifiers from Wikidata if available
            extras = []
            if "viaf" in wd_result:
                inst["identifiers"].append({
                    "identifier_scheme": "VIAF",
                    "identifier_value": wd_result["viaf"],
                    "identifier_url": f"https://viaf.org/viaf/{wd_result['viaf']}",
                    "source": "Wikidata",
                })
                extras.append(f"VIAF:{wd_result['viaf']}")
            if "isil" in wd_result:
                inst["identifiers"].append({
                    "identifier_scheme": "ISIL",
                    "identifier_value": wd_result["isil"],
                    "source": "Wikidata",
                })
                extras.append(f"ISIL:{wd_result['isil']}")
            if "website" in wd_result and not inst.get("website"):
                inst["website"] = wd_result["website"]
                extras.append("Website")
            if "founded_date" in wd_result:
                inst["founded_date"] = wd_result["founded_date"]
                extras.append(f"Founded:{wd_result['founded_date']}")
            if extras:
                print(f" + {', '.join(extras)}")
            stats["enriched"] += 1
        else:
            print("⚠️ No match (< 85%)")
            stats["no_match"] += 1

    # Update metadata — create the object if the input file lacked it.
    metadata = data.setdefault("metadata", {})
    metadata["wikidata_enrichment_date"] = datetime.now(timezone.utc).isoformat()
    metadata["wikidata_enrichment_stats"] = stats

    # Save enriched data
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Guard against division by zero when the input had no institutions.
    enriched_pct = (stats['enriched'] / total * 100) if total else 0.0
    print("=" * 80)
    print("ENRICHMENT COMPLETE")
    print("=" * 80)
    print(f"✅ Enriched: {stats['enriched']}/{total} ({enriched_pct:.1f}%)")
    print(f"⏭️ Already had Wikidata: {stats['skipped_existing']}")
    print(f"⚠️ No match found: {stats['no_match']}")
    print(f"❌ Errors: {stats['errors']}")
    print()
    print(f"📁 Output saved to: {output_file}")
    print()


if __name__ == "__main__":
    # Paths
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "isil" / "AR" / "conabip_libraries_enhanced_FULL.json"
    output_file = base_dir / "data" / "isil" / "AR" / "conabip_libraries_wikidata_enriched.json"

    # Run enrichment
    enrich_conabip_with_wikidata(input_file, output_file)