glam/scripts/enrich_argentina_wikidata.py
2025-11-19 23:25:22 +01:00

314 lines
11 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Wikidata enrichment for Argentine CONABIP libraries using fuzzy search.
Searches Wikidata for libraries in Argentina, then uses fuzzy matching
to verify results against CONABIP data.
GLAM Data Extraction Project
Schema: LinkML v0.2.1
Country: Argentina (AR)
Source: CONABIP (Comisión Nacional de Bibliotecas Populares)
"""
import json
import time
import requests
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List
from rapidfuzz import fuzz
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Argentina-Wikidata-Enrichment/1.0"
# Memoized SPARQL result: raw bindings for every library/archive item in
# Argentina. The query is identical on every call, so fetch it at most once
# per process instead of once per institution (saves one endpoint round-trip
# plus the 1 s rate-limit sleep on every subsequent lookup).
_AR_LIBRARY_BINDINGS: Optional[List[Dict[str, Any]]] = None


def _fetch_ar_library_bindings(timeout: int) -> List[Dict[str, Any]]:
    """
    Fetch (and cache for the process lifetime) the SPARQL bindings for all
    library/archive items located in Argentina.

    Network/HTTP errors propagate to the caller; the rate-limit sleep only
    happens when a real network fetch is performed.
    """
    global _AR_LIBRARY_BINDINGS
    if _AR_LIBRARY_BINDINGS is not None:
        return _AR_LIBRARY_BINDINGS
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website ?coords ?inception ?itemAltLabel ?cityLabel ?provinceLabel
    WHERE {
      # Must be in Argentina
      ?item wdt:P17 wd:Q414 .
      # Must be library or archive type
      ?item wdt:P31 ?type .
      VALUES ?type {
        wd:Q7075      # Library
        wd:Q28564     # Public library
        wd:Q2668072   # National library
        wd:Q856234    # Community library
        wd:Q166118    # Archive
        wd:Q1622062   # Popular library (biblioteca popular)
      }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item wdt:P131 ?city . }
      OPTIONAL { ?item wdt:P131/wdt:P131 ?province . }
      OPTIONAL { ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("es", "en")) }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" . }
    }
    LIMIT 200
    """
    time.sleep(1.0)  # Rate limiting toward the public SPARQL endpoint
    response = requests.get(
        SPARQL_ENDPOINT,
        params={'query': query, 'format': 'json'},
        headers={'User-Agent': USER_AGENT},
        timeout=timeout,
    )
    response.raise_for_status()
    _AR_LIBRARY_BINDINGS = response.json().get("results", {}).get("bindings", [])
    return _AR_LIBRARY_BINDINGS


def search_wikidata_fuzzy(name: str, city: Optional[str] = None, province: Optional[str] = None, timeout: int = 60, verbose: bool = False) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for an Argentine library matching *name* via fuzzy matching.

    All candidate items are fetched once (cached across calls) and scored with
    three rapidfuzz strategies (ratio / partial_ratio / token_set_ratio); the
    best strategy wins. A matching city boosts the score by +5 and a matching
    province by +3 (capped at 100); a clearly mismatched city multiplies the
    score by 0.7. The best candidate must reach a score of 85 to be accepted.

    Args:
        name: Library name from CONABIP.
        city: Optional city for verification; "Ciudad Autónoma ..." is
            normalized to "buenos aires" before comparison.
        province: Optional province for verification.
        timeout: HTTP timeout (seconds) for the SPARQL request.
        verbose: Print per-candidate city/province match diagnostics.

    Returns:
        Dict with "qid", "name", "description", "match_score" and, when
        present on the item, "viaf", "isil", "website", "founded_date",
        "latitude"/"longitude" — or None when nothing reaches the threshold
        or the query fails (errors are printed, not raised).
    """
    try:
        bindings = _fetch_ar_library_bindings(timeout)
        if not bindings:
            return None

        # Fuzzy match the requested name against every candidate.
        best_match = None
        best_score = 0
        name_lower = name.lower()
        city_lower = city.lower() if city else None
        province_lower = province.lower() if province else None
        # Normalize CABA (Ciudad Autónoma de Buenos Aires)
        if city_lower and "ciudad autónoma" in city_lower:
            city_lower = "buenos aires"  # Treat CABA as Buenos Aires for matching

        for binding in bindings:
            item_label = binding.get("itemLabel", {}).get("value", "").lower()
            item_desc = binding.get("itemDescription", {}).get("value", "").lower()
            wd_city = binding.get("cityLabel", {}).get("value", "").lower()
            wd_province = binding.get("provinceLabel", {}).get("value", "").lower()

            # Best of three fuzzy strategies: exact ratio, substring-oriented
            # partial ratio, and word-order-insensitive token-set ratio.
            label_score = fuzz.ratio(name_lower, item_label)
            partial_score = fuzz.partial_ratio(name_lower, item_label)
            token_score = fuzz.token_set_ratio(name_lower, item_label)
            score = max(label_score, partial_score, token_score)

            # City verification: boost on agreement, penalize clear mismatch.
            if city_lower and wd_city:
                city_match = fuzz.ratio(city_lower, wd_city)
                if city_match > 80:  # Cities match well
                    if verbose:
                        print(f" ✓ City match: {city}{wd_city} (boost +5)")
                    score = min(100, score + 5)
                elif city_match < 60:  # Cities don't match - only log if very low score
                    if verbose and city_match < 40:
                        print(f" ⚠️ City mismatch: {city} vs {wd_city}")
                    score *= 0.7

            # Province verification (Argentina has 23 provinces + CABA).
            if province_lower and wd_province:
                prov_match = fuzz.partial_ratio(province_lower, wd_province)
                if prov_match > 80:
                    if verbose:
                        print(f" ✓ Province match: {province}{wd_province} (boost +3)")
                    score = min(100, score + 3)

            if score > best_score:
                best_score = score
                best_match = binding

        # Require minimum 85% match.
        if best_score < 85:
            return None

        # Extract the Q-number from the item URI (".../entity/Q12345").
        item_uri = best_match.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None
        if not qid or not qid.startswith("Q"):
            return None

        result = {
            "qid": qid,
            "name": best_match.get("itemLabel", {}).get("value", ""),
            "description": best_match.get("itemDescription", {}).get("value", ""),
            "match_score": best_score
        }
        if "viaf" in best_match:
            result["viaf"] = best_match["viaf"]["value"]
        if "isil" in best_match:
            result["isil"] = best_match["isil"]["value"]
        if "website" in best_match:
            result["website"] = best_match["website"]["value"]
        if "inception" in best_match:
            # Keep only the date part of the xsd:dateTime literal.
            result["founded_date"] = best_match["inception"]["value"].split("T")[0]
        if "coords" in best_match:
            # WKT literal "Point(lon lat)" — note longitude comes first.
            coords_str = best_match["coords"]["value"]
            if coords_str.startswith("Point("):
                lon, lat = coords_str[6:-1].split()
                result["latitude"] = float(lat)
                result["longitude"] = float(lon)
        return result
    except Exception as e:
        # Best-effort enrichment: log and report "no match" rather than abort
        # the whole batch on a transient network/endpoint failure.
        print(f" ❌ Error querying Wikidata: {e}")
        return None
def enrich_conabip_with_wikidata(input_file: Path, output_file: Path) -> None:
    """
    Enrich CONABIP libraries with Wikidata Q-numbers.

    Reads the CONABIP JSON export from *input_file*, fuzzy-searches Wikidata
    for each institution that does not already carry a Wikidata identifier,
    appends matched identifiers (Wikidata, plus VIAF/ISIL/website/founded
    date when available), and writes the enriched document with run
    statistics to *output_file* (parent directories are created as needed).

    Args:
        input_file: Path to the CONABIP JSON file ({"metadata": ..., "institutions": [...]}).
        output_file: Destination path for the enriched JSON.
    """
    print("=" * 80)
    print("ARGENTINA CONABIP WIKIDATA ENRICHMENT")
    print("=" * 80)
    print(f"Input: {input_file}")
    print(f"Output: {output_file}")
    print()

    # Load CONABIP data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    institutions = data.get("institutions", [])
    total = len(institutions)
    print(f"📚 Loaded {total} institutions from CONABIP")
    print()

    # Run statistics, stored into the output metadata at the end.
    # NOTE(review): "errors" is never incremented here because the search
    # helper catches its own exceptions and returns None (counted as
    # "no_match") — kept for schema compatibility.
    stats = {
        "total": total,
        "enriched": 0,
        "skipped_existing": 0,
        "no_match": 0,
        "errors": 0
    }

    # Enrich each institution
    for idx, inst in enumerate(institutions, 1):
        name = inst.get("name", "Unknown")
        city = inst.get("city")
        province = inst.get("province")
        print(f"[{idx}/{total}] {name}")
        print(f" 📍 {city}, {province}")

        # Skip if already has Wikidata Q-number
        identifiers = inst.get("identifiers", [])
        has_wikidata = any(
            id_obj.get("identifier_scheme") == "Wikidata"
            for id_obj in identifiers
        )
        if has_wikidata:
            print(" ⏭️ Already has Wikidata")
            stats["skipped_existing"] += 1
            continue

        # Search Wikidata
        print(" 🔍 Searching...", end=" ", flush=True)
        wd_result = search_wikidata_fuzzy(name, city, province, verbose=False)
        if wd_result:
            qid = wd_result["qid"]
            match_score = wd_result["match_score"]
            wd_name = wd_result["name"]
            print(f"{qid} ({match_score:.0f}%)")

            # Add Wikidata identifier
            if "identifiers" not in inst:
                inst["identifiers"] = []
            inst["identifiers"].append({
                "identifier_scheme": "Wikidata",
                "identifier_value": qid,
                "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
                "match_score": match_score,
                "enrichment_date": datetime.now(timezone.utc).isoformat()
            })

            # Add additional identifiers from Wikidata if available
            extras = []
            if "viaf" in wd_result:
                inst["identifiers"].append({
                    "identifier_scheme": "VIAF",
                    "identifier_value": wd_result["viaf"],
                    "identifier_url": f"https://viaf.org/viaf/{wd_result['viaf']}",
                    "source": "Wikidata"
                })
                extras.append(f"VIAF:{wd_result['viaf']}")
            if "isil" in wd_result:
                inst["identifiers"].append({
                    "identifier_scheme": "ISIL",
                    "identifier_value": wd_result["isil"],
                    "source": "Wikidata"
                })
                extras.append(f"ISIL:{wd_result['isil']}")
            # Only fill the website if CONABIP did not already provide one.
            if "website" in wd_result and not inst.get("website"):
                inst["website"] = wd_result["website"]
                extras.append("Website")
            if "founded_date" in wd_result:
                inst["founded_date"] = wd_result["founded_date"]
                extras.append(f"Founded:{wd_result['founded_date']}")
            if extras:
                print(f" + {', '.join(extras)}")
            stats["enriched"] += 1
        else:
            print("⚠️ No match (< 85%)")
            stats["no_match"] += 1

    # Update metadata
    data["metadata"]["wikidata_enrichment_date"] = datetime.now(timezone.utc).isoformat()
    data["metadata"]["wikidata_enrichment_stats"] = stats

    # Save enriched data
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print("=" * 80)
    print("ENRICHMENT COMPLETE")
    print("=" * 80)
    # Guard against ZeroDivisionError on an empty institutions list.
    enriched_pct = (stats['enriched'] / total * 100) if total else 0.0
    print(f"✅ Enriched: {stats['enriched']}/{total} ({enriched_pct:.1f}%)")
    print(f"⏭️ Already had Wikidata: {stats['skipped_existing']}")
    print(f"⚠️ No match found: {stats['no_match']}")
    print(f"❌ Errors: {stats['errors']}")
    print()
    print(f"📁 Output saved to: {output_file}")
    print()
if __name__ == "__main__":
    # Resolve dataset locations relative to the project root (one level
    # above the scripts directory), then run the enrichment.
    project_root = Path(__file__).parent.parent
    ar_dir = project_root / "data" / "isil" / "AR"
    enrich_conabip_with_wikidata(
        ar_dir / "conabip_libraries_enhanced_FULL.json",
        ar_dir / "conabip_libraries_wikidata_enriched.json",
    )