#!/usr/bin/env python3
"""
Wikidata enrichment for Argentine CONABIP libraries using fuzzy search.

Searches Wikidata for libraries in Argentina, then uses fuzzy matching
to verify results against CONABIP data.

GLAM Data Extraction Project
Schema: LinkML v0.2.1
Country: Argentina (AR)
Source: CONABIP (Comisión Nacional de Bibliotecas Populares)
"""
import json
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
from rapidfuzz import fuzz

# Wikidata Query Service endpoint (SPARQL over HTTP GET).
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Identifying User-Agent string, as requested by the WDQS usage policy.
USER_AGENT = "GLAM-Argentina-Wikidata-Enrichment/1.0"
def _fetch_argentina_library_bindings(timeout: int) -> List[Dict[str, Any]]:
    """Fetch all Argentine library/archive items from Wikidata via SPARQL.

    The query is constant, so the bindings are fetched once and memoized on
    the function object itself; subsequent calls reuse the cached result and
    skip both the network round-trip and the rate-limit sleep. (The original
    code re-ran the query — and slept 1 s — on every single lookup, which its
    own comment flagged as an optimization opportunity.)

    Raises requests/JSON exceptions on failure; the caller handles them.
    """
    cached = getattr(_fetch_argentina_library_bindings, "_cache", None)
    if cached is not None:
        return cached

    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website ?coords ?inception ?itemAltLabel ?cityLabel ?provinceLabel
    WHERE {
      # Must be in Argentina
      ?item wdt:P17 wd:Q414 .

      # Must be library or archive type
      ?item wdt:P31 ?type .
      VALUES ?type {
        wd:Q7075      # Library
        wd:Q28564     # Public library
        wd:Q2668072   # National library
        wd:Q856234    # Community library
        wd:Q166118    # Archive
        wd:Q1622062   # Popular library (biblioteca popular)
      }

      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item wdt:P131 ?city . }
      OPTIONAL { ?item wdt:P131/wdt:P131 ?province . }
      OPTIONAL { ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("es", "en")) }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" . }
    }
    LIMIT 200
    """

    time.sleep(1.0)  # Rate limiting: be polite to the public WDQS endpoint
    response = requests.get(
        SPARQL_ENDPOINT,
        params={'query': query, 'format': 'json'},
        headers={'User-Agent': USER_AGENT},
        timeout=timeout,
    )
    response.raise_for_status()

    bindings = response.json().get("results", {}).get("bindings", [])
    _fetch_argentina_library_bindings._cache = bindings
    return bindings


def _score_candidate(
    binding: Dict[str, Any],
    name_lower: str,
    city: Optional[str],
    city_lower: Optional[str],
    province: Optional[str],
    province_lower: Optional[str],
    verbose: bool,
) -> float:
    """Fuzzy-score one SPARQL result row against a CONABIP record (0-100)."""
    item_label = binding.get("itemLabel", {}).get("value", "").lower()
    wd_city = binding.get("cityLabel", {}).get("value", "").lower()
    wd_province = binding.get("provinceLabel", {}).get("value", "").lower()

    # Best of three fuzzy strategies: full ratio, substring, token-set.
    score: float = max(
        fuzz.ratio(name_lower, item_label),
        fuzz.partial_ratio(name_lower, item_label),
        fuzz.token_set_ratio(name_lower, item_label),
    )

    # City verification: if both have cities and they match, boost score.
    if city_lower and wd_city:
        city_match = fuzz.ratio(city_lower, wd_city)
        if city_match > 80:  # Cities match well
            if verbose:
                print(f" ✓ City match: {city} ≈ {wd_city} (boost +5)")
            score = min(100, score + 5)  # Boost for city match
        elif city_match < 60:  # Cities don't match - only log if very low score
            if verbose and city_match < 40:
                print(f" ⚠️ City mismatch: {city} vs {wd_city}")
            score *= 0.7  # Penalize (reduced from 0.6)

    # Province verification (Argentina has 23 provinces + CABA).
    if province_lower and wd_province:
        prov_match = fuzz.partial_ratio(province_lower, wd_province)
        if prov_match > 80:
            if verbose:
                print(f" ✓ Province match: {province} ≈ {wd_province} (boost +3)")
            score = min(100, score + 3)

    return score


def _binding_to_result(binding: Dict[str, Any], score: float) -> Optional[Dict[str, Any]]:
    """Convert the winning SPARQL row into the enrichment result dict.

    Returns None when the item URI does not end in a valid Q-number.
    """
    item_uri = binding.get("item", {}).get("value", "")
    qid = item_uri.split("/")[-1] if item_uri else None
    if not qid or not qid.startswith("Q"):
        return None

    result: Dict[str, Any] = {
        "qid": qid,
        "name": binding.get("itemLabel", {}).get("value", ""),
        "description": binding.get("itemDescription", {}).get("value", ""),
        "match_score": score,
    }

    if "viaf" in binding:
        result["viaf"] = binding["viaf"]["value"]
    if "isil" in binding:
        result["isil"] = binding["isil"]["value"]
    if "website" in binding:
        result["website"] = binding["website"]["value"]
    if "inception" in binding:
        # Wikidata returns a full xsd:dateTime; keep only the date part.
        result["founded_date"] = binding["inception"]["value"].split("T")[0]
    if "coords" in binding:
        coords_str = binding["coords"]["value"]
        if coords_str.startswith("Point("):
            # WKT literal is "Point(lon lat)" — note longitude comes first.
            lon, lat = coords_str[6:-1].split()
            result["latitude"] = float(lat)
            result["longitude"] = float(lon)

    return result


def search_wikidata_fuzzy(name: str, city: Optional[str] = None, province: Optional[str] = None, timeout: int = 60, verbose: bool = False) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for Argentine libraries using broad criteria.

    Returns best fuzzy match from results with 85% threshold, or None when
    no candidate clears it (or on any query error, which is printed).
    Includes city and province verification for better accuracy.

    The underlying SPARQL result is cached across calls, so only the first
    invocation hits the network.
    """
    try:
        bindings = _fetch_argentina_library_bindings(timeout)
        if not bindings:
            return None

        name_lower = name.lower()
        city_lower = city.lower() if city else None
        province_lower = province.lower() if province else None

        # Normalize CABA (Ciudad Autónoma de Buenos Aires)
        if city_lower and "ciudad autónoma" in city_lower:
            city_lower = "buenos aires"  # Treat CABA as Buenos Aires for matching

        best_match: Optional[Dict[str, Any]] = None
        best_score: float = 0
        for binding in bindings:
            score = _score_candidate(
                binding, name_lower, city, city_lower, province, province_lower, verbose
            )
            if score > best_score:
                best_score = score
                best_match = binding

        # Require minimum 85% match
        if best_score < 85:
            return None

        return _binding_to_result(best_match, best_score)

    except Exception as e:
        # Best-effort enrichment: report the failure and treat it as no match.
        print(f" ❌ Error querying Wikidata: {e}")
        return None
def enrich_conabip_with_wikidata(input_file: Path, output_file: Path) -> None:
    """
    Enrich CONABIP libraries with Wikidata Q-numbers.

    Reads the CONABIP JSON export from *input_file*, looks up each
    institution on Wikidata (skipping ones that already carry a Wikidata
    identifier), and writes the enriched dataset plus run statistics under
    ``metadata`` to *output_file*. Progress is printed to stdout.
    """
    print("=" * 80)
    print("ARGENTINA CONABIP WIKIDATA ENRICHMENT")
    print("=" * 80)
    print(f"Input: {input_file}")
    print(f"Output: {output_file}")
    print()

    # Load CONABIP data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = data.get("institutions", [])
    total = len(institutions)

    print(f"📚 Loaded {total} institutions from CONABIP")
    print()

    # Run statistics. NOTE: "errors" can never increment here because
    # search_wikidata_fuzzy() swallows its own exceptions and returns None,
    # which is counted under "no_match"; kept for schema compatibility.
    stats = {
        "total": total,
        "enriched": 0,
        "skipped_existing": 0,
        "no_match": 0,
        "errors": 0
    }

    # Enrich each institution
    for idx, inst in enumerate(institutions, 1):
        name = inst.get("name", "Unknown")
        city = inst.get("city")
        province = inst.get("province")

        print(f"[{idx}/{total}] {name}")
        print(f" 📍 {city}, {province}")

        # Skip if already has Wikidata Q-number
        identifiers = inst.get("identifiers", [])
        has_wikidata = any(
            id_obj.get("identifier_scheme") == "Wikidata"
            for id_obj in identifiers
        )
        if has_wikidata:
            print(" ⏭️ Already has Wikidata")
            stats["skipped_existing"] += 1
            continue

        # Search Wikidata
        print(" 🔍 Searching...", end=" ", flush=True)
        wd_result = search_wikidata_fuzzy(name, city, province, verbose=False)

        if wd_result:
            qid = wd_result["qid"]
            match_score = wd_result["match_score"]

            print(f"✅ {qid} ({match_score:.0f}%)")

            # Add Wikidata identifier
            if "identifiers" not in inst:
                inst["identifiers"] = []
            inst["identifiers"].append({
                "identifier_scheme": "Wikidata",
                "identifier_value": qid,
                "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
                "match_score": match_score,
                "enrichment_date": datetime.now(timezone.utc).isoformat()
            })

            # Add additional identifiers from Wikidata if available
            extras = []
            if "viaf" in wd_result:
                inst["identifiers"].append({
                    "identifier_scheme": "VIAF",
                    "identifier_value": wd_result["viaf"],
                    "identifier_url": f"https://viaf.org/viaf/{wd_result['viaf']}",
                    "source": "Wikidata"
                })
                extras.append(f"VIAF:{wd_result['viaf']}")

            if "isil" in wd_result:
                inst["identifiers"].append({
                    "identifier_scheme": "ISIL",
                    "identifier_value": wd_result["isil"],
                    "source": "Wikidata"
                })
                extras.append(f"ISIL:{wd_result['isil']}")

            # Only fill in the website if CONABIP didn't already provide one.
            if "website" in wd_result and not inst.get("website"):
                inst["website"] = wd_result["website"]
                extras.append("Website")

            if "founded_date" in wd_result:
                inst["founded_date"] = wd_result["founded_date"]
                extras.append(f"Founded:{wd_result['founded_date']}")

            if extras:
                print(f" + {', '.join(extras)}")

            stats["enriched"] += 1
        else:
            print("⚠️ No match (< 85%)")
            stats["no_match"] += 1

    # Update metadata
    data["metadata"]["wikidata_enrichment_date"] = datetime.now(timezone.utc).isoformat()
    data["metadata"]["wikidata_enrichment_stats"] = stats

    # Save enriched data
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print("=" * 80)
    print("ENRICHMENT COMPLETE")
    print("=" * 80)
    # Guard against ZeroDivisionError when the input has no institutions.
    enriched_pct = stats['enriched'] / total * 100 if total else 0.0
    print(f"✅ Enriched: {stats['enriched']}/{total} ({enriched_pct:.1f}%)")
    print(f"⏭️ Already had Wikidata: {stats['skipped_existing']}")
    print(f"⚠️ No match found: {stats['no_match']}")
    print(f"❌ Errors: {stats['errors']}")
    print()
    print(f"📁 Output saved to: {output_file}")
    print()
if __name__ == "__main__":
    # CONABIP data lives under data/isil/AR relative to the repository root
    # (the parent of this script's directory).
    ar_dir = Path(__file__).parent.parent / "data" / "isil" / "AR"

    # Run enrichment: read the full CONABIP export, write the enriched copy.
    enrich_conabip_with_wikidata(
        ar_dir / "conabip_libraries_enhanced_FULL.json",
        ar_dir / "conabip_libraries_wikidata_enriched.json",
    )