glam/scripts/enrich_tunisia_wikidata.py
2025-12-09 09:16:19 +01:00

280 lines
9.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Wikidata enrichment for Tunisian heritage institutions.
Searches Wikidata by institution name and location for French/Arabic
named institutions in Tunisia.
GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""
import yaml
import time
import requests
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any
from rapidfuzz import fuzz
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Wikidata-Enrichment/1.0"
def search_wikidata_by_name(name: str, city: Optional[str] = None, timeout: int = 60) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for a Tunisian heritage institution by exact label.

    Runs a SPARQL query against the public WDQS endpoint matching the
    French, Arabic, or English rdfs:label, restricted to items located in
    Tunisia (P17 = Q948) whose direct type (P31, no subclass closure, to
    keep the query fast) is one of a fixed list of GLAM-related classes.

    Args:
        name: Institution name to match exactly against item labels.
        city: Currently unused; accepted for interface stability.
            TODO: use it to disambiguate when several items match.
        timeout: HTTP timeout in seconds for the SPARQL request.

    Returns:
        Dict with 'qid', 'name', 'description' and, when present on the
        item, 'viaf', 'isil', 'website', 'founded_date',
        'latitude'/'longitude'; or None when nothing was found or the
        request failed.
    """
    # Escape backslashes FIRST, then quotes, so arbitrary names form a valid
    # SPARQL string literal. (The previous version escaped quotes only, so a
    # name containing a backslash produced a malformed query.)
    name_escaped = (
        name.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("'", "\\'")
    )
    # Simplified query without wdt:P279* (transitive subclass) to avoid
    # endpoint timeouts on the public query service.
    query = f"""
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website ?coords ?inception
WHERE {{
# Search by label in French, Arabic, or English
{{
?item rdfs:label "{name_escaped}"@fr .
}} UNION {{
?item rdfs:label "{name_escaped}"@ar .
}} UNION {{
?item rdfs:label "{name_escaped}"@en .
}}
# Must be in Tunisia
?item wdt:P17 wd:Q948 .
# Must be heritage institution type (direct instance only, no subclass search)
?item wdt:P31 ?type .
VALUES ?type {{
wd:Q33506 # Museum
wd:Q7075 # Library
wd:Q166118 # Archive
wd:Q1030034 # Archaeological museum
wd:Q473972 # Art museum
wd:Q570116 # Public library
wd:Q22687 # Synagogue
wd:Q7840289 # Art gallery
wd:Q2668072 # National library
wd:Q7210356 # Organization
}}
OPTIONAL {{ ?item wdt:P214 ?viaf . }}
OPTIONAL {{ ?item wdt:P791 ?isil . }}
OPTIONAL {{ ?item wdt:P856 ?website . }}
OPTIONAL {{ ?item wdt:P625 ?coords . }}
OPTIONAL {{ ?item wdt:P571 ?inception . }}
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,ar,en" . }}
}}
LIMIT 5
"""
    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query,
        'format': 'json'
    }
    try:
        # Rate limiting: WDQS throttles aggressive clients, so pause before
        # every request.
        time.sleep(1.5)
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        results = response.json()
        bindings = results.get("results", {}).get("bindings", [])
        if not bindings:
            return None
        # LIMIT 5 may return several candidates; take the first one.
        binding = bindings[0]
        item_uri = binding.get("item", {}).get("value", "")
        # Entity URIs look like http://www.wikidata.org/entity/Q12345 —
        # the QID is the last path component.
        qid = item_uri.split("/")[-1] if item_uri else None
        if not qid or not qid.startswith("Q"):
            return None
        result = {
            "qid": qid,
            "name": binding.get("itemLabel", {}).get("value", ""),
            "description": binding.get("itemDescription", {}).get("value", "")
        }
        # Optional properties are only present in the binding when the item
        # actually has them.
        if "viaf" in binding:
            result["viaf"] = binding["viaf"]["value"]
        if "isil" in binding:
            result["isil"] = binding["isil"]["value"]
        if "website" in binding:
            result["website"] = binding["website"]["value"]
        if "inception" in binding:
            # Inception is an xsd:dateTime; keep only the date part.
            result["founded_date"] = binding["inception"]["value"].split("T")[0]
        if "coords" in binding:
            # P625 arrives as a WKT literal: "Point(lon lat)".
            coords_str = binding["coords"]["value"]
            if coords_str.startswith("Point("):
                lon, lat = coords_str[6:-1].split()
                result["latitude"] = float(lat)
                result["longitude"] = float(lon)
        return result
    except requests.exceptions.Timeout:
        print(f" ⏱️ Query timeout (>{timeout}s)")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ❌ Network error: {e}")
        return None
    except Exception as e:
        # Best-effort catch-all for malformed JSON / unexpected payloads:
        # a single bad response must not abort the whole enrichment run.
        print(f" ❌ Error: {e}")
        return None
def add_wikidata_to_institution(institution: dict, wikidata_result: dict):
    """
    Merge a Wikidata search result into an institution record in place.

    Appends Wikidata / VIAF / ISIL identifiers (skipping any scheme already
    present on the record) and appends an enrichment note to the provenance
    record when one exists.

    Args:
        institution: Mutable institution record (LinkML-shaped dict).
        wikidata_result: Result dict from search_wikidata_by_name();
            must contain 'qid', may contain 'viaf' and 'isil'.
    """
    # Ensure the identifiers list exists before appending.
    if 'identifiers' not in institution:
        institution['identifiers'] = []
    # Snapshot of schemes present BEFORE this enrichment pass.
    existing_schemes = {i.get('identifier_scheme') for i in institution['identifiers']}
    if 'Wikidata' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': wikidata_result['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{wikidata_result['qid']}"
        })
    # Add VIAF if present
    if wikidata_result.get('viaf') and 'VIAF' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'VIAF',
            'identifier_value': wikidata_result['viaf'],
            'identifier_url': f"https://viaf.org/viaf/{wikidata_result['viaf']}"
        })
    # Add ISIL if present
    if wikidata_result.get('isil') and 'ISIL' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'ISIL',
            'identifier_value': wikidata_result['isil']
            # ISIL codes don't have a universal resolver URL, so no
            # 'identifier_url' is recorded for this scheme.
        })
    # Update provenance
    if 'provenance' in institution:
        notes = institution['provenance'].get('notes', '')
        # QIDs already start with "Q" — do not prepend another one
        # (the previous f-string produced values like "QQ123").
        enrich_note = f" Wikidata enriched on {datetime.now(timezone.utc).isoformat()} ({wikidata_result['qid']})."
        institution['provenance']['notes'] = notes + enrich_note
def save_checkpoint(data: dict, input_file: Path, stats: dict):
    """
    Write the (partially) enriched dataset back to input_file as YAML.

    Refreshes the generation timestamp and records the enrichment step in
    _metadata.enhancements exactly once.

    Args:
        data: Full dataset dict; must contain a '_metadata' mapping.
        input_file: Destination YAML path, overwritten in place.
        stats: Running statistics; only 'enriched' is used, for the log line.
    """
    print(f"\n💾 Saving checkpoint... (enriched: {stats['enriched']})")
    data['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()
    # setdefault guards against a missing 'enhancements' key, which the
    # previous .get(...) check followed by a direct .append(...) would
    # crash on with KeyError.
    enhancements = data['_metadata'].setdefault('enhancements', [])
    if 'Wikidata enrichment' not in enhancements:
        enhancements.append('Wikidata enrichment')
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
def main():
    """Enrich the Tunisian institutions YAML file with Wikidata identifiers.

    Loads the dataset, searches Wikidata for every institution lacking a
    Wikidata identifier, merges accepted matches (fuzzy label score > 85)
    back into the records, checkpoints the file every 10 institutions, and
    prints a summary.
    """
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')
    print("Tunisia Wikidata Enrichment")
    print("=" * 60)
    print("Features:")
    print(" - Simplified SPARQL queries (no transitive subclass)")
    print(" - Multilingual search (French/Arabic/English)")
    print(" - Checkpoint saving every 10 institutions")
    print(" - Timeout handling (60s per query)")
    print("=" * 60)
    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    institutions = data['institutions']
    print(f"Total institutions: {len(institutions)}")
    # Statistics
    # NOTE: 'timeouts' is currently always 0 — search_wikidata_by_name()
    # returns plain None on timeout, so timeouts cannot be distinguished
    # from other failures here. (A previous dead check tested
    # `"timeout" in str(result)` with result always None; removed.)
    stats = {
        'total': len(institutions),
        'already_enriched': 0,
        'searched': 0,
        'found': 0,
        'enriched': 0,
        'failed': 0,
        'timeouts': 0
    }
    # Process each institution
    checkpoint_interval = 10
    for i, inst in enumerate(institutions, 1):
        name = inst.get('name', '')
        city = inst.get('locations', [{}])[0].get('city', '') if inst.get('locations') else ''
        # Skip records that already carry a Wikidata identifier.
        identifiers = inst.get('identifiers', [])
        existing_schemes = {ident.get('identifier_scheme') for ident in identifiers}
        if 'Wikidata' in existing_schemes:
            stats['already_enriched'] += 1
            qid = next((ident['identifier_value'] for ident in identifiers
                        if ident.get('identifier_scheme') == 'Wikidata'), 'unknown')
            print(f"[{i}/{len(institutions)}] ✓ {name} (already has {qid})")
            continue
        # Search Wikidata
        print(f"[{i}/{len(institutions)}] Searching: {name} ({city})")
        stats['searched'] += 1
        result = search_wikidata_by_name(name, city, timeout=60)
        if result:
            stats['found'] += 1
            print(f" ✅ Found: {result['qid']} - {result.get('name', '')}")
            # Guard against homonyms: only accept close label matches.
            match_score = fuzz.ratio(name.lower(), result['name'].lower())
            if match_score > 85:
                add_wikidata_to_institution(inst, result)
                stats['enriched'] += 1
                print(f" ✅ Enriched (match score: {match_score})")
            else:
                stats['failed'] += 1
                print(f" ⚠️ Low match score ({match_score}), skipping")
        else:
            stats['failed'] += 1
            print(f" ❌ Not found")
        # Periodic checkpoint; the tail of the list is covered by the final
        # save below, so the last iteration is not double-saved any more.
        if i % checkpoint_interval == 0:
            save_checkpoint(data, input_file, stats)
    # Final save
    save_checkpoint(data, input_file, stats)
    # Print statistics
    print("\n" + "=" * 60)
    print("WIKIDATA ENRICHMENT STATISTICS")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Searched: {stats['searched']}")
    print(f"Found: {stats['found']}")
    print(f"Enriched: {stats['enriched']}")
    print(f"Failed: {stats['failed']}")
    print(f"Timeouts: {stats['timeouts']}")
    covered = stats['already_enriched'] + stats['enriched']
    # Avoid ZeroDivisionError when the dataset is empty.
    pct = 100 * covered / stats['total'] if stats['total'] else 0.0
    print(f"\nFinal Wikidata coverage: {covered}/{stats['total']} ({pct:.1f}%)")
    print("\n✅ Wikidata enrichment complete!")
if __name__ == '__main__':
    main()