#!/usr/bin/env python3
|
|
"""
|
|
Wikidata enrichment for Tunisian heritage institutions.
|
|
|
|
Searches Wikidata by institution name and location for French/Arabic
|
|
named institutions in Tunisia.
|
|
|
|
GLAM Data Extraction Project
|
|
Schema: LinkML v0.2.1
|
|
"""
|
|
|
|
import yaml
|
|
import time
|
|
import requests
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any
|
|
from rapidfuzz import fuzz
|
|
|
|
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
|
|
USER_AGENT = "GLAM-Tunisia-Wikidata-Enrichment/1.0"
|
|
|
|
def _build_sparql_query(name: str) -> str:
    """Return a SPARQL query matching *name* as an exact fr/ar/en label.

    The query is deliberately simplified: it matches direct P31 values
    instead of wdt:P31/wdt:P279* (transitive subclass search) to avoid
    Wikidata Query Service timeouts on broad class hierarchies.
    """
    # Escape quotes so the name can sit inside a SPARQL string literal.
    name_escaped = name.replace('"', '\\"').replace("'", "\\'")

    return f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website ?coords ?inception
    WHERE {{
      # Search by label in French, Arabic, or English
      {{
        ?item rdfs:label "{name_escaped}"@fr .
      }} UNION {{
        ?item rdfs:label "{name_escaped}"@ar .
      }} UNION {{
        ?item rdfs:label "{name_escaped}"@en .
      }}

      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must be a heritage institution type (direct instance only, no subclass search)
      ?item wdt:P31 ?type .
      VALUES ?type {{
        wd:Q33506     # Museum
        wd:Q7075      # Library
        wd:Q166118    # Archive
        wd:Q1030034   # Archaeological museum
        wd:Q473972    # Art museum
        wd:Q570116    # Public library
        wd:Q22687     # Synagogue
        wd:Q7840289   # Art gallery
        wd:Q2668072   # National library
        wd:Q7210356   # Organization
      }}

      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,ar,en" . }}
    }}
    LIMIT 5
    """


def _binding_to_result(binding: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Flatten one SPARQL result binding into the enrichment result dict.

    Returns None when the binding carries no well-formed Q-identifier.
    """
    item_uri = binding.get("item", {}).get("value", "")
    qid = item_uri.split("/")[-1] if item_uri else None

    if not qid or not qid.startswith("Q"):
        return None

    result: Dict[str, Any] = {
        "qid": qid,
        "name": binding.get("itemLabel", {}).get("value", ""),
        "description": binding.get("itemDescription", {}).get("value", ""),
    }

    # Optional scalar properties: copy straight through when present.
    for var in ("viaf", "isil", "website"):
        if var in binding:
            result[var] = binding[var]["value"]

    if "inception" in binding:
        # Keep only the date part of the xsd:dateTime literal.
        result["founded_date"] = binding["inception"]["value"].split("T")[0]

    if "coords" in binding:
        coords_str = binding["coords"]["value"]
        # WKT point literal, e.g. "Point(10.18 36.80)" — longitude comes first.
        if coords_str.startswith("Point("):
            lon, lat = coords_str[6:-1].split()
            result["latitude"] = float(lat)
            result["longitude"] = float(lon)

    return result


def search_wikidata_by_name(name: str, city: Optional[str] = None, timeout: int = 60) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for a heritage institution by exact label match.

    Queries the public Wikidata SPARQL endpoint for items located in
    Tunisia (P17 = Q948) whose French, Arabic, or English label equals
    *name* and whose P31 is one of a fixed set of GLAM institution types.

    Args:
        name: Institution name to match exactly against rdfs:label.
        city: Currently unused; reserved for location-based disambiguation.
        timeout: HTTP timeout in seconds for the SPARQL request.

    Returns:
        Dict with 'qid', 'name', 'description' and, when present on the
        item, 'viaf', 'isil', 'website', 'founded_date', 'latitude',
        'longitude'; or None when nothing matched or the request failed.
    """
    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': _build_sparql_query(name),
        'format': 'json'
    }

    try:
        time.sleep(1.5)  # Rate limiting: be polite to the shared public endpoint.
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()

        bindings = response.json().get("results", {}).get("bindings", [])
        if not bindings:
            return None

        # LIMIT 5 keeps alternatives in the response, but the caller only
        # fuzzy-verifies a single candidate, so take the first binding.
        return _binding_to_result(bindings[0])

    except requests.exceptions.Timeout:
        print(f" ⏱️ Query timeout (>{timeout}s)")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ❌ Network error: {e}")
        return None
    except Exception as e:
        # Catch-all boundary: a single bad record must not crash the batch run.
        print(f" ❌ Error: {e}")
        return None
|
|
|
|
def add_wikidata_to_institution(institution: dict, wikidata_result: dict):
    """Merge Wikidata-derived identifiers into an institution record.

    Appends Wikidata / VIAF / ISIL identifier entries for schemes the
    record does not already carry, and appends an enrichment note to the
    provenance block when one exists. Mutates *institution* in place.

    Args:
        institution: Institution record (LinkML-shaped dict) to enrich.
        wikidata_result: Dict produced by search_wikidata_by_name(),
            guaranteed to contain 'qid' and optionally 'viaf' / 'isil'.
    """
    # Ensure the identifiers list exists before appending.
    if 'identifiers' not in institution:
        institution['identifiers'] = []

    # Schemes present before this enrichment pass; used to avoid duplicates.
    existing_schemes = {i.get('identifier_scheme') for i in institution['identifiers']}

    if 'Wikidata' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': wikidata_result['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{wikidata_result['qid']}"
        })

    # Add VIAF if present
    if wikidata_result.get('viaf') and 'VIAF' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'VIAF',
            'identifier_value': wikidata_result['viaf'],
            'identifier_url': f"https://viaf.org/viaf/{wikidata_result['viaf']}"
        })

    # Add ISIL if present. ISIL codes have no universal resolver URL,
    # so no identifier_url is recorded for them.
    if wikidata_result.get('isil') and 'ISIL' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'ISIL',
            'identifier_value': wikidata_result['isil'],
        })

    # Update provenance with a timestamped enrichment note.
    if 'provenance' in institution:
        notes = institution['provenance'].get('notes', '')
        # BUG FIX: qid already starts with "Q"; the old format string
        # prefixed another "Q", producing notes like "(QQ12345)".
        enrich_note = f" Wikidata enriched on {datetime.now(timezone.utc).isoformat()} ({wikidata_result['qid']})."
        institution['provenance']['notes'] = notes + enrich_note
|
|
|
|
def save_checkpoint(data: dict, input_file: Path, stats: dict):
    """Write the (partially) enriched dataset back to *input_file*.

    Refreshes the generation timestamp, records the enrichment step in
    the metadata, and dumps the full document as YAML over the input
    file (the file doubles as the checkpoint store).

    Args:
        data: Full YAML document, including '_metadata' and 'institutions'.
        input_file: Path the document is written back to.
        stats: Running counters; only 'enriched' is used (for the log line).
    """
    print(f"\n💾 Saving checkpoint... (enriched: {stats['enriched']})")
    data['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()

    # BUG FIX: the old code read 'enhancements' with .get(..., []) but then
    # appended to data['_metadata']['enhancements'] unconditionally, raising
    # KeyError for files that lacked the key. setdefault covers both cases.
    enhancements = data['_metadata'].setdefault('enhancements', [])
    if 'Wikidata enrichment' not in enhancements:
        enhancements.append('Wikidata enrichment')

    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
def main():
    """Run the Wikidata enrichment pass over the Tunisian institutions file.

    Loads the enhanced institutions YAML, searches Wikidata for every
    record that does not yet carry a Wikidata identifier, enriches records
    whose label fuzzy-matches the Wikidata hit, and checkpoints progress
    back to the same file every 10 institutions.
    """
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')

    print("Tunisia Wikidata Enrichment")
    print("=" * 60)
    print("Features:")
    print(" - Simplified SPARQL queries (no transitive subclass)")
    print(" - Multilingual search (French/Arabic/English)")
    print(" - Checkpoint saving every 10 institutions")
    print(" - Timeout handling (60s per query)")
    print("=" * 60)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    institutions = data['institutions']
    print(f"Total institutions: {len(institutions)}")

    # Running statistics for the final report.
    # NOTE: 'timeouts' stays 0 for now — search_wikidata_by_name() returns
    # None for timeouts and other failures alike, so they cannot be told
    # apart here. (The old check `"timeout" in str(result)` was dead code:
    # result is always None in that branch.)
    stats = {
        'total': len(institutions),
        'already_enriched': 0,
        'searched': 0,
        'found': 0,
        'enriched': 0,
        'failed': 0,
        'timeouts': 0
    }

    # Process each institution, checkpointing every N records.
    checkpoint_interval = 10

    for i, inst in enumerate(institutions, 1):
        name = inst.get('name', '')
        city = inst.get('locations', [{}])[0].get('city', '') if inst.get('locations') else ''

        # Skip records that already carry a Wikidata identifier.
        identifiers = inst.get('identifiers', [])
        existing_schemes = {ident.get('identifier_scheme') for ident in identifiers}

        if 'Wikidata' in existing_schemes:
            stats['already_enriched'] += 1
            qid = next((ident['identifier_value'] for ident in identifiers
                        if ident.get('identifier_scheme') == 'Wikidata'), 'unknown')
            print(f"[{i}/{len(institutions)}] ✓ {name} (already has {qid})")
            continue

        # Search Wikidata
        print(f"[{i}/{len(institutions)}] Searching: {name} ({city})")
        stats['searched'] += 1

        result = search_wikidata_by_name(name, city, timeout=60)

        if result:
            stats['found'] += 1
            print(f" ✅ Found: {result['qid']} - {result.get('name', '')}")

            # Verify the label actually matches the record name before
            # accepting the hit (guards against same-named institutions).
            match_score = fuzz.ratio(name.lower(), result['name'].lower())
            if match_score > 85:
                add_wikidata_to_institution(inst, result)
                stats['enriched'] += 1
                print(f" ✅ Enriched (match score: {match_score})")
            else:
                stats['failed'] += 1
                print(f" ⚠️ Low match score ({match_score}), skipping")
        else:
            stats['failed'] += 1
            print(f" ❌ Not found")

        # Checkpoint every N institutions
        if i % checkpoint_interval == 0 or i == len(institutions):
            save_checkpoint(data, input_file, stats)

    # Final save (covers the empty-list case and any trailing skips).
    save_checkpoint(data, input_file, stats)

    # Print statistics
    print("\n" + "=" * 60)
    print("WIKIDATA ENRICHMENT STATISTICS")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Searched: {stats['searched']}")
    print(f"Found: {stats['found']}")
    print(f"Enriched: {stats['enriched']}")
    print(f"Failed: {stats['failed']}")
    print(f"Timeouts: {stats['timeouts']}")

    # Guard the percentage against an empty institutions list.
    covered = stats['already_enriched'] + stats['enriched']
    pct = 100 * covered / stats['total'] if stats['total'] else 0.0
    print(f"\nFinal Wikidata coverage: {covered}/{stats['total']} ({pct:.1f}%)")
    print("\n✅ Wikidata enrichment complete!")
|
|
|
|
# Script entry point: run the enrichment pass when executed directly.
if __name__ == '__main__':
    main()
|