glam/scripts/enrich_tunisia_wikidata_fuzzy.py
2025-12-09 09:16:19 +01:00

305 lines
11 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Wikidata enrichment for Tunisian heritage institutions using fuzzy search.
Searches Wikidata by CONTAINS search rather than exact label match,
then uses fuzzy matching to verify results.
GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""
import yaml
import time
import requests
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List
from rapidfuzz import fuzz
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Wikidata-Enrichment/2.0"
def search_wikidata_fuzzy(name: str, city: Optional[str] = None, timeout: int = 60) -> Optional[Dict[str, Any]]:
    """Search Wikidata for a Tunisian heritage institution by fuzzy name match.

    Runs a single broad SPARQL query (every heritage-type institution located
    in Tunisia, LIMIT 100) and fuzzy-matches *name* against the returned
    labels client-side, keeping the best-scoring candidate.

    Args:
        name: Institution name to match against Wikidata labels.
        city: Currently unused; accepted for interface stability and reserved
            for future disambiguation between same-named institutions.
        timeout: HTTP request timeout in seconds.

    Returns:
        Dict with 'qid', 'name', 'description' and 'match_score' (0-100),
        plus optional 'viaf', 'isil', 'website', 'founded_date' and
        'latitude'/'longitude' keys — or None when no candidate reaches the
        70% score threshold, the QID is malformed, or any request error
        occurs (errors are printed, never raised).
    """
    # One broad country-wide query; fuzzy matching happens client-side
    # because server-side label search is too brittle for transliterated
    # French/Arabic institution names.
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
    WHERE {
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .
      # Must be heritage institution type
      ?item wdt:P31 ?type .
      VALUES ?type {
        wd:Q33506     # Museum
        wd:Q7075      # Library
        wd:Q166118    # Archive
        wd:Q1030034   # Archaeological museum
        wd:Q473972    # Art museum
        wd:Q570116    # Public library
        wd:Q22687     # Synagogue
        wd:Q7840289   # Art gallery
        wd:Q2668072   # National library
      }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    LIMIT 100
    """
    headers = {'User-Agent': USER_AGENT}
    params = {'query': query, 'format': 'json'}
    try:
        time.sleep(1.5)  # Rate limiting: be polite to the public endpoint
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        bindings = response.json().get("results", {}).get("bindings", [])
        if not bindings:
            return None

        # Pick the binding whose label best matches the requested name,
        # taking the best of three fuzzy strategies (exact/partial/token).
        best_match = None
        best_score = 0
        name_lower = name.lower()
        for binding in bindings:
            item_label = binding.get("itemLabel", {}).get("value", "").lower()
            score = max(
                fuzz.ratio(name_lower, item_label),
                fuzz.partial_ratio(name_lower, item_label),
                fuzz.token_set_ratio(name_lower, item_label),
            )
            if score > best_score:
                best_score = score
                best_match = binding

        # Require minimum 70% match
        if best_match is None or best_score < 70:
            return None

        item_uri = best_match.get("item", {}).get("value", "")
        qid = item_uri.rsplit("/", 1)[-1] if item_uri else ""
        if not qid.startswith("Q"):
            return None

        result: Dict[str, Any] = {
            "qid": qid,
            "name": best_match.get("itemLabel", {}).get("value", ""),
            "description": best_match.get("itemDescription", {}).get("value", ""),
            "match_score": best_score,
        }
        # Copy through the simple optional string properties.
        for key in ("viaf", "isil", "website"):
            if key in best_match:
                result[key] = best_match[key]["value"]
        if "inception" in best_match:
            # Keep only the date part of the xsd:dateTime value.
            result["founded_date"] = best_match["inception"]["value"].split("T")[0]
        if "coords" in best_match:
            coords_str = best_match["coords"]["value"]
            if coords_str.startswith("Point("):
                # WKT order is "Point(longitude latitude)".
                lon, lat = coords_str[6:-1].split()
                result["latitude"] = float(lat)
                result["longitude"] = float(lon)
        return result
    except requests.exceptions.Timeout:
        print(f" ⏱️ Query timeout (>{timeout}s)")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ❌ Network error: {e}")
        return None
    except Exception as e:
        # Best-effort enrichment: any parsing surprise is logged, not fatal.
        print(f" ❌ Error: {e}")
        return None
def add_wikidata_to_institution(institution: dict, wikidata_result: dict) -> None:
    """Merge a Wikidata search result into an institution record in place.

    Appends Wikidata / VIAF / ISIL identifiers (skipping any scheme already
    present on the record) and stamps a dated enrichment note into
    provenance.notes.

    Args:
        institution: Institution record (mutated in place).
        wikidata_result: Dict from search_wikidata_fuzzy(); must contain
            'qid'; may contain 'viaf', 'isil' and 'match_score'.
    """
    identifiers = institution.setdefault('identifiers', [])
    # Snapshot of schemes present before this enrichment pass.
    existing_schemes = {entry.get('identifier_scheme') for entry in identifiers}
    if 'Wikidata' not in existing_schemes:
        identifiers.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': wikidata_result['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{wikidata_result['qid']}"
        })
    # Add VIAF if present
    if wikidata_result.get('viaf') and 'VIAF' not in existing_schemes:
        identifiers.append({
            'identifier_scheme': 'VIAF',
            'identifier_value': wikidata_result['viaf'],
            'identifier_url': f"https://viaf.org/viaf/{wikidata_result['viaf']}"
        })
    # Add ISIL if present. ISIL codes have no universal resolver URL,
    # so no identifier_url is recorded for them.
    if wikidata_result.get('isil') and 'ISIL' not in existing_schemes:
        identifiers.append({
            'identifier_scheme': 'ISIL',
            'identifier_value': wikidata_result['isil'],
        })
    # Append a dated enrichment note to provenance for auditability.
    provenance = institution.setdefault('provenance', {})
    notes = provenance.get('notes', '')
    enrich_note = (
        f" Wikidata enriched {datetime.now(timezone.utc).strftime('%Y-%m-%d')} "
        f"({wikidata_result['qid']}, match: {wikidata_result.get('match_score', 0):.0f}%)."
    )
    provenance['notes'] = (notes + enrich_note).strip()
def save_checkpoint(data: dict, input_file: Path, stats: dict) -> None:
    """Write the (partially) enriched dataset back to *input_file* as YAML.

    Refreshes the _metadata block (generation timestamp, enrichment tag)
    before dumping so an interrupted run can resume from the saved state.

    Args:
        data: Full dataset dict; must contain a '_metadata' dict.
        input_file: Path the YAML is written back to (overwritten).
        stats: Progress counters; 'enriched', 'already_enriched' and
            'total' are used for the progress line.
    """
    print(f"\n💾 Saving checkpoint... (enriched: {stats['enriched']}, total coverage: {stats['already_enriched'] + stats['enriched']}/{stats['total']})")
    metadata = data['_metadata']
    metadata['generated'] = datetime.now(timezone.utc).isoformat()
    # setdefault fixes a latent KeyError: the old code checked membership via
    # .get('enhancements', []) but then appended to the key directly, which
    # crashed whenever 'enhancements' was missing from the metadata.
    enhancements = metadata.setdefault('enhancements', [])
    if 'Wikidata enrichment' not in enhancements:
        enhancements.append('Wikidata enrichment')
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
def main() -> None:
    """Enrich every institution in the Tunisia dataset with Wikidata links.

    Reads the enhanced-institutions YAML, skips records that already carry a
    Wikidata identifier, fuzzy-searches the rest, writes checkpoints every
    10 records plus a final save, and prints summary statistics.
    """
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')
    print("Tunisia Wikidata Enrichment (Fuzzy Search)")
    print("=" * 60)
    print("Features:")
    print(" - Broad SPARQL query (all Tunisian heritage institutions)")
    print(" - Client-side fuzzy matching (70% threshold)")
    print(" - Checkpoint saving every 10 institutions")
    print(" - Multiple match strategies (exact, partial, token)")
    print("=" * 60)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    institutions = data['institutions']
    print(f"Total institutions: {len(institutions)}")

    stats = {
        'total': len(institutions),
        'already_enriched': 0,
        'searched': 0,
        'found': 0,
        'enriched': 0,
        'failed': 0,
        'low_confidence': 0
    }

    checkpoint_interval = 10
    for i, inst in enumerate(institutions, 1):
        name = inst.get('name', '')
        locations = inst.get('locations')
        city = locations[0].get('city', '') if locations else ''

        # Skip records that already carry a Wikidata identifier.
        # (Loop variables renamed from `id` to avoid shadowing the builtin.)
        identifiers = inst.get('identifiers', [])
        existing_schemes = {ident.get('identifier_scheme') for ident in identifiers}
        if 'Wikidata' in existing_schemes:
            stats['already_enriched'] += 1
            qid = next((ident['identifier_value'] for ident in identifiers
                        if ident.get('identifier_scheme') == 'Wikidata'), 'unknown')
            print(f"[{i}/{len(institutions)}] ✓ {name} (already has {qid})")
            continue

        # Search Wikidata with fuzzy matching
        print(f"[{i}/{len(institutions)}] Searching: {name} ({city})")
        stats['searched'] += 1
        result = search_wikidata_fuzzy(name, city, timeout=60)
        if result:
            stats['found'] += 1
            match_score = result.get('match_score', 0)
            print(f" ✅ Found: {result['qid']} - {result.get('name', '')} (match: {match_score:.0f}%)")
            if match_score >= 70:
                add_wikidata_to_institution(inst, result)
                stats['enriched'] += 1
                print(f" ✅ Enriched")
            else:
                # Defensive branch: search_wikidata_fuzzy already filters
                # results below 70%, so this should rarely trigger.
                stats['low_confidence'] += 1
                stats['failed'] += 1
                print(f" ⚠️ Match score too low, skipping")
        else:
            stats['failed'] += 1
            print(f" ❌ Not found")

        # Periodic checkpoint; the final save below covers the tail, so the
        # old `or i == len(institutions)` double-save is dropped.
        if i % checkpoint_interval == 0:
            save_checkpoint(data, input_file, stats)

    # Final save (also handles datasets smaller than one checkpoint interval)
    save_checkpoint(data, input_file, stats)

    print("\n" + "=" * 60)
    print("WIKIDATA ENRICHMENT STATISTICS")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Searched: {stats['searched']}")
    print(f"Found: {stats['found']}")
    print(f"Enriched (new): {stats['enriched']}")
    print(f"Failed: {stats['failed']}")
    print(f"Low confidence: {stats['low_confidence']}")
    coverage = stats['already_enriched'] + stats['enriched']
    # Guard against ZeroDivisionError on an empty dataset.
    if stats['total']:
        print(f"\nFinal Wikidata coverage: {coverage}/{stats['total']} ({100 * coverage / stats['total']:.1f}%)")
    if stats['enriched'] > 0:
        print(f"✨ Added {stats['enriched']} new Wikidata identifiers!")
    print("\n✅ Wikidata enrichment complete!")


if __name__ == '__main__':
    main()