#!/usr/bin/env python3
"""
Wikidata enrichment for Libyan heritage institutions using two-tier search strategy.

Two-Tier Search Strategy:
1. SPARQL query: Broad query for all Libyan heritage institutions → fuzzy match
2. MediaWiki API fallback: Text search via Wikidata's search API (catches edge cases)

The MediaWiki API search indexes labels, descriptions, aliases, and statement values
in all languages, complementing SPARQL's label-only matching.

GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""

import yaml
import time
import requests
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List
from rapidfuzz import fuzz
from glam_extractor.wikidata import search_wikidata_mediawiki

SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Libya-Wikidata-Enrichment/2.0"
|
||
|
||
def search_wikidata_fuzzy(name: str, city: Optional[str] = None, timeout: int = 60) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for a Libyan heritage institution by name, two-tier.

    Tier 1 runs one broad SPARQL query (all heritage-type institutions with
    country = Libya) and fuzzy-matches the results client-side against
    ``name``; a minimum score of 85% is required.  Tier 2 falls back to the
    MediaWiki full-text search API (``search_wikidata_mediawiki``) when
    SPARQL finds no confident match.

    Args:
        name: Institution name to look up.
        city: Optional city used as a geographic filter — candidates whose
            Wikidata city clearly differs (ratio < 50) are rejected; close
            matches (ratio >= 70) get a +5 score bonus.
        timeout: HTTP timeout in seconds, applied to both tiers.

    Returns:
        A dict with ``qid``, ``name``, ``description`` and ``match_score``
        (plus ``viaf``/``isil``/``website``/``founded_date``/``latitude``/
        ``longitude`` when present on the item; fallback results additionally
        carry ``enrichment_method``), or None when neither tier matched.
    """
    # Strategy: pull ALL Libyan heritage institutions in one query, then
    # fuzzy match client-side — SPARQL label matching alone is too brittle
    # for transliterated Arabic/French/English names.
    query1 = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website ?coords ?inception ?itemAltLabel ?cityLabel
    WHERE {
      # Must be in Libya
      ?item wdt:P17 wd:Q1016 .

      # Must be heritage institution type
      ?item wdt:P31 ?type .
      VALUES ?type {
        wd:Q33506     # Museum
        wd:Q7075      # Library
        wd:Q166118    # Archive
        wd:Q1030034   # Archaeological museum
        wd:Q473972    # Art museum
        wd:Q570116    # Public library
        wd:Q22687     # Synagogue
        wd:Q7840289   # Art gallery
        wd:Q2668072   # National library
      }

      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item wdt:P131 ?city . }
      OPTIONAL { ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    LIMIT 100
    """

    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query1,
        'format': 'json'
    }

    try:
        time.sleep(1.5)  # Rate limiting — be polite to the public endpoint
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()

        results = response.json()
        bindings = results.get("results", {}).get("bindings", [])

        if not bindings:
            return None

        # Fuzzy match against all results
        best_match = None
        best_score = 0

        name_lower = name.lower()
        city_lower = city.lower() if city else None

        for binding in bindings:
            item_label = binding.get("itemLabel", {}).get("value", "").lower()
            wd_city = binding.get("cityLabel", {}).get("value", "").lower()

            # Best of three fuzzy strategies: exact ratio, substring
            # (partial), and word-order-insensitive (token set).
            score = max(
                fuzz.ratio(name_lower, item_label),
                fuzz.partial_ratio(name_lower, item_label),
                fuzz.token_set_ratio(name_lower, item_label),
            )

            # MANDATORY geographic filtering: reject if cities clearly don't match.
            # This prevents false matches like "Barce Museum" matching to Misrata institutions.
            city_bonus = 0
            if city_lower and wd_city:
                city_match = fuzz.ratio(city_lower, wd_city)
                if city_match >= 70:  # Cities match well enough
                    print(f" ✓ City match: {city} ≈ {wd_city} (boosting +5%)")
                    city_bonus = 5  # Small bonus for city match
                elif city_match < 50:  # Cities are completely different
                    print(f" ❌ City mismatch: {city} vs {wd_city} (rejecting - geographic filter)")
                    continue  # Skip this candidate entirely
                else:
                    print(f" ⚠️ City difference: {city} vs {wd_city} (neutral, no bonus)")

            # Apply city bonus (but don't exceed 100)
            final_score = min(score + city_bonus, 100)

            if final_score > best_score:
                best_score = final_score
                best_match = binding

        # Require minimum 85% match for precision; geographic filtering
        # above handles the edge cases, so we can be conservative here.
        if best_score < 85:
            # SPARQL didn't find a good match — try MediaWiki API fallback
            print(f" ℹ️ SPARQL best match {best_score:.1f}% below threshold, trying MediaWiki API fallback...")

            fallback_result = search_wikidata_mediawiki(
                name=name,
                city=city,
                country_qid="Q1016",  # Libya
                fuzzy_threshold=85,
                timeout=timeout
            )

            if fallback_result:
                # MediaWiki API found a match!
                fallback_result["enrichment_method"] = "MediaWiki API search (fallback)"
                return fallback_result

            # Neither method found a match
            return None

        # Extract data from best match
        item_uri = best_match.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None

        if not qid or not qid.startswith("Q"):
            return None

        result = {
            "qid": qid,
            "name": best_match.get("itemLabel", {}).get("value", ""),
            "description": best_match.get("itemDescription", {}).get("value", ""),
            "match_score": best_score
        }

        if "viaf" in best_match:
            result["viaf"] = best_match["viaf"]["value"]

        if "isil" in best_match:
            result["isil"] = best_match["isil"]["value"]

        if "website" in best_match:
            result["website"] = best_match["website"]["value"]

        if "inception" in best_match:
            # Wikidata returns an ISO timestamp; keep only the date part.
            result["founded_date"] = best_match["inception"]["value"].split("T")[0]

        if "coords" in best_match:
            coords_str = best_match["coords"]["value"]
            if coords_str.startswith("Point("):
                # WKT literal is "Point(lon lat)" — longitude comes first.
                lon, lat = coords_str[6:-1].split()
                result["latitude"] = float(lat)
                result["longitude"] = float(lon)

        return result

    except requests.exceptions.Timeout:
        print(f" ⏱️ Query timeout (>{timeout}s)")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ❌ Network error: {e}")
        return None
    except Exception as e:
        # Last-resort catch: one bad record must never abort the whole run.
        print(f" ❌ Error: {e}")
        return None
|
||
|
||
def add_wikidata_to_institution(institution: dict, wikidata_result: dict) -> None:
    """Merge Wikidata match data into an institution record, in place.

    Appends Wikidata / VIAF / ISIL identifier entries — skipping any scheme
    the record already had before this call — and appends a dated enrichment
    note to ``institution['provenance']['notes']``.

    Args:
        institution: Institution record (LinkML-shaped dict); mutated in place.
        wikidata_result: Match dict from ``search_wikidata_fuzzy``; must
            contain ``qid`` and may contain ``viaf``, ``isil``,
            ``match_score`` and ``enrichment_method``.
    """
    if 'identifiers' not in institution:
        institution['identifiers'] = []

    # Snapshot of schemes present BEFORE this call, so we never duplicate one.
    existing_schemes = {i.get('identifier_scheme') for i in institution['identifiers']}

    if 'Wikidata' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': wikidata_result['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{wikidata_result['qid']}"
        })

    # Add VIAF if present
    if wikidata_result.get('viaf') and 'VIAF' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'VIAF',
            'identifier_value': wikidata_result['viaf'],
            'identifier_url': f"https://viaf.org/viaf/{wikidata_result['viaf']}"
        })

    # Add ISIL if present
    if wikidata_result.get('isil') and 'ISIL' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'ISIL',
            'identifier_value': wikidata_result['isil'],
            # ISIL codes don't have a universal resolver URL, so no
            # 'identifier_url' is recorded.  (This line was previously
            # corrupted by a stray "isil']}" fragment — comment restored.)
        })

    # Update provenance notes
    if 'provenance' not in institution:
        institution['provenance'] = {}

    # Track which tier produced the match (SPARQL vs MediaWiki API fallback)
    enrichment_method = wikidata_result.get('enrichment_method', 'SPARQL query')

    notes = institution['provenance'].get('notes', '')
    enrich_note = f" Wikidata enriched {datetime.now(timezone.utc).strftime('%Y-%m-%d')} ({wikidata_result['qid']}, match: {wikidata_result.get('match_score', 0):.0f}%, method: {enrichment_method})."
    institution['provenance']['notes'] = (notes + enrich_note).strip()
|
||
|
||
def save_checkpoint(data, input_file: Path, stats: dict) -> None:
    """Write the (partially) enriched dataset back to ``input_file`` as YAML.

    Supports both dataset layouts: a plain list of institutions
    (Algeria/Libya) and a dict carrying a ``_metadata`` header (Tunisia),
    whose timestamp and enhancement list are refreshed before saving.

    Args:
        data: Full dataset object as loaded from YAML (list or dict).
        input_file: Destination path (overwritten in place).
        stats: Run statistics; 'enriched', 'already_enriched' and 'total'
            are used for the progress message.
    """
    print(f"\n💾 Saving checkpoint... (enriched: {stats['enriched']}, total coverage: {stats['already_enriched'] + stats['enriched']}/{stats['total']})")

    # Handle metadata for dict format (Tunisia) vs list format (Algeria/Libya)
    if isinstance(data, dict) and '_metadata' in data:
        meta = data['_metadata']
        meta['generated'] = datetime.now(timezone.utc).isoformat()
        # BUGFIX: previously read via .get('enhancements', []) but appended
        # to meta['enhancements'] directly — a KeyError whenever _metadata
        # existed without an 'enhancements' list. setdefault fixes both paths.
        enhancements = meta.setdefault('enhancements', [])
        if 'Wikidata enrichment' not in enhancements:
            enhancements.append('Wikidata enrichment')

    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
||
|
||
def main():
    """Entry point: enrich the Libyan institutions dataset with Wikidata IDs.

    Loads the YAML dataset, runs the two-tier Wikidata search for every
    institution lacking a Wikidata identifier, writes results back in place
    with a checkpoint every 10 records, and prints summary statistics.
    """
    input_file = Path('data/instances/libya/libyan_institutions.yaml')

    print("Libya Wikidata Enrichment (Two-Tier Search)")
    print("=" * 60)
    print("Features:")
    print(" - Two-tier strategy: SPARQL → MediaWiki API fallback")
    print(" - SPARQL: Broad query (all Libyan heritage institutions)")
    print(" - MediaWiki API: Text search (labels, descriptions, aliases)")
    print(" - Client-side fuzzy matching (85% threshold)")
    print(" - Geographic filtering (city matching with bonus)")
    print(" - Checkpoint saving every 10 institutions")
    print(" - Multiple match strategies (exact, partial, token)")
    print("=" * 60)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Handle both list format and dict with 'institutions' key
    institutions = data if isinstance(data, list) else data.get('institutions', [])
    print(f"Total institutions: {len(institutions)}")

    # Statistics
    stats = {
        'total': len(institutions),
        'already_enriched': 0,
        'searched': 0,
        'found': 0,
        'enriched': 0,
        'failed': 0,
        'low_confidence': 0,
        'duplicate_prevented': 0
    }

    # Q-numbers already assigned (from prior runs) plus those assigned during
    # this run — prevents two institutions sharing one Wikidata item.
    used_qids = set()
    for inst in institutions:
        for ident in inst.get('identifiers', []):
            if ident.get('identifier_scheme') == 'Wikidata':
                used_qids.add(ident['identifier_value'])

    # Process each institution
    checkpoint_interval = 10

    for i, inst in enumerate(institutions, 1):
        name = inst.get('name', '')
        city = inst.get('locations', [{}])[0].get('city', '') if inst.get('locations') else ''

        # Check if already has Wikidata.
        # NOTE: loop variable renamed from 'id' — it shadowed the builtin.
        identifiers = inst.get('identifiers', [])
        existing_schemes = {ident.get('identifier_scheme') for ident in identifiers}

        if 'Wikidata' in existing_schemes:
            stats['already_enriched'] += 1
            qid = next((ident['identifier_value'] for ident in identifiers if ident.get('identifier_scheme') == 'Wikidata'), 'unknown')
            print(f"[{i}/{len(institutions)}] ✓ {name} (already has {qid})")
            continue

        # Search Wikidata with fuzzy matching
        print(f"[{i}/{len(institutions)}] Searching: {name} ({city})")
        stats['searched'] += 1

        result = search_wikidata_fuzzy(name, city, timeout=60)

        if result:
            stats['found'] += 1
            match_score = result.get('match_score', 0)
            qid = result['qid']
            print(f" ✅ Found: {qid} - {result.get('name', '')} (match: {match_score:.0f}%)")

            if qid in used_qids:
                # Same Q-number matched twice: keep the first assignment only.
                stats['duplicate_prevented'] += 1
                stats['failed'] += 1
                print(f" ⚠️ Q-number {qid} already assigned to another institution, skipping")
            elif match_score >= 80:
                # The search function already enforces 85%; this 80% floor is
                # a defensive double-check only.
                add_wikidata_to_institution(inst, result)
                used_qids.add(qid)  # Track this Q-number
                stats['enriched'] += 1
                print(f" ✅ Enriched")
            else:
                stats['low_confidence'] += 1
                stats['failed'] += 1
                print(f" ⚠️ Match score too low (<80%), skipping")
        else:
            stats['failed'] += 1
            print(f" ❌ Not found")

        # Checkpoint every N institutions
        if i % checkpoint_interval == 0 or i == len(institutions):
            save_checkpoint(data, input_file, stats)

    # Final save (harmless if the last iteration already checkpointed)
    save_checkpoint(data, input_file, stats)

    # Print statistics
    print("\n" + "=" * 60)
    print("WIKIDATA ENRICHMENT STATISTICS")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Searched: {stats['searched']}")
    print(f"Found: {stats['found']}")
    print(f"Enriched (new): {stats['enriched']}")
    print(f"Failed: {stats['failed']}")
    print(f" - Low confidence: {stats['low_confidence']}")
    print(f" - Duplicate Q-numbers prevented: {stats['duplicate_prevented']}")

    # BUGFIX: guard against ZeroDivisionError on an empty dataset.
    covered = stats['already_enriched'] + stats['enriched']
    pct = 100 * covered / stats['total'] if stats['total'] else 0.0
    print(f"\nFinal Wikidata coverage: {covered}/{stats['total']} ({pct:.1f}%)")

    if stats['enriched'] > 0:
        print(f"✨ Added {stats['enriched']} new Wikidata identifiers!")

    print("\n✅ Wikidata enrichment complete!")
|
||
|
||
# Script entry point: run the full enrichment pipeline when executed directly.
if __name__ == '__main__':
    main()
|