glam/scripts/enrich_belgium_wikidata_fuzzy.py
2025-12-21 00:01:54 +01:00

295 lines
9.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Belgian (BE) custodian files with Wikidata data using fuzzy name matching.
This script:
1. Fetches Belgian heritage institutions from Wikidata in batches
2. Uses fuzzy matching to find corresponding custodians
3. Enriches files that don't already have wikidata_enrichment
"""
import yaml
import glob
import time
import httpx
from datetime import datetime, timezone
from pathlib import Path
import logging
import sys
import re
from difflib import SequenceMatcher
# Configure logging: mirror all messages to a persistent log file and to
# stdout so long batch runs can be watched live and audited afterwards.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('be_wikidata_fuzzy.log'),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Wikidata Query Service SPARQL endpoint.
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
# Descriptive User-Agent with contact info, as requested by Wikimedia's
# User-Agent policy for automated clients.
USER_AGENT = "GLAMBot/1.0 (Heritage Custodian Enrichment; contact@example.org)"
# Minimum fuzzy-match ratio (0.0-1.0) required before a Wikidata candidate
# is accepted as a match for a custodian name.
MATCH_THRESHOLD = 0.85
def normalize_name(name: str) -> str:
    """Return a canonical lowercase form of an institution name.

    Strips common Dutch/French/German/English articles and prepositions,
    removes punctuation, and collapses whitespace so that spelling
    variants of the same institution compare as similar.
    """
    if not name:
        return ""
    lowered = name.lower()
    # Drop articles/prepositions that vary between the languages used
    # for Belgian institution labels.
    without_stopwords = re.sub(
        r'\b(de|het|een|the|le|la|les|du|des|van|voor|von)\b', '', lowered)
    # Keep only word characters and whitespace.
    letters_only = re.sub(r'[^\w\s]', '', without_stopwords)
    # Collapse runs of whitespace left behind by the removals.
    return re.sub(r'\s+', ' ', letters_only).strip()
def similarity(a: str, b: str) -> float:
    """Return the fuzzy similarity ratio (0.0-1.0) between two names.

    Both inputs are normalized first; if either normalizes to an empty
    string the score is 0.0.
    """
    left, right = normalize_name(a), normalize_name(b)
    if left and right:
        return SequenceMatcher(None, left, right).ratio()
    return 0.0
def fetch_belgian_institutions_by_type(type_qid: str, type_name: str) -> list[dict]:
    """Fetch Belgian institutions of a specific type from Wikidata.

    Args:
        type_qid: Wikidata Q-identifier of the institution class (used as
            the P31 "instance of" filter).
        type_name: Human-readable type label, used only for logging.

    Returns:
        List of dicts with keys wikidata_id, wikidata_url, label,
        description, website, image (the last three may be None).
        Returns an empty list if the request or parsing fails.
    """
    # P17 wd:Q31 restricts to items whose country is Belgium.
    # P856 (official website) and P18 (image) are OPTIONAL so items
    # lacking them are still returned. The label service resolves
    # itemLabel/itemDescription in nl > fr > de > en preference order.
    sparql_query = f"""
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?website ?image WHERE {{
?item wdt:P17 wd:Q31 .
?item wdt:P31 wd:{type_qid} .
OPTIONAL {{ ?item wdt:P856 ?website . }}
OPTIONAL {{ ?item wdt:P18 ?image . }}
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,fr,de,en". }}
}}
"""
    headers = {
        "User-Agent": USER_AGENT,  # required by Wikimedia's bot policy
        "Accept": "application/sparql-results+json"
    }
    try:
        response = httpx.get(
            WIKIDATA_SPARQL,
            params={"query": sparql_query, "format": "json"},
            headers=headers,
            timeout=60.0
        )
        response.raise_for_status()
        data = response.json()
        results = []
        for b in data.get("results", {}).get("bindings", []):
            # Entity URI looks like http://www.wikidata.org/entity/Q1234;
            # the final path segment is the Q-id.
            item_uri = b.get("item", {}).get("value", "")
            wikidata_id = item_uri.split("/")[-1] if item_uri else None
            if not wikidata_id:
                continue
            label = b.get("itemLabel", {}).get("value", "")
            # Skip if label is just the Q-number: the label service falls
            # back to the Q-id when no label exists in any requested
            # language, which is useless for fuzzy name matching.
            if label.startswith("Q") and label[1:].isdigit():
                continue
            results.append({
                "wikidata_id": wikidata_id,
                "wikidata_url": item_uri,
                "label": label,
                "description": b.get("itemDescription", {}).get("value"),
                "website": b.get("website", {}).get("value"),
                "image": b.get("image", {}).get("value"),
            })
        logger.info(f" {type_name}: {len(results)} items")
        return results
    except Exception as e:
        # Best-effort: log and return an empty list so one failing type
        # does not abort the whole enrichment run.
        logger.error(f"Error fetching {type_name}: {e}")
        return []
def fetch_belgian_institutions() -> list[dict]:
    """Collect Belgian heritage institutions across all tracked types.

    Issues one SPARQL query per institution class, deduplicates results
    by Wikidata id (first occurrence wins), and pauses one second
    between queries to stay within endpoint rate limits.
    """
    logger.info("Fetching Belgian institutions from Wikidata...")
    # (Q-identifier, human-readable label) pairs for each P31 class.
    institution_types = [
        ("Q7075", "library"),
        ("Q166118", "archive"),
        ("Q33506", "museum"),
        ("Q207694", "art museum"),
        ("Q1007870", "public library"),
        ("Q2668072", "provincial archive"),
        ("Q473972", "city archive"),
        ("Q17431399", "local history museum"),
        ("Q210272", "cultural center"),
        ("Q28564", "public library"),
        ("Q856234", "national library"),
    ]
    # Insertion-ordered dict keyed by Q-id: keeps the first record seen
    # for each institution, matching the original first-wins behavior.
    unique: dict[str, dict] = {}
    for type_qid, type_label in institution_types:
        for record in fetch_belgian_institutions_by_type(type_qid, type_label):
            unique.setdefault(record["wikidata_id"], record)
        time.sleep(1)  # Rate limiting between queries
    merged = list(unique.values())
    logger.info(f"Total unique Belgian institutions: {len(merged)}")
    return merged
def get_instance_of(wikidata_id: str) -> list[str]:
    """Get instance_of (P31) values for a Wikidata entity.

    Args:
        wikidata_id: Q-identifier of the entity (e.g. "Q190804").

    Returns:
        List of Q-identifiers of the entity's P31 classes; empty list on
        any failure (best-effort enrichment detail, never raises).
    """
    sparql_query = f"""
SELECT ?type WHERE {{
wd:{wikidata_id} wdt:P31 ?type .
}}
"""
    headers = {
        "User-Agent": USER_AGENT,
        "Accept": "application/sparql-results+json"
    }
    try:
        response = httpx.get(
            WIKIDATA_SPARQL,
            params={"query": sparql_query, "format": "json"},
            headers=headers,
            timeout=30.0
        )
        response.raise_for_status()
        data = response.json()
        types = []
        for binding in data.get("results", {}).get("bindings", []):
            # URI tail (after the last "/") is the Q-id of the class.
            type_uri = binding.get("type", {}).get("value", "")
            type_id = type_uri.split("/")[-1] if type_uri else None
            if type_id:
                types.append(type_id)
        return types
    except Exception as e:
        # BUG FIX: was a bare "except:" that also swallowed
        # KeyboardInterrupt/SystemExit and hid all errors. Catch only
        # Exception and log, keeping the non-fatal empty-list fallback.
        logger.warning(f"Could not fetch P31 types for {wikidata_id}: {e}")
        return []
def get_custodian_name(data: dict) -> str:
    """Extract the best available display name from custodian data.

    Preference order: curated emic name, then the original source
    entry's name, then a bare top-level name. Returns "" when no
    non-empty name is found.
    """
    candidates = (
        data.get('custodian_name', {}).get('emic_name'),
        data.get('original_entry', {}).get('name'),
        data.get('name'),
    )
    for candidate in candidates:
        if candidate:
            return candidate
    return ""
def _find_best_match(name: str, institutions: list[dict]) -> tuple:
    """Return (best-scoring institution, score) for *name*.

    Returns (None, 0.0) when *institutions* is empty.
    """
    best = None
    best_score = 0.0
    for candidate in institutions:
        score = similarity(name, candidate['label'])
        if score > best_score:
            best_score = score
            best = candidate
    return best, best_score


def main():
    """Main enrichment process.

    Fetches Belgian institutions from Wikidata, then walks every
    data/custodian/BE-*.yaml file: files without an existing
    wikidata_enrichment block are fuzzy-matched by name, and matches at
    or above MATCH_THRESHOLD get an enrichment block written back.
    """
    wikidata_institutions = fetch_belgian_institutions()
    if not wikidata_institutions:
        logger.error("Failed to fetch Wikidata institutions")
        return

    data_dir = Path("data/custodian")
    be_files = sorted(data_dir.glob("BE-*.yaml"))
    logger.info(f"Processing {len(be_files)} Belgian custodian files")

    enriched_count = 0
    skipped_count = 0
    for i, filepath in enumerate(be_files):
        if (i + 1) % 50 == 0:
            logger.info(f"Progress: {i+1}/{len(be_files)} files processed, {enriched_count} enriched")
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if not data:
                continue
            # Idempotency: never overwrite a previous enrichment.
            if 'wikidata_enrichment' in data:
                skipped_count += 1
                continue
            custodian_name = get_custodian_name(data)
            if not custodian_name:
                continue
            best_match, best_score = _find_best_match(custodian_name, wikidata_institutions)
            if best_score < MATCH_THRESHOLD:
                continue
            # Per-match P31 lookup; short sleep keeps us under the SPARQL
            # endpoint's rate limits.
            instance_of = get_instance_of(best_match['wikidata_id'])
            time.sleep(0.3)
            # Provenance fields record how and when the match was made so
            # it can be audited or re-run later.
            enrichment = {
                'wikidata_id': best_match['wikidata_id'],
                'wikidata_url': best_match['wikidata_url'],
                'matched_by': 'fuzzy_name_match',
                'match_score': round(best_score, 3),
                'matched_name': best_match['label'],
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_version': '2.1.0',
            }
            if best_match.get('label'):
                enrichment['wikidata_label'] = best_match['label']
            if best_match.get('description'):
                enrichment['wikidata_description'] = best_match['description']
            if best_match.get('website'):
                enrichment['official_website'] = best_match['website']
            if best_match.get('image'):
                enrichment['image'] = best_match['image']
            if instance_of:
                enrichment['instance_of'] = instance_of
            data['wikidata_enrichment'] = enrichment
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
            # BUG FIX: the old message ran the two names together with no
            # separator ("'name''label'"); add an explicit arrow.
            logger.info(
                f"Enriched {filepath.name}: '{custodian_name}' -> '{best_match['label']}' "
                f"({best_match['wikidata_id']}, score={best_score:.3f})"
            )
            enriched_count += 1
        except Exception as e:
            logger.error(f"Error processing {filepath}: {e}")

    logger.info("=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info(f"Total BE files: {len(be_files)}")
    logger.info(f"Enriched: {enriched_count}")
    logger.info(f"Skipped (already enriched): {skipped_count}")
# Run the enrichment only when executed as a script, not on import.
if __name__ == "__main__":
    main()