295 lines
9.5 KiB
Python
Executable file
295 lines
9.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Belgian (BE) custodian files with Wikidata data using fuzzy name matching.
|
|
|
|
This script:
|
|
1. Fetches Belgian heritage institutions from Wikidata in batches
|
|
2. Uses fuzzy matching to find corresponding custodians
|
|
3. Enriches files that don't already have wikidata_enrichment
|
|
"""
|
|
|
|
import yaml
|
|
import glob
|
|
import time
|
|
import httpx
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
import logging
|
|
import sys
|
|
import re
|
|
from difflib import SequenceMatcher
|
|
|
|
# Configure logging
|
|
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('be_wikidata_fuzzy.log'),  # persistent per-run log file
        logging.StreamHandler(sys.stdout)  # mirror everything to the console
    ]
)
logger = logging.getLogger(__name__)

# Public Wikidata SPARQL endpoint used by all queries below.
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
# Wikimedia etiquette requires an identifying User-Agent with contact info.
USER_AGENT = "GLAMBot/1.0 (Heritage Custodian Enrichment; contact@example.org)"
# Minimum fuzzy-similarity ratio (0..1) for a Wikidata match to be accepted.
MATCH_THRESHOLD = 0.85
|
|
|
|
def normalize_name(name: str) -> str:
    """Return a lowercased, stopword- and punctuation-free form of *name*.

    Used to make institution names comparable across languages: articles
    and linking words common in Dutch/French/German/English names are
    dropped, punctuation is stripped, and whitespace is collapsed.
    An empty or falsy input yields "".
    """
    if not name:
        return ""
    lowered = name.lower()
    # Drop common articles / prepositions so e.g. "Stadsarchief van Gent"
    # and "Stadsarchief Gent" compare equal.
    no_stopwords = re.sub(
        r'\b(de|het|een|the|le|la|les|du|des|van|voor|von)\b', '', lowered
    )
    # Strip everything that is neither a word character nor whitespace.
    no_punct = re.sub(r'[^\w\s]', '', no_stopwords)
    # Collapse runs of whitespace left behind by the removals above.
    return re.sub(r'\s+', ' ', no_punct).strip()
|
|
|
|
|
|
def similarity(a: str, b: str) -> float:
    """Return the fuzzy-match ratio (0.0–1.0) between two institution names.

    Both names are passed through normalize_name() first; if either
    normalizes to the empty string the pair is considered unmatchable
    and 0.0 is returned.
    """
    left, right = normalize_name(a), normalize_name(b)
    if not (left and right):
        return 0.0
    return SequenceMatcher(None, left, right).ratio()
|
|
|
|
|
|
def fetch_belgian_institutions_by_type(type_qid: str, type_name: str) -> list[dict]:
    """Fetch Belgian institutions of a specific type from Wikidata.

    Args:
        type_qid: Wikidata class Q-id to match via P31 (instance of),
            e.g. "Q33506" for museum.
        type_name: Human-readable type label, used only in log messages.

    Returns:
        A list of dicts with keys ``wikidata_id``, ``wikidata_url``,
        ``label``, ``description``, ``website`` and ``image`` (the last
        three may be None). Returns an empty list if the request or the
        JSON parse fails; the error is logged.
    """
    # P17 = country (Q31 = Belgium); labels resolved with nl/fr/de/en
    # preference via the label service.
    sparql_query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?website ?image WHERE {{
      ?item wdt:P17 wd:Q31 .
      ?item wdt:P31 wd:{type_qid} .
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P18 ?image . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,fr,de,en". }}
    }}
    """

    headers = {
        "User-Agent": USER_AGENT,
        "Accept": "application/sparql-results+json",
    }

    try:
        response = httpx.get(
            WIKIDATA_SPARQL,
            params={"query": sparql_query, "format": "json"},
            headers=headers,
            timeout=60.0,
        )
        response.raise_for_status()
        data = response.json()
    except (httpx.HTTPError, ValueError) as e:
        # httpx.HTTPError covers both transport failures and non-2xx status;
        # ValueError covers a non-JSON response body. Previously a blanket
        # `except Exception` also wrapped the parse loop below.
        logger.error("Error fetching %s: %s", type_name, e)
        return []

    results = []
    for b in data.get("results", {}).get("bindings", []):
        item_uri = b.get("item", {}).get("value", "")
        wikidata_id = item_uri.split("/")[-1] if item_uri else None
        if not wikidata_id:
            continue

        label = b.get("itemLabel", {}).get("value", "")
        # The label service falls back to the Q-number when no label exists
        # in any requested language; such rows are useless for name matching.
        if label.startswith("Q") and label[1:].isdigit():
            continue

        results.append({
            "wikidata_id": wikidata_id,
            "wikidata_url": item_uri,
            "label": label,
            "description": b.get("itemDescription", {}).get("value"),
            "website": b.get("website", {}).get("value"),
            "image": b.get("image", {}).get("value"),
        })

    logger.info(" %s: %d items", type_name, len(results))
    return results
|
|
|
|
|
|
def fetch_belgian_institutions() -> list[dict]:
    """Fetch all Belgian heritage institutions from Wikidata.

    Runs one SPARQL query per institution type, rate-limited to one
    query per second, and de-duplicates the results by Wikidata Q-id
    (first occurrence wins, preserving type order).
    """
    logger.info("Fetching Belgian institutions from Wikidata...")

    # (Q-id, human-readable label) pairs of institution classes to query.
    type_catalogue = [
        ("Q7075", "library"),
        ("Q166118", "archive"),
        ("Q33506", "museum"),
        ("Q207694", "art museum"),
        ("Q1007870", "public library"),
        ("Q2668072", "provincial archive"),
        ("Q473972", "city archive"),
        ("Q17431399", "local history museum"),
        ("Q210272", "cultural center"),
        ("Q28564", "public library"),
        ("Q856234", "national library"),
    ]

    # Insertion-ordered dict keyed by Q-id: keeps the first record seen
    # for each institution, matching the original seen-set behavior.
    unique: dict = {}
    for type_qid, type_label in type_catalogue:
        for record in fetch_belgian_institutions_by_type(type_qid, type_label):
            unique.setdefault(record["wikidata_id"], record)
        time.sleep(1)  # Rate limiting between queries

    institutions = list(unique.values())
    logger.info(f"Total unique Belgian institutions: {len(institutions)}")
    return institutions
|
|
|
|
|
|
def get_instance_of(wikidata_id: str) -> list[str]:
    """Get instance_of (P31) values for a Wikidata entity.

    Args:
        wikidata_id: Entity Q-id, e.g. "Q190804".

    Returns:
        List of Q-ids of the entity's P31 classes. On request or parse
        failure, logs a warning and returns an empty list (the caller
        treats the types as optional enrichment).
    """
    sparql_query = f"""
    SELECT ?type WHERE {{
      wd:{wikidata_id} wdt:P31 ?type .
    }}
    """

    headers = {
        "User-Agent": USER_AGENT,
        "Accept": "application/sparql-results+json",
    }

    try:
        response = httpx.get(
            WIKIDATA_SPARQL,
            params={"query": sparql_query, "format": "json"},
            headers=headers,
            timeout=30.0,
        )
        response.raise_for_status()
        data = response.json()
    except (httpx.HTTPError, ValueError) as e:
        # Was a bare `except:` that silently swallowed everything, including
        # KeyboardInterrupt/SystemExit. Keep the empty-list fallback but
        # narrow the catch and log the failure.
        logger.warning("Failed to fetch P31 types for %s: %s", wikidata_id, e)
        return []

    # Extract the trailing Q-id from each entity URI, skipping empty values.
    return [
        type_uri.split("/")[-1]
        for binding in data.get("results", {}).get("bindings", [])
        if (type_uri := binding.get("type", {}).get("value", ""))
    ]
|
|
|
|
|
|
def get_custodian_name(data: dict) -> str:
    """Extract the best available name from custodian data.

    Preference order: ``custodian_name.emic_name``, then
    ``original_entry.name``, then the top-level ``name``.

    Tolerates keys that are present but null (common in hand-edited
    YAML): the previous version crashed with AttributeError on e.g.
    ``custodian_name: null``, because dict.get's default is not used
    when the key exists with a None value.

    Returns:
        The first non-empty name found, or "" if none is available.
    """
    emic_name = (data.get('custodian_name') or {}).get('emic_name')
    if emic_name:
        return emic_name
    original_name = (data.get('original_entry') or {}).get('name')
    if original_name:
        return original_name
    if data.get('name'):
        return data['name']
    return ""
|
|
|
|
|
|
def main() -> None:
    """Main enrichment process.

    Fetches all Belgian institutions from Wikidata once, then walks every
    data/custodian/BE-*.yaml file, fuzzy-matches the custodian's name
    against the fetched labels, and writes a `wikidata_enrichment` block
    back into files that score at or above MATCH_THRESHOLD. Files that
    already contain `wikidata_enrichment` are skipped, so reruns are
    idempotent. Per-file errors are logged and do not abort the run.
    """
    # Fetch Wikidata institutions (one-time, up-front; reused for every file).
    wikidata_institutions = fetch_belgian_institutions()
    if not wikidata_institutions:
        logger.error("Failed to fetch Wikidata institutions")
        return

    # Load BE custodian files (sorted for deterministic processing order).
    data_dir = Path("data/custodian")
    be_files = sorted(data_dir.glob("BE-*.yaml"))

    logger.info(f"Processing {len(be_files)} Belgian custodian files")

    enriched_count = 0
    skipped_count = 0

    for i, filepath in enumerate(be_files):
        # Periodic progress report every 50 files.
        if (i + 1) % 50 == 0:
            logger.info(f"Progress: {i+1}/{len(be_files)} files processed, {enriched_count} enriched")

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            # Empty or null YAML document — nothing to enrich.
            if not data:
                continue

            # Skip if already enriched (makes reruns idempotent).
            if 'wikidata_enrichment' in data:
                skipped_count += 1
                continue

            # Get custodian name; without a name there is nothing to match on.
            custodian_name = get_custodian_name(data)
            if not custodian_name:
                continue

            # Find best match: linear scan over all fetched institutions,
            # keeping the highest fuzzy-similarity score.
            best_match = None
            best_score = 0

            for wd in wikidata_institutions:
                score = similarity(custodian_name, wd['label'])
                if score > best_score:
                    best_score = score
                    best_match = wd

            # Reject matches below the acceptance threshold.
            if best_score < MATCH_THRESHOLD:
                continue

            # Get instance_of (P31) for the match — one extra SPARQL query,
            # rate-limited with a short sleep.
            instance_of = get_instance_of(best_match['wikidata_id'])
            time.sleep(0.3)

            # Build enrichment block with provenance (how/when it was matched).
            enrichment = {
                'wikidata_id': best_match['wikidata_id'],
                'wikidata_url': best_match['wikidata_url'],
                'matched_by': 'fuzzy_name_match',
                'match_score': round(best_score, 3),
                'matched_name': best_match['label'],
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_version': '2.1.0',
            }

            # Optional fields: only written when Wikidata actually has them.
            if best_match.get('label'):
                enrichment['wikidata_label'] = best_match['label']
            if best_match.get('description'):
                enrichment['wikidata_description'] = best_match['description']
            if best_match.get('website'):
                enrichment['official_website'] = best_match['website']
            if best_match.get('image'):
                enrichment['image'] = best_match['image']
            if instance_of:
                enrichment['instance_of'] = instance_of

            # Add to data
            data['wikidata_enrichment'] = enrichment

            # Write back in place, preserving key order and non-ASCII text.
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

            logger.info(f"Enriched {filepath.name}: '{custodian_name}' → '{best_match['label']}' ({best_match['wikidata_id']}, score={best_score:.3f})")
            enriched_count += 1

        except Exception as e:
            # Per-file boundary: log and continue so one bad file
            # doesn't abort the whole run.
            logger.error(f"Error processing {filepath}: {e}")

    # Final summary.
    logger.info("=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info(f"Total BE files: {len(be_files)}")
    logger.info(f"Enriched: {enriched_count}")
    logger.info(f"Skipped (already enriched): {skipped_count}")
|
|
|
|
|
|
# Script entry point: run the full enrichment pass when executed directly.
if __name__ == "__main__":
    main()
|