#!/usr/bin/env python3
"""
Enrich Belgian (BE) custodian files with Wikidata data using fuzzy name matching.

This script:
1. Fetches Belgian heritage institutions from Wikidata in batches
2. Uses fuzzy matching to find corresponding custodians
3. Enriches files that don't already have wikidata_enrichment
"""

import yaml
import time
import httpx
from datetime import datetime, timezone
from pathlib import Path
import logging
import sys
import re
from difflib import SequenceMatcher

# Configure logging: mirror everything to a file and to stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('be_wikidata_fuzzy.log'),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAMBot/1.0 (Heritage Custodian Enrichment; contact@example.org)"
# Minimum SequenceMatcher ratio (on normalized names) to accept a match.
MATCH_THRESHOLD = 0.85


def normalize_name(name: str) -> str:
    """Normalize an institution name for comparison.

    Lowercases, strips common Dutch/French/English/German stopwords
    (articles and prepositions), removes punctuation, and collapses
    whitespace. Returns "" for falsy input.
    """
    if not name:
        return ""
    name = name.lower()
    # Remove common prefixes/suffixes (articles/prepositions in nl/fr/en/de)
    name = re.sub(r'\b(de|het|een|the|le|la|les|du|des|van|voor|von)\b', '', name)
    # Remove punctuation
    name = re.sub(r'[^\w\s]', '', name)
    # Normalize whitespace
    name = re.sub(r'\s+', ' ', name).strip()
    return name


def similarity(a: str, b: str) -> float:
    """Return the similarity ratio (0.0-1.0) between two normalized names.

    Either side normalizing to "" yields 0.0 rather than a spurious match.
    """
    a_norm = normalize_name(a)
    b_norm = normalize_name(b)
    if not a_norm or not b_norm:
        return 0.0
    return SequenceMatcher(None, a_norm, b_norm).ratio()


def fetch_belgian_institutions_by_type(type_qid: str, type_name: str) -> list[dict]:
    """Fetch Belgian institutions of a specific type (P31) from Wikidata.

    Args:
        type_qid: Wikidata Q-id of the institution class (e.g. "Q33506").
        type_name: Human-readable label, used only for logging.

    Returns:
        A list of dicts with keys wikidata_id, wikidata_url, label,
        description, website, image. Returns [] on any request error
        (logged), so one failing type does not abort the run.
    """
    sparql_query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?website ?image WHERE {{
      ?item wdt:P17 wd:Q31 .
      ?item wdt:P31 wd:{type_qid} .
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P18 ?image . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,fr,de,en". }}
    }}
    """
    headers = {
        "User-Agent": USER_AGENT,
        "Accept": "application/sparql-results+json"
    }
    try:
        response = httpx.get(
            WIKIDATA_SPARQL,
            params={"query": sparql_query, "format": "json"},
            headers=headers,
            timeout=60.0
        )
        response.raise_for_status()
        data = response.json()

        results = []
        for b in data.get("results", {}).get("bindings", []):
            item_uri = b.get("item", {}).get("value", "")
            # Entity URI ends in the Q-number, e.g. .../entity/Q12345
            wikidata_id = item_uri.split("/")[-1] if item_uri else None
            if not wikidata_id:
                continue

            label = b.get("itemLabel", {}).get("value", "")
            # Skip if label is just the Q-number (no label in any requested language)
            if label.startswith("Q") and label[1:].isdigit():
                continue

            results.append({
                "wikidata_id": wikidata_id,
                "wikidata_url": item_uri,
                "label": label,
                "description": b.get("itemDescription", {}).get("value"),
                "website": b.get("website", {}).get("value"),
                "image": b.get("image", {}).get("value"),
            })

        logger.info("  %s: %d items", type_name, len(results))
        return results

    except Exception as e:
        logger.error("Error fetching %s: %s", type_name, e)
        return []


def fetch_belgian_institutions() -> list[dict]:
    """Fetch all Belgian heritage institutions from Wikidata.

    Queries one institution type at a time (with a 1 s pause between
    queries for rate limiting) and deduplicates results by Q-id, since
    an entity may carry several P31 values.
    """
    logger.info("Fetching Belgian institutions from Wikidata...")

    # Institution types (Q-id, label-for-logging) to fetch.
    # NOTE(review): Q1007870 and Q28564 are both labeled "public library"
    # here — confirm both Q-ids are intended.
    types = [
        ("Q7075", "library"),
        ("Q166118", "archive"),
        ("Q33506", "museum"),
        ("Q207694", "art museum"),
        ("Q1007870", "public library"),
        ("Q2668072", "provincial archive"),
        ("Q473972", "city archive"),
        ("Q17431399", "local history museum"),
        ("Q210272", "cultural center"),
        ("Q28564", "public library"),
        ("Q856234", "national library"),
    ]

    all_results = []
    seen_ids = set()

    for type_qid, type_name in types:
        results = fetch_belgian_institutions_by_type(type_qid, type_name)
        for r in results:
            if r["wikidata_id"] not in seen_ids:
                seen_ids.add(r["wikidata_id"])
                all_results.append(r)
        time.sleep(1)  # Rate limiting between queries

    logger.info("Total unique Belgian institutions: %d", len(all_results))
    return all_results


def get_instance_of(wikidata_id: str) -> list[str]:
    """Get instance_of (P31) values for a Wikidata entity.

    Returns a list of Q-ids, or [] on any error (best-effort: a failed
    lookup should not abort enrichment of the file).
    """
    sparql_query = f"""
    SELECT ?type WHERE {{
      wd:{wikidata_id} wdt:P31 ?type .
    }}
    """
    headers = {
        "User-Agent": USER_AGENT,
        "Accept": "application/sparql-results+json"
    }
    try:
        response = httpx.get(
            WIKIDATA_SPARQL,
            params={"query": sparql_query, "format": "json"},
            headers=headers,
            timeout=30.0
        )
        response.raise_for_status()
        data = response.json()

        types = []
        for binding in data.get("results", {}).get("bindings", []):
            type_uri = binding.get("type", {}).get("value", "")
            type_id = type_uri.split("/")[-1] if type_uri else None
            if type_id:
                types.append(type_id)
        return types
    except Exception as e:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # propagate; still best-effort, so log and return empty.
        logger.warning("Error fetching P31 for %s: %s", wikidata_id, e)
        return []


def get_custodian_name(data: dict) -> str:
    """Extract the best available name from custodian data.

    Preference order: custodian_name.emic_name, original_entry.name,
    top-level name. Returns "" when none is present.
    """
    if data.get('custodian_name', {}).get('emic_name'):
        return data['custodian_name']['emic_name']
    if data.get('original_entry', {}).get('name'):
        return data['original_entry']['name']
    if data.get('name'):
        return data['name']
    return ""


def main():
    """Main enrichment process.

    Loads every data/custodian/BE-*.yaml file, fuzzy-matches its
    custodian name against Belgian institutions fetched from Wikidata,
    and writes a `wikidata_enrichment` block back into files that match
    at or above MATCH_THRESHOLD. Files already enriched are skipped.
    """
    # Fetch Wikidata institutions
    wikidata_institutions = fetch_belgian_institutions()
    if not wikidata_institutions:
        logger.error("Failed to fetch Wikidata institutions")
        return

    # Load BE custodian files
    data_dir = Path("data/custodian")
    be_files = sorted(data_dir.glob("BE-*.yaml"))
    logger.info("Processing %d Belgian custodian files", len(be_files))

    enriched_count = 0
    skipped_count = 0

    for i, filepath in enumerate(be_files):
        if (i + 1) % 50 == 0:
            logger.info(
                "Progress: %d/%d files processed, %d enriched",
                i + 1, len(be_files), enriched_count
            )

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            if not data:
                continue

            # Skip if already enriched
            if 'wikidata_enrichment' in data:
                skipped_count += 1
                continue

            # Get custodian name
            custodian_name = get_custodian_name(data)
            if not custodian_name:
                continue

            # Find best match across all fetched institutions
            best_match = None
            best_score = 0.0
            for wd in wikidata_institutions:
                score = similarity(custodian_name, wd['label'])
                if score > best_score:
                    best_score = score
                    best_match = wd

            if best_score < MATCH_THRESHOLD:
                continue

            # Get instance_of for the match (best-effort; may be [])
            instance_of = get_instance_of(best_match['wikidata_id'])
            time.sleep(0.3)  # Rate limiting between per-file lookups

            # Build enrichment block
            enrichment = {
                'wikidata_id': best_match['wikidata_id'],
                'wikidata_url': best_match['wikidata_url'],
                'matched_by': 'fuzzy_name_match',
                'match_score': round(best_score, 3),
                'matched_name': best_match['label'],
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_version': '2.1.0',
            }
            if best_match.get('label'):
                enrichment['wikidata_label'] = best_match['label']
            if best_match.get('description'):
                enrichment['wikidata_description'] = best_match['description']
            if best_match.get('website'):
                enrichment['official_website'] = best_match['website']
            if best_match.get('image'):
                enrichment['image'] = best_match['image']
            if instance_of:
                enrichment['instance_of'] = instance_of

            # Add to data and write back in place
            data['wikidata_enrichment'] = enrichment
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, allow_unicode=True,
                          default_flow_style=False, sort_keys=False)

            logger.info(
                f"Enriched {filepath.name}: '{custodian_name}' → "
                f"'{best_match['label']}' ({best_match['wikidata_id']}, "
                f"score={best_score:.3f})"
            )
            enriched_count += 1

        except Exception as e:
            logger.error("Error processing %s: %s", filepath, e)

    logger.info("=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info("Total BE files: %d", len(be_files))
    logger.info("Enriched: %d", enriched_count)
    logger.info("Skipped (already enriched): %d", skipped_count)


if __name__ == "__main__":
    main()