#!/usr/bin/env python3
"""
Enrich KB Netherlands library entries with Wikidata data.

This script reads the KB ISIL library entries from data/nde/enriched/entries/
and enriches them with Wikidata data by:
1. Searching for Dutch public libraries in Wikidata by ISIL code
2. Falling back to fuzzy name matching for libraries not found by ISIL
3. Adding Wikidata IDs, coordinates, founding dates, etc.

Usage:
    python scripts/enrich_kb_libraries_wikidata.py [--dry-run] [--limit N]
"""

import os
import sys
import time
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
import logging
import argparse
from difflib import SequenceMatcher
import re

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
WIKIDATA_REST_API = "https://www.wikidata.org/w/rest.php/wikibase/v1"
SPARQL_URL = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-KB-Library-Enricher/1.0 (https://github.com/sst/glam)"

# Rate limiting: pause between SPARQL requests (seconds)
REQUEST_DELAY = 0.5

# Dutch library name prefixes to strip, longest-first so that
# "openbare bibliotheek" wins over its substrings.  A word boundary
# prevents stripping "ob" out of words like "ober...".
_PREFIX_RE = re.compile(r'^(openbare bibliotheek|bibliotheek|stichting|ob)\b\s*')
_SUFFIX_RE = re.compile(r'\s*(bibliotheek|library|bieb|bibl\.?)$')


def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Lowercases, strips common Dutch library prefixes/suffixes, drops
    punctuation and collapses whitespace.
    """
    name = name.lower()
    # Remove common prefixes/suffixes
    name = _PREFIX_RE.sub('', name)
    name = _SUFFIX_RE.sub('', name)
    # Remove punctuation
    name = re.sub(r'[^\w\s]', ' ', name)
    # Normalize whitespace
    name = ' '.join(name.split())
    return name.strip()


def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1) after normalization."""
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)
    return SequenceMatcher(None, norm1, norm2).ratio()


def _parse_library_binding(binding: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Convert one SPARQL result binding into a library record.

    Shared by both query functions.  Returns None when the binding has
    no usable Wikidata QID.
    """
    item_uri = binding.get("item", {}).get("value", "")
    qid = item_uri.split("/")[-1] if item_uri else None
    if not qid or not qid.startswith("Q"):
        return None

    result: Dict[str, Any] = {
        "qid": qid,
        "name": binding.get("itemLabel", {}).get("value", ""),
        "description": binding.get("itemDescription", {}).get("value", ""),
        "identifiers": {},
    }
    if "isil" in binding:
        result["isil"] = binding["isil"]["value"]
    if "viaf" in binding:
        result["identifiers"]["VIAF"] = binding["viaf"]["value"]
    if "website" in binding:
        result["identifiers"]["Website"] = binding["website"]["value"]
    if "inception" in binding:
        # Keep only the date part of the xsd:dateTime literal.
        result["founding_date"] = binding["inception"]["value"].split("T")[0]
    if "coords" in binding:
        coords_str = binding["coords"]["value"]
        # WKT literal, e.g. 'Point(4.89 52.37)' -> longitude latitude
        if coords_str.startswith("Point("):
            lon, lat = coords_str[6:-1].split()
            result["latitude"] = float(lat)
            result["longitude"] = float(lon)
    return result


def _sparql_string(value: str) -> str:
    """Quote a value as a SPARQL string literal, escaping \\ and "."""
    return '"' + value.replace('\\', '\\\\').replace('"', '\\"') + '"'


def query_dutch_libraries_by_isil(
    client: httpx.Client,
    isil_codes: List[str],
) -> Dict[str, Dict[str, Any]]:
    """Query Wikidata for Dutch libraries by ISIL codes.

    Returns dict mapping ISIL code to Wikidata data.  Returns {} on any
    request/parse error (logged), so callers can fall back gracefully.
    """
    if not isil_codes:
        return {}

    # Build VALUES clause for ISIL codes.  Codes are escaped so a stray
    # quote cannot break out of the SPARQL string literal.
    isil_values = " ".join(_sparql_string(code) for code in isil_codes)

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      VALUES ?isil {{ {isil_values} }}
      ?item wdt:P791 ?isil .
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en" . }}
    }}
    """

    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": USER_AGENT,
    }

    try:
        response = client.get(
            SPARQL_URL,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=60.0
        )
        response.raise_for_status()
        data = response.json()

        results: Dict[str, Dict[str, Any]] = {}
        for binding in data.get("results", {}).get("bindings", []):
            isil = binding.get("isil", {}).get("value", "")
            if not isil:
                continue
            result = _parse_library_binding(binding)
            if result is None:
                continue
            results[isil] = result
        return results

    except Exception as e:
        logger.error(f"Error querying Wikidata by ISIL: {e}")
        return {}


def query_dutch_public_libraries(client: httpx.Client) -> Dict[str, Dict[str, Any]]:
    """Query Wikidata for all Dutch public libraries.

    Returns dict mapping QID to library data (used for fuzzy matching).
    Returns {} on any request/parse error (logged).
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {
      # Libraries in Netherlands
      ?item wdt:P31/wdt:P279* wd:Q7075 .  # instance of library (or subclass)
      ?item wdt:P17 wd:Q55 .              # country: Netherlands
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "nl,en" . }
    }
    LIMIT 1000
    """

    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": USER_AGENT,
    }

    try:
        response = client.get(
            SPARQL_URL,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=60.0
        )
        response.raise_for_status()
        data = response.json()

        results: Dict[str, Dict[str, Any]] = {}
        for binding in data.get("results", {}).get("bindings", []):
            result = _parse_library_binding(binding)
            if result is None:
                continue
            results[result["qid"]] = result
        return results

    except Exception as e:
        logger.error(f"Error querying Wikidata for Dutch libraries: {e}")
        return {}


def find_best_match(
    name: str,
    city: str,
    libraries: Dict[str, Dict[str, Any]],
    threshold: float = 0.85
) -> Optional[Dict[str, Any]]:
    """Find best matching library by name and city.

    Returns a *copy* of the best-scoring library record (with a
    "match_score" key added) when the score reaches ``threshold``,
    otherwise None.  Copying avoids mutating the shared ``libraries``
    cache, which is reused across entries.
    """
    best_score = 0.0
    best_match = None

    for qid, lib_data in libraries.items():
        lib_name = lib_data.get("name", "")
        if not lib_name:
            continue

        # Calculate name similarity
        name_score = similarity_score(name, lib_name)

        # Boost score if city appears in library name or description
        city_boost = 0.0
        if city:
            city_lower = city.lower()
            if city_lower in lib_name.lower():
                city_boost = 0.15
            elif city_lower in lib_data.get("description", "").lower():
                city_boost = 0.1

        total_score = name_score + city_boost
        if total_score > best_score:
            best_score = total_score
            best_match = lib_data

    if best_score >= threshold and best_match:
        matched = dict(best_match)
        matched["match_score"] = best_score
        return matched
    return None


def enrich_entry_with_wikidata(
    entry: Dict[str, Any],
    wikidata: Dict[str, Any],
    match_method: str
) -> Dict[str, Any]:
    """Attach a ``wikidata_enrichment`` section to ``entry`` and return it.

    ``match_method`` records how the match was found (e.g.
    "isil_code_match" or "fuzzy_name_match").
    """
    enrichment: Dict[str, Any] = {
        "wikidata_entity_id": wikidata["qid"],
        "wikidata_label": wikidata.get("name"),
        "wikidata_description": wikidata.get("description"),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "match_method": match_method,
    }

    # Add coordinates if available
    if "latitude" in wikidata and "longitude" in wikidata:
        enrichment["wikidata_coordinates"] = {
            "latitude": wikidata["latitude"],
            "longitude": wikidata["longitude"]
        }

    # Add founding date
    if "founding_date" in wikidata:
        enrichment["wikidata_inception"] = wikidata["founding_date"]

    # Add identifiers
    if wikidata.get("identifiers"):
        enrichment["wikidata_identifiers"] = wikidata["identifiers"]

    # Add match score if available (fuzzy matches only)
    if "match_score" in wikidata:
        enrichment["match_confidence"] = round(wikidata["match_score"], 3)

    entry["wikidata_enrichment"] = enrichment
    return entry


def process_kb_entries(
    entries_dir: Path,
    dry_run: bool = False,
    limit: Optional[int] = None,
) -> Dict[str, int]:
    """Process all KB ISIL library entries.

    Loads every ``*_kb_isil.yaml`` file, queries Wikidata (one bulk ISIL
    query + one all-Dutch-libraries query for fuzzy fallback), enriches
    each entry and writes it back unless ``dry_run``.  Returns a stats
    dict with match/error counters.
    """
    stats = {
        "total_files": 0,
        "isil_matches": 0,
        "fuzzy_matches": 0,
        "not_found": 0,
        "already_enriched": 0,
        "errors": 0,
    }

    # Find all KB ISIL files
    kb_files = sorted(entries_dir.glob("*_kb_isil.yaml"))
    stats["total_files"] = len(kb_files)
    if limit:
        kb_files = kb_files[:limit]

    logger.info(f"Found {stats['total_files']} KB library entries")
    logger.info(f"Processing {len(kb_files)} files (limit: {limit or 'none'})")

    # Collect all ISIL codes first so a single SPARQL query can resolve them
    entries_data = []
    isil_codes = []

    for yaml_file in kb_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)
            if not entry:
                continue

            # Check if already has Wikidata enrichment
            if entry.get("wikidata_enrichment"):
                stats["already_enriched"] += 1
                continue

            # Get ISIL code from KB enrichment
            kb_enrichment = entry.get("kb_enrichment", {})
            isil_code = kb_enrichment.get("isil_code") or entry.get("original_entry", {}).get("isil_code_kb")
            if isil_code:
                isil_codes.append(isil_code)

            # Entries without an ISIL code are still kept: they can be
            # resolved via fuzzy name matching below.
            entries_data.append({
                "file": yaml_file,
                "entry": entry,
                "isil_code": isil_code,
                "name": kb_enrichment.get("name") or entry.get("original_entry", {}).get("organisatie", ""),
                "city": kb_enrichment.get("city") or entry.get("original_entry", {}).get("plaatsnaam_bezoekadres", ""),
            })

        except Exception as e:
            logger.error(f"Error loading {yaml_file.name}: {e}")
            stats["errors"] += 1

    if not entries_data:
        logger.info("No entries to process")
        return stats

    logger.info(f"Collected {len(isil_codes)} ISIL codes for SPARQL query")

    with httpx.Client(timeout=60.0) as client:
        # Step 1: Query Wikidata for all ISIL codes at once
        logger.info("Querying Wikidata for libraries by ISIL codes...")
        isil_results = query_dutch_libraries_by_isil(client, isil_codes)
        logger.info(f"Found {len(isil_results)} libraries by ISIL code")
        time.sleep(REQUEST_DELAY)

        # Step 2: Query Wikidata for all Dutch libraries (for fuzzy matching)
        logger.info("Querying Wikidata for all Dutch libraries (for fuzzy matching)...")
        all_libraries = query_dutch_public_libraries(client)
        logger.info(f"Found {len(all_libraries)} Dutch libraries in Wikidata")
        time.sleep(REQUEST_DELAY)

        # Step 3: Process each entry
        for entry_data in entries_data:
            yaml_file = entry_data["file"]
            entry = entry_data["entry"]
            isil_code = entry_data["isil_code"]
            name = entry_data["name"]
            city = entry_data["city"]

            logger.info(f"\nProcessing: {name} ({isil_code})")

            matched = False

            # Try ISIL match first
            if isil_code and isil_code in isil_results:
                wikidata = isil_results[isil_code]
                logger.info(f" -> ISIL match: {wikidata['name']} ({wikidata['qid']})")
                entry = enrich_entry_with_wikidata(entry, wikidata, "isil_code_match")
                stats["isil_matches"] += 1
                matched = True

            # Try fuzzy name matching if no ISIL match
            if not matched and name:
                fuzzy_match = find_best_match(name, city, all_libraries, threshold=0.75)
                if fuzzy_match:
                    logger.info(f" -> Fuzzy match: {fuzzy_match['name']} ({fuzzy_match['qid']}) [score: {fuzzy_match['match_score']:.3f}]")
                    entry = enrich_entry_with_wikidata(entry, fuzzy_match, "fuzzy_name_match")
                    stats["fuzzy_matches"] += 1
                    matched = True

            if not matched:
                logger.info(" -> No match found")
                entry["wikidata_enrichment_status"] = "NOT_FOUND"
                entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                stats["not_found"] += 1

            # Save updated entry
            if not dry_run:
                try:
                    with open(yaml_file, 'w', encoding='utf-8') as f:
                        yaml.dump(entry, f, allow_unicode=True,
                                  default_flow_style=False, sort_keys=False)
                except Exception as e:
                    logger.error(f"Error saving {yaml_file.name}: {e}")
                    stats["errors"] += 1

    return stats


def main():
    """Main entry point.  Returns a process exit code (0 ok, 1 on error)."""
    parser = argparse.ArgumentParser(
        description="Enrich KB library entries with Wikidata data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of entries to process"
    )
    parser.add_argument(
        "--entries-dir",
        type=Path,
        default=Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries",
        help="Path to entries directory"
    )

    args = parser.parse_args()

    if args.dry_run:
        logger.info("DRY RUN MODE - no changes will be saved")

    if not args.entries_dir.exists():
        logger.error(f"Entries directory not found: {args.entries_dir}")
        return 1

    # Process entries
    stats = process_kb_entries(
        entries_dir=args.entries_dir,
        dry_run=args.dry_run,
        limit=args.limit,
    )

    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("WIKIDATA ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total KB library files: {stats['total_files']}")
    logger.info(f"Already enriched: {stats['already_enriched']}")
    logger.info(f"ISIL code matches: {stats['isil_matches']}")
    logger.info(f"Fuzzy name matches: {stats['fuzzy_matches']}")
    logger.info(f"Not found: {stats['not_found']}")
    logger.info(f"Errors: {stats['errors']}")

    total_enriched = stats["isil_matches"] + stats["fuzzy_matches"]
    # Denominator = entries actually attempted this run; using
    # total_files here would understate the rate when --limit is set.
    total_processed = total_enriched + stats["not_found"]
    if total_processed > 0:
        success_rate = total_enriched / total_processed * 100
        logger.info(f"Success rate: {success_rate:.1f}%")

    # Save stats
    if not args.dry_run:
        stats_file = args.entries_dir.parent / f"kb_wikidata_enrichment_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(stats_file, 'w') as f:
            json.dump({
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "dry_run": args.dry_run,
                "limit": args.limit,
                **stats
            }, f, indent=2)
        logger.info(f"Stats saved to: {stats_file}")

    return 0


if __name__ == "__main__":
    sys.exit(main())