glam/scripts/enrich_kb_libraries_wikidata.py
kempersc 30162e6526 Add script to validate KB library entries and generate enrichment report
- Implemented a Python script to validate KB library YAML files for required fields and data quality.
- Analyzed enrichment coverage from Wikidata and Google Maps, generating statistics.
- Created a comprehensive markdown report summarizing validation results and enrichment quality.
- Included error handling for file loading and validation processes.
- Generated JSON statistics for further analysis.
2025-11-28 14:48:33 +01:00

524 lines
17 KiB
Python

#!/usr/bin/env python3
"""
Enrich KB Netherlands library entries with Wikidata data.
This script reads the KB ISIL library entries from data/nde/enriched/entries/
and enriches them with Wikidata data by:
1. Searching for Dutch public libraries in Wikidata by ISIL code
2. Falling back to fuzzy name matching for libraries not found by ISIL
3. Adding Wikidata IDs, coordinates, founding dates, etc.
Usage:
python scripts/enrich_kb_libraries_wikidata.py [--dry-run] [--limit N]
"""
import os
import sys
import time
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
import logging
import argparse
from difflib import SequenceMatcher
import re
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration
WIKIDATA_REST_API = "https://www.wikidata.org/w/rest.php/wikibase/v1"  # REST API base (not used by the functions below)
SPARQL_URL = "https://query.wikidata.org/sparql"  # Wikidata Query Service SPARQL endpoint
# Identify this tool to Wikimedia, per their User-Agent policy
USER_AGENT = "GLAM-KB-Library-Enricher/1.0 (https://github.com/sst/glam)"
# Rate limiting
REQUEST_DELAY = 0.5  # seconds to pause between consecutive SPARQL requests
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Lower-cases the name, strips one common Dutch library prefix
    ("stichting", "openbare bibliotheek", "bibliotheek", "ob") and one
    common suffix, replaces punctuation with spaces, and collapses
    whitespace.
    """
    name = name.lower()
    # Remove common prefixes. Longest alternative first, and require a word
    # boundary (\b) so that e.g. "ob" does not eat the start of unrelated
    # words such as "obstakel" (the original pattern turned it into "stakel").
    name = re.sub(r'^(stichting|openbare bibliotheek|bibliotheek|ob)\b\s*', '', name)
    # Remove common suffixes
    name = re.sub(r'\s*(bibliotheek|library|bieb|bibl\.?)$', '', name)
    # Replace punctuation with spaces
    name = re.sub(r'[^\w\s]', ' ', name)
    # Collapse runs of whitespace
    name = ' '.join(name.split())
    return name.strip()
def similarity_score(name1: str, name2: str) -> float:
    """Return a 0..1 similarity ratio between two normalized institution names."""
    return SequenceMatcher(None, normalize_name(name1), normalize_name(name2)).ratio()
def query_dutch_libraries_by_isil(client: httpx.Client, isil_codes: List[str]) -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata (one batched SPARQL request) for libraries by ISIL code (P791).

    Args:
        client: shared httpx client (connection reuse across calls).
        isil_codes: ISIL codes to look up; duplicates are collapsed.

    Returns:
        Dict mapping ISIL code -> record with qid, name, description,
        identifiers, and optional coordinates / founding date.
        Empty dict when nothing was requested or on any error.
    """
    if not isil_codes:
        return {}

    def _literal(code: str) -> str:
        # Escape backslashes and double quotes so a malformed ISIL value
        # cannot break (or inject into) the SPARQL query string.
        return '"' + code.replace('\\', '\\\\').replace('"', '\\"') + '"'

    # De-duplicate while preserving order, then build the VALUES clause
    isil_values = " ".join(_literal(code) for code in dict.fromkeys(isil_codes))
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      VALUES ?isil {{ {isil_values} }}
      ?item wdt:P791 ?isil .
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en" . }}
    }}
    """
    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": USER_AGENT,
    }
    try:
        response = client.get(
            SPARQL_URL,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=60.0
        )
        response.raise_for_status()
        data = response.json()
        results = {}
        for binding in data.get("results", {}).get("bindings", []):
            isil = binding.get("isil", {}).get("value", "")
            if not isil:
                continue
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q"):
                continue
            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "isil": isil,
                "identifiers": {}
            }
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part ("1900-01-01T00:00:00Z" -> "1900-01-01")
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    # WKT point is "Point(lon lat)" -- longitude comes first
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)
            results[isil] = result
        return results
    except Exception as e:
        logger.error(f"Error querying Wikidata by ISIL: {e}")
        return {}
def query_dutch_public_libraries(client: httpx.Client) -> Dict[str, Dict[str, Any]]:
    """
    Fetch Dutch libraries (instance of library or any subclass, country
    Netherlands) from Wikidata via SPARQL, capped at 1000 rows.

    Returns a dict mapping QID -> library record; empty dict on any error.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {
      # Libraries in Netherlands
      ?item wdt:P31/wdt:P279* wd:Q7075 . # instance of library (or subclass)
      ?item wdt:P17 wd:Q55 . # country: Netherlands
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "nl,en" . }
    }
    LIMIT 1000
    """
    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": USER_AGENT,
    }
    try:
        response = client.get(
            SPARQL_URL,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=60.0
        )
        response.raise_for_status()
        payload = response.json()
        libraries: Dict[str, Dict[str, Any]] = {}
        for row in payload.get("results", {}).get("bindings", []):
            uri = row.get("item", {}).get("value", "")
            qid = uri.rsplit("/", 1)[-1] if uri else None
            if not (qid and qid.startswith("Q")):
                continue
            record: Dict[str, Any] = {
                "qid": qid,
                "name": row.get("itemLabel", {}).get("value", ""),
                "description": row.get("itemDescription", {}).get("value", ""),
                "identifiers": {},
            }
            if "isil" in row:
                record["isil"] = row["isil"]["value"]
            if "viaf" in row:
                record["identifiers"]["VIAF"] = row["viaf"]["value"]
            if "website" in row:
                record["identifiers"]["Website"] = row["website"]["value"]
            if "inception" in row:
                # Date part only ("1900-01-01T00:00:00Z" -> "1900-01-01")
                record["founding_date"] = row["inception"]["value"].split("T")[0]
            if "coords" in row:
                point = row["coords"]["value"]
                if point.startswith("Point("):
                    # WKT point is "Point(lon lat)" -- longitude first
                    lon, lat = point[6:-1].split()
                    record["latitude"] = float(lat)
                    record["longitude"] = float(lon)
            libraries[qid] = record
        return libraries
    except Exception as e:
        logger.error(f"Error querying Wikidata for Dutch libraries: {e}")
        return {}
def find_best_match(
    name: str,
    city: str,
    libraries: Dict[str, Dict[str, Any]],
    threshold: float = 0.85
) -> Optional[Dict[str, Any]]:
    """
    Find the best fuzzy name match among `libraries`.

    The score is the normalized-name similarity plus a small boost when the
    city appears in the candidate's label (0.15) or description (0.1), so a
    boosted score can exceed 1.0.

    Returns a shallow copy of the winning record with a "match_score" key
    added, or None when no candidate reaches `threshold`. A copy is returned
    so the shared `libraries` dicts are never mutated (the previous
    implementation wrote "match_score" into the caller's data).
    """
    best_score = 0.0
    best_match: Optional[Dict[str, Any]] = None
    for lib_data in libraries.values():
        lib_name = lib_data.get("name", "")
        if not lib_name:
            continue
        # Calculate name similarity
        name_score = similarity_score(name, lib_name)
        # Boost score if city appears in library name or description
        city_boost = 0.0
        if city:
            city_lower = city.lower()
            if city_lower in lib_name.lower():
                city_boost = 0.15
            elif city_lower in lib_data.get("description", "").lower():
                city_boost = 0.1
        total_score = name_score + city_boost
        if total_score > best_score:
            best_score = total_score
            best_match = lib_data
    if best_match is not None and best_score >= threshold:
        result = dict(best_match)  # shallow copy: leave the shared record untouched
        result["match_score"] = best_score
        return result
    return None
def enrich_entry_with_wikidata(
    entry: Dict[str, Any],
    wikidata: Dict[str, Any],
    match_method: str
) -> Dict[str, Any]:
    """
    Attach a "wikidata_enrichment" section to `entry` (mutated in place)
    built from the matched Wikidata record, and return the entry.

    Optional fields (coordinates, inception date, identifiers, match score)
    are only included when present in `wikidata`.
    """
    block: Dict[str, Any] = {
        "wikidata_entity_id": wikidata["qid"],
        "wikidata_label": wikidata.get("name"),
        "wikidata_description": wikidata.get("description"),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "match_method": match_method,
    }
    # Coordinates only when both halves are present
    if "latitude" in wikidata and "longitude" in wikidata:
        block["wikidata_coordinates"] = {
            "latitude": wikidata["latitude"],
            "longitude": wikidata["longitude"],
        }
    # Founding date (already trimmed to YYYY-MM-DD upstream)
    if "founding_date" in wikidata:
        block["wikidata_inception"] = wikidata["founding_date"]
    # External identifiers (VIAF, website, ...), skipped when empty
    if wikidata.get("identifiers"):
        block["wikidata_identifiers"] = wikidata["identifiers"]
    # Fuzzy-match confidence, rounded for readability
    if "match_score" in wikidata:
        block["match_confidence"] = round(wikidata["match_score"], 3)
    entry["wikidata_enrichment"] = block
    return entry
def process_kb_entries(
    entries_dir: Path,
    dry_run: bool = False,
    limit: Optional[int] = None,
) -> Dict[str, int]:
    """
    Process all KB ISIL library entry YAML files in `entries_dir`.

    For every entry without an existing Wikidata enrichment, try an exact
    ISIL-code match first, then fall back to fuzzy name matching. Updated
    entries are written back in place unless `dry_run` is set.

    Args:
        entries_dir: directory containing "*_kb_isil.yaml" files.
        dry_run: when True, nothing is written back.
        limit: optional cap on the number of files processed.

    Returns:
        Stats dict with counts: total_files, isil_matches, fuzzy_matches,
        not_found, already_enriched, errors.
    """
    stats = {
        "total_files": 0,
        "isil_matches": 0,
        "fuzzy_matches": 0,
        "not_found": 0,
        "already_enriched": 0,
        "errors": 0,
    }
    # Find all KB ISIL files (total_files counts everything, before the limit)
    kb_files = sorted(entries_dir.glob("*_kb_isil.yaml"))
    stats["total_files"] = len(kb_files)
    if limit:
        kb_files = kb_files[:limit]
    logger.info(f"Found {stats['total_files']} KB library entries")
    logger.info(f"Processing {len(kb_files)} files (limit: {limit or 'none'})")
    # First pass: load every entry and collect ISIL codes for one batched
    # SPARQL query.
    entries_data = []
    isil_codes = []
    for yaml_file in kb_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)
            if not entry:
                continue
            # Skip entries that already carry a Wikidata enrichment
            if entry.get("wikidata_enrichment"):
                stats["already_enriched"] += 1
                continue
            # Prefer the ISIL code from the KB enrichment, falling back to
            # the original NDE entry.
            kb_enrichment = entry.get("kb_enrichment", {})
            isil_code = kb_enrichment.get("isil_code") or entry.get("original_entry", {}).get("isil_code_kb")
            if isil_code:
                isil_codes.append(isil_code)
            # FIX: entries without an ISIL code used to be dropped here, so
            # the fuzzy-name fallback never applied to them and they were
            # not counted. Keep them so they go through matching as well.
            entries_data.append({
                "file": yaml_file,
                "entry": entry,
                "isil_code": isil_code,
                "name": kb_enrichment.get("name") or entry.get("original_entry", {}).get("organisatie", ""),
                "city": kb_enrichment.get("city") or entry.get("original_entry", {}).get("plaatsnaam_bezoekadres", ""),
            })
        except Exception as e:
            logger.error(f"Error loading {yaml_file.name}: {e}")
            stats["errors"] += 1
    if not entries_data:
        logger.info("No entries to process")
        return stats
    logger.info(f"Collected {len(isil_codes)} ISIL codes for SPARQL query")
    with httpx.Client(timeout=60.0) as client:
        # Step 1: one batched Wikidata lookup for all ISIL codes
        logger.info("Querying Wikidata for libraries by ISIL codes...")
        isil_results = query_dutch_libraries_by_isil(client, isil_codes)
        logger.info(f"Found {len(isil_results)} libraries by ISIL code")
        time.sleep(REQUEST_DELAY)
        # Step 2: full list of Dutch libraries for the fuzzy fallback
        logger.info("Querying Wikidata for all Dutch libraries (for fuzzy matching)...")
        all_libraries = query_dutch_public_libraries(client)
        logger.info(f"Found {len(all_libraries)} Dutch libraries in Wikidata")
        time.sleep(REQUEST_DELAY)
        # Step 3: match and enrich each entry
        for entry_data in entries_data:
            yaml_file = entry_data["file"]
            entry = entry_data["entry"]
            isil_code = entry_data["isil_code"]
            name = entry_data["name"]
            city = entry_data["city"]
            logger.info(f"\nProcessing: {name} ({isil_code})")
            matched = False
            # Exact ISIL match takes precedence
            if isil_code and isil_code in isil_results:
                wikidata = isil_results[isil_code]
                logger.info(f" -> ISIL match: {wikidata['name']} ({wikidata['qid']})")
                entry = enrich_entry_with_wikidata(entry, wikidata, "isil_code_match")
                stats["isil_matches"] += 1
                matched = True
            # Fuzzy name fallback (needs a non-empty name; lower threshold)
            if not matched and name:
                fuzzy_match = find_best_match(name, city, all_libraries, threshold=0.75)
                if fuzzy_match:
                    logger.info(f" -> Fuzzy match: {fuzzy_match['name']} ({fuzzy_match['qid']}) [score: {fuzzy_match['match_score']:.3f}]")
                    entry = enrich_entry_with_wikidata(entry, fuzzy_match, "fuzzy_name_match")
                    stats["fuzzy_matches"] += 1
                    matched = True
            if not matched:
                logger.info(f" -> No match found")
                # Record the miss so a future run can skip or retry deliberately
                entry["wikidata_enrichment_status"] = "NOT_FOUND"
                entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                stats["not_found"] += 1
            # Persist the (possibly updated) entry back to its YAML file
            if not dry_run:
                try:
                    with open(yaml_file, 'w', encoding='utf-8') as f:
                        yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
                except Exception as e:
                    logger.error(f"Error saving {yaml_file.name}: {e}")
                    stats["errors"] += 1
    return stats
def main():
    """Parse CLI arguments, run the enrichment, and report a summary.

    Returns a process exit code (0 on success, 1 when the entries
    directory is missing).
    """
    parser = argparse.ArgumentParser(
        description="Enrich KB library entries with Wikidata data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of entries to process"
    )
    parser.add_argument(
        "--entries-dir",
        type=Path,
        default=Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries",
        help="Path to entries directory"
    )
    args = parser.parse_args()
    if args.dry_run:
        logger.info("DRY RUN MODE - no changes will be saved")
    if not args.entries_dir.exists():
        logger.error(f"Entries directory not found: {args.entries_dir}")
        return 1
    # Process entries
    stats = process_kb_entries(
        entries_dir=args.entries_dir,
        dry_run=args.dry_run,
        limit=args.limit,
    )
    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("WIKIDATA ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total KB library files: {stats['total_files']}")
    logger.info(f"Already enriched: {stats['already_enriched']}")
    logger.info(f"ISIL code matches: {stats['isil_matches']}")
    logger.info(f"Fuzzy name matches: {stats['fuzzy_matches']}")
    logger.info(f"Not found: {stats['not_found']}")
    logger.info(f"Errors: {stats['errors']}")
    total_enriched = stats["isil_matches"] + stats["fuzzy_matches"]
    # FIX: the denominator must be the number of entries actually run through
    # matching. The old formula (total_files - already_enriched - errors)
    # overcounts whenever --limit restricts the run, skewing the rate.
    total_processed = total_enriched + stats["not_found"]
    if total_processed > 0:
        success_rate = total_enriched / total_processed * 100
        logger.info(f"Success rate: {success_rate:.1f}%")
    # Save stats alongside the entries directory (skipped in dry-run mode)
    if not args.dry_run:
        stats_file = args.entries_dir.parent / f"kb_wikidata_enrichment_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(stats_file, 'w') as f:
            json.dump({
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "dry_run": args.dry_run,
                "limit": args.limit,
                **stats
            }, f, indent=2)
        logger.info(f"Stats saved to: {stats_file}")
    return 0
# Script entry point: propagate main()'s status code so shell callers
# (and CI) can detect failures.
if __name__ == "__main__":
    sys.exit(main())