# Changelog notes:
# - Implemented a Python script to validate KB library YAML files for required fields and data quality.
# - Analyzed enrichment coverage from Wikidata and Google Maps, generating statistics.
# - Created a comprehensive markdown report summarizing validation results and enrichment quality.
# - Included error handling for file loading and validation processes.
# - Generated JSON statistics for further analysis.
#!/usr/bin/env python3
|
|
"""
|
|
Enrich KB Netherlands library entries with Wikidata data.
|
|
|
|
This script reads the KB ISIL library entries from data/nde/enriched/entries/
|
|
and enriches them with Wikidata data by:
|
|
1. Searching for Dutch public libraries in Wikidata by ISIL code
|
|
2. Falling back to fuzzy name matching for libraries not found by ISIL
|
|
3. Adding Wikidata IDs, coordinates, founding dates, etc.
|
|
|
|
Usage:
|
|
python scripts/enrich_kb_libraries_wikidata.py [--dry-run] [--limit N]
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import json
|
|
import yaml
|
|
import httpx
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Any
|
|
from dataclasses import dataclass, field
|
|
import logging
|
|
import argparse
|
|
from difflib import SequenceMatcher
|
|
import re
|
|
|
|
# Set up logging
|
|
# Module-wide logging: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
# Wikibase REST API base URL (not referenced elsewhere in this script;
# all lookups go through the SPARQL endpoint below).
WIKIDATA_REST_API = "https://www.wikidata.org/w/rest.php/wikibase/v1"
# Public Wikidata SPARQL endpoint.
SPARQL_URL = "https://query.wikidata.org/sparql"
# Identifying User-Agent, per Wikimedia API etiquette for automated clients.
USER_AGENT = "GLAM-KB-Library-Enricher/1.0 (https://github.com/sst/glam)"

# Rate limiting
# Pause between consecutive SPARQL requests, in seconds.
REQUEST_DELAY = 0.5
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Lowercases the name, strips common Dutch library prefixes and
    suffixes ("stichting", "openbare bibliotheek", "bibliotheek",
    "ob", "bieb", "library", "bibl."), replaces punctuation with
    spaces, and collapses whitespace.

    Args:
        name: Raw institution name.

    Returns:
        Normalized name suitable for similarity comparison.
    """
    name = name.lower()

    # Remove common prefixes. The word boundary (\b) is essential:
    # without it, the "ob" alternative mangles names that merely start
    # with those letters (e.g. "obdam" -> "dam"). "openbare bibliotheek"
    # is listed before "bibliotheek" so the full phrase wins when present.
    name = re.sub(r'^(stichting|openbare bibliotheek|bibliotheek|ob)\b\s*', '', name)
    # Remove common suffixes.
    name = re.sub(r'\s*(bibliotheek|library|bieb|bibl\.?)$', '', name)

    # Replace punctuation with spaces so variants like "s-Gravenhage"
    # and "s Gravenhage" normalize identically.
    name = re.sub(r'[^\w\s]', ' ', name)

    # Collapse runs of whitespace to single spaces.
    name = ' '.join(name.split())

    return name.strip()
|
|
|
|
|
|
def similarity_score(name1: str, name2: str) -> float:
    """Return a 0-1 similarity ratio between two institution names.

    Both names are normalized first, so casing, punctuation and
    common Dutch library prefixes/suffixes do not affect the score.
    """
    return SequenceMatcher(
        None,
        normalize_name(name1),
        normalize_name(name2),
    ).ratio()
|
|
|
|
|
|
def query_dutch_libraries_by_isil(client: httpx.Client, isil_codes: List[str]) -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata for Dutch libraries by ISIL codes.

    Issues a single SPARQL query with a VALUES clause covering every
    code (one HTTP round-trip), matching items on P791 (ISIL identifier).

    Args:
        client: Shared httpx client used for the request.
        isil_codes: ISIL identifiers to resolve.

    Returns:
        Dict mapping ISIL code to Wikidata data: "qid", "name",
        "description", "isil", "identifiers" (VIAF/Website) and, when
        available, "founding_date" plus "latitude"/"longitude".
        Returns {} for empty input or on any request/parse error
        (enrichment is best-effort).
    """
    if not isil_codes:
        return {}

    # Build VALUES clause for ISIL codes
    isil_values = " ".join(f'"{code}"' for code in isil_codes)

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      VALUES ?isil {{ {isil_values} }}

      ?item wdt:P791 ?isil .

      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en" . }}
    }}
    """

    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": USER_AGENT,
    }

    try:
        response = client.get(
            SPARQL_URL,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=60.0
        )
        response.raise_for_status()
        data = response.json()

        results = {}
        for binding in data.get("results", {}).get("bindings", []):
            isil = binding.get("isil", {}).get("value", "")
            if not isil:
                continue

            # The QID is the last path segment of the entity URI.
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None

            if not qid or not qid.startswith("Q"):
                continue

            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "isil": isil,
                "identifiers": {}
            }

            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]

            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]

            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime value.
                result["founding_date"] = binding["inception"]["value"].split("T")[0]

            if "coords" in binding:
                # WKT literal "Point(lon lat)" — note longitude-first order.
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)

            results[isil] = result

        return results

    except Exception as e:
        # Best-effort: log and return empty rather than aborting the run.
        logger.error(f"Error querying Wikidata by ISIL: {e}")
        return {}
|
|
|
|
|
|
def query_dutch_public_libraries(client: httpx.Client) -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata for all Dutch public libraries.

    Fetches up to 1000 items that are a library (or a subclass of one,
    via P31/P279*) located in the Netherlands (P17 = Q55). Used as the
    candidate pool for fuzzy name matching.

    Args:
        client: Shared httpx client used for the request.

    Returns:
        Dict mapping QID to library data: "qid", "name", "description",
        "identifiers" (VIAF/Website) and, when available, "isil",
        "founding_date" plus "latitude"/"longitude".
        Returns {} on any request/parse error (best-effort).
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {
      # Libraries in Netherlands
      ?item wdt:P31/wdt:P279* wd:Q7075 .  # instance of library (or subclass)
      ?item wdt:P17 wd:Q55 .              # country: Netherlands

      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "nl,en" . }
    }
    LIMIT 1000
    """

    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": USER_AGENT,
    }

    try:
        response = client.get(
            SPARQL_URL,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=60.0
        )
        response.raise_for_status()
        data = response.json()

        results = {}
        for binding in data.get("results", {}).get("bindings", []):
            # The QID is the last path segment of the entity URI.
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None

            if not qid or not qid.startswith("Q"):
                continue

            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "identifiers": {}
            }

            if "isil" in binding:
                result["isil"] = binding["isil"]["value"]

            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]

            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]

            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime value.
                result["founding_date"] = binding["inception"]["value"].split("T")[0]

            if "coords" in binding:
                # WKT literal "Point(lon lat)" — note longitude-first order.
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)

            results[qid] = result

        return results

    except Exception as e:
        # Best-effort: log and return empty rather than aborting the run.
        logger.error(f"Error querying Wikidata for Dutch libraries: {e}")
        return {}
|
|
|
|
|
|
def find_best_match(
    name: str,
    city: str,
    libraries: Dict[str, Dict[str, Any]],
    threshold: float = 0.85
) -> Optional[Dict[str, Any]]:
    """
    Find the best matching library by fuzzy name similarity, with a
    small score boost when the city appears in the candidate's name
    or description.

    Args:
        name: Institution name to match.
        city: City of the institution (may be empty).
        libraries: Candidate libraries keyed by QID; each value needs
            at least a "name" key, optionally "description".
        threshold: Minimum combined score required to accept a match.

    Returns:
        A shallow copy of the best candidate with an added
        "match_score" key, or None when no candidate reaches the
        threshold. A copy is returned so the caller's `libraries`
        dict is never mutated.
    """
    best_score = 0.0
    best_match = None

    for lib_data in libraries.values():
        lib_name = lib_data.get("name", "")
        if not lib_name:
            continue

        # Base similarity on normalized names.
        name_score = similarity_score(name, lib_name)

        # Boost when the city shows up in the candidate's name or
        # description: Dutch library labels often embed the municipality.
        city_boost = 0.0
        if city:
            city_lower = city.lower()
            if city_lower in lib_name.lower():
                city_boost = 0.15
            elif city_lower in lib_data.get("description", "").lower():
                city_boost = 0.1

        total_score = name_score + city_boost

        if total_score > best_score:
            best_score = total_score
            best_match = lib_data

    if best_score >= threshold and best_match:
        # Annotate a copy — writing into the shared dict would leak a
        # stale "match_score" into later lookups of the same candidate.
        result = dict(best_match)
        result["match_score"] = best_score
        return result

    return None
|
|
|
|
|
|
def enrich_entry_with_wikidata(
    entry: Dict[str, Any],
    wikidata: Dict[str, Any],
    match_method: str
) -> Dict[str, Any]:
    """
    Attach a `wikidata_enrichment` section to *entry* in place.

    Args:
        entry: KB library entry dict (mutated and returned).
        wikidata: Matched Wikidata record; "qid" is required, the
            coordinates, founding date, identifiers and match score
            are optional.
        match_method: How the match was found, e.g. "isil_code_match".

    Returns:
        The same entry dict, now carrying "wikidata_enrichment".
    """
    section: Dict[str, Any] = {
        "wikidata_entity_id": wikidata["qid"],
        "wikidata_label": wikidata.get("name"),
        "wikidata_description": wikidata.get("description"),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "match_method": match_method,
    }

    # Coordinates are recorded only when both halves are present.
    if "latitude" in wikidata and "longitude" in wikidata:
        section["wikidata_coordinates"] = {
            "latitude": wikidata["latitude"],
            "longitude": wikidata["longitude"],
        }

    if "founding_date" in wikidata:
        section["wikidata_inception"] = wikidata["founding_date"]

    identifiers = wikidata.get("identifiers")
    if identifiers:
        section["wikidata_identifiers"] = identifiers

    # Fuzzy matches carry a confidence score; exact ISIL matches do not.
    if "match_score" in wikidata:
        section["match_confidence"] = round(wikidata["match_score"], 3)

    entry["wikidata_enrichment"] = section
    return entry
|
|
|
|
|
|
def process_kb_entries(
    entries_dir: Path,
    dry_run: bool = False,
    limit: Optional[int] = None,
) -> Dict[str, int]:
    """
    Process all KB ISIL library entries.

    Loads every "*_kb_isil.yaml" file in *entries_dir*, resolves their
    ISIL codes against Wikidata in one batched SPARQL query, falls back
    to fuzzy name matching against all Dutch libraries, and writes the
    enrichment back into each YAML file (unless dry_run).

    Args:
        entries_dir: Directory containing the per-library YAML files.
        dry_run: When True, nothing is written back to disk.
        limit: Process at most this many files; None means all.
            NOTE: a limit of 0 is treated as "no limit" (falsy check).

    Returns:
        Counters: total_files, isil_matches, fuzzy_matches, not_found,
        already_enriched, errors.
    """
    stats = {
        "total_files": 0,
        "isil_matches": 0,
        "fuzzy_matches": 0,
        "not_found": 0,
        "already_enriched": 0,
        "errors": 0,
    }

    # Find all KB ISIL files
    kb_files = sorted(entries_dir.glob("*_kb_isil.yaml"))
    # total_files reflects ALL files found, even when --limit truncates below.
    stats["total_files"] = len(kb_files)

    if limit:
        kb_files = kb_files[:limit]

    logger.info(f"Found {stats['total_files']} KB library entries")
    logger.info(f"Processing {len(kb_files)} files (limit: {limit or 'none'})")

    # First pass: load entries and collect ISIL codes so a single
    # batched SPARQL query can resolve all of them at once.
    entries_data = []
    isil_codes = []

    for yaml_file in kb_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            if not entry:
                continue

            # Check if already has Wikidata enrichment
            if entry.get("wikidata_enrichment"):
                stats["already_enriched"] += 1
                continue

            # Get ISIL code from KB enrichment, falling back to the raw entry.
            kb_enrichment = entry.get("kb_enrichment", {})
            isil_code = kb_enrichment.get("isil_code") or entry.get("original_entry", {}).get("isil_code_kb")

            if isil_code:
                isil_codes.append(isil_code)

            entries_data.append({
                "file": yaml_file,
                "entry": entry,
                "isil_code": isil_code,
                "name": kb_enrichment.get("name") or entry.get("original_entry", {}).get("organisatie", ""),
                "city": kb_enrichment.get("city") or entry.get("original_entry", {}).get("plaatsnaam_bezoekadres", ""),
            })

        except Exception as e:
            # A broken YAML file is counted but does not abort the run.
            logger.error(f"Error loading {yaml_file.name}: {e}")
            stats["errors"] += 1

    if not entries_data:
        logger.info("No entries to process")
        return stats

    logger.info(f"Collected {len(isil_codes)} ISIL codes for SPARQL query")

    with httpx.Client(timeout=60.0) as client:
        # Step 1: Query Wikidata for all ISIL codes at once
        logger.info("Querying Wikidata for libraries by ISIL codes...")
        isil_results = query_dutch_libraries_by_isil(client, isil_codes)
        logger.info(f"Found {len(isil_results)} libraries by ISIL code")

        # Be polite to the public SPARQL endpoint between queries.
        time.sleep(REQUEST_DELAY)

        # Step 2: Query Wikidata for all Dutch libraries (for fuzzy matching)
        logger.info("Querying Wikidata for all Dutch libraries (for fuzzy matching)...")
        all_libraries = query_dutch_public_libraries(client)
        logger.info(f"Found {len(all_libraries)} Dutch libraries in Wikidata")

        time.sleep(REQUEST_DELAY)

        # Step 3: Process each entry
        for entry_data in entries_data:
            yaml_file = entry_data["file"]
            entry = entry_data["entry"]
            isil_code = entry_data["isil_code"]
            name = entry_data["name"]
            city = entry_data["city"]

            logger.info(f"\nProcessing: {name} ({isil_code})")

            matched = False

            # Try ISIL match first (exact identifier -> highest confidence).
            if isil_code and isil_code in isil_results:
                wikidata = isil_results[isil_code]
                logger.info(f" -> ISIL match: {wikidata['name']} ({wikidata['qid']})")
                entry = enrich_entry_with_wikidata(entry, wikidata, "isil_code_match")
                stats["isil_matches"] += 1
                matched = True

            # Try fuzzy name matching if no ISIL match
            if not matched and name:
                # Looser threshold (0.75) than find_best_match's default.
                fuzzy_match = find_best_match(name, city, all_libraries, threshold=0.75)
                if fuzzy_match:
                    logger.info(f" -> Fuzzy match: {fuzzy_match['name']} ({fuzzy_match['qid']}) [score: {fuzzy_match['match_score']:.3f}]")
                    entry = enrich_entry_with_wikidata(entry, fuzzy_match, "fuzzy_name_match")
                    stats["fuzzy_matches"] += 1
                    matched = True

            if not matched:
                logger.info(f" -> No match found")
                # Record the miss so a rerun can distinguish "never tried"
                # from "searched and not found".
                entry["wikidata_enrichment_status"] = "NOT_FOUND"
                entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                stats["not_found"] += 1

            # Save updated entry
            if not dry_run:
                try:
                    with open(yaml_file, 'w', encoding='utf-8') as f:
                        yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
                except Exception as e:
                    logger.error(f"Error saving {yaml_file.name}: {e}")
                    stats["errors"] += 1

    return stats
|
|
|
|
|
|
def main():
    """Parse CLI arguments, run the enrichment, and report a summary.

    Returns:
        0 on success, 1 when the entries directory does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Enrich KB library entries with Wikidata data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done",
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of entries to process",
    )
    parser.add_argument(
        "--entries-dir",
        type=Path,
        default=Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries",
        help="Path to entries directory",
    )
    args = parser.parse_args()

    if args.dry_run:
        logger.info("DRY RUN MODE - no changes will be saved")

    if not args.entries_dir.exists():
        logger.error(f"Entries directory not found: {args.entries_dir}")
        return 1

    # Run the enrichment pipeline.
    results = process_kb_entries(
        entries_dir=args.entries_dir,
        dry_run=args.dry_run,
        limit=args.limit,
    )

    # Summary banner.
    logger.info("\n" + "=" * 60)
    logger.info("WIKIDATA ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info("Total KB library files: %s", results["total_files"])
    logger.info("Already enriched: %s", results["already_enriched"])
    logger.info("ISIL code matches: %s", results["isil_matches"])
    logger.info("Fuzzy name matches: %s", results["fuzzy_matches"])
    logger.info("Not found: %s", results["not_found"])
    logger.info("Errors: %s", results["errors"])

    # Success rate over the entries actually attempted this run.
    matched = results["isil_matches"] + results["fuzzy_matches"]
    attempted = results["total_files"] - results["already_enriched"] - results["errors"]
    if attempted > 0:
        logger.info("Success rate: %.1f%%", matched / attempted * 100)

    # Persist run statistics next to the entries directory.
    if not args.dry_run:
        stats_file = args.entries_dir.parent / f"kb_wikidata_enrichment_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(stats_file, 'w') as f:
            json.dump({
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "dry_run": args.dry_run,
                "limit": args.limit,
                **results
            }, f, indent=2)
        logger.info(f"Stats saved to: {stats_file}")

    return 0
|
|
|
|
|
|
# Script entry point: propagate main()'s return value as the exit code.
if __name__ == "__main__":
    sys.exit(main())
|