1043 lines
36 KiB
Python
1043 lines
36 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Unified NDE Entry Enrichment Script
|
|
|
|
This script provides a flexible way to enrich NDE entries with:
|
|
- Wikidata data (Q-numbers, coordinates, founding dates, identifiers)
|
|
- Google Maps data (place IDs, coordinates, ratings, reviews, opening hours)
|
|
|
|
Supports different entry types through configuration profiles:
|
|
- museum_register: Museum Register Nederland entries (1515-1655)
|
|
- kb_isil: KB Netherlands library entries
|
|
- all: All entries without enrichment
|
|
- custom: Custom entry range or pattern
|
|
|
|
Usage:
|
|
# Enrich Museum Register entries with Wikidata
|
|
python scripts/enrich_nde_entries.py --profile museum_register --source wikidata
|
|
|
|
# Enrich KB libraries with Google Maps
|
|
python scripts/enrich_nde_entries.py --profile kb_isil --source google_maps
|
|
|
|
# Enrich specific range with both sources
|
|
python scripts/enrich_nde_entries.py --start 1515 --end 1600 --source both
|
|
|
|
# Enrich all entries missing Wikidata
|
|
python scripts/enrich_nde_entries.py --profile all --source wikidata  # already-enriched entries are skipped by default; use --force to redo
|
|
|
|
Environment Variables:
|
|
GOOGLE_PLACES_TOKEN - Required for Google Maps enrichment
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import json
|
|
import yaml
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Any, Tuple, Callable
|
|
from dataclasses import dataclass, field, asdict
|
|
from difflib import SequenceMatcher
|
|
import logging
|
|
import argparse
|
|
|
|
# httpx is a hard dependency: every Wikidata and Google request goes through it.
try:
    import httpx
except ImportError:
    print("httpx is required. Install with: pip install httpx")
    sys.exit(1)

# .env support is optional; without python-dotenv the caller must export the
# environment variables itself (e.g. GOOGLE_PLACES_TOKEN).
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # dotenv is optional

# Set up logging: timestamped INFO-level messages on the root handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# =============================================================================
|
|
# Configuration Profiles
|
|
# =============================================================================
|
|
|
|
# Enrichment profiles. Each profile selects a set of entry files and the
# Wikidata query flavor / Google type hint used while enriching them.
#
# Keys per profile:
#   description         - human-readable summary, logged at startup
#   entry_range         - inclusive (start, end) numeric filename-prefix range,
#                         or None when not range-based
#   file_pattern        - glob for entry files, or None to use entry_range
#   has_field           - (optional) keep only entries whose original_entry
#                         carries this truthy field
#   institution_type    - hint appended to Google Places search queries
#   wikidata_query_type - SPARQL type filter name (see get_wikidata_query)
PROFILES = {
    "museum_register": {
        "description": "Museum Register Nederland entries",
        "entry_range": (1515, 1655),
        "file_pattern": None,
        "institution_type": "museum",
        "wikidata_query_type": "museum",
    },
    "kb_isil": {
        "description": "KB Netherlands library entries",
        "entry_range": None,
        "file_pattern": "*_kb_isil.yaml",
        "institution_type": "library",
        "wikidata_query_type": "library",
    },
    "na_isil": {
        "description": "NA Netherlands archive entries",
        "entry_range": None,
        "file_pattern": None,
        # No pattern/range: selection is done by presence of this field.
        "has_field": "isil-code_na",
        "institution_type": "archive",
        "wikidata_query_type": "archive",
    },
    "all": {
        "description": "All entries",
        "entry_range": None,
        "file_pattern": "*.yaml",
        "institution_type": None,
        "wikidata_query_type": "heritage",
    },
}
|
|
|
|
|
|
# =============================================================================
|
|
# API Configuration
|
|
# =============================================================================
|
|
|
|
# Wikidata SPARQL endpoint.
SPARQL_URL = "https://query.wikidata.org/sparql"
# Google Places API (New) text-search endpoint.
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
# Identifies this script to remote APIs (Wikimedia User-Agent policy).
USER_AGENT = "GLAM-NDE-Enricher/1.0 (https://github.com/sst/glam)"
REQUEST_DELAY = 0.4  # Seconds between requests

# Fields requested from the Places API; each becomes "places.<name>" in the
# X-Goog-FieldMask header built by search_google_place().
GOOGLE_PLACE_FIELDS = [
    "id", "displayName", "formattedAddress", "addressComponents",
    "location", "types", "businessStatus", "internationalPhoneNumber",
    "nationalPhoneNumber", "regularOpeningHours", "websiteUri",
    "rating", "userRatingCount", "googleMapsUri", "primaryType",
    "shortFormattedAddress", "editorialSummary",
]
|
|
|
|
|
|
# =============================================================================
|
|
# Data Classes
|
|
# =============================================================================
|
|
|
|
@dataclass
class EnrichmentStats:
    """Mutable counters accumulated over a single enrichment run."""

    # File discovery / skipping
    total_files: int = 0
    already_enriched: int = 0
    # Successful matches, broken down by how the match was made
    website_matches: int = 0
    isil_matches: int = 0
    name_matches: int = 0
    # Failures and no-ops
    not_found: int = 0
    skipped: int = 0
    errors: int = 0

    @property
    def total_enriched(self) -> int:
        """Sum of all successful matches, regardless of match method."""
        return sum((self.website_matches, self.isil_matches, self.name_matches))

    def to_dict(self) -> Dict[str, int]:
        """Return the counters as a plain dict (for JSON serialization)."""
        return asdict(self)
|
|
|
|
|
|
# =============================================================================
|
|
# Name Normalization and Matching
|
|
# =============================================================================
|
|
|
|
def normalize_name(name: str) -> str:
    """Reduce an institution name to a comparable lowercase core.

    Strips parenthetical remarks, common Dutch organisational prefixes and
    suffixes, trailing city names, 'museum' words, articles and punctuation
    so that fuzzy comparison focuses on the distinctive part of the name.
    """
    if not name:
        return ""

    text = name.lower()

    cities = (
        "amsterdam|rotterdam|den haag|utrecht|eindhoven|groningen|tilburg|"
        "almere|breda|nijmegen|enschede|haarlem|arnhem|zaanstad|amersfoort|"
        "apeldoorn|hoofddorp|maastricht|leiden|dordrecht|zoetermeer|zwolle|"
        "deventer|delft|alkmaar|heerlen|venlo|leeuwarden|hilversum"
    )

    # Each (pattern, replacement) pair is applied exactly once, in order.
    pipeline = [
        (r'\s*\([^)]*\)', ''),                # parentheticals, e.g. "(incl. Kunsthal)"
        (r'^stichting\s+', ''),               # organisational prefixes (start only)
        (r'^vereniging\s+', ''),
        (r'^het\s+', ''),
        (r'^de\s+', ''),
        (r'^nationaal\s+', ''),
        (r'^gemeentelijk\s+', ''),
        (r'^openbare\s+bibliotheek\s+', ''),
        (r'\s+nederland$', ''),               # common suffixes
        (r'\s+stichting$', ''),
        (rf'\s+({cities})$', ''),             # trailing city: "Rijksmuseum Amsterdam" -> "Rijksmuseum"
        # Compound museum words keep their distinctive stem:
        # "molenmuseum" -> "molen", and likewise "rijksmuseum" -> "rijks".
        (r'(\w{3,})museum\b', r'\1'),
        (r'\bmuseum\s+', ''),                 # "museum xyz" -> "xyz"
        (r'\s+museum$', ''),                  # "xyz museum" -> "xyz"
        (r'\b(het|de)\b', ' '),               # mid-name articles
        (r'[^\w\s]', ' '),                    # punctuation -> space
    ]
    for pattern, replacement in pipeline:
        text = re.sub(pattern, replacement, text)

    # Collapse runs of whitespace introduced by the substitutions.
    return ' '.join(text.split()).strip()
|
|
|
|
|
|
def similarity_score(name1: str, name2: str) -> float:
    """Score how similar two institution names are, on a 0-1 scale."""
    a = normalize_name(name1)
    b = normalize_name(name2)
    if not a or not b:
        return 0.0

    score = SequenceMatcher(None, a, b).ratio()

    # Containment bonus: helps match "molen valk" with "valk", or
    # "naturalis" with "naturalis biodiversity center". Scaled by how
    # much of the longer name the shorter one covers.
    if len(a) > len(b):
        a, b = b, a
    if a and a in b:
        coverage = len(a) / len(b)
        score = max(score, 0.65 + 0.35 * coverage)

    return score
|
|
|
|
|
|
# =============================================================================
|
|
# Wikidata Functions
|
|
# =============================================================================
|
|
|
|
def get_wikidata_query(query_type: str) -> str:
    """Build the SPARQL query selecting Dutch institutions of one category.

    Supported categories: "museum", "library", "archive", "heritage".
    Any unknown value falls back to the combined "heritage" filter
    (museum OR library OR archive).
    """
    filters = {
        "museum": "?item wdt:P31/wdt:P279* wd:Q33506 .",
        "library": "?item wdt:P31/wdt:P279* wd:Q7075 .",
        "archive": "?item wdt:P31/wdt:P279* wd:Q166118 .",
        "heritage": """
        { ?item wdt:P31/wdt:P279* wd:Q33506 . } # museum
        UNION { ?item wdt:P31/wdt:P279* wd:Q7075 . } # library
        UNION { ?item wdt:P31/wdt:P279* wd:Q166118 . } # archive
        """,
    }
    if query_type not in filters:
        query_type = "heritage"
    selected_filter = filters[query_type]

    # All identifier/coordinate fields are OPTIONAL so items without them
    # still appear in the result set.
    return f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      {selected_filter}
      ?item wdt:P17 wd:Q55 . # country: Netherlands

      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en" . }}
    }}
    LIMIT 3000
    """
|
|
|
|
|
|
def query_wikidata_institutions(client: httpx.Client, query_type: str) -> Dict[str, Dict[str, Any]]:
    """Query Wikidata for Dutch institutions of the given type.

    Runs the SPARQL query built by get_wikidata_query() and returns a
    mapping of Q-number -> record with keys: qid, name, description,
    identifiers, and optionally isil, founding_date, latitude/longitude.

    Best effort: returns an empty dict on any request or parse error.
    """
    query = get_wikidata_query(query_type)

    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": USER_AGENT,
    }

    try:
        logger.info(f"Querying Wikidata for Dutch {query_type} institutions...")
        response = client.get(
            SPARQL_URL,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=120.0
        )
        response.raise_for_status()
        data = response.json()

        results = {}
        for binding in data.get("results", {}).get("bindings", []):
            item_uri = binding.get("item", {}).get("value", "")
            # The item value is a full entity URI; the Q-number is its last segment.
            qid = item_uri.split("/")[-1] if item_uri else None

            # Keep only well-formed Q-numbers; first binding per item wins
            # (OPTIONAL clauses can fan one item out into several rows).
            if not qid or not qid.startswith("Q") or qid in results:
                continue

            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "identifiers": {}
            }

            if "isil" in binding:
                result["isil"] = binding["isil"]["value"]
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date portion of the xsd:dateTime literal.
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                # WKT point literal, e.g. "Point(4.88 52.37)" — lon before lat.
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    try:
                        lon, lat = coords_str[6:-1].split()
                        result["latitude"] = float(lat)
                        result["longitude"] = float(lon)
                    except (ValueError, IndexError):
                        pass  # malformed point: skip coordinates only

            results[qid] = result

        logger.info(f"Found {len(results)} institutions in Wikidata")
        return results

    except Exception as e:
        # Best effort: enrichment proceeds without the Wikidata dataset.
        logger.error(f"Error querying Wikidata: {e}")
        return {}
|
|
|
|
|
|
def query_wikidata_by_isil(client: httpx.Client, isil_codes: List[str]) -> Dict[str, Dict[str, Any]]:
    """Query Wikidata for institutions by their ISIL codes (P791).

    Codes are queried in batches of 100 to keep the VALUES clause small.
    (The previous implementation silently dropped every code past the
    first 100.) A failing batch is logged and skipped, so partial results
    are still returned.

    Args:
        client: Shared httpx client.
        isil_codes: ISIL identifiers to look up.

    Returns:
        Mapping of ISIL code -> record with keys qid, name, description,
        isil, identifiers, and optionally founding_date, latitude/longitude.
    """
    results: Dict[str, Dict[str, Any]] = {}
    if not isil_codes:
        return results

    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": USER_AGENT,
    }

    BATCH_SIZE = 100
    for start in range(0, len(isil_codes), BATCH_SIZE):
        batch = isil_codes[start:start + BATCH_SIZE]
        isil_values = " ".join(f'"{code}"' for code in batch)

        query = f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
        WHERE {{
          VALUES ?isil {{ {isil_values} }}
          ?item wdt:P791 ?isil .

          OPTIONAL {{ ?item wdt:P214 ?viaf . }}
          OPTIONAL {{ ?item wdt:P625 ?coords . }}
          OPTIONAL {{ ?item wdt:P856 ?website . }}
          OPTIONAL {{ ?item wdt:P571 ?inception . }}

          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en" . }}
        }}
        """

        try:
            response = client.get(
                SPARQL_URL,
                params={"query": query, "format": "json"},
                headers=headers,
                timeout=60.0
            )
            response.raise_for_status()
            data = response.json()
        except Exception as e:
            # Best effort: one failing batch must not abort the rest.
            logger.error(f"Error querying Wikidata by ISIL: {e}")
            continue

        for binding in data.get("results", {}).get("bindings", []):
            isil = binding.get("isil", {}).get("value", "")
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None

            if not isil or not qid:
                continue

            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "isil": isil,
                "identifiers": {}
            }

            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date portion of the xsd:dateTime literal.
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                # WKT point literal, e.g. "Point(4.88 52.37)" — lon before lat.
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    try:
                        lon, lat = coords_str[6:-1].split()
                        result["latitude"] = float(lat)
                        result["longitude"] = float(lon)
                    except (ValueError, IndexError):
                        pass  # malformed point: skip coordinates only

            results[isil] = result

        # Be polite between batches.
        if start + BATCH_SIZE < len(isil_codes):
            time.sleep(REQUEST_DELAY)

    return results
|
|
|
|
|
|
def find_wikidata_match(
    name: str,
    city: Optional[str],
    province: Optional[str],
    institutions: Dict[str, Dict[str, Any]],
    threshold: float = 0.70
) -> Optional[Dict[str, Any]]:
    """Pick the institution whose name (plus location hints) best matches.

    Returns a copy of the winning record with a "match_score" key added,
    or None when nothing reaches *threshold*.
    """
    top_score = 0.0
    top_record = None

    for record in institutions.values():
        candidate_name = record.get("name", "")
        if not candidate_name:
            continue

        score = similarity_score(name, candidate_name)

        # Small boost when the entry's city/province appears in the
        # candidate's name or description (city counts more).
        haystack = (candidate_name + " " + record.get("description", "")).lower()
        boost = 0.0
        if city and city.lower() in haystack:
            boost = 0.12
        if province and province.lower() in haystack:
            boost = max(boost, 0.08)

        score += boost
        if score > top_score:
            top_score = score
            top_record = record.copy()

    if top_record is None or top_score < threshold:
        return None

    top_record["match_score"] = top_score
    return top_record
|
|
|
|
|
|
def create_wikidata_enrichment(wikidata: Dict[str, Any], match_method: str) -> Dict[str, Any]:
    """Convert a matched Wikidata record into the YAML enrichment section.

    Optional fields (coordinates, inception date, identifiers, ISIL code,
    match score) are emitted only when present in *wikidata*.
    """
    section = {
        "wikidata_entity_id": wikidata["qid"],
        "wikidata_label": wikidata.get("name"),
        "wikidata_description": wikidata.get("description"),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "match_method": match_method,
    }

    # Coordinates are only meaningful as a pair.
    if "latitude" in wikidata and "longitude" in wikidata:
        section["wikidata_coordinates"] = {
            "latitude": wikidata["latitude"],
            "longitude": wikidata["longitude"],
        }

    if "founding_date" in wikidata:
        section["wikidata_inception"] = wikidata["founding_date"]

    identifiers = wikidata.get("identifiers")
    if identifiers:
        section["wikidata_identifiers"] = identifiers

    if "isil" in wikidata:
        section["wikidata_isil"] = wikidata["isil"]

    if "match_score" in wikidata:
        section["match_confidence"] = round(wikidata["match_score"], 3)

    return section
|
|
|
|
|
|
# =============================================================================
|
|
# Google Maps Functions
|
|
# =============================================================================
|
|
|
|
def search_google_place(
    query: str,
    client: httpx.Client,
    api_key: str,
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Look up a single place via the Google Places text-search API.

    Returns the first (and only requested) result, or None when nothing is
    found or the request fails. *location_bias* is an optional (lat, lng)
    pair used as the center of a 50 km bias circle.
    """
    field_mask = ",".join(f"places.{field}" for field in GOOGLE_PLACE_FIELDS)
    request_headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": api_key,
        "X-Goog-FieldMask": field_mask,
    }

    payload: Dict[str, Any] = {
        "textQuery": query,
        "languageCode": "nl",
        "regionCode": "NL",
        "maxResultCount": 1,
    }
    if location_bias is not None:
        payload["locationBias"] = {
            "circle": {
                "center": {
                    "latitude": location_bias[0],
                    "longitude": location_bias[1],
                },
                "radius": 50000.0,
            }
        }

    try:
        response = client.post(TEXT_SEARCH_URL, headers=request_headers, json=payload)
        response.raise_for_status()
        candidates = response.json().get("places", [])
        return candidates[0] if candidates else None
    except httpx.HTTPStatusError as e:
        # Surface Google's own error message when the body is parseable JSON.
        try:
            error_body = e.response.json()
        except Exception:
            error_body = {}
        error_msg = error_body.get("error", {}).get("message", str(e))
        logger.error(f"Google API error: {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching Google: {e}")
        return None
|
|
|
|
|
|
def create_google_maps_enrichment(place: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a Google Places result into the YAML enrichment section.

    Optional fields are emitted only when present in *place*. Numeric
    fields (coordinates, rating, rating count) are tested against None so
    legitimate zero values survive.
    """
    location = place.get("location", {})
    display_name = place.get("displayName", {})

    # Trim the opening-hours payload to the two fields we keep.
    opening_hours = place.get("regularOpeningHours")
    if opening_hours:
        opening_hours = {
            "periods": opening_hours.get("periods"),
            "weekday_text": opening_hours.get("weekdayDescriptions"),
        }

    # Re-shape address components into the classic long/short/types form.
    address_components = place.get("addressComponents")
    if address_components:
        address_components = [
            {
                "long_name": c.get("longText"),
                "short_name": c.get("shortText"),
                "types": c.get("types", []),
            }
            for c in address_components
        ]

    enrichment = {
        "place_id": place.get("id", ""),
        "name": display_name.get("text", ""),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_status": "OK",
    }

    # BUG FIX: the previous truthiness check dropped coordinates whenever
    # either value was exactly 0.0; compare against None instead.
    if location.get("latitude") is not None and location.get("longitude") is not None:
        enrichment["coordinates"] = {
            "latitude": location["latitude"],
            "longitude": location["longitude"],
        }

    if place.get("formattedAddress"):
        enrichment["formatted_address"] = place["formattedAddress"]
    if place.get("shortFormattedAddress"):
        enrichment["short_address"] = place["shortFormattedAddress"]
    if address_components:
        enrichment["address_components"] = address_components

    if place.get("nationalPhoneNumber"):
        enrichment["phone_local"] = place["nationalPhoneNumber"]
    if place.get("internationalPhoneNumber"):
        enrichment["phone_international"] = place["internationalPhoneNumber"]
    if place.get("websiteUri"):
        enrichment["website"] = place["websiteUri"]

    if place.get("types"):
        enrichment["google_place_types"] = place["types"]
    if place.get("primaryType"):
        enrichment["primary_type"] = place["primaryType"]
    if place.get("businessStatus"):
        enrichment["business_status"] = place["businessStatus"]

    if opening_hours:
        enrichment["opening_hours"] = opening_hours

    # rating / userRatingCount may legitimately be 0, so check for None.
    if place.get("rating") is not None:
        enrichment["rating"] = place["rating"]
    if place.get("userRatingCount") is not None:
        enrichment["total_ratings"] = place["userRatingCount"]

    if place.get("editorialSummary"):
        enrichment["editorial_summary"] = place["editorialSummary"].get("text")

    if place.get("googleMapsUri"):
        enrichment["google_maps_url"] = place["googleMapsUri"]

    return enrichment
|
|
|
|
|
|
# =============================================================================
|
|
# Entry Processing
|
|
# =============================================================================
|
|
|
|
def get_entry_info(entry: Dict[str, Any]) -> Dict[str, Any]:
    """Pull the fields relevant for matching out of a loaded entry.

    Values come from the original record first, falling back to earlier
    enrichment sections; anything missing becomes "".
    """
    original = entry.get("original_entry", {})
    museum_reg = entry.get("museum_register_enrichment", {})
    kb = entry.get("kb_enrichment", {})

    def first(*candidates: Any) -> Any:
        """Return the first truthy candidate, or ""."""
        for value in candidates:
            if value:
                return value
        return ""

    return {
        "name": first(original.get("organisatie"),
                      museum_reg.get("museum_name"),
                      kb.get("name")),
        "website": first(original.get("webadres_organisatie"),
                         museum_reg.get("website_url")),
        "city": first(original.get("plaatsnaam_bezoekadres"),
                      kb.get("city")),
        "province": first(original.get("provincie"),
                          museum_reg.get("province")),
        "street": first(original.get("straat_en_huisnummer_bezoekadres")),
        "isil_na": first(original.get("isil-code_na")),
        "isil_kb": first(original.get("isil_code_kb"), kb.get("isil_code")),
        "type": first(original.get("type_organisatie")),
    }
|
|
|
|
|
|
def build_google_search_query(info: Dict[str, Any], institution_type: Optional[str]) -> str:
    """Compose the free-text query sent to Google Places.

    Adds a Dutch type hint ("Bibliotheek" / "museum") when the name does
    not already contain one, then appends street, city and country.
    """
    name = info["name"]
    lowered = name.lower()

    # Type hint makes ambiguous names far easier for Places to resolve.
    if institution_type == "library" and "bibliotheek" not in lowered:
        name = f"Bibliotheek {name}"
    elif institution_type == "museum" and "museum" not in lowered:
        name = f"{name} museum"

    components = [name, info["street"], info["city"], "Netherlands"]
    return ", ".join(part for part in components if part)
|
|
|
|
|
|
def get_entry_files(
    entries_dir: Path,
    profile: Dict[str, Any],
    entry_range: Optional[Tuple[int, int]] = None,
) -> List[Path]:
    """Select the entry YAML files a profile should process.

    Selection logic:
      * profile["file_pattern"] (when set and not the catch-all "*.yaml")
        is used as a glob;
      * otherwise numbered files ("NNNN_name.yaml") are filtered by range.
        An explicit *entry_range* argument (the CLI --start/--end override)
        takes precedence over profile["entry_range"]. The previous code
        had this precedence backwards, so --start/--end were silently
        ignored for profiles with a built-in range.
      * profile["has_field"] further restricts the list to entries whose
        original_entry has that field set.
    """
    file_pattern = profile.get("file_pattern")
    # CLI-provided range overrides the profile default.
    effective_range = entry_range or profile.get("entry_range")
    has_field = profile.get("has_field")

    if file_pattern and file_pattern != "*.yaml":
        # Use specific file pattern
        yaml_files = sorted(entries_dir.glob(file_pattern))
    else:
        # Use entry range over numbered files
        yaml_files = []
        for f in sorted(entries_dir.glob("*.yaml")):
            # Files starting with "_" are metadata, not entries.
            if f.name.startswith("_"):
                continue

            match = re.match(r'^(\d+)_', f.name)
            if not match:
                continue

            entry_num = int(match.group(1))

            if effective_range:
                start, end = effective_range
                if entry_num < start or entry_num > end:
                    continue

            yaml_files.append(f)

    # Filter by has_field if specified
    if has_field:
        filtered = []
        for f in yaml_files:
            try:
                with open(f, 'r', encoding='utf-8') as fh:
                    entry = yaml.safe_load(fh)
                if entry and entry.get("original_entry", {}).get(has_field):
                    filtered.append(f)
            except Exception:
                pass  # unreadable entries are simply excluded
        yaml_files = filtered

    return yaml_files
|
|
|
|
|
|
def process_entries(
    entries_dir: Path,
    profile: Dict[str, Any],
    source: str,
    dry_run: bool = False,
    limit: Optional[int] = None,
    entry_range: Optional[Tuple[int, int]] = None,
    force: bool = False,
    google_api_key: Optional[str] = None,
) -> EnrichmentStats:
    """Enrich the selected entry files and save them back in place.

    Args:
        entries_dir: Directory containing per-entry YAML files.
        profile: One of the PROFILES dicts (possibly with overrides).
        source: "wikidata", "google_maps" or "both".
        dry_run: When True, nothing is written to disk.
        limit: Optional cap on the number of files processed.
        entry_range: Optional (start, end) numeric filename range.
        force: Re-enrich entries that already carry enrichment sections.
        google_api_key: Required whenever source includes Google Maps.

    Returns:
        EnrichmentStats holding the counters for this run.
    """
    stats = EnrichmentStats()

    # Get files to process
    yaml_files = get_entry_files(entries_dir, profile, entry_range)
    stats.total_files = len(yaml_files)

    if limit:
        yaml_files = yaml_files[:limit]

    logger.info(f"Found {stats.total_files} entry files matching profile")
    logger.info(f"Processing {len(yaml_files)} files (limit: {limit or 'none'})")

    # Determine which enrichments to run
    do_wikidata = source in ("wikidata", "both")
    do_google = source in ("google_maps", "both")

    if do_google and not google_api_key:
        logger.error("GOOGLE_PLACES_TOKEN required for Google Maps enrichment")
        return stats

    # First pass: load every candidate file, skip already-enriched entries
    # (unless force), and collect ISIL codes for one batched Wikidata lookup.
    entries_data = []
    isil_codes = []

    for yaml_file in yaml_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            if not entry:
                stats.skipped += 1
                continue

            # Check existing enrichment
            has_wikidata = bool(entry.get("wikidata_enrichment"))
            has_google = bool(entry.get("google_maps_enrichment"))

            if not force:
                # Skip only when every requested source is already present.
                if do_wikidata and has_wikidata and do_google and has_google:
                    stats.already_enriched += 1
                    continue
                if do_wikidata and not do_google and has_wikidata:
                    stats.already_enriched += 1
                    continue
                if do_google and not do_wikidata and has_google:
                    stats.already_enriched += 1
                    continue

            info = get_entry_info(entry)
            if not info["name"]:
                # Without a name there is nothing to match against.
                stats.skipped += 1
                continue

            # Collect ISIL codes for batch query
            if info["isil_na"]:
                isil_codes.append(info["isil_na"])
            if info["isil_kb"]:
                isil_codes.append(info["isil_kb"])

            entries_data.append({
                "file": yaml_file,
                "entry": entry,
                "info": info,
                # Per-entry flags so "both" runs can fill in just the
                # missing source for partially-enriched entries.
                "needs_wikidata": do_wikidata and (force or not has_wikidata),
                "needs_google": do_google and (force or not has_google),
            })

        except Exception as e:
            logger.error(f"Error loading {yaml_file.name}: {e}")
            stats.errors += 1

    if not entries_data:
        logger.info("No entries to process")
        return stats

    logger.info(f"Collected {len(entries_data)} entries for enrichment")

    # Initialize data sources
    wikidata_institutions = {}
    isil_results = {}

    with httpx.Client(timeout=120.0) as client:
        if do_wikidata:
            # One bulk SPARQL query for all Dutch institutions of the
            # profile's type; used below for fuzzy name matching.
            query_type = profile.get("wikidata_query_type", "heritage")
            wikidata_institutions = query_wikidata_institutions(client, query_type)

            time.sleep(REQUEST_DELAY)

            # Also query by ISIL codes (exact-identifier matching).
            if isil_codes:
                logger.info(f"Querying Wikidata for {len(isil_codes)} ISIL codes...")
                isil_results = query_wikidata_by_isil(client, list(set(isil_codes)))
                logger.info(f"Found {len(isil_results)} by ISIL")
                time.sleep(REQUEST_DELAY)

        # Second pass: enrich each entry and save it.
        for entry_data in entries_data:
            yaml_file = entry_data["file"]
            entry = entry_data["entry"]
            info = entry_data["info"]
            modified = False

            logger.info(f"\nProcessing: {info['name'][:60]}")

            # Wikidata enrichment
            if entry_data["needs_wikidata"]:
                wikidata_match: Optional[Dict[str, Any]] = None
                match_method: str = "unknown"

                # Try ISIL match first: an exact identifier beats fuzzy names.
                for isil in [info["isil_na"], info["isil_kb"]]:
                    if isil and isil in isil_results:
                        wikidata_match = isil_results[isil]
                        match_method = "isil_code_match"
                        stats.isil_matches += 1
                        logger.info(f" -> ISIL match: {wikidata_match['name']} ({wikidata_match['qid']})")
                        break

                # Fall back to fuzzy name matching against the bulk query.
                if not wikidata_match:
                    wikidata_match = find_wikidata_match(
                        info["name"], info["city"], info["province"],
                        wikidata_institutions, threshold=0.75
                    )
                    if wikidata_match:
                        match_method = "fuzzy_name_match"
                        stats.name_matches += 1
                        score = wikidata_match.get("match_score", 0)
                        logger.info(f" -> Name match: {wikidata_match['name']} ({wikidata_match['qid']}) [{score:.2f}]")

                if wikidata_match:
                    entry["wikidata_enrichment"] = create_wikidata_enrichment(wikidata_match, match_method)
                    modified = True
                else:
                    entry["wikidata_enrichment_status"] = "NOT_FOUND"
                    entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                    # Count not_found only when Google won't also run, so a
                    # single entry is not double counted.
                    if entry_data["needs_wikidata"] and not entry_data["needs_google"]:
                        stats.not_found += 1
                    logger.info(" -> No Wikidata match")

            # Google Maps enrichment
            if entry_data["needs_google"]:
                # google_api_key is guaranteed non-None here (validated
                # before the first pass above).
                assert google_api_key is not None

                institution_type = profile.get("institution_type")
                query = build_google_search_query(info, institution_type)

                # Bias results toward the geographic center of NL.
                NL_CENTER = (52.1326, 5.2913)
                place = search_google_place(query, client, google_api_key, NL_CENTER)

                if place:
                    entry["google_maps_enrichment"] = create_google_maps_enrichment(place)
                    entry["google_maps_status"] = "SUCCESS"
                    entry["google_maps_search_query"] = query
                    modified = True

                    gm_name = place.get("displayName", {}).get("text", "")
                    rating = place.get("rating", "N/A")
                    logger.info(f" -> Google: {gm_name} ({rating}★)")
                else:
                    entry["google_maps_status"] = "NOT_FOUND"
                    entry["google_maps_search_query"] = query
                    entry["google_maps_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                    stats.not_found += 1
                    logger.info(" -> No Google match")

                # Rate-limit successive Places API calls.
                time.sleep(REQUEST_DELAY)

            # Save entry
            if modified and not dry_run:
                try:
                    with open(yaml_file, 'w', encoding='utf-8') as f:
                        # sort_keys=False keeps the original field order.
                        yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
                except Exception as e:
                    logger.error(f"Error saving {yaml_file.name}: {e}")
                    stats.errors += 1

    return stats
|
|
|
|
|
|
# =============================================================================
|
|
# Main Entry Point
|
|
# =============================================================================
|
|
|
|
def main():
    """CLI entry point: parse arguments, run enrichment, report stats.

    Returns a process exit code (0 on success, 1 on configuration errors).
    """
    parser = argparse.ArgumentParser(
        description="Unified NDE entry enrichment with Wikidata and Google Maps",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Enrich Museum Register entries with Wikidata
  %(prog)s --profile museum_register --source wikidata

  # Enrich KB libraries with Google Maps
  %(prog)s --profile kb_isil --source google_maps

  # Enrich custom range with both sources
  %(prog)s --start 1515 --end 1600 --source both

  # Dry run to see what would be done
  %(prog)s --profile museum_register --source both --dry-run
"""
    )

    parser.add_argument(
        "--profile",
        choices=list(PROFILES.keys()),
        default="all",
        help="Entry profile to process (default: all)"
    )
    parser.add_argument(
        "--source",
        choices=["wikidata", "google_maps", "both"],
        default="both",
        help="Enrichment source (default: both)"
    )
    parser.add_argument(
        "--start",
        type=int,
        help="Start entry number (overrides profile range)"
    )
    parser.add_argument(
        "--end",
        type=int,
        help="End entry number (overrides profile range)"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of entries to process"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-enrich entries that already have data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done"
    )
    parser.add_argument(
        "--entries-dir",
        type=Path,
        # Default: <repo>/data/nde/enriched/entries relative to this script.
        default=Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries",
        help="Path to entries directory"
    )

    args = parser.parse_args()

    # Get profile (copy so overrides never mutate the module-level dict).
    profile = PROFILES[args.profile].copy()
    logger.info(f"Profile: {args.profile} - {profile['description']}")

    # Override range if specified; missing bounds default to a wide-open
    # range (0 .. 99999).
    entry_range = None
    if args.start is not None or args.end is not None:
        start = args.start or 0
        end = args.end or 99999
        entry_range = (start, end)
        logger.info(f"Entry range: {start} to {end}")

    if args.dry_run:
        logger.info("DRY RUN MODE - no changes will be saved")

    if not args.entries_dir.exists():
        logger.error(f"Entries directory not found: {args.entries_dir}")
        return 1

    # Get Google API key if needed
    google_api_key = None
    if args.source in ("google_maps", "both"):
        google_api_key = os.getenv("GOOGLE_PLACES_TOKEN", "")
        if not google_api_key:
            logger.error("GOOGLE_PLACES_TOKEN environment variable required for Google Maps enrichment")
            return 1

    # Process entries
    stats = process_entries(
        entries_dir=args.entries_dir,
        profile=profile,
        source=args.source,
        dry_run=args.dry_run,
        limit=args.limit,
        entry_range=entry_range,
        force=args.force,
        google_api_key=google_api_key,
    )

    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total files: {stats.total_files}")
    logger.info(f"Already enriched: {stats.already_enriched}")
    logger.info(f"ISIL matches: {stats.isil_matches}")
    logger.info(f"Name matches: {stats.name_matches}")
    logger.info(f"Not found: {stats.not_found}")
    logger.info(f"Skipped: {stats.skipped}")
    logger.info(f"Errors: {stats.errors}")
    logger.info(f"Total enriched: {stats.total_enriched}")

    # Save stats to a timestamped JSON file next to the entries directory.
    if not args.dry_run:
        stats_file = args.entries_dir.parent / f"enrichment_stats_{args.profile}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(stats_file, 'w') as f:
            json.dump({
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "profile": args.profile,
                "source": args.source,
                "dry_run": args.dry_run,
                "limit": args.limit,
                "entry_range": list(entry_range) if entry_range else None,
                **stats.to_dict()
            }, f, indent=2)
        logger.info(f"Stats saved to: {stats_file}")

    return 0
|
|
|
# Script entry point: exit with main()'s return code.
if __name__ == "__main__":
    sys.exit(main())
|