#!/usr/bin/env python3
"""
Unified NDE Entry Enrichment Script

This script provides a flexible way to enrich NDE entries with:
- Wikidata data (Q-numbers, coordinates, founding dates, identifiers)
- Google Maps data (place IDs, coordinates, ratings, reviews, opening hours)

Supports different entry types through configuration profiles:
- museum_register: Museum Register Nederland entries (1515-1655)
- kb_isil: KB Netherlands library entries
- all: All entries without enrichment
- custom: Custom entry range or pattern

Usage:
    # Enrich Museum Register entries with Wikidata
    python scripts/enrich_nde_entries.py --profile museum_register --source wikidata

    # Enrich KB libraries with Google Maps
    python scripts/enrich_nde_entries.py --profile kb_isil --source google_maps

    # Enrich specific range with both sources
    python scripts/enrich_nde_entries.py --start 1515 --end 1600 --source both

    # Enrich all entries missing Wikidata
    python scripts/enrich_nde_entries.py --profile all --source wikidata --skip-enriched

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required for Google Maps enrichment
"""

import os
import sys
import time
import json
import yaml
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any, Tuple, Callable
from dataclasses import dataclass, field, asdict
from difflib import SequenceMatcher
import logging
import argparse

# httpx is a hard dependency: bail out with install instructions if missing.
try:
    import httpx
except ImportError:
    print("httpx is required. Install with: pip install httpx")
    sys.exit(1)

# Load a local .env file when python-dotenv is available (e.g. for
# GOOGLE_PLACES_TOKEN); silently skip otherwise.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # dotenv is optional

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# =============================================================================
# Configuration Profiles
# =============================================================================

# Each profile selects which entry files to process and which Wikidata
# institution-type filter to use. Keys consumed by get_entry_files() /
# process_entries(): entry_range, file_pattern, has_field (optional),
# institution_type, wikidata_query_type.
PROFILES = {
    "museum_register": {
        "description": "Museum Register Nederland entries",
        "entry_range": (1515, 1655),
        "file_pattern": None,
        "institution_type": "museum",
        "wikidata_query_type": "museum",
    },
    "kb_isil": {
        "description": "KB Netherlands library entries",
        "entry_range": None,
        "file_pattern": "*_kb_isil.yaml",
        "institution_type": "library",
        "wikidata_query_type": "library",
    },
    "na_isil": {
        "description": "NA Netherlands archive entries",
        "entry_range": None,
        "file_pattern": None,
        # Selected by presence of this field in original_entry rather than by
        # filename pattern or number range.
        "has_field": "isil-code_na",
        "institution_type": "archive",
        "wikidata_query_type": "archive",
    },
    "all": {
        "description": "All entries",
        "entry_range": None,
        "file_pattern": "*.yaml",
        "institution_type": None,
        "wikidata_query_type": "heritage",
    },
}

# =============================================================================
# API Configuration
# =============================================================================

SPARQL_URL = "https://query.wikidata.org/sparql"
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
USER_AGENT = "GLAM-NDE-Enricher/1.0 (https://github.com/sst/glam)"
REQUEST_DELAY = 0.4  # Seconds between requests

# Fields requested from the Google Places API (v1 "searchText") via the
# X-Goog-FieldMask header; each is prefixed with "places." when sent.
GOOGLE_PLACE_FIELDS = [
    "id",
    "displayName",
    "formattedAddress",
    "addressComponents",
    "location",
    "types",
    "businessStatus",
    "internationalPhoneNumber",
    "nationalPhoneNumber",
    "regularOpeningHours",
    "websiteUri",
    "rating",
    "userRatingCount",
    "googleMapsUri",
    "primaryType",
    "shortFormattedAddress",
    "editorialSummary",
]

# =============================================================================
# Data Classes
# =============================================================================


@dataclass
class EnrichmentStats:
    """Track enrichment statistics."""
    total_files: int = 0        # files matching the profile (before --limit)
    already_enriched: int = 0   # skipped because enrichment already present
    website_matches: int = 0    # NOTE(review): never incremented in this file
    isil_matches: int = 0       # Wikidata matches found via ISIL code
    name_matches: int = 0       # Wikidata matches found via fuzzy name match
    not_found: int = 0          # entries with no match from the active source
    skipped: int = 0            # empty files or entries without a name
    errors: int = 0             # load/save failures

    @property
    def total_enriched(self) -> int:
        return self.website_matches + self.isil_matches + self.name_matches

    def to_dict(self) -> Dict[str, int]:
        return asdict(self)


# =============================================================================
# Name Normalization and Matching
# =============================================================================


def normalize_name(name: str) -> str:
    """Normalize institution name for fuzzy matching."""
    if not name:
        return ""
    name = name.lower()
    # Remove parenthetical content (e.g., "(incl. Kunsthal)")
    name = re.sub(r'\s*\([^)]*\)', '', name)
    # Remove common Dutch prefixes (at start of name only)
    prefix_patterns = [
        r'^stichting\s+',
        r'^vereniging\s+',
        r'^het\s+',
        r'^de\s+',
        r'^nationaal\s+',
        r'^gemeentelijk\s+',
        r'^openbare\s+bibliotheek\s+',
    ]
    for pattern in prefix_patterns:
        name = re.sub(pattern, '', name)
    # Remove suffixes
    suffix_patterns = [
        r'\s+nederland$',
        r'\s+stichting$',
    ]
    for pattern in suffix_patterns:
        name = re.sub(pattern, '', name)
    # Remove location suffixes (city names after main name)
    # e.g., "Rijksmuseum Amsterdam" -> "Rijksmuseum"
    name = re.sub(r'\s+(amsterdam|rotterdam|den haag|utrecht|eindhoven|groningen|tilburg|almere|breda|nijmegen|enschede|haarlem|arnhem|zaanstad|amersfoort|apeldoorn|hoofddorp|maastricht|leiden|dordrecht|zoetermeer|zwolle|deventer|delft|alkmaar|heerlen|venlo|leeuwarden|hilversum)$', '', name)
    # Normalize compound museum words: keep core name
    # "molenmuseum" -> "molen", "scheepvaartmuseum" -> "scheepvaart"
    # But keep standalone "museum" words like "rijksmuseum"
    name = re.sub(r'(\w{3,})museum\b', r'\1', name)  # compound: keep prefix
    name = re.sub(r'\bmuseum\s+', '', name)  # "museum xyz" -> "xyz"
    name = re.sub(r'\s+museum$', '', name)  # "xyz museum" -> "xyz"
    # Remove articles that appear mid-name
    name = re.sub(r'\b(het|de)\b', ' ', name)
    # Remove punctuation and normalize whitespace
    name = re.sub(r'[^\w\s]', ' ', name)
    name = ' '.join(name.split())
    return name.strip()


def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1)."""
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)
    if not norm1 or not norm2:
        return 0.0
    # Standard sequence matching
    seq_score = SequenceMatcher(None, norm1, norm2).ratio()
    # Bonus for substring containment (one name contains the other)
    # This helps match "molen valk" with "valk" or "naturalis" with
    # "naturalis biodiversity center"
    shorter, longer = (norm1, norm2) if len(norm1) <= len(norm2) else (norm2, norm1)
    if shorter and shorter in longer:
        # Substring match bonus - scaled by how much of the longer string is matched
        containment_ratio = len(shorter) / len(longer)
        seq_score = max(seq_score, 0.65 + 0.35 * containment_ratio)
    return seq_score


# =============================================================================
# Wikidata Functions
# =============================================================================


def get_wikidata_query(query_type: str) -> str:
    """Get SPARQL query for different institution types."""
    # P31/P279*: instance-of any subclass of museum/library/archive.
    type_filters = {
        "museum": "?item wdt:P31/wdt:P279* wd:Q33506 .",
        "library": "?item wdt:P31/wdt:P279* wd:Q7075 .",
        "archive": "?item wdt:P31/wdt:P279* wd:Q166118 .",
        "heritage": """
            { ?item wdt:P31/wdt:P279* wd:Q33506 . }  # museum
            UNION { ?item wdt:P31/wdt:P279* wd:Q7075 . }  # library
            UNION { ?item wdt:P31/wdt:P279* wd:Q166118 . }  # archive
        """,
    }
    # Unknown query types fall back to the broad "heritage" union filter.
    type_filter = type_filters.get(query_type, type_filters["heritage"])
    return f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      {type_filter}
      ?item wdt:P17 wd:Q55 .  # country: Netherlands
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en" . }}
    }}
    LIMIT 3000
    """


def query_wikidata_institutions(client: httpx.Client, query_type: str) -> Dict[str, Dict[str, Any]]:
    """Query Wikidata for Dutch institutions.

    Returns a mapping of Q-id -> institution dict (name, description,
    optional isil/identifiers/founding_date/coordinates). Returns an empty
    dict on any request/parse error.
    """
    query = get_wikidata_query(query_type)
    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": USER_AGENT,
    }
    try:
        logger.info(f"Querying Wikidata for Dutch {query_type} institutions...")
        response = client.get(
            SPARQL_URL,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=120.0
        )
        response.raise_for_status()
        data = response.json()
        results = {}
        for binding in data.get("results", {}).get("bindings", []):
            item_uri = binding.get("item", {}).get("value", "")
            # Q-id is the last path segment of the entity URI.
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q") or qid in results:
                continue
            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "identifiers": {}
            }
            if "isil" in binding:
                result["isil"] = binding["isil"]["value"]
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal.
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                # WKT literal, e.g. "Point(4.88 52.37)" = (lon lat).
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    try:
                        lon, lat = coords_str[6:-1].split()
                        result["latitude"] = float(lat)
                        result["longitude"] = float(lon)
                    except (ValueError, IndexError):
                        pass
            results[qid] = result
        logger.info(f"Found {len(results)} institutions in Wikidata")
        return results
    except Exception as e:
        logger.error(f"Error querying Wikidata: {e}")
        return {}


def query_wikidata_by_isil(client: httpx.Client, isil_codes: List[str]) -> Dict[str, Dict[str, Any]]:
    """Query Wikidata for institutions by ISIL codes.

    Returns a mapping of ISIL code -> institution dict. Only the first 100
    codes are sent (batch-size limit); returns {} on error or empty input.
    """
    if not isil_codes:
        return {}
    isil_values = " ".join(f'"{code}"' for code in isil_codes[:100])  # Limit batch size
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      VALUES ?isil {{ {isil_values} }}
      ?item wdt:P791 ?isil .
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en" . }}
    }}
    """
    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": USER_AGENT,
    }
    try:
        response = client.get(
            SPARQL_URL,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=60.0
        )
        response.raise_for_status()
        data = response.json()
        results = {}
        for binding in data.get("results", {}).get("bindings", []):
            isil = binding.get("isil", {}).get("value", "")
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not isil or not qid:
                continue
            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "isil": isil,
                "identifiers": {}
            }
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    try:
                        lon, lat = coords_str[6:-1].split()
                        result["latitude"] = float(lat)
                        result["longitude"] = float(lon)
                    except (ValueError, IndexError):
                        pass
            results[isil] = result
        return results
    except Exception as e:
        logger.error(f"Error querying Wikidata by ISIL: {e}")
        return {}


def find_wikidata_match(
    name: str,
    city: Optional[str],
    province: Optional[str],
    institutions: Dict[str, Dict[str, Any]],
    threshold: float = 0.70
) -> Optional[Dict[str, Any]]:
    """Find best matching institution by name.

    Scores every candidate with similarity_score() plus a small boost when
    the entry's city/province appears in the candidate's label/description.
    Returns a copy of the best candidate (with "match_score" added) when the
    best score reaches `threshold`, else None.
    """
    best_score = 0.0
    best_match = None
    for qid, inst_data in institutions.items():
        inst_name = inst_data.get("name", "")
        if not inst_name:
            continue
        name_score = similarity_score(name, inst_name)
        # Boost for location match
        location_boost = 0.0
        search_text = (inst_name + " " + inst_data.get("description", "")).lower()
        if city and city.lower() in search_text:
            location_boost = 0.12
        if province and province.lower() in search_text:
            location_boost = max(location_boost, 0.08)
        total_score = name_score + location_boost
        if total_score > best_score:
            best_score = total_score
            best_match = inst_data.copy()
    # NOTE(review): threshold check placed after the loop (global best), per
    # the "find best" docstring — confirm against original indentation.
    if best_score >= threshold and best_match:
        best_match["match_score"] = best_score
        return best_match
    return None


def create_wikidata_enrichment(wikidata: Dict[str, Any], match_method: str) -> Dict[str, Any]:
    """Create Wikidata enrichment section."""
    enrichment = {
        "wikidata_entity_id": wikidata["qid"],
        "wikidata_label": wikidata.get("name"),
        "wikidata_description": wikidata.get("description"),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "match_method": match_method,
    }
    if "latitude" in wikidata and "longitude" in wikidata:
        enrichment["wikidata_coordinates"] = {
            "latitude": wikidata["latitude"],
            "longitude": wikidata["longitude"]
        }
    if "founding_date" in wikidata:
        enrichment["wikidata_inception"] = wikidata["founding_date"]
    if wikidata.get("identifiers"):
        enrichment["wikidata_identifiers"] = wikidata["identifiers"]
    if "isil" in wikidata:
        enrichment["wikidata_isil"] = wikidata["isil"]
    if "match_score" in wikidata:
        enrichment["match_confidence"] = round(wikidata["match_score"], 3)
    return enrichment


# =============================================================================
# Google Maps Functions
# =============================================================================


def search_google_place(
    query: str,
    client: httpx.Client,
    api_key: str,
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Search for a place using Google Places API.

    Returns the first place result (dict) or None when nothing is found or
    on any API/network error (errors are logged, not raised).
    """
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": api_key,
        "X-Goog-FieldMask": ",".join([f"places.{f}" for f in GOOGLE_PLACE_FIELDS]),
    }
    body = {
        "textQuery": query,
        "languageCode": "nl",
        "regionCode": "NL",
        "maxResultCount": 1,
    }
    if location_bias:
        # Bias results toward a 50 km circle around (lat, lng).
        lat, lng = location_bias
        body["locationBias"] = {
            "circle": {
                "center": {"latitude": lat, "longitude": lng},
                "radius": 50000.0
            }
        }
    try:
        response = client.post(TEXT_SEARCH_URL, headers=headers, json=body)
        response.raise_for_status()
        data = response.json()
        places = data.get("places", [])
        return places[0] if places else None
    except httpx.HTTPStatusError as e:
        # Try to surface the structured API error message if present.
        error_data = {}
        try:
            error_data = e.response.json()
        except Exception:
            pass
        error_msg = error_data.get("error", {}).get("message", str(e))
        logger.error(f"Google API error: {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching Google: {e}")
        return None


def create_google_maps_enrichment(place: Dict[str, Any]) -> Dict[str, Any]:
    """Create Google Maps enrichment section."""
    location = place.get("location", {})
    display_name = place.get("displayName", {})
    # Reshape API camelCase fields into the snake_case YAML schema.
    opening_hours = place.get("regularOpeningHours")
    if opening_hours:
        opening_hours = {
            "periods": opening_hours.get("periods"),
            "weekday_text": opening_hours.get("weekdayDescriptions"),
        }
    address_components = place.get("addressComponents")
    if address_components:
        address_components = [
            {
                "long_name": c.get("longText"),
                "short_name": c.get("shortText"),
                "types": c.get("types", []),
            }
            for c in address_components
        ]
    enrichment = {
        "place_id": place.get("id", ""),
        "name": display_name.get("text", ""),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_status": "OK",
    }
    if location.get("latitude") and location.get("longitude"):
        enrichment["coordinates"] = {
            "latitude": location["latitude"],
            "longitude": location["longitude"],
        }
    if place.get("formattedAddress"):
        enrichment["formatted_address"] = place["formattedAddress"]
    if place.get("shortFormattedAddress"):
        enrichment["short_address"] = place["shortFormattedAddress"]
    if address_components:
        enrichment["address_components"] = address_components
    if place.get("nationalPhoneNumber"):
        enrichment["phone_local"] = place["nationalPhoneNumber"]
    if place.get("internationalPhoneNumber"):
        enrichment["phone_international"] = place["internationalPhoneNumber"]
    if place.get("websiteUri"):
        enrichment["website"] = place["websiteUri"]
    if place.get("types"):
        enrichment["google_place_types"] = place["types"]
    if place.get("primaryType"):
        enrichment["primary_type"] = place["primaryType"]
    if place.get("businessStatus"):
        enrichment["business_status"] = place["businessStatus"]
    if opening_hours:
        enrichment["opening_hours"] = opening_hours
    # rating / userRatingCount may legitimately be 0, hence "is not None".
    if place.get("rating") is not None:
        enrichment["rating"] = place["rating"]
    if place.get("userRatingCount") is not None:
        enrichment["total_ratings"] = place["userRatingCount"]
    if place.get("editorialSummary"):
        enrichment["editorial_summary"] = place["editorialSummary"].get("text")
    if place.get("googleMapsUri"):
        enrichment["google_maps_url"] = place["googleMapsUri"]
    return enrichment


# =============================================================================
# Entry Processing
# =============================================================================


def get_entry_info(entry: Dict[str, Any]) -> Dict[str, Any]:
    """Extract key information from an entry.

    Fields are resolved with fallbacks across the original entry and any
    museum-register / KB enrichment sections already present.
    """
    original = entry.get("original_entry", {})
    mr_enrichment = entry.get("museum_register_enrichment", {})
    kb_enrichment = entry.get("kb_enrichment", {})
    return {
        "name": (
            original.get("organisatie")
            or mr_enrichment.get("museum_name")
            or kb_enrichment.get("name")
            or ""
        ),
        "website": (
            original.get("webadres_organisatie")
            or mr_enrichment.get("website_url")
            or ""
        ),
        "city": (
            original.get("plaatsnaam_bezoekadres")
            or kb_enrichment.get("city")
            or ""
        ),
        "province": (
            original.get("provincie")
            or mr_enrichment.get("province")
            or ""
        ),
        "street": original.get("straat_en_huisnummer_bezoekadres") or "",
        "isil_na": original.get("isil-code_na") or "",
        "isil_kb": original.get("isil_code_kb") or kb_enrichment.get("isil_code") or "",
        "type": original.get("type_organisatie") or "",
    }


def build_google_search_query(info: Dict[str, Any], institution_type: Optional[str]) -> str:
    """Build a search query for Google Places."""
    name = info["name"]
    city = info["city"]
    street = info["street"]
    # Add institution type hint if not in name
    if institution_type == "library" and "bibliotheek" not in name.lower():
        name = f"Bibliotheek {name}"
    elif institution_type == "museum" and "museum" not in name.lower():
        name = f"{name} museum"
    parts = [name]
    if street:
        parts.append(street)
    if city:
        parts.append(city)
    parts.append("Netherlands")
    return ", ".join(filter(None, parts))


def get_entry_files(
    entries_dir: Path,
    profile: Dict[str, Any],
    entry_range: Optional[Tuple[int, int]] = None,
) -> List[Path]:
    """Get list of entry files to process based on profile.

    Selection order: an explicit file_pattern (other than "*.yaml") wins;
    otherwise files named "<number>_*.yaml" are filtered by the profile's
    (or caller-supplied) entry range. An optional has_field key further
    restricts to entries whose original_entry contains that field.
    """
    yaml_files = []
    # Get pattern from profile or use entry range
    file_pattern = profile.get("file_pattern")
    profile_range = profile.get("entry_range") or entry_range
    has_field = profile.get("has_field")
    if file_pattern and file_pattern != "*.yaml":
        # Use specific file pattern
        yaml_files = sorted(entries_dir.glob(file_pattern))
    else:
        # Use entry range. NOTE(review): files without a leading "<number>_"
        # prefix are excluded even for the "all" profile — confirm intended.
        for f in sorted(entries_dir.glob("*.yaml")):
            if f.name.startswith("_"):
                continue
            match = re.match(r'^(\d+)_', f.name)
            if not match:
                continue
            entry_num = int(match.group(1))
            if profile_range:
                start, end = profile_range
                if entry_num < start or entry_num > end:
                    continue
            yaml_files.append(f)
    # Filter by has_field if specified
    if has_field:
        filtered = []
        for f in yaml_files:
            try:
                with open(f, 'r', encoding='utf-8') as fh:
                    entry = yaml.safe_load(fh)
                if entry and entry.get("original_entry", {}).get(has_field):
                    filtered.append(f)
            except Exception:
                pass  # unreadable files are simply excluded here
        yaml_files = filtered
    return yaml_files


def process_entries(
    entries_dir: Path,
    profile: Dict[str, Any],
    source: str,
    dry_run: bool = False,
    limit: Optional[int] = None,
    entry_range: Optional[Tuple[int, int]] = None,
    force: bool = False,
    google_api_key: Optional[str] = None,
) -> EnrichmentStats:
    """Process entries for enrichment.

    Two-phase pipeline: (1) load candidate files, skip already-enriched
    entries (unless force), collect ISIL codes; (2) run batch Wikidata
    queries, then enrich each entry (Wikidata and/or Google Maps per
    `source`) and write it back unless dry_run.
    """
    stats = EnrichmentStats()
    # Get files to process
    yaml_files = get_entry_files(entries_dir, profile, entry_range)
    stats.total_files = len(yaml_files)  # counted before --limit is applied
    if limit:
        yaml_files = yaml_files[:limit]
    logger.info(f"Found {stats.total_files} entry files matching profile")
    logger.info(f"Processing {len(yaml_files)} files (limit: {limit or 'none'})")
    # Determine which enrichments to run
    do_wikidata = source in ("wikidata", "both")
    do_google = source in ("google_maps", "both")
    if do_google and not google_api_key:
        logger.error("GOOGLE_PLACES_TOKEN required for Google Maps enrichment")
        return stats
    # Collect entry data
    entries_data = []
    isil_codes = []
    for yaml_file in yaml_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)
            if not entry:
                stats.skipped += 1
                continue
            # Check existing enrichment
            has_wikidata = bool(entry.get("wikidata_enrichment"))
            has_google = bool(entry.get("google_maps_enrichment"))
            if not force:
                # Skip only when every requested source is already present.
                if do_wikidata and has_wikidata and do_google and has_google:
                    stats.already_enriched += 1
                    continue
                if do_wikidata and not do_google and has_wikidata:
                    stats.already_enriched += 1
                    continue
                if do_google and not do_wikidata and has_google:
                    stats.already_enriched += 1
                    continue
            info = get_entry_info(entry)
            if not info["name"]:
                stats.skipped += 1
                continue
            # Collect ISIL codes for batch query
            if info["isil_na"]:
                isil_codes.append(info["isil_na"])
            if info["isil_kb"]:
                isil_codes.append(info["isil_kb"])
            entries_data.append({
                "file": yaml_file,
                "entry": entry,
                "info": info,
                "needs_wikidata": do_wikidata and (force or not has_wikidata),
                "needs_google": do_google and (force or not has_google),
            })
        except Exception as e:
            logger.error(f"Error loading {yaml_file.name}: {e}")
            stats.errors += 1
    if not entries_data:
        logger.info("No entries to process")
        return stats
    logger.info(f"Collected {len(entries_data)} entries for enrichment")
    # Initialize data sources
    wikidata_institutions = {}
    isil_results = {}
    with httpx.Client(timeout=120.0) as client:
        if do_wikidata:
            # Query Wikidata
            query_type = profile.get("wikidata_query_type", "heritage")
            wikidata_institutions = query_wikidata_institutions(client, query_type)
            time.sleep(REQUEST_DELAY)
            # Also query by ISIL codes
            if isil_codes:
                logger.info(f"Querying Wikidata for {len(isil_codes)} ISIL codes...")
                isil_results = query_wikidata_by_isil(client, list(set(isil_codes)))
                logger.info(f"Found {len(isil_results)} by ISIL")
                time.sleep(REQUEST_DELAY)
        # Process each entry
        for entry_data in entries_data:
            yaml_file = entry_data["file"]
            entry = entry_data["entry"]
            info = entry_data["info"]
            modified = False
            logger.info(f"\nProcessing: {info['name'][:60]}")
            # Wikidata enrichment
            if entry_data["needs_wikidata"]:
                wikidata_match: Optional[Dict[str, Any]] = None
                match_method: str = "unknown"
                # Try ISIL match first (more reliable than name matching)
                for isil in [info["isil_na"], info["isil_kb"]]:
                    if isil and isil in isil_results:
                        wikidata_match = isil_results[isil]
                        match_method = "isil_code_match"
                        stats.isil_matches += 1
                        logger.info(f" -> ISIL match: {wikidata_match['name']} ({wikidata_match['qid']})")
                        break
                # Try name match
                if not wikidata_match:
                    wikidata_match = find_wikidata_match(
                        info["name"], info["city"], info["province"],
                        wikidata_institutions, threshold=0.75
                    )
                    if wikidata_match:
                        match_method = "fuzzy_name_match"
                        stats.name_matches += 1
                        score = wikidata_match.get("match_score", 0)
                        logger.info(f" -> Name match: {wikidata_match['name']} ({wikidata_match['qid']}) [{score:.2f}]")
                if wikidata_match:
                    entry["wikidata_enrichment"] = create_wikidata_enrichment(wikidata_match, match_method)
                    modified = True
                else:
                    # NOTE(review): these NOT_FOUND markers do not set
                    # `modified`, so they only persist if Google enrichment
                    # also modifies the entry — confirm intended (allows
                    # retry on later runs).
                    entry["wikidata_enrichment_status"] = "NOT_FOUND"
                    entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                    # Avoid double-counting when a Google lookup follows.
                    if entry_data["needs_wikidata"] and not entry_data["needs_google"]:
                        stats.not_found += 1
                    logger.info(" -> No Wikidata match")
            # Google Maps enrichment
            if entry_data["needs_google"]:
                # google_api_key is guaranteed non-None here (validated
                # before the client loop above).
                assert google_api_key is not None
                institution_type = profile.get("institution_type")
                query = build_google_search_query(info, institution_type)
                # Geographic center of the Netherlands, used as search bias.
                NL_CENTER = (52.1326, 5.2913)
                place = search_google_place(query, client, google_api_key, NL_CENTER)
                if place:
                    entry["google_maps_enrichment"] = create_google_maps_enrichment(place)
                    entry["google_maps_status"] = "SUCCESS"
                    entry["google_maps_search_query"] = query
                    modified = True
                    gm_name = place.get("displayName", {}).get("text", "")
                    rating = place.get("rating", "N/A")
                    logger.info(f" -> Google: {gm_name} ({rating}★)")
                else:
                    # NOTE(review): as with Wikidata, NOT_FOUND markers here
                    # do not set `modified` and so may not be saved.
                    entry["google_maps_status"] = "NOT_FOUND"
                    entry["google_maps_search_query"] = query
                    entry["google_maps_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                    stats.not_found += 1
                    logger.info(" -> No Google match")
                # Throttle Places API requests.
                time.sleep(REQUEST_DELAY)
            # Save entry
            if modified and not dry_run:
                try:
                    with open(yaml_file, 'w', encoding='utf-8') as f:
                        yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
                except Exception as e:
                    logger.error(f"Error saving {yaml_file.name}: {e}")
                    stats.errors += 1
    return stats


# =============================================================================
# Main Entry Point
# =============================================================================


def main():
    """CLI entry point. Returns a process exit code (0 success, 1 error)."""
    parser = argparse.ArgumentParser(
        description="Unified NDE entry enrichment with Wikidata and Google Maps",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Enrich Museum Register entries with Wikidata
  %(prog)s --profile museum_register --source wikidata

  # Enrich KB libraries with Google Maps
  %(prog)s --profile kb_isil --source google_maps

  # Enrich custom range with both sources
  %(prog)s --start 1515 --end 1600 --source both

  # Dry run to see what would be done
  %(prog)s --profile museum_register --source both --dry-run
"""
    )
    parser.add_argument(
        "--profile",
        choices=list(PROFILES.keys()),
        default="all",
        help="Entry profile to process (default: all)"
    )
    parser.add_argument(
        "--source",
        choices=["wikidata", "google_maps", "both"],
        default="both",
        help="Enrichment source (default: both)"
    )
    parser.add_argument(
        "--start",
        type=int,
        help="Start entry number (overrides profile range)"
    )
    parser.add_argument(
        "--end",
        type=int,
        help="End entry number (overrides profile range)"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of entries to process"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-enrich entries that already have data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done"
    )
    parser.add_argument(
        "--entries-dir",
        type=Path,
        default=Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries",
        help="Path to entries directory"
    )
    args = parser.parse_args()
    # Get profile
    profile = PROFILES[args.profile].copy()
    logger.info(f"Profile: {args.profile} - {profile['description']}")
    # Override range if specified
    entry_range = None
    if args.start is not None or args.end is not None:
        # Open-ended bounds default to 0 / 99999.
        start = args.start or 0
        end = args.end or 99999
        entry_range = (start, end)
        logger.info(f"Entry range: {start} to {end}")
    if args.dry_run:
        logger.info("DRY RUN MODE - no changes will be saved")
    if not args.entries_dir.exists():
        logger.error(f"Entries directory not found: {args.entries_dir}")
        return 1
    # Get Google API key if needed
    google_api_key = None
    if args.source in ("google_maps", "both"):
        google_api_key = os.getenv("GOOGLE_PLACES_TOKEN", "")
        if not google_api_key:
            logger.error("GOOGLE_PLACES_TOKEN environment variable required for Google Maps enrichment")
            return 1
    # Process entries
    stats = process_entries(
        entries_dir=args.entries_dir,
        profile=profile,
        source=args.source,
        dry_run=args.dry_run,
        limit=args.limit,
        entry_range=entry_range,
        force=args.force,
        google_api_key=google_api_key,
    )
    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total files: {stats.total_files}")
    logger.info(f"Already enriched: {stats.already_enriched}")
    logger.info(f"ISIL matches: {stats.isil_matches}")
    logger.info(f"Name matches: {stats.name_matches}")
    logger.info(f"Not found: {stats.not_found}")
    logger.info(f"Skipped: {stats.skipped}")
    logger.info(f"Errors: {stats.errors}")
    logger.info(f"Total enriched: {stats.total_enriched}")
    # Save stats
    if not args.dry_run:
        stats_file = args.entries_dir.parent / f"enrichment_stats_{args.profile}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(stats_file, 'w') as f:
            json.dump({
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "profile": args.profile,
                "source": args.source,
                "dry_run": args.dry_run,
                "limit": args.limit,
                "entry_range": list(entry_range) if entry_range else None,
                **stats.to_dict()
            }, f, indent=2)
        logger.info(f"Stats saved to: {stats_file}")
    return 0


if __name__ == "__main__":
    sys.exit(main())