#!/usr/bin/env python3
"""
Enrich custodian files with Google Maps and Wikidata data.

This script finds custodian files missing enrichment data and adds:
- Google Maps: coordinates, place_id, address, phone, website, hours, ratings
- Wikidata: entity ID, descriptions, identifiers (VIAF, ISNI, etc.)

Usage:
    python scripts/enrich_custodian_files.py --google-maps [--dry-run] [--limit N]
    python scripts/enrich_custodian_files.py --wikidata [--dry-run] [--limit N]
    python scripts/enrich_custodian_files.py --all [--dry-run] [--limit N]

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required for Google Maps enrichment
"""

import os
import sys
import time
import argparse
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List

import yaml
import requests
import httpx
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths (script lives in scripts/, so the project root is two levels up)
PROJECT_ROOT = Path(__file__).parent.parent
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"

# API Configuration
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
WIKIDATA_API = "https://www.wikidata.org/w/api.php"

# Rate limiting (seconds slept between requests)
GOOGLE_DELAY = 0.2    # 5 requests per second
WIKIDATA_DELAY = 0.5  # 2 requests per second

# Fields to request from Places API (New); each is prefixed with "places."
# when building the X-Goog-FieldMask header.
PLACE_FIELDS = [
    "id", "displayName", "formattedAddress", "location", "types",
    "businessStatus", "internationalPhoneNumber", "nationalPhoneNumber",
    "websiteUri", "rating", "userRatingCount", "photos"
]


def _find_files_missing(marker: str) -> List[Path]:
    """Return custodian YAML files whose raw text does not contain *marker*.

    A plain substring scan is used (instead of parsing YAML) so the check
    is cheap and tolerant of files with unusual structure.
    """
    missing = []
    for filepath in sorted(CUSTODIAN_DIR.glob("*.yaml")):
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        if marker not in content:
            missing.append(filepath)
    return missing


def find_files_missing_google_maps() -> List[Path]:
    """Find custodian files without google_maps_enrichment."""
    return _find_files_missing('google_maps_enrichment:')


def find_files_missing_wikidata() -> List[Path]:
    """Find custodian files without wikidata_enrichment."""
    return _find_files_missing('wikidata_enrichment:')


def _write_yaml(filepath: Path, data: dict) -> None:
    """Write *data* back to *filepath* with the project's YAML conventions.

    allow_unicode keeps non-ASCII names readable; sort_keys=False preserves
    the original key order of the custodian record.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False)


def get_institution_name(data: dict) -> str:
    """Extract institution name from custodian data.

    Checks, in order: original_entry.name, custodian_name (which may be a
    claim dict or a plain value), then a top-level name. Returns '' when
    no name is found.
    """
    if 'original_entry' in data and 'name' in data['original_entry']:
        return data['original_entry']['name']
    if 'custodian_name' in data:
        if isinstance(data['custodian_name'], dict):
            return data['custodian_name'].get('claim_value', '')
        return str(data['custodian_name'])
    if 'name' in data:
        return data['name']
    return ''


def get_institution_location(data: dict) -> str:
    """Extract location info for the search query.

    Builds a comma-separated string from original_entry's city plus either
    its free-text location or (as a fallback) its country code mapped to a
    country name.
    """
    parts = []
    original = data.get('original_entry', {})
    if original.get('city'):
        parts.append(original['city'])
    if original.get('location'):
        parts.append(original['location'])
    elif original.get('country'):
        # Map country codes to names so the Places query reads naturally
        country_map = {
            'NL': 'Netherlands',
            'PS': 'Palestine',
            'LB': 'Lebanon',
            'BE': 'Belgium',
            'US': 'United States',
        }
        parts.append(country_map.get(original['country'], original['country']))
    return ', '.join(parts)


def search_google_places(name: str, location: str) -> Optional[Dict[str, Any]]:
    """Search Google Places API (New) text search for an institution.

    Returns the first matching place dict, or None when the token is
    missing, nothing matches, or the request fails (errors are logged).
    """
    if not GOOGLE_PLACES_TOKEN:
        logger.error("GOOGLE_PLACES_TOKEN not set")
        return None

    query = f"{name} {location}".strip()
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        "X-Goog-FieldMask": ",".join([f"places.{f}" for f in PLACE_FIELDS])
    }
    payload = {
        "textQuery": query,
        "maxResultCount": 1
    }
    try:
        response = httpx.post(TEXT_SEARCH_URL, headers=headers, json=payload,
                              timeout=30)
        response.raise_for_status()
        data = response.json()
        if data.get("places"):
            return data["places"][0]
    except Exception as e:
        # Best-effort: a single failed lookup should not abort the batch
        logger.error(f"Google Places error for '{query}': {e}")
    return None


def format_google_maps_enrichment(place: Dict[str, Any]) -> Dict[str, Any]:
    """Format a Google Places response into the enrichment structure.

    Only fields actually present in *place* are emitted, so records stay
    compact for institutions with sparse Google data.
    """
    enrichment = {
        'place_id': place.get('id', ''),
        'name': place.get('displayName', {}).get('text', ''),
        'formatted_address': place.get('formattedAddress', ''),
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'api_status': 'OK'
    }

    # Add coordinates
    if 'location' in place:
        enrichment['coordinates'] = {
            'latitude': place['location'].get('latitude'),
            'longitude': place['location'].get('longitude')
        }

    # Add phone
    if place.get('internationalPhoneNumber'):
        enrichment['phone_international'] = place['internationalPhoneNumber']
    if place.get('nationalPhoneNumber'):
        enrichment['phone_local'] = place['nationalPhoneNumber']

    # Add website
    if place.get('websiteUri'):
        enrichment['website'] = place['websiteUri']

    # Add types
    if place.get('types'):
        enrichment['google_place_types'] = place['types']

    # Add business status
    if place.get('businessStatus'):
        enrichment['business_status'] = place['businessStatus']

    # Add rating — explicit None checks so a literal 0 is still recorded
    if place.get('rating') is not None:
        enrichment['rating'] = place['rating']
    if place.get('userRatingCount') is not None:
        enrichment['user_rating_count'] = place['userRatingCount']

    # Add photo count
    if place.get('photos'):
        enrichment['photo_count'] = len(place['photos'])

    return enrichment


# Wikidata requires a descriptive User-Agent header for API access
WIKIDATA_HEADERS = {
    "User-Agent": "GLAM-Enrichment-Bot/1.0 (https://github.com/glamorga; contact@example.com)"
}


def search_wikidata(name: str, language: str = "en") -> Optional[str]:
    """Search Wikidata for an entity by name.

    Returns the best-match entity ID (e.g. 'Q42') or None on no match or
    request failure (errors are logged).
    """
    params = {
        "action": "wbsearchentities",
        "search": name,
        "language": language,
        "format": "json",
        "limit": 5,
    }
    try:
        response = requests.get(WIKIDATA_API, params=params,
                                headers=WIKIDATA_HEADERS, timeout=10)
        response.raise_for_status()
        data = response.json()
        if data.get("search"):
            return data["search"][0]["id"]
    except Exception as e:
        logger.error(f"Wikidata search error for '{name}': {e}")
    return None


def get_wikidata_entity(entity_id: str) -> Optional[Dict[str, Any]]:
    """Get full entity data (labels, descriptions, claims, sitelinks)."""
    params = {
        "action": "wbgetentities",
        "ids": entity_id,
        "languages": "en|nl|ar|de|fr",
        "props": "labels|descriptions|claims|sitelinks",
        "format": "json",
    }
    try:
        response = requests.get(WIKIDATA_API, params=params,
                                headers=WIKIDATA_HEADERS, timeout=10)
        response.raise_for_status()
        data = response.json()
        if "entities" in data and entity_id in data["entities"]:
            return data["entities"][entity_id]
    except Exception as e:
        logger.error(f"Wikidata entity error for '{entity_id}': {e}")
    return None


def format_wikidata_enrichment(entity_id: str,
                               entity: Dict[str, Any]) -> Dict[str, Any]:
    """Format a Wikidata entity into the enrichment structure.

    Picks the first available description in language-preference order,
    collects all labels, and extracts well-known identifier properties
    (VIAF, ISNI, LCNAF, GND, ISIL, official website, image).
    """
    enrichment = {
        'wikidata_entity_id': entity_id,
        'wikidata_url': f'https://www.wikidata.org/wiki/{entity_id}',
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
    }

    # Add description (first language found wins)
    descriptions = entity.get('descriptions', {})
    for lang in ['en', 'nl', 'ar', 'de', 'fr']:
        if lang in descriptions:
            enrichment['wikidata_description'] = descriptions[lang].get('value', '')
            break

    # Add labels
    labels = entity.get('labels', {})
    enrichment['labels'] = {
        lang: label.get('value', '')
        for lang, label in labels.items()
    }

    # Extract key identifiers from claims
    claims = entity.get('claims', {})
    identifiers = {}
    id_properties = {
        'P214': 'viaf',
        'P213': 'isni',
        'P244': 'lcnaf',
        'P227': 'gnd',
        'P791': 'isil',
        'P856': 'official_website',
        'P18': 'image',
    }
    for prop, name in id_properties.items():
        if prop in claims:
            # Only the first statement of each property is considered
            claim = claims[prop][0]
            if 'mainsnak' in claim and 'datavalue' in claim['mainsnak']:
                value = claim['mainsnak']['datavalue'].get('value', '')
                # Skip non-string datavalues (e.g. entity references)
                if isinstance(value, str):
                    identifiers[name] = value
    if identifiers:
        enrichment['identifiers'] = identifiers

    return enrichment


def enrich_with_google_maps(filepath: Path, dry_run: bool = False) -> bool:
    """Enrich a single file with Google Maps data.

    Returns True when a place was found and (unless dry_run) written.
    A NOT_FOUND marker is written on misses so the file is not re-searched
    by later runs.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    name = get_institution_name(data)
    location = get_institution_location(data)
    if not name:
        logger.warning(f"No name found in {filepath.name}")
        return False

    logger.info(f"Searching Google Maps: {name} ({location})")
    place = search_google_places(name, location)
    if not place:
        logger.warning(f"No Google Maps result for: {name}")
        # Add empty enrichment to mark as searched
        if not dry_run:
            data['google_maps_enrichment'] = {
                'api_status': 'NOT_FOUND',
                'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
                'search_query': f"{name} {location}".strip()
            }
            _write_yaml(filepath, data)
        return False

    enrichment = format_google_maps_enrichment(place)
    logger.info(f"  Found: {enrichment.get('name', 'Unknown')}")
    if not dry_run:
        data['google_maps_enrichment'] = enrichment
        data['enrichment_status'] = 'enriched'
        _write_yaml(filepath, data)
    return True


def enrich_with_wikidata(filepath: Path, dry_run: bool = False) -> bool:
    """Enrich a single file with Wikidata data.

    Prefers a Wikidata ID already present in original_entry; otherwise
    searches by name (retrying in Dutch for NL-prefixed files). Returns
    True when entity data was fetched and (unless dry_run) written.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    name = get_institution_name(data)
    if not name:
        logger.warning(f"No name found in {filepath.name}")
        return False

    # Check if we already have a Wikidata ID in original_entry
    existing_id = None
    if 'original_entry' in data:
        wikidata = data['original_entry'].get('wikidata', {})
        if isinstance(wikidata, dict):
            existing_id = wikidata.get('id')

    if existing_id:
        logger.info(f"Using existing Wikidata ID: {existing_id}")
        entity_id = existing_id
    else:
        logger.info(f"Searching Wikidata: {name}")
        entity_id = search_wikidata(name)
        if not entity_id:
            # Try Dutch search for NL files
            if filepath.name.startswith('NL-'):
                entity_id = search_wikidata(name, language='nl')

    if not entity_id:
        logger.warning(f"No Wikidata result for: {name}")
        if not dry_run:
            data['wikidata_enrichment'] = {
                'status': 'NOT_FOUND',
                'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
                'search_query': name
            }
            _write_yaml(filepath, data)
        return False

    entity = get_wikidata_entity(entity_id)
    if not entity:
        logger.warning(f"Could not fetch Wikidata entity: {entity_id}")
        return False

    enrichment = format_wikidata_enrichment(entity_id, entity)
    logger.info(f"  Found: {entity_id} - {enrichment.get('wikidata_description', '')[:50]}")
    if not dry_run:
        data['wikidata_enrichment'] = enrichment
        _write_yaml(filepath, data)
    return True


def main():
    parser = argparse.ArgumentParser(description='Enrich custodian files with Google Maps and Wikidata')
    parser.add_argument('--google-maps', action='store_true', help='Enrich with Google Maps')
    parser.add_argument('--wikidata', action='store_true', help='Enrich with Wikidata')
    parser.add_argument('--all', action='store_true', help='Enrich with both sources')
    parser.add_argument('--dry-run', action='store_true', help='Do not write changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of files to process')
    parser.add_argument('--country', type=str, default=None, help='Filter by country code (e.g., PS, NL)')
    args = parser.parse_args()

    if not (args.google_maps or args.wikidata or args.all):
        parser.error("Must specify --google-maps, --wikidata, or --all")

    do_google = args.google_maps or args.all
    do_wikidata = args.wikidata or args.all

    if do_google and not GOOGLE_PLACES_TOKEN:
        logger.error("GOOGLE_PLACES_TOKEN environment variable required for Google Maps enrichment")
        sys.exit(1)

    # Find files to process (country filter matches the filename prefix)
    if do_google:
        google_files = find_files_missing_google_maps()
        if args.country:
            google_files = [f for f in google_files
                            if f.name.startswith(f"{args.country}-")]
        logger.info(f"Found {len(google_files)} files missing Google Maps enrichment")
    if do_wikidata:
        wikidata_files = find_files_missing_wikidata()
        if args.country:
            wikidata_files = [f for f in wikidata_files
                              if f.name.startswith(f"{args.country}-")]
        logger.info(f"Found {len(wikidata_files)} files missing Wikidata enrichment")

    # Process Google Maps
    if do_google:
        # `is not None` so an explicit --limit 0 means "process nothing"
        files_to_process = (google_files[:args.limit]
                            if args.limit is not None else google_files)
        logger.info(f"\n=== Processing {len(files_to_process)} files for Google Maps ===\n")
        success = 0
        for i, filepath in enumerate(files_to_process, 1):
            logger.info(f"[{i}/{len(files_to_process)}] {filepath.name}")
            if enrich_with_google_maps(filepath, args.dry_run):
                success += 1
            time.sleep(GOOGLE_DELAY)
        logger.info(f"\nGoogle Maps: {success}/{len(files_to_process)} enriched successfully")

    # Process Wikidata
    if do_wikidata:
        files_to_process = (wikidata_files[:args.limit]
                            if args.limit is not None else wikidata_files)
        logger.info(f"\n=== Processing {len(files_to_process)} files for Wikidata ===\n")
        success = 0
        for i, filepath in enumerate(files_to_process, 1):
            logger.info(f"[{i}/{len(files_to_process)}] {filepath.name}")
            if enrich_with_wikidata(filepath, args.dry_run):
                success += 1
            time.sleep(WIKIDATA_DELAY)
        logger.info(f"\nWikidata: {success}/{len(files_to_process)} enriched successfully")


if __name__ == '__main__':
    main()