# glam/scripts/enrich_custodian_files.py
#!/usr/bin/env python3
"""
Enrich custodian files with Google Maps and Wikidata data.

This script finds custodian files missing enrichment data and adds:

- Google Maps: coordinates, place_id, address, phone, website, hours, ratings
- Wikidata: entity ID, descriptions, identifiers (VIAF, ISNI, etc.)

Usage:
    python scripts/enrich_custodian_files.py --google-maps [--dry-run] [--limit N]
    python scripts/enrich_custodian_files.py --wikidata [--dry-run] [--limit N]
    python scripts/enrich_custodian_files.py --all [--dry-run] [--limit N]

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required for Google Maps enrichment
"""
import os
import sys
import time
import argparse
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List

import yaml
import requests
import httpx
from dotenv import load_dotenv

# Load environment variables (e.g. GOOGLE_PLACES_TOKEN) from a local .env file.
load_dotenv()

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths: script lives in <project>/scripts/, data in <project>/data/custodian/.
PROJECT_ROOT = Path(__file__).parent.parent
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"

# API Configuration
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
WIKIDATA_API = "https://www.wikidata.org/w/api.php"

# Rate limiting: seconds to sleep between consecutive API calls.
GOOGLE_DELAY = 0.2  # 5 requests per second
WIKIDATA_DELAY = 0.5  # 2 requests per second

# Fields to request from Places API (New), sent via the X-Goog-FieldMask header.
PLACE_FIELDS = [
    "id", "displayName", "formattedAddress", "location", "types",
    "businessStatus", "internationalPhoneNumber", "nationalPhoneNumber",
    "websiteUri", "rating", "userRatingCount", "photos"
]
def _find_files_missing_key(key: str) -> List[Path]:
    """Return custodian YAML files whose raw text does not contain *key*.

    This is a plain substring test on the file contents, not a YAML
    parse — a key mentioned anywhere in the file counts as present.
    Results are sorted by filename for deterministic processing order.
    """
    missing = []
    for filepath in sorted(CUSTODIAN_DIR.glob("*.yaml")):
        if key not in filepath.read_text(encoding='utf-8'):
            missing.append(filepath)
    return missing


def find_files_missing_google_maps() -> List[Path]:
    """Find custodian files without google_maps_enrichment."""
    return _find_files_missing_key('google_maps_enrichment:')


def find_files_missing_wikidata() -> List[Path]:
    """Find custodian files without wikidata_enrichment."""
    return _find_files_missing_key('wikidata_enrichment:')
def get_institution_name(data: dict) -> str:
    """Extract the institution name from a custodian record.

    Checks, in order: original_entry['name'], custodian_name (either a
    claim dict carrying 'claim_value' or a plain value, which is
    stringified), then a top-level 'name' key.  Returns '' when no
    name can be found.
    """
    entry = data.get('original_entry', {})
    if 'name' in entry:
        return entry['name']
    if 'custodian_name' in data:
        custodian = data['custodian_name']
        if isinstance(custodian, dict):
            return custodian.get('claim_value', '')
        return str(custodian)
    return data.get('name', '')
def get_institution_location(data: dict) -> str:
    """Build a comma-joined location string for search queries.

    Uses original_entry's city plus either its free-text location or,
    failing that, its country (a handful of ISO codes are expanded to
    English names; unknown codes pass through unchanged).  Returns ''
    when no location info is present.
    """
    original = data.get('original_entry', {})
    pieces: List[str] = []

    city = original.get('city')
    if city:
        pieces.append(city)

    location = original.get('location')
    if location:
        pieces.append(location)
    elif original.get('country'):
        # Expand a few known country codes; fall back to the raw code.
        code_to_name = {
            'NL': 'Netherlands',
            'PS': 'Palestine',
            'LB': 'Lebanon',
            'BE': 'Belgium',
            'US': 'United States',
        }
        code = original['country']
        pieces.append(code_to_name.get(code, code))

    return ', '.join(pieces)
def search_google_places(name: str, location: str) -> Optional[Dict[str, Any]]:
    """Query the Places API (New) text search for a single institution.

    The query is "<name> <location>" and only the first result is
    requested.  Returns the raw place dict, or None when the token is
    missing, nothing matched, or the request failed (errors are logged).
    """
    if not GOOGLE_PLACES_TOKEN:
        logger.error("GOOGLE_PLACES_TOKEN not set")
        return None

    query = f"{name} {location}".strip()
    # The field mask limits the response (and billing) to what we use.
    field_mask = ",".join(f"places.{field}" for field in PLACE_FIELDS)
    request_headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        "X-Goog-FieldMask": field_mask,
    }
    body = {
        "textQuery": query,
        "maxResultCount": 1,
    }
    try:
        resp = httpx.post(TEXT_SEARCH_URL, headers=request_headers, json=body, timeout=30)
        resp.raise_for_status()
        places = resp.json().get("places")
        if places:
            return places[0]
    except Exception as e:
        logger.error(f"Google Places error for '{query}': {e}")
    return None
def format_google_maps_enrichment(place: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a Places API (New) result into our enrichment dict.

    place_id, name, formatted_address, fetch_timestamp and api_status
    are always present; coordinates, phone numbers, website, place
    types, business status, ratings and photo count are added only
    when the corresponding response field is present and truthy.
    Insertion order matters because files are dumped with
    sort_keys=False.
    """
    result: Dict[str, Any] = {
        'place_id': place.get('id', ''),
        'name': place.get('displayName', {}).get('text', ''),
        'formatted_address': place.get('formattedAddress', ''),
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'api_status': 'OK',
    }

    if 'location' in place:
        coords = place['location']
        result['coordinates'] = {
            'latitude': coords.get('latitude'),
            'longitude': coords.get('longitude'),
        }

    # (response field, enrichment key) pairs copied verbatim when truthy.
    optional_fields = [
        ('internationalPhoneNumber', 'phone_international'),
        ('nationalPhoneNumber', 'phone_local'),
        ('websiteUri', 'website'),
        ('types', 'google_place_types'),
        ('businessStatus', 'business_status'),
        ('rating', 'rating'),
        ('userRatingCount', 'user_rating_count'),
    ]
    for source_key, target_key in optional_fields:
        value = place.get(source_key)
        if value:
            result[target_key] = value

    photos = place.get('photos')
    if photos:
        result['photo_count'] = len(photos)

    return result
# Wikimedia API etiquette requires a descriptive User-Agent identifying
# the bot, with a URL and a contact address.
WIKIDATA_HEADERS = {
    "User-Agent": "GLAM-Enrichment-Bot/1.0 (https://github.com/glamorga; contact@example.com)"
}
def search_wikidata(name: str, language: str = "en") -> Optional[str]:
    """Search Wikidata (wbsearchentities) and return the top entity ID.

    Up to 5 hits are requested but only the first ID is returned.
    Returns None when there are no matches or the request fails
    (errors are logged).
    """
    query = {
        "action": "wbsearchentities",
        "search": name,
        "language": language,
        "format": "json",
        "limit": 5,
    }
    try:
        resp = requests.get(WIKIDATA_API, params=query, headers=WIKIDATA_HEADERS, timeout=10)
        resp.raise_for_status()
        hits = resp.json().get("search")
        if hits:
            return hits[0]["id"]
    except Exception as e:
        logger.error(f"Wikidata search error for '{name}': {e}")
    return None
def get_wikidata_entity(entity_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one entity's labels/descriptions/claims/sitelinks.

    Only the languages used downstream (en/nl/ar/de/fr) are requested.
    Returns the raw entity dict from wbgetentities, or None when the
    request fails or the entity is absent (errors are logged).
    """
    query = {
        "action": "wbgetentities",
        "ids": entity_id,
        "languages": "en|nl|ar|de|fr",
        "props": "labels|descriptions|claims|sitelinks",
        "format": "json",
    }
    try:
        resp = requests.get(WIKIDATA_API, params=query, headers=WIKIDATA_HEADERS, timeout=10)
        resp.raise_for_status()
        entities = resp.json().get("entities", {})
        if entity_id in entities:
            return entities[entity_id]
    except Exception as e:
        logger.error(f"Wikidata entity error for '{entity_id}': {e}")
    return None
def format_wikidata_enrichment(entity_id: str, entity: Dict[str, Any]) -> Dict[str, Any]:
    """Shape a raw Wikidata entity into our enrichment structure.

    Records the entity ID and URL, picks the first description found in
    en/nl/ar/de/fr preference order, copies every label, and extracts a
    fixed set of identifier claims (VIAF, ISNI, LCNAF, GND, ISIL,
    official website, image) — only the first statement of each
    property, and only when its value is a plain string.
    """
    result: Dict[str, Any] = {
        'wikidata_entity_id': entity_id,
        'wikidata_url': f'https://www.wikidata.org/wiki/{entity_id}',
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
    }

    # First description in our language preference order wins.
    descriptions = entity.get('descriptions', {})
    for lang in ('en', 'nl', 'ar', 'de', 'fr'):
        description = descriptions.get(lang)
        if description is not None:
            result['wikidata_description'] = description.get('value', '')
            break

    result['labels'] = {
        lang: entry.get('value', '')
        for lang, entry in entity.get('labels', {}).items()
    }

    # Wikidata property ID -> friendly identifier name.
    wanted_properties = {
        'P214': 'viaf',
        'P213': 'isni',
        'P244': 'lcnaf',
        'P227': 'gnd',
        'P791': 'isil',
        'P856': 'official_website',
        'P18': 'image',
    }
    claims = entity.get('claims', {})
    found: Dict[str, str] = {}
    for prop, friendly_name in wanted_properties.items():
        if prop not in claims:
            continue
        statement = claims[prop][0]
        if 'mainsnak' not in statement or 'datavalue' not in statement['mainsnak']:
            continue
        value = statement['mainsnak']['datavalue'].get('value', '')
        # Skip structured values (e.g. coordinates); keep string IDs/URLs.
        if isinstance(value, str):
            found[friendly_name] = value
    if found:
        result['identifiers'] = found

    return result
def enrich_with_google_maps(filepath: Path, dry_run: bool = False) -> bool:
    """Add a google_maps_enrichment section to one custodian YAML file.

    Searches the Places API using the institution's name and location.
    On a miss the file is still stamped with api_status NOT_FOUND
    (unless dry_run) so later runs skip it.  Returns True only when
    real place data was found; writes are suppressed when dry_run.
    """
    with open(filepath, 'r', encoding='utf-8') as fh:
        record = yaml.safe_load(fh)

    name = get_institution_name(record)
    location = get_institution_location(record)
    if not name:
        logger.warning(f"No name found in {filepath.name}")
        return False

    logger.info(f"Searching Google Maps: {name} ({location})")
    place = search_google_places(name, location)

    if not place:
        logger.warning(f"No Google Maps result for: {name}")
        if not dry_run:
            # Stamp the file so future scans do not retry it.
            record['google_maps_enrichment'] = {
                'api_status': 'NOT_FOUND',
                'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
                'search_query': f"{name} {location}".strip()
            }
            with open(filepath, 'w', encoding='utf-8') as fh:
                yaml.dump(record, fh, allow_unicode=True, default_flow_style=False, sort_keys=False)
        return False

    enrichment = format_google_maps_enrichment(place)
    logger.info(f" Found: {enrichment.get('name', 'Unknown')}")

    if dry_run:
        return True
    record['google_maps_enrichment'] = enrichment
    record['enrichment_status'] = 'enriched'
    with open(filepath, 'w', encoding='utf-8') as fh:
        yaml.dump(record, fh, allow_unicode=True, default_flow_style=False, sort_keys=False)
    return True
def enrich_with_wikidata(filepath: Path, dry_run: bool = False) -> bool:
    """Enrich a single custodian YAML file with Wikidata data.

    Resolution order: an entity ID already stored under
    original_entry.wikidata.id is trusted without searching; otherwise
    the name is searched in English, with a Dutch-language retry for
    files whose name starts with 'NL-'.  A miss stamps the file with
    status NOT_FOUND (unless dry_run) so later runs skip it.  Returns
    True only when the entity was fetched and (unless dry_run) written.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    name = get_institution_name(data)
    if not name:
        logger.warning(f"No name found in {filepath.name}")
        return False
    # Check if we already have a Wikidata ID in original_entry
    existing_id = None
    if 'original_entry' in data:
        wikidata = data['original_entry'].get('wikidata', {})
        if isinstance(wikidata, dict):
            existing_id = wikidata.get('id')
    if existing_id:
        logger.info(f"Using existing Wikidata ID: {existing_id}")
        entity_id = existing_id
    else:
        logger.info(f"Searching Wikidata: {name}")
        entity_id = search_wikidata(name)
        if not entity_id:
            # Try Dutch search for NL files
            if filepath.name.startswith('NL-'):
                entity_id = search_wikidata(name, language='nl')
    if not entity_id:
        logger.warning(f"No Wikidata result for: {name}")
        if not dry_run:
            # Stamp the file as searched so future runs skip it.
            data['wikidata_enrichment'] = {
                'status': 'NOT_FOUND',
                'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
                'search_query': name
            }
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        return False
    entity = get_wikidata_entity(entity_id)
    if not entity:
        # Searched ID could not be fetched; leave the file untouched.
        logger.warning(f"Could not fetch Wikidata entity: {entity_id}")
        return False
    enrichment = format_wikidata_enrichment(entity_id, entity)
    logger.info(f" Found: {entity_id} - {enrichment.get('wikidata_description', '')[:50]}")
    if not dry_run:
        data['wikidata_enrichment'] = enrichment
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    return True
def main():
    """CLI entry point: find custodian files missing enrichment and enrich them.

    Scans data/custodian/*.yaml for files lacking the requested
    enrichment section(s), optionally filtered by country prefix and
    capped by --limit, then enriches each one with a rate-limit sleep
    between API calls.  Exits with status 1 when Google enrichment is
    requested without GOOGLE_PLACES_TOKEN set.
    """
    parser = argparse.ArgumentParser(description='Enrich custodian files with Google Maps and Wikidata')
    parser.add_argument('--google-maps', action='store_true', help='Enrich with Google Maps')
    parser.add_argument('--wikidata', action='store_true', help='Enrich with Wikidata')
    parser.add_argument('--all', action='store_true', help='Enrich with both sources')
    parser.add_argument('--dry-run', action='store_true', help='Do not write changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of files to process')
    parser.add_argument('--country', type=str, default=None, help='Filter by country code (e.g., PS, NL)')
    args = parser.parse_args()

    if not (args.google_maps or args.wikidata or args.all):
        parser.error("Must specify --google-maps, --wikidata, or --all")

    do_google = args.google_maps or args.all
    do_wikidata = args.wikidata or args.all

    if do_google and not GOOGLE_PLACES_TOKEN:
        logger.error("GOOGLE_PLACES_TOKEN environment variable required for Google Maps enrichment")
        sys.exit(1)

    # Find files to process (custodian filenames start with the country code).
    if do_google:
        google_files = find_files_missing_google_maps()
        if args.country:
            google_files = [f for f in google_files if f.name.startswith(f"{args.country}-")]
        logger.info(f"Found {len(google_files)} files missing Google Maps enrichment")
    if do_wikidata:
        wikidata_files = find_files_missing_wikidata()
        if args.country:
            wikidata_files = [f for f in wikidata_files if f.name.startswith(f"{args.country}-")]
        logger.info(f"Found {len(wikidata_files)} files missing Wikidata enrichment")

    # Process Google Maps
    if do_google:
        # Fix: compare against None, not truthiness — the original
        # `if args.limit` treated `--limit 0` the same as no limit.
        files_to_process = google_files if args.limit is None else google_files[:args.limit]
        logger.info(f"\n=== Processing {len(files_to_process)} files for Google Maps ===\n")
        success = 0
        for i, filepath in enumerate(files_to_process, 1):
            logger.info(f"[{i}/{len(files_to_process)}] {filepath.name}")
            if enrich_with_google_maps(filepath, args.dry_run):
                success += 1
            time.sleep(GOOGLE_DELAY)  # stay under the Places API rate limit
        logger.info(f"\nGoogle Maps: {success}/{len(files_to_process)} enriched successfully")

    # Process Wikidata
    if do_wikidata:
        files_to_process = wikidata_files if args.limit is None else wikidata_files[:args.limit]
        logger.info(f"\n=== Processing {len(files_to_process)} files for Wikidata ===\n")
        success = 0
        for i, filepath in enumerate(files_to_process, 1):
            logger.info(f"[{i}/{len(files_to_process)}] {filepath.name}")
            if enrich_with_wikidata(filepath, args.dry_run):
                success += 1
            time.sleep(WIKIDATA_DELAY)  # be polite to the Wikidata API
        logger.info(f"\nWikidata: {success}/{len(files_to_process)} enriched successfully")


if __name__ == '__main__':
    main()