#!/usr/bin/env python3
"""
Enrich custodian files with Google Maps and Wikidata data.

This script finds custodian files missing enrichment data and adds:
- Google Maps: coordinates, place_id, address, phone, website, hours, ratings
- Wikidata: entity ID, descriptions, identifiers (VIAF, ISNI, etc.)

Usage:
    python scripts/enrich_custodian_files.py --google-maps [--dry-run] [--limit N]
    python scripts/enrich_custodian_files.py --wikidata [--dry-run] [--limit N]
    python scripts/enrich_custodian_files.py --all [--dry-run] [--limit N]

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required for Google Maps enrichment
"""
import argparse
import logging
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List

import httpx
import requests
import yaml
from dotenv import load_dotenv
# Load environment variables
|
|
load_dotenv()
|
|
|
|
# Setup logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Paths
|
|
PROJECT_ROOT = Path(__file__).parent.parent
|
|
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
|
|
|
|
# API Configuration
|
|
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
|
|
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
|
|
WIKIDATA_API = "https://www.wikidata.org/w/api.php"
|
|
|
|
# Rate limiting
|
|
GOOGLE_DELAY = 0.2 # 5 requests per second
|
|
WIKIDATA_DELAY = 0.5 # 2 requests per second
|
|
|
|
# Fields to request from Places API (New)
|
|
PLACE_FIELDS = [
|
|
"id", "displayName", "formattedAddress", "location", "types",
|
|
"businessStatus", "internationalPhoneNumber", "nationalPhoneNumber",
|
|
"websiteUri", "rating", "userRatingCount", "photos"
|
|
]
|
|
|
|
|
|
def find_files_missing_google_maps() -> List[Path]:
|
|
"""Find custodian files without google_maps_enrichment."""
|
|
missing = []
|
|
for filepath in sorted(CUSTODIAN_DIR.glob("*.yaml")):
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
if 'google_maps_enrichment:' not in content:
|
|
missing.append(filepath)
|
|
return missing
|
|
|
|
|
|
def find_files_missing_wikidata() -> List[Path]:
|
|
"""Find custodian files without wikidata_enrichment."""
|
|
missing = []
|
|
for filepath in sorted(CUSTODIAN_DIR.glob("*.yaml")):
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
if 'wikidata_enrichment:' not in content:
|
|
missing.append(filepath)
|
|
return missing
|
|
|
|
|
|
def get_institution_name(data: dict) -> str:
|
|
"""Extract institution name from custodian data."""
|
|
# Try various locations for the name
|
|
if 'original_entry' in data and 'name' in data['original_entry']:
|
|
return data['original_entry']['name']
|
|
if 'custodian_name' in data:
|
|
if isinstance(data['custodian_name'], dict):
|
|
return data['custodian_name'].get('claim_value', '')
|
|
return str(data['custodian_name'])
|
|
if 'name' in data:
|
|
return data['name']
|
|
return ''
|
|
|
|
|
|
def get_institution_location(data: dict) -> str:
|
|
"""Extract location info for search query."""
|
|
parts = []
|
|
|
|
original = data.get('original_entry', {})
|
|
if original.get('city'):
|
|
parts.append(original['city'])
|
|
if original.get('location'):
|
|
parts.append(original['location'])
|
|
elif original.get('country'):
|
|
# Map country codes to names
|
|
country_map = {
|
|
'NL': 'Netherlands',
|
|
'PS': 'Palestine',
|
|
'LB': 'Lebanon',
|
|
'BE': 'Belgium',
|
|
'US': 'United States',
|
|
}
|
|
parts.append(country_map.get(original['country'], original['country']))
|
|
|
|
return ', '.join(parts)
|
|
|
|
|
|
def search_google_places(name: str, location: str) -> Optional[Dict[str, Any]]:
|
|
"""Search Google Places API for an institution."""
|
|
if not GOOGLE_PLACES_TOKEN:
|
|
logger.error("GOOGLE_PLACES_TOKEN not set")
|
|
return None
|
|
|
|
query = f"{name} {location}".strip()
|
|
|
|
headers = {
|
|
"Content-Type": "application/json",
|
|
"X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
|
|
"X-Goog-FieldMask": ",".join([f"places.{f}" for f in PLACE_FIELDS])
|
|
}
|
|
|
|
payload = {
|
|
"textQuery": query,
|
|
"maxResultCount": 1
|
|
}
|
|
|
|
try:
|
|
response = httpx.post(TEXT_SEARCH_URL, headers=headers, json=payload, timeout=30)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
|
|
if data.get("places"):
|
|
return data["places"][0]
|
|
except Exception as e:
|
|
logger.error(f"Google Places error for '{query}': {e}")
|
|
|
|
return None
|
|
|
|
|
|
def format_google_maps_enrichment(place: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Format Google Places response into enrichment structure."""
|
|
enrichment = {
|
|
'place_id': place.get('id', ''),
|
|
'name': place.get('displayName', {}).get('text', ''),
|
|
'formatted_address': place.get('formattedAddress', ''),
|
|
'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
'api_status': 'OK'
|
|
}
|
|
|
|
# Add coordinates
|
|
if 'location' in place:
|
|
enrichment['coordinates'] = {
|
|
'latitude': place['location'].get('latitude'),
|
|
'longitude': place['location'].get('longitude')
|
|
}
|
|
|
|
# Add phone
|
|
if place.get('internationalPhoneNumber'):
|
|
enrichment['phone_international'] = place['internationalPhoneNumber']
|
|
if place.get('nationalPhoneNumber'):
|
|
enrichment['phone_local'] = place['nationalPhoneNumber']
|
|
|
|
# Add website
|
|
if place.get('websiteUri'):
|
|
enrichment['website'] = place['websiteUri']
|
|
|
|
# Add types
|
|
if place.get('types'):
|
|
enrichment['google_place_types'] = place['types']
|
|
|
|
# Add business status
|
|
if place.get('businessStatus'):
|
|
enrichment['business_status'] = place['businessStatus']
|
|
|
|
# Add rating
|
|
if place.get('rating'):
|
|
enrichment['rating'] = place['rating']
|
|
if place.get('userRatingCount'):
|
|
enrichment['user_rating_count'] = place['userRatingCount']
|
|
|
|
# Add photo count
|
|
if place.get('photos'):
|
|
enrichment['photo_count'] = len(place['photos'])
|
|
|
|
return enrichment
|
|
|
|
|
|
# Wikidata requires User-Agent header
|
|
WIKIDATA_HEADERS = {
|
|
"User-Agent": "GLAM-Enrichment-Bot/1.0 (https://github.com/glamorga; contact@example.com)"
|
|
}
|
|
|
|
|
|
def search_wikidata(name: str, language: str = "en") -> Optional[str]:
|
|
"""Search Wikidata for an entity by name."""
|
|
params = {
|
|
"action": "wbsearchentities",
|
|
"search": name,
|
|
"language": language,
|
|
"format": "json",
|
|
"limit": 5,
|
|
}
|
|
|
|
try:
|
|
response = requests.get(WIKIDATA_API, params=params, headers=WIKIDATA_HEADERS, timeout=10)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
|
|
if data.get("search"):
|
|
return data["search"][0]["id"]
|
|
except Exception as e:
|
|
logger.error(f"Wikidata search error for '{name}': {e}")
|
|
|
|
return None
|
|
|
|
|
|
def get_wikidata_entity(entity_id: str) -> Optional[Dict[str, Any]]:
|
|
"""Get entity data from Wikidata."""
|
|
params = {
|
|
"action": "wbgetentities",
|
|
"ids": entity_id,
|
|
"languages": "en|nl|ar|de|fr",
|
|
"props": "labels|descriptions|claims|sitelinks",
|
|
"format": "json",
|
|
}
|
|
|
|
try:
|
|
response = requests.get(WIKIDATA_API, params=params, headers=WIKIDATA_HEADERS, timeout=10)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
|
|
if "entities" in data and entity_id in data["entities"]:
|
|
return data["entities"][entity_id]
|
|
except Exception as e:
|
|
logger.error(f"Wikidata entity error for '{entity_id}': {e}")
|
|
|
|
return None
|
|
|
|
|
|
def format_wikidata_enrichment(entity_id: str, entity: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Format Wikidata entity into enrichment structure."""
|
|
enrichment = {
|
|
'wikidata_entity_id': entity_id,
|
|
'wikidata_url': f'https://www.wikidata.org/wiki/{entity_id}',
|
|
'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
}
|
|
|
|
# Add description
|
|
descriptions = entity.get('descriptions', {})
|
|
for lang in ['en', 'nl', 'ar', 'de', 'fr']:
|
|
if lang in descriptions:
|
|
enrichment['wikidata_description'] = descriptions[lang].get('value', '')
|
|
break
|
|
|
|
# Add labels
|
|
labels = entity.get('labels', {})
|
|
enrichment['labels'] = {
|
|
lang: label.get('value', '')
|
|
for lang, label in labels.items()
|
|
}
|
|
|
|
# Extract key identifiers from claims
|
|
claims = entity.get('claims', {})
|
|
identifiers = {}
|
|
|
|
id_properties = {
|
|
'P214': 'viaf',
|
|
'P213': 'isni',
|
|
'P244': 'lcnaf',
|
|
'P227': 'gnd',
|
|
'P791': 'isil',
|
|
'P856': 'official_website',
|
|
'P18': 'image',
|
|
}
|
|
|
|
for prop, name in id_properties.items():
|
|
if prop in claims:
|
|
claim = claims[prop][0]
|
|
if 'mainsnak' in claim and 'datavalue' in claim['mainsnak']:
|
|
value = claim['mainsnak']['datavalue'].get('value', '')
|
|
if isinstance(value, str):
|
|
identifiers[name] = value
|
|
|
|
if identifiers:
|
|
enrichment['identifiers'] = identifiers
|
|
|
|
return enrichment
|
|
|
|
|
|
def enrich_with_google_maps(filepath: Path, dry_run: bool = False) -> bool:
|
|
"""Enrich a single file with Google Maps data."""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
name = get_institution_name(data)
|
|
location = get_institution_location(data)
|
|
|
|
if not name:
|
|
logger.warning(f"No name found in {filepath.name}")
|
|
return False
|
|
|
|
logger.info(f"Searching Google Maps: {name} ({location})")
|
|
|
|
place = search_google_places(name, location)
|
|
|
|
if not place:
|
|
logger.warning(f"No Google Maps result for: {name}")
|
|
# Add empty enrichment to mark as searched
|
|
if not dry_run:
|
|
data['google_maps_enrichment'] = {
|
|
'api_status': 'NOT_FOUND',
|
|
'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
'search_query': f"{name} {location}".strip()
|
|
}
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
|
return False
|
|
|
|
enrichment = format_google_maps_enrichment(place)
|
|
logger.info(f" Found: {enrichment.get('name', 'Unknown')}")
|
|
|
|
if not dry_run:
|
|
data['google_maps_enrichment'] = enrichment
|
|
data['enrichment_status'] = 'enriched'
|
|
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
|
|
|
return True
|
|
|
|
|
|
def enrich_with_wikidata(filepath: Path, dry_run: bool = False) -> bool:
|
|
"""Enrich a single file with Wikidata data."""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
name = get_institution_name(data)
|
|
|
|
if not name:
|
|
logger.warning(f"No name found in {filepath.name}")
|
|
return False
|
|
|
|
# Check if we already have a Wikidata ID in original_entry
|
|
existing_id = None
|
|
if 'original_entry' in data:
|
|
wikidata = data['original_entry'].get('wikidata', {})
|
|
if isinstance(wikidata, dict):
|
|
existing_id = wikidata.get('id')
|
|
|
|
if existing_id:
|
|
logger.info(f"Using existing Wikidata ID: {existing_id}")
|
|
entity_id = existing_id
|
|
else:
|
|
logger.info(f"Searching Wikidata: {name}")
|
|
entity_id = search_wikidata(name)
|
|
|
|
if not entity_id:
|
|
# Try Dutch search for NL files
|
|
if filepath.name.startswith('NL-'):
|
|
entity_id = search_wikidata(name, language='nl')
|
|
|
|
if not entity_id:
|
|
logger.warning(f"No Wikidata result for: {name}")
|
|
if not dry_run:
|
|
data['wikidata_enrichment'] = {
|
|
'status': 'NOT_FOUND',
|
|
'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
'search_query': name
|
|
}
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
|
return False
|
|
|
|
entity = get_wikidata_entity(entity_id)
|
|
|
|
if not entity:
|
|
logger.warning(f"Could not fetch Wikidata entity: {entity_id}")
|
|
return False
|
|
|
|
enrichment = format_wikidata_enrichment(entity_id, entity)
|
|
logger.info(f" Found: {entity_id} - {enrichment.get('wikidata_description', '')[:50]}")
|
|
|
|
if not dry_run:
|
|
data['wikidata_enrichment'] = enrichment
|
|
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
|
|
|
return True
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description='Enrich custodian files with Google Maps and Wikidata')
|
|
parser.add_argument('--google-maps', action='store_true', help='Enrich with Google Maps')
|
|
parser.add_argument('--wikidata', action='store_true', help='Enrich with Wikidata')
|
|
parser.add_argument('--all', action='store_true', help='Enrich with both sources')
|
|
parser.add_argument('--dry-run', action='store_true', help='Do not write changes')
|
|
parser.add_argument('--limit', type=int, default=None, help='Limit number of files to process')
|
|
parser.add_argument('--country', type=str, default=None, help='Filter by country code (e.g., PS, NL)')
|
|
args = parser.parse_args()
|
|
|
|
if not (args.google_maps or args.wikidata or args.all):
|
|
parser.error("Must specify --google-maps, --wikidata, or --all")
|
|
|
|
do_google = args.google_maps or args.all
|
|
do_wikidata = args.wikidata or args.all
|
|
|
|
if do_google and not GOOGLE_PLACES_TOKEN:
|
|
logger.error("GOOGLE_PLACES_TOKEN environment variable required for Google Maps enrichment")
|
|
sys.exit(1)
|
|
|
|
# Find files to process
|
|
if do_google:
|
|
google_files = find_files_missing_google_maps()
|
|
if args.country:
|
|
google_files = [f for f in google_files if f.name.startswith(f"{args.country}-")]
|
|
logger.info(f"Found {len(google_files)} files missing Google Maps enrichment")
|
|
|
|
if do_wikidata:
|
|
wikidata_files = find_files_missing_wikidata()
|
|
if args.country:
|
|
wikidata_files = [f for f in wikidata_files if f.name.startswith(f"{args.country}-")]
|
|
logger.info(f"Found {len(wikidata_files)} files missing Wikidata enrichment")
|
|
|
|
# Process Google Maps
|
|
if do_google:
|
|
files_to_process = google_files[:args.limit] if args.limit else google_files
|
|
logger.info(f"\n=== Processing {len(files_to_process)} files for Google Maps ===\n")
|
|
|
|
success = 0
|
|
for i, filepath in enumerate(files_to_process, 1):
|
|
logger.info(f"[{i}/{len(files_to_process)}] {filepath.name}")
|
|
if enrich_with_google_maps(filepath, args.dry_run):
|
|
success += 1
|
|
time.sleep(GOOGLE_DELAY)
|
|
|
|
logger.info(f"\nGoogle Maps: {success}/{len(files_to_process)} enriched successfully")
|
|
|
|
# Process Wikidata
|
|
if do_wikidata:
|
|
files_to_process = wikidata_files[:args.limit] if args.limit else wikidata_files
|
|
logger.info(f"\n=== Processing {len(files_to_process)} files for Wikidata ===\n")
|
|
|
|
success = 0
|
|
for i, filepath in enumerate(files_to_process, 1):
|
|
logger.info(f"[{i}/{len(files_to_process)}] {filepath.name}")
|
|
if enrich_with_wikidata(filepath, args.dry_run):
|
|
success += 1
|
|
time.sleep(WIKIDATA_DELAY)
|
|
|
|
logger.info(f"\nWikidata: {success}/{len(files_to_process)} enriched successfully")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|