# glam/scripts/enrich_custodian_files.py
#!/usr/bin/env python3
"""
Enrich custodian files with Google Maps and Wikidata data.

This script finds custodian files missing enrichment data and adds:

- Google Maps: coordinates, place_id, address, phone, website, hours, ratings
- Wikidata: entity ID, descriptions, identifiers (VIAF, ISNI, etc.)

Usage:
    python scripts/enrich_custodian_files.py --google-maps [--dry-run] [--limit N]
    python scripts/enrich_custodian_files.py --wikidata [--dry-run] [--limit N]
    python scripts/enrich_custodian_files.py --all [--dry-run] [--limit N]

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required for Google Maps enrichment
"""
import os
import sys
import time
import argparse
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List

import yaml
import requests
import httpx
from dotenv import load_dotenv

# Load environment variables (e.g. GOOGLE_PLACES_TOKEN) from a local .env file.
load_dotenv()

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths: script lives in <project>/scripts/, data in <project>/data/custodian/.
PROJECT_ROOT = Path(__file__).parent.parent
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"

# API Configuration
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
WIKIDATA_API = "https://www.wikidata.org/w/api.php"

# Rate limiting: seconds to sleep between consecutive API calls.
GOOGLE_DELAY = 0.2  # 5 requests per second
WIKIDATA_DELAY = 0.5  # 2 requests per second

# Fields to request from Places API (New), sent via the X-Goog-FieldMask header.
PLACE_FIELDS = [
    "id", "displayName", "formattedAddress", "location", "types",
    "businessStatus", "internationalPhoneNumber", "nationalPhoneNumber",
    "websiteUri", "rating", "userRatingCount", "photos"
]
def _find_files_missing_key(key: str) -> List[Path]:
    """Return custodian YAML files whose raw text does not contain *key*.

    This is a plain substring test on the file contents, not a YAML
    parse — a key mentioned anywhere in the file counts as present.
    Results are sorted by filename for deterministic processing order.
    """
    missing = []
    for filepath in sorted(CUSTODIAN_DIR.glob("*.yaml")):
        if key not in filepath.read_text(encoding='utf-8'):
            missing.append(filepath)
    return missing


def find_files_missing_google_maps() -> List[Path]:
    """Find custodian files without google_maps_enrichment."""
    return _find_files_missing_key('google_maps_enrichment:')


def find_files_missing_wikidata() -> List[Path]:
    """Find custodian files without wikidata_enrichment."""
    return _find_files_missing_key('wikidata_enrichment:')
def get_institution_name(data: dict) -> str:
    """Extract the institution name from a custodian record.

    Checks, in order: original_entry['name'], custodian_name (either a
    claim dict carrying 'claim_value' or a plain value, which is
    stringified), then a top-level 'name' key.  Returns '' when no
    name can be found.
    """
    entry = data.get('original_entry', {})
    if 'name' in entry:
        return entry['name']
    if 'custodian_name' in data:
        custodian = data['custodian_name']
        if isinstance(custodian, dict):
            return custodian.get('claim_value', '')
        return str(custodian)
    return data.get('name', '')
def get_institution_location(data: dict) -> str:
    """Build a comma-joined location string for search queries.

    Uses original_entry's city plus either its free-text location or,
    failing that, its country (a handful of ISO codes are expanded to
    English names; unknown codes pass through unchanged).  Returns ''
    when no location info is present.
    """
    original = data.get('original_entry', {})
    pieces: List[str] = []

    city = original.get('city')
    if city:
        pieces.append(city)

    location = original.get('location')
    if location:
        pieces.append(location)
    elif original.get('country'):
        # Expand a few known country codes; fall back to the raw code.
        code_to_name = {
            'NL': 'Netherlands',
            'PS': 'Palestine',
            'LB': 'Lebanon',
            'BE': 'Belgium',
            'US': 'United States',
        }
        code = original['country']
        pieces.append(code_to_name.get(code, code))

    return ', '.join(pieces)
def search_google_places(name: str, location: str) -> Optional[Dict[str, Any]]:
    """Query the Places API (New) text search for a single institution.

    The query is "<name> <location>" and only the first result is
    requested.  Returns the raw place dict, or None when the token is
    missing, nothing matched, or the request failed (errors are logged).
    """
    if not GOOGLE_PLACES_TOKEN:
        logger.error("GOOGLE_PLACES_TOKEN not set")
        return None

    query = f"{name} {location}".strip()
    # The field mask limits the response (and billing) to what we use.
    field_mask = ",".join(f"places.{field}" for field in PLACE_FIELDS)
    request_headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        "X-Goog-FieldMask": field_mask,
    }
    body = {
        "textQuery": query,
        "maxResultCount": 1,
    }
    try:
        resp = httpx.post(TEXT_SEARCH_URL, headers=request_headers, json=body, timeout=30)
        resp.raise_for_status()
        places = resp.json().get("places")
        if places:
            return places[0]
    except Exception as e:
        logger.error(f"Google Places error for '{query}': {e}")
    return None
def format_google_maps_enrichment(place: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a Places API (New) result into our enrichment dict.

    place_id, name, formatted_address, fetch_timestamp and api_status
    are always present; coordinates, phone numbers, website, place
    types, business status, ratings and photo count are added only
    when the corresponding response field is present and truthy.
    Insertion order matters because files are dumped with
    sort_keys=False.
    """
    result: Dict[str, Any] = {
        'place_id': place.get('id', ''),
        'name': place.get('displayName', {}).get('text', ''),
        'formatted_address': place.get('formattedAddress', ''),
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'api_status': 'OK',
    }

    if 'location' in place:
        coords = place['location']
        result['coordinates'] = {
            'latitude': coords.get('latitude'),
            'longitude': coords.get('longitude'),
        }

    # (response field, enrichment key) pairs copied verbatim when truthy.
    optional_fields = [
        ('internationalPhoneNumber', 'phone_international'),
        ('nationalPhoneNumber', 'phone_local'),
        ('websiteUri', 'website'),
        ('types', 'google_place_types'),
        ('businessStatus', 'business_status'),
        ('rating', 'rating'),
        ('userRatingCount', 'user_rating_count'),
    ]
    for source_key, target_key in optional_fields:
        value = place.get(source_key)
        if value:
            result[target_key] = value

    photos = place.get('photos')
    if photos:
        result['photo_count'] = len(photos)

    return result
# Wikimedia API etiquette requires a descriptive User-Agent identifying
# the bot, with a URL and a contact address.
WIKIDATA_HEADERS = {
    "User-Agent": "GLAM-Enrichment-Bot/1.0 (https://github.com/glamorga; contact@example.com)"
}
def search_wikidata(name: str, language: str = "en") -> Optional[str]:
    """Search Wikidata (wbsearchentities) and return the top entity ID.

    Up to 5 hits are requested but only the first ID is returned.
    Returns None when there are no matches or the request fails
    (errors are logged).
    """
    query = {
        "action": "wbsearchentities",
        "search": name,
        "language": language,
        "format": "json",
        "limit": 5,
    }
    try:
        resp = requests.get(WIKIDATA_API, params=query, headers=WIKIDATA_HEADERS, timeout=10)
        resp.raise_for_status()
        hits = resp.json().get("search")
        if hits:
            return hits[0]["id"]
    except Exception as e:
        logger.error(f"Wikidata search error for '{name}': {e}")
    return None
def get_wikidata_entity(entity_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one entity's labels/descriptions/claims/sitelinks.

    Only the languages used downstream (en/nl/ar/de/fr) are requested.
    Returns the raw entity dict from wbgetentities, or None when the
    request fails or the entity is absent (errors are logged).
    """
    query = {
        "action": "wbgetentities",
        "ids": entity_id,
        "languages": "en|nl|ar|de|fr",
        "props": "labels|descriptions|claims|sitelinks",
        "format": "json",
    }
    try:
        resp = requests.get(WIKIDATA_API, params=query, headers=WIKIDATA_HEADERS, timeout=10)
        resp.raise_for_status()
        entities = resp.json().get("entities", {})
        if entity_id in entities:
            return entities[entity_id]
    except Exception as e:
        logger.error(f"Wikidata entity error for '{entity_id}': {e}")
    return None
def format_wikidata_enrichment(entity_id: str, entity: Dict[str, Any]) -> Dict[str, Any]:
    """Shape a raw Wikidata entity into our enrichment structure.

    Records the entity ID and URL, picks the first description found in
    en/nl/ar/de/fr preference order, copies every label, and extracts a
    fixed set of identifier claims (VIAF, ISNI, LCNAF, GND, ISIL,
    official website, image) — only the first statement of each
    property, and only when its value is a plain string.
    """
    result: Dict[str, Any] = {
        'wikidata_entity_id': entity_id,
        'wikidata_url': f'https://www.wikidata.org/wiki/{entity_id}',
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
    }

    # First description in our language preference order wins.
    descriptions = entity.get('descriptions', {})
    for lang in ('en', 'nl', 'ar', 'de', 'fr'):
        description = descriptions.get(lang)
        if description is not None:
            result['wikidata_description'] = description.get('value', '')
            break

    result['labels'] = {
        lang: entry.get('value', '')
        for lang, entry in entity.get('labels', {}).items()
    }

    # Wikidata property ID -> friendly identifier name.
    wanted_properties = {
        'P214': 'viaf',
        'P213': 'isni',
        'P244': 'lcnaf',
        'P227': 'gnd',
        'P791': 'isil',
        'P856': 'official_website',
        'P18': 'image',
    }
    claims = entity.get('claims', {})
    found: Dict[str, str] = {}
    for prop, friendly_name in wanted_properties.items():
        if prop not in claims:
            continue
        statement = claims[prop][0]
        if 'mainsnak' not in statement or 'datavalue' not in statement['mainsnak']:
            continue
        value = statement['mainsnak']['datavalue'].get('value', '')
        # Skip structured values (e.g. coordinates); keep string IDs/URLs.
        if isinstance(value, str):
            found[friendly_name] = value
    if found:
        result['identifiers'] = found

    return result
def enrich_with_google_maps(filepath: Path, dry_run: bool = False) -> bool:
    """Add a google_maps_enrichment section to one custodian YAML file.

    Searches the Places API using the institution's name and location.
    On a miss the file is still stamped with api_status NOT_FOUND
    (unless dry_run) so later runs skip it.  Returns True only when
    real place data was found; writes are suppressed when dry_run.
    """
    with open(filepath, 'r', encoding='utf-8') as fh:
        record = yaml.safe_load(fh)

    name = get_institution_name(record)
    location = get_institution_location(record)
    if not name:
        logger.warning(f"No name found in {filepath.name}")
        return False

    logger.info(f"Searching Google Maps: {name} ({location})")
    place = search_google_places(name, location)

    if not place:
        logger.warning(f"No Google Maps result for: {name}")
        if not dry_run:
            # Stamp the file so future scans do not retry it.
            record['google_maps_enrichment'] = {
                'api_status': 'NOT_FOUND',
                'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
                'search_query': f"{name} {location}".strip()
            }
            with open(filepath, 'w', encoding='utf-8') as fh:
                yaml.dump(record, fh, allow_unicode=True, default_flow_style=False, sort_keys=False)
        return False

    enrichment = format_google_maps_enrichment(place)
    logger.info(f" Found: {enrichment.get('name', 'Unknown')}")

    if dry_run:
        return True
    record['google_maps_enrichment'] = enrichment
    record['enrichment_status'] = 'enriched'
    with open(filepath, 'w', encoding='utf-8') as fh:
        yaml.dump(record, fh, allow_unicode=True, default_flow_style=False, sort_keys=False)
    return True
def enrich_with_wikidata(filepath: Path, dry_run: bool = False) -> bool:
    """Enrich a single custodian YAML file with Wikidata data.

    Resolution order: an entity ID already stored under
    original_entry.wikidata.id is trusted without searching; otherwise
    the name is searched in English, with a Dutch-language retry for
    files whose name starts with 'NL-'.  A miss stamps the file with
    status NOT_FOUND (unless dry_run) so later runs skip it.  Returns
    True only when the entity was fetched and (unless dry_run) written.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    name = get_institution_name(data)
    if not name:
        logger.warning(f"No name found in {filepath.name}")
        return False
    # Check if we already have a Wikidata ID in original_entry
    existing_id = None
    if 'original_entry' in data:
        wikidata = data['original_entry'].get('wikidata', {})
        if isinstance(wikidata, dict):
            existing_id = wikidata.get('id')
    if existing_id:
        logger.info(f"Using existing Wikidata ID: {existing_id}")
        entity_id = existing_id
    else:
        logger.info(f"Searching Wikidata: {name}")
        entity_id = search_wikidata(name)
        if not entity_id:
            # Try Dutch search for NL files
            if filepath.name.startswith('NL-'):
                entity_id = search_wikidata(name, language='nl')
    if not entity_id:
        logger.warning(f"No Wikidata result for: {name}")
        if not dry_run:
            # Stamp the file as searched so future runs skip it.
            data['wikidata_enrichment'] = {
                'status': 'NOT_FOUND',
                'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
                'search_query': name
            }
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        return False
    entity = get_wikidata_entity(entity_id)
    if not entity:
        # Searched ID could not be fetched; leave the file untouched.
        logger.warning(f"Could not fetch Wikidata entity: {entity_id}")
        return False
    enrichment = format_wikidata_enrichment(entity_id, entity)
    logger.info(f" Found: {entity_id} - {enrichment.get('wikidata_description', '')[:50]}")
    if not dry_run:
        data['wikidata_enrichment'] = enrichment
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    return True
def main():
    """CLI entry point: find custodian files missing enrichment and enrich them.

    Scans data/custodian/*.yaml for files lacking the requested
    enrichment section(s), optionally filtered by country prefix and
    capped by --limit, then enriches each one with a rate-limit sleep
    between API calls.  Exits with status 1 when Google enrichment is
    requested without GOOGLE_PLACES_TOKEN set.
    """
    parser = argparse.ArgumentParser(description='Enrich custodian files with Google Maps and Wikidata')
    parser.add_argument('--google-maps', action='store_true', help='Enrich with Google Maps')
    parser.add_argument('--wikidata', action='store_true', help='Enrich with Wikidata')
    parser.add_argument('--all', action='store_true', help='Enrich with both sources')
    parser.add_argument('--dry-run', action='store_true', help='Do not write changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of files to process')
    parser.add_argument('--country', type=str, default=None, help='Filter by country code (e.g., PS, NL)')
    args = parser.parse_args()

    if not (args.google_maps or args.wikidata or args.all):
        parser.error("Must specify --google-maps, --wikidata, or --all")

    do_google = args.google_maps or args.all
    do_wikidata = args.wikidata or args.all

    if do_google and not GOOGLE_PLACES_TOKEN:
        logger.error("GOOGLE_PLACES_TOKEN environment variable required for Google Maps enrichment")
        sys.exit(1)

    # Find files to process (custodian filenames start with the country code).
    if do_google:
        google_files = find_files_missing_google_maps()
        if args.country:
            google_files = [f for f in google_files if f.name.startswith(f"{args.country}-")]
        logger.info(f"Found {len(google_files)} files missing Google Maps enrichment")
    if do_wikidata:
        wikidata_files = find_files_missing_wikidata()
        if args.country:
            wikidata_files = [f for f in wikidata_files if f.name.startswith(f"{args.country}-")]
        logger.info(f"Found {len(wikidata_files)} files missing Wikidata enrichment")

    # Process Google Maps
    if do_google:
        # Fix: compare against None, not truthiness — the original
        # `if args.limit` treated `--limit 0` the same as no limit.
        files_to_process = google_files if args.limit is None else google_files[:args.limit]
        logger.info(f"\n=== Processing {len(files_to_process)} files for Google Maps ===\n")
        success = 0
        for i, filepath in enumerate(files_to_process, 1):
            logger.info(f"[{i}/{len(files_to_process)}] {filepath.name}")
            if enrich_with_google_maps(filepath, args.dry_run):
                success += 1
            time.sleep(GOOGLE_DELAY)  # stay under the Places API rate limit
        logger.info(f"\nGoogle Maps: {success}/{len(files_to_process)} enriched successfully")

    # Process Wikidata
    if do_wikidata:
        files_to_process = wikidata_files if args.limit is None else wikidata_files[:args.limit]
        logger.info(f"\n=== Processing {len(files_to_process)} files for Wikidata ===\n")
        success = 0
        for i, filepath in enumerate(files_to_process, 1):
            logger.info(f"[{i}/{len(files_to_process)}] {filepath.name}")
            if enrich_with_wikidata(filepath, args.dry_run):
                success += 1
            time.sleep(WIKIDATA_DELAY)  # be polite to the Wikidata API
        logger.info(f"\nWikidata: {success}/{len(files_to_process)} enriched successfully")


if __name__ == '__main__':
    main()