#!/usr/bin/env python3
"""
Enrich Heritage Custodian YAML files with YouTube and Google Maps data.

This script enriches custodian files in data/custodian/ with:
1. YouTube channel/video data (if channel can be found)
2. Google Maps/Places API data (address, ratings, reviews, photos)
3. GLM-4.6 verification of matches (CH-Annotator convention)

Usage:
    python scripts/enrich_custodian_youtube_maps.py [--dry-run] [--limit N] [--force]
    python scripts/enrich_custodian_youtube_maps.py --files FILE1.yaml FILE2.yaml
    python scripts/enrich_custodian_youtube_maps.py --pattern "ZA-*.yaml"

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required for Google Maps enrichment
    GOOGLE_YOUTUBE_TOKEN - Required for YouTube enrichment
    ZAI_API_TOKEN - Required for GLM-4.6 verification (optional but recommended)

Author: GLAM Data Extraction Project
Date: December 2025
"""
|
|
|
|
# --- Standard library ---
import argparse
import asyncio
import fnmatch
import json
import logging
import os
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# --- Third party ---
import httpx
import yaml

# Add project src to path so project-local modules can be imported.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))

# Load environment variables (API tokens) from the repo-root .env file.
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|
|
|
# ============================================================================
# Configuration
# ============================================================================

# Directory containing the custodian YAML files to enrich.
CUSTODIAN_DIR = PROJECT_ROOT / "data/custodian"

# API Keys
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")

# YouTube API keys - rotate through all available keys when quota exceeded.
# Order matters: keys are tried in listed order (v3/v4/v5 first).
YOUTUBE_API_KEYS = [
    os.getenv("GOOGLE_YOUTUBE_TOKEN_v3", ""),
    os.getenv("GOOGLE_YOUTUBE_TOKEN_v4", ""),
    os.getenv("GOOGLE_YOUTUBE_TOKEN_v5", ""),
    os.getenv("GOOGLE_YOUTUBE_TOKEN_v2", ""),
    os.getenv("GOOGLE_YOUTUBE_TOKEN", ""),
]
YOUTUBE_API_KEYS = [k for k in YOUTUBE_API_KEYS if k]  # Filter empty keys
# Index of the key currently in use; advanced by rotate_youtube_api_key().
CURRENT_YOUTUBE_KEY_INDEX = 0

# API Endpoints (defined early for use in helper functions)
YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3"
|
|
|
|
def get_youtube_api_key() -> str:
    """Return the YouTube API key currently in rotation, or "" if none configured."""
    if YOUTUBE_API_KEYS:
        # Modulo keeps the lookup valid no matter how far the index advanced.
        return YOUTUBE_API_KEYS[CURRENT_YOUTUBE_KEY_INDEX % len(YOUTUBE_API_KEYS)]
    return ""
|
|
|
|
def rotate_youtube_api_key() -> bool:
    """Advance to the next YouTube API key.

    Returns:
        True when another key is available; False once every key was tried.
    """
    global CURRENT_YOUTUBE_KEY_INDEX
    CURRENT_YOUTUBE_KEY_INDEX += 1
    total = len(YOUTUBE_API_KEYS)
    if CURRENT_YOUTUBE_KEY_INDEX < total:
        logger.warning(f"Rotating to YouTube API key {CURRENT_YOUTUBE_KEY_INDEX + 1}/{total}")
        return True
    logger.error(f"All {total} YouTube API keys exhausted!")
    return False
|
|
|
|
|
|
def youtube_api_request(
    client: httpx.Client,
    endpoint: str,
    params: Dict[str, Any],
    timeout: float = 30.0,
) -> Optional[Dict[str, Any]]:
    """Call a YouTube Data API v3 endpoint, rotating API keys on quota errors.

    Args:
        client: httpx Client instance.
        endpoint: API endpoint (e.g., "search", "channels", "playlistItems", "videos").
        params: Query parameters (key will be added automatically).
        timeout: Request timeout in seconds.

    Returns:
        JSON response dict, or None on a non-quota error or when no key is set.

    Raises:
        YouTubeQuotaExhaustedError: every configured key hit its quota.
    """
    url = f"{YOUTUBE_API_BASE}/{endpoint}"

    while True:
        api_key = get_youtube_api_key()
        if not api_key:
            logger.error("No YouTube API keys available")
            return None

        try:
            response = client.get(
                url, params={**params, "key": api_key}, timeout=timeout
            )
            response.raise_for_status()
            return response.json()

        except httpx.HTTPStatusError as e:
            error_text = str(e.response.text) if hasattr(e, 'response') else str(e)
            # Any 403, or an explicit quota/rate-limit marker, triggers rotation.
            is_quota_error = (
                e.response.status_code == 403
                or "quotaExceeded" in error_text
                or "rateLimitExceeded" in error_text
            )
            if not is_quota_error:
                logger.error(f"YouTube API error: {e}")
                return None
            logger.warning(f"YouTube API quota/rate limit hit for key {CURRENT_YOUTUBE_KEY_INDEX + 1}")
            if not rotate_youtube_api_key():
                # All keys exhausted — surface to the caller.
                raise YouTubeQuotaExhaustedError("All YouTube API keys exhausted")
            # Otherwise fall through and retry with the freshly rotated key.

        except Exception as e:
            logger.error(f"Error making YouTube API request to {endpoint}: {e}")
            return None
|
|
|
|
|
|
class YouTubeQuotaExhaustedError(Exception):
    """Signals that every configured YouTube API key has hit its quota."""
|
|
|
|
|
|
# For backwards compatibility (deprecated - use get_youtube_api_key())
GOOGLE_YOUTUBE_TOKEN = YOUTUBE_API_KEYS[0] if YOUTUBE_API_KEYS else ""

# Z.AI GLM 4.6 API for CH-Annotator verification (NOT Anthropic Claude)
ZAI_API_TOKEN = os.getenv("ZAI_API_TOKEN", "")

# API Endpoints
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
# Z.AI GLM 4.6 API endpoint (Anthropic-compatible interface)
ZAI_API_BASE = "https://api.z.ai/api/anthropic/v1"
ZAI_MODEL = "glm-4.6"

# Rate limiting
REQUEST_DELAY = 0.3  # seconds between API calls

# CH-Annotator convention version (recorded in enriched files for provenance)
CH_ANNOTATOR_VERSION = "ch_annotator-v1_7_0"
|
|
|
|
# CH-Annotator entity type definitions for heritage custodians
# From: data/entity_annotation/ch_annotator-v1_7_0.yaml
# Extended with GLAMORCUBESFIXPHDNT taxonomy subtypes
# Each entry maps a CH-Annotator code to its display name, definition,
# ontology class, one-letter GLAMORCUBESFIXPHDNT code, and the Google
# Places / Wikidata type strings used for cross-source matching.
CH_ANNOTATOR_ENTITY_TYPES = {
    # === HERITAGE INSTITUTION SUBTYPES (GRP.HER.*) ===
    "GRP.HER.MUS": {
        "code": "GRP.HER.MUS",
        "name": "Museum",
        "definition": "Museums of all types: art, history, science, natural history, etc.",
        "ontology_class": "schema:Museum",
        "glamorcubesfixphdnt_code": "M",
        "google_place_types": [
            "museum", "art_museum", "history_museum", "natural_history_museum",
            "science_museum", "children's_museum", "war_memorial",
            "tourist_attraction", "point_of_interest",
        ],
        "wikidata_types": ["museum", "art museum", "history museum", "science museum"],
    },
    "GRP.HER.GAL": {
        "code": "GRP.HER.GAL",
        "name": "Gallery",
        "definition": "Art galleries, exhibition spaces, and kunsthallen",
        "ontology_class": "schema:ArtGallery",
        "glamorcubesfixphdnt_code": "G",
        "google_place_types": [
            "art_gallery", "museum", "tourist_attraction", "point_of_interest",
        ],
        "wikidata_types": ["art gallery", "gallery", "kunsthalle"],
    },
    "GRP.HER.LIB": {
        "code": "GRP.HER.LIB",
        "name": "Library",
        "definition": "Libraries: public, academic, national, special",
        "ontology_class": "schema:Library",
        "glamorcubesfixphdnt_code": "L",
        "google_place_types": [
            "library", "public_library", "point_of_interest", "establishment",
        ],
        "wikidata_types": ["library", "public library", "national library", "academic library"],
    },
    "GRP.HER.ARC": {
        "code": "GRP.HER.ARC",
        "name": "Archive",
        "definition": "Archives: government, corporate, religious, personal",
        "ontology_class": "schema:ArchiveOrganization",
        "glamorcubesfixphdnt_code": "A",
        "google_place_types": [
            "archive", "government_office", "city_hall", "local_government_office",
            "point_of_interest", "establishment",
        ],
        "wikidata_types": ["archive", "national archive", "state archive", "city archive"],
    },
    # Catch-all heritage type used when no subtype can be determined.
    "GRP.HER": {
        "code": "GRP.HER",
        "name": "Heritage Institution (General)",
        "definition": "Heritage institutions: museums, archives, libraries, galleries (unspecified subtype)",
        "ontology_class": "glam:HeritageCustodian",
        "glamorcubesfixphdnt_code": "X",
        "close_mappings": ["schema:Museum", "schema:Library", "schema:ArchiveOrganization"],
        "google_place_types": [
            # Museums
            "museum", "art_gallery", "art_museum", "history_museum",
            "natural_history_museum", "science_museum", "war_memorial",
            # Libraries
            "library", "public_library", "research_library",
            # Archives
            "archive", "government_office", "city_hall", "local_government_office",
            # Cultural centers
            "cultural_center", "community_center", "performing_arts_theater",
            "tourist_attraction", "point_of_interest", "establishment",
        ],
        "wikidata_types": ["museum", "library", "archive", "cultural institution", "heritage institution"],
    },
    # === OTHER HERITAGE-ADJACENT TYPES ===
    "GRP.HER.RES": {
        "code": "GRP.HER.RES",
        "name": "Research Center",
        "definition": "Research institutes and documentation centers with heritage focus",
        "ontology_class": "schema:ResearchOrganization",
        "glamorcubesfixphdnt_code": "R",
        "google_place_types": [
            "research_institute", "university", "point_of_interest", "establishment",
        ],
        "wikidata_types": ["research institute", "documentation center", "research center"],
    },
    "GRP.HER.BOT": {
        "code": "GRP.HER.BOT",
        "name": "Botanical Garden / Zoo",
        "definition": "Botanical gardens and zoological parks",
        "ontology_class": "schema:Zoo",
        "glamorcubesfixphdnt_code": "B",
        "google_place_types": [
            "zoo", "aquarium", "park", "tourist_attraction", "point_of_interest",
        ],
        "wikidata_types": ["botanical garden", "zoo", "arboretum", "aquarium"],
    },
    "GRP.HER.HOL": {
        "code": "GRP.HER.HOL",
        "name": "Holy Site",
        "definition": "Religious heritage sites with collections (churches, temples, mosques)",
        "ontology_class": "schema:PlaceOfWorship",
        "glamorcubesfixphdnt_code": "H",
        "google_place_types": [
            "church", "mosque", "synagogue", "hindu_temple", "buddhist_temple",
            "place_of_worship", "tourist_attraction",
        ],
        "wikidata_types": ["church", "cathedral", "monastery", "abbey", "temple", "mosque", "synagogue"],
    },
    "GRP.HER.FEA": {
        "code": "GRP.HER.FEA",
        "name": "Heritage Feature",
        "definition": "Monuments, sculptures, memorials, landmarks",
        "ontology_class": "schema:LandmarksOrHistoricalBuildings",
        "glamorcubesfixphdnt_code": "F",
        "google_place_types": [
            "monument", "landmark", "historical_landmark", "tourist_attraction",
            "point_of_interest", "cultural_landmark",
        ],
        "wikidata_types": ["monument", "memorial", "statue", "sculpture", "landmark"],
    },
    # === NON-HERITAGE ORGANIZATION TYPES ===
    "GRP.EDU": {
        "code": "GRP.EDU",
        "name": "Educational Institution",
        "definition": "Universities, schools, and educational institutions",
        "ontology_class": "schema:EducationalOrganization",
        "glamorcubesfixphdnt_code": "E",
        "google_place_types": ["university", "school", "college", "educational_institution"],
        "wikidata_types": ["university", "school", "college", "academy"],
    },
    "GRP.GOV": {
        "code": "GRP.GOV",
        "name": "Government Organization",
        "definition": "Government agencies, legislatures, and public bodies",
        "ontology_class": "schema:GovernmentOrganization",
        "glamorcubesfixphdnt_code": "O",
        "google_place_types": ["government_office", "city_hall", "embassy", "courthouse"],
        "wikidata_types": ["government agency", "ministry", "department"],
    },
    "GRP.REL": {
        "code": "GRP.REL",
        "name": "Religious Organization",
        "definition": "Religious organizations, denominations, and congregations",
        "ontology_class": "schema:ReligiousOrganization",
        "glamorcubesfixphdnt_code": "H",
        "google_place_types": ["church", "mosque", "synagogue", "hindu_temple", "buddhist_temple"],
        "wikidata_types": ["religious organization", "church", "congregation"],
    },
    "GRP.COR": {
        "code": "GRP.COR",
        "name": "Corporation",
        "definition": "Commercial companies and businesses with heritage collections",
        "ontology_class": "schema:Corporation",
        "glamorcubesfixphdnt_code": "C",
        "google_place_types": ["corporate_office", "headquarters", "business", "establishment"],
        "wikidata_types": ["company", "corporation", "business"],
    },
}
|
|
|
|
# Mapping from GHCID type codes to CH-Annotator entity types.
# The single-letter code comes from the GHCID filename segment
# ({COUNTRY}-{REGION}-{CITY}-{TYPE}-{ABBREV}); letters without a dedicated
# CH-Annotator subtype fall back to the generic "GRP.HER".
GHCID_TYPE_TO_CH_ANNOTATOR = {
    "G": "GRP.HER.GAL",  # Gallery
    "L": "GRP.HER.LIB",  # Library
    "A": "GRP.HER.ARC",  # Archive
    "M": "GRP.HER.MUS",  # Museum
    "O": "GRP.GOV",  # Official institution
    "R": "GRP.HER.RES",  # Research center
    "C": "GRP.COR",  # Corporation
    "U": "GRP.HER",  # Unknown (defaults to general heritage)
    "B": "GRP.HER.BOT",  # Botanical garden / Zoo
    "E": "GRP.EDU",  # Education provider
    "S": "GRP.HER",  # Collecting society
    "F": "GRP.HER.FEA",  # Features (monuments)
    "I": "GRP.HER",  # Intangible heritage group
    "X": "GRP.HER",  # Mixed
    "P": "GRP.HER",  # Personal collection
    "H": "GRP.HER.HOL",  # Holy sites
    "D": "GRP.HER",  # Digital platform
    "N": "GRP.HER",  # NGO
    "T": "GRP.HER",  # Taste/smell heritage
}
|
|
|
|
# Google Places fields to request.
# Joined into the X-Goog-FieldMask header as "places.<field>" entries
# by search_google_place().
PLACE_FIELDS = [
    "id", "displayName", "formattedAddress", "addressComponents",
    "location", "types", "businessStatus", "internationalPhoneNumber",
    "nationalPhoneNumber", "regularOpeningHours", "currentOpeningHours",
    "websiteUri", "rating", "userRatingCount", "reviews", "priceLevel",
    "photos", "googleMapsUri", "utcOffsetMinutes", "primaryType",
    "primaryTypeDisplayName", "shortFormattedAddress", "editorialSummary",
]
|
|
|
|
# ============================================================================
|
|
# Utility Functions
|
|
# ============================================================================
|
|
|
|
def get_institution_name(entry: Dict[str, Any]) -> str:
    """Extract the best display/search name for a custodian entry.

    PRIORITY ORDER (emic/native names first for better YouTube matching):
    1. custodian_name.emic_name - Native language/script name (best for YouTube search)
    2. custodian_name.claim_value - Standardized name
    3. wikidata native label - Wikidata label in institution's country language
    4. original_entry fields - Source data
    5. enrichment data - Google Maps, ZCBS, etc.

    Returns "" when no name can be found.
    """
    # NOTE: `entry.get(key, {})` returns None (not {}) when the key exists
    # with a YAML null value, so every sub-dict is normalized with `or {}`.
    custodian_name = entry.get("custodian_name") or {}

    # PRIORITY 1: Emic name (native language/script) - BEST for YouTube search
    if custodian_name.get("emic_name"):
        return custodian_name["emic_name"]

    # PRIORITY 2: Standardized custodian name
    if custodian_name.get("claim_value"):
        return custodian_name["claim_value"]

    # PRIORITY 3: Wikidata label in native language (if available)
    # Try to get label in institution's language first
    wikidata = entry.get("wikidata_enrichment") or {}
    country_code = get_country_code(entry)
    # Map country codes to likely Wikidata language codes
    country_to_lang = {
        "JP": "wikidata_label_ja", "CN": "wikidata_label_zh", "KR": "wikidata_label_ko",
        "RU": "wikidata_label_ru", "UA": "wikidata_label_uk", "GR": "wikidata_label_el",
        "IL": "wikidata_label_he", "IR": "wikidata_label_fa", "SA": "wikidata_label_ar",
        "EG": "wikidata_label_ar", "TH": "wikidata_label_th", "VN": "wikidata_label_vi",
        "DE": "wikidata_label_de", "FR": "wikidata_label_fr", "ES": "wikidata_label_es",
        "IT": "wikidata_label_it", "PT": "wikidata_label_pt", "BR": "wikidata_label_pt",
        "NL": "wikidata_label_nl", "BE": "wikidata_label_nl", "PL": "wikidata_label_pl",
        "CZ": "wikidata_label_cs", "HU": "wikidata_label_hu", "RO": "wikidata_label_ro",
        "BG": "wikidata_label_bg", "RS": "wikidata_label_sr", "HR": "wikidata_label_hr",
        "SI": "wikidata_label_sl", "SK": "wikidata_label_sk", "TR": "wikidata_label_tr",
        "IN": "wikidata_label_hi", "ID": "wikidata_label_id", "MY": "wikidata_label_ms",
        "PH": "wikidata_label_tl", "SE": "wikidata_label_sv", "NO": "wikidata_label_no",
        "DK": "wikidata_label_da", "FI": "wikidata_label_fi", "EE": "wikidata_label_et",
        "LV": "wikidata_label_lv", "LT": "wikidata_label_lt",
    }
    native_label_key = country_to_lang.get(country_code)
    if native_label_key and wikidata.get(native_label_key):
        return wikidata[native_label_key]

    # Fall back to English Wikidata label
    if wikidata.get("wikidata_label_en"):
        return wikidata["wikidata_label_en"]

    # PRIORITY 4: Original entry fields
    original = entry.get("original_entry") or {}
    if original.get("name"):
        return original["name"]
    if original.get("organisatie"):
        return original["organisatie"]

    # PRIORITY 5: Enrichment data
    zcbs = entry.get("zcbs_enrichment") or {}
    if zcbs.get("zcbs_name"):
        return zcbs["zcbs_name"]
    maps = entry.get("google_maps_enrichment") or {}
    if maps.get("place_name"):
        return maps["place_name"]

    return ""
|
|
|
|
|
|
def get_country_code(entry: Dict[str, Any]) -> str:
    """Return the two-letter country code for an entry ("" if unknown).

    Prefers the resolved location's country_code; otherwise parses the
    leading segment of the GHCID ({COUNTRY}-{REGION}-{CITY}-{TYPE}-{ABBREV}).

    Sub-dicts are normalized with `or {}` because `entry.get(key, {})`
    returns None when the key holds a YAML null.
    """
    ghcid_block = entry.get("ghcid") or {}
    loc = ghcid_block.get("location_resolution") or {}
    if loc.get("country_code"):
        return loc["country_code"]

    # Parse from GHCID prefix
    ghcid = ghcid_block.get("ghcid_current") or ""
    if ghcid and "-" in ghcid:
        return ghcid.split("-")[0]
    return ""
|
|
|
|
|
|
def get_coordinates(entry: Dict[str, Any]) -> Optional[Tuple[float, float]]:
    """Return (latitude, longitude) from the entry's source coordinates, or None.

    Uses explicit `is not None` checks so that legitimate 0.0 values
    (equator / prime meridian) are not dropped by truthiness testing,
    and `or {}` guards against YAML-null sub-dicts.
    """
    loc = (entry.get("ghcid") or {}).get("location_resolution") or {}
    src = loc.get("source_coordinates") or {}
    lat = src.get("latitude")
    lng = src.get("longitude")
    if lat is not None and lng is not None:
        return (lat, lng)
    return None
|
|
|
|
|
|
def get_city_name(entry: Dict[str, Any]) -> str:
    """Return the resolved city name for an entry ("" if unresolved).

    `or {}` guards against YAML-null values for ghcid / location_resolution.
    """
    loc = (entry.get("ghcid") or {}).get("location_resolution") or {}
    return loc.get("city_name", "")
|
|
|
|
|
|
def get_wikidata_id(entry: Dict[str, Any]) -> str:
    """Return the Wikidata entity ID for an entry ("" if none recorded).

    Checks the wikidata_enrichment section first, then the original source
    entry. `or {}` guards against YAML-null sub-dicts.
    """
    wikidata = entry.get("wikidata_enrichment") or {}
    if wikidata.get("wikidata_entity_id"):
        return wikidata["wikidata_entity_id"]
    original = entry.get("original_entry") or {}
    if original.get("wikidata_id"):
        return original["wikidata_id"]
    return ""
|
|
|
|
|
|
# ============================================================================
|
|
# Google Maps Enrichment
|
|
# ============================================================================
|
|
|
|
def build_maps_search_query(entry: Dict[str, Any]) -> str:
    """Compose a "name, city, country" free-text query for Places text search.

    Empty components are skipped, so the result may be a subset of the
    three parts (or "" when nothing is known about the entry).
    """
    loc = entry.get("ghcid", {}).get("location_resolution", {})
    candidates = [
        get_institution_name(entry),
        get_city_name(entry),
        loc.get("country_label", ""),
    ]
    return ", ".join(part for part in candidates if part)
|
|
|
|
|
|
def search_google_place(
    query: str,
    client: httpx.Client,
    country_code: str = "",
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Look up one place via the Google Places API (New) text-search endpoint.

    Args:
        query: Free-text search string (name, city, country).
        client: Shared httpx client.
        country_code: Two-letter code; ZA and ZW get explicit region hints.
        location_bias: Optional (lat, lng) centre for a 50 km search bias.

    Returns:
        The single best-matching place dict, or None on no match / error.
    """
    if not GOOGLE_PLACES_TOKEN:
        logger.warning("GOOGLE_PLACES_TOKEN not set, skipping Maps enrichment")
        return None

    field_mask = ",".join(f"places.{f}" for f in PLACE_FIELDS)
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        "X-Goog-FieldMask": field_mask,
    }

    body: Dict[str, Any] = {
        "textQuery": query,
        "maxResultCount": 1,
    }

    # Explicit language/region hints for the countries this pipeline targets.
    if country_code in ("ZA", "ZW"):
        body["languageCode"] = "en"
        body["regionCode"] = country_code

    # Bias results towards known coordinates when available.
    if location_bias:
        lat, lng = location_bias
        body["locationBias"] = {
            "circle": {
                "center": {"latitude": lat, "longitude": lng},
                "radius": 50000.0  # 50km radius
            }
        }

    try:
        response = client.post(TEXT_SEARCH_URL, headers=headers, json=body)
        response.raise_for_status()
        places = response.json().get("places", [])
        if not places:
            logger.warning(f"No place found for: {query}")
            return None
        return places[0]

    except httpx.HTTPStatusError as e:
        # Prefer the structured error message from the API body when present.
        try:
            error_data = e.response.json()
        except Exception:
            error_data = {}
        error_msg = error_data.get("error", {}).get("message", str(e))
        logger.error(f"Google Places API error: {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching for '{query}': {e}")
        return None
|
|
|
|
|
|
def parse_google_place(place: Dict[str, Any]) -> Dict[str, Any]:
    """Parse a Google Places API (New) place object into the enrichment schema.

    Only fields present in the response are copied into the result; the
    result always contains place_id, name, fetch_timestamp and api_status.
    """
    result = {
        "place_id": place.get("id", ""),
        "name": place.get("displayName", {}).get("text", ""),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_status": "OK",
    }

    # Location — compare against None so valid 0.0 coordinates
    # (equator / prime meridian) are not dropped by truthiness testing.
    location = place.get("location", {})
    lat = location.get("latitude")
    lng = location.get("longitude")
    if lat is not None and lng is not None:
        result["coordinates"] = {
            "latitude": lat,
            "longitude": lng,
        }

    if place.get("formattedAddress"):
        result["formatted_address"] = place["formattedAddress"]
    if place.get("shortFormattedAddress"):
        result["short_address"] = place["shortFormattedAddress"]

    # Contact
    if place.get("nationalPhoneNumber"):
        result["phone_local"] = place["nationalPhoneNumber"]
    if place.get("internationalPhoneNumber"):
        result["phone_international"] = place["internationalPhoneNumber"]
    if place.get("websiteUri"):
        result["website"] = place["websiteUri"]

    # Business info
    if place.get("types"):
        result["google_place_types"] = place["types"]
    if place.get("primaryType"):
        result["primary_type"] = place["primaryType"]
    if place.get("businessStatus"):
        result["business_status"] = place["businessStatus"]

    # Ratings and reviews (rating 0 is conceivable, hence explicit None checks)
    if place.get("rating") is not None:
        result["rating"] = place["rating"]
    if place.get("userRatingCount") is not None:
        result["total_ratings"] = place["userRatingCount"]

    # Parse reviews
    reviews = place.get("reviews", [])
    if reviews:
        result["reviews"] = [
            {
                "author_name": r.get("authorAttribution", {}).get("displayName"),
                "author_uri": r.get("authorAttribution", {}).get("uri"),
                "rating": r.get("rating"),
                "relative_time_description": r.get("relativePublishTimeDescription"),
                "text": r.get("text", {}).get("text"),
                "publish_time": r.get("publishTime"),
            }
            for r in reviews
        ]

    # Opening hours
    if place.get("regularOpeningHours"):
        result["opening_hours"] = {
            "open_now": place.get("currentOpeningHours", {}).get("openNow"),
            "weekday_text": place["regularOpeningHours"].get("weekdayDescriptions"),
        }

    # Editorial summary
    if place.get("editorialSummary"):
        result["editorial_summary"] = place["editorialSummary"].get("text")

    # Photos (just references, not downloading)
    photos = place.get("photos", [])
    if photos:
        result["photo_count"] = len(photos)
        result["photos_metadata"] = [
            {
                "name": p.get("name"),
                "height": p.get("heightPx"),
                "width": p.get("widthPx"),
            }
            for p in photos[:5]  # First 5 only
        ]

    # Links
    if place.get("googleMapsUri"):
        result["google_maps_url"] = place["googleMapsUri"]

    return result
|
|
|
|
|
|
# ============================================================================
|
|
# YouTube Enrichment
|
|
# ============================================================================
|
|
|
|
def search_youtube_channel(
    query: str,
    client: httpx.Client,
) -> Optional[Dict[str, Any]]:
    """Search YouTube for candidate channels matching a query.

    Returns {"candidates": [...], "query": query} with up to three search
    hits (for downstream LLM verification), or None when nothing matched
    or no API key is available. Propagates YouTubeQuotaExhaustedError.
    """
    if not get_youtube_api_key():
        logger.warning("No YouTube API keys available, skipping YouTube enrichment")
        return None

    search_params = {
        "part": "snippet",
        "type": "channel",
        "q": query,
        "maxResults": 3,  # Get top 3 for verification
    }

    try:
        data = youtube_api_request(client, "search", search_params)
    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error searching YouTube for '{query}': {e}")
        return None

    if not data:
        return None
    items = data.get("items", [])
    return {"candidates": items, "query": query} if items else None
|
|
|
|
|
|
def get_youtube_channel_details(
    channel_id: str,
    client: httpx.Client,
) -> Optional[Dict[str, Any]]:
    """Fetch full channel metadata (snippet, statistics, branding, contentDetails).

    Returns the first matching channel item, or None on error / no match.
    Propagates YouTubeQuotaExhaustedError.
    """
    if not get_youtube_api_key():
        return None

    request_params = {
        "part": "snippet,statistics,brandingSettings,contentDetails",
        "id": channel_id,
    }

    try:
        data = youtube_api_request(client, "channels", request_params)
    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error getting channel details for '{channel_id}': {e}")
        return None

    if data is None:
        return None
    items = data.get("items", [])
    return items[0] if items else None
|
|
|
|
|
|
def parse_youtube_channel(channel: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten a YouTube channels.list item into the enrichment schema."""
    snippet = channel.get("snippet", {})
    stats = channel.get("statistics", {})
    branding = channel.get("brandingSettings", {})  # fetched but not yet used

    channel_id = channel.get("id", "")
    result = {
        "channel_id": channel_id,
        "channel_url": f"https://www.youtube.com/channel/{channel_id}",
        "title": snippet.get("title", ""),
        "description": snippet.get("description", ""),
        "custom_url": snippet.get("customUrl", ""),
        "published_at": snippet.get("publishedAt", ""),
        "country": snippet.get("country", ""),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
    }

    # Statistics arrive as strings; convert the ones that are present.
    for api_key_name, out_key in (
        ("subscriberCount", "subscriber_count"),
        ("videoCount", "video_count"),
        ("viewCount", "view_count"),
    ):
        if stats.get(api_key_name):
            result[out_key] = int(stats[api_key_name])

    # High-resolution thumbnail, when provided.
    high_thumb = snippet.get("thumbnails", {}).get("high", {}).get("url")
    if high_thumb:
        result["thumbnail_url"] = high_thumb

    return result
|
|
|
|
|
|
def get_uploads_playlist_id(channel_data: Dict[str, Any]) -> Optional[str]:
    """Return the channel's uploads playlist ID, or None if absent.

    The uploads playlist lists every public video of the channel; its ID is
    the channel ID with the leading "UC" replaced by "UU". The channel data
    must have been fetched with the contentDetails part.

    Args:
        channel_data: Raw channels.list item.

    Returns:
        Uploads playlist ID string, or None when not present.
    """
    playlists = channel_data.get("contentDetails", {}).get("relatedPlaylists", {})
    return playlists.get("uploads")
|
|
|
|
|
|
def get_playlist_videos(
    playlist_id: str,
    client: httpx.Client,
    max_results: int = 50,
) -> List[str]:
    """Return up to max_results video IDs from a playlist (one API page).

    Args:
        playlist_id: YouTube playlist ID (e.g., uploads playlist).
        client: httpx Client instance.
        max_results: Cap on videos fetched; the API allows at most 50 per page.

    Returns:
        List of video IDs ([] on error). Propagates YouTubeQuotaExhaustedError.
    """
    if not get_youtube_api_key():
        return []

    request_params = {
        "part": "contentDetails",
        "playlistId": playlist_id,
        "maxResults": min(max_results, 50),  # API page-size ceiling
    }

    try:
        data = youtube_api_request(client, "playlistItems", request_params)
        if data is None:
            return []

        collected: List[str] = []
        for playlist_item in data.get("items", []):
            vid = playlist_item.get("contentDetails", {}).get("videoId")
            if vid:
                collected.append(vid)
        return collected

    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error getting playlist videos for '{playlist_id}': {e}")
        return []
|
|
|
|
|
|
def get_video_details(
    video_ids: List[str],
    client: httpx.Client,
) -> List[Dict[str, Any]]:
    """Fetch snippet/contentDetails/statistics metadata for up to 50 videos.

    Args:
        video_ids: YouTube video IDs (only the first 50 are requested).
        client: httpx Client instance.

    Returns:
        List of parsed video metadata dicts ([] on error).
        Propagates YouTubeQuotaExhaustedError.
    """
    if not video_ids or not get_youtube_api_key():
        return []

    # The videos endpoint accepts comma-separated IDs, capped at 50 per call.
    request_params = {
        "part": "snippet,contentDetails,statistics",
        "id": ",".join(video_ids[:50]),
    }

    def _stat_as_int(stats: Dict[str, Any], key: str) -> int:
        # Statistics arrive as strings and may be missing entirely.
        raw = stats.get(key)
        return int(raw) if raw else 0

    try:
        data = youtube_api_request(client, "videos", request_params)
        if data is None:
            return []

        parsed: List[Dict[str, Any]] = []
        for item in data.get("items", []):
            vid = item.get("id", "")
            snippet = item.get("snippet", {})
            details = item.get("contentDetails", {})
            stats = item.get("statistics", {})

            record = {
                "video_id": vid,
                "video_url": f"https://www.youtube.com/watch?v={vid}",
                "title": snippet.get("title", ""),
                "description": snippet.get("description", ""),
                "published_at": snippet.get("publishedAt", ""),
                "duration": details.get("duration", ""),
                "view_count": _stat_as_int(stats, "viewCount"),
                "like_count": _stat_as_int(stats, "likeCount"),
                "comment_count": _stat_as_int(stats, "commentCount"),
                "comments": [],  # Placeholder for future comment fetching
            }

            # Pick the best available thumbnail, highest quality first.
            thumbnails = snippet.get("thumbnails", {})
            for quality in ("maxres", "high", "medium", "default"):
                url = thumbnails.get(quality, {}).get("url")
                if url:
                    record["thumbnail_url"] = url
                    break

            parsed.append(record)

        return parsed

    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error getting video details for {len(video_ids)} videos: {e}")
        return []
|
|
|
|
|
|
def fetch_channel_videos(
    channel_data: Dict[str, Any],
    client: httpx.Client,
    max_videos: int = 50,
) -> List[Dict[str, Any]]:
    """Fetch metadata for a channel's uploaded videos.

    Pipeline: uploads-playlist ID -> playlist video IDs -> per-video details.

    Args:
        channel_data: Raw channel API response (must include contentDetails part).
        client: httpx Client instance.
        max_videos: Maximum number of videos to fetch (default 50).

    Returns:
        List of parsed video metadata dictionaries (possibly empty).
    """
    playlist_id = get_uploads_playlist_id(channel_data)
    if not playlist_id:
        logger.warning("No uploads playlist found for channel")
        return []

    ids = get_playlist_videos(playlist_id, client, max_videos)
    if not ids:
        logger.info("No videos found in uploads playlist")
        return []
    logger.info(f"Found {len(ids)} videos in uploads playlist")

    details = get_video_details(ids, client)
    logger.info(f"Fetched details for {len(details)} videos")
    return details
|
|
|
|
|
|
# ============================================================================
# Z.AI GLM 4.6 Verification with Exponential Backoff (CH-Annotator)
# ============================================================================

# Exponential-backoff settings for GLM 4.6 calls: the delay doubles per
# attempt (BASE_DELAY * 2**attempt) and is capped at MAX_DELAY.
MAX_RETRIES = 3
BASE_DELAY = 1.0  # seconds
MAX_DELAY = 30.0  # seconds
|
|
|
|
|
|
async def call_glm_with_retry(
    prompt: str,
    max_retries: int = MAX_RETRIES,
) -> Optional[str]:
    """
    Call Z.AI GLM 4.6 API with exponential backoff retry.

    Uses the Anthropic-compatible interface at api.z.ai. Only HTTP 429
    (rate limit) responses are retried; any other HTTP error or exception
    aborts immediately.

    Args:
        prompt: User prompt, sent as a single-message conversation.
        max_retries: Maximum number of attempts (default MAX_RETRIES).

    Returns:
        Response content string (possibly empty if the reply had no text
        block), or None if the call failed or all retries were exhausted.
    """
    headers = {
        "x-api-key": ZAI_API_TOKEN,
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
    }

    body = {
        "model": ZAI_MODEL,
        "max_tokens": 500,
        "messages": [
            {"role": "user", "content": prompt}
        ],
    }

    # FIX: reuse a single AsyncClient (and its connection pool) across all
    # attempts instead of constructing a new one per retry.
    async with httpx.AsyncClient(timeout=60.0) as client:
        for attempt in range(max_retries):
            try:
                response = await client.post(
                    f"{ZAI_API_BASE}/messages",
                    headers=headers,
                    json=body,
                )
                response.raise_for_status()
                data = response.json()

                # Anthropic-compatible response format: a list of content
                # blocks; we only consume a leading text block.
                content_blocks = data.get("content", [])
                if content_blocks and content_blocks[0].get("type") == "text":
                    return content_blocks[0].get("text", "")
                return ""

            except httpx.HTTPStatusError as e:
                if e.response.status_code != 429:
                    logger.error(f"GLM 4.6 API error: {e}")
                    return None
                if attempt + 1 >= max_retries:
                    # FIX: last attempt was also rate-limited — give up now
                    # instead of sleeping (up to MAX_DELAY) for nothing.
                    break
                # Rate limited - exponential backoff before the next attempt.
                delay = min(BASE_DELAY * (2 ** attempt), MAX_DELAY)
                logger.warning(f"Rate limited, waiting {delay:.1f}s (attempt {attempt + 1}/{max_retries})")
                await asyncio.sleep(delay)
            except Exception as e:
                # Network/parse failures are not retried (same as original).
                logger.error(f"GLM 4.6 API call failed: {e}")
                return None

    logger.error(f"All {max_retries} GLM 4.6 API retries exhausted")
    return None
|
|
|
|
|
|
def extract_ghcid_type_code(filepath: Path) -> str:
    """
    Extract the institution type code from a GHCID filename.

    GHCID format: {COUNTRY}-{REGION}-{CITY}-{TYPE}-{ABBREV}.yaml
    Example: NL-NH-AMS-M-RM.yaml → "M" (Museum)

    Args:
        filepath: Path to custodian YAML file

    Returns:
        Single-letter type code (G, L, A, M, O, R, C, U, B, E, S, F, I, X, P, H, D, N, T)
        or empty string if it cannot be extracted
    """
    # Drop the .yaml extension and split on the GHCID separator.
    segments = filepath.stem.split("-")

    # A well-formed GHCID has at least five hyphen-separated segments:
    # COUNTRY-REGION-CITY-TYPE-ABBREV. The type code is the fourth one.
    if len(segments) < 5:
        return ""

    code = segments[3]
    # The type code must be exactly one alphabetic character.
    return code.upper() if len(code) == 1 and code.isalpha() else ""
|
|
|
|
|
|
def get_expected_entity_type(
    institution_type: Any = None,
    filepath: Optional[Path] = None,
) -> Dict[str, Any]:
    """
    Resolve the CH-Annotator entity type for a custodian institution.

    PRIMARY: the GHCID type code embedded in the filename (most reliable).
    FALLBACK: keyword matching against the Wikidata instance_of text.

    Args:
        institution_type: Wikidata instance_of value (fallback only)
        filepath: Path to custodian file (primary source)

    Returns:
        CH-Annotator entity type definition dict
    """
    # PRIMARY: GHCID type code from the filename.
    if filepath:
        ghcid_code = extract_ghcid_type_code(filepath)
        ch_code = GHCID_TYPE_TO_CH_ANNOTATOR.get(ghcid_code) if ghcid_code else None
        if ch_code and ch_code in CH_ANNOTATOR_ENTITY_TYPES:
            return CH_ANNOTATOR_ENTITY_TYPES[ch_code]

    # FALLBACK: Wikidata instance_of text matching.
    # Wikidata can return several instance_of values; flatten them first.
    if isinstance(institution_type, list):
        institution_type = " ".join(str(t) for t in institution_type)

    text = str(institution_type).lower() if institution_type else ""

    # (keywords, CH-Annotator code) pairs, checked in priority order —
    # the first rule whose keywords appear in the text wins.
    keyword_rules = (
        (("museum", "gallery", "kunsthall"), "GRP.HER.MUS"),
        (("archive", "archiv", "archief"), "GRP.HER.ARC"),
        (("library", "bibliothek", "bibliotheek", "biblioteca"), "GRP.HER.LIB"),
        (("university", "college", "school", "academy"), "GRP.EDU"),
        (("church", "mosque", "temple", "synagogue", "cathedral"), "GRP.REL"),
        (("government", "ministry", "department"), "GRP.GOV"),
        (("botanical", "zoo", "aquarium", "arboretum"), "GRP.HER.BOT"),
        (("monument", "memorial", "statue", "landmark"), "GRP.HER.FEA"),
    )
    for keywords, code in keyword_rules:
        if any(term in text for term in keywords):
            return CH_ANNOTATOR_ENTITY_TYPES[code]

    # Default to the general heritage institution type for custodian files.
    return CH_ANNOTATOR_ENTITY_TYPES["GRP.HER"]
|
|
|
|
|
|
async def verify_match_with_llm(
    institution_name: str,
    institution_info: Dict[str, Any],
    candidate_name: str,
    candidate_info: Dict[str, Any],
    match_type: str,  # "google_maps" or "youtube"
    filepath: Optional[Path] = None,
) -> Dict[str, Any]:
    """
    Use Z.AI GLM 4.6 to verify if a candidate match is correct.

    Uses CH-Annotator v1.7.0 entity type definitions for validation.
    Expected entity type derived from GHCID type code in filename.

    Args:
        institution_name: Name of the heritage custodian institution
        institution_info: Dict with wikidata_id, city, country, type
        candidate_name: Name from Google Maps or YouTube
        candidate_info: Dict with place/channel details
        match_type: "google_maps" or "youtube" (any other value is treated
            as "youtube" by the else branch below)
        filepath: Path to custodian YAML file (for GHCID type extraction)

    Returns:
        Dict with keys:
        - is_match: bool (or None when verification was skipped/failed)
        - confidence: float (0.0-1.0)
        - reasoning: str
        - agent: str (model version)
        - verified: bool (False when the LLM was never consulted)
        - entity_type: str (CH-Annotator entity type code, when the LLM
          returned parseable JSON)
    """
    # No API key: return a neutral, unverified verdict rather than failing.
    if not ZAI_API_TOKEN:
        logger.warning("ZAI_API_TOKEN not set, skipping LLM verification")
        return {
            "is_match": None,
            "confidence": 0.5,
            "reasoning": "LLM verification skipped - no API key",
            "agent": "none",
            "verified": False,
        }

    # Get expected CH-Annotator entity type (PRIMARY: from GHCID, FALLBACK: from Wikidata)
    expected_entity = get_expected_entity_type(
        institution_type=institution_info.get('type', ''),
        filepath=filepath,
    )
    expected_place_types = expected_entity.get("google_place_types", [])

    # Build verification prompt
    if match_type == "google_maps":
        prompt = f"""You are an entity annotator following CH-Annotator v1.7.0 convention.

TASK: Verify if a Google Maps place matches a heritage custodian institution.

== CH-ANNOTATOR ENTITY TYPE ==
Expected Type: {expected_entity['code']} ({expected_entity['name']})
Definition: {expected_entity['definition']}
Ontology Class: {expected_entity['ontology_class']}
Expected Google Place Types: {', '.join(expected_place_types[:10])}

== SOURCE INSTITUTION (GRP.HER) ==
- Name: {institution_name}
- Wikidata ID: {institution_info.get('wikidata_id', 'N/A')}
- City (TOP.SET): {institution_info.get('city', 'N/A')}
- Country (TOP.CTY): {institution_info.get('country', 'N/A')}
- Instance Type: {institution_info.get('type', 'N/A')}

== GOOGLE MAPS CANDIDATE ==
- Name: {candidate_name}
- Address (TOP.ADR): {candidate_info.get('formatted_address', 'N/A')}
- Google Place Types: {candidate_info.get('google_place_types', 'N/A')}
- Website: {candidate_info.get('website', 'N/A')}
- Business Status: {candidate_info.get('business_status', 'N/A')}

== VERIFICATION CRITERIA ==
1. NAME MATCH: Do the names refer to the same institution? (Allow translations, abbreviations, acronyms)
2. LOCATION MATCH: Is the address in the same city/country?
3. TYPE MATCH: Does Google Place type match expected heritage types (museum, library, archive, gallery)?
4. ENTITY TYPE: Is this truly a {expected_entity['code']} ({expected_entity['name']})?

REJECT if:
- Different institution with similar name
- Google Place types indicate non-heritage (restaurant, hotel, shop)
- Location mismatch (different city/country)
- Name is a person, not an institution

Respond ONLY with JSON (no explanation outside JSON):
{{"is_match": true/false, "confidence": 0.0-1.0, "entity_type": "{expected_entity['code']}", "reasoning": "..."}}
"""
    else:  # youtube
        prompt = f"""You are an entity annotator following CH-Annotator v1.7.0 convention.

TASK: Verify if a YouTube channel is the official channel of a heritage custodian institution.

== CH-ANNOTATOR ENTITY TYPE ==
Expected Type: {expected_entity['code']} ({expected_entity['name']})
Definition: {expected_entity['definition']}
Ontology Class: {expected_entity['ontology_class']}

== SOURCE INSTITUTION (GRP.HER) ==
- Name: {institution_name}
- Wikidata ID: {institution_info.get('wikidata_id', 'N/A')}
- City (TOP.SET): {institution_info.get('city', 'N/A')}
- Country (TOP.CTY): {institution_info.get('country', 'N/A')}
- Instance Type: {institution_info.get('type', 'N/A')}

== YOUTUBE CHANNEL CANDIDATE ==
- Channel Title: {candidate_name}
- Description: {candidate_info.get('description', 'N/A')[:500]}
- Country: {candidate_info.get('country', 'N/A')}
- Subscribers: {candidate_info.get('subscriber_count', 'N/A')}
- Video Count: {candidate_info.get('video_count', 'N/A')}

== VERIFICATION CRITERIA ==
1. NAME MATCH: Does channel name match institution? (Allow abbreviations, acronyms)
2. DESCRIPTION: Does description mention heritage, culture, museum, archive, library?
3. CONTENT: Is this likely an official institutional channel (not fan-made, personal)?
4. ENTITY TYPE: Is this truly a {expected_entity['code']} ({expected_entity['name']})?

REJECT if:
- Channel is personal/fan-made (not official)
- Description indicates unrelated content (gaming, personal vlogs)
- Different institution with similar name
- Channel is for a different city/country

Respond ONLY with JSON (no explanation outside JSON):
{{"is_match": true/false, "confidence": 0.0-1.0, "entity_type": "{expected_entity['code']}", "reasoning": "..."}}
"""

    # Call GLM 4.6 API with retry
    content = await call_glm_with_retry(prompt)

    if content is None:
        # API failed after retries: neutral verdict, flagged unverified.
        return {
            "is_match": None,
            "confidence": 0.5,
            "reasoning": "LLM verification failed - API error",
            "agent": ZAI_MODEL,
            "verified": False,
        }

    # Parse JSON response
    try:
        # Extract the first {...} span from the response.
        # NOTE(review): [^}]+ stops at the first '}' and cannot handle a
        # brace inside the "reasoning" string — such replies fall through
        # to the keyword heuristic below. Acceptable for flat JSON replies.
        json_match = re.search(r'\{[^}]+\}', content, re.DOTALL)
        if json_match:
            result = json.loads(json_match.group())
            result["agent"] = ZAI_MODEL
            result["verified"] = True
            result["ch_annotator_version"] = CH_ANNOTATOR_VERSION
            return result
    except json.JSONDecodeError:
        pass

    # Fallback if JSON parsing fails: crude keyword heuristic.
    # NOTE(review): a reply containing both "true" and "false" (e.g. the
    # schema echoed back) is treated as a non-match by design here.
    is_match = "true" in content.lower() and "false" not in content.lower()
    return {
        "is_match": is_match,
        "confidence": 0.7 if is_match else 0.3,
        "reasoning": content[:200],
        "agent": ZAI_MODEL,
        "verified": True,
        "ch_annotator_version": CH_ANNOTATOR_VERSION,
    }
|
|
|
|
|
|
# ============================================================================
|
|
# Main Enrichment Pipeline
|
|
# ============================================================================
|
|
|
|
async def enrich_custodian_file(
    filepath: Path,
    client: httpx.Client,
    force: bool = False,
    dry_run: bool = False,
    youtube_only: bool = False,
    maps_only: bool = False,
) -> Tuple[bool, str]:
    """
    Enrich a single custodian YAML file with YouTube and Google Maps data.

    Args:
        filepath: Custodian YAML file to read and (unless dry_run) rewrite.
        client: Shared synchronous httpx client for Maps/YouTube requests.
        force: Re-enrich even if enrichment/status fields already exist.
        dry_run: Perform lookups and mutate the in-memory entry, but never
            write the file back.
        youtube_only: Skip the Google Maps step entirely.
        maps_only: Skip the YouTube step entirely.

    Returns:
        Tuple of (modified: bool, status: str)
    """
    logger.info(f"Processing: {filepath.name}")

    # Load YAML
    with open(filepath, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    if not entry:
        return False, "Empty file"

    modified = False
    statuses = []  # human-readable summary fragments, joined for the return value

    # Check if already enriched (including rejections/not found - we've already tried)
    has_maps = entry.get("google_maps_enrichment") is not None or entry.get("google_maps_status") is not None
    has_youtube = entry.get("youtube_enrichment") is not None or entry.get("youtube_status") is not None

    # Determine what needs enrichment based on flags
    skip_maps = youtube_only or (has_maps and not force)
    skip_youtube = maps_only or (has_youtube and not force)

    if skip_maps and skip_youtube:
        return False, "Already enriched (use --force to re-enrich)"

    # Extract info for matching
    institution_name = get_institution_name(entry)
    if not institution_name:
        return False, "No institution name found"

    country_code = get_country_code(entry)
    city_name = get_city_name(entry)
    coords = get_coordinates(entry)
    wikidata_id = get_wikidata_id(entry)

    # Context handed to the LLM verifier for both match types.
    institution_info = {
        "wikidata_id": wikidata_id,
        "city": city_name,
        "country": country_code,
        "type": entry.get("wikidata_enrichment", {}).get("instance_of", ""),
    }

    logger.info(f" Institution: {institution_name}")
    logger.info(f" Location: {city_name}, {country_code}")

    # -------------------------------------------------------------------------
    # Google Maps Enrichment
    # -------------------------------------------------------------------------
    if not skip_maps:
        query = build_maps_search_query(entry)
        logger.info(f" Maps query: {query}")

        # NOTE(review): time.sleep blocks the event loop inside this async
        # function; harmless while files are processed sequentially, but
        # would need asyncio.sleep if this were ever run concurrently.
        time.sleep(REQUEST_DELAY)
        place = search_google_place(query, client, country_code, coords)

        if place:
            maps_data = parse_google_place(place)
            candidate_name = maps_data.get("name", "")
            logger.info(f" Maps found: {candidate_name}")

            # LLM verification (uses GHCID type code from filepath)
            verification = await verify_match_with_llm(
                institution_name,
                institution_info,
                candidate_name,
                maps_data,
                "google_maps",
                filepath=filepath,
            )

            # Three-way outcome: True = accept, False = reject, None = unverified.
            if verification.get("is_match") is True:
                maps_data["llm_verification"] = verification
                entry["google_maps_enrichment"] = maps_data
                entry["google_maps_status"] = "SUCCESS"
                modified = True
                statuses.append(f"Maps: {candidate_name} (conf: {verification.get('confidence', 0):.2f})")
                logger.info(f" ✓ Maps verified: {verification.get('reasoning', '')[:60]}")
            elif verification.get("is_match") is False:
                # Record the rejection so future runs skip this file without --force.
                entry["google_maps_status"] = "NO_MATCH"
                entry["google_maps_rejected"] = {
                    "candidate_name": candidate_name,
                    "rejection_reason": verification.get("reasoning", ""),
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                }
                modified = True
                statuses.append("Maps: rejected by LLM")
                logger.info(f" ✗ Maps rejected: {verification.get('reasoning', '')[:60]}")
            else:
                # Verification skipped or failed - include with warning
                maps_data["llm_verification"] = verification
                entry["google_maps_enrichment"] = maps_data
                entry["google_maps_status"] = "UNVERIFIED"
                modified = True
                statuses.append(f"Maps: {candidate_name} (unverified)")
        else:
            # No place returned at all; remember the query for debugging.
            entry["google_maps_status"] = "NOT_FOUND"
            entry["google_maps_search_query"] = query
            entry["google_maps_search_timestamp"] = datetime.now(timezone.utc).isoformat()
            modified = True
            statuses.append("Maps: not found")

    # -------------------------------------------------------------------------
    # YouTube Enrichment
    # -------------------------------------------------------------------------
    if not skip_youtube:
        # Build YouTube search query
        youtube_query = f"{institution_name} official"
        logger.info(f" YouTube query: {youtube_query}")

        time.sleep(REQUEST_DELAY)
        search_result = search_youtube_channel(youtube_query, client)

        if search_result and search_result.get("candidates"):
            candidates = search_result["candidates"]
            logger.info(f" YouTube candidates: {len(candidates)}")

            # Try each candidate; keep the verified match with highest confidence.
            best_match = None
            best_verification = None

            for candidate in candidates[:3]:  # Top 3 candidates
                channel_id = candidate.get("id", {}).get("channelId")
                if not channel_id:
                    continue

                # Get full channel details
                time.sleep(REQUEST_DELAY)
                channel_details = get_youtube_channel_details(channel_id, client)

                if not channel_details:
                    continue

                youtube_data = parse_youtube_channel(channel_details)

                # Fetch individual video metadata
                videos = fetch_channel_videos(channel_details, client, max_videos=50)
                if videos:
                    youtube_data["videos"] = videos
                    logger.info(f" Fetched {len(videos)} videos for channel")
                else:
                    youtube_data["videos"] = []

                candidate_name = youtube_data.get("title", "")

                # LLM verification (uses GHCID type code from filepath)
                verification = await verify_match_with_llm(
                    institution_name,
                    institution_info,
                    candidate_name,
                    youtube_data,
                    "youtube",
                    filepath=filepath,
                )

                if verification.get("is_match") is True:
                    if best_verification is None or verification.get("confidence", 0) > best_verification.get("confidence", 0):
                        best_match = youtube_data
                        best_verification = verification
                    logger.info(f" YouTube match: {candidate_name} (conf: {verification.get('confidence', 0):.2f})")

            if best_match:
                best_match["llm_verification"] = best_verification
                entry["youtube_enrichment"] = best_match
                entry["youtube_status"] = "SUCCESS"
                modified = True
                statuses.append(f"YouTube: {best_match.get('title', '')} ({best_match.get('subscriber_count', 0)} subs)")
            else:
                # Candidates existed but none passed LLM verification.
                entry["youtube_status"] = "NO_MATCH"
                entry["youtube_search_query"] = youtube_query
                entry["youtube_candidates_rejected"] = len(candidates)
                entry["youtube_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                modified = True
                statuses.append("YouTube: no verified match")
        else:
            # Search returned nothing at all.
            entry["youtube_status"] = "NOT_FOUND"
            entry["youtube_search_query"] = youtube_query
            entry["youtube_search_timestamp"] = datetime.now(timezone.utc).isoformat()
            modified = True
            statuses.append("YouTube: not found")

    # -------------------------------------------------------------------------
    # Add provenance note
    # -------------------------------------------------------------------------
    if modified:
        if "provenance" not in entry:
            entry["provenance"] = {}

        # Handle notes field - can be string, list, or missing
        existing_notes = entry["provenance"].get("notes")
        if existing_notes is None:
            entry["provenance"]["notes"] = []
        elif isinstance(existing_notes, str):
            # Convert string notes to list
            entry["provenance"]["notes"] = [existing_notes]
        # else: it's already a list

        timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        entry["provenance"]["notes"].append(
            f"YouTube/Google Maps enrichment {timestamp}: {'; '.join(statuses)}"
        )

    # -------------------------------------------------------------------------
    # Save file
    # -------------------------------------------------------------------------
    if modified and not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        logger.info(f" Saved: {filepath.name}")

    status = "; ".join(statuses) if statuses else "No changes"
    return modified, status
|
|
|
|
|
|
async def main():
    """Main entry point: parse CLI arguments, select custodian files, and
    enrich them sequentially, summarizing results at the end."""
    parser = argparse.ArgumentParser(
        description="Enrich custodian files with YouTube and Google Maps data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-enrich even if already enriched"
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of files to process"
    )
    parser.add_argument(
        "--files",
        nargs="+",
        help="Specific files to process (just filenames)"
    )
    parser.add_argument(
        "--pattern",
        type=str,
        default=None,
        help="Glob pattern for files (e.g., 'ZA-*.yaml')"
    )
    parser.add_argument(
        "--youtube-only",
        action="store_true",
        help="Only enrich YouTube data (skip Google Maps)"
    )
    parser.add_argument(
        "--maps-only",
        action="store_true",
        help="Only enrich Google Maps data (skip YouTube)"
    )

    args = parser.parse_args()

    # Check for required API keys
    if not GOOGLE_PLACES_TOKEN and not get_youtube_api_key():
        logger.error("No API keys found! Set GOOGLE_PLACES_TOKEN or GOOGLE_YOUTUBE_TOKEN")
        sys.exit(1)

    # Find files to process: explicit list > glob pattern > all YAML files.
    if args.files:
        files = [CUSTODIAN_DIR / f for f in args.files]
        # Silently drop names that don't exist on disk.
        files = [f for f in files if f.exists()]
    elif args.pattern:
        files = sorted(CUSTODIAN_DIR.glob(args.pattern))
    else:
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))

    # NOTE(review): truthiness check means --limit 0 is treated as "no limit".
    if args.limit:
        files = files[:args.limit]

    logger.info(f"Found {len(files)} files to process")

    if args.dry_run:
        logger.info("DRY RUN - no files will be modified")

    if args.youtube_only:
        logger.info("YOUTUBE-ONLY mode - skipping Google Maps enrichment")
    elif args.maps_only:
        logger.info("MAPS-ONLY mode - skipping YouTube enrichment")

    # Process files sequentially, sharing one HTTP client.
    results = {"modified": 0, "skipped": 0, "errors": 0}

    with httpx.Client(timeout=60.0) as client:
        for filepath in files:
            try:
                modified, status = await enrich_custodian_file(
                    filepath, client, args.force, args.dry_run,
                    youtube_only=args.youtube_only,
                    maps_only=args.maps_only,
                )
                if modified:
                    results["modified"] += 1
                else:
                    results["skipped"] += 1
                logger.info(f" Status: {status}")
            except YouTubeQuotaExhaustedError:
                # Quota exhaustion is terminal: stop instead of burning
                # through the remaining files with guaranteed failures.
                logger.error("=" * 60)
                logger.error("ALL YOUTUBE API KEYS EXHAUSTED - stopping enrichment")
                logger.error("=" * 60)
                break  # Exit the loop gracefully
            except Exception as e:
                # Per-file errors are logged and counted; processing continues.
                logger.error(f"Error processing {filepath.name}: {e}")
                results["errors"] += 1

            # Rate limiting between files
            time.sleep(REQUEST_DELAY)

    # Summary
    logger.info("=" * 60)
    logger.info(f"SUMMARY: {results['modified']} modified, {results['skipped']} skipped, {results['errors']} errors")
|
|
|
|
|
|
# Script entry point: drive the async pipeline with a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|