# glam/scripts/enrich_custodian_youtube_maps.py
# (file-listing metadata: 1574 lines, 58 KiB, Python)
#!/usr/bin/env python3
"""
Enrich Heritage Custodian YAML files with YouTube and Google Maps data.
This script enriches custodian files in data/custodian/ with:
1. YouTube channel/video data (if channel can be found)
2. Google Maps/Places API data (address, ratings, reviews, photos)
3. GLM-4.6 verification of matches (CH-Annotator convention)
Usage:
python scripts/enrich_custodian_youtube_maps.py [--dry-run] [--limit N] [--force]
python scripts/enrich_custodian_youtube_maps.py --files FILE1.yaml FILE2.yaml
python scripts/enrich_custodian_youtube_maps.py --pattern "ZA-*.yaml"
Environment Variables:
GOOGLE_PLACES_TOKEN - Required for Google Maps enrichment
GOOGLE_YOUTUBE_TOKEN - Required for YouTube enrichment
ZAI_API_TOKEN - Required for GLM-4.6 verification (optional but recommended)
Author: GLAM Data Extraction Project
Date: December 2025
"""
import argparse
import asyncio
import fnmatch
import json
import logging
import os
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import httpx
import yaml
# Add project src to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))
# Load environment variables
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# ============================================================================
# Configuration
# ============================================================================
CUSTODIAN_DIR = PROJECT_ROOT / "data/custodian"
# API Keys
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
# YouTube API keys - rotate through all available keys when quota exceeded
YOUTUBE_API_KEYS = [
os.getenv("GOOGLE_YOUTUBE_TOKEN_v3", ""),
os.getenv("GOOGLE_YOUTUBE_TOKEN_v4", ""),
os.getenv("GOOGLE_YOUTUBE_TOKEN_v5", ""),
os.getenv("GOOGLE_YOUTUBE_TOKEN_v2", ""),
os.getenv("GOOGLE_YOUTUBE_TOKEN", ""),
]
YOUTUBE_API_KEYS = [k for k in YOUTUBE_API_KEYS if k] # Filter empty keys
CURRENT_YOUTUBE_KEY_INDEX = 0
# API Endpoints (defined early for use in helper functions)
YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3"
def get_youtube_api_key() -> str:
    """Return the YouTube API key currently selected by the rotation index.

    Returns "" when no keys are configured. The modulo keeps the index valid
    even if it has been advanced past the end of the key list.
    """
    global CURRENT_YOUTUBE_KEY_INDEX
    if YOUTUBE_API_KEYS:
        return YOUTUBE_API_KEYS[CURRENT_YOUTUBE_KEY_INDEX % len(YOUTUBE_API_KEYS)]
    return ""
def rotate_youtube_api_key() -> bool:
    """Advance the rotation to the next YouTube API key.

    Returns:
        False once every configured key has been consumed, True otherwise.
    """
    global CURRENT_YOUTUBE_KEY_INDEX
    CURRENT_YOUTUBE_KEY_INDEX += 1
    total_keys = len(YOUTUBE_API_KEYS)
    if CURRENT_YOUTUBE_KEY_INDEX >= total_keys:
        logger.error(f"All {total_keys} YouTube API keys exhausted!")
        return False
    logger.warning(f"Rotating to YouTube API key {CURRENT_YOUTUBE_KEY_INDEX + 1}/{total_keys}")
    return True
def youtube_api_request(
    client: httpx.Client,
    endpoint: str,
    params: Dict[str, Any],
    timeout: float = 30.0,
) -> Optional[Dict[str, Any]]:
    """Call a YouTube Data API endpoint, rotating API keys on quota errors.

    Args:
        client: httpx Client instance
        endpoint: API endpoint name (e.g. "search", "channels", "playlistItems", "videos")
        params: query parameters; the active API key is appended automatically
        timeout: per-request timeout in seconds

    Returns:
        Parsed JSON response dict, or None when no keys are configured or a
        non-quota error occurs.

    Raises:
        YouTubeQuotaExhaustedError: when every configured key hits its quota.
    """
    url = f"{YOUTUBE_API_BASE}/{endpoint}"
    while True:
        api_key = get_youtube_api_key()
        if not api_key:
            logger.error("No YouTube API keys available")
            return None
        try:
            response = client.get(url, params={**params, "key": api_key}, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except httpx.HTTPStatusError as e:
            error_text = str(e.response.text) if hasattr(e, 'response') else str(e)
            # A 403 (or an explicit quota/rate-limit message) means this key is
            # spent; any other HTTP failure is reported and aborts the request.
            quota_hit = (
                e.response.status_code == 403
                or "quotaExceeded" in error_text
                or "rateLimitExceeded" in error_text
            )
            if not quota_hit:
                logger.error(f"YouTube API error: {e}")
                return None
            logger.warning(f"YouTube API quota/rate limit hit for key {CURRENT_YOUTUBE_KEY_INDEX + 1}")
            if not rotate_youtube_api_key():
                # Every key is over quota; surface this to the caller.
                raise YouTubeQuotaExhaustedError("All YouTube API keys exhausted")
            # Otherwise loop again with the freshly rotated key.
        except Exception as e:
            logger.error(f"Error making YouTube API request to {endpoint}: {e}")
            return None
class YouTubeQuotaExhaustedError(Exception):
    """Raised when all YouTube API keys are exhausted."""
# For backwards compatibility (deprecated - use get_youtube_api_key())
GOOGLE_YOUTUBE_TOKEN = YOUTUBE_API_KEYS[0] if YOUTUBE_API_KEYS else ""
# Z.AI GLM 4.6 API for CH-Annotator verification (NOT Anthropic Claude)
ZAI_API_TOKEN = os.getenv("ZAI_API_TOKEN", "")
# API Endpoints
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
# Z.AI GLM 4.6 API endpoint (Anthropic-compatible interface)
ZAI_API_BASE = "https://api.z.ai/api/anthropic/v1"
ZAI_MODEL = "glm-4.6"
# Rate limiting
REQUEST_DELAY = 0.3 # seconds between API calls
# CH-Annotator convention version
CH_ANNOTATOR_VERSION = "ch_annotator-v1_7_0"
# CH-Annotator entity type definitions for heritage custodians
# From: data/entity_annotation/ch_annotator-v1_7_0.yaml
# Extended with GLAMORCUBESFIXPHDNT taxonomy subtypes
CH_ANNOTATOR_ENTITY_TYPES = {
# === HERITAGE INSTITUTION SUBTYPES (GRP.HER.*) ===
"GRP.HER.MUS": {
"code": "GRP.HER.MUS",
"name": "Museum",
"definition": "Museums of all types: art, history, science, natural history, etc.",
"ontology_class": "schema:Museum",
"glamorcubesfixphdnt_code": "M",
"google_place_types": [
"museum", "art_museum", "history_museum", "natural_history_museum",
"science_museum", "children's_museum", "war_memorial",
"tourist_attraction", "point_of_interest",
],
"wikidata_types": ["museum", "art museum", "history museum", "science museum"],
},
"GRP.HER.GAL": {
"code": "GRP.HER.GAL",
"name": "Gallery",
"definition": "Art galleries, exhibition spaces, and kunsthallen",
"ontology_class": "schema:ArtGallery",
"glamorcubesfixphdnt_code": "G",
"google_place_types": [
"art_gallery", "museum", "tourist_attraction", "point_of_interest",
],
"wikidata_types": ["art gallery", "gallery", "kunsthalle"],
},
"GRP.HER.LIB": {
"code": "GRP.HER.LIB",
"name": "Library",
"definition": "Libraries: public, academic, national, special",
"ontology_class": "schema:Library",
"glamorcubesfixphdnt_code": "L",
"google_place_types": [
"library", "public_library", "point_of_interest", "establishment",
],
"wikidata_types": ["library", "public library", "national library", "academic library"],
},
"GRP.HER.ARC": {
"code": "GRP.HER.ARC",
"name": "Archive",
"definition": "Archives: government, corporate, religious, personal",
"ontology_class": "schema:ArchiveOrganization",
"glamorcubesfixphdnt_code": "A",
"google_place_types": [
"archive", "government_office", "city_hall", "local_government_office",
"point_of_interest", "establishment",
],
"wikidata_types": ["archive", "national archive", "state archive", "city archive"],
},
"GRP.HER": {
"code": "GRP.HER",
"name": "Heritage Institution (General)",
"definition": "Heritage institutions: museums, archives, libraries, galleries (unspecified subtype)",
"ontology_class": "glam:HeritageCustodian",
"glamorcubesfixphdnt_code": "X",
"close_mappings": ["schema:Museum", "schema:Library", "schema:ArchiveOrganization"],
"google_place_types": [
# Museums
"museum", "art_gallery", "art_museum", "history_museum",
"natural_history_museum", "science_museum", "war_memorial",
# Libraries
"library", "public_library", "research_library",
# Archives
"archive", "government_office", "city_hall", "local_government_office",
# Cultural centers
"cultural_center", "community_center", "performing_arts_theater",
"tourist_attraction", "point_of_interest", "establishment",
],
"wikidata_types": ["museum", "library", "archive", "cultural institution", "heritage institution"],
},
# === OTHER HERITAGE-ADJACENT TYPES ===
"GRP.HER.RES": {
"code": "GRP.HER.RES",
"name": "Research Center",
"definition": "Research institutes and documentation centers with heritage focus",
"ontology_class": "schema:ResearchOrganization",
"glamorcubesfixphdnt_code": "R",
"google_place_types": [
"research_institute", "university", "point_of_interest", "establishment",
],
"wikidata_types": ["research institute", "documentation center", "research center"],
},
"GRP.HER.BOT": {
"code": "GRP.HER.BOT",
"name": "Botanical Garden / Zoo",
"definition": "Botanical gardens and zoological parks",
"ontology_class": "schema:Zoo",
"glamorcubesfixphdnt_code": "B",
"google_place_types": [
"zoo", "aquarium", "park", "tourist_attraction", "point_of_interest",
],
"wikidata_types": ["botanical garden", "zoo", "arboretum", "aquarium"],
},
"GRP.HER.HOL": {
"code": "GRP.HER.HOL",
"name": "Holy Site",
"definition": "Religious heritage sites with collections (churches, temples, mosques)",
"ontology_class": "schema:PlaceOfWorship",
"glamorcubesfixphdnt_code": "H",
"google_place_types": [
"church", "mosque", "synagogue", "hindu_temple", "buddhist_temple",
"place_of_worship", "tourist_attraction",
],
"wikidata_types": ["church", "cathedral", "monastery", "abbey", "temple", "mosque", "synagogue"],
},
"GRP.HER.FEA": {
"code": "GRP.HER.FEA",
"name": "Heritage Feature",
"definition": "Monuments, sculptures, memorials, landmarks",
"ontology_class": "schema:LandmarksOrHistoricalBuildings",
"glamorcubesfixphdnt_code": "F",
"google_place_types": [
"monument", "landmark", "historical_landmark", "tourist_attraction",
"point_of_interest", "cultural_landmark",
],
"wikidata_types": ["monument", "memorial", "statue", "sculpture", "landmark"],
},
# === NON-HERITAGE ORGANIZATION TYPES ===
"GRP.EDU": {
"code": "GRP.EDU",
"name": "Educational Institution",
"definition": "Universities, schools, and educational institutions",
"ontology_class": "schema:EducationalOrganization",
"glamorcubesfixphdnt_code": "E",
"google_place_types": ["university", "school", "college", "educational_institution"],
"wikidata_types": ["university", "school", "college", "academy"],
},
"GRP.GOV": {
"code": "GRP.GOV",
"name": "Government Organization",
"definition": "Government agencies, legislatures, and public bodies",
"ontology_class": "schema:GovernmentOrganization",
"glamorcubesfixphdnt_code": "O",
"google_place_types": ["government_office", "city_hall", "embassy", "courthouse"],
"wikidata_types": ["government agency", "ministry", "department"],
},
"GRP.REL": {
"code": "GRP.REL",
"name": "Religious Organization",
"definition": "Religious organizations, denominations, and congregations",
"ontology_class": "schema:ReligiousOrganization",
"glamorcubesfixphdnt_code": "H",
"google_place_types": ["church", "mosque", "synagogue", "hindu_temple", "buddhist_temple"],
"wikidata_types": ["religious organization", "church", "congregation"],
},
"GRP.COR": {
"code": "GRP.COR",
"name": "Corporation",
"definition": "Commercial companies and businesses with heritage collections",
"ontology_class": "schema:Corporation",
"glamorcubesfixphdnt_code": "C",
"google_place_types": ["corporate_office", "headquarters", "business", "establishment"],
"wikidata_types": ["company", "corporation", "business"],
},
}
# Mapping from GHCID type codes to CH-Annotator entity types
GHCID_TYPE_TO_CH_ANNOTATOR = {
"G": "GRP.HER.GAL", # Gallery
"L": "GRP.HER.LIB", # Library
"A": "GRP.HER.ARC", # Archive
"M": "GRP.HER.MUS", # Museum
"O": "GRP.GOV", # Official institution
"R": "GRP.HER.RES", # Research center
"C": "GRP.COR", # Corporation
"U": "GRP.HER", # Unknown (defaults to general heritage)
"B": "GRP.HER.BOT", # Botanical garden / Zoo
"E": "GRP.EDU", # Education provider
"S": "GRP.HER", # Collecting society
"F": "GRP.HER.FEA", # Features (monuments)
"I": "GRP.HER", # Intangible heritage group
"X": "GRP.HER", # Mixed
"P": "GRP.HER", # Personal collection
"H": "GRP.HER.HOL", # Holy sites
"D": "GRP.HER", # Digital platform
"N": "GRP.HER", # NGO
"T": "GRP.HER", # Taste/smell heritage
}
# Google Places fields to request
PLACE_FIELDS = [
"id", "displayName", "formattedAddress", "addressComponents",
"location", "types", "businessStatus", "internationalPhoneNumber",
"nationalPhoneNumber", "regularOpeningHours", "currentOpeningHours",
"websiteUri", "rating", "userRatingCount", "reviews", "priceLevel",
"photos", "googleMapsUri", "utcOffsetMinutes", "primaryType",
"primaryTypeDisplayName", "shortFormattedAddress", "editorialSummary",
]
# ============================================================================
# Utility Functions
# ============================================================================
def get_institution_name(entry: Dict[str, Any]) -> str:
    """Pick the best available institution name from a custodian entry.

    Preference order (native names first — they match YouTube channels better):
      1. custodian_name.emic_name    - native language/script name
      2. custodian_name.claim_value  - standardized name
      3. Wikidata label in the institution's country language, then English
      4. original_entry fields       - raw source data
      5. enrichment data             - ZCBS, Google Maps
    """
    # 1-2: explicit custodian names, native form first.
    custodian = entry.get("custodian_name", {})
    for name_key in ("emic_name", "claim_value"):
        if custodian.get(name_key):
            return custodian[name_key]
    # 3: Wikidata label, preferring the institution's own language.
    wikidata = entry.get("wikidata_enrichment", {})
    country_code = get_country_code(entry)
    # Country code -> most likely Wikidata label language field.
    country_to_lang = {
        "JP": "wikidata_label_ja", "CN": "wikidata_label_zh", "KR": "wikidata_label_ko",
        "RU": "wikidata_label_ru", "UA": "wikidata_label_uk", "GR": "wikidata_label_el",
        "IL": "wikidata_label_he", "IR": "wikidata_label_fa", "SA": "wikidata_label_ar",
        "EG": "wikidata_label_ar", "TH": "wikidata_label_th", "VN": "wikidata_label_vi",
        "DE": "wikidata_label_de", "FR": "wikidata_label_fr", "ES": "wikidata_label_es",
        "IT": "wikidata_label_it", "PT": "wikidata_label_pt", "BR": "wikidata_label_pt",
        "NL": "wikidata_label_nl", "BE": "wikidata_label_nl", "PL": "wikidata_label_pl",
        "CZ": "wikidata_label_cs", "HU": "wikidata_label_hu", "RO": "wikidata_label_ro",
        "BG": "wikidata_label_bg", "RS": "wikidata_label_sr", "HR": "wikidata_label_hr",
        "SI": "wikidata_label_sl", "SK": "wikidata_label_sk", "TR": "wikidata_label_tr",
        "IN": "wikidata_label_hi", "ID": "wikidata_label_id", "MY": "wikidata_label_ms",
        "PH": "wikidata_label_tl", "SE": "wikidata_label_sv", "NO": "wikidata_label_no",
        "DK": "wikidata_label_da", "FI": "wikidata_label_fi", "EE": "wikidata_label_et",
        "LV": "wikidata_label_lv", "LT": "wikidata_label_lt",
    }
    native_label_key = country_to_lang.get(country_code)
    if native_label_key and wikidata.get(native_label_key):
        return wikidata[native_label_key]
    if wikidata.get("wikidata_label_en"):
        return wikidata["wikidata_label_en"]
    # 4: raw source-data fields.
    original = entry.get("original_entry", {})
    for source_key in ("name", "organisatie"):
        if original.get(source_key):
            return original[source_key]
    # 5: names recovered by earlier enrichment passes.
    if entry.get("zcbs_enrichment", {}).get("zcbs_name"):
        return entry["zcbs_enrichment"]["zcbs_name"]
    if entry.get("google_maps_enrichment", {}).get("place_name"):
        return entry["google_maps_enrichment"]["place_name"]
    return ""
def get_country_code(entry: Dict[str, Any]) -> str:
    """Return the ISO country code for *entry*, or "" if unavailable.

    Prefers the resolved location data; falls back to the country prefix of
    the GHCID identifier (e.g. "ZA-WC-CPT-M-X" -> "ZA").
    """
    location = entry.get("ghcid", {}).get("location_resolution", {})
    resolved_code = location.get("country_code")
    if resolved_code:
        return resolved_code
    ghcid_value = entry.get("ghcid", {}).get("ghcid_current", "")
    if "-" in ghcid_value:
        return ghcid_value.split("-", 1)[0]
    return ""
def get_coordinates(entry: Dict[str, Any]) -> Optional[Tuple[float, float]]:
    """Extract (latitude, longitude) from entry if available.

    Returns:
        A (latitude, longitude) tuple, or None when either value is missing.
    """
    loc = entry.get("ghcid", {}).get("location_resolution", {})
    src = loc.get("source_coordinates", {})
    lat = src.get("latitude")
    lng = src.get("longitude")
    # Use explicit None checks: 0.0 is a valid coordinate (equator / prime
    # meridian) but is falsy, so a plain truthiness test would drop it.
    if lat is not None and lng is not None:
        return (lat, lng)
    return None
def get_city_name(entry: Dict[str, Any]) -> str:
    """Return the resolved city name for *entry*, or "" when absent."""
    ghcid_info = entry.get("ghcid", {})
    return ghcid_info.get("location_resolution", {}).get("city_name", "")
def get_wikidata_id(entry: Dict[str, Any]) -> str:
    """Return the Wikidata QID for *entry*, preferring enrichment over source data."""
    for section, key in (
        ("wikidata_enrichment", "wikidata_entity_id"),
        ("original_entry", "wikidata_id"),
    ):
        value = entry.get(section, {}).get(key)
        if value:
            return value
    return ""
# ============================================================================
# Google Maps Enrichment
# ============================================================================
def build_maps_search_query(entry: Dict[str, Any]) -> str:
    """Compose a "name, city, country" text query for Places text search.

    Empty components are skipped, so partial entries still produce a query.
    """
    location = entry.get("ghcid", {}).get("location_resolution", {})
    components = [
        get_institution_name(entry),
        get_city_name(entry),
        location.get("country_label", ""),
    ]
    return ", ".join(part for part in components if part)
def search_google_place(
    query: str,
    client: httpx.Client,
    country_code: str = "",
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Search the Google Places API (New) for *query*.

    Args:
        query: free-text search string ("name, city, country")
        client: httpx Client instance
        country_code: ISO country code used to pin language/region (ZA/ZW only)
        location_bias: optional (lat, lng) to bias results within a 50km circle

    Returns:
        The first matching place resource, or None on no match / error /
        missing GOOGLE_PLACES_TOKEN.
    """
    if not GOOGLE_PLACES_TOKEN:
        logger.warning("GOOGLE_PLACES_TOKEN not set, skipping Maps enrichment")
        return None
    field_mask = ",".join(f"places.{field}" for field in PLACE_FIELDS)
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        "X-Goog-FieldMask": field_mask,
    }
    body: Dict[str, Any] = {"textQuery": query, "maxResultCount": 1}
    # Pin language/region for the countries this pipeline targets.
    if country_code in ("ZA", "ZW"):
        body["languageCode"] = "en"
        body["regionCode"] = country_code
    if location_bias is not None:
        latitude, longitude = location_bias
        body["locationBias"] = {
            "circle": {
                "center": {"latitude": latitude, "longitude": longitude},
                "radius": 50000.0,  # 50km radius
            }
        }
    try:
        response = client.post(TEXT_SEARCH_URL, headers=headers, json=body)
        response.raise_for_status()
        places = response.json().get("places", [])
        if not places:
            logger.warning(f"No place found for: {query}")
            return None
        return places[0]
    except httpx.HTTPStatusError as e:
        # Prefer the structured API error message when the body is JSON.
        try:
            error_data = e.response.json()
        except Exception:
            error_data = {}
        error_msg = error_data.get("error", {}).get("message", str(e))
        logger.error(f"Google Places API error: {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching for '{query}': {e}")
        return None
def parse_google_place(place: Dict[str, Any]) -> Dict[str, Any]:
    """Parse a Google Places API (New) place resource into an enrichment dict.

    Only keys present in the API response are emitted (besides the always-set
    place_id / name / fetch_timestamp / api_status), so absent data stays
    absent in the output rather than appearing as null.
    """
    result = {
        "place_id": place.get("id", ""),
        "name": place.get("displayName", {}).get("text", ""),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_status": "OK",
    }
    # Location. Use explicit None checks: a latitude/longitude of 0.0 is a
    # valid coordinate (equator / prime meridian) but is falsy, so a plain
    # truthiness test would silently drop it.
    location = place.get("location", {})
    if location.get("latitude") is not None and location.get("longitude") is not None:
        result["coordinates"] = {
            "latitude": location["latitude"],
            "longitude": location["longitude"],
        }
    if place.get("formattedAddress"):
        result["formatted_address"] = place["formattedAddress"]
    if place.get("shortFormattedAddress"):
        result["short_address"] = place["shortFormattedAddress"]
    # Contact
    if place.get("nationalPhoneNumber"):
        result["phone_local"] = place["nationalPhoneNumber"]
    if place.get("internationalPhoneNumber"):
        result["phone_international"] = place["internationalPhoneNumber"]
    if place.get("websiteUri"):
        result["website"] = place["websiteUri"]
    # Business info
    if place.get("types"):
        result["google_place_types"] = place["types"]
    if place.get("primaryType"):
        result["primary_type"] = place["primaryType"]
    if place.get("businessStatus"):
        result["business_status"] = place["businessStatus"]
    # Ratings and reviews ("is not None" keeps a legitimate 0 rating/count)
    if place.get("rating") is not None:
        result["rating"] = place["rating"]
    if place.get("userRatingCount") is not None:
        result["total_ratings"] = place["userRatingCount"]
    reviews = place.get("reviews", [])
    if reviews:
        result["reviews"] = [
            {
                "author_name": r.get("authorAttribution", {}).get("displayName"),
                "author_uri": r.get("authorAttribution", {}).get("uri"),
                "rating": r.get("rating"),
                "relative_time_description": r.get("relativePublishTimeDescription"),
                "text": r.get("text", {}).get("text"),
                "publish_time": r.get("publishTime"),
            }
            for r in reviews
        ]
    # Opening hours
    if place.get("regularOpeningHours"):
        result["opening_hours"] = {
            "open_now": place.get("currentOpeningHours", {}).get("openNow"),
            "weekday_text": place["regularOpeningHours"].get("weekdayDescriptions"),
        }
    # Editorial summary
    if place.get("editorialSummary"):
        result["editorial_summary"] = place["editorialSummary"].get("text")
    # Photos (just references, not downloading)
    photos = place.get("photos", [])
    if photos:
        result["photo_count"] = len(photos)
        result["photos_metadata"] = [
            {
                "name": p.get("name"),
                "height": p.get("heightPx"),
                "width": p.get("widthPx"),
            }
            for p in photos[:5]  # First 5 only
        ]
    # Links
    if place.get("googleMapsUri"):
        result["google_maps_url"] = place["googleMapsUri"]
    return result
# ============================================================================
# YouTube Enrichment
# ============================================================================
def search_youtube_channel(
    query: str,
    client: httpx.Client,
) -> Optional[Dict[str, Any]]:
    """Search YouTube for channels matching *query* (keys rotate automatically).

    Returns:
        {"candidates": [...], "query": query} with up to 3 channel search
        results for downstream LLM verification, or None on no match / error.

    Raises:
        YouTubeQuotaExhaustedError: propagated when every API key is spent.
    """
    if not get_youtube_api_key():
        logger.warning("No YouTube API keys available, skipping YouTube enrichment")
        return None
    search_params = {
        "part": "snippet",
        "type": "channel",
        "q": query,
        "maxResults": 3,  # Get top 3 for verification
    }
    try:
        data = youtube_api_request(client, "search", search_params)
        items = (data or {}).get("items", [])
        if not items:
            return None
        # Hand every candidate to the LLM verifier rather than picking here.
        return {"candidates": items, "query": query}
    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error searching YouTube for '{query}': {e}")
        return None
def get_youtube_channel_details(
    channel_id: str,
    client: httpx.Client,
) -> Optional[Dict[str, Any]]:
    """Fetch the full channel resource for *channel_id* (keys rotate automatically).

    Requests snippet, statistics, brandingSettings and contentDetails so the
    uploads playlist can be resolved later.

    Raises:
        YouTubeQuotaExhaustedError: propagated when every API key is spent.
    """
    if not get_youtube_api_key():
        return None
    detail_params = {
        "part": "snippet,statistics,brandingSettings,contentDetails",
        "id": channel_id,
    }
    try:
        data = youtube_api_request(client, "channels", detail_params)
        items = (data or {}).get("items", [])
        return items[0] if items else None
    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error getting channel details for '{channel_id}': {e}")
        return None
def parse_youtube_channel(channel: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten a YouTube channels.list item into an enrichment dict."""
    snippet = channel.get("snippet", {})
    statistics = channel.get("statistics", {})
    channel_id = channel.get("id", "")
    parsed = {
        "channel_id": channel_id,
        "channel_url": f"https://www.youtube.com/channel/{channel_id}",
        "title": snippet.get("title", ""),
        "description": snippet.get("description", ""),
        "custom_url": snippet.get("customUrl", ""),
        "published_at": snippet.get("publishedAt", ""),
        "country": snippet.get("country", ""),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
    }
    # Numeric statistics arrive as strings; include only the ones present.
    for api_field, out_field in (
        ("subscriberCount", "subscriber_count"),
        ("videoCount", "video_count"),
        ("viewCount", "view_count"),
    ):
        if statistics.get(api_field):
            parsed[out_field] = int(statistics[api_field])
    high_thumbnail = snippet.get("thumbnails", {}).get("high", {}).get("url")
    if high_thumbnail:
        parsed["thumbnail_url"] = high_thumbnail
    return parsed
def get_uploads_playlist_id(channel_data: Dict[str, Any]) -> Optional[str]:
    """Return the channel's uploads playlist ID, or None if absent.

    The uploads playlist contains all public videos from the channel
    (ID format: "UU" + channel ID without its "UC" prefix). The channel
    resource must have been fetched with the contentDetails part.
    """
    return (
        channel_data.get("contentDetails", {})
        .get("relatedPlaylists", {})
        .get("uploads")
    )
def get_playlist_videos(
    playlist_id: str,
    client: httpx.Client,
    max_results: int = 50,
) -> List[str]:
    """Return up to *max_results* video IDs from one page of a playlist.

    Args:
        playlist_id: YouTube playlist ID (e.g. a channel's uploads playlist)
        client: httpx Client instance
        max_results: cap on IDs fetched (the API caps one page at 50)

    Raises:
        YouTubeQuotaExhaustedError: propagated when every API key is spent.
    """
    if not get_youtube_api_key():
        return []
    page_params = {
        "part": "contentDetails",
        "playlistId": playlist_id,
        "maxResults": min(max_results, 50),
    }
    try:
        data = youtube_api_request(client, "playlistItems", page_params)
        if data is None:
            return []
        return [
            item["contentDetails"]["videoId"]
            for item in data.get("items", [])
            if item.get("contentDetails", {}).get("videoId")
        ]
    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error getting playlist videos for '{playlist_id}': {e}")
        return []
def get_video_details(
    video_ids: List[str],
    client: httpx.Client,
) -> List[Dict[str, Any]]:
    """Fetch snippet/contentDetails/statistics for up to 50 videos in one call.

    Args:
        video_ids: YouTube video IDs (only the first 50 are sent — API limit)
        client: httpx Client instance

    Returns:
        Parsed per-video metadata dicts (possibly empty on error).

    Raises:
        YouTubeQuotaExhaustedError: propagated when every API key is spent.
    """
    if not video_ids or not get_youtube_api_key():
        return []
    batch_params = {
        "part": "snippet,contentDetails,statistics",
        "id": ",".join(video_ids[:50]),  # API accepts max 50 comma-separated IDs
    }
    try:
        data = youtube_api_request(client, "videos", batch_params)
        if data is None:
            return []
        parsed_videos = []
        for item in data.get("items", []):
            vid = item.get("id", "")
            snippet = item.get("snippet", {})
            details = item.get("contentDetails", {})
            statistics = item.get("statistics", {})
            entry = {
                "video_id": vid,
                "video_url": f"https://www.youtube.com/watch?v={vid}",
                "title": snippet.get("title", ""),
                "description": snippet.get("description", ""),
                "published_at": snippet.get("publishedAt", ""),
                "duration": details.get("duration", ""),
                "view_count": int(statistics.get("viewCount", 0)) if statistics.get("viewCount") else 0,
                "like_count": int(statistics.get("likeCount", 0)) if statistics.get("likeCount") else 0,
                "comment_count": int(statistics.get("commentCount", 0)) if statistics.get("commentCount") else 0,
                "comments": [],  # Placeholder for future comment fetching
            }
            # Keep the highest-quality thumbnail available.
            thumbnails = snippet.get("thumbnails", {})
            for quality in ("maxres", "high", "medium", "default"):
                url = thumbnails.get(quality, {}).get("url")
                if url:
                    entry["thumbnail_url"] = url
                    break
            parsed_videos.append(entry)
        return parsed_videos
    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error getting video details for {len(video_ids)} videos: {e}")
        return []
def fetch_channel_videos(
    channel_data: Dict[str, Any],
    client: httpx.Client,
    max_videos: int = 50,
) -> List[Dict[str, Any]]:
    """Fetch video metadata for a channel via its uploads playlist.

    Pipeline: uploads playlist ID -> video IDs -> per-video details.

    Args:
        channel_data: raw channel resource (must include contentDetails)
        client: httpx Client instance
        max_videos: cap on number of videos fetched (default 50)

    Returns:
        Parsed video metadata dicts; empty when the channel has no uploads
        playlist or no videos.
    """
    playlist_id = get_uploads_playlist_id(channel_data)
    if not playlist_id:
        logger.warning("No uploads playlist found for channel")
        return []
    video_ids = get_playlist_videos(playlist_id, client, max_videos)
    if not video_ids:
        logger.info("No videos found in uploads playlist")
        return []
    logger.info(f"Found {len(video_ids)} videos in uploads playlist")
    videos = get_video_details(video_ids, client)
    logger.info(f"Fetched details for {len(videos)} videos")
    return videos
# ============================================================================
# Z.AI GLM 4.6 Verification with Exponential Backoff (CH-Annotator)
# ============================================================================
MAX_RETRIES = 3
BASE_DELAY = 1.0 # seconds
MAX_DELAY = 30.0 # seconds
async def call_glm_with_retry(
    prompt: str,
    max_retries: int = MAX_RETRIES,
) -> Optional[str]:
    """Call the Z.AI GLM 4.6 messages API, backing off exponentially on HTTP 429.

    Uses the Anthropic-compatible interface at api.z.ai.

    Returns:
        The first text block of the response, "" when the response carries no
        text content, or None when the call ultimately fails.
    """
    headers = {
        "x-api-key": ZAI_API_TOKEN,
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
    }
    payload = {
        "model": ZAI_MODEL,
        "max_tokens": 500,
        "messages": [{"role": "user", "content": prompt}],
    }
    for attempt in range(max_retries):
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    f"{ZAI_API_BASE}/messages",
                    headers=headers,
                    json=payload,
                    timeout=60.0,
                )
            response.raise_for_status()
            # Anthropic-compatible response: content is a list of typed blocks.
            blocks = response.json().get("content", [])
            if blocks and blocks[0].get("type") == "text":
                return blocks[0].get("text", "")
            return ""
        except httpx.HTTPStatusError as e:
            if e.response.status_code != 429:
                logger.error(f"GLM 4.6 API error: {e}")
                return None
            # Rate limited - exponential backoff before the next attempt.
            delay = min(BASE_DELAY * (2 ** attempt), MAX_DELAY)
            logger.warning(f"Rate limited, waiting {delay:.1f}s (attempt {attempt + 1}/{max_retries})")
            await asyncio.sleep(delay)
        except Exception as e:
            logger.error(f"GLM 4.6 API call failed: {e}")
            return None
    logger.error(f"All {max_retries} GLM 4.6 API retries exhausted")
    return None
def extract_ghcid_type_code(filepath: Path) -> str:
    """Extract the single-letter institution type code from a GHCID filename.

    GHCID format: {COUNTRY}-{REGION}-{CITY}-{TYPE}-{ABBREV}.yaml
    Example: NL-NH-AMS-M-RM.yaml -> "M" (Museum)

    Returns:
        An upper-case single-letter code (G, L, A, M, O, R, C, U, B, E, S,
        F, I, X, P, H, D, N, T), or "" when the filename does not follow the
        5-part convention or the type field is not a single letter.
    """
    segments = filepath.stem.split("-")
    # A well-formed GHCID has at least COUNTRY-REGION-CITY-TYPE-ABBREV.
    if len(segments) < 5:
        return ""
    candidate = segments[3]
    if len(candidate) == 1 and candidate.isalpha():
        return candidate.upper()
    return ""
def get_expected_entity_type(
    institution_type: Any = None,
    filepath: Optional[Path] = None,
) -> Dict[str, Any]:
    """
    Resolve the CH-Annotator entity type for a custodian institution.

    PRIMARY: the GHCID type code embedded in the filename (most reliable).
    FALLBACK: keyword matching against the Wikidata instance_of text.

    Args:
        institution_type: Wikidata instance_of value (fallback only)
        filepath: Path to custodian file (primary source)

    Returns:
        CH-Annotator entity type definition dict
    """
    # PRIMARY: filename-derived GHCID type code.
    if filepath:
        ghcid_code = extract_ghcid_type_code(filepath)
        if ghcid_code and ghcid_code in GHCID_TYPE_TO_CH_ANNOTATOR:
            annotator_code = GHCID_TYPE_TO_CH_ANNOTATOR[ghcid_code]
            if annotator_code in CH_ANNOTATOR_ENTITY_TYPES:
                return CH_ANNOTATOR_ENTITY_TYPES[annotator_code]

    # FALLBACK: Wikidata may return several instance_of values; flatten
    # them into a single lowercase haystack for keyword matching.
    if isinstance(institution_type, list):
        institution_type = " ".join(str(t) for t in institution_type)
    haystack = str(institution_type).lower() if institution_type else ""

    # Ordered keyword table: the first group with a hit wins, so the order
    # below mirrors the original if/elif precedence exactly.
    keyword_map = (
        (("museum", "gallery", "kunsthall"), "GRP.HER.MUS"),
        (("archive", "archiv", "archief"), "GRP.HER.ARC"),
        (("library", "bibliothek", "bibliotheek", "biblioteca"), "GRP.HER.LIB"),
        (("university", "college", "school", "academy"), "GRP.EDU"),
        (("church", "mosque", "temple", "synagogue", "cathedral"), "GRP.REL"),
        (("government", "ministry", "department"), "GRP.GOV"),
        (("botanical", "zoo", "aquarium", "arboretum"), "GRP.HER.BOT"),
        (("monument", "memorial", "statue", "landmark"), "GRP.HER.FEA"),
    )
    for terms, annotator_code in keyword_map:
        if any(term in haystack for term in terms):
            return CH_ANNOTATOR_ENTITY_TYPES[annotator_code]

    # Default to general heritage institution for custodian files.
    return CH_ANNOTATOR_ENTITY_TYPES["GRP.HER"]
async def verify_match_with_llm(
    institution_name: str,
    institution_info: Dict[str, Any],
    candidate_name: str,
    candidate_info: Dict[str, Any],
    match_type: str,  # "google_maps" or "youtube"
    filepath: Optional[Path] = None,
) -> Dict[str, Any]:
    """
    Use Z.AI GLM 4.6 to verify if a candidate match is correct.

    Uses CH-Annotator v1.7.0 entity type definitions for validation.
    Expected entity type derived from GHCID type code in filename.

    Args:
        institution_name: Name of the heritage custodian institution
        institution_info: Dict with wikidata_id, city, country, type
        candidate_name: Name from Google Maps or YouTube
        candidate_info: Dict with place/channel details
        match_type: "google_maps" or "youtube"
        filepath: Path to custodian YAML file (for GHCID type extraction)

    Returns:
        Dict with keys:
        - is_match: bool (None when verification was skipped or failed)
        - confidence: float (0.0-1.0)
        - reasoning: str
        - agent: str (model version)
        - entity_type: str (CH-Annotator entity type code, when LLM returns it)
        - verified: bool (False when no LLM call succeeded)
    """
    # Without an API key we cannot verify; callers treat is_match=None as
    # "unverified" and keep the candidate with a warning.
    if not ZAI_API_TOKEN:
        logger.warning("ZAI_API_TOKEN not set, skipping LLM verification")
        return {
            "is_match": None,
            "confidence": 0.5,
            "reasoning": "LLM verification skipped - no API key",
            "agent": "none",
            "verified": False,
        }

    # Get expected CH-Annotator entity type (PRIMARY: from GHCID, FALLBACK: from Wikidata)
    expected_entity = get_expected_entity_type(
        institution_type=institution_info.get('type', ''),
        filepath=filepath,
    )
    expected_place_types = expected_entity.get("google_place_types", [])

    # Build verification prompt
    if match_type == "google_maps":
        prompt = f"""You are an entity annotator following CH-Annotator v1.7.0 convention.
TASK: Verify if a Google Maps place matches a heritage custodian institution.
== CH-ANNOTATOR ENTITY TYPE ==
Expected Type: {expected_entity['code']} ({expected_entity['name']})
Definition: {expected_entity['definition']}
Ontology Class: {expected_entity['ontology_class']}
Expected Google Place Types: {', '.join(expected_place_types[:10])}
== SOURCE INSTITUTION (GRP.HER) ==
- Name: {institution_name}
- Wikidata ID: {institution_info.get('wikidata_id', 'N/A')}
- City (TOP.SET): {institution_info.get('city', 'N/A')}
- Country (TOP.CTY): {institution_info.get('country', 'N/A')}
- Instance Type: {institution_info.get('type', 'N/A')}
== GOOGLE MAPS CANDIDATE ==
- Name: {candidate_name}
- Address (TOP.ADR): {candidate_info.get('formatted_address', 'N/A')}
- Google Place Types: {candidate_info.get('google_place_types', 'N/A')}
- Website: {candidate_info.get('website', 'N/A')}
- Business Status: {candidate_info.get('business_status', 'N/A')}
== VERIFICATION CRITERIA ==
1. NAME MATCH: Do the names refer to the same institution? (Allow translations, abbreviations, acronyms)
2. LOCATION MATCH: Is the address in the same city/country?
3. TYPE MATCH: Does Google Place type match expected heritage types (museum, library, archive, gallery)?
4. ENTITY TYPE: Is this truly a {expected_entity['code']} ({expected_entity['name']})?
REJECT if:
- Different institution with similar name
- Google Place types indicate non-heritage (restaurant, hotel, shop)
- Location mismatch (different city/country)
- Name is a person, not an institution
Respond ONLY with JSON (no explanation outside JSON):
{{"is_match": true/false, "confidence": 0.0-1.0, "entity_type": "{expected_entity['code']}", "reasoning": "..."}}
"""
    else:  # youtube
        prompt = f"""You are an entity annotator following CH-Annotator v1.7.0 convention.
TASK: Verify if a YouTube channel is the official channel of a heritage custodian institution.
== CH-ANNOTATOR ENTITY TYPE ==
Expected Type: {expected_entity['code']} ({expected_entity['name']})
Definition: {expected_entity['definition']}
Ontology Class: {expected_entity['ontology_class']}
== SOURCE INSTITUTION (GRP.HER) ==
- Name: {institution_name}
- Wikidata ID: {institution_info.get('wikidata_id', 'N/A')}
- City (TOP.SET): {institution_info.get('city', 'N/A')}
- Country (TOP.CTY): {institution_info.get('country', 'N/A')}
- Instance Type: {institution_info.get('type', 'N/A')}
== YOUTUBE CHANNEL CANDIDATE ==
- Channel Title: {candidate_name}
- Description: {candidate_info.get('description', 'N/A')[:500]}
- Country: {candidate_info.get('country', 'N/A')}
- Subscribers: {candidate_info.get('subscriber_count', 'N/A')}
- Video Count: {candidate_info.get('video_count', 'N/A')}
== VERIFICATION CRITERIA ==
1. NAME MATCH: Does channel name match institution? (Allow abbreviations, acronyms)
2. DESCRIPTION: Does description mention heritage, culture, museum, archive, library?
3. CONTENT: Is this likely an official institutional channel (not fan-made, personal)?
4. ENTITY TYPE: Is this truly a {expected_entity['code']} ({expected_entity['name']})?
REJECT if:
- Channel is personal/fan-made (not official)
- Description indicates unrelated content (gaming, personal vlogs)
- Different institution with similar name
- Channel is for a different city/country
Respond ONLY with JSON (no explanation outside JSON):
{{"is_match": true/false, "confidence": 0.0-1.0, "entity_type": "{expected_entity['code']}", "reasoning": "..."}}
"""

    # Call GLM 4.6 API with retry
    content = await call_glm_with_retry(prompt)
    if content is None:
        return {
            "is_match": None,
            "confidence": 0.5,
            "reasoning": "LLM verification failed - API error",
            "agent": ZAI_MODEL,
            "verified": False,
        }

    # Parse JSON response
    try:
        # FIX: match greedily from the first '{' to the last '}'. The old
        # pattern r'\{[^}]+\}' stopped at the FIRST closing brace, so any
        # '}' inside the model's free-text "reasoning" truncated the JSON
        # and dropped a valid answer into the crude fallback below.
        # (re.DOTALL is needed here because '.' must span newlines.)
        json_match = re.search(r'\{.*\}', content, re.DOTALL)
        if json_match:
            result = json.loads(json_match.group())
            result["agent"] = ZAI_MODEL
            result["verified"] = True
            result["ch_annotator_version"] = CH_ANNOTATOR_VERSION
            return result
    except json.JSONDecodeError:
        # Malformed JSON from the model; fall through to heuristic below.
        pass

    # Fallback if JSON parsing fails: crude keyword heuristic. NOTE(review):
    # this yields False whenever "false" appears anywhere in the text, even
    # inside the reasoning — accepted as a deliberately conservative default.
    is_match = "true" in content.lower() and "false" not in content.lower()
    return {
        "is_match": is_match,
        "confidence": 0.7 if is_match else 0.3,
        "reasoning": content[:200],
        "agent": ZAI_MODEL,
        "verified": True,
        "ch_annotator_version": CH_ANNOTATOR_VERSION,
    }
# ============================================================================
# Main Enrichment Pipeline
# ============================================================================
async def enrich_custodian_file(
    filepath: Path,
    client: httpx.Client,
    force: bool = False,
    dry_run: bool = False,
    youtube_only: bool = False,
    maps_only: bool = False,
) -> Tuple[bool, str]:
    """
    Enrich a single custodian YAML file with YouTube and Google Maps data.

    Args:
        filepath: Custodian YAML file, updated in place on success.
        client: Shared synchronous httpx client for Google/YouTube calls.
        force: Re-enrich even when a previous enrichment/status exists.
        dry_run: Perform lookups and verification but never write the file.
        youtube_only: Skip the Google Maps stage.
        maps_only: Skip the YouTube stage.

    Returns:
        Tuple of (modified: bool, status: str)
    """
    logger.info(f"Processing: {filepath.name}")

    # Load YAML
    with open(filepath, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)
    if not entry:
        return False, "Empty file"

    modified = False
    statuses = []

    # Check if already enriched (including rejections/not found - we've already tried)
    has_maps = entry.get("google_maps_enrichment") is not None or entry.get("google_maps_status") is not None
    has_youtube = entry.get("youtube_enrichment") is not None or entry.get("youtube_status") is not None

    # Determine what needs enrichment based on flags
    skip_maps = youtube_only or (has_maps and not force)
    skip_youtube = maps_only or (has_youtube and not force)
    if skip_maps and skip_youtube:
        return False, "Already enriched (use --force to re-enrich)"

    # Extract info for matching
    institution_name = get_institution_name(entry)
    if not institution_name:
        return False, "No institution name found"
    country_code = get_country_code(entry)
    city_name = get_city_name(entry)
    coords = get_coordinates(entry)
    wikidata_id = get_wikidata_id(entry)
    institution_info = {
        "wikidata_id": wikidata_id,
        "city": city_name,
        "country": country_code,
        "type": entry.get("wikidata_enrichment", {}).get("instance_of", ""),
    }
    logger.info(f" Institution: {institution_name}")
    logger.info(f" Location: {city_name}, {country_code}")

    # -------------------------------------------------------------------------
    # Google Maps Enrichment
    # -------------------------------------------------------------------------
    if not skip_maps:
        query = build_maps_search_query(entry)
        logger.info(f" Maps query: {query}")
        # FIX: time.sleep() inside a coroutine blocks the whole event loop;
        # asyncio.sleep yields control while waiting out the rate limit.
        await asyncio.sleep(REQUEST_DELAY)
        place = search_google_place(query, client, country_code, coords)
        if place:
            maps_data = parse_google_place(place)
            candidate_name = maps_data.get("name", "")
            logger.info(f" Maps found: {candidate_name}")
            # LLM verification (uses GHCID type code from filepath)
            verification = await verify_match_with_llm(
                institution_name,
                institution_info,
                candidate_name,
                maps_data,
                "google_maps",
                filepath=filepath,
            )
            if verification.get("is_match") is True:
                maps_data["llm_verification"] = verification
                entry["google_maps_enrichment"] = maps_data
                entry["google_maps_status"] = "SUCCESS"
                modified = True
                statuses.append(f"Maps: {candidate_name} (conf: {verification.get('confidence', 0):.2f})")
                logger.info(f" ✓ Maps verified: {verification.get('reasoning', '')[:60]}")
            elif verification.get("is_match") is False:
                # Candidate found but LLM says it is a different entity.
                entry["google_maps_status"] = "NO_MATCH"
                entry["google_maps_rejected"] = {
                    "candidate_name": candidate_name,
                    "rejection_reason": verification.get("reasoning", ""),
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                }
                modified = True
                statuses.append("Maps: rejected by LLM")
                logger.info(f" ✗ Maps rejected: {verification.get('reasoning', '')[:60]}")
            else:
                # Verification skipped or failed (is_match is None) -
                # keep the data but mark it UNVERIFIED.
                maps_data["llm_verification"] = verification
                entry["google_maps_enrichment"] = maps_data
                entry["google_maps_status"] = "UNVERIFIED"
                modified = True
                statuses.append(f"Maps: {candidate_name} (unverified)")
        else:
            # No place returned at all; record the query so a later run can
            # tell this was already attempted.
            entry["google_maps_status"] = "NOT_FOUND"
            entry["google_maps_search_query"] = query
            entry["google_maps_search_timestamp"] = datetime.now(timezone.utc).isoformat()
            modified = True
            statuses.append("Maps: not found")

    # -------------------------------------------------------------------------
    # YouTube Enrichment
    # -------------------------------------------------------------------------
    if not skip_youtube:
        # Build YouTube search query
        youtube_query = f"{institution_name} official"
        logger.info(f" YouTube query: {youtube_query}")
        # FIX: non-blocking rate-limit delay (was time.sleep).
        await asyncio.sleep(REQUEST_DELAY)
        search_result = search_youtube_channel(youtube_query, client)
        if search_result and search_result.get("candidates"):
            candidates = search_result["candidates"]
            logger.info(f" YouTube candidates: {len(candidates)}")
            # Verify each candidate and keep the highest-confidence match.
            best_match = None
            best_verification = None
            for candidate in candidates[:3]:  # Top 3 candidates
                channel_id = candidate.get("id", {}).get("channelId")
                if not channel_id:
                    continue
                # Get full channel details
                # FIX: non-blocking rate-limit delay (was time.sleep).
                await asyncio.sleep(REQUEST_DELAY)
                channel_details = get_youtube_channel_details(channel_id, client)
                if not channel_details:
                    continue
                youtube_data = parse_youtube_channel(channel_details)
                # Fetch individual video metadata
                videos = fetch_channel_videos(channel_details, client, max_videos=50)
                if videos:
                    youtube_data["videos"] = videos
                    logger.info(f" Fetched {len(videos)} videos for channel")
                else:
                    youtube_data["videos"] = []
                candidate_name = youtube_data.get("title", "")
                # LLM verification (uses GHCID type code from filepath)
                verification = await verify_match_with_llm(
                    institution_name,
                    institution_info,
                    candidate_name,
                    youtube_data,
                    "youtube",
                    filepath=filepath,
                )
                if verification.get("is_match") is True:
                    if best_verification is None or verification.get("confidence", 0) > best_verification.get("confidence", 0):
                        best_match = youtube_data
                        best_verification = verification
                    logger.info(f" YouTube match: {candidate_name} (conf: {verification.get('confidence', 0):.2f})")
            if best_match:
                best_match["llm_verification"] = best_verification
                entry["youtube_enrichment"] = best_match
                entry["youtube_status"] = "SUCCESS"
                modified = True
                statuses.append(f"YouTube: {best_match.get('title', '')} ({best_match.get('subscriber_count', 0)} subs)")
            else:
                entry["youtube_status"] = "NO_MATCH"
                entry["youtube_search_query"] = youtube_query
                entry["youtube_candidates_rejected"] = len(candidates)
                entry["youtube_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                modified = True
                statuses.append("YouTube: no verified match")
        else:
            entry["youtube_status"] = "NOT_FOUND"
            entry["youtube_search_query"] = youtube_query
            entry["youtube_search_timestamp"] = datetime.now(timezone.utc).isoformat()
            modified = True
            statuses.append("YouTube: not found")

    # -------------------------------------------------------------------------
    # Add provenance note
    # -------------------------------------------------------------------------
    if modified:
        if "provenance" not in entry:
            entry["provenance"] = {}
        # Handle notes field - can be string, list, or missing
        existing_notes = entry["provenance"].get("notes")
        if existing_notes is None:
            entry["provenance"]["notes"] = []
        elif isinstance(existing_notes, str):
            # Convert string notes to list
            entry["provenance"]["notes"] = [existing_notes]
        # else: it's already a list
        timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        entry["provenance"]["notes"].append(
            f"YouTube/Google Maps enrichment {timestamp}: {'; '.join(statuses)}"
        )

    # -------------------------------------------------------------------------
    # Save file
    # -------------------------------------------------------------------------
    if modified and not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        logger.info(f" Saved: {filepath.name}")

    status = "; ".join(statuses) if statuses else "No changes"
    return modified, status
async def main():
    """Main entry point: parse CLI arguments and run the enrichment loop.

    Exits with status 1 when no Google API key is available. Stops early
    (without error status) when all YouTube API keys are quota-exhausted.
    """
    parser = argparse.ArgumentParser(
        description="Enrich custodian files with YouTube and Google Maps data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-enrich even if already enriched"
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of files to process"
    )
    parser.add_argument(
        "--files",
        nargs="+",
        help="Specific files to process (just filenames)"
    )
    parser.add_argument(
        "--pattern",
        type=str,
        default=None,
        help="Glob pattern for files (e.g., 'ZA-*.yaml')"
    )
    # FIX: --youtube-only together with --maps-only previously made both
    # skip flags true, so every file was silently reported as "already
    # enriched". Declaring them mutually exclusive makes argparse reject
    # the contradictory combination with a clear error.
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--youtube-only",
        action="store_true",
        help="Only enrich YouTube data (skip Google Maps)"
    )
    mode_group.add_argument(
        "--maps-only",
        action="store_true",
        help="Only enrich Google Maps data (skip YouTube)"
    )
    args = parser.parse_args()

    # Check for required API keys
    if not GOOGLE_PLACES_TOKEN and not get_youtube_api_key():
        logger.error("No API keys found! Set GOOGLE_PLACES_TOKEN or GOOGLE_YOUTUBE_TOKEN")
        sys.exit(1)

    # Find files to process: explicit list > glob pattern > everything.
    if args.files:
        files = [CUSTODIAN_DIR / f for f in args.files]
        files = [f for f in files if f.exists()]
    elif args.pattern:
        files = sorted(CUSTODIAN_DIR.glob(args.pattern))
    else:
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
    if args.limit:
        files = files[:args.limit]

    logger.info(f"Found {len(files)} files to process")
    if args.dry_run:
        logger.info("DRY RUN - no files will be modified")
    if args.youtube_only:
        logger.info("YOUTUBE-ONLY mode - skipping Google Maps enrichment")
    elif args.maps_only:
        logger.info("MAPS-ONLY mode - skipping YouTube enrichment")

    # Process files
    results = {"modified": 0, "skipped": 0, "errors": 0}
    with httpx.Client(timeout=60.0) as client:
        for filepath in files:
            try:
                modified, status = await enrich_custodian_file(
                    filepath, client, args.force, args.dry_run,
                    youtube_only=args.youtube_only,
                    maps_only=args.maps_only,
                )
                if modified:
                    results["modified"] += 1
                else:
                    results["skipped"] += 1
                logger.info(f" Status: {status}")
            except YouTubeQuotaExhaustedError:
                logger.error("=" * 60)
                logger.error("ALL YOUTUBE API KEYS EXHAUSTED - stopping enrichment")
                logger.error("=" * 60)
                break  # Exit the loop gracefully
            except Exception as e:
                logger.error(f"Error processing {filepath.name}: {e}")
                results["errors"] += 1
            # FIX: non-blocking rate limit between files (was time.sleep,
            # which stalls the event loop inside this coroutine).
            await asyncio.sleep(REQUEST_DELAY)

    # Summary
    logger.info("=" * 60)
    logger.info(f"SUMMARY: {results['modified']} modified, {results['skipped']} skipped, {results['errors']} errors")
# Script entry point: run the async pipeline to completion under asyncio.
if __name__ == "__main__":
    asyncio.run(main())