#!/usr/bin/env python3
"""
Enrich Heritage Custodian YAML files with YouTube and Google Maps data.

This script enriches custodian files in data/custodian/ with:
1. YouTube channel/video data (if channel can be found)
2. Google Maps/Places API data (address, ratings, reviews, photos)
3. GLM-4.6 verification of matches (CH-Annotator convention)

Usage:
    python scripts/enrich_custodian_youtube_maps.py [--dry-run] [--limit N] [--force]
    python scripts/enrich_custodian_youtube_maps.py --files FILE1.yaml FILE2.yaml
    python scripts/enrich_custodian_youtube_maps.py --pattern "ZA-*.yaml"

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required for Google Maps enrichment
    GOOGLE_YOUTUBE_TOKEN - Required for YouTube enrichment
    ZAI_API_TOKEN - Required for GLM-4.6 verification (optional but recommended)

Author: GLAM Data Extraction Project
Date: December 2025
"""

import argparse
import asyncio
import fnmatch
import json
import logging
import os
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import httpx
import yaml

# Add project src to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))

# Load environment variables
# NOTE: imported here (after the sys.path tweak above) rather than at the top
# of the file; keep this ordering.
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# ============================================================================
# Configuration
# ============================================================================

CUSTODIAN_DIR = PROJECT_ROOT / "data/custodian"

# API Keys
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")

# YouTube API keys - rotate through all available keys when quota exceeded
YOUTUBE_API_KEYS = [
    os.getenv("GOOGLE_YOUTUBE_TOKEN_v3", ""),
    os.getenv("GOOGLE_YOUTUBE_TOKEN_v4", ""),
    os.getenv("GOOGLE_YOUTUBE_TOKEN_v5", ""),
    os.getenv("GOOGLE_YOUTUBE_TOKEN_v2", ""),
    os.getenv("GOOGLE_YOUTUBE_TOKEN", ""),
]
YOUTUBE_API_KEYS = [k for k in YOUTUBE_API_KEYS if k]  # Filter empty keys
# Index of the key currently in use; advanced by rotate_youtube_api_key().
CURRENT_YOUTUBE_KEY_INDEX = 0

# API Endpoints (defined early for use in helper functions)
YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3"


def get_youtube_api_key() -> str:
    """Get current YouTube API key.

    Returns an empty string when no keys are configured. The modulo keeps
    the lookup in range even after repeated rotations.
    """
    global CURRENT_YOUTUBE_KEY_INDEX
    if not YOUTUBE_API_KEYS:
        return ""
    return YOUTUBE_API_KEYS[CURRENT_YOUTUBE_KEY_INDEX % len(YOUTUBE_API_KEYS)]


def rotate_youtube_api_key() -> bool:
    """Rotate to next YouTube API key. Returns False if all keys exhausted."""
    global CURRENT_YOUTUBE_KEY_INDEX
    CURRENT_YOUTUBE_KEY_INDEX += 1
    if CURRENT_YOUTUBE_KEY_INDEX >= len(YOUTUBE_API_KEYS):
        logger.error(f"All {len(YOUTUBE_API_KEYS)} YouTube API keys exhausted!")
        return False
    logger.warning(f"Rotating to YouTube API key {CURRENT_YOUTUBE_KEY_INDEX + 1}/{len(YOUTUBE_API_KEYS)}")
    return True


def youtube_api_request(
    client: httpx.Client,
    endpoint: str,
    params: Dict[str, Any],
    timeout: float = 30.0,
) -> Optional[Dict[str, Any]]:
    """
    Make a YouTube API request with automatic key rotation on quota errors.

    Args:
        client: httpx Client instance
        endpoint: API endpoint (e.g., "search", "channels", "playlistItems", "videos")
        params: Query parameters (key will be added automatically)
        timeout: Request timeout in seconds

    Returns:
        JSON response dict or None if all keys exhausted or error

    Raises:
        YouTubeQuotaExhaustedError: when a quota/rate-limit error occurs and
            every configured key has already been tried.
    """
    url = f"{YOUTUBE_API_BASE}/{endpoint}"
    while True:
        api_key = get_youtube_api_key()
        if not api_key:
            logger.error("No YouTube API keys available")
            return None

        # Add current key to params
        request_params = {**params, "key": api_key}

        try:
            response = client.get(url, params=request_params, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except httpx.HTTPStatusError as e:
            error_text = str(e.response.text) if hasattr(e, 'response') else str(e)
            # Check for quota exceeded (403) or rate limit errors
            if e.response.status_code == 403 or "quotaExceeded" in error_text or "rateLimitExceeded" in error_text:
                logger.warning(f"YouTube API quota/rate limit hit for key {CURRENT_YOUTUBE_KEY_INDEX + 1}")
                if not rotate_youtube_api_key():
                    # All keys exhausted
                    raise YouTubeQuotaExhaustedError("All YouTube API keys exhausted")
                # Retry with new key
                continue
            else:
                logger.error(f"YouTube API error: {e}")
                return None
        except Exception as e:
            logger.error(f"Error making YouTube API request to {endpoint}: {e}")
            return None


class YouTubeQuotaExhaustedError(Exception):
    """Raised when all YouTube API keys are exhausted."""
    pass


# For backwards compatibility (deprecated - use get_youtube_api_key())
GOOGLE_YOUTUBE_TOKEN = YOUTUBE_API_KEYS[0] if YOUTUBE_API_KEYS else ""

# Z.AI GLM 4.6 API for CH-Annotator verification (NOT Anthropic Claude)
ZAI_API_TOKEN = os.getenv("ZAI_API_TOKEN", "")

# API Endpoints
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"

# Z.AI GLM 4.6 API endpoint (Anthropic-compatible interface)
ZAI_API_BASE = "https://api.z.ai/api/anthropic/v1"
ZAI_MODEL = "glm-4.6"

# Rate limiting
REQUEST_DELAY = 0.3  # seconds between API calls

# CH-Annotator convention version
CH_ANNOTATOR_VERSION = "ch_annotator-v1_7_0"

# CH-Annotator entity type definitions for heritage custodians
# From: data/entity_annotation/ch_annotator-v1_7_0.yaml
# Extended with GLAMORCUBESFIXPHDNT taxonomy subtypes
#
# Each value describes one entity type: its CH-Annotator code, human-readable
# name, definition, ontology class mapping, single-letter GLAMORCUBESFIXPHDNT
# code, the Google Place types considered compatible with it, and the Wikidata
# types that map to it.
CH_ANNOTATOR_ENTITY_TYPES = {
    # === HERITAGE INSTITUTION SUBTYPES (GRP.HER.*) ===
    "GRP.HER.MUS": {
        "code": "GRP.HER.MUS",
        "name": "Museum",
        "definition": "Museums of all types: art, history, science, natural history, etc.",
        "ontology_class": "schema:Museum",
        "glamorcubesfixphdnt_code": "M",
        "google_place_types": [
            "museum", "art_museum", "history_museum", "natural_history_museum",
            "science_museum", "children's_museum", "war_memorial",
            "tourist_attraction", "point_of_interest",
        ],
        "wikidata_types": ["museum", "art museum", "history museum", "science museum"],
    },
    "GRP.HER.GAL": {
        "code": "GRP.HER.GAL",
        "name": "Gallery",
        "definition": "Art galleries, exhibition spaces, and kunsthallen",
        "ontology_class": "schema:ArtGallery",
        "glamorcubesfixphdnt_code": "G",
        "google_place_types": [
            "art_gallery", "museum", "tourist_attraction", "point_of_interest",
        ],
        "wikidata_types": ["art gallery", "gallery", "kunsthalle"],
    },
    "GRP.HER.LIB": {
        "code": "GRP.HER.LIB",
        "name": "Library",
        "definition": "Libraries: public, academic, national, special",
        "ontology_class": "schema:Library",
        "glamorcubesfixphdnt_code": "L",
        "google_place_types": [
            "library", "public_library", "point_of_interest", "establishment",
        ],
        "wikidata_types": ["library", "public library", "national library", "academic library"],
    },
    "GRP.HER.ARC": {
        "code": "GRP.HER.ARC",
        "name": "Archive",
        "definition": "Archives: government, corporate, religious, personal",
        "ontology_class": "schema:ArchiveOrganization",
        "glamorcubesfixphdnt_code": "A",
        "google_place_types": [
            "archive", "government_office", "city_hall", "local_government_office",
            "point_of_interest", "establishment",
        ],
        "wikidata_types": ["archive", "national archive", "state archive", "city archive"],
    },
    "GRP.HER": {
        "code": "GRP.HER",
        "name": "Heritage Institution (General)",
        "definition": "Heritage institutions: museums, archives, libraries, galleries (unspecified subtype)",
        "ontology_class": "glam:HeritageCustodian",
        "glamorcubesfixphdnt_code": "X",
        "close_mappings": ["schema:Museum", "schema:Library", "schema:ArchiveOrganization"],
        "google_place_types": [
            # Museums
            "museum", "art_gallery", "art_museum", "history_museum",
            "natural_history_museum", "science_museum", "war_memorial",
            # Libraries
            "library", "public_library", "research_library",
            # Archives
            "archive", "government_office", "city_hall", "local_government_office",
            # Cultural centers
            "cultural_center", "community_center", "performing_arts_theater",
            "tourist_attraction", "point_of_interest", "establishment",
        ],
        "wikidata_types": ["museum", "library", "archive", "cultural institution", "heritage institution"],
    },
    # === OTHER HERITAGE-ADJACENT TYPES ===
    "GRP.HER.RES": {
        "code": "GRP.HER.RES",
        "name": "Research Center",
        "definition": "Research institutes and documentation centers with heritage focus",
        "ontology_class": "schema:ResearchOrganization",
        "glamorcubesfixphdnt_code": "R",
        "google_place_types": [
            "research_institute", "university", "point_of_interest", "establishment",
        ],
        "wikidata_types": ["research institute", "documentation center", "research center"],
    },
    "GRP.HER.BOT": {
        "code": "GRP.HER.BOT",
        "name": "Botanical Garden / Zoo",
        "definition": "Botanical gardens and zoological parks",
        "ontology_class": "schema:Zoo",
        "glamorcubesfixphdnt_code": "B",
        "google_place_types": [
            "zoo", "aquarium", "park", "tourist_attraction", "point_of_interest",
        ],
        "wikidata_types": ["botanical garden", "zoo", "arboretum", "aquarium"],
    },
    "GRP.HER.HOL": {
        "code": "GRP.HER.HOL",
        "name": "Holy Site",
        "definition": "Religious heritage sites with collections (churches, temples, mosques)",
        "ontology_class": "schema:PlaceOfWorship",
        "glamorcubesfixphdnt_code": "H",
        "google_place_types": [
            "church", "mosque", "synagogue", "hindu_temple", "buddhist_temple",
            "place_of_worship", "tourist_attraction",
        ],
        "wikidata_types": ["church", "cathedral", "monastery", "abbey", "temple", "mosque", "synagogue"],
    },
    "GRP.HER.FEA": {
        "code": "GRP.HER.FEA",
        "name": "Heritage Feature",
        "definition": "Monuments, sculptures, memorials, landmarks",
        "ontology_class": "schema:LandmarksOrHistoricalBuildings",
        "glamorcubesfixphdnt_code": "F",
        "google_place_types": [
            "monument", "landmark", "historical_landmark", "tourist_attraction",
            "point_of_interest", "cultural_landmark",
        ],
        "wikidata_types": ["monument", "memorial", "statue", "sculpture", "landmark"],
    },
    # === NON-HERITAGE ORGANIZATION TYPES ===
    "GRP.EDU": {
        "code": "GRP.EDU",
        "name": "Educational Institution",
        "definition": "Universities, schools, and educational institutions",
        "ontology_class": "schema:EducationalOrganization",
        "glamorcubesfixphdnt_code": "E",
        "google_place_types": ["university", "school", "college", "educational_institution"],
        "wikidata_types": ["university", "school", "college", "academy"],
    },
    "GRP.GOV": {
        "code": "GRP.GOV",
        "name": "Government Organization",
        "definition": "Government agencies, legislatures, and public bodies",
        "ontology_class": "schema:GovernmentOrganization",
        "glamorcubesfixphdnt_code": "O",
        "google_place_types": ["government_office", "city_hall", "embassy", "courthouse"],
        "wikidata_types": ["government agency", "ministry", "department"],
    },
    "GRP.REL": {
        "code": "GRP.REL",
        "name": "Religious Organization",
        "definition": "Religious organizations, denominations, and congregations",
        "ontology_class": "schema:ReligiousOrganization",
        "glamorcubesfixphdnt_code": "H",
        "google_place_types": ["church", "mosque", "synagogue", "hindu_temple", "buddhist_temple"],
        "wikidata_types": ["religious organization", "church", "congregation"],
    },
    "GRP.COR": {
        "code": "GRP.COR",
        "name": "Corporation",
        "definition": "Commercial companies and businesses with heritage collections",
        "ontology_class": "schema:Corporation",
        "glamorcubesfixphdnt_code": "C",
        "google_place_types": ["corporate_office", "headquarters", "business", "establishment"],
        "wikidata_types": ["company", "corporation", "business"],
    },
}
# Mapping from GHCID type codes to CH-Annotator entity types
GHCID_TYPE_TO_CH_ANNOTATOR = {
    "G": "GRP.HER.GAL",  # Gallery
    "L": "GRP.HER.LIB",  # Library
    "A": "GRP.HER.ARC",  # Archive
    "M": "GRP.HER.MUS",  # Museum
    "O": "GRP.GOV",      # Official institution
    "R": "GRP.HER.RES",  # Research center
    "C": "GRP.COR",      # Corporation
    "U": "GRP.HER",      # Unknown (defaults to general heritage)
    "B": "GRP.HER.BOT",  # Botanical garden / Zoo
    "E": "GRP.EDU",      # Education provider
    "S": "GRP.HER",      # Collecting society
    "F": "GRP.HER.FEA",  # Features (monuments)
    "I": "GRP.HER",      # Intangible heritage group
    "X": "GRP.HER",      # Mixed
    "P": "GRP.HER",      # Personal collection
    "H": "GRP.HER.HOL",  # Holy sites
    "D": "GRP.HER",      # Digital platform
    "N": "GRP.HER",      # NGO
    "T": "GRP.HER",      # Taste/smell heritage
}

# Google Places fields to request
PLACE_FIELDS = [
    "id",
    "displayName",
    "formattedAddress",
    "addressComponents",
    "location",
    "types",
    "businessStatus",
    "internationalPhoneNumber",
    "nationalPhoneNumber",
    "regularOpeningHours",
    "currentOpeningHours",
    "websiteUri",
    "rating",
    "userRatingCount",
    "reviews",
    "priceLevel",
    "photos",
    "googleMapsUri",
    "utcOffsetMinutes",
    "primaryType",
    "primaryTypeDisplayName",
    "shortFormattedAddress",
    "editorialSummary",
]

# ============================================================================
# Utility Functions
# ============================================================================

def get_institution_name(entry: Dict[str, Any]) -> str:
    """Extract institution name from custodian entry.

    PRIORITY ORDER (emic/native names first for better YouTube matching):
    1. custodian_name.emic_name - Native language/script name (best for YouTube search)
    2. custodian_name.claim_value - Standardized name
    3. wikidata native label - Wikidata label in institution's country language
    4. original_entry fields - Source data
    5. enrichment data - Google Maps, ZCBS, etc.

    Returns:
        The best available name, or "" when no source provides one.
    """
    # PRIORITY 1: Emic name (native language/script) - BEST for YouTube search
    if entry.get("custodian_name", {}).get("emic_name"):
        return entry["custodian_name"]["emic_name"]

    # PRIORITY 2: Standardized custodian name
    if entry.get("custodian_name", {}).get("claim_value"):
        return entry["custodian_name"]["claim_value"]

    # PRIORITY 3: Wikidata label in native language (if available)
    # Try to get label in institution's language first
    wikidata = entry.get("wikidata_enrichment", {})
    country_code = get_country_code(entry)

    # Map country codes to likely Wikidata language codes
    country_to_lang = {
        "JP": "wikidata_label_ja", "CN": "wikidata_label_zh", "KR": "wikidata_label_ko",
        "RU": "wikidata_label_ru", "UA": "wikidata_label_uk", "GR": "wikidata_label_el",
        "IL": "wikidata_label_he", "IR": "wikidata_label_fa", "SA": "wikidata_label_ar",
        "EG": "wikidata_label_ar", "TH": "wikidata_label_th", "VN": "wikidata_label_vi",
        "DE": "wikidata_label_de", "FR": "wikidata_label_fr", "ES": "wikidata_label_es",
        "IT": "wikidata_label_it", "PT": "wikidata_label_pt", "BR": "wikidata_label_pt",
        "NL": "wikidata_label_nl", "BE": "wikidata_label_nl", "PL": "wikidata_label_pl",
        "CZ": "wikidata_label_cs", "HU": "wikidata_label_hu", "RO": "wikidata_label_ro",
        "BG": "wikidata_label_bg", "RS": "wikidata_label_sr", "HR": "wikidata_label_hr",
        "SI": "wikidata_label_sl", "SK": "wikidata_label_sk", "TR": "wikidata_label_tr",
        "IN": "wikidata_label_hi", "ID": "wikidata_label_id", "MY": "wikidata_label_ms",
        "PH": "wikidata_label_tl", "SE": "wikidata_label_sv", "NO": "wikidata_label_no",
        "DK": "wikidata_label_da", "FI": "wikidata_label_fi", "EE": "wikidata_label_et",
        "LV": "wikidata_label_lv", "LT": "wikidata_label_lt",
    }

    native_label_key = country_to_lang.get(country_code)
    if native_label_key and wikidata.get(native_label_key):
        return wikidata[native_label_key]

    # Fall back to English Wikidata label
    if wikidata.get("wikidata_label_en"):
        return wikidata["wikidata_label_en"]

    # PRIORITY 4: Original entry fields
    if entry.get("original_entry", {}).get("name"):
        return entry["original_entry"]["name"]
    if entry.get("original_entry", {}).get("organisatie"):
        return entry["original_entry"]["organisatie"]

    # PRIORITY 5: Enrichment data
    if entry.get("zcbs_enrichment", {}).get("zcbs_name"):
        return entry["zcbs_enrichment"]["zcbs_name"]
    if entry.get("google_maps_enrichment", {}).get("place_name"):
        return entry["google_maps_enrichment"]["place_name"]

    return ""


def get_country_code(entry: Dict[str, Any]) -> str:
    """Extract country code from entry.

    Prefers the resolved location's country_code; otherwise parses the
    leading segment of the GHCID (format: COUNTRY-REGION-CITY-TYPE-ABBREV).
    """
    loc = entry.get("ghcid", {}).get("location_resolution", {})
    if loc.get("country_code"):
        return loc["country_code"]

    # Parse from GHCID
    ghcid = entry.get("ghcid", {}).get("ghcid_current", "")
    if ghcid and "-" in ghcid:
        return ghcid.split("-")[0]

    return ""


def get_coordinates(entry: Dict[str, Any]) -> Optional[Tuple[float, float]]:
    """Extract coordinates from entry if available.

    FIX: uses explicit ``is not None`` checks instead of truthiness so that
    a legitimate 0.0 latitude (equator) or 0.0 longitude (prime meridian)
    is not silently discarded as a falsy value.
    """
    loc = entry.get("ghcid", {}).get("location_resolution", {})
    src = loc.get("source_coordinates", {})
    if src.get("latitude") is not None and src.get("longitude") is not None:
        return (src["latitude"], src["longitude"])
    return None


def get_city_name(entry: Dict[str, Any]) -> str:
    """Extract city name from entry."""
    loc = entry.get("ghcid", {}).get("location_resolution", {})
    return loc.get("city_name", "")


def get_wikidata_id(entry: Dict[str, Any]) -> str:
    """Extract Wikidata ID from entry (enrichment first, then source data)."""
    if entry.get("wikidata_enrichment", {}).get("wikidata_entity_id"):
        return entry["wikidata_enrichment"]["wikidata_entity_id"]
    if entry.get("original_entry", {}).get("wikidata_id"):
        return entry["original_entry"]["wikidata_id"]
    return ""


# ============================================================================
# Google Maps Enrichment
# ============================================================================

def build_maps_search_query(entry: Dict[str, Any]) -> str:
    """Build Google Maps search query from entry data.

    Joins "name, city, country" with whichever parts are available.
    """
    parts = []

    name = get_institution_name(entry)
    if name:
        parts.append(name)

    city = get_city_name(entry)
    if city:
        parts.append(city)

    # Get country name
    loc = entry.get("ghcid", {}).get("location_resolution", {})
    country = loc.get("country_label", "")
    if country:
        parts.append(country)

    return ", ".join(parts)
def search_google_place(
    query: str,
    client: httpx.Client,
    country_code: str = "",
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Search for a place using Google Places API (New).

    Args:
        query: Free-text search query (name, city, country).
        client: httpx Client instance.
        country_code: ISO country code used to set language/region hints.
        location_bias: Optional (lat, lng) to bias results to a 50km circle.

    Returns:
        The first matching place dict, or None on no match / error / no token.
    """
    if not GOOGLE_PLACES_TOKEN:
        logger.warning("GOOGLE_PLACES_TOKEN not set, skipping Maps enrichment")
        return None

    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        "X-Goog-FieldMask": ",".join([f"places.{f}" for f in PLACE_FIELDS]),
    }
    body = {
        "textQuery": query,
        "maxResultCount": 1,
    }

    # Set language/region based on country
    if country_code == "ZA":
        body["languageCode"] = "en"
        body["regionCode"] = "ZA"
    elif country_code == "ZW":
        body["languageCode"] = "en"
        body["regionCode"] = "ZW"

    # Add location bias if coordinates available
    if location_bias:
        lat, lng = location_bias
        body["locationBias"] = {
            "circle": {
                "center": {"latitude": lat, "longitude": lng},
                "radius": 50000.0  # 50km radius
            }
        }

    try:
        response = client.post(TEXT_SEARCH_URL, headers=headers, json=body)
        response.raise_for_status()
        data = response.json()
        places = data.get("places", [])
        if places:
            return places[0]
        else:
            logger.warning(f"No place found for: {query}")
            return None
    except httpx.HTTPStatusError as e:
        error_data = {}
        try:
            error_data = e.response.json()
        except Exception:
            pass  # Best effort: the error body may not be JSON
        error_msg = error_data.get("error", {}).get("message", str(e))
        logger.error(f"Google Places API error: {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching for '{query}': {e}")
        return None


def parse_google_place(place: Dict[str, Any]) -> Dict[str, Any]:
    """Parse Google Places API response into enrichment dict.

    FIX: coordinate presence is checked with ``is not None`` so that a
    legitimate 0.0 latitude/longitude is kept rather than dropped as falsy.
    """
    result = {
        "place_id": place.get("id", ""),
        "name": place.get("displayName", {}).get("text", ""),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_status": "OK",
    }

    # Location
    location = place.get("location", {})
    if location.get("latitude") is not None and location.get("longitude") is not None:
        result["coordinates"] = {
            "latitude": location["latitude"],
            "longitude": location["longitude"],
        }
    if place.get("formattedAddress"):
        result["formatted_address"] = place["formattedAddress"]
    if place.get("shortFormattedAddress"):
        result["short_address"] = place["shortFormattedAddress"]

    # Contact
    if place.get("nationalPhoneNumber"):
        result["phone_local"] = place["nationalPhoneNumber"]
    if place.get("internationalPhoneNumber"):
        result["phone_international"] = place["internationalPhoneNumber"]
    if place.get("websiteUri"):
        result["website"] = place["websiteUri"]

    # Business info
    if place.get("types"):
        result["google_place_types"] = place["types"]
    if place.get("primaryType"):
        result["primary_type"] = place["primaryType"]
    if place.get("businessStatus"):
        result["business_status"] = place["businessStatus"]

    # Ratings and reviews (rating can legitimately be absent; 0 counts kept)
    if place.get("rating") is not None:
        result["rating"] = place["rating"]
    if place.get("userRatingCount") is not None:
        result["total_ratings"] = place["userRatingCount"]

    # Parse reviews
    reviews = place.get("reviews", [])
    if reviews:
        result["reviews"] = [
            {
                "author_name": r.get("authorAttribution", {}).get("displayName"),
                "author_uri": r.get("authorAttribution", {}).get("uri"),
                "rating": r.get("rating"),
                "relative_time_description": r.get("relativePublishTimeDescription"),
                "text": r.get("text", {}).get("text"),
                "publish_time": r.get("publishTime"),
            }
            for r in reviews
        ]

    # Opening hours
    if place.get("regularOpeningHours"):
        result["opening_hours"] = {
            "open_now": place.get("currentOpeningHours", {}).get("openNow"),
            "weekday_text": place["regularOpeningHours"].get("weekdayDescriptions"),
        }

    # Editorial summary
    if place.get("editorialSummary"):
        result["editorial_summary"] = place["editorialSummary"].get("text")

    # Photos (just references, not downloading)
    photos = place.get("photos", [])
    if photos:
        result["photo_count"] = len(photos)
        result["photos_metadata"] = [
            {
                "name": p.get("name"),
                "height": p.get("heightPx"),
                "width": p.get("widthPx"),
            }
            for p in photos[:5]  # First 5 only
        ]

    # Links
    if place.get("googleMapsUri"):
        result["google_maps_url"] = place["googleMapsUri"]

    return result


# ============================================================================
# YouTube Enrichment
# ============================================================================

def search_youtube_channel(
    query: str,
    client: httpx.Client,
) -> Optional[Dict[str, Any]]:
    """Search for a YouTube channel with automatic API key rotation.

    Returns:
        {"candidates": [...], "query": query} with up to 3 search hits for
        downstream LLM verification, or None on no results / error.
    """
    if not get_youtube_api_key():
        logger.warning("No YouTube API keys available, skipping YouTube enrichment")
        return None

    params = {
        "part": "snippet",
        "type": "channel",
        "q": query,
        "maxResults": 3,  # Get top 3 for verification
    }

    try:
        data = youtube_api_request(client, "search", params)
        if data is None:
            return None
        items = data.get("items", [])
        if items:
            # Return all candidates for LLM verification
            return {"candidates": items, "query": query}
        return None
    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error searching YouTube for '{query}': {e}")
        return None


def get_youtube_channel_details(
    channel_id: str,
    client: httpx.Client,
) -> Optional[Dict[str, Any]]:
    """Get detailed channel information with automatic API key rotation.

    Requests contentDetails too, so callers can reach the uploads playlist.
    """
    if not get_youtube_api_key():
        return None

    params = {
        "part": "snippet,statistics,brandingSettings,contentDetails",
        "id": channel_id,
    }

    try:
        data = youtube_api_request(client, "channels", params)
        if data is None:
            return None
        items = data.get("items", [])
        if items:
            return items[0]
        return None
    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error getting channel details for '{channel_id}': {e}")
        return None


def parse_youtube_channel(channel: Dict[str, Any]) -> Dict[str, Any]:
    """Parse YouTube channel API response into a flat enrichment dict."""
    snippet = channel.get("snippet", {})
    stats = channel.get("statistics", {})
    # NOTE: removed unused local `branding` (brandingSettings was fetched but
    # never read here).

    result = {
        "channel_id": channel.get("id", ""),
        "channel_url": f"https://www.youtube.com/channel/{channel.get('id', '')}",
        "title": snippet.get("title", ""),
        "description": snippet.get("description", ""),
        "custom_url": snippet.get("customUrl", ""),
        "published_at": snippet.get("publishedAt", ""),
        "country": snippet.get("country", ""),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
    }

    # Statistics (API returns counts as strings)
    if stats.get("subscriberCount"):
        result["subscriber_count"] = int(stats["subscriberCount"])
    if stats.get("videoCount"):
        result["video_count"] = int(stats["videoCount"])
    if stats.get("viewCount"):
        result["view_count"] = int(stats["viewCount"])

    # Thumbnails
    thumbnails = snippet.get("thumbnails", {})
    if thumbnails.get("high", {}).get("url"):
        result["thumbnail_url"] = thumbnails["high"]["url"]

    return result
def get_uploads_playlist_id(channel_data: Dict[str, Any]) -> Optional[str]:
    """
    Return the uploads playlist ID for a channel, if present.

    Every channel's public videos live in an "uploads" playlist whose ID is
    ``UU`` + channel_id (without the ``UC`` prefix); it is exposed under
    ``contentDetails.relatedPlaylists``.

    Args:
        channel_data: Raw channel API response (must include contentDetails part)

    Returns:
        Uploads playlist ID or None if not found
    """
    return (
        channel_data.get("contentDetails", {})
        .get("relatedPlaylists", {})
        .get("uploads")
    )


def get_playlist_videos(
    playlist_id: str,
    client: httpx.Client,
    max_results: int = 50,
) -> List[str]:
    """
    Fetch video IDs from a YouTube playlist with automatic API key rotation.

    Args:
        playlist_id: YouTube playlist ID (e.g., uploads playlist)
        client: httpx Client instance
        max_results: Maximum number of videos to fetch (default 50, max 50 per request)

    Returns:
        List of video IDs
    """
    if not get_youtube_api_key():
        return []

    request_params = {
        "part": "contentDetails",
        "playlistId": playlist_id,
        "maxResults": min(max_results, 50),
    }

    try:
        payload = youtube_api_request(client, "playlistItems", request_params)
        if payload is None:
            return []
        # Keep only items that actually carry a videoId.
        candidate_ids = (
            item.get("contentDetails", {}).get("videoId")
            for item in payload.get("items", [])
        )
        return [vid for vid in candidate_ids if vid]
    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error getting playlist videos for '{playlist_id}': {e}")
        return []


def _parse_video_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one raw videos.list item into the flat video metadata dict."""
    video_id = item.get("id", "")
    snippet = item.get("snippet", {})
    content_details = item.get("contentDetails", {})
    stats = item.get("statistics", {})

    def as_count(field_name: str) -> int:
        # API counts arrive as strings; missing/empty values become 0.
        raw = stats.get(field_name)
        return int(raw) if raw else 0

    parsed = {
        "video_id": video_id,
        "video_url": f"https://www.youtube.com/watch?v={video_id}",
        "title": snippet.get("title", ""),
        "description": snippet.get("description", ""),
        "published_at": snippet.get("publishedAt", ""),
        "duration": content_details.get("duration", ""),
        "view_count": as_count("viewCount"),
        "like_count": as_count("likeCount"),
        "comment_count": as_count("commentCount"),
        "comments": [],  # Placeholder for future comment fetching
    }

    # Get highest quality thumbnail
    thumbnails = snippet.get("thumbnails", {})
    for quality in ("maxres", "high", "medium", "default"):
        url = thumbnails.get(quality, {}).get("url")
        if url:
            parsed["thumbnail_url"] = url
            break

    return parsed


def get_video_details(
    video_ids: List[str],
    client: httpx.Client,
) -> List[Dict[str, Any]]:
    """
    Fetch detailed metadata for multiple videos with automatic API key rotation.

    Args:
        video_ids: List of YouTube video IDs (max 50 per request)
        client: httpx Client instance

    Returns:
        List of parsed video metadata dictionaries
    """
    if not get_youtube_api_key() or not video_ids:
        return []

    # YouTube API accepts comma-separated video IDs (max 50)
    request_params = {
        "part": "snippet,contentDetails,statistics",
        "id": ",".join(video_ids[:50]),
    }

    try:
        payload = youtube_api_request(client, "videos", request_params)
        if payload is None:
            return []
        return [_parse_video_item(item) for item in payload.get("items", [])]
    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error getting video details for {len(video_ids)} videos: {e}")
        return []


def fetch_channel_videos(
    channel_data: Dict[str, Any],
    client: httpx.Client,
    max_videos: int = 50,
) -> List[Dict[str, Any]]:
    """
    Fetch all videos from a YouTube channel.

    This is the main entry point for video fetching. It:
    1. Extracts the uploads playlist ID from channel data
    2. Fetches video IDs from the playlist
    3. Gets detailed metadata for each video

    Args:
        channel_data: Raw channel API response (must include contentDetails part)
        client: httpx Client instance
        max_videos: Maximum number of videos to fetch (default 50)

    Returns:
        List of parsed video metadata dictionaries
    """
    uploads_playlist_id = get_uploads_playlist_id(channel_data)
    if not uploads_playlist_id:
        logger.warning("No uploads playlist found for channel")
        return []

    video_ids = get_playlist_videos(uploads_playlist_id, client, max_videos)
    if not video_ids:
        logger.info("No videos found in uploads playlist")
        return []
    logger.info(f"Found {len(video_ids)} videos in uploads playlist")

    videos = get_video_details(video_ids, client)
    logger.info(f"Fetched details for {len(videos)} videos")
    return videos


# ============================================================================
# Z.AI GLM 4.6 Verification with Exponential Backoff (CH-Annotator)
# ============================================================================
# Retry policy for the GLM 4.6 verification calls.
MAX_RETRIES = 3
BASE_DELAY = 1.0  # seconds
MAX_DELAY = 30.0  # seconds


async def call_glm_with_retry(
    prompt: str,
    max_retries: int = MAX_RETRIES,
) -> Optional[str]:
    """
    Call Z.AI GLM 4.6 API with exponential backoff retry.

    Uses Anthropic-compatible interface at api.z.ai.

    Args:
        prompt: User prompt sent as a single message.
        max_retries: Attempts before giving up (retries only on HTTP 429).

    Returns:
        Response content string or None if all retries fail
    """
    headers = {
        "x-api-key": ZAI_API_TOKEN,
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
    }
    body = {
        "model": ZAI_MODEL,
        "max_tokens": 500,
        "messages": [
            {"role": "user", "content": prompt}
        ],
    }

    for attempt in range(max_retries):
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    f"{ZAI_API_BASE}/messages",
                    headers=headers,
                    json=body,
                    timeout=60.0
                )
                response.raise_for_status()
                data = response.json()

                # Anthropic-compatible response format
                content_blocks = data.get("content", [])
                if content_blocks and content_blocks[0].get("type") == "text":
                    return content_blocks[0].get("text", "")
                return ""
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 429:
                # Rate limited - exponential backoff
                delay = min(BASE_DELAY * (2 ** attempt), MAX_DELAY)
                logger.warning(f"Rate limited, waiting {delay:.1f}s (attempt {attempt + 1}/{max_retries})")
                await asyncio.sleep(delay)
            else:
                logger.error(f"GLM 4.6 API error: {e}")
                return None
        except Exception as e:
            logger.error(f"GLM 4.6 API call failed: {e}")
            return None

    logger.error(f"All {max_retries} GLM 4.6 API retries exhausted")
    return None


def extract_ghcid_type_code(filepath: Path) -> str:
    """
    Extract the institution type code from GHCID filename.

    GHCID format: {COUNTRY}-{REGION}-{CITY}-{TYPE}-{ABBREV}.yaml
    Example: NL-NH-AMS-M-RM.yaml → "M" (Museum)

    Args:
        filepath: Path to custodian YAML file

    Returns:
        Single-letter type code (G, L, A, M, O, R, C, U, B, E, S, F, I, X, P, H, D, N, T)
        or empty string if cannot be extracted
    """
    filename = filepath.stem  # Remove .yaml extension
    parts = filename.split("-")

    # GHCID has at least 5 parts: COUNTRY-REGION-CITY-TYPE-ABBREV
    if len(parts) >= 5:
        # Type code is the 4th part (index 3)
        type_code = parts[3]
        # Type code should be a single letter
        if len(type_code) == 1 and type_code.isalpha():
            return type_code.upper()

    return ""


def get_expected_entity_type(
    institution_type: Any = None,
    filepath: Optional[Path] = None,
) -> Dict[str, Any]:
    """
    Get CH-Annotator entity type based on GHCID type code from filename.

    PRIMARY: Uses GHCID type code from filename (most reliable)
    FALLBACK: Uses Wikidata instance_of text matching

    Args:
        institution_type: Wikidata instance_of value (fallback only)
        filepath: Path to custodian file (primary source)

    Returns:
        CH-Annotator entity type definition dict
    """
    # PRIMARY: Extract type code from GHCID filename
    if filepath:
        type_code = extract_ghcid_type_code(filepath)
        if type_code and type_code in GHCID_TYPE_TO_CH_ANNOTATOR:
            ch_annotator_code = GHCID_TYPE_TO_CH_ANNOTATOR[type_code]
            if ch_annotator_code in CH_ANNOTATOR_ENTITY_TYPES:
                return CH_ANNOTATOR_ENTITY_TYPES[ch_annotator_code]

    # FALLBACK: Use Wikidata instance_of text matching
    # Handle list of types (Wikidata can return multiple instance_of)
    if isinstance(institution_type, list):
        institution_type = " ".join(str(t) for t in institution_type)
    inst_lower = str(institution_type).lower() if institution_type else ""

    # Map Wikidata types to CH-Annotator entity types
    if any(term in inst_lower for term in ["museum", "gallery", "kunsthall"]):
        return CH_ANNOTATOR_ENTITY_TYPES["GRP.HER.MUS"]
    elif any(term in inst_lower for term in ["archive", "archiv", "archief"]):
        return CH_ANNOTATOR_ENTITY_TYPES["GRP.HER.ARC"]
    elif any(term in inst_lower for term in ["library", "bibliothek", "bibliotheek", "biblioteca"]):
        return CH_ANNOTATOR_ENTITY_TYPES["GRP.HER.LIB"]
    elif any(term in inst_lower for term in ["university", "college", "school", "academy"]):
        return CH_ANNOTATOR_ENTITY_TYPES["GRP.EDU"]
    elif any(term in inst_lower for term in ["church", "mosque", "temple", "synagogue", "cathedral"]):
        return CH_ANNOTATOR_ENTITY_TYPES["GRP.REL"]
    elif any(term in inst_lower for term in ["government", "ministry", "department"]):
        return CH_ANNOTATOR_ENTITY_TYPES["GRP.GOV"]
    elif any(term in inst_lower for term in ["botanical", "zoo", "aquarium", "arboretum"]):
        return CH_ANNOTATOR_ENTITY_TYPES["GRP.HER.BOT"]
    elif any(term in inst_lower for term in ["monument", "memorial", "statue", "landmark"]):
        return CH_ANNOTATOR_ENTITY_TYPES["GRP.HER.FEA"]
    else:
        # Default to general heritage institution for custodian files
        return CH_ANNOTATOR_ENTITY_TYPES["GRP.HER"]


async def verify_match_with_llm(
    institution_name: str,
    institution_info: Dict[str, Any],
    candidate_name: str,
    candidate_info: Dict[str, Any],
    match_type: str,  # "google_maps" or "youtube"
    filepath: Optional[Path] = None,
) -> Dict[str, Any]:
    """
    Use Z.AI GLM 4.6 to verify if a candidate match is correct.

    Uses CH-Annotator v1.7.0 entity type definitions for validation.
    Expected entity type derived from GHCID type code in filename.

    Args:
        institution_name: Name of the heritage custodian institution
        institution_info: Dict with wikidata_id, city, country, type
        candidate_name: Name from Google Maps or YouTube
        candidate_info: Dict with place/channel details
        match_type: "google_maps" or "youtube"
        filepath: Path to custodian YAML file (for GHCID type extraction)

    Returns:
        Dict with keys:
        - is_match: bool
        - confidence: float (0.0-1.0)
        - reasoning: str
        - agent: str (model version)
        - entity_type: str (CH-Annotator entity type code)
    """
    if not ZAI_API_TOKEN:
        logger.warning("ZAI_API_TOKEN not set, skipping LLM verification")
        return {
            "is_match": None,
            "confidence": 0.5,
            "reasoning": "LLM verification skipped - no API key",
            "agent": "none",
            "verified": False,
        }

    # Get expected CH-Annotator entity type (PRIMARY: from GHCID, FALLBACK: from Wikidata)
    expected_entity = get_expected_entity_type(
        institution_type=institution_info.get('type', ''),
        filepath=filepath,
    )
    expected_place_types = expected_entity.get("google_place_types", [])

    # Build verification prompt
    if match_type == "google_maps":
        prompt = f"""You are an entity annotator following CH-Annotator v1.7.0 convention.

TASK: Verify if a Google Maps place matches a heritage custodian institution.
== CH-ANNOTATOR ENTITY TYPE == Expected Type: {expected_entity['code']} ({expected_entity['name']}) Definition: {expected_entity['definition']} Ontology Class: {expected_entity['ontology_class']} Expected Google Place Types: {', '.join(expected_place_types[:10])} == SOURCE INSTITUTION (GRP.HER) == - Name: {institution_name} - Wikidata ID: {institution_info.get('wikidata_id', 'N/A')} - City (TOP.SET): {institution_info.get('city', 'N/A')} - Country (TOP.CTY): {institution_info.get('country', 'N/A')} - Instance Type: {institution_info.get('type', 'N/A')} == GOOGLE MAPS CANDIDATE == - Name: {candidate_name} - Address (TOP.ADR): {candidate_info.get('formatted_address', 'N/A')} - Google Place Types: {candidate_info.get('google_place_types', 'N/A')} - Website: {candidate_info.get('website', 'N/A')} - Business Status: {candidate_info.get('business_status', 'N/A')} == VERIFICATION CRITERIA == 1. NAME MATCH: Do the names refer to the same institution? (Allow translations, abbreviations, acronyms) 2. LOCATION MATCH: Is the address in the same city/country? 3. TYPE MATCH: Does Google Place type match expected heritage types (museum, library, archive, gallery)? 4. ENTITY TYPE: Is this truly a {expected_entity['code']} ({expected_entity['name']})? REJECT if: - Different institution with similar name - Google Place types indicate non-heritage (restaurant, hotel, shop) - Location mismatch (different city/country) - Name is a person, not an institution Respond ONLY with JSON (no explanation outside JSON): {{"is_match": true/false, "confidence": 0.0-1.0, "entity_type": "{expected_entity['code']}", "reasoning": "..."}} """ else: # youtube prompt = f"""You are an entity annotator following CH-Annotator v1.7.0 convention. TASK: Verify if a YouTube channel is the official channel of a heritage custodian institution. 
== CH-ANNOTATOR ENTITY TYPE == Expected Type: {expected_entity['code']} ({expected_entity['name']}) Definition: {expected_entity['definition']} Ontology Class: {expected_entity['ontology_class']} == SOURCE INSTITUTION (GRP.HER) == - Name: {institution_name} - Wikidata ID: {institution_info.get('wikidata_id', 'N/A')} - City (TOP.SET): {institution_info.get('city', 'N/A')} - Country (TOP.CTY): {institution_info.get('country', 'N/A')} - Instance Type: {institution_info.get('type', 'N/A')} == YOUTUBE CHANNEL CANDIDATE == - Channel Title: {candidate_name} - Description: {candidate_info.get('description', 'N/A')[:500]} - Country: {candidate_info.get('country', 'N/A')} - Subscribers: {candidate_info.get('subscriber_count', 'N/A')} - Video Count: {candidate_info.get('video_count', 'N/A')} == VERIFICATION CRITERIA == 1. NAME MATCH: Does channel name match institution? (Allow abbreviations, acronyms) 2. DESCRIPTION: Does description mention heritage, culture, museum, archive, library? 3. CONTENT: Is this likely an official institutional channel (not fan-made, personal)? 4. ENTITY TYPE: Is this truly a {expected_entity['code']} ({expected_entity['name']})? 
REJECT if: - Channel is personal/fan-made (not official) - Description indicates unrelated content (gaming, personal vlogs) - Different institution with similar name - Channel is for a different city/country Respond ONLY with JSON (no explanation outside JSON): {{"is_match": true/false, "confidence": 0.0-1.0, "entity_type": "{expected_entity['code']}", "reasoning": "..."}} """ # Call GLM 4.6 API with retry content = await call_glm_with_retry(prompt) if content is None: return { "is_match": None, "confidence": 0.5, "reasoning": "LLM verification failed - API error", "agent": ZAI_MODEL, "verified": False, } # Parse JSON response try: # Extract JSON from response json_match = re.search(r'\{[^}]+\}', content, re.DOTALL) if json_match: result = json.loads(json_match.group()) result["agent"] = ZAI_MODEL result["verified"] = True result["ch_annotator_version"] = CH_ANNOTATOR_VERSION return result except json.JSONDecodeError: pass # Fallback if JSON parsing fails is_match = "true" in content.lower() and "false" not in content.lower() return { "is_match": is_match, "confidence": 0.7 if is_match else 0.3, "reasoning": content[:200], "agent": ZAI_MODEL, "verified": True, "ch_annotator_version": CH_ANNOTATOR_VERSION, } # ============================================================================ # Main Enrichment Pipeline # ============================================================================ async def enrich_custodian_file( filepath: Path, client: httpx.Client, force: bool = False, dry_run: bool = False, youtube_only: bool = False, maps_only: bool = False, ) -> Tuple[bool, str]: """ Enrich a single custodian YAML file with YouTube and Google Maps data. 
Returns: Tuple of (modified: bool, status: str) """ logger.info(f"Processing: {filepath.name}") # Load YAML with open(filepath, 'r', encoding='utf-8') as f: entry = yaml.safe_load(f) if not entry: return False, "Empty file" modified = False statuses = [] # Check if already enriched (including rejections/not found - we've already tried) has_maps = entry.get("google_maps_enrichment") is not None or entry.get("google_maps_status") is not None has_youtube = entry.get("youtube_enrichment") is not None or entry.get("youtube_status") is not None # Determine what needs enrichment based on flags skip_maps = youtube_only or (has_maps and not force) skip_youtube = maps_only or (has_youtube and not force) if skip_maps and skip_youtube: return False, "Already enriched (use --force to re-enrich)" # Extract info for matching institution_name = get_institution_name(entry) if not institution_name: return False, "No institution name found" country_code = get_country_code(entry) city_name = get_city_name(entry) coords = get_coordinates(entry) wikidata_id = get_wikidata_id(entry) institution_info = { "wikidata_id": wikidata_id, "city": city_name, "country": country_code, "type": entry.get("wikidata_enrichment", {}).get("instance_of", ""), } logger.info(f" Institution: {institution_name}") logger.info(f" Location: {city_name}, {country_code}") # ------------------------------------------------------------------------- # Google Maps Enrichment # ------------------------------------------------------------------------- if not skip_maps: query = build_maps_search_query(entry) logger.info(f" Maps query: {query}") time.sleep(REQUEST_DELAY) place = search_google_place(query, client, country_code, coords) if place: maps_data = parse_google_place(place) candidate_name = maps_data.get("name", "") logger.info(f" Maps found: {candidate_name}") # LLM verification (uses GHCID type code from filepath) verification = await verify_match_with_llm( institution_name, institution_info, candidate_name, 
maps_data, "google_maps", filepath=filepath, ) if verification.get("is_match") is True: maps_data["llm_verification"] = verification entry["google_maps_enrichment"] = maps_data entry["google_maps_status"] = "SUCCESS" modified = True statuses.append(f"Maps: {candidate_name} (conf: {verification.get('confidence', 0):.2f})") logger.info(f" ✓ Maps verified: {verification.get('reasoning', '')[:60]}") elif verification.get("is_match") is False: entry["google_maps_status"] = "NO_MATCH" entry["google_maps_rejected"] = { "candidate_name": candidate_name, "rejection_reason": verification.get("reasoning", ""), "timestamp": datetime.now(timezone.utc).isoformat(), } modified = True statuses.append("Maps: rejected by LLM") logger.info(f" ✗ Maps rejected: {verification.get('reasoning', '')[:60]}") else: # Verification skipped or failed - include with warning maps_data["llm_verification"] = verification entry["google_maps_enrichment"] = maps_data entry["google_maps_status"] = "UNVERIFIED" modified = True statuses.append(f"Maps: {candidate_name} (unverified)") else: entry["google_maps_status"] = "NOT_FOUND" entry["google_maps_search_query"] = query entry["google_maps_search_timestamp"] = datetime.now(timezone.utc).isoformat() modified = True statuses.append("Maps: not found") # ------------------------------------------------------------------------- # YouTube Enrichment # ------------------------------------------------------------------------- if not skip_youtube: # Build YouTube search query youtube_query = f"{institution_name} official" logger.info(f" YouTube query: {youtube_query}") time.sleep(REQUEST_DELAY) search_result = search_youtube_channel(youtube_query, client) if search_result and search_result.get("candidates"): candidates = search_result["candidates"] logger.info(f" YouTube candidates: {len(candidates)}") # Try each candidate best_match = None best_verification = None for candidate in candidates[:3]: # Top 3 candidates channel_id = candidate.get("id", 
{}).get("channelId") if not channel_id: continue # Get full channel details time.sleep(REQUEST_DELAY) channel_details = get_youtube_channel_details(channel_id, client) if not channel_details: continue youtube_data = parse_youtube_channel(channel_details) # Fetch individual video metadata videos = fetch_channel_videos(channel_details, client, max_videos=50) if videos: youtube_data["videos"] = videos logger.info(f" Fetched {len(videos)} videos for channel") else: youtube_data["videos"] = [] candidate_name = youtube_data.get("title", "") # LLM verification (uses GHCID type code from filepath) verification = await verify_match_with_llm( institution_name, institution_info, candidate_name, youtube_data, "youtube", filepath=filepath, ) if verification.get("is_match") is True: if best_verification is None or verification.get("confidence", 0) > best_verification.get("confidence", 0): best_match = youtube_data best_verification = verification logger.info(f" YouTube match: {candidate_name} (conf: {verification.get('confidence', 0):.2f})") if best_match: best_match["llm_verification"] = best_verification entry["youtube_enrichment"] = best_match entry["youtube_status"] = "SUCCESS" modified = True statuses.append(f"YouTube: {best_match.get('title', '')} ({best_match.get('subscriber_count', 0)} subs)") else: entry["youtube_status"] = "NO_MATCH" entry["youtube_search_query"] = youtube_query entry["youtube_candidates_rejected"] = len(candidates) entry["youtube_search_timestamp"] = datetime.now(timezone.utc).isoformat() modified = True statuses.append("YouTube: no verified match") else: entry["youtube_status"] = "NOT_FOUND" entry["youtube_search_query"] = youtube_query entry["youtube_search_timestamp"] = datetime.now(timezone.utc).isoformat() modified = True statuses.append("YouTube: not found") # ------------------------------------------------------------------------- # Add provenance note # ------------------------------------------------------------------------- if modified: if 
"provenance" not in entry: entry["provenance"] = {} # Handle notes field - can be string, list, or missing existing_notes = entry["provenance"].get("notes") if existing_notes is None: entry["provenance"]["notes"] = [] elif isinstance(existing_notes, str): # Convert string notes to list entry["provenance"]["notes"] = [existing_notes] # else: it's already a list timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") entry["provenance"]["notes"].append( f"YouTube/Google Maps enrichment {timestamp}: {'; '.join(statuses)}" ) # ------------------------------------------------------------------------- # Save file # ------------------------------------------------------------------------- if modified and not dry_run: with open(filepath, 'w', encoding='utf-8') as f: yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False) logger.info(f" Saved: {filepath.name}") status = "; ".join(statuses) if statuses else "No changes" return modified, status async def main(): """Main entry point.""" parser = argparse.ArgumentParser( description="Enrich custodian files with YouTube and Google Maps data" ) parser.add_argument( "--dry-run", action="store_true", help="Don't save changes, just show what would be done" ) parser.add_argument( "--force", action="store_true", help="Re-enrich even if already enriched" ) parser.add_argument( "--limit", type=int, default=None, help="Limit number of files to process" ) parser.add_argument( "--files", nargs="+", help="Specific files to process (just filenames)" ) parser.add_argument( "--pattern", type=str, default=None, help="Glob pattern for files (e.g., 'ZA-*.yaml')" ) parser.add_argument( "--youtube-only", action="store_true", help="Only enrich YouTube data (skip Google Maps)" ) parser.add_argument( "--maps-only", action="store_true", help="Only enrich Google Maps data (skip YouTube)" ) args = parser.parse_args() # Check for required API keys if not GOOGLE_PLACES_TOKEN and not get_youtube_api_key(): 
logger.error("No API keys found! Set GOOGLE_PLACES_TOKEN or GOOGLE_YOUTUBE_TOKEN") sys.exit(1) # Find files to process if args.files: files = [CUSTODIAN_DIR / f for f in args.files] files = [f for f in files if f.exists()] elif args.pattern: files = sorted(CUSTODIAN_DIR.glob(args.pattern)) else: files = sorted(CUSTODIAN_DIR.glob("*.yaml")) if args.limit: files = files[:args.limit] logger.info(f"Found {len(files)} files to process") if args.dry_run: logger.info("DRY RUN - no files will be modified") if args.youtube_only: logger.info("YOUTUBE-ONLY mode - skipping Google Maps enrichment") elif args.maps_only: logger.info("MAPS-ONLY mode - skipping YouTube enrichment") # Process files results = {"modified": 0, "skipped": 0, "errors": 0} with httpx.Client(timeout=60.0) as client: for filepath in files: try: modified, status = await enrich_custodian_file( filepath, client, args.force, args.dry_run, youtube_only=args.youtube_only, maps_only=args.maps_only, ) if modified: results["modified"] += 1 else: results["skipped"] += 1 logger.info(f" Status: {status}") except YouTubeQuotaExhaustedError: logger.error("=" * 60) logger.error("ALL YOUTUBE API KEYS EXHAUSTED - stopping enrichment") logger.error("=" * 60) break # Exit the loop gracefully except Exception as e: logger.error(f"Error processing {filepath.name}: {e}") results["errors"] += 1 # Rate limiting between files time.sleep(REQUEST_DELAY) # Summary logger.info("=" * 60) logger.info(f"SUMMARY: {results['modified']} modified, {results['skipped']} skipped, {results['errors']} errors") if __name__ == "__main__": asyncio.run(main())