#!/usr/bin/env python3
"""
Enrich Heritage Custodian YAML files with YouTube and Google Maps data.

This script enriches custodian files in data/custodian/ with:
1. YouTube channel/video data (if channel can be found)
2. Google Maps/Places API data (address, ratings, reviews, photos)
3. GLM-4.6 verification of matches (CH-Annotator convention)

Usage:
    python scripts/enrich_custodian_youtube_maps.py [--dry-run] [--limit N] [--force]
    python scripts/enrich_custodian_youtube_maps.py --files FILE1.yaml FILE2.yaml
    python scripts/enrich_custodian_youtube_maps.py --pattern "ZA-*.yaml"

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required for Google Maps enrichment
    GOOGLE_YOUTUBE_TOKEN - Required for YouTube enrichment
    ZAI_API_TOKEN - Required for GLM-4.6 verification (optional but recommended)

Author: GLAM Data Extraction Project
Date: December 2025
"""
|
|
|
|
# --- Standard library ---
import argparse
import asyncio
import fnmatch
import json
import logging
import os
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# --- Third party ---
import httpx
import yaml

# Add project src to path so project-local modules can be imported.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))

# Load environment variables (API tokens) from the repo-root .env file.
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|
|
|
# ============================================================================
# Configuration
# ============================================================================

# Directory containing the custodian YAML files to enrich.
CUSTODIAN_DIR = PROJECT_ROOT / "data/custodian"

# API Keys
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")

# YouTube API keys - rotate through all available keys when quota exceeded.
# Order matters: keys are tried in listed order (v3/v4/v5 first).
YOUTUBE_API_KEYS = [
    os.getenv("GOOGLE_YOUTUBE_TOKEN_v3", ""),
    os.getenv("GOOGLE_YOUTUBE_TOKEN_v4", ""),
    os.getenv("GOOGLE_YOUTUBE_TOKEN_v5", ""),
    os.getenv("GOOGLE_YOUTUBE_TOKEN_v2", ""),
    os.getenv("GOOGLE_YOUTUBE_TOKEN", ""),
]
YOUTUBE_API_KEYS = [k for k in YOUTUBE_API_KEYS if k]  # Filter empty keys
# Index of the key currently in use; advanced by rotate_youtube_api_key().
CURRENT_YOUTUBE_KEY_INDEX = 0

# API Endpoints (defined early for use in helper functions)
YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3"
|
|
|
|
def get_youtube_api_key() -> str:
    """Return the YouTube API key currently in rotation, or "" if none configured."""
    if YOUTUBE_API_KEYS:
        # Modulo keeps the lookup valid no matter how far the index advanced.
        return YOUTUBE_API_KEYS[CURRENT_YOUTUBE_KEY_INDEX % len(YOUTUBE_API_KEYS)]
    return ""
|
|
|
|
def rotate_youtube_api_key() -> bool:
    """Advance to the next YouTube API key.

    Returns:
        True when another key is available; False once every key was tried.
    """
    global CURRENT_YOUTUBE_KEY_INDEX
    CURRENT_YOUTUBE_KEY_INDEX += 1
    total = len(YOUTUBE_API_KEYS)
    if CURRENT_YOUTUBE_KEY_INDEX < total:
        logger.warning(f"Rotating to YouTube API key {CURRENT_YOUTUBE_KEY_INDEX + 1}/{total}")
        return True
    logger.error(f"All {total} YouTube API keys exhausted!")
    return False
|
|
|
|
|
|
def youtube_api_request(
    client: httpx.Client,
    endpoint: str,
    params: Dict[str, Any],
    timeout: float = 30.0,
) -> Optional[Dict[str, Any]]:
    """Call a YouTube Data API v3 endpoint, rotating API keys on quota errors.

    Args:
        client: httpx Client instance.
        endpoint: API endpoint (e.g., "search", "channels", "playlistItems", "videos").
        params: Query parameters (key will be added automatically).
        timeout: Request timeout in seconds.

    Returns:
        JSON response dict, or None on a non-quota error or when no key is set.

    Raises:
        YouTubeQuotaExhaustedError: every configured key hit its quota.
    """
    url = f"{YOUTUBE_API_BASE}/{endpoint}"

    while True:
        api_key = get_youtube_api_key()
        if not api_key:
            logger.error("No YouTube API keys available")
            return None

        try:
            response = client.get(
                url, params={**params, "key": api_key}, timeout=timeout
            )
            response.raise_for_status()
            return response.json()

        except httpx.HTTPStatusError as e:
            error_text = str(e.response.text) if hasattr(e, 'response') else str(e)
            # Any 403, or an explicit quota/rate-limit marker, triggers rotation.
            is_quota_error = (
                e.response.status_code == 403
                or "quotaExceeded" in error_text
                or "rateLimitExceeded" in error_text
            )
            if not is_quota_error:
                logger.error(f"YouTube API error: {e}")
                return None
            logger.warning(f"YouTube API quota/rate limit hit for key {CURRENT_YOUTUBE_KEY_INDEX + 1}")
            if not rotate_youtube_api_key():
                # All keys exhausted — surface to the caller.
                raise YouTubeQuotaExhaustedError("All YouTube API keys exhausted")
            # Otherwise fall through and retry with the freshly rotated key.

        except Exception as e:
            logger.error(f"Error making YouTube API request to {endpoint}: {e}")
            return None
|
|
|
|
|
|
class YouTubeQuotaExhaustedError(Exception):
    """Signals that every configured YouTube API key has hit its quota."""
|
|
|
|
|
|
# For backwards compatibility (deprecated - use get_youtube_api_key())
GOOGLE_YOUTUBE_TOKEN = YOUTUBE_API_KEYS[0] if YOUTUBE_API_KEYS else ""

# Z.AI GLM 4.6 API for CH-Annotator verification (NOT Anthropic Claude)
ZAI_API_TOKEN = os.getenv("ZAI_API_TOKEN", "")

# API Endpoints
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
# Z.AI GLM 4.6 API endpoint (Anthropic-compatible interface)
ZAI_API_BASE = "https://api.z.ai/api/anthropic/v1"
ZAI_MODEL = "glm-4.6"

# Rate limiting
REQUEST_DELAY = 0.3  # seconds between API calls

# CH-Annotator convention version (recorded in enriched files for provenance)
CH_ANNOTATOR_VERSION = "ch_annotator-v1_7_0"
|
|
|
|
# CH-Annotator entity type definitions for heritage custodians
# From: data/entity_annotation/ch_annotator-v1_7_0.yaml
# Extended with GLAMORCUBESFIXPHDNT taxonomy subtypes
# Each entry maps a CH-Annotator code to its display name, definition,
# ontology class, one-letter GLAMORCUBESFIXPHDNT code, and the Google
# Places / Wikidata type strings used for cross-source matching.
CH_ANNOTATOR_ENTITY_TYPES = {
    # === HERITAGE INSTITUTION SUBTYPES (GRP.HER.*) ===
    "GRP.HER.MUS": {
        "code": "GRP.HER.MUS",
        "name": "Museum",
        "definition": "Museums of all types: art, history, science, natural history, etc.",
        "ontology_class": "schema:Museum",
        "glamorcubesfixphdnt_code": "M",
        "google_place_types": [
            "museum", "art_museum", "history_museum", "natural_history_museum",
            "science_museum", "children's_museum", "war_memorial",
            "tourist_attraction", "point_of_interest",
        ],
        "wikidata_types": ["museum", "art museum", "history museum", "science museum"],
    },
    "GRP.HER.GAL": {
        "code": "GRP.HER.GAL",
        "name": "Gallery",
        "definition": "Art galleries, exhibition spaces, and kunsthallen",
        "ontology_class": "schema:ArtGallery",
        "glamorcubesfixphdnt_code": "G",
        "google_place_types": [
            "art_gallery", "museum", "tourist_attraction", "point_of_interest",
        ],
        "wikidata_types": ["art gallery", "gallery", "kunsthalle"],
    },
    "GRP.HER.LIB": {
        "code": "GRP.HER.LIB",
        "name": "Library",
        "definition": "Libraries: public, academic, national, special",
        "ontology_class": "schema:Library",
        "glamorcubesfixphdnt_code": "L",
        "google_place_types": [
            "library", "public_library", "point_of_interest", "establishment",
        ],
        "wikidata_types": ["library", "public library", "national library", "academic library"],
    },
    "GRP.HER.ARC": {
        "code": "GRP.HER.ARC",
        "name": "Archive",
        "definition": "Archives: government, corporate, religious, personal",
        "ontology_class": "schema:ArchiveOrganization",
        "glamorcubesfixphdnt_code": "A",
        "google_place_types": [
            "archive", "government_office", "city_hall", "local_government_office",
            "point_of_interest", "establishment",
        ],
        "wikidata_types": ["archive", "national archive", "state archive", "city archive"],
    },
    # Catch-all heritage type used when no subtype can be determined.
    "GRP.HER": {
        "code": "GRP.HER",
        "name": "Heritage Institution (General)",
        "definition": "Heritage institutions: museums, archives, libraries, galleries (unspecified subtype)",
        "ontology_class": "glam:HeritageCustodian",
        "glamorcubesfixphdnt_code": "X",
        "close_mappings": ["schema:Museum", "schema:Library", "schema:ArchiveOrganization"],
        "google_place_types": [
            # Museums
            "museum", "art_gallery", "art_museum", "history_museum",
            "natural_history_museum", "science_museum", "war_memorial",
            # Libraries
            "library", "public_library", "research_library",
            # Archives
            "archive", "government_office", "city_hall", "local_government_office",
            # Cultural centers
            "cultural_center", "community_center", "performing_arts_theater",
            "tourist_attraction", "point_of_interest", "establishment",
        ],
        "wikidata_types": ["museum", "library", "archive", "cultural institution", "heritage institution"],
    },
    # === OTHER HERITAGE-ADJACENT TYPES ===
    "GRP.HER.RES": {
        "code": "GRP.HER.RES",
        "name": "Research Center",
        "definition": "Research institutes and documentation centers with heritage focus",
        "ontology_class": "schema:ResearchOrganization",
        "glamorcubesfixphdnt_code": "R",
        "google_place_types": [
            "research_institute", "university", "point_of_interest", "establishment",
        ],
        "wikidata_types": ["research institute", "documentation center", "research center"],
    },
    "GRP.HER.BOT": {
        "code": "GRP.HER.BOT",
        "name": "Botanical Garden / Zoo",
        "definition": "Botanical gardens and zoological parks",
        "ontology_class": "schema:Zoo",
        "glamorcubesfixphdnt_code": "B",
        "google_place_types": [
            "zoo", "aquarium", "park", "tourist_attraction", "point_of_interest",
        ],
        "wikidata_types": ["botanical garden", "zoo", "arboretum", "aquarium"],
    },
    "GRP.HER.HOL": {
        "code": "GRP.HER.HOL",
        "name": "Holy Site",
        "definition": "Religious heritage sites with collections (churches, temples, mosques)",
        "ontology_class": "schema:PlaceOfWorship",
        "glamorcubesfixphdnt_code": "H",
        "google_place_types": [
            "church", "mosque", "synagogue", "hindu_temple", "buddhist_temple",
            "place_of_worship", "tourist_attraction",
        ],
        "wikidata_types": ["church", "cathedral", "monastery", "abbey", "temple", "mosque", "synagogue"],
    },
    "GRP.HER.FEA": {
        "code": "GRP.HER.FEA",
        "name": "Heritage Feature",
        "definition": "Monuments, sculptures, memorials, landmarks",
        "ontology_class": "schema:LandmarksOrHistoricalBuildings",
        "glamorcubesfixphdnt_code": "F",
        "google_place_types": [
            "monument", "landmark", "historical_landmark", "tourist_attraction",
            "point_of_interest", "cultural_landmark",
        ],
        "wikidata_types": ["monument", "memorial", "statue", "sculpture", "landmark"],
    },
    # === NON-HERITAGE ORGANIZATION TYPES ===
    "GRP.EDU": {
        "code": "GRP.EDU",
        "name": "Educational Institution",
        "definition": "Universities, schools, and educational institutions",
        "ontology_class": "schema:EducationalOrganization",
        "glamorcubesfixphdnt_code": "E",
        "google_place_types": ["university", "school", "college", "educational_institution"],
        "wikidata_types": ["university", "school", "college", "academy"],
    },
    "GRP.GOV": {
        "code": "GRP.GOV",
        "name": "Government Organization",
        "definition": "Government agencies, legislatures, and public bodies",
        "ontology_class": "schema:GovernmentOrganization",
        "glamorcubesfixphdnt_code": "O",
        "google_place_types": ["government_office", "city_hall", "embassy", "courthouse"],
        "wikidata_types": ["government agency", "ministry", "department"],
    },
    "GRP.REL": {
        "code": "GRP.REL",
        "name": "Religious Organization",
        "definition": "Religious organizations, denominations, and congregations",
        "ontology_class": "schema:ReligiousOrganization",
        "glamorcubesfixphdnt_code": "H",
        "google_place_types": ["church", "mosque", "synagogue", "hindu_temple", "buddhist_temple"],
        "wikidata_types": ["religious organization", "church", "congregation"],
    },
    "GRP.COR": {
        "code": "GRP.COR",
        "name": "Corporation",
        "definition": "Commercial companies and businesses with heritage collections",
        "ontology_class": "schema:Corporation",
        "glamorcubesfixphdnt_code": "C",
        "google_place_types": ["corporate_office", "headquarters", "business", "establishment"],
        "wikidata_types": ["company", "corporation", "business"],
    },
}
|
|
|
|
# Mapping from GHCID type codes to CH-Annotator entity types.
# The single-letter code comes from the GHCID filename segment
# ({COUNTRY}-{REGION}-{CITY}-{TYPE}-{ABBREV}); letters without a dedicated
# CH-Annotator subtype fall back to the generic "GRP.HER".
GHCID_TYPE_TO_CH_ANNOTATOR = {
    "G": "GRP.HER.GAL",  # Gallery
    "L": "GRP.HER.LIB",  # Library
    "A": "GRP.HER.ARC",  # Archive
    "M": "GRP.HER.MUS",  # Museum
    "O": "GRP.GOV",  # Official institution
    "R": "GRP.HER.RES",  # Research center
    "C": "GRP.COR",  # Corporation
    "U": "GRP.HER",  # Unknown (defaults to general heritage)
    "B": "GRP.HER.BOT",  # Botanical garden / Zoo
    "E": "GRP.EDU",  # Education provider
    "S": "GRP.HER",  # Collecting society
    "F": "GRP.HER.FEA",  # Features (monuments)
    "I": "GRP.HER",  # Intangible heritage group
    "X": "GRP.HER",  # Mixed
    "P": "GRP.HER",  # Personal collection
    "H": "GRP.HER.HOL",  # Holy sites
    "D": "GRP.HER",  # Digital platform
    "N": "GRP.HER",  # NGO
    "T": "GRP.HER",  # Taste/smell heritage
}
|
|
|
|
# Google Places fields to request.
# Joined into the X-Goog-FieldMask header as "places.<field>" entries
# by search_google_place().
PLACE_FIELDS = [
    "id", "displayName", "formattedAddress", "addressComponents",
    "location", "types", "businessStatus", "internationalPhoneNumber",
    "nationalPhoneNumber", "regularOpeningHours", "currentOpeningHours",
    "websiteUri", "rating", "userRatingCount", "reviews", "priceLevel",
    "photos", "googleMapsUri", "utcOffsetMinutes", "primaryType",
    "primaryTypeDisplayName", "shortFormattedAddress", "editorialSummary",
]
|
|
|
|
# ============================================================================
|
|
# Utility Functions
|
|
# ============================================================================
|
|
|
|
def get_institution_name(entry: Dict[str, Any]) -> str:
    """Extract the best display/search name for a custodian entry.

    PRIORITY ORDER (emic/native names first for better YouTube matching):
    1. custodian_name.emic_name - Native language/script name (best for YouTube search)
    2. custodian_name.claim_value - Standardized name
    3. wikidata native label - Wikidata label in institution's country language
    4. original_entry fields - Source data
    5. enrichment data - Google Maps, ZCBS, etc.

    Returns "" when no name can be found.
    """
    # NOTE: `entry.get(key, {})` returns None (not {}) when the key exists
    # with a YAML null value, so every sub-dict is normalized with `or {}`.
    custodian_name = entry.get("custodian_name") or {}

    # PRIORITY 1: Emic name (native language/script) - BEST for YouTube search
    if custodian_name.get("emic_name"):
        return custodian_name["emic_name"]

    # PRIORITY 2: Standardized custodian name
    if custodian_name.get("claim_value"):
        return custodian_name["claim_value"]

    # PRIORITY 3: Wikidata label in native language (if available)
    # Try to get label in institution's language first
    wikidata = entry.get("wikidata_enrichment") or {}
    country_code = get_country_code(entry)
    # Map country codes to likely Wikidata language codes
    country_to_lang = {
        "JP": "wikidata_label_ja", "CN": "wikidata_label_zh", "KR": "wikidata_label_ko",
        "RU": "wikidata_label_ru", "UA": "wikidata_label_uk", "GR": "wikidata_label_el",
        "IL": "wikidata_label_he", "IR": "wikidata_label_fa", "SA": "wikidata_label_ar",
        "EG": "wikidata_label_ar", "TH": "wikidata_label_th", "VN": "wikidata_label_vi",
        "DE": "wikidata_label_de", "FR": "wikidata_label_fr", "ES": "wikidata_label_es",
        "IT": "wikidata_label_it", "PT": "wikidata_label_pt", "BR": "wikidata_label_pt",
        "NL": "wikidata_label_nl", "BE": "wikidata_label_nl", "PL": "wikidata_label_pl",
        "CZ": "wikidata_label_cs", "HU": "wikidata_label_hu", "RO": "wikidata_label_ro",
        "BG": "wikidata_label_bg", "RS": "wikidata_label_sr", "HR": "wikidata_label_hr",
        "SI": "wikidata_label_sl", "SK": "wikidata_label_sk", "TR": "wikidata_label_tr",
        "IN": "wikidata_label_hi", "ID": "wikidata_label_id", "MY": "wikidata_label_ms",
        "PH": "wikidata_label_tl", "SE": "wikidata_label_sv", "NO": "wikidata_label_no",
        "DK": "wikidata_label_da", "FI": "wikidata_label_fi", "EE": "wikidata_label_et",
        "LV": "wikidata_label_lv", "LT": "wikidata_label_lt",
    }
    native_label_key = country_to_lang.get(country_code)
    if native_label_key and wikidata.get(native_label_key):
        return wikidata[native_label_key]

    # Fall back to English Wikidata label
    if wikidata.get("wikidata_label_en"):
        return wikidata["wikidata_label_en"]

    # PRIORITY 4: Original entry fields
    original = entry.get("original_entry") or {}
    if original.get("name"):
        return original["name"]
    if original.get("organisatie"):
        return original["organisatie"]

    # PRIORITY 5: Enrichment data
    zcbs = entry.get("zcbs_enrichment") or {}
    if zcbs.get("zcbs_name"):
        return zcbs["zcbs_name"]
    maps = entry.get("google_maps_enrichment") or {}
    if maps.get("place_name"):
        return maps["place_name"]

    return ""
|
|
|
|
|
|
def get_country_code(entry: Dict[str, Any]) -> str:
    """Return the two-letter country code for an entry ("" if unknown).

    Prefers the resolved location's country_code; otherwise parses the
    leading segment of the GHCID ({COUNTRY}-{REGION}-{CITY}-{TYPE}-{ABBREV}).

    Sub-dicts are normalized with `or {}` because `entry.get(key, {})`
    returns None when the key holds a YAML null.
    """
    ghcid_block = entry.get("ghcid") or {}
    loc = ghcid_block.get("location_resolution") or {}
    if loc.get("country_code"):
        return loc["country_code"]

    # Parse from GHCID prefix
    ghcid = ghcid_block.get("ghcid_current") or ""
    if ghcid and "-" in ghcid:
        return ghcid.split("-")[0]
    return ""
|
|
|
|
|
|
def get_coordinates(entry: Dict[str, Any]) -> Optional[Tuple[float, float]]:
    """Return (latitude, longitude) from the entry's source coordinates, or None.

    Uses explicit `is not None` checks so that legitimate 0.0 values
    (equator / prime meridian) are not dropped by truthiness testing,
    and `or {}` guards against YAML-null sub-dicts.
    """
    loc = (entry.get("ghcid") or {}).get("location_resolution") or {}
    src = loc.get("source_coordinates") or {}
    lat = src.get("latitude")
    lng = src.get("longitude")
    if lat is not None and lng is not None:
        return (lat, lng)
    return None
|
|
|
|
|
|
def get_city_name(entry: Dict[str, Any]) -> str:
    """Return the resolved city name for an entry ("" if unresolved).

    `or {}` guards against YAML-null values for ghcid / location_resolution.
    """
    loc = (entry.get("ghcid") or {}).get("location_resolution") or {}
    return loc.get("city_name", "")
|
|
|
|
|
|
def get_wikidata_id(entry: Dict[str, Any]) -> str:
    """Return the Wikidata entity ID for an entry ("" if none recorded).

    Checks the wikidata_enrichment section first, then the original source
    entry. `or {}` guards against YAML-null sub-dicts.
    """
    wikidata = entry.get("wikidata_enrichment") or {}
    if wikidata.get("wikidata_entity_id"):
        return wikidata["wikidata_entity_id"]
    original = entry.get("original_entry") or {}
    if original.get("wikidata_id"):
        return original["wikidata_id"]
    return ""
|
|
|
|
|
|
# ============================================================================
|
|
# Google Maps Enrichment
|
|
# ============================================================================
|
|
|
|
def build_maps_search_query(entry: Dict[str, Any]) -> str:
    """Compose a "name, city, country" free-text query for Places text search.

    Empty components are skipped, so the result may be a subset of the
    three parts (or "" when nothing is known about the entry).
    """
    loc = entry.get("ghcid", {}).get("location_resolution", {})
    candidates = [
        get_institution_name(entry),
        get_city_name(entry),
        loc.get("country_label", ""),
    ]
    return ", ".join(part for part in candidates if part)
|
|
|
|
|
|
def search_google_place(
    query: str,
    client: httpx.Client,
    country_code: str = "",
    location_bias: Optional[Tuple[float, float]] = None,
) -> Optional[Dict[str, Any]]:
    """Look up one place via the Google Places API (New) text-search endpoint.

    Args:
        query: Free-text search string (name, city, country).
        client: Shared httpx client.
        country_code: Two-letter code; ZA and ZW get explicit region hints.
        location_bias: Optional (lat, lng) centre for a 50 km search bias.

    Returns:
        The single best-matching place dict, or None on no match / error.
    """
    if not GOOGLE_PLACES_TOKEN:
        logger.warning("GOOGLE_PLACES_TOKEN not set, skipping Maps enrichment")
        return None

    field_mask = ",".join(f"places.{f}" for f in PLACE_FIELDS)
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_TOKEN,
        "X-Goog-FieldMask": field_mask,
    }

    body: Dict[str, Any] = {
        "textQuery": query,
        "maxResultCount": 1,
    }

    # Explicit language/region hints for the countries this pipeline targets.
    if country_code in ("ZA", "ZW"):
        body["languageCode"] = "en"
        body["regionCode"] = country_code

    # Bias results towards known coordinates when available.
    if location_bias:
        lat, lng = location_bias
        body["locationBias"] = {
            "circle": {
                "center": {"latitude": lat, "longitude": lng},
                "radius": 50000.0  # 50km radius
            }
        }

    try:
        response = client.post(TEXT_SEARCH_URL, headers=headers, json=body)
        response.raise_for_status()
        places = response.json().get("places", [])
        if not places:
            logger.warning(f"No place found for: {query}")
            return None
        return places[0]

    except httpx.HTTPStatusError as e:
        # Prefer the structured error message from the API body when present.
        try:
            error_data = e.response.json()
        except Exception:
            error_data = {}
        error_msg = error_data.get("error", {}).get("message", str(e))
        logger.error(f"Google Places API error: {error_msg}")
        return None
    except Exception as e:
        logger.error(f"Error searching for '{query}': {e}")
        return None
|
|
|
|
|
|
def parse_google_place(place: Dict[str, Any]) -> Dict[str, Any]:
    """Parse a Google Places API (New) place object into the enrichment schema.

    Only fields present in the response are copied into the result; the
    result always contains place_id, name, fetch_timestamp and api_status.
    """
    result = {
        "place_id": place.get("id", ""),
        "name": place.get("displayName", {}).get("text", ""),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_status": "OK",
    }

    # Location — compare against None so valid 0.0 coordinates
    # (equator / prime meridian) are not dropped by truthiness testing.
    location = place.get("location", {})
    lat = location.get("latitude")
    lng = location.get("longitude")
    if lat is not None and lng is not None:
        result["coordinates"] = {
            "latitude": lat,
            "longitude": lng,
        }

    if place.get("formattedAddress"):
        result["formatted_address"] = place["formattedAddress"]
    if place.get("shortFormattedAddress"):
        result["short_address"] = place["shortFormattedAddress"]

    # Contact
    if place.get("nationalPhoneNumber"):
        result["phone_local"] = place["nationalPhoneNumber"]
    if place.get("internationalPhoneNumber"):
        result["phone_international"] = place["internationalPhoneNumber"]
    if place.get("websiteUri"):
        result["website"] = place["websiteUri"]

    # Business info
    if place.get("types"):
        result["google_place_types"] = place["types"]
    if place.get("primaryType"):
        result["primary_type"] = place["primaryType"]
    if place.get("businessStatus"):
        result["business_status"] = place["businessStatus"]

    # Ratings and reviews (rating 0 is conceivable, hence explicit None checks)
    if place.get("rating") is not None:
        result["rating"] = place["rating"]
    if place.get("userRatingCount") is not None:
        result["total_ratings"] = place["userRatingCount"]

    # Parse reviews
    reviews = place.get("reviews", [])
    if reviews:
        result["reviews"] = [
            {
                "author_name": r.get("authorAttribution", {}).get("displayName"),
                "author_uri": r.get("authorAttribution", {}).get("uri"),
                "rating": r.get("rating"),
                "relative_time_description": r.get("relativePublishTimeDescription"),
                "text": r.get("text", {}).get("text"),
                "publish_time": r.get("publishTime"),
            }
            for r in reviews
        ]

    # Opening hours
    if place.get("regularOpeningHours"):
        result["opening_hours"] = {
            "open_now": place.get("currentOpeningHours", {}).get("openNow"),
            "weekday_text": place["regularOpeningHours"].get("weekdayDescriptions"),
        }

    # Editorial summary
    if place.get("editorialSummary"):
        result["editorial_summary"] = place["editorialSummary"].get("text")

    # Photos (just references, not downloading)
    photos = place.get("photos", [])
    if photos:
        result["photo_count"] = len(photos)
        result["photos_metadata"] = [
            {
                "name": p.get("name"),
                "height": p.get("heightPx"),
                "width": p.get("widthPx"),
            }
            for p in photos[:5]  # First 5 only
        ]

    # Links
    if place.get("googleMapsUri"):
        result["google_maps_url"] = place["googleMapsUri"]

    return result
|
|
|
|
|
|
# ============================================================================
|
|
# YouTube Enrichment
|
|
# ============================================================================
|
|
|
|
def search_youtube_channel(
    query: str,
    client: httpx.Client,
) -> Optional[Dict[str, Any]]:
    """Search YouTube for candidate channels matching a query.

    Returns {"candidates": [...], "query": query} with up to three search
    hits (for downstream LLM verification), or None when nothing matched
    or no API key is available. Propagates YouTubeQuotaExhaustedError.
    """
    if not get_youtube_api_key():
        logger.warning("No YouTube API keys available, skipping YouTube enrichment")
        return None

    search_params = {
        "part": "snippet",
        "type": "channel",
        "q": query,
        "maxResults": 3,  # Get top 3 for verification
    }

    try:
        data = youtube_api_request(client, "search", search_params)
    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error searching YouTube for '{query}': {e}")
        return None

    if not data:
        return None
    items = data.get("items", [])
    return {"candidates": items, "query": query} if items else None
|
|
|
|
|
|
def get_youtube_channel_details(
    channel_id: str,
    client: httpx.Client,
) -> Optional[Dict[str, Any]]:
    """Fetch full channel metadata (snippet, statistics, branding, contentDetails).

    Returns the first matching channel item, or None on error / no match.
    Propagates YouTubeQuotaExhaustedError.
    """
    if not get_youtube_api_key():
        return None

    request_params = {
        "part": "snippet,statistics,brandingSettings,contentDetails",
        "id": channel_id,
    }

    try:
        data = youtube_api_request(client, "channels", request_params)
    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error getting channel details for '{channel_id}': {e}")
        return None

    if data is None:
        return None
    items = data.get("items", [])
    return items[0] if items else None
|
|
|
|
|
|
def parse_youtube_channel(channel: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten a YouTube channels.list item into the enrichment schema."""
    snippet = channel.get("snippet", {})
    stats = channel.get("statistics", {})
    branding = channel.get("brandingSettings", {})  # fetched but not yet used

    channel_id = channel.get("id", "")
    result = {
        "channel_id": channel_id,
        "channel_url": f"https://www.youtube.com/channel/{channel_id}",
        "title": snippet.get("title", ""),
        "description": snippet.get("description", ""),
        "custom_url": snippet.get("customUrl", ""),
        "published_at": snippet.get("publishedAt", ""),
        "country": snippet.get("country", ""),
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
    }

    # Statistics arrive as strings; convert the ones that are present.
    for api_key_name, out_key in (
        ("subscriberCount", "subscriber_count"),
        ("videoCount", "video_count"),
        ("viewCount", "view_count"),
    ):
        if stats.get(api_key_name):
            result[out_key] = int(stats[api_key_name])

    # High-resolution thumbnail, when provided.
    high_thumb = snippet.get("thumbnails", {}).get("high", {}).get("url")
    if high_thumb:
        result["thumbnail_url"] = high_thumb

    return result
|
|
|
|
|
|
def get_uploads_playlist_id(channel_data: Dict[str, Any]) -> Optional[str]:
    """Return the channel's uploads playlist ID, or None if absent.

    The uploads playlist lists every public video of the channel; its ID is
    the channel ID with the leading "UC" replaced by "UU". The channel data
    must have been fetched with the contentDetails part.

    Args:
        channel_data: Raw channels.list item.

    Returns:
        Uploads playlist ID string, or None when not present.
    """
    playlists = channel_data.get("contentDetails", {}).get("relatedPlaylists", {})
    return playlists.get("uploads")
|
|
|
|
|
|
def get_playlist_videos(
    playlist_id: str,
    client: httpx.Client,
    max_results: int = 50,
) -> List[str]:
    """Return up to max_results video IDs from a playlist (one API page).

    Args:
        playlist_id: YouTube playlist ID (e.g., uploads playlist).
        client: httpx Client instance.
        max_results: Cap on videos fetched; the API allows at most 50 per page.

    Returns:
        List of video IDs ([] on error). Propagates YouTubeQuotaExhaustedError.
    """
    if not get_youtube_api_key():
        return []

    request_params = {
        "part": "contentDetails",
        "playlistId": playlist_id,
        "maxResults": min(max_results, 50),  # API page-size ceiling
    }

    try:
        data = youtube_api_request(client, "playlistItems", request_params)
        if data is None:
            return []

        collected: List[str] = []
        for playlist_item in data.get("items", []):
            vid = playlist_item.get("contentDetails", {}).get("videoId")
            if vid:
                collected.append(vid)
        return collected

    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error getting playlist videos for '{playlist_id}': {e}")
        return []
|
|
|
|
|
|
def get_video_details(
    video_ids: List[str],
    client: httpx.Client,
) -> List[Dict[str, Any]]:
    """Fetch snippet/contentDetails/statistics metadata for up to 50 videos.

    Args:
        video_ids: YouTube video IDs (only the first 50 are requested).
        client: httpx Client instance.

    Returns:
        List of parsed video metadata dicts ([] on error).
        Propagates YouTubeQuotaExhaustedError.
    """
    if not video_ids or not get_youtube_api_key():
        return []

    # The videos endpoint accepts comma-separated IDs, capped at 50 per call.
    request_params = {
        "part": "snippet,contentDetails,statistics",
        "id": ",".join(video_ids[:50]),
    }

    def _stat_as_int(stats: Dict[str, Any], key: str) -> int:
        # Statistics arrive as strings and may be missing entirely.
        raw = stats.get(key)
        return int(raw) if raw else 0

    try:
        data = youtube_api_request(client, "videos", request_params)
        if data is None:
            return []

        parsed: List[Dict[str, Any]] = []
        for item in data.get("items", []):
            vid = item.get("id", "")
            snippet = item.get("snippet", {})
            details = item.get("contentDetails", {})
            stats = item.get("statistics", {})

            record = {
                "video_id": vid,
                "video_url": f"https://www.youtube.com/watch?v={vid}",
                "title": snippet.get("title", ""),
                "description": snippet.get("description", ""),
                "published_at": snippet.get("publishedAt", ""),
                "duration": details.get("duration", ""),
                "view_count": _stat_as_int(stats, "viewCount"),
                "like_count": _stat_as_int(stats, "likeCount"),
                "comment_count": _stat_as_int(stats, "commentCount"),
                "comments": [],  # Placeholder for future comment fetching
            }

            # Pick the best available thumbnail, highest quality first.
            thumbnails = snippet.get("thumbnails", {})
            for quality in ("maxres", "high", "medium", "default"):
                url = thumbnails.get(quality, {}).get("url")
                if url:
                    record["thumbnail_url"] = url
                    break

            parsed.append(record)

        return parsed

    except YouTubeQuotaExhaustedError:
        raise  # Let caller handle exhausted keys
    except Exception as e:
        logger.error(f"Error getting video details for {len(video_ids)} videos: {e}")
        return []
|
|
|
|
|
|
def fetch_channel_videos(
    channel_data: Dict[str, Any],
    client: httpx.Client,
    max_videos: int = 50,
) -> List[Dict[str, Any]]:
    """Fetch metadata for a channel's uploaded videos.

    Pipeline: uploads-playlist ID -> playlist video IDs -> per-video details.

    Args:
        channel_data: Raw channel API response (must include contentDetails part).
        client: httpx Client instance.
        max_videos: Maximum number of videos to fetch (default 50).

    Returns:
        List of parsed video metadata dictionaries (possibly empty).
    """
    playlist_id = get_uploads_playlist_id(channel_data)
    if not playlist_id:
        logger.warning("No uploads playlist found for channel")
        return []

    ids = get_playlist_videos(playlist_id, client, max_videos)
    if not ids:
        logger.info("No videos found in uploads playlist")
        return []
    logger.info(f"Found {len(ids)} videos in uploads playlist")

    details = get_video_details(ids, client)
    logger.info(f"Fetched details for {len(details)} videos")
    return details
|
|
|
|
|
|
# ============================================================================
# Z.AI GLM 4.6 Verification with Exponential Backoff (CH-Annotator)
# ============================================================================

# Exponential-backoff settings for GLM 4.6 calls: the delay doubles per
# attempt (BASE_DELAY * 2**attempt) and is capped at MAX_DELAY.
MAX_RETRIES = 3
BASE_DELAY = 1.0  # seconds
MAX_DELAY = 30.0  # seconds
|
|
|
|
|
|
async def call_glm_with_retry(
    prompt: str,
    max_retries: int = MAX_RETRIES,
) -> Optional[str]:
    """
    Call Z.AI GLM 4.6 API with exponential backoff retry.

    Uses the Anthropic-compatible interface at api.z.ai. Only HTTP 429
    (rate limit) responses are retried; any other HTTP error or exception
    aborts immediately.

    Args:
        prompt: User prompt, sent as a single-message conversation.
        max_retries: Maximum number of attempts (default MAX_RETRIES).

    Returns:
        Response content string (possibly empty if the reply had no text
        block), or None if the call failed or all retries were exhausted.
    """
    headers = {
        "x-api-key": ZAI_API_TOKEN,
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
    }

    body = {
        "model": ZAI_MODEL,
        "max_tokens": 500,
        "messages": [
            {"role": "user", "content": prompt}
        ],
    }

    # FIX: reuse a single AsyncClient (and its connection pool) across all
    # attempts instead of constructing a new one per retry.
    async with httpx.AsyncClient(timeout=60.0) as client:
        for attempt in range(max_retries):
            try:
                response = await client.post(
                    f"{ZAI_API_BASE}/messages",
                    headers=headers,
                    json=body,
                )
                response.raise_for_status()
                data = response.json()

                # Anthropic-compatible response format: a list of content
                # blocks; we only consume a leading text block.
                content_blocks = data.get("content", [])
                if content_blocks and content_blocks[0].get("type") == "text":
                    return content_blocks[0].get("text", "")
                return ""

            except httpx.HTTPStatusError as e:
                if e.response.status_code != 429:
                    logger.error(f"GLM 4.6 API error: {e}")
                    return None
                if attempt + 1 >= max_retries:
                    # FIX: last attempt was also rate-limited — give up now
                    # instead of sleeping (up to MAX_DELAY) for nothing.
                    break
                # Rate limited - exponential backoff before the next attempt.
                delay = min(BASE_DELAY * (2 ** attempt), MAX_DELAY)
                logger.warning(f"Rate limited, waiting {delay:.1f}s (attempt {attempt + 1}/{max_retries})")
                await asyncio.sleep(delay)
            except Exception as e:
                # Network/parse failures are not retried (same as original).
                logger.error(f"GLM 4.6 API call failed: {e}")
                return None

    logger.error(f"All {max_retries} GLM 4.6 API retries exhausted")
    return None
|
|
|
|
|
|
def extract_ghcid_type_code(filepath: Path) -> str:
    """
    Extract the institution type code from a GHCID filename.

    GHCID format: {COUNTRY}-{REGION}-{CITY}-{TYPE}-{ABBREV}.yaml
    Example: NL-NH-AMS-M-RM.yaml → "M" (Museum)

    Args:
        filepath: Path to custodian YAML file

    Returns:
        Single-letter type code (G, L, A, M, O, R, C, U, B, E, S, F, I, X, P, H, D, N, T)
        or empty string if it cannot be extracted
    """
    # Drop the .yaml extension and split on the GHCID separator.
    segments = filepath.stem.split("-")

    # A well-formed GHCID has at least five hyphen-separated segments:
    # COUNTRY-REGION-CITY-TYPE-ABBREV. The type code is the fourth one.
    if len(segments) < 5:
        return ""

    code = segments[3]
    # The type code must be exactly one alphabetic character.
    return code.upper() if len(code) == 1 and code.isalpha() else ""
|
|
|
|
|
|
def get_expected_entity_type(
    institution_type: Any = None,
    filepath: Optional[Path] = None,
) -> Dict[str, Any]:
    """
    Resolve the CH-Annotator entity type for a custodian institution.

    PRIMARY: the GHCID type code embedded in the filename (most reliable).
    FALLBACK: keyword matching against the Wikidata instance_of text.

    Args:
        institution_type: Wikidata instance_of value (fallback only)
        filepath: Path to custodian file (primary source)

    Returns:
        CH-Annotator entity type definition dict
    """
    # PRIMARY: GHCID type code from the filename.
    if filepath:
        ghcid_code = extract_ghcid_type_code(filepath)
        ch_code = GHCID_TYPE_TO_CH_ANNOTATOR.get(ghcid_code) if ghcid_code else None
        if ch_code and ch_code in CH_ANNOTATOR_ENTITY_TYPES:
            return CH_ANNOTATOR_ENTITY_TYPES[ch_code]

    # FALLBACK: Wikidata instance_of text matching.
    # Wikidata can return several instance_of values; flatten them first.
    if isinstance(institution_type, list):
        institution_type = " ".join(str(t) for t in institution_type)

    text = str(institution_type).lower() if institution_type else ""

    # (keywords, CH-Annotator code) pairs, checked in priority order —
    # the first rule whose keywords appear in the text wins.
    keyword_rules = (
        (("museum", "gallery", "kunsthall"), "GRP.HER.MUS"),
        (("archive", "archiv", "archief"), "GRP.HER.ARC"),
        (("library", "bibliothek", "bibliotheek", "biblioteca"), "GRP.HER.LIB"),
        (("university", "college", "school", "academy"), "GRP.EDU"),
        (("church", "mosque", "temple", "synagogue", "cathedral"), "GRP.REL"),
        (("government", "ministry", "department"), "GRP.GOV"),
        (("botanical", "zoo", "aquarium", "arboretum"), "GRP.HER.BOT"),
        (("monument", "memorial", "statue", "landmark"), "GRP.HER.FEA"),
    )
    for keywords, code in keyword_rules:
        if any(term in text for term in keywords):
            return CH_ANNOTATOR_ENTITY_TYPES[code]

    # Default to the general heritage institution type for custodian files.
    return CH_ANNOTATOR_ENTITY_TYPES["GRP.HER"]
|
|
|
|
|
|
async def verify_match_with_llm(
    institution_name: str,
    institution_info: Dict[str, Any],
    candidate_name: str,
    candidate_info: Dict[str, Any],
    match_type: str,  # "google_maps" or "youtube"
    filepath: Optional[Path] = None,
) -> Dict[str, Any]:
    """
    Use Z.AI GLM 4.6 to verify if a candidate match is correct.

    Uses CH-Annotator v1.7.0 entity type definitions for validation.
    Expected entity type derived from GHCID type code in filename.

    Args:
        institution_name: Name of the heritage custodian institution
        institution_info: Dict with wikidata_id, city, country, type
        candidate_name: Name from Google Maps or YouTube
        candidate_info: Dict with place/channel details
        match_type: "google_maps" or "youtube" (any other value is treated
            as "youtube" by the else branch below)
        filepath: Path to custodian YAML file (for GHCID type extraction)

    Returns:
        Dict with keys:
        - is_match: bool (or None when verification was skipped/failed)
        - confidence: float (0.0-1.0)
        - reasoning: str
        - agent: str (model version)
        - verified: bool (False when the LLM was never consulted)
        - entity_type: str (CH-Annotator entity type code, when the LLM
          returned parseable JSON)
    """
    # No API key: return a neutral, unverified verdict rather than failing.
    if not ZAI_API_TOKEN:
        logger.warning("ZAI_API_TOKEN not set, skipping LLM verification")
        return {
            "is_match": None,
            "confidence": 0.5,
            "reasoning": "LLM verification skipped - no API key",
            "agent": "none",
            "verified": False,
        }

    # Get expected CH-Annotator entity type (PRIMARY: from GHCID, FALLBACK: from Wikidata)
    expected_entity = get_expected_entity_type(
        institution_type=institution_info.get('type', ''),
        filepath=filepath,
    )
    expected_place_types = expected_entity.get("google_place_types", [])

    # Build verification prompt
    if match_type == "google_maps":
        prompt = f"""You are an entity annotator following CH-Annotator v1.7.0 convention.

TASK: Verify if a Google Maps place matches a heritage custodian institution.

== CH-ANNOTATOR ENTITY TYPE ==
Expected Type: {expected_entity['code']} ({expected_entity['name']})
Definition: {expected_entity['definition']}
Ontology Class: {expected_entity['ontology_class']}
Expected Google Place Types: {', '.join(expected_place_types[:10])}

== SOURCE INSTITUTION (GRP.HER) ==
- Name: {institution_name}
- Wikidata ID: {institution_info.get('wikidata_id', 'N/A')}
- City (TOP.SET): {institution_info.get('city', 'N/A')}
- Country (TOP.CTY): {institution_info.get('country', 'N/A')}
- Instance Type: {institution_info.get('type', 'N/A')}

== GOOGLE MAPS CANDIDATE ==
- Name: {candidate_name}
- Address (TOP.ADR): {candidate_info.get('formatted_address', 'N/A')}
- Google Place Types: {candidate_info.get('google_place_types', 'N/A')}
- Website: {candidate_info.get('website', 'N/A')}
- Business Status: {candidate_info.get('business_status', 'N/A')}

== VERIFICATION CRITERIA ==
1. NAME MATCH: Do the names refer to the same institution? (Allow translations, abbreviations, acronyms)
2. LOCATION MATCH: Is the address in the same city/country?
3. TYPE MATCH: Does Google Place type match expected heritage types (museum, library, archive, gallery)?
4. ENTITY TYPE: Is this truly a {expected_entity['code']} ({expected_entity['name']})?

REJECT if:
- Different institution with similar name
- Google Place types indicate non-heritage (restaurant, hotel, shop)
- Location mismatch (different city/country)
- Name is a person, not an institution

Respond ONLY with JSON (no explanation outside JSON):
{{"is_match": true/false, "confidence": 0.0-1.0, "entity_type": "{expected_entity['code']}", "reasoning": "..."}}
"""
    else:  # youtube
        prompt = f"""You are an entity annotator following CH-Annotator v1.7.0 convention.

TASK: Verify if a YouTube channel is the official channel of a heritage custodian institution.

== CH-ANNOTATOR ENTITY TYPE ==
Expected Type: {expected_entity['code']} ({expected_entity['name']})
Definition: {expected_entity['definition']}
Ontology Class: {expected_entity['ontology_class']}

== SOURCE INSTITUTION (GRP.HER) ==
- Name: {institution_name}
- Wikidata ID: {institution_info.get('wikidata_id', 'N/A')}
- City (TOP.SET): {institution_info.get('city', 'N/A')}
- Country (TOP.CTY): {institution_info.get('country', 'N/A')}
- Instance Type: {institution_info.get('type', 'N/A')}

== YOUTUBE CHANNEL CANDIDATE ==
- Channel Title: {candidate_name}
- Description: {candidate_info.get('description', 'N/A')[:500]}
- Country: {candidate_info.get('country', 'N/A')}
- Subscribers: {candidate_info.get('subscriber_count', 'N/A')}
- Video Count: {candidate_info.get('video_count', 'N/A')}

== VERIFICATION CRITERIA ==
1. NAME MATCH: Does channel name match institution? (Allow abbreviations, acronyms)
2. DESCRIPTION: Does description mention heritage, culture, museum, archive, library?
3. CONTENT: Is this likely an official institutional channel (not fan-made, personal)?
4. ENTITY TYPE: Is this truly a {expected_entity['code']} ({expected_entity['name']})?

REJECT if:
- Channel is personal/fan-made (not official)
- Description indicates unrelated content (gaming, personal vlogs)
- Different institution with similar name
- Channel is for a different city/country

Respond ONLY with JSON (no explanation outside JSON):
{{"is_match": true/false, "confidence": 0.0-1.0, "entity_type": "{expected_entity['code']}", "reasoning": "..."}}
"""

    # Call GLM 4.6 API with retry
    content = await call_glm_with_retry(prompt)

    if content is None:
        # API failed after retries: neutral verdict, flagged unverified.
        return {
            "is_match": None,
            "confidence": 0.5,
            "reasoning": "LLM verification failed - API error",
            "agent": ZAI_MODEL,
            "verified": False,
        }

    # Parse JSON response
    try:
        # Extract the first {...} span from the response.
        # NOTE(review): [^}]+ stops at the first '}' and cannot handle a
        # brace inside the "reasoning" string — such replies fall through
        # to the keyword heuristic below. Acceptable for flat JSON replies.
        json_match = re.search(r'\{[^}]+\}', content, re.DOTALL)
        if json_match:
            result = json.loads(json_match.group())
            result["agent"] = ZAI_MODEL
            result["verified"] = True
            result["ch_annotator_version"] = CH_ANNOTATOR_VERSION
            return result
    except json.JSONDecodeError:
        pass

    # Fallback if JSON parsing fails: crude keyword heuristic.
    # NOTE(review): a reply containing both "true" and "false" (e.g. the
    # schema echoed back) is treated as a non-match by design here.
    is_match = "true" in content.lower() and "false" not in content.lower()
    return {
        "is_match": is_match,
        "confidence": 0.7 if is_match else 0.3,
        "reasoning": content[:200],
        "agent": ZAI_MODEL,
        "verified": True,
        "ch_annotator_version": CH_ANNOTATOR_VERSION,
    }
|
|
|
|
|
|
# ============================================================================
|
|
# Main Enrichment Pipeline
|
|
# ============================================================================
|
|
|
|
async def enrich_custodian_file(
    filepath: Path,
    client: httpx.Client,
    force: bool = False,
    dry_run: bool = False,
    youtube_only: bool = False,
    maps_only: bool = False,
) -> Tuple[bool, str]:
    """
    Enrich a single custodian YAML file with YouTube and Google Maps data.

    Args:
        filepath: Custodian YAML file to read and (unless dry_run) rewrite.
        client: Shared synchronous httpx client for Maps/YouTube requests.
        force: Re-enrich even if enrichment/status fields already exist.
        dry_run: Perform lookups and mutate the in-memory entry, but never
            write the file back.
        youtube_only: Skip the Google Maps step entirely.
        maps_only: Skip the YouTube step entirely.

    Returns:
        Tuple of (modified: bool, status: str)
    """
    logger.info(f"Processing: {filepath.name}")

    # Load YAML
    with open(filepath, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    if not entry:
        return False, "Empty file"

    modified = False
    statuses = []  # human-readable summary fragments, joined for the return value

    # Check if already enriched (including rejections/not found - we've already tried)
    has_maps = entry.get("google_maps_enrichment") is not None or entry.get("google_maps_status") is not None
    has_youtube = entry.get("youtube_enrichment") is not None or entry.get("youtube_status") is not None

    # Determine what needs enrichment based on flags
    skip_maps = youtube_only or (has_maps and not force)
    skip_youtube = maps_only or (has_youtube and not force)

    if skip_maps and skip_youtube:
        return False, "Already enriched (use --force to re-enrich)"

    # Extract info for matching
    institution_name = get_institution_name(entry)
    if not institution_name:
        return False, "No institution name found"

    country_code = get_country_code(entry)
    city_name = get_city_name(entry)
    coords = get_coordinates(entry)
    wikidata_id = get_wikidata_id(entry)

    # Context handed to the LLM verifier for both match types.
    institution_info = {
        "wikidata_id": wikidata_id,
        "city": city_name,
        "country": country_code,
        "type": entry.get("wikidata_enrichment", {}).get("instance_of", ""),
    }

    logger.info(f" Institution: {institution_name}")
    logger.info(f" Location: {city_name}, {country_code}")

    # -------------------------------------------------------------------------
    # Google Maps Enrichment
    # -------------------------------------------------------------------------
    if not skip_maps:
        query = build_maps_search_query(entry)
        logger.info(f" Maps query: {query}")

        # NOTE(review): time.sleep blocks the event loop inside this async
        # function; harmless while files are processed sequentially, but
        # would need asyncio.sleep if this were ever run concurrently.
        time.sleep(REQUEST_DELAY)
        place = search_google_place(query, client, country_code, coords)

        if place:
            maps_data = parse_google_place(place)
            candidate_name = maps_data.get("name", "")
            logger.info(f" Maps found: {candidate_name}")

            # LLM verification (uses GHCID type code from filepath)
            verification = await verify_match_with_llm(
                institution_name,
                institution_info,
                candidate_name,
                maps_data,
                "google_maps",
                filepath=filepath,
            )

            # Three-way outcome: True = accept, False = reject, None = unverified.
            if verification.get("is_match") is True:
                maps_data["llm_verification"] = verification
                entry["google_maps_enrichment"] = maps_data
                entry["google_maps_status"] = "SUCCESS"
                modified = True
                statuses.append(f"Maps: {candidate_name} (conf: {verification.get('confidence', 0):.2f})")
                logger.info(f" ✓ Maps verified: {verification.get('reasoning', '')[:60]}")
            elif verification.get("is_match") is False:
                # Record the rejection so future runs skip this file without --force.
                entry["google_maps_status"] = "NO_MATCH"
                entry["google_maps_rejected"] = {
                    "candidate_name": candidate_name,
                    "rejection_reason": verification.get("reasoning", ""),
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                }
                modified = True
                statuses.append("Maps: rejected by LLM")
                logger.info(f" ✗ Maps rejected: {verification.get('reasoning', '')[:60]}")
            else:
                # Verification skipped or failed - include with warning
                maps_data["llm_verification"] = verification
                entry["google_maps_enrichment"] = maps_data
                entry["google_maps_status"] = "UNVERIFIED"
                modified = True
                statuses.append(f"Maps: {candidate_name} (unverified)")
        else:
            # No place returned at all; remember the query for debugging.
            entry["google_maps_status"] = "NOT_FOUND"
            entry["google_maps_search_query"] = query
            entry["google_maps_search_timestamp"] = datetime.now(timezone.utc).isoformat()
            modified = True
            statuses.append("Maps: not found")

    # -------------------------------------------------------------------------
    # YouTube Enrichment
    # -------------------------------------------------------------------------
    if not skip_youtube:
        # Build YouTube search query
        youtube_query = f"{institution_name} official"
        logger.info(f" YouTube query: {youtube_query}")

        time.sleep(REQUEST_DELAY)
        search_result = search_youtube_channel(youtube_query, client)

        if search_result and search_result.get("candidates"):
            candidates = search_result["candidates"]
            logger.info(f" YouTube candidates: {len(candidates)}")

            # Try each candidate; keep the verified match with highest confidence.
            best_match = None
            best_verification = None

            for candidate in candidates[:3]:  # Top 3 candidates
                channel_id = candidate.get("id", {}).get("channelId")
                if not channel_id:
                    continue

                # Get full channel details
                time.sleep(REQUEST_DELAY)
                channel_details = get_youtube_channel_details(channel_id, client)

                if not channel_details:
                    continue

                youtube_data = parse_youtube_channel(channel_details)

                # Fetch individual video metadata
                videos = fetch_channel_videos(channel_details, client, max_videos=50)
                if videos:
                    youtube_data["videos"] = videos
                    logger.info(f" Fetched {len(videos)} videos for channel")
                else:
                    youtube_data["videos"] = []

                candidate_name = youtube_data.get("title", "")

                # LLM verification (uses GHCID type code from filepath)
                verification = await verify_match_with_llm(
                    institution_name,
                    institution_info,
                    candidate_name,
                    youtube_data,
                    "youtube",
                    filepath=filepath,
                )

                if verification.get("is_match") is True:
                    if best_verification is None or verification.get("confidence", 0) > best_verification.get("confidence", 0):
                        best_match = youtube_data
                        best_verification = verification
                    logger.info(f" YouTube match: {candidate_name} (conf: {verification.get('confidence', 0):.2f})")

            if best_match:
                best_match["llm_verification"] = best_verification
                entry["youtube_enrichment"] = best_match
                entry["youtube_status"] = "SUCCESS"
                modified = True
                statuses.append(f"YouTube: {best_match.get('title', '')} ({best_match.get('subscriber_count', 0)} subs)")
            else:
                # Candidates existed but none passed LLM verification.
                entry["youtube_status"] = "NO_MATCH"
                entry["youtube_search_query"] = youtube_query
                entry["youtube_candidates_rejected"] = len(candidates)
                entry["youtube_search_timestamp"] = datetime.now(timezone.utc).isoformat()
                modified = True
                statuses.append("YouTube: no verified match")
        else:
            # Search returned nothing at all.
            entry["youtube_status"] = "NOT_FOUND"
            entry["youtube_search_query"] = youtube_query
            entry["youtube_search_timestamp"] = datetime.now(timezone.utc).isoformat()
            modified = True
            statuses.append("YouTube: not found")

    # -------------------------------------------------------------------------
    # Add provenance note
    # -------------------------------------------------------------------------
    if modified:
        if "provenance" not in entry:
            entry["provenance"] = {}

        # Handle notes field - can be string, list, or missing
        existing_notes = entry["provenance"].get("notes")
        if existing_notes is None:
            entry["provenance"]["notes"] = []
        elif isinstance(existing_notes, str):
            # Convert string notes to list
            entry["provenance"]["notes"] = [existing_notes]
        # else: it's already a list

        timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        entry["provenance"]["notes"].append(
            f"YouTube/Google Maps enrichment {timestamp}: {'; '.join(statuses)}"
        )

    # -------------------------------------------------------------------------
    # Save file
    # -------------------------------------------------------------------------
    if modified and not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        logger.info(f" Saved: {filepath.name}")

    status = "; ".join(statuses) if statuses else "No changes"
    return modified, status
|
|
|
|
|
|
async def main():
    """Main entry point: parse CLI arguments, select custodian files, and
    enrich them sequentially, summarizing results at the end."""
    parser = argparse.ArgumentParser(
        description="Enrich custodian files with YouTube and Google Maps data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Don't save changes, just show what would be done"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-enrich even if already enriched"
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of files to process"
    )
    parser.add_argument(
        "--files",
        nargs="+",
        help="Specific files to process (just filenames)"
    )
    parser.add_argument(
        "--pattern",
        type=str,
        default=None,
        help="Glob pattern for files (e.g., 'ZA-*.yaml')"
    )
    parser.add_argument(
        "--youtube-only",
        action="store_true",
        help="Only enrich YouTube data (skip Google Maps)"
    )
    parser.add_argument(
        "--maps-only",
        action="store_true",
        help="Only enrich Google Maps data (skip YouTube)"
    )

    args = parser.parse_args()

    # Check for required API keys
    if not GOOGLE_PLACES_TOKEN and not get_youtube_api_key():
        logger.error("No API keys found! Set GOOGLE_PLACES_TOKEN or GOOGLE_YOUTUBE_TOKEN")
        sys.exit(1)

    # Find files to process: explicit list > glob pattern > all YAML files.
    if args.files:
        files = [CUSTODIAN_DIR / f for f in args.files]
        # Silently drop names that don't exist on disk.
        files = [f for f in files if f.exists()]
    elif args.pattern:
        files = sorted(CUSTODIAN_DIR.glob(args.pattern))
    else:
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))

    # NOTE(review): truthiness check means --limit 0 is treated as "no limit".
    if args.limit:
        files = files[:args.limit]

    logger.info(f"Found {len(files)} files to process")

    if args.dry_run:
        logger.info("DRY RUN - no files will be modified")

    if args.youtube_only:
        logger.info("YOUTUBE-ONLY mode - skipping Google Maps enrichment")
    elif args.maps_only:
        logger.info("MAPS-ONLY mode - skipping YouTube enrichment")

    # Process files sequentially, sharing one HTTP client.
    results = {"modified": 0, "skipped": 0, "errors": 0}

    with httpx.Client(timeout=60.0) as client:
        for filepath in files:
            try:
                modified, status = await enrich_custodian_file(
                    filepath, client, args.force, args.dry_run,
                    youtube_only=args.youtube_only,
                    maps_only=args.maps_only,
                )
                if modified:
                    results["modified"] += 1
                else:
                    results["skipped"] += 1
                logger.info(f" Status: {status}")
            except YouTubeQuotaExhaustedError:
                # Quota exhaustion is terminal: stop instead of burning
                # through the remaining files with guaranteed failures.
                logger.error("=" * 60)
                logger.error("ALL YOUTUBE API KEYS EXHAUSTED - stopping enrichment")
                logger.error("=" * 60)
                break  # Exit the loop gracefully
            except Exception as e:
                # Per-file errors are logged and counted; processing continues.
                logger.error(f"Error processing {filepath.name}: {e}")
                results["errors"] += 1

            # Rate limiting between files
            time.sleep(REQUEST_DELAY)

    # Summary
    logger.info("=" * 60)
    logger.info(f"SUMMARY: {results['modified']} modified, {results['skipped']} skipped, {results['errors']} errors")
|
|
|
|
|
|
# Script entry point: drive the async pipeline with a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|