# glam/scripts/enrich_youtube_from_wikidata.py
# Retrieved 2025-12-10 18:04:25 +01:00 — 581 lines, 20 KiB, Python

#!/usr/bin/env python3
"""
Enrich heritage custodian files with YouTube data using existing youtube_channel_id.
This script targets files that have:
- A youtube_channel_id (from Wikidata P2397 or web_claims)
- NO youtube_enrichment section yet
It extracts the channel ID and fetches full enrichment data from YouTube Data API.
Usage:
python scripts/enrich_youtube_from_wikidata.py [--dry-run] [--limit N] [--file PATH]
Environment Variables:
GOOGLE_YOUTUBE_TOKEN: Required. YouTube Data API key.
"""
import argparse
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import httpx
import yaml
# Load environment variables
try:
from dotenv import load_dotenv
load_dotenv()
except ImportError:
pass
# Configuration
YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3"
USER_AGENT = "GLAMDataExtractor/1.0 (heritage-data@example.com) Python/httpx"
DATA_DIR = Path(__file__).parent.parent / "data" / "custodian"
REQUEST_DELAY = 0.3 # seconds between API calls
class APIKeyManager:
    """Rotates between multiple YouTube API keys as quotas are exhausted.

    Keys are read from a fixed list of environment variables at construction
    time. Indices into ``self.keys`` that have hit their daily quota are
    recorded in ``exhausted_keys`` and skipped thereafter.
    """

    # Environment variables probed for keys, in priority order.
    _ENV_VARS = (
        "YOUTUBE_API_KEY",
        "GOOGLE_YOUTUBE_TOKEN",
        "GOOGLE_YOUTUBE_TOKEN_v2",
        "GOOGLE_YOUTUBE_TOKEN_v3",
        "GOOGLE_YOUTUBE_TOKEN_v4",
        "GOOGLE_YOUTUBE_TOKEN_v5",
    )

    def __init__(self):
        self.keys: List[Dict[str, str]] = []   # [{"key": ..., "name": env var}, ...]
        self.current_index = 0                 # rotates over the *available* keys
        self.exhausted_keys = set()            # indices into self.keys
        self._load_keys()

    def _load_keys(self):
        """Read every configured env var, de-duplicating identical key values."""
        seen_values = set()
        for env_name in self._ENV_VARS:
            value = os.getenv(env_name)
            if value and value not in seen_values:
                seen_values.add(value)
                self.keys.append({"key": value, "name": env_name})
        if not self.keys:
            print("WARNING: No YouTube API keys found in environment variables")
        else:
            print(f"Loaded {len(self.keys)} API key(s): {[k['name'] for k in self.keys]}")

    def _available(self) -> List[Dict[str, str]]:
        """Key entries not yet marked exhausted, in load order."""
        return [entry for idx, entry in enumerate(self.keys)
                if idx not in self.exhausted_keys]

    def get_current_key(self) -> Optional[str]:
        """The key value currently in rotation, or None if all are exhausted."""
        avail = self._available()
        if not avail:
            return None
        return avail[self.current_index % len(avail)]["key"]

    def get_current_key_name(self) -> str:
        """Env-var name of the current key, or "none" if all are exhausted."""
        avail = self._available()
        if not avail:
            return "none"
        return avail[self.current_index % len(avail)]["name"]

    def mark_quota_exceeded(self):
        """Retire the key currently in use and restart rotation at index 0."""
        if not self.keys:
            return
        live = [idx for idx in range(len(self.keys))
                if idx not in self.exhausted_keys]
        if live:
            retired = live[self.current_index % len(live)]
            self.exhausted_keys.add(retired)
            key_name = self.keys[retired]["name"]
            print(f"\n⚠️ Quota exceeded for {key_name}, rotating to next key...")
        self.current_index = 0

    def rotate_key(self):
        """Advance to the next available key (no-op with one or zero keys left)."""
        remaining = len(self._available())
        if remaining > 1:
            self.current_index = (self.current_index + 1) % remaining

    def has_available_keys(self) -> bool:
        """True while at least one key has not been exhausted."""
        return len(self.exhausted_keys) < len(self.keys)

    def get_status(self) -> str:
        """Human-readable "available/total keys available" summary."""
        remaining = len(self.keys) - len(self.exhausted_keys)
        return f"{remaining}/{len(self.keys)} keys available"


# Module-level singleton shared by main()
api_key_manager = APIKeyManager()
def _scan_for_pattern(pattern: str) -> List[Path]:
    """Pure-Python fallback scan: files under DATA_DIR whose text contains *pattern*."""
    matches: List[Path] = []
    if DATA_DIR.is_dir():
        for path in sorted(DATA_DIR.rglob("*")):
            if not path.is_file():
                continue
            try:
                if pattern in path.read_text(encoding="utf-8"):
                    matches.append(path)
            except (OSError, UnicodeDecodeError):
                # Unreadable / binary file — skip it, like rg would.
                continue
    return matches


def find_files_needing_enrichment() -> List[Path]:
    """Find custodian files that have youtube_channel_id but no youtube_enrichment.

    Uses ripgrep for speed when available; falls back to a pure-Python
    directory scan when the ``rg`` binary is not installed (the original
    silently returned no files in that case).
    """
    print("Finding files with youtube_channel_id but no youtube_enrichment...")
    # Stage 1: candidate files that mention a channel ID at all.
    try:
        result = subprocess.run(
            ["rg", "-l", "youtube_channel_id:", str(DATA_DIR)],
            capture_output=True,
            text=True,
            timeout=60
        )
        candidate_files = [Path(f.strip()) for f in result.stdout.strip().split('\n') if f.strip()]
    except FileNotFoundError:
        # rg binary missing — degrade gracefully to a slower stdlib scan.
        print("ripgrep not found, falling back to Python scan...")
        candidate_files = _scan_for_pattern("youtube_channel_id:")
    except Exception as e:
        print(f"Error running ripgrep: {e}")
        return []
    print(f"Found {len(candidate_files)} files with youtube_channel_id")
    # Stage 2: keep only files not yet enriched.
    files_needing_enrichment = []
    for yaml_file in candidate_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                content = f.read()
            if "youtube_enrichment:" not in content:
                files_needing_enrichment.append(yaml_file)
        except Exception as e:
            print(f" Warning: Error reading {yaml_file}: {e}")
    print(f"Files needing YouTube enrichment: {len(files_needing_enrichment)}")
    return files_needing_enrichment
def extract_channel_id(entry: Dict[str, Any]) -> Optional[str]:
    """Extract a YouTube channel ID from entry data (Wikidata or web_claims).

    Checks, in order: wikidata_social_media, wikidata_claims (P2397 plus any
    claim key containing "youtube_channel_id"), wikidata_external_ids, a
    top-level youtube_channel_id field, and finally social_youtube_channel
    web claims (channel ID parsed out of the URL).

    Returns the first channel ID found, or None. Unlike the original, this
    tolerates sections that are explicitly null or of the wrong type in the
    YAML (e.g. ``wikidata_enrichment: null`` crashed with AttributeError).
    """
    wikidata = entry.get("wikidata_enrichment") or {}
    if not isinstance(wikidata, dict):
        wikidata = {}

    # Location 1: wikidata_social_media.youtube_channel_id (most common)
    social_media = wikidata.get("wikidata_social_media")
    if isinstance(social_media, dict):
        channel_id = social_media.get("youtube_channel_id")
        if channel_id:
            return channel_id

    # Location 2: wikidata_claims.P2397_youtube_channel_id
    claims = wikidata.get("wikidata_claims")
    if isinstance(claims, dict):
        p2397 = claims.get("P2397_youtube_channel_id", {})
        if p2397:
            channel_id = p2397.get("value") if isinstance(p2397, dict) else p2397
            if channel_id:
                return channel_id
        # Also check any other youtube_channel_id-ish key under wikidata_claims
        for key, value in claims.items():
            if isinstance(key, str) and "youtube_channel_id" in key.lower():
                channel_id = value.get("value") if isinstance(value, dict) else value
                if channel_id:
                    return channel_id

    # Location 3: wikidata_external_ids
    ext_ids = wikidata.get("wikidata_external_ids")
    if isinstance(ext_ids, dict):
        channel_id = ext_ids.get("youtube_channel_id")
        if channel_id:
            return channel_id

    # Location 4: direct youtube_channel_id at entry level
    channel_id = entry.get("youtube_channel_id")
    if channel_id:
        return channel_id

    # Location 5: web_claims for social_youtube_channel URLs
    web_claims = (entry.get("web_claims") or {}).get("claims") or []
    for claim in web_claims:
        if isinstance(claim, dict) and claim.get("claim_type") == "social_youtube_channel":
            url = claim.get("claim_value", "")
            # Channel IDs are "UC" + 22 URL-safe base64 characters.
            match = re.search(r'youtube\.com/channel/(UC[0-9A-Za-z_-]{22})', url)
            if match:
                return match.group(1)
    return None
def get_channel_info(channel_id: str, api_key: str) -> Dict[str, Any]:
    """Fetch channel metadata from the YouTube Data API /channels endpoint.

    Returns a flat dict of channel fields on success, or ``{"error": ...}``
    when the API reports no matching channel. Raises httpx.HTTPStatusError
    on non-2xx responses.
    """
    resp = httpx.get(
        f"{YOUTUBE_API_BASE}/channels",
        params={
            "part": "snippet,statistics,brandingSettings,contentDetails",
            "id": channel_id,
            "key": api_key,
        },
        headers={"User-Agent": USER_AGENT},
        timeout=30.0,
    )
    resp.raise_for_status()
    payload = resp.json()
    items = payload.get("items")
    if not items:
        return {"error": f"Channel not found: {channel_id}"}
    channel = items[0]
    snippet = channel.get("snippet", {})
    statistics = channel.get("statistics", {})

    def count_or_none(raw):
        # The API returns counts as strings; missing/empty -> None.
        return int(raw) if raw else None

    return {
        "channel_id": channel_id,
        "channel_url": f"https://www.youtube.com/channel/{channel_id}",
        "title": snippet.get("title"),
        "description": snippet.get("description"),
        "custom_url": snippet.get("customUrl"),
        "published_at": snippet.get("publishedAt"),
        "country": snippet.get("country"),
        "default_language": snippet.get("defaultLanguage"),
        "thumbnail_url": snippet.get("thumbnails", {}).get("high", {}).get("url"),
        "subscriber_count": count_or_none(statistics.get("subscriberCount")),
        "video_count": count_or_none(statistics.get("videoCount")),
        "view_count": count_or_none(statistics.get("viewCount")),
        "subscriber_count_hidden": statistics.get("hiddenSubscriberCount", False),
        "uploads_playlist_id": channel.get("contentDetails", {}).get("relatedPlaylists", {}).get("uploads"),
    }
def get_playlist_videos(playlist_id: str, api_key: str, max_results: int = 50) -> List[str]:
    """Collect up to *max_results* video IDs from a playlist, following pagination.

    Quota/403 errors are re-raised as ``quotaExceeded`` exceptions so the
    caller can rotate API keys; other fetch errors stop pagination early.
    """
    collected: List[str] = []
    page_token: Optional[str] = None
    while len(collected) < max_results:
        query = {
            "part": "contentDetails",
            "playlistId": playlist_id,
            "maxResults": min(50, max_results - len(collected)),
            "key": api_key,
        }
        if page_token:
            query["pageToken"] = page_token
        try:
            resp = httpx.get(
                f"{YOUTUBE_API_BASE}/playlistItems",
                params=query,
                headers={"User-Agent": USER_AGENT},
                timeout=30.0,
            )
            resp.raise_for_status()
            payload = resp.json()
            for entry in payload.get("items", []):
                vid = entry.get("contentDetails", {}).get("videoId")
                if vid:
                    collected.append(vid)
            page_token = payload.get("nextPageToken")
            if not page_token:
                break
            # Small courtesy pause between pages.
            time.sleep(0.05)
        except Exception as exc:
            message = str(exc)
            if "403" in message or "quota" in message.lower():
                # Surface quota problems so the caller can rotate keys.
                raise Exception(f"quotaExceeded: {message}")
            print(f" Warning: Error fetching playlist: {exc}")
            break
    return collected
def get_video_details(video_ids: List[str], api_key: str) -> List[Dict[str, Any]]:
    """Fetch snippet/statistics metadata for videos, in API batches of 50.

    A failed batch is logged and skipped (later batches still run), except
    for quota/403 errors which are re-raised as ``quotaExceeded``.
    """
    if not video_ids:
        return []
    videos: List[Dict[str, Any]] = []
    for start in range(0, len(video_ids), 50):
        chunk = video_ids[start:start + 50]
        try:
            resp = httpx.get(
                f"{YOUTUBE_API_BASE}/videos",
                params={
                    "part": "snippet,contentDetails,statistics",
                    "id": ",".join(chunk),
                    "key": api_key,
                },
                headers={"User-Agent": USER_AGENT},
                timeout=30.0,
            )
            resp.raise_for_status()
            payload = resp.json()
            for item in payload.get("items", []):
                snippet = item.get("snippet", {})
                statistics = item.get("statistics", {})
                details = item.get("contentDetails", {})
                videos.append({
                    "video_id": item["id"],
                    "video_url": f"https://www.youtube.com/watch?v={item['id']}",
                    "title": snippet.get("title"),
                    # Truncate long descriptions to keep YAML files small.
                    "description": snippet.get("description", "")[:500],
                    "published_at": snippet.get("publishedAt"),
                    "duration": details.get("duration"),
                    "view_count": int(statistics.get("viewCount", 0)) if statistics.get("viewCount") else 0,
                    "like_count": int(statistics.get("likeCount", 0)) if statistics.get("likeCount") else 0,
                    "comment_count": int(statistics.get("commentCount", 0)) if statistics.get("commentCount") else 0,
                    "comments": [],
                    "thumbnail_url": snippet.get("thumbnails", {}).get("high", {}).get("url"),
                })
            time.sleep(0.05)
        except Exception as exc:
            text = str(exc)
            if "403" in text or "quota" in text.lower():
                raise Exception(f"quotaExceeded: {text}")
            print(f" Warning: Error fetching video details: {exc}")
    return videos
def create_youtube_enrichment(channel_id: str, api_key: str) -> Dict[str, Any]:
    """Build the full youtube_enrichment payload for one channel.

    Always returns a dict whose "status" is SUCCESS or FAILED; quota errors
    are re-raised unchanged so the caller can rotate API keys. Key insertion
    order is kept stable because the YAML dump preserves it.
    """
    enrichment: Dict[str, Any] = {
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_endpoint": YOUTUBE_API_BASE,
        "api_version": "v3",
    }
    try:
        print(f" Fetching channel info for {channel_id}...")
        info = get_channel_info(channel_id, api_key)
        if info.get("error"):
            enrichment["error"] = info["error"]
            enrichment["status"] = "FAILED"
            return enrichment
        # Flatten the channel fields we persist (matches the expected schema).
        for field in (
            "channel_id", "channel_url", "title", "description", "custom_url",
            "published_at", "country", "thumbnail_url", "subscriber_count",
            "video_count", "view_count",
        ):
            enrichment[field] = info[field]
        # Pull the most recent uploads (up to 50) when a playlist exists.
        videos: List[Dict[str, Any]] = []
        uploads_id = info.get("uploads_playlist_id")
        if uploads_id:
            print(" Fetching videos from uploads playlist...")
            ids = get_playlist_videos(uploads_id, api_key, max_results=50)
            if ids:
                print(f" Found {len(ids)} videos, fetching details...")
                videos = get_video_details(ids, api_key)
        enrichment["videos"] = videos
        enrichment["status"] = "SUCCESS"
    except httpx.HTTPStatusError as exc:
        enrichment["error"] = f"YouTube API error: {exc.response.status_code}"
        enrichment["status"] = "FAILED"
    except Exception as exc:
        if "quotaExceeded" in str(exc):
            raise  # let the caller rotate keys
        enrichment["error"] = str(exc)
        enrichment["status"] = "FAILED"
    return enrichment
def process_file(yaml_file: Path, api_key: str, dry_run: bool = False) -> bool:
    """Load one custodian YAML file, attach youtube_enrichment, and save it.

    Returns True when enrichment succeeded (or would run under --dry-run).
    Quota errors are re-raised so the caller can rotate API keys; all other
    errors are logged and reported as False.
    """
    print(f"\nProcessing: {yaml_file.name}")
    try:
        with open(yaml_file, 'r', encoding='utf-8') as handle:
            entry = yaml.safe_load(handle)
        if not entry:
            print(" Empty file, skipping")
            return False
        channel_id = extract_channel_id(entry)
        if not channel_id:
            print(" No channel_id found, skipping")
            return False
        print(f" Found channel_id: {channel_id}")
        if dry_run:
            print(" [DRY RUN] Would enrich with YouTube data")
            return True
        enrichment = create_youtube_enrichment(channel_id, api_key)
        # A quota error can be reported inside the payload rather than raised.
        if "quotaExceeded" in str(enrichment.get("error", "")):
            raise Exception(f"quotaExceeded: {enrichment['error']}")
        entry["youtube_enrichment"] = enrichment
        # Append an audit note under provenance.notes, creating it if missing.
        if "provenance" not in entry:
            entry["provenance"] = {}
        if "notes" not in entry["provenance"]:
            entry["provenance"]["notes"] = []
        if isinstance(entry["provenance"]["notes"], list):
            stamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
            entry["provenance"]["notes"].append(f"YouTube enrichment added on {stamp}")
        # Write the updated entry back, preserving key order and unicode.
        with open(yaml_file, 'w', encoding='utf-8') as handle:
            yaml.dump(entry, handle, allow_unicode=True, default_flow_style=False, sort_keys=False)
        status = enrichment.get("status", "UNKNOWN")
        print(f" Status: {status}")
        if status == "SUCCESS":
            print(f" Channel: {enrichment.get('title')}")
            subscribers = enrichment.get('subscriber_count')
            if subscribers:
                print(f" Subscribers: {subscribers:,}")
            else:
                print(" Subscribers: Hidden")
            print(f" Videos: {len(enrichment.get('videos', []))}")
        return status == "SUCCESS"
    except Exception as exc:
        if "quotaExceeded" in str(exc):
            raise
        print(f" ERROR: {exc}")
        return False
def main():
    """CLI entry point: find candidate files and enrich them one by one.

    Flags: --dry-run (no writes), --limit N (cap the work list), --file PATH
    (process a single file, resolved absolute or relative to DATA_DIR).
    On a quota error the current key is retired and the same file is retried
    once with the next available key; processing stops when all keys are gone.
    """
    parser = argparse.ArgumentParser(
        description="Enrich custodian files with YouTube data from existing channel IDs"
    )
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--file", type=str, help="Process a specific file")
    args = parser.parse_args()
    if not api_key_manager.has_available_keys():
        print("ERROR: No YouTube API keys found")
        print("Set GOOGLE_YOUTUBE_TOKEN or YOUTUBE_API_KEY environment variable")
        sys.exit(1)
    print("=" * 60)
    print("YouTube Enrichment from Wikidata Channel IDs")
    print("=" * 60)
    print(f"API Keys: {api_key_manager.get_status()}")
    print(f"Dry run: {args.dry_run}")
    # Build the work list: one explicit file, or everything still unenriched.
    if args.file:
        files = [Path(args.file)]
        if not files[0].exists():
            # Fall back to resolving the path relative to the data directory.
            files = [DATA_DIR / args.file]
            if not files[0].exists():
                print(f"ERROR: File not found: {args.file}")
                sys.exit(1)
    else:
        files = find_files_needing_enrichment()
    if args.limit:
        files = files[:args.limit]
    print(f"Files to process: {len(files)}")
    print("=" * 60)
    success_count = 0
    skip_count = 0
    error_count = 0
    for i, yaml_file in enumerate(files, 1):
        # Stop early once every configured API key has hit its quota.
        if not api_key_manager.has_available_keys():
            print("\n⚠️ ALL API KEYS EXHAUSTED - Stopping")
            break
        print(f"\n[{i}/{len(files)}]", end="")
        try:
            api_key = api_key_manager.get_current_key()
            result = process_file(yaml_file, api_key, args.dry_run)
            if result:
                success_count += 1
                # Spread quota usage across keys after each success.
                api_key_manager.rotate_key()
            else:
                skip_count += 1
        except Exception as e:
            error_str = str(e)
            if "quotaExceeded" in error_str or "403" in error_str:
                print(f"\n ⚠️ Quota exceeded, rotating key...")
                api_key_manager.mark_quota_exceeded()
                # Retry the same file once with the next key
                if api_key_manager.has_available_keys():
                    try:
                        api_key = api_key_manager.get_current_key()
                        result = process_file(yaml_file, api_key, args.dry_run)
                        if result:
                            success_count += 1
                        else:
                            skip_count += 1
                    except Exception as retry_e:
                        print(f" Retry failed: {retry_e}")
                        error_count += 1
            else:
                print(f" ERROR: {e}")
                error_count += 1
        # Throttle between files to stay well under API rate limits.
        time.sleep(REQUEST_DELAY)
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Processed: {success_count + skip_count + error_count}")
    print(f"Successfully enriched: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Errors: {error_count}")


if __name__ == "__main__":
    main()