# glam/scripts/enrich_youtube_from_wikidata.py
# Retrieved 2025-12-10 18:04:25 +01:00 — 581 lines, 20 KiB, Python

#!/usr/bin/env python3
"""
Enrich heritage custodian files with YouTube data using existing youtube_channel_id.
This script targets files that have:
- A youtube_channel_id (from Wikidata P2397 or web_claims)
- NO youtube_enrichment section yet
It extracts the channel ID and fetches full enrichment data from YouTube Data API.
Usage:
python scripts/enrich_youtube_from_wikidata.py [--dry-run] [--limit N] [--file PATH]
Environment Variables:
GOOGLE_YOUTUBE_TOKEN: Required. YouTube Data API key.
"""
import argparse
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import httpx
import yaml
# Load environment variables
try:
from dotenv import load_dotenv
load_dotenv()
except ImportError:
pass
# Configuration
YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3"
USER_AGENT = "GLAMDataExtractor/1.0 (heritage-data@example.com) Python/httpx"
DATA_DIR = Path(__file__).parent.parent / "data" / "custodian"
REQUEST_DELAY = 0.3 # seconds between API calls
class APIKeyManager:
    """Rotates between multiple YouTube API keys as quotas are exhausted.

    Keys are read from a fixed list of environment variables at construction
    time. Indices into ``self.keys`` that have hit their daily quota are
    recorded in ``exhausted_keys`` and skipped thereafter.
    """

    # Environment variables probed for keys, in priority order.
    _ENV_VARS = (
        "YOUTUBE_API_KEY",
        "GOOGLE_YOUTUBE_TOKEN",
        "GOOGLE_YOUTUBE_TOKEN_v2",
        "GOOGLE_YOUTUBE_TOKEN_v3",
        "GOOGLE_YOUTUBE_TOKEN_v4",
        "GOOGLE_YOUTUBE_TOKEN_v5",
    )

    def __init__(self):
        self.keys: List[Dict[str, str]] = []   # [{"key": ..., "name": env var}, ...]
        self.current_index = 0                 # rotates over the *available* keys
        self.exhausted_keys = set()            # indices into self.keys
        self._load_keys()

    def _load_keys(self):
        """Read every configured env var, de-duplicating identical key values."""
        seen_values = set()
        for env_name in self._ENV_VARS:
            value = os.getenv(env_name)
            if value and value not in seen_values:
                seen_values.add(value)
                self.keys.append({"key": value, "name": env_name})
        if not self.keys:
            print("WARNING: No YouTube API keys found in environment variables")
        else:
            print(f"Loaded {len(self.keys)} API key(s): {[k['name'] for k in self.keys]}")

    def _available(self) -> List[Dict[str, str]]:
        """Key entries not yet marked exhausted, in load order."""
        return [entry for idx, entry in enumerate(self.keys)
                if idx not in self.exhausted_keys]

    def get_current_key(self) -> Optional[str]:
        """The key value currently in rotation, or None if all are exhausted."""
        avail = self._available()
        if not avail:
            return None
        return avail[self.current_index % len(avail)]["key"]

    def get_current_key_name(self) -> str:
        """Env-var name of the current key, or "none" if all are exhausted."""
        avail = self._available()
        if not avail:
            return "none"
        return avail[self.current_index % len(avail)]["name"]

    def mark_quota_exceeded(self):
        """Retire the key currently in use and restart rotation at index 0."""
        if not self.keys:
            return
        live = [idx for idx in range(len(self.keys))
                if idx not in self.exhausted_keys]
        if live:
            retired = live[self.current_index % len(live)]
            self.exhausted_keys.add(retired)
            key_name = self.keys[retired]["name"]
            print(f"\n⚠️ Quota exceeded for {key_name}, rotating to next key...")
        self.current_index = 0

    def rotate_key(self):
        """Advance to the next available key (no-op with one or zero keys left)."""
        remaining = len(self._available())
        if remaining > 1:
            self.current_index = (self.current_index + 1) % remaining

    def has_available_keys(self) -> bool:
        """True while at least one key has not been exhausted."""
        return len(self.exhausted_keys) < len(self.keys)

    def get_status(self) -> str:
        """Human-readable "available/total keys available" summary."""
        remaining = len(self.keys) - len(self.exhausted_keys)
        return f"{remaining}/{len(self.keys)} keys available"


# Module-level singleton shared by main()
api_key_manager = APIKeyManager()
def _scan_for_pattern(pattern: str) -> List[Path]:
    """Pure-Python fallback scan: files under DATA_DIR whose text contains *pattern*."""
    matches: List[Path] = []
    if DATA_DIR.is_dir():
        for path in sorted(DATA_DIR.rglob("*")):
            if not path.is_file():
                continue
            try:
                if pattern in path.read_text(encoding="utf-8"):
                    matches.append(path)
            except (OSError, UnicodeDecodeError):
                # Unreadable / binary file — skip it, like rg would.
                continue
    return matches


def find_files_needing_enrichment() -> List[Path]:
    """Find custodian files that have youtube_channel_id but no youtube_enrichment.

    Uses ripgrep for speed when available; falls back to a pure-Python
    directory scan when the ``rg`` binary is not installed (the original
    silently returned no files in that case).
    """
    print("Finding files with youtube_channel_id but no youtube_enrichment...")
    # Stage 1: candidate files that mention a channel ID at all.
    try:
        result = subprocess.run(
            ["rg", "-l", "youtube_channel_id:", str(DATA_DIR)],
            capture_output=True,
            text=True,
            timeout=60
        )
        candidate_files = [Path(f.strip()) for f in result.stdout.strip().split('\n') if f.strip()]
    except FileNotFoundError:
        # rg binary missing — degrade gracefully to a slower stdlib scan.
        print("ripgrep not found, falling back to Python scan...")
        candidate_files = _scan_for_pattern("youtube_channel_id:")
    except Exception as e:
        print(f"Error running ripgrep: {e}")
        return []
    print(f"Found {len(candidate_files)} files with youtube_channel_id")
    # Stage 2: keep only files not yet enriched.
    files_needing_enrichment = []
    for yaml_file in candidate_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                content = f.read()
            if "youtube_enrichment:" not in content:
                files_needing_enrichment.append(yaml_file)
        except Exception as e:
            print(f" Warning: Error reading {yaml_file}: {e}")
    print(f"Files needing YouTube enrichment: {len(files_needing_enrichment)}")
    return files_needing_enrichment
def extract_channel_id(entry: Dict[str, Any]) -> Optional[str]:
    """Extract a YouTube channel ID from entry data (Wikidata or web_claims).

    Checks, in order: wikidata_social_media, wikidata_claims (P2397 plus any
    claim key containing "youtube_channel_id"), wikidata_external_ids, a
    top-level youtube_channel_id field, and finally social_youtube_channel
    web claims (channel ID parsed out of the URL).

    Returns the first channel ID found, or None. Unlike the original, this
    tolerates sections that are explicitly null or of the wrong type in the
    YAML (e.g. ``wikidata_enrichment: null`` crashed with AttributeError).
    """
    wikidata = entry.get("wikidata_enrichment") or {}
    if not isinstance(wikidata, dict):
        wikidata = {}

    # Location 1: wikidata_social_media.youtube_channel_id (most common)
    social_media = wikidata.get("wikidata_social_media")
    if isinstance(social_media, dict):
        channel_id = social_media.get("youtube_channel_id")
        if channel_id:
            return channel_id

    # Location 2: wikidata_claims.P2397_youtube_channel_id
    claims = wikidata.get("wikidata_claims")
    if isinstance(claims, dict):
        p2397 = claims.get("P2397_youtube_channel_id", {})
        if p2397:
            channel_id = p2397.get("value") if isinstance(p2397, dict) else p2397
            if channel_id:
                return channel_id
        # Also check any other youtube_channel_id-ish key under wikidata_claims
        for key, value in claims.items():
            if isinstance(key, str) and "youtube_channel_id" in key.lower():
                channel_id = value.get("value") if isinstance(value, dict) else value
                if channel_id:
                    return channel_id

    # Location 3: wikidata_external_ids
    ext_ids = wikidata.get("wikidata_external_ids")
    if isinstance(ext_ids, dict):
        channel_id = ext_ids.get("youtube_channel_id")
        if channel_id:
            return channel_id

    # Location 4: direct youtube_channel_id at entry level
    channel_id = entry.get("youtube_channel_id")
    if channel_id:
        return channel_id

    # Location 5: web_claims for social_youtube_channel URLs
    web_claims = (entry.get("web_claims") or {}).get("claims") or []
    for claim in web_claims:
        if isinstance(claim, dict) and claim.get("claim_type") == "social_youtube_channel":
            url = claim.get("claim_value", "")
            # Channel IDs are "UC" + 22 URL-safe base64 characters.
            match = re.search(r'youtube\.com/channel/(UC[0-9A-Za-z_-]{22})', url)
            if match:
                return match.group(1)
    return None
def get_channel_info(channel_id: str, api_key: str) -> Dict[str, Any]:
    """Fetch channel metadata from the YouTube Data API /channels endpoint.

    Returns a flat dict of channel fields on success, or ``{"error": ...}``
    when the API reports no matching channel. Raises httpx.HTTPStatusError
    on non-2xx responses.
    """
    resp = httpx.get(
        f"{YOUTUBE_API_BASE}/channels",
        params={
            "part": "snippet,statistics,brandingSettings,contentDetails",
            "id": channel_id,
            "key": api_key,
        },
        headers={"User-Agent": USER_AGENT},
        timeout=30.0,
    )
    resp.raise_for_status()
    payload = resp.json()
    items = payload.get("items")
    if not items:
        return {"error": f"Channel not found: {channel_id}"}
    channel = items[0]
    snippet = channel.get("snippet", {})
    statistics = channel.get("statistics", {})

    def count_or_none(raw):
        # The API returns counts as strings; missing/empty -> None.
        return int(raw) if raw else None

    return {
        "channel_id": channel_id,
        "channel_url": f"https://www.youtube.com/channel/{channel_id}",
        "title": snippet.get("title"),
        "description": snippet.get("description"),
        "custom_url": snippet.get("customUrl"),
        "published_at": snippet.get("publishedAt"),
        "country": snippet.get("country"),
        "default_language": snippet.get("defaultLanguage"),
        "thumbnail_url": snippet.get("thumbnails", {}).get("high", {}).get("url"),
        "subscriber_count": count_or_none(statistics.get("subscriberCount")),
        "video_count": count_or_none(statistics.get("videoCount")),
        "view_count": count_or_none(statistics.get("viewCount")),
        "subscriber_count_hidden": statistics.get("hiddenSubscriberCount", False),
        "uploads_playlist_id": channel.get("contentDetails", {}).get("relatedPlaylists", {}).get("uploads"),
    }
def get_playlist_videos(playlist_id: str, api_key: str, max_results: int = 50) -> List[str]:
    """Collect up to *max_results* video IDs from a playlist, following pagination.

    Quota/403 errors are re-raised as ``quotaExceeded`` exceptions so the
    caller can rotate API keys; other fetch errors stop pagination early.
    """
    collected: List[str] = []
    page_token: Optional[str] = None
    while len(collected) < max_results:
        query = {
            "part": "contentDetails",
            "playlistId": playlist_id,
            "maxResults": min(50, max_results - len(collected)),
            "key": api_key,
        }
        if page_token:
            query["pageToken"] = page_token
        try:
            resp = httpx.get(
                f"{YOUTUBE_API_BASE}/playlistItems",
                params=query,
                headers={"User-Agent": USER_AGENT},
                timeout=30.0,
            )
            resp.raise_for_status()
            payload = resp.json()
            for entry in payload.get("items", []):
                vid = entry.get("contentDetails", {}).get("videoId")
                if vid:
                    collected.append(vid)
            page_token = payload.get("nextPageToken")
            if not page_token:
                break
            # Small courtesy pause between pages.
            time.sleep(0.05)
        except Exception as exc:
            message = str(exc)
            if "403" in message or "quota" in message.lower():
                # Surface quota problems so the caller can rotate keys.
                raise Exception(f"quotaExceeded: {message}")
            print(f" Warning: Error fetching playlist: {exc}")
            break
    return collected
def get_video_details(video_ids: List[str], api_key: str) -> List[Dict[str, Any]]:
    """Fetch snippet/statistics metadata for videos, in API batches of 50.

    A failed batch is logged and skipped (later batches still run), except
    for quota/403 errors which are re-raised as ``quotaExceeded``.
    """
    if not video_ids:
        return []
    videos: List[Dict[str, Any]] = []
    for start in range(0, len(video_ids), 50):
        chunk = video_ids[start:start + 50]
        try:
            resp = httpx.get(
                f"{YOUTUBE_API_BASE}/videos",
                params={
                    "part": "snippet,contentDetails,statistics",
                    "id": ",".join(chunk),
                    "key": api_key,
                },
                headers={"User-Agent": USER_AGENT},
                timeout=30.0,
            )
            resp.raise_for_status()
            payload = resp.json()
            for item in payload.get("items", []):
                snippet = item.get("snippet", {})
                statistics = item.get("statistics", {})
                details = item.get("contentDetails", {})
                videos.append({
                    "video_id": item["id"],
                    "video_url": f"https://www.youtube.com/watch?v={item['id']}",
                    "title": snippet.get("title"),
                    # Truncate long descriptions to keep YAML files small.
                    "description": snippet.get("description", "")[:500],
                    "published_at": snippet.get("publishedAt"),
                    "duration": details.get("duration"),
                    "view_count": int(statistics.get("viewCount", 0)) if statistics.get("viewCount") else 0,
                    "like_count": int(statistics.get("likeCount", 0)) if statistics.get("likeCount") else 0,
                    "comment_count": int(statistics.get("commentCount", 0)) if statistics.get("commentCount") else 0,
                    "comments": [],
                    "thumbnail_url": snippet.get("thumbnails", {}).get("high", {}).get("url"),
                })
            time.sleep(0.05)
        except Exception as exc:
            text = str(exc)
            if "403" in text or "quota" in text.lower():
                raise Exception(f"quotaExceeded: {text}")
            print(f" Warning: Error fetching video details: {exc}")
    return videos
def create_youtube_enrichment(channel_id: str, api_key: str) -> Dict[str, Any]:
    """Build the full youtube_enrichment payload for one channel.

    Always returns a dict whose "status" is SUCCESS or FAILED; quota errors
    are re-raised unchanged so the caller can rotate API keys. Key insertion
    order is kept stable because the YAML dump preserves it.
    """
    enrichment: Dict[str, Any] = {
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_endpoint": YOUTUBE_API_BASE,
        "api_version": "v3",
    }
    try:
        print(f" Fetching channel info for {channel_id}...")
        info = get_channel_info(channel_id, api_key)
        if info.get("error"):
            enrichment["error"] = info["error"]
            enrichment["status"] = "FAILED"
            return enrichment
        # Flatten the channel fields we persist (matches the expected schema).
        for field in (
            "channel_id", "channel_url", "title", "description", "custom_url",
            "published_at", "country", "thumbnail_url", "subscriber_count",
            "video_count", "view_count",
        ):
            enrichment[field] = info[field]
        # Pull the most recent uploads (up to 50) when a playlist exists.
        videos: List[Dict[str, Any]] = []
        uploads_id = info.get("uploads_playlist_id")
        if uploads_id:
            print(" Fetching videos from uploads playlist...")
            ids = get_playlist_videos(uploads_id, api_key, max_results=50)
            if ids:
                print(f" Found {len(ids)} videos, fetching details...")
                videos = get_video_details(ids, api_key)
        enrichment["videos"] = videos
        enrichment["status"] = "SUCCESS"
    except httpx.HTTPStatusError as exc:
        enrichment["error"] = f"YouTube API error: {exc.response.status_code}"
        enrichment["status"] = "FAILED"
    except Exception as exc:
        if "quotaExceeded" in str(exc):
            raise  # let the caller rotate keys
        enrichment["error"] = str(exc)
        enrichment["status"] = "FAILED"
    return enrichment
def process_file(yaml_file: Path, api_key: str, dry_run: bool = False) -> bool:
    """Load one custodian YAML file, attach youtube_enrichment, and save it.

    Returns True when enrichment succeeded (or would run under --dry-run).
    Quota errors are re-raised so the caller can rotate API keys; all other
    errors are logged and reported as False.
    """
    print(f"\nProcessing: {yaml_file.name}")
    try:
        with open(yaml_file, 'r', encoding='utf-8') as handle:
            entry = yaml.safe_load(handle)
        if not entry:
            print(" Empty file, skipping")
            return False
        channel_id = extract_channel_id(entry)
        if not channel_id:
            print(" No channel_id found, skipping")
            return False
        print(f" Found channel_id: {channel_id}")
        if dry_run:
            print(" [DRY RUN] Would enrich with YouTube data")
            return True
        enrichment = create_youtube_enrichment(channel_id, api_key)
        # A quota error can be reported inside the payload rather than raised.
        if "quotaExceeded" in str(enrichment.get("error", "")):
            raise Exception(f"quotaExceeded: {enrichment['error']}")
        entry["youtube_enrichment"] = enrichment
        # Append an audit note under provenance.notes, creating it if missing.
        if "provenance" not in entry:
            entry["provenance"] = {}
        if "notes" not in entry["provenance"]:
            entry["provenance"]["notes"] = []
        if isinstance(entry["provenance"]["notes"], list):
            stamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
            entry["provenance"]["notes"].append(f"YouTube enrichment added on {stamp}")
        # Write the updated entry back, preserving key order and unicode.
        with open(yaml_file, 'w', encoding='utf-8') as handle:
            yaml.dump(entry, handle, allow_unicode=True, default_flow_style=False, sort_keys=False)
        status = enrichment.get("status", "UNKNOWN")
        print(f" Status: {status}")
        if status == "SUCCESS":
            print(f" Channel: {enrichment.get('title')}")
            subscribers = enrichment.get('subscriber_count')
            if subscribers:
                print(f" Subscribers: {subscribers:,}")
            else:
                print(" Subscribers: Hidden")
            print(f" Videos: {len(enrichment.get('videos', []))}")
        return status == "SUCCESS"
    except Exception as exc:
        if "quotaExceeded" in str(exc):
            raise
        print(f" ERROR: {exc}")
        return False
def main():
    """CLI entry point: find candidate files and enrich them one by one.

    Flags: --dry-run (no writes), --limit N (cap the work list), --file PATH
    (process a single file, resolved absolute or relative to DATA_DIR).
    On a quota error the current key is retired and the same file is retried
    once with the next available key; processing stops when all keys are gone.
    """
    parser = argparse.ArgumentParser(
        description="Enrich custodian files with YouTube data from existing channel IDs"
    )
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--file", type=str, help="Process a specific file")
    args = parser.parse_args()
    if not api_key_manager.has_available_keys():
        print("ERROR: No YouTube API keys found")
        print("Set GOOGLE_YOUTUBE_TOKEN or YOUTUBE_API_KEY environment variable")
        sys.exit(1)
    print("=" * 60)
    print("YouTube Enrichment from Wikidata Channel IDs")
    print("=" * 60)
    print(f"API Keys: {api_key_manager.get_status()}")
    print(f"Dry run: {args.dry_run}")
    # Build the work list: one explicit file, or everything still unenriched.
    if args.file:
        files = [Path(args.file)]
        if not files[0].exists():
            # Fall back to resolving the path relative to the data directory.
            files = [DATA_DIR / args.file]
            if not files[0].exists():
                print(f"ERROR: File not found: {args.file}")
                sys.exit(1)
    else:
        files = find_files_needing_enrichment()
    if args.limit:
        files = files[:args.limit]
    print(f"Files to process: {len(files)}")
    print("=" * 60)
    success_count = 0
    skip_count = 0
    error_count = 0
    for i, yaml_file in enumerate(files, 1):
        # Stop early once every configured API key has hit its quota.
        if not api_key_manager.has_available_keys():
            print("\n⚠️ ALL API KEYS EXHAUSTED - Stopping")
            break
        print(f"\n[{i}/{len(files)}]", end="")
        try:
            api_key = api_key_manager.get_current_key()
            result = process_file(yaml_file, api_key, args.dry_run)
            if result:
                success_count += 1
                # Spread quota usage across keys after each success.
                api_key_manager.rotate_key()
            else:
                skip_count += 1
        except Exception as e:
            error_str = str(e)
            if "quotaExceeded" in error_str or "403" in error_str:
                print(f"\n ⚠️ Quota exceeded, rotating key...")
                api_key_manager.mark_quota_exceeded()
                # Retry the same file once with the next key
                if api_key_manager.has_available_keys():
                    try:
                        api_key = api_key_manager.get_current_key()
                        result = process_file(yaml_file, api_key, args.dry_run)
                        if result:
                            success_count += 1
                        else:
                            skip_count += 1
                    except Exception as retry_e:
                        print(f" Retry failed: {retry_e}")
                        error_count += 1
            else:
                print(f" ERROR: {e}")
                error_count += 1
        # Throttle between files to stay well under API rate limits.
        time.sleep(REQUEST_DELAY)
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Processed: {success_count + skip_count + error_count}")
    print(f"Successfully enriched: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Errors: {error_count}")


if __name__ == "__main__":
    main()