#!/usr/bin/env python3
"""
Enrich heritage custodian files with YouTube data using existing youtube_channel_id.

This script targets files that have:
- A youtube_channel_id (from Wikidata P2397 or web_claims)
- NO youtube_enrichment section yet

It extracts the channel ID and fetches full enrichment data from YouTube Data API.

Usage:
    python scripts/enrich_youtube_from_wikidata.py [--dry-run] [--limit N] [--file PATH]

Environment Variables:
    GOOGLE_YOUTUBE_TOKEN: Required. YouTube Data API key.
"""
import argparse
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import httpx
import yaml

# Load environment variables from a local .env file when python-dotenv is
# installed; the script still works with plain environment variables.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass

# Configuration
YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3"  # YouTube Data API v3 root
USER_AGENT = "GLAMDataExtractor/1.0 (heritage-data@example.com) Python/httpx"
DATA_DIR = Path(__file__).parent.parent / "data" / "custodian"  # custodian YAML files
REQUEST_DELAY = 0.3  # seconds between API calls
class APIKeyManager:
    """Rotates through multiple YouTube Data API keys as quotas run out."""

    # Environment variables probed for keys, in priority order.
    _ENV_NAMES = (
        "YOUTUBE_API_KEY",
        "GOOGLE_YOUTUBE_TOKEN",
        "GOOGLE_YOUTUBE_TOKEN_v2",
        "GOOGLE_YOUTUBE_TOKEN_v3",
        "GOOGLE_YOUTUBE_TOKEN_v4",
        "GOOGLE_YOUTUBE_TOKEN_v5",
    )

    def __init__(self):
        self.keys = []  # list of {"key": <api key>, "name": <env var name>}
        self.current_index = 0  # position within the non-exhausted key pool
        self.exhausted_keys = set()  # indices into self.keys whose quota ran out
        self._load_keys()

    def _load_keys(self):
        """Collect unique API keys from the known environment variables."""
        seen = set()
        for env_name in self._ENV_NAMES:
            value = os.getenv(env_name)
            if not value or value in seen:
                continue
            seen.add(value)
            self.keys.append({"key": value, "name": env_name})

        if not self.keys:
            print("WARNING: No YouTube API keys found in environment variables")
        else:
            print(f"Loaded {len(self.keys)} API key(s): {[k['name'] for k in self.keys]}")

    def _available(self):
        """Return key records whose quota is not yet exhausted."""
        return [rec for idx, rec in enumerate(self.keys) if idx not in self.exhausted_keys]

    def get_current_key(self) -> Optional[str]:
        """Return the active API key string, or None when all are exhausted."""
        pool = self._available()
        return pool[self.current_index % len(pool)]["key"] if pool else None

    def get_current_key_name(self) -> str:
        """Return the env-var name of the active key, or "none"."""
        pool = self._available()
        return pool[self.current_index % len(pool)]["name"] if pool else "none"

    def mark_quota_exceeded(self):
        """Retire the active key and restart rotation at the first live key."""
        if not self.keys:
            return
        live = [idx for idx in range(len(self.keys)) if idx not in self.exhausted_keys]
        if live:
            # Map the rotation position back to an index into self.keys.
            retired = live[self.current_index % len(live)]
            self.exhausted_keys.add(retired)
            key_name = self.keys[retired]["name"]
            print(f"\n⚠️ Quota exceeded for {key_name}, rotating to next key...")
        self.current_index = 0

    def rotate_key(self):
        """Advance to the next live key (no-op with fewer than two left)."""
        pool = self._available()
        if len(pool) > 1:
            self.current_index = (self.current_index + 1) % len(pool)

    def has_available_keys(self) -> bool:
        """True while at least one key still has quota remaining."""
        return len(self.exhausted_keys) < len(self.keys)

    def get_status(self) -> str:
        """Human-readable "<live>/<total> keys available" summary."""
        live = len(self.keys) - len(self.exhausted_keys)
        return f"{live}/{len(self.keys)} keys available"
# Module-level singleton: loads keys from the environment at import time.
api_key_manager = APIKeyManager()
def find_files_needing_enrichment() -> List[Path]:
    """Find custodian files that have youtube_channel_id but no youtube_enrichment.

    Uses ripgrep for the initial scan, then re-reads each candidate to drop
    files that already carry a youtube_enrichment section.
    """
    print("Finding files with youtube_channel_id but no youtube_enrichment...")

    # Use ripgrep to find files with youtube_channel_id
    try:
        proc = subprocess.run(
            ["rg", "-l", "youtube_channel_id:", str(DATA_DIR)],
            capture_output=True,
            text=True,
            timeout=60
        )
        candidates = [Path(line.strip()) for line in proc.stdout.strip().split('\n') if line.strip()]
    except Exception as e:
        print(f"Error running ripgrep: {e}")
        return []

    print(f"Found {len(candidates)} files with youtube_channel_id")

    # Keep only those without an existing enrichment section.
    needs_enrichment = []
    for yaml_file in candidates:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                text = f.read()
        except Exception as e:
            print(f" Warning: Error reading {yaml_file}: {e}")
            continue
        if "youtube_enrichment:" not in text:
            needs_enrichment.append(yaml_file)

    print(f"Files needing YouTube enrichment: {len(needs_enrichment)}")
    return needs_enrichment
def extract_channel_id(entry: Dict[str, Any]) -> Optional[str]:
    """Extract a YouTube channel ID from entry data (Wikidata or web_claims).

    Checks several locations, in priority order:
      1. wikidata_enrichment.wikidata_social_media.youtube_channel_id
      2. wikidata_enrichment.wikidata_claims.P2397_youtube_channel_id
         (plus any other claim key containing "youtube_channel_id")
      3. wikidata_enrichment.wikidata_external_ids.youtube_channel_id
      4. entry-level youtube_channel_id
      5. web_claims claims of type social_youtube_channel (ID parsed from URL)

    Returns the first channel ID found, or None.

    Fix: YAML parses empty mappings as None, so sections that are present
    but empty previously raised AttributeError on `.get()`; each level is
    now normalized with `or {}` / isinstance checks.
    """
    wikidata = entry.get("wikidata_enrichment") or {}

    # Location 1: wikidata_social_media.youtube_channel_id (most common)
    social_media = wikidata.get("wikidata_social_media")
    if isinstance(social_media, dict):
        channel_id = social_media.get("youtube_channel_id")
        if channel_id:
            return channel_id

    # Location 2: wikidata_claims.P2397_youtube_channel_id
    claims = wikidata.get("wikidata_claims") or {}
    if isinstance(claims, dict):
        p2397 = claims.get("P2397_youtube_channel_id", {})
        if p2397:
            channel_id = p2397.get("value") if isinstance(p2397, dict) else p2397
            if channel_id:
                return channel_id

        # Also check any other claim key mentioning youtube_channel_id.
        for key, value in claims.items():
            if "youtube_channel_id" in key.lower():
                channel_id = value.get("value") if isinstance(value, dict) else value
                if channel_id:
                    return channel_id

    # Location 3: wikidata_external_ids
    ext_ids = wikidata.get("wikidata_external_ids")
    if isinstance(ext_ids, dict):
        channel_id = ext_ids.get("youtube_channel_id")
        if channel_id:
            return channel_id

    # Location 4: direct youtube_channel_id at entry level
    channel_id = entry.get("youtube_channel_id")
    if channel_id:
        return channel_id

    # Location 5: web_claims entries of type social_youtube_channel
    web_claims = (entry.get("web_claims") or {}).get("claims") or []
    for claim in web_claims:
        if claim.get("claim_type") == "social_youtube_channel":
            url = claim.get("claim_value", "")
            # Channel IDs are "UC" followed by 22 URL-safe base64 characters.
            match = re.search(r'youtube\.com/channel/(UC[0-9A-Za-z_-]{22})', url)
            if match:
                return match.group(1)

    return None
def get_channel_info(channel_id: str, api_key: str) -> Dict[str, Any]:
    """Get detailed channel information from YouTube Data API.

    Returns a flat dict of channel fields, or {"error": ...} when the
    channel does not exist. Raises httpx.HTTPStatusError on HTTP failure.
    """
    response = httpx.get(
        f"{YOUTUBE_API_BASE}/channels",
        params={
            "part": "snippet,statistics,brandingSettings,contentDetails",
            "id": channel_id,
            "key": api_key,
        },
        headers={"User-Agent": USER_AGENT},
        timeout=30.0,
    )
    response.raise_for_status()
    payload = response.json()

    items = payload.get("items")
    if not items:
        return {"error": f"Channel not found: {channel_id}"}

    item = items[0]
    snippet = item.get("snippet", {})
    stats = item.get("statistics", {})
    branding = item.get("brandingSettings", {})  # requested for parity; not surfaced below

    def _count(field):
        # YouTube returns counters as strings; absent/empty -> None.
        raw = stats.get(field)
        return int(raw) if raw else None

    return {
        "channel_id": channel_id,
        "channel_url": f"https://www.youtube.com/channel/{channel_id}",
        "title": snippet.get("title"),
        "description": snippet.get("description"),
        "custom_url": snippet.get("customUrl"),
        "published_at": snippet.get("publishedAt"),
        "country": snippet.get("country"),
        "default_language": snippet.get("defaultLanguage"),
        "thumbnail_url": snippet.get("thumbnails", {}).get("high", {}).get("url"),
        "subscriber_count": _count("subscriberCount"),
        "video_count": _count("videoCount"),
        "view_count": _count("viewCount"),
        "subscriber_count_hidden": stats.get("hiddenSubscriberCount", False),
        "uploads_playlist_id": item.get("contentDetails", {}).get("relatedPlaylists", {}).get("uploads"),
    }
def get_playlist_videos(playlist_id: str, api_key: str, max_results: int = 50) -> List[str]:
    """Get video IDs from uploads playlist.

    Pages through the playlistItems endpoint until max_results IDs are
    collected or no further pages exist. Quota/403 failures are re-raised
    as "quotaExceeded:" exceptions so the caller can rotate API keys;
    other errors end pagination with a warning.
    """
    collected: List[str] = []
    page_token = None

    while len(collected) < max_results:
        request_params = {
            "part": "contentDetails",
            "playlistId": playlist_id,
            "maxResults": min(50, max_results - len(collected)),
            "key": api_key,
        }
        if page_token:
            request_params["pageToken"] = page_token

        try:
            response = httpx.get(
                f"{YOUTUBE_API_BASE}/playlistItems",
                params=request_params,
                headers={"User-Agent": USER_AGENT},
                timeout=30.0,
            )
            response.raise_for_status()
            payload = response.json()

            collected.extend(
                vid
                for vid in (
                    item.get("contentDetails", {}).get("videoId")
                    for item in payload.get("items", [])
                )
                if vid
            )

            page_token = payload.get("nextPageToken")
            if not page_token:
                break

            time.sleep(0.05)

        except Exception as e:
            message = str(e)
            # Surface quota/permission failures so the caller can rotate keys.
            if "403" in message or "quota" in message.lower():
                raise Exception(f"quotaExceeded: {message}")
            print(f" Warning: Error fetching playlist: {e}")
            break

    return collected
def get_video_details(video_ids: List[str], api_key: str) -> List[Dict[str, Any]]:
    """Get detailed metadata for videos.

    Fetches the videos endpoint in batches (the API caps each call at 50
    IDs). Quota/403 failures are re-raised as "quotaExceeded:" exceptions;
    other batch errors are logged and skipped.
    """
    if not video_ids:
        return []

    videos: List[Dict[str, Any]] = []

    for start in range(0, len(video_ids), 50):
        chunk = video_ids[start:start + 50]

        try:
            response = httpx.get(
                f"{YOUTUBE_API_BASE}/videos",
                params={
                    "part": "snippet,contentDetails,statistics",
                    "id": ",".join(chunk),
                    "key": api_key,
                },
                headers={"User-Agent": USER_AGENT},
                timeout=30.0,
            )
            response.raise_for_status()
            payload = response.json()

            for item in payload.get("items", []):
                snippet = item.get("snippet", {})
                stats = item.get("statistics", {})
                content = item.get("contentDetails", {})
                vid = item["id"]

                videos.append({
                    "video_id": vid,
                    "video_url": f"https://www.youtube.com/watch?v={vid}",
                    "title": snippet.get("title"),
                    "description": snippet.get("description", "")[:500],  # Truncate
                    "published_at": snippet.get("publishedAt"),
                    "duration": content.get("duration"),
                    "view_count": int(stats.get("viewCount", 0)) if stats.get("viewCount") else 0,
                    "like_count": int(stats.get("likeCount", 0)) if stats.get("likeCount") else 0,
                    "comment_count": int(stats.get("commentCount", 0)) if stats.get("commentCount") else 0,
                    "comments": [],
                    "thumbnail_url": snippet.get("thumbnails", {}).get("high", {}).get("url"),
                })

            time.sleep(0.05)

        except Exception as e:
            message = str(e)
            if "403" in message or "quota" in message.lower():
                raise Exception(f"quotaExceeded: {message}")
            print(f" Warning: Error fetching video details: {e}")

    return videos
def create_youtube_enrichment(channel_id: str, api_key: str) -> Dict[str, Any]:
    """Create full YouTube enrichment data from channel ID.

    Returns a dict with fetch metadata plus either the flattened channel
    fields and a "videos" list (status SUCCESS) or an "error" message
    (status FAILED). Quota-exhaustion exceptions are re-raised so the
    caller can rotate API keys.
    """
    enrichment: Dict[str, Any] = {
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_endpoint": YOUTUBE_API_BASE,
        "api_version": "v3",
    }

    try:
        print(f" Fetching channel info for {channel_id}...")
        channel_info = get_channel_info(channel_id, api_key)

        if channel_info.get("error"):
            enrichment["error"] = channel_info["error"]
            enrichment["status"] = "FAILED"
            return enrichment

        # Flatten channel info into enrichment (match expected structure).
        for field in (
            "channel_id", "channel_url", "title", "description", "custom_url",
            "published_at", "country", "thumbnail_url", "subscriber_count",
            "video_count", "view_count",
        ):
            enrichment[field] = channel_info[field]

        # Pull recent uploads (up to 50) when the channel exposes a playlist.
        uploads_playlist_id = channel_info.get("uploads_playlist_id")
        videos: List[Dict[str, Any]] = []
        if uploads_playlist_id:
            print(f" Fetching videos from uploads playlist...")
            video_ids = get_playlist_videos(uploads_playlist_id, api_key, max_results=50)
            if video_ids:
                print(f" Found {len(video_ids)} videos, fetching details...")
                videos = get_video_details(video_ids, api_key)
        enrichment["videos"] = videos

        enrichment["status"] = "SUCCESS"

    except httpx.HTTPStatusError as e:
        enrichment["error"] = f"YouTube API error: {e.response.status_code}"
        enrichment["status"] = "FAILED"
    except Exception as e:
        if "quotaExceeded" in str(e):
            raise  # Re-raise for key rotation
        enrichment["error"] = str(e)
        enrichment["status"] = "FAILED"

    return enrichment
def process_file(yaml_file: Path, api_key: str, dry_run: bool = False) -> bool:
    """Process a single file and add YouTube enrichment.

    Returns True when the file was enriched successfully (or would be, in
    dry-run mode), False when skipped or failed. Quota-exhaustion errors
    propagate so the caller can rotate API keys.
    """
    print(f"\nProcessing: {yaml_file.name}")

    try:
        with open(yaml_file, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            print(f" Empty file, skipping")
            return False

        # Locate a channel ID anywhere in the entry's known locations.
        channel_id = extract_channel_id(entry)
        if not channel_id:
            print(f" No channel_id found, skipping")
            return False
        print(f" Found channel_id: {channel_id}")

        if dry_run:
            print(f" [DRY RUN] Would enrich with YouTube data")
            return True

        enrichment = create_youtube_enrichment(channel_id, api_key)

        # Quota failures recorded inside the enrichment must also bubble up.
        if "quotaExceeded" in str(enrichment.get("error", "")):
            raise Exception(f"quotaExceeded: {enrichment['error']}")

        entry["youtube_enrichment"] = enrichment

        # Append a provenance note, creating the structure when absent.
        if "provenance" not in entry:
            entry["provenance"] = {}
        if "notes" not in entry["provenance"]:
            entry["provenance"]["notes"] = []
        if isinstance(entry["provenance"]["notes"], list):
            stamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
            entry["provenance"]["notes"].append(f"YouTube enrichment added on {stamp}")

        # Write the enriched entry back in place.
        with open(yaml_file, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        status = enrichment.get("status", "UNKNOWN")
        print(f" Status: {status}")

        if status == "SUCCESS":
            print(f" Channel: {enrichment.get('title')}")
            if enrichment.get('subscriber_count'):
                print(f" Subscribers: {enrichment.get('subscriber_count', 'Hidden'):,}")
            else:
                print(" Subscribers: Hidden")
            print(f" Videos: {len(enrichment.get('videos', []))}")

        return status == "SUCCESS"

    except Exception as e:
        if "quotaExceeded" in str(e):
            raise
        print(f" ERROR: {e}")
        return False
def main():
    """CLI entry point: locate candidate files and enrich them one by one,
    rotating API keys on quota exhaustion."""
    parser = argparse.ArgumentParser(
        description="Enrich custodian files with YouTube data from existing channel IDs"
    )
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--file", type=str, help="Process a specific file")

    args = parser.parse_args()

    # Bail out early if the module-level key manager found no keys at import.
    if not api_key_manager.has_available_keys():
        print("ERROR: No YouTube API keys found")
        print("Set GOOGLE_YOUTUBE_TOKEN or YOUTUBE_API_KEY environment variable")
        sys.exit(1)

    print("=" * 60)
    print("YouTube Enrichment from Wikidata Channel IDs")
    print("=" * 60)
    print(f"API Keys: {api_key_manager.get_status()}")
    print(f"Dry run: {args.dry_run}")

    # Get files to process
    if args.file:
        files = [Path(args.file)]
        # Fall back to resolving the path relative to DATA_DIR.
        if not files[0].exists():
            files = [DATA_DIR / args.file]
            if not files[0].exists():
                print(f"ERROR: File not found: {args.file}")
                sys.exit(1)
    else:
        files = find_files_needing_enrichment()

    if args.limit:
        files = files[:args.limit]

    print(f"Files to process: {len(files)}")
    print("=" * 60)

    success_count = 0
    skip_count = 0
    error_count = 0

    for i, yaml_file in enumerate(files, 1):
        # Stop entirely once every key's quota is spent.
        if not api_key_manager.has_available_keys():
            print("\n⚠️ ALL API KEYS EXHAUSTED - Stopping")
            break

        print(f"\n[{i}/{len(files)}]", end="")

        try:
            api_key = api_key_manager.get_current_key()
            result = process_file(yaml_file, api_key, args.dry_run)

            if result:
                success_count += 1
                # Spread load across keys after each successful file.
                api_key_manager.rotate_key()
            else:
                skip_count += 1

        except Exception as e:
            error_str = str(e)
            if "quotaExceeded" in error_str or "403" in error_str:
                print(f"\n ⚠️ Quota exceeded, rotating key...")
                api_key_manager.mark_quota_exceeded()

                # Retry with new key
                # NOTE(review): if the retry also fails on quota, the failure is
                # counted as an error without retiring the second key — confirm
                # whether a second mark_quota_exceeded() is wanted here.
                if api_key_manager.has_available_keys():
                    try:
                        api_key = api_key_manager.get_current_key()
                        result = process_file(yaml_file, api_key, args.dry_run)
                        if result:
                            success_count += 1
                        else:
                            skip_count += 1
                    except Exception as retry_e:
                        print(f" Retry failed: {retry_e}")
                        error_count += 1
            else:
                print(f" ERROR: {e}")
                error_count += 1

        # Throttle between files regardless of outcome.
        time.sleep(REQUEST_DELAY)

    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Processed: {success_count + skip_count + error_count}")
    print(f"Successfully enriched: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Errors: {error_count}")
# Standard script entry guard: run only when executed directly.
if __name__ == "__main__":
    main()