- Implemented `enrich_youtube.py` to enrich heritage custodian YAML entries with YouTube channel, video, transcript, and comment data. - Discovered channels from existing web_claims (social_youtube) or Wikidata (P2397) and fetched channel statistics, recent videos, comments, and transcripts. - Recorded full provenance (URLs and timestamps) for every fetch. - Added support for dry runs, per-entry processing, and limiting the number of entries processed.
678 lines
22 KiB
Python
Executable file
678 lines
22 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
YouTube Enrichment Script for Heritage Custodian Entries
|
|
|
|
This script enriches heritage custodian YAML entries with YouTube channel/video data.
|
|
It finds YouTube channels from existing web_claims (social_youtube) and fetches:
|
|
- Channel info (subscribers, video count, description, etc.)
|
|
- Recent videos (title, description, views, likes, comments)
|
|
- Video transcripts (when available)
|
|
- Comments on videos
|
|
|
|
All data includes full provenance with URLs and timestamps.
|
|
|
|
Usage:
|
|
python scripts/enrich_youtube.py [--dry-run] [--limit N] [--entry ENTRY_FILE]
|
|
|
|
Environment Variables:
|
|
YOUTUBE_API_KEY: Required. Get from https://console.cloud.google.com/
|
|
|
|
Author: GLAM Data Extraction Project
|
|
Date: December 2025
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
import httpx
|
|
import yaml
|
|
|
|
# Load environment variables from a local .env file when python-dotenv is
# available; otherwise rely on whatever the shell environment already provides.
try:
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    pass  # dotenv not installed, rely on shell environment


# ============================================================================
# Configuration
# ============================================================================

# API key for the YouTube Data API v3.
# Support both YOUTUBE_API_KEY and GOOGLE_YOUTUBE_TOKEN variable names.
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY") or os.getenv("GOOGLE_YOUTUBE_TOKEN", "")
# Base URL for all YouTube Data API v3 requests.
YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3"
# Identifying User-Agent header sent with every API request.
USER_AGENT = "GLAMDataExtractor/1.0 (heritage-data@example.com) Python/httpx"

# Directory holding the heritage custodian YAML entries to enrich.
ENTRIES_DIR = Path("data/nde/enriched/entries")

# Rate limiting
REQUEST_DELAY = 0.5  # seconds between API calls
|
|
|
|
def extract_channel_id_or_username(youtube_url: str) -> Tuple[Optional[str], str]:
    """
    Extract a YouTube channel identifier from the common URL formats.

    Supported formats:
      - youtube.com/channel/UCxxxx...  -> 'channel_id'
      - youtube.com/@handle            -> 'handle'
      - youtube.com/user/name          -> 'username'
      - youtube.com/c/name             -> 'custom_url'
      - youtube.com/name (bare)        -> 'custom_url'

    Returns:
        Tuple of (identifier, identifier_type) where type is 'channel_id',
        'handle', 'username', or 'custom_url'; (None, "") when the URL is
        empty or no identifier could be parsed.
    """
    if not youtube_url:
        return None, ""

    # Channel ID format: /channel/UCxxxxx
    # Channel IDs are the literal prefix "UC" followed by 22 characters
    # (24 total). The previous pattern used the character class [UC],
    # which matches a SINGLE 'U' or 'C' and therefore captured only 23
    # characters, silently truncating the last character of the ID.
    match = re.search(r'youtube\.com/channel/(UC[0-9A-Za-z_-]{22})', youtube_url)
    if match:
        return match.group(1), "channel_id"

    # Handle format: /@username
    match = re.search(r'youtube\.com/@([^/?&]+)', youtube_url)
    if match:
        return match.group(1), "handle"

    # Legacy user format: /user/username
    match = re.search(r'youtube\.com/user/([^/?&]+)', youtube_url)
    if match:
        return match.group(1), "username"

    # Custom URL format: /c/customname
    match = re.search(r'youtube\.com/c/([^/?&]+)', youtube_url)
    if match:
        return match.group(1), "custom_url"

    # Direct custom URL format: youtube.com/customname (no prefix).
    # Must be checked after all other patterns to avoid false matches.
    match = re.search(r'youtube\.com/([a-zA-Z][a-zA-Z0-9_-]{2,})(?:[/?]|$)', youtube_url)
    if match:
        # Exclude well-known site paths that aren't custom channel URLs.
        name = match.group(1)
        excluded = {'watch', 'playlist', 'channel', 'user', 'c', 'results', 'feed', 'gaming', 'shorts', 'live'}
        if name.lower() not in excluded:
            return name, "custom_url"

    return None, ""
|
|
|
|
|
|
def resolve_channel_id(identifier: str, id_type: str, api_key: str) -> Optional[str]:
    """
    Resolve a username, handle, or custom URL to a canonical channel ID.

    A 'channel_id' identifier is returned unchanged; any other identifier
    type is looked up via the YouTube search endpoint. Returns None when
    the lookup fails or yields no results.
    """
    if id_type == "channel_id":
        return identifier

    # Query the search API for the channel; handles keep their '@' prefix.
    query = f"@{identifier}" if id_type == "handle" else identifier
    search_params = {
        "part": "snippet",
        "type": "channel",
        "maxResults": 1,
        "key": api_key,
        "q": query,
    }

    try:
        response = httpx.get(
            f"{YOUTUBE_API_BASE}/search",
            params=search_params,
            headers={"User-Agent": USER_AGENT},
            timeout=30.0,
        )
        response.raise_for_status()
        payload = response.json()
        items = payload.get("items")
        if items:
            return items[0]["id"]["channelId"]
    except Exception as e:
        # Resolution failure is non-fatal for the caller; just warn.
        print(f" Warning: Could not resolve {id_type} '{identifier}': {e}")

    return None
|
|
|
|
|
|
def get_channel_info(channel_id: str, api_key: str) -> Dict[str, Any]:
    """
    Fetch detailed channel metadata from the YouTube Data API.

    Returns a flat dict of snippet/statistics/branding fields, or a dict
    with an 'error' key when the channel does not exist. HTTP errors
    propagate to the caller via raise_for_status().
    """
    response = httpx.get(
        f"{YOUTUBE_API_BASE}/channels",
        params={
            "part": "snippet,statistics,brandingSettings,contentDetails",
            "id": channel_id,
            "key": api_key,
        },
        headers={"User-Agent": USER_AGENT},
        timeout=30.0,
    )
    response.raise_for_status()
    payload = response.json()

    items = payload.get("items")
    if not items:
        return {"error": f"Channel not found: {channel_id}"}

    item = items[0]
    snippet = item.get("snippet", {})
    stats = item.get("statistics", {})
    branding = item.get("brandingSettings", {})

    # API statistics arrive as strings; convert to int, None when absent.
    def as_int(key: str) -> Optional[int]:
        raw = stats.get(key)
        return int(raw) if raw else None

    return {
        "channel_id": channel_id,
        "channel_url": f"https://www.youtube.com/channel/{channel_id}",
        "title": snippet.get("title"),
        "description": snippet.get("description"),
        "custom_url": snippet.get("customUrl"),
        "published_at": snippet.get("publishedAt"),
        "country": snippet.get("country"),
        "default_language": snippet.get("defaultLanguage"),
        "thumbnail_url": snippet.get("thumbnails", {}).get("high", {}).get("url"),
        "banner_url": branding.get("image", {}).get("bannerExternalUrl"),
        "subscriber_count": as_int("subscriberCount"),
        "video_count": as_int("videoCount"),
        "view_count": as_int("viewCount"),
        "subscriber_count_hidden": stats.get("hiddenSubscriberCount", False),
        "uploads_playlist_id": item.get("contentDetails", {}).get("relatedPlaylists", {}).get("uploads"),
    }
|
|
|
|
|
|
def get_channel_videos(channel_id: str, api_key: str, max_results: int = 20) -> List[Dict[str, Any]]:
    """
    Fetch metadata for a channel's most recent videos.

    Runs a date-ordered search for the channel's uploads, then a second
    API call to pull full snippet/statistics/contentDetails for each hit.
    Returns an empty list when the channel has no videos.
    """
    headers = {"User-Agent": USER_AGENT}

    # Step 1: date-ordered search for the channel's latest uploads.
    search_response = httpx.get(
        f"{YOUTUBE_API_BASE}/search",
        params={
            "part": "snippet",
            "channelId": channel_id,
            "type": "video",
            "order": "date",
            "maxResults": min(max_results, 50),  # API caps search at 50
            "key": api_key,
        },
        headers=headers,
        timeout=30.0,
    )
    search_response.raise_for_status()
    hits = search_response.json().get("items", [])

    video_ids = [hit["id"]["videoId"] for hit in hits]
    if not video_ids:
        return []

    # Step 2: fetch detailed metadata for all found videos in one call.
    detail_response = httpx.get(
        f"{YOUTUBE_API_BASE}/videos",
        params={
            "part": "snippet,contentDetails,statistics",
            "id": ",".join(video_ids),
            "key": api_key,
        },
        headers=headers,
        timeout=30.0,
    )
    detail_response.raise_for_status()

    # API statistics arrive as strings; convert to int, None when absent.
    def as_int(stats: Dict[str, Any], key: str) -> Optional[int]:
        raw = stats.get(key)
        return int(raw) if raw else None

    videos = []
    for item in detail_response.json().get("items", []):
        snippet = item.get("snippet", {})
        stats = item.get("statistics", {})
        content = item.get("contentDetails", {})

        videos.append({
            "video_id": item["id"],
            "video_url": f"https://www.youtube.com/watch?v={item['id']}",
            "title": snippet.get("title"),
            "description": snippet.get("description", "")[:500],  # Truncate long descriptions
            "published_at": snippet.get("publishedAt"),
            "duration": content.get("duration"),
            "definition": content.get("definition"),
            "caption_available": content.get("caption") == "true",
            "view_count": as_int(stats, "viewCount"),
            "like_count": as_int(stats, "likeCount"),
            "comment_count": as_int(stats, "commentCount"),
            "tags": snippet.get("tags", [])[:10],  # Limit tags
            "thumbnail_url": snippet.get("thumbnails", {}).get("high", {}).get("url"),
            "default_language": snippet.get("defaultLanguage"),
            "default_audio_language": snippet.get("defaultAudioLanguage"),
        })

    return videos
|
|
|
|
|
|
def get_video_comments(video_id: str, api_key: str, max_results: int = 50) -> List[Dict[str, Any]]:
    """
    Fetch top-level comment threads for a video, ordered by relevance.

    Returns an empty list when comments are disabled for the video (the
    API responds 403); other HTTP errors propagate to the caller.
    """
    request_params = {
        "part": "snippet",
        "videoId": video_id,
        "order": "relevance",
        "maxResults": min(max_results, 100),  # API caps commentThreads at 100
        "textFormat": "plainText",
        "key": api_key,
    }

    try:
        response = httpx.get(
            f"{YOUTUBE_API_BASE}/commentThreads",
            params=request_params,
            headers={"User-Agent": USER_AGENT},
            timeout=30.0,
        )
        response.raise_for_status()
        threads = response.json().get("items", [])
    except httpx.HTTPStatusError as e:
        # 403 means comments are disabled for this video — treat as empty.
        if e.response.status_code == 403:
            return []
        raise

    comments = []
    for thread in threads:
        top = thread.get("snippet", {}).get("topLevelComment", {}).get("snippet", {})
        comments.append({
            "comment_id": thread["id"],
            "author_display_name": top.get("authorDisplayName"),
            "author_channel_url": top.get("authorChannelUrl"),
            "text": top.get("textDisplay", "")[:1000],  # Truncate
            "like_count": top.get("likeCount", 0),
            "published_at": top.get("publishedAt"),
            "updated_at": top.get("updatedAt"),
            "reply_count": thread.get("snippet", {}).get("totalReplyCount", 0),
        })

    return comments
|
|
|
|
|
|
def get_video_transcript(video_id: str, language: str = "en") -> Optional[Dict[str, Any]]:
    """
    Get video transcript using yt-dlp.

    Downloads subtitle tracks (manual or auto-generated) into a temp
    directory, parses the first VTT file into plain text, and returns
    transcript metadata. Returns None when no subtitles were produced,
    or a dict with an 'error' key when extraction fails.
    """
    video_url = f"https://www.youtube.com/watch?v={video_id}"

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Subtitles only, no media download; request the caller's
            # language with Dutch and English as fallbacks.
            result = subprocess.run(
                [
                    "yt-dlp",
                    "--write-subs",
                    "--write-auto-subs",
                    "--sub-langs", f"{language},nl,en",
                    "--sub-format", "vtt",
                    "--skip-download",
                    "--output", f"{tmpdir}/%(id)s",
                    video_url
                ],
                capture_output=True,
                text=True,
                timeout=60
            )

            # yt-dlp's exit status is not checked; success is inferred from
            # whether any .vtt file appeared in the temp directory.
            import glob
            vtt_files = glob.glob(f"{tmpdir}/*.vtt")

            if vtt_files:
                with open(vtt_files[0], 'r', encoding='utf-8') as f:
                    vtt_content = f.read()

                # Parse VTT to extract text: keep only cue text lines, dropping
                # the WEBVTT/Kind/Language headers, "-->" timing lines, bare cue
                # numbers, and inline markup tags like <c> or <00:00:01.000>.
                lines = []
                for line in vtt_content.split('\n'):
                    line = line.strip()
                    if line and not line.startswith('WEBVTT') and not line.startswith('Kind:') \
                        and not line.startswith('Language:') and '-->' not in line \
                        and not re.match(r'^\d+$', line):
                        clean_line = re.sub(r'<[^>]+>', '', line)
                        if clean_line:
                            lines.append(clean_line)

                # Remove duplicate consecutive lines (auto-captions often repeat
                # the same text across overlapping cue windows).
                deduped = []
                for line in lines:
                    if not deduped or line != deduped[-1]:
                        deduped.append(line)

                transcript = ' '.join(deduped)

                # Determine language from filename (e.g. "<id>.nl.vtt");
                # only Dutch and English are distinguished here.
                detected_lang = "unknown"
                if ".nl." in vtt_files[0]:
                    detected_lang = "nl"
                elif ".en." in vtt_files[0]:
                    detected_lang = "en"

                return {
                    "video_id": video_id,
                    "language": detected_lang,
                    # NOTE(review): assumes auto-generated caption files carry
                    # ".auto." in their name — TODO confirm against the actual
                    # yt-dlp output naming convention.
                    "transcript_type": "auto" if ".auto." in vtt_files[0] else "manual",
                    "transcript_text": transcript[:10000],  # Truncate very long transcripts
                    "transcript_length_chars": len(transcript),
                    "extraction_method": "yt-dlp",
                }

            return None

    except FileNotFoundError:
        # The yt-dlp binary is not on PATH.
        return {"error": "yt-dlp not installed"}
    except subprocess.TimeoutExpired:
        return {"error": "Transcript extraction timed out"}
    except Exception as e:
        return {"error": str(e)}
|
|
|
|
|
|
def find_youtube_url_in_entry(entry: Dict[str, Any]) -> Optional[str]:
    """
    Locate a YouTube URL for an entry, if one is recorded.

    Checks the entry's web_claims for a 'social_youtube' claim first,
    then falls back to the Wikidata YouTube channel ID (P2397).
    Returns None when neither source yields a URL.
    """
    # Prefer an explicit social_youtube claim from web_claims.
    for claim in entry.get("web_claims", {}).get("claims", []):
        if claim.get("claim_type") == "social_youtube":
            return claim.get("claim_value")

    # Fall back to the Wikidata channel-ID property (P2397).
    wikidata_claims = entry.get("wikidata_enrichment", {}).get("wikidata_claims", {})
    channel_claim = wikidata_claims.get("P2397_youtube_channel_id") or {}
    channel_id = channel_claim.get("value")
    if channel_id:
        return f"https://www.youtube.com/channel/{channel_id}"

    return None
|
|
|
|
|
|
def create_youtube_enrichment(
    youtube_url: str,
    api_key: str,
    fetch_videos: int = 10,
    fetch_comments_per_video: int = 20,
    fetch_transcripts: bool = True
) -> Dict[str, Any]:
    """
    Build the full YouTube enrichment payload with provenance fields.

    Parses the channel out of ``youtube_url``, resolves it to a channel
    ID, then fetches channel info, recent videos, comments for the top
    five videos, and transcripts for the top three captioned videos.
    The returned dict always carries 'status' ('SUCCESS' or 'FAILED');
    on failure an 'error' message is included.
    """
    enrichment: Dict[str, Any] = {
        "source_url": youtube_url,
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "api_endpoint": YOUTUBE_API_BASE,
        "api_version": "v3",
    }

    # Small helper: mark the enrichment failed and hand it back.
    def fail(message: str) -> Dict[str, Any]:
        enrichment["error"] = message
        enrichment["status"] = "FAILED"
        return enrichment

    # Extract a channel identifier from the URL.
    identifier, id_type = extract_channel_id_or_username(youtube_url)
    if not identifier:
        return fail(f"Could not parse YouTube URL: {youtube_url}")

    enrichment["identifier_type"] = id_type
    enrichment["identifier_value"] = identifier

    # Resolve the identifier to a canonical channel ID.
    channel_id = resolve_channel_id(identifier, id_type, api_key)
    if not channel_id:
        return fail(f"Could not resolve channel ID for: {identifier}")

    try:
        print(f" Fetching channel info for {channel_id}...")
        enrichment["channel"] = get_channel_info(channel_id, api_key)

        if fetch_videos > 0:
            print(f" Fetching {fetch_videos} recent videos...")
            videos = get_channel_videos(channel_id, api_key, fetch_videos)
            enrichment["videos"] = videos
            enrichment["videos_count"] = len(videos)

            # Attach comments to the first few videos only (quota control).
            if fetch_comments_per_video > 0 and videos:
                print(f" Fetching comments for top videos...")
                for video in videos[:5]:  # Only first 5 videos
                    comments = get_video_comments(video["video_id"], api_key, fetch_comments_per_video)
                    video["comments"] = comments
                    video["comments_fetched"] = len(comments)

            # Attach transcripts where captions are available.
            if fetch_transcripts and videos:
                print(f" Fetching transcripts for videos with captions...")
                for video in videos[:3]:  # Only first 3 videos
                    if video.get("caption_available"):
                        transcript = get_video_transcript(video["video_id"])
                        if transcript and not transcript.get("error"):
                            video["transcript"] = transcript

        enrichment["status"] = "SUCCESS"

    except httpx.HTTPStatusError as e:
        return fail(f"YouTube API error: {e.response.status_code}")
    except Exception as e:
        return fail(str(e))

    return enrichment
|
|
|
|
|
|
def update_provenance(entry: Dict[str, Any], enrichment: Dict[str, Any]) -> None:
    """
    Record the YouTube enrichment as a provenance source on the entry.

    Ensures the nested provenance/sources/youtube structure exists, then
    appends one record describing this API fetch. Mutates ``entry`` in
    place and returns nothing.
    """
    # Build the nested structure on demand without clobbering existing data.
    provenance = entry.setdefault("provenance", {"sources": {}})
    sources = provenance.setdefault("sources", {})
    youtube_sources = sources.setdefault("youtube", [])

    youtube_sources.append({
        "source_type": "youtube_data_api",
        "fetch_timestamp": enrichment.get("fetch_timestamp"),
        "api_endpoint": enrichment.get("api_endpoint"),
        "channel_id": enrichment.get("channel", {}).get("channel_id"),
        "claims_extracted": [
            "channel_info",
            "subscriber_count",
            "video_count",
            "view_count",
            "recent_videos",
            "video_comments",
            "video_transcripts",
        ]
    })
|
|
|
|
|
|
def process_entry(entry_path: Path, api_key: str, dry_run: bool = False) -> bool:
    """
    Enrich one entry file with YouTube data and write it back.

    Returns True when the entry was successfully enriched (or, in
    dry-run mode, would have been); False when it was skipped or the
    enrichment failed.
    """
    print(f"\nProcessing: {entry_path.name}")

    # Load the YAML entry.
    with open(entry_path, 'r', encoding='utf-8') as fh:
        entry = yaml.safe_load(fh)

    # Skip entries that already carry a successful enrichment.
    if entry.get("youtube_enrichment", {}).get("status") == "SUCCESS":
        print(f" Already enriched, skipping...")
        return False

    # Locate a YouTube URL from web_claims or Wikidata.
    youtube_url = find_youtube_url_in_entry(entry)
    if not youtube_url:
        print(f" No YouTube URL found, skipping...")
        return False

    print(f" Found YouTube URL: {youtube_url}")

    if dry_run:
        print(f" [DRY RUN] Would enrich with YouTube data")
        return True

    # Fetch the enrichment payload and attach it to the entry.
    result = create_youtube_enrichment(
        youtube_url=youtube_url,
        api_key=api_key,
        fetch_videos=10,
        fetch_comments_per_video=20,
        fetch_transcripts=True
    )
    entry["youtube_enrichment"] = result

    # Provenance is recorded only for successful fetches.
    if result.get("status") == "SUCCESS":
        update_provenance(entry, result)

    # Persist the updated entry, preserving key order and unicode.
    with open(entry_path, 'w', encoding='utf-8') as fh:
        yaml.dump(entry, fh, allow_unicode=True, default_flow_style=False, sort_keys=False)

    status = result.get("status", "UNKNOWN")
    print(f" Status: {status}")

    if status == "SUCCESS":
        channel_info = result.get("channel", {})
        fetched_videos = result.get("videos", [])
        print(f" Channel: {channel_info.get('title')}")
        print(f" Subscribers: {channel_info.get('subscriber_count'):,}" if channel_info.get('subscriber_count') else " Subscribers: Hidden")
        print(f" Videos fetched: {len(fetched_videos)}")

    return status == "SUCCESS"
|
|
|
|
|
|
def main():
    """
    CLI entry point: enrich heritage custodian entries with YouTube data.

    Parses arguments, validates the API key, collects entry files, and
    processes each one, sleeping REQUEST_DELAY seconds between entries
    to respect API rate limits. Exits with status 1 when no API key is
    configured or a requested entry file cannot be found.
    """
    import time  # hoisted here from the processing loop (was re-imported per iteration)

    parser = argparse.ArgumentParser(
        description="Enrich heritage custodian entries with YouTube channel data"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be done without making changes"
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of entries to process"
    )
    parser.add_argument(
        "--entry",
        type=str,
        default=None,
        help="Process a specific entry file (filename or full path)"
    )
    # NOTE(review): this flag is accepted for CLI compatibility but is never
    # consulted — process_entry() always skips already-enriched entries
    # regardless of its value.
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        default=True,
        help="Skip entries that already have YouTube enrichment (default: True)"
    )

    args = parser.parse_args()

    # Fail fast with setup instructions when no API key is configured.
    if not YOUTUBE_API_KEY:
        print("ERROR: YOUTUBE_API_KEY environment variable not set")
        print("\nTo get an API key:")
        print("1. Go to https://console.cloud.google.com/")
        print("2. Create a project and enable YouTube Data API v3")
        print("3. Create an API key under Credentials")
        print("4. Set: export YOUTUBE_API_KEY='your-key-here'")
        sys.exit(1)

    print("=" * 60)
    print("YouTube Enrichment Script for Heritage Custodians")
    print("=" * 60)
    print(f"API Key: {YOUTUBE_API_KEY[:8]}...{YOUTUBE_API_KEY[-4:]}")
    print(f"Entries directory: {ENTRIES_DIR}")
    print(f"Dry run: {args.dry_run}")

    # Collect entries: a single named entry (checked as given, then relative
    # to ENTRIES_DIR), or every YAML file in the entries directory.
    if args.entry:
        entry_path = Path(args.entry)
        if not entry_path.exists():
            entry_path = ENTRIES_DIR / args.entry
        if not entry_path.exists():
            print(f"ERROR: Entry not found: {args.entry}")
            sys.exit(1)
        entries = [entry_path]
    else:
        entries = sorted(ENTRIES_DIR.glob("*.yaml"))

    if args.limit:
        entries = entries[:args.limit]

    print(f"Entries to process: {len(entries)}")
    print("=" * 60)

    # Process entries, tallying outcomes for the summary.
    success_count = 0
    skip_count = 0
    error_count = 0

    for entry_path in entries:
        try:
            result = process_entry(entry_path, YOUTUBE_API_KEY, args.dry_run)
            if result:
                success_count += 1
            else:
                skip_count += 1
        except Exception as e:
            print(f" ERROR: {e}")
            error_count += 1

        # Rate limiting between entries.
        time.sleep(REQUEST_DELAY)

    # Summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Entries processed: {len(entries)}")
    print(f"Successfully enriched: {success_count}")
    print(f"Skipped (no YouTube / already done): {skip_count}")
    print(f"Errors: {error_count}")
|
|
|
|
|
|
# Script entry point.
if __name__ == "__main__":
    main()
|