glam/scripts/youtube_only_from_list.py
2025-12-09 11:34:56 +01:00

108 lines
3.6 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Process YouTube enrichment ONLY for the files named in a list file
(default: /tmp/needs_youtube.txt; one path per line, blank lines ignored).

Usage:
    python scripts/youtube_only_from_list.py [--limit N] [--dry-run] [--list-file PATH]
"""
import asyncio
import sys
import os
import argparse
import logging
from pathlib import Path
# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
# Import from the main script
from scripts.enrich_custodian_youtube_maps import (
enrich_custodian_file,
REQUEST_DELAY,
YouTubeQuotaExhaustedError,
)
import httpx
import time
# Configure root logging once for the script: INFO and above, each record
# rendered as "<timestamp> - <level> - <message>".
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by everything below.
logger = logging.getLogger(__name__)
async def main():
    """Run YouTube-only enrichment over the files named in a list file.

    Command-line options:
        --limit N        keep only the first N entries of the list
        --dry-run        forwarded to ``enrich_custodian_file`` as ``dry_run``
        --list-file P    text file with one path per line
                         (default ``/tmp/needs_youtube.txt``)

    Each listed path is handed to ``enrich_custodian_file`` with
    ``youtube_only=True``. Paths that do not exist and calls that raise an
    unexpected exception are counted as errors and the run continues;
    ``YouTubeQuotaExhaustedError`` stops the run early. A summary of the
    counters is logged at the end.

    Returns:
        None. If the list file does not exist, an error is logged and the
        function returns without processing anything.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--dry-run", action="store_true", help="Don't modify files")
    parser.add_argument(
        "--list-file",
        default="/tmp/needs_youtube.txt",
        help="File with list of files to process",
    )
    args = parser.parse_args()

    # Load list of files to process
    list_file = Path(args.list_file)
    if not list_file.exists():
        logger.error(f"List file not found: {list_file}")
        return

    # Explicit encoding so paths with non-ASCII characters are read the same
    # way regardless of the platform's default locale encoding.
    with open(list_file, encoding="utf-8") as f:
        # One path per line; blank / whitespace-only lines are skipped.
        files = [Path(line.strip()) for line in f if line.strip()]
    logger.info(f"Loaded {len(files)} files from {list_file}")

    if args.limit:
        files = files[:args.limit]
        logger.info(f"Limited to {len(files)} files")

    if args.dry_run:
        logger.info("DRY RUN - no files will be modified")

    # Process files
    results = {"modified": 0, "skipped": 0, "errors": 0, "youtube_found": 0}
    with httpx.Client(timeout=60.0) as client:
        for i, filepath in enumerate(files):
            if not filepath.exists():
                # A missing file counts as an error; no request was made, so
                # the rate-limit pause below is skipped as well.
                logger.warning(f"File not found: {filepath}")
                results["errors"] += 1
                continue

            try:
                modified, status = await enrich_custodian_file(
                    filepath, client, force=False, dry_run=args.dry_run,
                    youtube_only=True,  # Always YouTube-only
                    maps_only=False,
                )
                if modified:
                    results["modified"] += 1
                    if "YouTube" in status:
                        results["youtube_found"] += 1
                else:
                    results["skipped"] += 1

                # Aggregate progress line on every 100th file, per-file status
                # line otherwise.
                if (i + 1) % 100 == 0:
                    logger.info(f"Progress: {i+1}/{len(files)} | Modified: {results['modified']} | YouTube found: {results['youtube_found']}")
                else:
                    logger.info(f" [{i+1}/{len(files)}] {filepath.name}: {status}")
            except YouTubeQuotaExhaustedError:
                logger.error("=" * 60)
                logger.error("ALL YOUTUBE API KEYS EXHAUSTED - stopping enrichment")
                logger.error("=" * 60)
                break  # Exit the loop gracefully
            except Exception as e:
                # Per-file boundary: one bad file must not abort the batch.
                logger.error(f"Error processing {filepath.name}: {e}")
                results["errors"] += 1

            # Rate limiting between files. Awaiting asyncio.sleep (instead of
            # the blocking time.sleep) keeps the event loop free to run any
            # other scheduled tasks during the pause.
            await asyncio.sleep(REQUEST_DELAY)

    # Summary
    logger.info("=" * 60)
    logger.info(f"SUMMARY: {results['modified']} modified, {results['skipped']} skipped, {results['errors']} errors")
    logger.info(f"YouTube channels found: {results['youtube_found']}")
# Script entry point: only when executed directly (not on import), run the
# async main() coroutine to completion via asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())