#!/usr/bin/env python3
"""
Process YouTube enrichment ONLY for files listed in /tmp/needs_youtube.txt

Usage:
    python scripts/youtube_only_from_list.py [--limit N] [--dry-run] [--list-file PATH]
"""

import asyncio
import sys
import os
import argparse
import logging
from pathlib import Path

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

# Import from the main script
from scripts.enrich_custodian_youtube_maps import (
    enrich_custodian_file,
    REQUEST_DELAY,
    YouTubeQuotaExhaustedError,
)

import httpx
import time

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


async def main() -> None:
    """Run YouTube-only enrichment over every path named in the list file.

    Command-line options:
        --limit N         Process only the first N listed files.
        --dry-run         Passed through to ``enrich_custodian_file`` as
                          ``dry_run=True``.
        --list-file PATH  Text file with one path per line (blank lines are
                          ignored). Defaults to ``/tmp/needs_youtube.txt``.

    Each listed file is handed to ``enrich_custodian_file`` with
    ``youtube_only=True``. Counters for modified / skipped / errored files
    and for statuses mentioning "YouTube" are kept and logged as a summary
    at the end. Processing stops early, but still prints the summary, when
    ``YouTubeQuotaExhaustedError`` is raised. If the list file does not
    exist, an error is logged and the function returns without processing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--dry-run", action="store_true", help="Don't modify files")
    parser.add_argument("--list-file", default="/tmp/needs_youtube.txt",
                        help="File with list of files to process")
    args = parser.parse_args()

    # Load list of files to process
    list_file = Path(args.list_file)
    if not list_file.exists():
        logger.error(f"List file not found: {list_file}")
        return

    # Explicit encoding so non-ASCII paths decode the same way regardless of
    # the locale the script happens to run under.
    with open(list_file, encoding="utf-8") as f:
        files = [Path(line.strip()) for line in f if line.strip()]

    logger.info(f"Loaded {len(files)} files from {list_file}")

    if args.limit:
        files = files[:args.limit]
        logger.info(f"Limited to {len(files)} files")

    if args.dry_run:
        logger.info("DRY RUN - no files will be modified")

    # Process files
    results = {"modified": 0, "skipped": 0, "errors": 0, "youtube_found": 0}
    total = len(files)

    # NOTE(review): a synchronous httpx.Client is passed to an awaited
    # coroutine; confirm that enrich_custodian_file expects a sync client.
    with httpx.Client(timeout=60.0) as client:
        for i, filepath in enumerate(files):
            if not filepath.exists():
                logger.warning(f"File not found: {filepath}")
                results["errors"] += 1
                continue

            try:
                modified, status = await enrich_custodian_file(
                    filepath,
                    client,
                    force=False,
                    dry_run=args.dry_run,
                    youtube_only=True,  # Always YouTube-only
                    maps_only=False,
                )

                if modified:
                    results["modified"] += 1
                    if "YouTube" in status:
                        results["youtube_found"] += 1
                else:
                    results["skipped"] += 1

                # Every 100th file gets an aggregate progress line instead of
                # the per-file status line.
                if (i + 1) % 100 == 0:
                    logger.info(f"Progress: {i+1}/{total} | Modified: {results['modified']} | YouTube found: {results['youtube_found']}")
                else:
                    logger.info(f" [{i+1}/{total}] {filepath.name}: {status}")

            except YouTubeQuotaExhaustedError:
                logger.error("=" * 60)
                logger.error("ALL YOUTUBE API KEYS EXHAUSTED - stopping enrichment")
                logger.error("=" * 60)
                break  # Exit the loop gracefully
            except Exception as e:
                # Top-level boundary: one bad file must not abort the batch.
                logger.error(f"Error processing {filepath.name}: {e}")
                results["errors"] += 1

            # Rate limiting between files. Use the asyncio sleep so the event
            # loop is not blocked while waiting (time.sleep would stall it).
            await asyncio.sleep(REQUEST_DELAY)

    # Summary
    logger.info("=" * 60)
    logger.info(f"SUMMARY: {results['modified']} modified, {results['skipped']} skipped, {results['errors']} errors")
    logger.info(f"YouTube channels found: {results['youtube_found']}")


if __name__ == "__main__":
    asyncio.run(main())