# glam/scripts/youtube_only_from_list.py

#!/usr/bin/env python3
"""
Process YouTube enrichment ONLY for the files named in a list file
(one path per line; default: /tmp/needs_youtube.txt).

Usage:
    python scripts/youtube_only_from_list.py [--limit N] [--dry-run] [--list-file PATH]
"""

import asyncio
import sys
import os
import argparse
import logging
from pathlib import Path

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

# Import from the main script
from scripts.enrich_custodian_youtube_maps import (
    enrich_custodian_file,
    REQUEST_DELAY,
    YouTubeQuotaExhaustedError,
)
import httpx
import time

# Root logger setup: timestamped records at INFO and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by everything below.
logger = logging.getLogger(__name__)


async def main():
    """Run YouTube-only enrichment over the files named in a list file.

    Command-line options (parsed from ``sys.argv``):
        --limit N       process only the first N listed files
        --dry-run       forwarded to ``enrich_custodian_file`` as ``dry_run``
        --list-file P   text file with one path per line
                        (default: ``/tmp/needs_youtube.txt``)

    Every listed file that exists is passed to ``enrich_custodian_file`` with
    ``youtube_only=True``. Listed files that do not exist are counted as
    errors. The loop stops early when ``YouTubeQuotaExhaustedError`` is
    raised; any other exception is logged, counted as an error, and the loop
    moves on to the next file. A summary of the counters is logged at the end.

    Returns:
        None. If the list file itself is missing, an error is logged and
        nothing is processed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--dry-run", action="store_true", help="Don't modify files")
    parser.add_argument("--list-file", default="/tmp/needs_youtube.txt", help="File with list of files to process")
    args = parser.parse_args()

    # Load list of files to process
    list_file = Path(args.list_file)
    if not list_file.exists():
        logger.error(f"List file not found: {list_file}")
        return

    # One path per line; blank lines are skipped. The encoding is explicit so
    # non-ASCII paths are read the same way regardless of the machine's locale.
    with open(list_file, encoding="utf-8") as f:
        files = [Path(line.strip()) for line in f if line.strip()]

    logger.info(f"Loaded {len(files)} files from {list_file}")

    if args.limit:
        files = files[:args.limit]
        logger.info(f"Limited to {len(files)} files")

    if args.dry_run:
        logger.info("DRY RUN - no files will be modified")

    # Process files
    results = {"modified": 0, "skipped": 0, "errors": 0, "youtube_found": 0}
    total = len(files)

    # NOTE(review): a synchronous httpx.Client is handed to an awaited
    # coroutine - confirm enrich_custodian_file expects a sync client.
    with httpx.Client(timeout=60.0) as client:
        for position, filepath in enumerate(files, start=1):
            if not filepath.exists():
                logger.warning(f"File not found: {filepath}")
                results["errors"] += 1
                continue

            try:
                modified, status = await enrich_custodian_file(
                    filepath, client, force=False, dry_run=args.dry_run,
                    youtube_only=True,  # Always YouTube-only
                    maps_only=False,
                )
                if modified:
                    results["modified"] += 1
                    if "YouTube" in status:
                        results["youtube_found"] += 1
                else:
                    results["skipped"] += 1

                # Always record the per-file outcome so that no file's status
                # is missing from the log.
                logger.info(f"  [{position}/{total}] {filepath.name}: {status}")

                # Progress every 100 files
                if position % 100 == 0:
                    logger.info(f"Progress: {position}/{total} | Modified: {results['modified']} | YouTube found: {results['youtube_found']}")

            except YouTubeQuotaExhaustedError:
                logger.error("=" * 60)
                logger.error("ALL YOUTUBE API KEYS EXHAUSTED - stopping enrichment")
                logger.error("=" * 60)
                break  # Exit the loop gracefully
            except Exception as e:
                # Best-effort batch run: record the failure and keep going.
                logger.error(f"Error processing {filepath.name}: {e}")
                results["errors"] += 1

            # Rate limiting between files. asyncio.sleep yields to the event
            # loop instead of blocking it, and no delay is needed once the
            # last file has been handled.
            if position < total:
                await asyncio.sleep(REQUEST_DELAY)

    # Summary
    logger.info("=" * 60)
    logger.info(f"SUMMARY: {results['modified']} modified, {results['skipped']} skipped, {results['errors']} errors")
    logger.info(f"YouTube channels found: {results['youtube_found']}")


# Script entry point: run the async main() to completion on a new event loop.
if __name__ == "__main__":
    asyncio.run(main())