108 lines
3.6 KiB
Python
Executable file
108 lines
3.6 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Process YouTube enrichment ONLY for files listed in /tmp/needs_youtube.txt
|
|
|
|
Usage:
|
|
python scripts/youtube_only_from_list.py [--limit N] [--dry-run]
|
|
"""
|
|
|
|
import asyncio
|
|
import sys
|
|
import os
|
|
import argparse
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
# Add parent to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
# Import from the main script
|
|
from scripts.enrich_custodian_youtube_maps import (
|
|
enrich_custodian_file,
|
|
REQUEST_DELAY,
|
|
YouTubeQuotaExhaustedError,
|
|
)
|
|
import httpx
|
|
import time
|
|
|
|
# Root logging setup: INFO and above, each record prefixed with a timestamp
# and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used for all messages emitted by this script.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def main():
    """Run YouTube-only enrichment over the files named in a list file.

    Command-line options (parsed from ``sys.argv``):
        --limit N       Process at most the first N listed files. ``0`` means
                        "process nothing"; a negative value is rejected.
        --dry-run       Forwarded to ``enrich_custodian_file`` as ``dry_run``.
        --list-file P   Text file with one path per line; blank lines are
                        ignored (default ``/tmp/needs_youtube.txt``).

    Every listed file that exists is passed to ``enrich_custodian_file`` with
    ``youtube_only=True``. Listed files that do not exist, and any unexpected
    exception raised while processing a file, are counted as errors and the
    run continues. ``YouTubeQuotaExhaustedError`` stops the run early. A
    summary of the counters is logged once the loop ends.

    Returns:
        None. Returns early, after logging an error, when the list file
        does not exist.

    Raises:
        SystemExit: Raised by argparse for invalid arguments, including a
            negative ``--limit``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--dry-run", action="store_true", help="Don't modify files")
    parser.add_argument("--list-file", default="/tmp/needs_youtube.txt", help="File with list of files to process")
    args = parser.parse_args()

    # A negative limit would otherwise slice from the end of the list
    # (files[:-N]), silently dropping the last N entries instead of limiting.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer")

    # Load list of files to process
    list_file = Path(args.list_file)
    if not list_file.exists():
        logger.error(f"List file not found: {list_file}")
        return

    # Explicit encoding so path names decode the same way on every platform.
    with open(list_file, encoding="utf-8") as f:
        files = [Path(line.strip()) for line in f if line.strip()]

    logger.info(f"Loaded {len(files)} files from {list_file}")

    # Compare against None rather than truthiness so that `--limit 0` is
    # honoured instead of being treated as "no limit".
    if args.limit is not None:
        files = files[:args.limit]
        logger.info(f"Limited to {len(files)} files")

    if args.dry_run:
        logger.info("DRY RUN - no files will be modified")

    # Process files
    results = {"modified": 0, "skipped": 0, "errors": 0, "youtube_found": 0}

    # NOTE(review): enrich_custodian_file is awaited but receives a
    # synchronous httpx.Client - presumably it performs blocking HTTP calls
    # internally; confirm against its definition.
    with httpx.Client(timeout=60.0) as client:
        for i, filepath in enumerate(files):
            if not filepath.exists():
                logger.warning(f"File not found: {filepath}")
                results["errors"] += 1
                continue

            try:
                modified, status = await enrich_custodian_file(
                    filepath, client, force=False, dry_run=args.dry_run,
                    youtube_only=True,  # Always YouTube-only
                    maps_only=False,
                )
                if modified:
                    results["modified"] += 1
                    if "YouTube" in status:
                        results["youtube_found"] += 1
                else:
                    results["skipped"] += 1

                # Every 100th file gets an aggregate progress line; all other
                # files get a per-file status line.
                if (i + 1) % 100 == 0:
                    logger.info(f"Progress: {i+1}/{len(files)} | Modified: {results['modified']} | YouTube found: {results['youtube_found']}")
                else:
                    logger.info(f"  [{i+1}/{len(files)}] {filepath.name}: {status}")

            except YouTubeQuotaExhaustedError:
                logger.error("=" * 60)
                logger.error("ALL YOUTUBE API KEYS EXHAUSTED - stopping enrichment")
                logger.error("=" * 60)
                break  # Exit the loop gracefully; the summary is still logged
            except Exception as e:
                # Best-effort batch run: record the failure and keep going.
                logger.error(f"Error processing {filepath.name}: {e}")
                results["errors"] += 1

            # Rate limiting between files. Use the asyncio sleep so the
            # coroutine yields to the event loop instead of blocking it.
            await asyncio.sleep(REQUEST_DELAY)

    # Summary
    logger.info("=" * 60)
    logger.info(f"SUMMARY: {results['modified']} modified, {results['skipped']} skipped, {results['errors']} errors")
    logger.info(f"YouTube channels found: {results['youtube_found']}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the async main() to completion on a new
    # event loop.
    asyncio.run(main())
|