# glam/scripts/verify_youtube_enrichment_glm.py
# Snapshot metadata: 2025-12-10 18:04:25 +01:00 — 285 lines, 10 KiB, Python
# NOTE(review): these lines were bare file-viewer metadata pasted above the
# shebang, which made the file invalid Python; converted to comments. The
# shebang on the next non-blank line is no longer line 1, so direct
# `./script.py` execution relies on invoking via `python script.py` instead.

#!/usr/bin/env python3
"""
Verify YouTube enrichment quality using GLM-4.6 with CH-Annotator entity verification.
Uses the Z.AI Coding Plan API to validate:
1. Channel name consistency with custodian name
2. Content relevance to heritage domain
3. Video count and subscriber data plausibility
4. Channel description alignment with institution type
Per AGENTS.md Rule 11: Use Z.AI Coding Plan endpoint, NOT BigModel API.
"""
import os
import sys
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime
from typing import Optional, List
import argparse
import time
# Z.AI Coding Plan API configuration (per AGENTS.md Rule 11)
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_API_TOKEN = os.environ.get("ZAI_API_TOKEN")
def load_custodian_file(filepath: Path) -> dict:
    """Parse a single custodian record from a YAML file and return it as a dict."""
    # safe_load accepts a string just as readily as a file handle.
    return yaml.safe_load(filepath.read_text(encoding='utf-8'))
def find_youtube_enriched_files(custodian_dir: Path, limit: Optional[int] = None) -> List[Path]:
    """Scan *custodian_dir* for YAML files with a successful YouTube enrichment.

    A file qualifies when its raw text contains both a 'youtube_enrichment:'
    key and a 'status: SUCCESS' marker. This is a cheap substring scan, not a
    YAML parse, so a 'status: SUCCESS' belonging to a different section would
    also match — acceptable for candidate selection, since each hit is fully
    parsed and verified downstream.

    Args:
        custodian_dir: Directory containing ``*.yaml`` custodian files.
        limit: Stop after collecting this many matches (falsy = no cap,
            matching the original call sites).

    Returns:
        Matching file paths, in filesystem glob order (not sorted).
    """
    files: List[Path] = []
    for filepath in custodian_dir.glob("*.yaml"):
        try:
            content = filepath.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError):
            # Best-effort scan: skip unreadable or mis-encoded files, but no
            # longer swallow arbitrary exceptions (the old bare `except
            # Exception: pass` would also have hidden programming errors).
            continue
        if 'youtube_enrichment:' in content and 'status: SUCCESS' in content:
            files.append(filepath)
            if limit and len(files) >= limit:
                break
    return files
def verify_youtube_with_glm(custodian: dict) -> dict:
    """Use GLM-4.6 (Z.AI Coding Plan endpoint) to verify YouTube enrichment quality.

    Builds a compact prompt from the custodian record's name, GHCID, type and
    YouTube channel stats, asks the model for a JSON verdict, and parses it.

    Args:
        custodian: Parsed custodian YAML record.

    Returns:
        The model's verdict dict (scores, issues, `verified` flag), or an
        error dict ``{"error": ..., "verified": False, "overall_confidence": 0.0}``
        on any request/parse failure.
    """
    # --- Extract display name (claim dict or plain scalar) ---------------
    custodian_name = custodian.get('custodian_name', {})
    if isinstance(custodian_name, dict):
        name = custodian_name.get('claim_value', custodian_name.get('emic_name', 'Unknown'))
    else:
        name = str(custodian_name) if custodian_name else 'Unknown'
    # BUGFIX: the original did custodian.get('ghcid', {}).get(...) which raised
    # AttributeError (outside the try block) whenever 'ghcid' was a scalar.
    ghcid_field = custodian.get('ghcid')
    if isinstance(ghcid_field, dict):
        ghcid = ghcid_field.get('ghcid_current', 'Unknown')
    else:
        ghcid = str(ghcid_field) if ghcid_field else 'Unknown'
    inst_type = custodian.get('custodian_type', {})
    if isinstance(inst_type, dict):
        inst_type = inst_type.get('claim_value', 'Unknown')
    else:
        inst_type = str(inst_type) if inst_type else 'Unknown'
    youtube = custodian.get('youtube_enrichment', {})
    # Up to 5 sample video titles, truncated to keep the prompt compact.
    videos = youtube.get('videos', [])[:5]
    video_titles = [v.get('title', '')[:50] for v in videos]
    # --- Build compact verification prompt -------------------------------
    prompt = f"""Verify YouTube channel enrichment for heritage institution. Return ONLY valid JSON.
INSTITUTION:
- Name: {name}
- GHCID: {ghcid}
- Type: {inst_type}
YOUTUBE CHANNEL:
- Channel Title: {youtube.get('title', 'N/A')}
- Subscribers: {youtube.get('subscriber_count', 'N/A')}
- Video Count: {youtube.get('video_count', 'N/A')}
- Total Views: {youtube.get('view_count', 'N/A')}
- Description: {(youtube.get('description', '') or '')[:200]}
SAMPLE VIDEOS: {video_titles}
Verify:
1. Does channel title match/relate to institution name?
2. Is channel content relevant to heritage/cultural domain?
3. Are subscriber/video counts plausible for this institution type?
4. Do video titles relate to heritage/cultural activities?
CH-Annotator entity types for heritage: GRP.HER.MUS (museum), GRP.HER.LIB (library), GRP.HER.ARC (archive), GRP.HER.GAL (gallery)
Respond with JSON only:
{{"name_match_score": 0.0-1.0, "content_relevance": 0.0-1.0, "data_plausibility": 0.0-1.0, "video_relevance": 0.0-1.0, "overall_confidence": 0.0-1.0, "ch_annotator_type": "GRP.HER.XXX", "issues": [], "verified": true}}"""
    try:
        response = httpx.post(
            ZAI_API_URL,
            headers={
                "Authorization": f"Bearer {ZAI_API_TOKEN}",
                "Content-Type": "application/json"
            },
            json={
                "model": "glm-4.6",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,  # low temperature: deterministic scoring
                "max_tokens": 4000
            },
            timeout=180.0
        )
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content']
        # Strip a markdown code fence if the model wrapped its JSON in one.
        if '```json' in content:
            content = content.split('```json')[1].split('```')[0].strip()
        elif '```' in content:
            content = content.split('```')[1].split('```')[0].strip()
        return json.loads(content)
    except json.JSONDecodeError as e:
        # Model replied, but not with parseable JSON.
        return {
            "error": f"JSON parse error: {str(e)}",
            "verified": False,
            "overall_confidence": 0.0
        }
    except Exception as e:
        # Network/HTTP/shape errors — report, don't crash the batch run.
        return {
            "error": str(e),
            "verified": False,
            "overall_confidence": 0.0
        }
def main():
    """Main verification routine: select files, verify each with GLM, summarize.

    Exits with status 1 when ZAI_API_TOKEN is missing or no candidate files
    are found. Writes a JSON report to reports/youtube_enrichment_verification_glm.json.
    """
    parser = argparse.ArgumentParser(description="Verify YouTube enrichment with GLM-4.6")
    parser.add_argument("--limit", type=int, default=20, help="Number of files to verify")
    parser.add_argument("--file", type=str, help="Verify a specific file")
    parser.add_argument("--recent", action="store_true", help="Verify most recently modified files")
    args = parser.parse_args()
    # Fail fast before doing any filesystem work: every request needs the token.
    if not ZAI_API_TOKEN:
        print("ERROR: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=<your_token>")
        sys.exit(1)
    custodian_dir = Path("data/custodian")
    if args.file:
        # Accept either an absolute/relative path or a bare filename
        # resolved against the custodian directory.
        filepath = Path(args.file)
        if not filepath.exists():
            filepath = custodian_dir / args.file
        sample_files = [filepath] if filepath.exists() else []
    else:
        # Over-fetch (2x limit) so --recent has a pool to sort before trimming.
        sample_files = find_youtube_enriched_files(custodian_dir, limit=args.limit * 2)
        if args.recent:
            # Sort by modification time (most recent first)
            sample_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)
        sample_files = sample_files[:args.limit]
    if not sample_files:
        print("No YouTube-enriched custodian files found!")
        sys.exit(1)
    print("=" * 70)
    print("GLM-4.6 YouTube Enrichment Verification (CH-Annotator v1.7.0)")
    print("=" * 70)
    print(f"Timestamp: {datetime.now().isoformat()}")
    print(f"Files to verify: {len(sample_files)}")
    print()
    results = []
    for i, filepath in enumerate(sample_files, 1):
        print(f"[{i}/{len(sample_files)}] {filepath.name}")
        try:
            custodian = load_custodian_file(filepath)
        except Exception as e:
            print(f" ERROR loading: {e}")
            continue
        # Display name: claim dict or plain scalar (mirrors verify_youtube_with_glm).
        custodian_name = custodian.get('custodian_name', {})
        if isinstance(custodian_name, dict):
            name = custodian_name.get('claim_value', custodian_name.get('emic_name', 'Unknown'))
        else:
            name = str(custodian_name) if custodian_name else 'Unknown'
        youtube = custodian.get('youtube_enrichment', {})
        channel_title = youtube.get('title', 'N/A')
        subscriber_count = youtube.get('subscriber_count', 0)
        video_count = youtube.get('video_count', 0)
        print(f" Custodian: {name}")
        print(f" Channel: {channel_title}")
        # A falsy subscriber count means the channel hides its subscribers.
        print(f" Subscribers: {subscriber_count:,}" if subscriber_count else " Subscribers: Hidden")
        print(f" Videos: {video_count}")
        verification = verify_youtube_with_glm(custodian)
        if 'error' in verification:
            print(f" ERROR: {verification['error']}")
        else:
            print(f" Name Match: {verification.get('name_match_score', 0):.2f}")
            print(f" Content Relevance: {verification.get('content_relevance', 0):.2f}")
            print(f" Overall Confidence: {verification.get('overall_confidence', 0):.2f}")
            print(f" CH-Annotator Type: {verification.get('ch_annotator_type', 'N/A')}")
            print(f" Verified: {'Y' if verification.get('verified') else 'N'}")
            if verification.get('issues'):
                print(f" Issues: {verification['issues']}")
        # Errors are recorded too, so the summary denominators stay honest.
        results.append({
            "file": str(filepath),
            "custodian_name": name,
            "channel_title": channel_title,
            "subscriber_count": subscriber_count,
            "video_count": video_count,
            "verification": verification
        })
        print()
        # Small delay to avoid rate limiting
        time.sleep(0.5)
    # --- Summary ----------------------------------------------------------
    print("=" * 70)
    print("VERIFICATION SUMMARY")
    print("=" * 70)
    verified_count = sum(1 for r in results if r['verification'].get('verified', False))
    avg_confidence = sum(r['verification'].get('overall_confidence', 0) for r in results) / len(results) if results else 0
    avg_name_match = sum(r['verification'].get('name_match_score', 0) for r in results) / len(results) if results else 0
    avg_content_relevance = sum(r['verification'].get('content_relevance', 0) for r in results) / len(results) if results else 0
    print(f"Files Verified: {verified_count}/{len(results)}")
    print(f"Average Overall Confidence: {avg_confidence:.2f}")
    print(f"Average Name Match Score: {avg_name_match:.2f}")
    print(f"Average Content Relevance: {avg_content_relevance:.2f}")
    # List any issues (capped at 5 to keep console output short).
    issues_found = [r for r in results if r['verification'].get('issues')]
    if issues_found:
        print(f"\nFiles with Issues: {len(issues_found)}")
        for r in issues_found[:5]:
            print(f" - {r['file']}: {r['verification']['issues']}")
    # --- Save results -----------------------------------------------------
    output_file = Path("reports/youtube_enrichment_verification_glm.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # BUGFIX: open with an explicit UTF-8 encoding — the original relied on
    # the platform default, which breaks non-ASCII output (ensure_ascii=False)
    # on Windows/cp1252.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({
            "verification_timestamp": datetime.now().isoformat(),
            "model": "glm-4.6",
            "convention": "ch_annotator-v1_7_0",
            "results": results,
            "summary": {
                "files_verified": verified_count,
                "total_files": len(results),
                "average_confidence": avg_confidence,
                "average_name_match": avg_name_match,
                "average_content_relevance": avg_content_relevance
            }
        }, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to: {output_file}")
if __name__ == "__main__":
main()