#!/usr/bin/env python3
"""
Verify YouTube enrichment quality using GLM-4.6 with CH-Annotator entity verification.

Uses the Z.AI Coding Plan API to validate:
1. Channel name consistency with custodian name
2. Content relevance to heritage domain
3. Video count and subscriber data plausibility
4. Channel description alignment with institution type

Per AGENTS.md Rule 11: Use Z.AI Coding Plan endpoint, NOT BigModel API.
"""

import os
import sys
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime
from typing import Optional, List
import argparse
import time

# Z.AI Coding Plan API configuration (per AGENTS.md Rule 11)
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_API_TOKEN = os.environ.get("ZAI_API_TOKEN")


def load_custodian_file(filepath: Path) -> dict:
    """Load a YAML custodian file and return its parsed contents."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def find_youtube_enriched_files(custodian_dir: Path, limit: Optional[int] = None) -> List[Path]:
    """Find custodian YAML files that have a successful youtube_enrichment section.

    Args:
        custodian_dir: Directory containing ``*.yaml`` custodian files.
        limit: Stop after collecting this many matches (None = no cap).

    Returns:
        Paths whose raw text contains both ``youtube_enrichment:`` and
        ``status: SUCCESS`` markers. Cheap substring check — does not parse YAML.
    """
    files: List[Path] = []
    for filepath in custodian_dir.glob("*.yaml"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
            if 'youtube_enrichment:' in content and 'status: SUCCESS' in content:
                files.append(filepath)
                if limit and len(files) >= limit:
                    break
        except (OSError, UnicodeDecodeError):
            # Best-effort scan: skip unreadable/mis-encoded files rather than abort.
            pass
    return files


def _extract_custodian_name(custodian: dict) -> str:
    """Return the display name from a custodian record.

    ``custodian_name`` may be either a plain string or a claim dict with
    ``claim_value`` / ``emic_name`` keys; falls back to 'Unknown'.
    """
    raw = custodian.get('custodian_name', {})
    if isinstance(raw, dict):
        return raw.get('claim_value', raw.get('emic_name', 'Unknown'))
    return str(raw) if raw else 'Unknown'


def verify_youtube_with_glm(custodian: dict) -> dict:
    """Use GLM-4.6 to verify YouTube enrichment quality and relevance.

    Builds a compact verification prompt from the custodian record's
    identity fields and its ``youtube_enrichment`` section, posts it to
    the Z.AI Coding Plan endpoint, and parses the model's JSON reply.

    Returns:
        The parsed verification dict on success, or a dict with
        ``error``, ``verified=False`` and ``overall_confidence=0.0`` on
        request/parse failure (never raises).
    """
    # Extract key identity fields (name may be a claim dict or plain string).
    name = _extract_custodian_name(custodian)
    ghcid = custodian.get('ghcid', {}).get('ghcid_current', 'Unknown')

    inst_type = custodian.get('custodian_type', {})
    if isinstance(inst_type, dict):
        inst_type = inst_type.get('claim_value', 'Unknown')
    else:
        inst_type = str(inst_type) if inst_type else 'Unknown'

    youtube = custodian.get('youtube_enrichment', {})

    # Sample up to 5 video titles, truncated to keep the prompt compact.
    videos = youtube.get('videos', [])[:5]
    video_titles = [v.get('title', '')[:50] for v in videos]

    # Build compact verification prompt.
    prompt = f"""Verify YouTube channel enrichment for heritage institution. Return ONLY valid JSON.

INSTITUTION:
- Name: {name}
- GHCID: {ghcid}
- Type: {inst_type}

YOUTUBE CHANNEL:
- Channel Title: {youtube.get('title', 'N/A')}
- Subscribers: {youtube.get('subscriber_count', 'N/A')}
- Video Count: {youtube.get('video_count', 'N/A')}
- Total Views: {youtube.get('view_count', 'N/A')}
- Description: {(youtube.get('description', '') or '')[:200]}

SAMPLE VIDEOS: {video_titles}

Verify:
1. Does channel title match/relate to institution name?
2. Is channel content relevant to heritage/cultural domain?
3. Are subscriber/video counts plausible for this institution type?
4. Do video titles relate to heritage/cultural activities?

CH-Annotator entity types for heritage: GRP.HER.MUS (museum), GRP.HER.LIB (library), GRP.HER.ARC (archive), GRP.HER.GAL (gallery)

Respond with JSON only:
{{"name_match_score": 0.0-1.0, "content_relevance": 0.0-1.0, "data_plausibility": 0.0-1.0, "video_relevance": 0.0-1.0, "overall_confidence": 0.0-1.0, "ch_annotator_type": "GRP.HER.XXX", "issues": [], "verified": true}}"""

    try:
        response = httpx.post(
            ZAI_API_URL,
            headers={
                "Authorization": f"Bearer {ZAI_API_TOKEN}",
                "Content-Type": "application/json"
            },
            json={
                "model": "glm-4.6",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
                "max_tokens": 4000
            },
            timeout=180.0
        )
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content']

        # Strip a markdown code fence if the model wrapped its JSON in one.
        if '```json' in content:
            content = content.split('```json')[1].split('```')[0].strip()
        elif '```' in content:
            content = content.split('```')[1].split('```')[0].strip()

        return json.loads(content)
    except json.JSONDecodeError as e:
        return {
            "error": f"JSON parse error: {str(e)}",
            "verified": False,
            "overall_confidence": 0.0
        }
    except Exception as e:
        # Network/HTTP failures are reported as data, not raised, so the
        # caller's batch loop can continue with the remaining files.
        return {
            "error": str(e),
            "verified": False,
            "overall_confidence": 0.0
        }


def main():
    """Main verification routine: select files, verify each, print and save a report."""
    parser = argparse.ArgumentParser(description="Verify YouTube enrichment with GLM-4.6")
    parser.add_argument("--limit", type=int, default=20, help="Number of files to verify")
    parser.add_argument("--file", type=str, help="Verify a specific file")
    parser.add_argument("--recent", action="store_true", help="Verify most recently modified files")
    args = parser.parse_args()

    if not ZAI_API_TOKEN:
        print("ERROR: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=")
        sys.exit(1)

    custodian_dir = Path("data/custodian")

    if args.file:
        filepath = Path(args.file)
        if not filepath.exists():
            # Allow a bare filename relative to the custodian directory.
            filepath = custodian_dir / args.file
        sample_files = [filepath] if filepath.exists() else []
    else:
        # Fetch 2x the limit so --recent has a pool to sort before truncating.
        sample_files = find_youtube_enriched_files(custodian_dir, limit=args.limit * 2)
        if args.recent:
            # Sort by modification time (most recent first)
            sample_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)
        sample_files = sample_files[:args.limit]

    if not sample_files:
        print("No YouTube-enriched custodian files found!")
        sys.exit(1)

    print("=" * 70)
    print("GLM-4.6 YouTube Enrichment Verification (CH-Annotator v1.7.0)")
    print("=" * 70)
    print(f"Timestamp: {datetime.now().isoformat()}")
    print(f"Files to verify: {len(sample_files)}")
    print()

    results = []
    for i, filepath in enumerate(sample_files, 1):
        print(f"[{i}/{len(sample_files)}] {filepath.name}")
        try:
            custodian = load_custodian_file(filepath)
        except Exception as e:
            print(f"  ERROR loading: {e}")
            continue

        # Get names for display
        name = _extract_custodian_name(custodian)
        youtube = custodian.get('youtube_enrichment', {})
        channel_title = youtube.get('title', 'N/A')
        subscriber_count = youtube.get('subscriber_count', 0)
        video_count = youtube.get('video_count', 0)

        print(f"  Custodian: {name}")
        print(f"  Channel: {channel_title}")
        # NOTE(review): assumes subscriber_count is numeric when truthy
        # (the ':,' format would raise on a string) — confirm upstream schema.
        print(f"  Subscribers: {subscriber_count:,}" if subscriber_count else "  Subscribers: Hidden")
        print(f"  Videos: {video_count}")

        verification = verify_youtube_with_glm(custodian)

        if 'error' in verification:
            print(f"  ERROR: {verification['error']}")
        else:
            print(f"  Name Match: {verification.get('name_match_score', 0):.2f}")
            print(f"  Content Relevance: {verification.get('content_relevance', 0):.2f}")
            print(f"  Overall Confidence: {verification.get('overall_confidence', 0):.2f}")
            print(f"  CH-Annotator Type: {verification.get('ch_annotator_type', 'N/A')}")
            print(f"  Verified: {'Y' if verification.get('verified') else 'N'}")
            if verification.get('issues'):
                print(f"  Issues: {verification['issues']}")

        results.append({
            "file": str(filepath),
            "custodian_name": name,
            "channel_title": channel_title,
            "subscriber_count": subscriber_count,
            "video_count": video_count,
            "verification": verification
        })
        print()

        # Small delay to avoid rate limiting
        time.sleep(0.5)

    # Summary
    print("=" * 70)
    print("VERIFICATION SUMMARY")
    print("=" * 70)

    verified_count = sum(1 for r in results if r['verification'].get('verified', False))
    avg_confidence = sum(r['verification'].get('overall_confidence', 0) for r in results) / len(results) if results else 0
    avg_name_match = sum(r['verification'].get('name_match_score', 0) for r in results) / len(results) if results else 0
    avg_content_relevance = sum(r['verification'].get('content_relevance', 0) for r in results) / len(results) if results else 0

    print(f"Files Verified: {verified_count}/{len(results)}")
    print(f"Average Overall Confidence: {avg_confidence:.2f}")
    print(f"Average Name Match Score: {avg_name_match:.2f}")
    print(f"Average Content Relevance: {avg_content_relevance:.2f}")

    # List any issues
    issues_found = [r for r in results if r['verification'].get('issues')]
    if issues_found:
        print(f"\nFiles with Issues: {len(issues_found)}")
        for r in issues_found[:5]:
            print(f"  - {r['file']}: {r['verification']['issues']}")

    # Save results
    output_file = Path("reports/youtube_enrichment_verification_glm.json")
    # parents=True so the report dir is created even from a fresh checkout.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w') as f:
        json.dump({
            "verification_timestamp": datetime.now().isoformat(),
            "model": "glm-4.6",
            "convention": "ch_annotator-v1_7_0",
            "results": results,
            "summary": {
                "files_verified": verified_count,
                "total_files": len(results),
                "average_confidence": avg_confidence,
                "average_name_match": avg_name_match,
                "average_content_relevance": avg_content_relevance
            }
        }, f, indent=2, ensure_ascii=False)

    print(f"\nResults saved to: {output_file}")


if __name__ == "__main__":
    main()