# glam/scripts/verify_youtube_enrichment_glm.py
# Snapshot metadata: 2025-12-10 18:04:25 +01:00 — 285 lines, 10 KiB, Python
# NOTE(review): these lines were bare file-viewer metadata pasted above the
# shebang, which made the file invalid Python; converted to comments. The
# shebang on the next non-blank line is no longer line 1, so direct
# `./script.py` execution relies on invoking via `python script.py` instead.

#!/usr/bin/env python3
"""
Verify YouTube enrichment quality using GLM-4.6 with CH-Annotator entity verification.
Uses the Z.AI Coding Plan API to validate:
1. Channel name consistency with custodian name
2. Content relevance to heritage domain
3. Video count and subscriber data plausibility
4. Channel description alignment with institution type
Per AGENTS.md Rule 11: Use Z.AI Coding Plan endpoint, NOT BigModel API.
"""
import os
import sys
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime
from typing import Optional, List
import argparse
import time
# Z.AI Coding Plan API configuration (per AGENTS.md Rule 11)
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_API_TOKEN = os.environ.get("ZAI_API_TOKEN")
def load_custodian_file(filepath: Path) -> dict:
    """Parse a single custodian record from a YAML file and return it as a dict."""
    # safe_load accepts a string just as readily as a file handle.
    return yaml.safe_load(filepath.read_text(encoding='utf-8'))
def find_youtube_enriched_files(custodian_dir: Path, limit: Optional[int] = None) -> List[Path]:
    """Scan *custodian_dir* for YAML files with a successful YouTube enrichment.

    A file qualifies when its raw text contains both a 'youtube_enrichment:'
    key and a 'status: SUCCESS' marker. This is a cheap substring scan, not a
    YAML parse, so a 'status: SUCCESS' belonging to a different section would
    also match — acceptable for candidate selection, since each hit is fully
    parsed and verified downstream.

    Args:
        custodian_dir: Directory containing ``*.yaml`` custodian files.
        limit: Stop after collecting this many matches (falsy = no cap,
            matching the original call sites).

    Returns:
        Matching file paths, in filesystem glob order (not sorted).
    """
    files: List[Path] = []
    for filepath in custodian_dir.glob("*.yaml"):
        try:
            content = filepath.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError):
            # Best-effort scan: skip unreadable or mis-encoded files, but no
            # longer swallow arbitrary exceptions (the old bare `except
            # Exception: pass` would also have hidden programming errors).
            continue
        if 'youtube_enrichment:' in content and 'status: SUCCESS' in content:
            files.append(filepath)
            if limit and len(files) >= limit:
                break
    return files
def verify_youtube_with_glm(custodian: dict) -> dict:
    """Use GLM-4.6 (Z.AI Coding Plan endpoint) to verify YouTube enrichment quality.

    Builds a compact prompt from the custodian record's name, GHCID, type and
    YouTube channel stats, asks the model for a JSON verdict, and parses it.

    Args:
        custodian: Parsed custodian YAML record.

    Returns:
        The model's verdict dict (scores, issues, `verified` flag), or an
        error dict ``{"error": ..., "verified": False, "overall_confidence": 0.0}``
        on any request/parse failure.
    """
    # --- Extract display name (claim dict or plain scalar) ---------------
    custodian_name = custodian.get('custodian_name', {})
    if isinstance(custodian_name, dict):
        name = custodian_name.get('claim_value', custodian_name.get('emic_name', 'Unknown'))
    else:
        name = str(custodian_name) if custodian_name else 'Unknown'
    # BUGFIX: the original did custodian.get('ghcid', {}).get(...) which raised
    # AttributeError (outside the try block) whenever 'ghcid' was a scalar.
    ghcid_field = custodian.get('ghcid')
    if isinstance(ghcid_field, dict):
        ghcid = ghcid_field.get('ghcid_current', 'Unknown')
    else:
        ghcid = str(ghcid_field) if ghcid_field else 'Unknown'
    inst_type = custodian.get('custodian_type', {})
    if isinstance(inst_type, dict):
        inst_type = inst_type.get('claim_value', 'Unknown')
    else:
        inst_type = str(inst_type) if inst_type else 'Unknown'
    youtube = custodian.get('youtube_enrichment', {})
    # Up to 5 sample video titles, truncated to keep the prompt compact.
    videos = youtube.get('videos', [])[:5]
    video_titles = [v.get('title', '')[:50] for v in videos]
    # --- Build compact verification prompt -------------------------------
    prompt = f"""Verify YouTube channel enrichment for heritage institution. Return ONLY valid JSON.
INSTITUTION:
- Name: {name}
- GHCID: {ghcid}
- Type: {inst_type}
YOUTUBE CHANNEL:
- Channel Title: {youtube.get('title', 'N/A')}
- Subscribers: {youtube.get('subscriber_count', 'N/A')}
- Video Count: {youtube.get('video_count', 'N/A')}
- Total Views: {youtube.get('view_count', 'N/A')}
- Description: {(youtube.get('description', '') or '')[:200]}
SAMPLE VIDEOS: {video_titles}
Verify:
1. Does channel title match/relate to institution name?
2. Is channel content relevant to heritage/cultural domain?
3. Are subscriber/video counts plausible for this institution type?
4. Do video titles relate to heritage/cultural activities?
CH-Annotator entity types for heritage: GRP.HER.MUS (museum), GRP.HER.LIB (library), GRP.HER.ARC (archive), GRP.HER.GAL (gallery)
Respond with JSON only:
{{"name_match_score": 0.0-1.0, "content_relevance": 0.0-1.0, "data_plausibility": 0.0-1.0, "video_relevance": 0.0-1.0, "overall_confidence": 0.0-1.0, "ch_annotator_type": "GRP.HER.XXX", "issues": [], "verified": true}}"""
    try:
        response = httpx.post(
            ZAI_API_URL,
            headers={
                "Authorization": f"Bearer {ZAI_API_TOKEN}",
                "Content-Type": "application/json"
            },
            json={
                "model": "glm-4.6",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,  # low temperature: deterministic scoring
                "max_tokens": 4000
            },
            timeout=180.0
        )
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content']
        # Strip a markdown code fence if the model wrapped its JSON in one.
        if '```json' in content:
            content = content.split('```json')[1].split('```')[0].strip()
        elif '```' in content:
            content = content.split('```')[1].split('```')[0].strip()
        return json.loads(content)
    except json.JSONDecodeError as e:
        # Model replied, but not with parseable JSON.
        return {
            "error": f"JSON parse error: {str(e)}",
            "verified": False,
            "overall_confidence": 0.0
        }
    except Exception as e:
        # Network/HTTP/shape errors — report, don't crash the batch run.
        return {
            "error": str(e),
            "verified": False,
            "overall_confidence": 0.0
        }
def main():
    """Main verification routine: select files, verify each with GLM, summarize.

    Exits with status 1 when ZAI_API_TOKEN is missing or no candidate files
    are found. Writes a JSON report to reports/youtube_enrichment_verification_glm.json.
    """
    parser = argparse.ArgumentParser(description="Verify YouTube enrichment with GLM-4.6")
    parser.add_argument("--limit", type=int, default=20, help="Number of files to verify")
    parser.add_argument("--file", type=str, help="Verify a specific file")
    parser.add_argument("--recent", action="store_true", help="Verify most recently modified files")
    args = parser.parse_args()
    # Fail fast before doing any filesystem work: every request needs the token.
    if not ZAI_API_TOKEN:
        print("ERROR: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=<your_token>")
        sys.exit(1)
    custodian_dir = Path("data/custodian")
    if args.file:
        # Accept either an absolute/relative path or a bare filename
        # resolved against the custodian directory.
        filepath = Path(args.file)
        if not filepath.exists():
            filepath = custodian_dir / args.file
        sample_files = [filepath] if filepath.exists() else []
    else:
        # Over-fetch (2x limit) so --recent has a pool to sort before trimming.
        sample_files = find_youtube_enriched_files(custodian_dir, limit=args.limit * 2)
        if args.recent:
            # Sort by modification time (most recent first)
            sample_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)
        sample_files = sample_files[:args.limit]
    if not sample_files:
        print("No YouTube-enriched custodian files found!")
        sys.exit(1)
    print("=" * 70)
    print("GLM-4.6 YouTube Enrichment Verification (CH-Annotator v1.7.0)")
    print("=" * 70)
    print(f"Timestamp: {datetime.now().isoformat()}")
    print(f"Files to verify: {len(sample_files)}")
    print()
    results = []
    for i, filepath in enumerate(sample_files, 1):
        print(f"[{i}/{len(sample_files)}] {filepath.name}")
        try:
            custodian = load_custodian_file(filepath)
        except Exception as e:
            print(f" ERROR loading: {e}")
            continue
        # Display name: claim dict or plain scalar (mirrors verify_youtube_with_glm).
        custodian_name = custodian.get('custodian_name', {})
        if isinstance(custodian_name, dict):
            name = custodian_name.get('claim_value', custodian_name.get('emic_name', 'Unknown'))
        else:
            name = str(custodian_name) if custodian_name else 'Unknown'
        youtube = custodian.get('youtube_enrichment', {})
        channel_title = youtube.get('title', 'N/A')
        subscriber_count = youtube.get('subscriber_count', 0)
        video_count = youtube.get('video_count', 0)
        print(f" Custodian: {name}")
        print(f" Channel: {channel_title}")
        # A falsy subscriber count means the channel hides its subscribers.
        print(f" Subscribers: {subscriber_count:,}" if subscriber_count else " Subscribers: Hidden")
        print(f" Videos: {video_count}")
        verification = verify_youtube_with_glm(custodian)
        if 'error' in verification:
            print(f" ERROR: {verification['error']}")
        else:
            print(f" Name Match: {verification.get('name_match_score', 0):.2f}")
            print(f" Content Relevance: {verification.get('content_relevance', 0):.2f}")
            print(f" Overall Confidence: {verification.get('overall_confidence', 0):.2f}")
            print(f" CH-Annotator Type: {verification.get('ch_annotator_type', 'N/A')}")
            print(f" Verified: {'Y' if verification.get('verified') else 'N'}")
            if verification.get('issues'):
                print(f" Issues: {verification['issues']}")
        # Errors are recorded too, so the summary denominators stay honest.
        results.append({
            "file": str(filepath),
            "custodian_name": name,
            "channel_title": channel_title,
            "subscriber_count": subscriber_count,
            "video_count": video_count,
            "verification": verification
        })
        print()
        # Small delay to avoid rate limiting
        time.sleep(0.5)
    # --- Summary ----------------------------------------------------------
    print("=" * 70)
    print("VERIFICATION SUMMARY")
    print("=" * 70)
    verified_count = sum(1 for r in results if r['verification'].get('verified', False))
    avg_confidence = sum(r['verification'].get('overall_confidence', 0) for r in results) / len(results) if results else 0
    avg_name_match = sum(r['verification'].get('name_match_score', 0) for r in results) / len(results) if results else 0
    avg_content_relevance = sum(r['verification'].get('content_relevance', 0) for r in results) / len(results) if results else 0
    print(f"Files Verified: {verified_count}/{len(results)}")
    print(f"Average Overall Confidence: {avg_confidence:.2f}")
    print(f"Average Name Match Score: {avg_name_match:.2f}")
    print(f"Average Content Relevance: {avg_content_relevance:.2f}")
    # List any issues (capped at 5 to keep console output short).
    issues_found = [r for r in results if r['verification'].get('issues')]
    if issues_found:
        print(f"\nFiles with Issues: {len(issues_found)}")
        for r in issues_found[:5]:
            print(f" - {r['file']}: {r['verification']['issues']}")
    # --- Save results -----------------------------------------------------
    output_file = Path("reports/youtube_enrichment_verification_glm.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # BUGFIX: open with an explicit UTF-8 encoding — the original relied on
    # the platform default, which breaks non-ASCII output (ensure_ascii=False)
    # on Windows/cp1252.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({
            "verification_timestamp": datetime.now().isoformat(),
            "model": "glm-4.6",
            "convention": "ch_annotator-v1_7_0",
            "results": results,
            "summary": {
                "files_verified": verified_count,
                "total_files": len(results),
                "average_confidence": avg_confidence,
                "average_name_match": avg_name_match,
                "average_content_relevance": avg_content_relevance
            }
        }, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to: {output_file}")
if __name__ == "__main__":
main()