#!/usr/bin/env python3
"""
Verify YouTube enrichment quality using GLM-4.6 with CH-Annotator entity verification.

Uses the Z.AI Coding Plan API to validate:
1. Channel name consistency with custodian name
2. Content relevance to heritage domain
3. Video count and subscriber data plausibility
4. Channel description alignment with institution type

Per AGENTS.md Rule 11: Use Z.AI Coding Plan endpoint, NOT BigModel API.
"""
|
|
|
|
import argparse
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import List, Optional

import httpx
import yaml
|
|
# Z.AI Coding Plan API configuration (per AGENTS.md Rule 11)
|
|
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
|
|
ZAI_API_TOKEN = os.environ.get("ZAI_API_TOKEN")
|
|
|
|
|
|
def load_custodian_file(filepath: Path) -> dict:
|
|
"""Load a YAML custodian file."""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
return yaml.safe_load(f)
|
|
|
|
|
|
def find_youtube_enriched_files(custodian_dir: Path, limit: int = None) -> List[Path]:
|
|
"""Find files that have youtube_enrichment section."""
|
|
files = []
|
|
for filepath in custodian_dir.glob("*.yaml"):
|
|
try:
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
if 'youtube_enrichment:' in content and 'status: SUCCESS' in content:
|
|
files.append(filepath)
|
|
if limit and len(files) >= limit:
|
|
break
|
|
except Exception:
|
|
pass
|
|
return files
|
|
|
|
|
|
def verify_youtube_with_glm(custodian: dict) -> dict:
|
|
"""Use GLM-4.6 to verify YouTube enrichment quality and relevance."""
|
|
|
|
# Extract key fields
|
|
custodian_name = custodian.get('custodian_name', {})
|
|
if isinstance(custodian_name, dict):
|
|
name = custodian_name.get('claim_value', custodian_name.get('emic_name', 'Unknown'))
|
|
else:
|
|
name = str(custodian_name) if custodian_name else 'Unknown'
|
|
|
|
ghcid = custodian.get('ghcid', {}).get('ghcid_current', 'Unknown')
|
|
inst_type = custodian.get('custodian_type', {})
|
|
if isinstance(inst_type, dict):
|
|
inst_type = inst_type.get('claim_value', 'Unknown')
|
|
else:
|
|
inst_type = str(inst_type) if inst_type else 'Unknown'
|
|
|
|
youtube = custodian.get('youtube_enrichment', {})
|
|
|
|
# Get sample video titles
|
|
videos = youtube.get('videos', [])[:5]
|
|
video_titles = [v.get('title', '')[:50] for v in videos]
|
|
|
|
# Build compact verification prompt
|
|
prompt = f"""Verify YouTube channel enrichment for heritage institution. Return ONLY valid JSON.
|
|
|
|
INSTITUTION:
|
|
- Name: {name}
|
|
- GHCID: {ghcid}
|
|
- Type: {inst_type}
|
|
|
|
YOUTUBE CHANNEL:
|
|
- Channel Title: {youtube.get('title', 'N/A')}
|
|
- Subscribers: {youtube.get('subscriber_count', 'N/A')}
|
|
- Video Count: {youtube.get('video_count', 'N/A')}
|
|
- Total Views: {youtube.get('view_count', 'N/A')}
|
|
- Description: {(youtube.get('description', '') or '')[:200]}
|
|
|
|
SAMPLE VIDEOS: {video_titles}
|
|
|
|
Verify:
|
|
1. Does channel title match/relate to institution name?
|
|
2. Is channel content relevant to heritage/cultural domain?
|
|
3. Are subscriber/video counts plausible for this institution type?
|
|
4. Do video titles relate to heritage/cultural activities?
|
|
|
|
CH-Annotator entity types for heritage: GRP.HER.MUS (museum), GRP.HER.LIB (library), GRP.HER.ARC (archive), GRP.HER.GAL (gallery)
|
|
|
|
Respond with JSON only:
|
|
{{"name_match_score": 0.0-1.0, "content_relevance": 0.0-1.0, "data_plausibility": 0.0-1.0, "video_relevance": 0.0-1.0, "overall_confidence": 0.0-1.0, "ch_annotator_type": "GRP.HER.XXX", "issues": [], "verified": true}}"""
|
|
|
|
try:
|
|
response = httpx.post(
|
|
ZAI_API_URL,
|
|
headers={
|
|
"Authorization": f"Bearer {ZAI_API_TOKEN}",
|
|
"Content-Type": "application/json"
|
|
},
|
|
json={
|
|
"model": "glm-4.6",
|
|
"messages": [{"role": "user", "content": prompt}],
|
|
"temperature": 0.1,
|
|
"max_tokens": 4000
|
|
},
|
|
timeout=180.0
|
|
)
|
|
response.raise_for_status()
|
|
|
|
result = response.json()
|
|
content = result['choices'][0]['message']['content']
|
|
|
|
# Extract JSON from response
|
|
if '```json' in content:
|
|
content = content.split('```json')[1].split('```')[0].strip()
|
|
elif '```' in content:
|
|
content = content.split('```')[1].split('```')[0].strip()
|
|
|
|
return json.loads(content)
|
|
|
|
except json.JSONDecodeError as e:
|
|
return {
|
|
"error": f"JSON parse error: {str(e)}",
|
|
"verified": False,
|
|
"overall_confidence": 0.0
|
|
}
|
|
except Exception as e:
|
|
return {
|
|
"error": str(e),
|
|
"verified": False,
|
|
"overall_confidence": 0.0
|
|
}
|
|
|
|
|
|
def main():
|
|
"""Main verification routine."""
|
|
|
|
parser = argparse.ArgumentParser(description="Verify YouTube enrichment with GLM-4.6")
|
|
parser.add_argument("--limit", type=int, default=20, help="Number of files to verify")
|
|
parser.add_argument("--file", type=str, help="Verify a specific file")
|
|
parser.add_argument("--recent", action="store_true", help="Verify most recently modified files")
|
|
args = parser.parse_args()
|
|
|
|
if not ZAI_API_TOKEN:
|
|
print("ERROR: ZAI_API_TOKEN not set")
|
|
print("Set it with: export ZAI_API_TOKEN=<your_token>")
|
|
sys.exit(1)
|
|
|
|
custodian_dir = Path("data/custodian")
|
|
|
|
if args.file:
|
|
filepath = Path(args.file)
|
|
if not filepath.exists():
|
|
filepath = custodian_dir / args.file
|
|
sample_files = [filepath] if filepath.exists() else []
|
|
else:
|
|
# Find files with YouTube enrichment
|
|
sample_files = find_youtube_enriched_files(custodian_dir, limit=args.limit * 2)
|
|
|
|
if args.recent:
|
|
# Sort by modification time (most recent first)
|
|
sample_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)
|
|
|
|
sample_files = sample_files[:args.limit]
|
|
|
|
if not sample_files:
|
|
print("No YouTube-enriched custodian files found!")
|
|
sys.exit(1)
|
|
|
|
print("=" * 70)
|
|
print("GLM-4.6 YouTube Enrichment Verification (CH-Annotator v1.7.0)")
|
|
print("=" * 70)
|
|
print(f"Timestamp: {datetime.now().isoformat()}")
|
|
print(f"Files to verify: {len(sample_files)}")
|
|
print()
|
|
|
|
results = []
|
|
|
|
for i, filepath in enumerate(sample_files, 1):
|
|
print(f"[{i}/{len(sample_files)}] {filepath.name}")
|
|
|
|
try:
|
|
custodian = load_custodian_file(filepath)
|
|
except Exception as e:
|
|
print(f" ERROR loading: {e}")
|
|
continue
|
|
|
|
# Get names for display
|
|
custodian_name = custodian.get('custodian_name', {})
|
|
if isinstance(custodian_name, dict):
|
|
name = custodian_name.get('claim_value', custodian_name.get('emic_name', 'Unknown'))
|
|
else:
|
|
name = str(custodian_name) if custodian_name else 'Unknown'
|
|
|
|
youtube = custodian.get('youtube_enrichment', {})
|
|
channel_title = youtube.get('title', 'N/A')
|
|
subscriber_count = youtube.get('subscriber_count', 0)
|
|
video_count = youtube.get('video_count', 0)
|
|
|
|
print(f" Custodian: {name}")
|
|
print(f" Channel: {channel_title}")
|
|
print(f" Subscribers: {subscriber_count:,}" if subscriber_count else " Subscribers: Hidden")
|
|
print(f" Videos: {video_count}")
|
|
|
|
verification = verify_youtube_with_glm(custodian)
|
|
|
|
if 'error' in verification:
|
|
print(f" ERROR: {verification['error']}")
|
|
else:
|
|
print(f" Name Match: {verification.get('name_match_score', 0):.2f}")
|
|
print(f" Content Relevance: {verification.get('content_relevance', 0):.2f}")
|
|
print(f" Overall Confidence: {verification.get('overall_confidence', 0):.2f}")
|
|
print(f" CH-Annotator Type: {verification.get('ch_annotator_type', 'N/A')}")
|
|
print(f" Verified: {'Y' if verification.get('verified') else 'N'}")
|
|
if verification.get('issues'):
|
|
print(f" Issues: {verification['issues']}")
|
|
|
|
results.append({
|
|
"file": str(filepath),
|
|
"custodian_name": name,
|
|
"channel_title": channel_title,
|
|
"subscriber_count": subscriber_count,
|
|
"video_count": video_count,
|
|
"verification": verification
|
|
})
|
|
print()
|
|
|
|
# Small delay to avoid rate limiting
|
|
time.sleep(0.5)
|
|
|
|
# Summary
|
|
print("=" * 70)
|
|
print("VERIFICATION SUMMARY")
|
|
print("=" * 70)
|
|
|
|
verified_count = sum(1 for r in results if r['verification'].get('verified', False))
|
|
avg_confidence = sum(r['verification'].get('overall_confidence', 0) for r in results) / len(results) if results else 0
|
|
avg_name_match = sum(r['verification'].get('name_match_score', 0) for r in results) / len(results) if results else 0
|
|
avg_content_relevance = sum(r['verification'].get('content_relevance', 0) for r in results) / len(results) if results else 0
|
|
|
|
print(f"Files Verified: {verified_count}/{len(results)}")
|
|
print(f"Average Overall Confidence: {avg_confidence:.2f}")
|
|
print(f"Average Name Match Score: {avg_name_match:.2f}")
|
|
print(f"Average Content Relevance: {avg_content_relevance:.2f}")
|
|
|
|
# List any issues
|
|
issues_found = [r for r in results if r['verification'].get('issues')]
|
|
if issues_found:
|
|
print(f"\nFiles with Issues: {len(issues_found)}")
|
|
for r in issues_found[:5]:
|
|
print(f" - {r['file']}: {r['verification']['issues']}")
|
|
|
|
# Save results
|
|
output_file = Path("reports/youtube_enrichment_verification_glm.json")
|
|
output_file.parent.mkdir(exist_ok=True)
|
|
with open(output_file, 'w') as f:
|
|
json.dump({
|
|
"verification_timestamp": datetime.now().isoformat(),
|
|
"model": "glm-4.6",
|
|
"convention": "ch_annotator-v1_7_0",
|
|
"results": results,
|
|
"summary": {
|
|
"files_verified": verified_count,
|
|
"total_files": len(results),
|
|
"average_confidence": avg_confidence,
|
|
"average_name_match": avg_name_match,
|
|
"average_content_relevance": avg_content_relevance
|
|
}
|
|
}, f, indent=2, ensure_ascii=False)
|
|
|
|
print(f"\nResults saved to: {output_file}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|