#!/usr/bin/env python3
|
|
"""
|
|
Verify enrichment quality using GLM-4.6 with CH-Annotator entity verification.
|
|
|
|
Uses the Z.AI Coding Plan API to validate:
|
|
1. Entity identity consistency across sources
|
|
2. Wikidata property accuracy
|
|
3. Cross-source verification (Wikidata <-> Google Maps)
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import yaml
|
|
import httpx
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
# Z.AI Coding Plan API configuration (per AGENTS.md Rule 11)
|
|
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
|
|
ZAI_API_TOKEN = os.environ.get("ZAI_API_TOKEN")
|
|
|
|
def load_custodian_file(filepath: Path) -> dict:
    """Read *filepath* as UTF-8 and return the parsed YAML custodian record."""
    text = filepath.read_text(encoding='utf-8')
    return yaml.safe_load(text)
def _extract_json(content: str) -> str:
    """Strip markdown code fences and surrounding prose from a model reply.

    Returns the best-effort JSON substring of *content*: fenced payloads are
    unfenced, then the text is sliced from the first '{' to the last '}' so
    leading/trailing prose around unfenced JSON does not break json.loads.
    """
    if '```json' in content:
        content = content.split('```json')[1].split('```')[0].strip()
    elif '```' in content:
        content = content.split('```')[1].split('```')[0].strip()
    # Fallback for replies that wrap bare JSON in prose; a no-op for clean JSON.
    start, end = content.find('{'), content.rfind('}')
    if start != -1 and end > start:
        content = content[start:end + 1]
    return content


def verify_with_glm(custodian: dict) -> dict:
    """Ask GLM-4.6 to score cross-source consistency of one custodian record.

    Args:
        custodian: Parsed custodian YAML. Missing or empty sections are
            tolerated and rendered as 'Unknown'/'N/A' in the prompt.

    Returns:
        The model's verification dict (name_consistency, overall_confidence,
        ch_annotator_type, issues, verified, ...) on success, or a dict with
        an "error" key, "verified": False and "overall_confidence": 0.0 on
        any failure (network/HTTP error, unparseable model output).
    """
    # Extract key fields. `or {}` guards against sections that are present
    # but empty in the YAML, which PyYAML loads as None rather than {}.
    name = (custodian.get('custodian_name') or {}).get('claim_value', 'Unknown')
    wikidata = custodian.get('wikidata_enrichment') or {}
    google_maps = custodian.get('google_maps_enrichment') or {}
    ghcid = (custodian.get('ghcid') or {}).get('ghcid_current', 'Unknown')
    inst_type = (custodian.get('custodian_type') or {}).get('claim_value', 'Unknown')

    # Build compact verification prompt
    prompt = f"""Verify heritage institution data. Return ONLY valid JSON.

INSTITUTION: {name}
GHCID: {ghcid}
TYPE: {inst_type}

WIKIDATA:
- ID: {wikidata.get('wikidata_entity_id', 'N/A')}
- Label: {wikidata.get('wikidata_label_en', 'N/A')}
- Inception: {wikidata.get('wikidata_inception', 'N/A')}
- Country: {wikidata.get('wikidata_country', 'N/A')}

GOOGLE_MAPS:
- Name: {google_maps.get('name', 'N/A')}
- Address: {google_maps.get('formatted_address', 'N/A')}
- Rating: {google_maps.get('rating', 'N/A')}

Respond with JSON:
{{"name_consistency": 0.0-1.0, "location_accuracy": 0.0-1.0, "type_consistency": 0.0-1.0, "temporal_plausibility": 0.0-1.0, "overall_confidence": 0.0-1.0, "ch_annotator_type": "GRP.HER.XXX", "issues": [], "verified": true}}"""

    try:
        response = httpx.post(
            ZAI_API_URL,
            headers={
                "Authorization": f"Bearer {ZAI_API_TOKEN}",
                "Content-Type": "application/json"
            },
            json={
                "model": "glm-4.6",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
                "max_tokens": 4000  # GLM-4.6 needs tokens for reasoning
            },
            timeout=180.0  # Allow more time for reasoning model
        )
        response.raise_for_status()

        result = response.json()
        content = result['choices'][0]['message']['content']

        return json.loads(_extract_json(content))

    except json.JSONDecodeError as e:
        return {
            "error": f"JSON parse error: {str(e)}",
            "verified": False,
            "overall_confidence": 0.0
        }
    except Exception as e:
        # Boundary catch-all: any network/HTTP/shape failure is reported as
        # an unverified result rather than crashing the batch run.
        return {
            "error": str(e),
            "verified": False,
            "overall_confidence": 0.0
        }
def main():
    """Main verification routine.

    Samples up to 10 custodian YAML files (2 per country pattern), asks
    GLM-4.6 to verify each, prints a per-file report and a summary, and
    writes full results to reports/enrichment_verification_glm.json.
    Exits with status 1 when the API token is missing or no files match.
    """
    if not ZAI_API_TOKEN:
        print("ERROR: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=<your_token>")
        sys.exit(1)

    # Find files with enrichment
    custodian_dir = Path("data/custodian")

    # Get a diverse sample: files with wikidata enrichment
    sample_files = []

    # Sample from different countries
    for pattern in ["NL-*.yaml", "FR-*.yaml", "JP-*.yaml", "BR-*.yaml", "DE-*.yaml"]:
        files = list(custodian_dir.glob(pattern))[:2]  # 2 from each country
        sample_files.extend(files)

    if not sample_files:
        print("No custodian files found!")
        sys.exit(1)

    sample_files = sample_files[:10]  # Limit to 10 files

    print("=" * 70)
    print("GLM-4.6 Enrichment Verification (CH-Annotator v1.7.0)")
    print("=" * 70)
    print(f"Timestamp: {datetime.now().isoformat()}")
    print(f"Files to verify: {len(sample_files)}")
    print()

    results = []

    for filepath in sample_files:
        print(f"[VERIFY] {filepath.name}")

        try:
            custodian = load_custodian_file(filepath)
        except Exception as e:
            print(f" ERROR loading: {e}")
            continue

        # `or {}` guards against sections that are present but empty in the
        # YAML, which PyYAML loads as None rather than {}.
        name = (custodian.get('custodian_name') or {}).get('claim_value', 'Unknown')
        ghcid = (custodian.get('ghcid') or {}).get('ghcid_current', 'Unknown')

        print(f" Name: {name}")
        print(f" GHCID: {ghcid}")

        verification = verify_with_glm(custodian)

        if 'error' in verification:
            print(f" ERROR: {verification['error']}")
        else:
            print(f" Overall Confidence: {verification.get('overall_confidence', 0):.2f}")
            print(f" CH-Annotator Type: {verification.get('ch_annotator_type', 'N/A')}")
            print(f" Verified: {'Y' if verification.get('verified') else 'N'}")

        results.append({
            "file": str(filepath),
            "name": name,
            "ghcid": ghcid,
            "verification": verification
        })
        print()

    # Summary
    print("=" * 70)
    print("VERIFICATION SUMMARY")
    print("=" * 70)

    verified_count = sum(1 for r in results if r['verification'].get('verified', False))
    avg_confidence = sum(r['verification'].get('overall_confidence', 0) for r in results) / len(results) if results else 0

    print(f"Files Verified: {verified_count}/{len(results)}")
    print(f"Average Confidence: {avg_confidence:.2f}")

    # Save results. parents=True so a missing reports/ tree is created in
    # full; UTF-8 + ensure_ascii=False keeps international institution
    # names readable in the JSON output.
    output_file = Path("reports/enrichment_verification_glm.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({
            "verification_timestamp": datetime.now().isoformat(),
            "model": "glm-4.6",
            "convention": "ch_annotator-v1_7_0",
            "results": results,
            "summary": {
                "files_verified": verified_count,
                "total_files": len(results),
                "average_confidence": avg_confidence
            }
        }, f, indent=2, ensure_ascii=False)

    print(f"\nResults saved to: {output_file}")
if __name__ == "__main__":
|
|
main()
|