glam/scripts/verify_enrichment_glm.py
2025-12-09 07:56:35 +01:00

197 lines
6.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Verify enrichment quality using GLM-4.6 with CH-Annotator entity verification.
Uses the Z.AI Coding Plan API to validate:
1. Entity identity consistency across sources
2. Wikidata property accuracy
3. Cross-source verification (Wikidata <-> Google Maps)
"""
import os
import sys
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime
from typing import Optional
# Z.AI Coding Plan API configuration (per AGENTS.md Rule 11)
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
# Read once at import time; may be None — main() exits with a message when unset.
ZAI_API_TOKEN: Optional[str] = os.environ.get("ZAI_API_TOKEN")
def load_custodian_file(filepath: Path) -> dict:
    """Read *filepath* as UTF-8 text and parse it as a YAML custodian record."""
    raw_text = filepath.read_text(encoding='utf-8')
    return yaml.safe_load(raw_text)
def _extract_json_payload(content: str) -> str:
    """Return the JSON substring of a model reply.

    Handles three shapes of reply:
    1. A fenced ```json block — take the fenced content.
    2. A generic ``` fence — take the fenced content.
    3. Bare JSON, possibly surrounded by prose — fall back to the span
       between the first '{' and the last '}' (the original code had no
       fallback, so a prose preamble caused a spurious JSON parse error).
    """
    if '```json' in content:
        return content.split('```json')[1].split('```')[0].strip()
    if '```' in content:
        return content.split('```')[1].split('```')[0].strip()
    start = content.find('{')
    end = content.rfind('}')
    if start != -1 and end > start:
        return content[start:end + 1]
    return content


def verify_with_glm(custodian: dict) -> dict:
    """Use GLM-4.6 to verify entity consistency.

    Builds a compact prompt from the custodian record's name, GHCID, type,
    and Wikidata/Google Maps enrichment fields, posts it to the Z.AI API,
    and parses the model's JSON verdict.

    Returns the parsed verdict dict on success; on any failure returns a
    dict with keys ``error``, ``verified`` (False) and
    ``overall_confidence`` (0.0) so callers can treat both paths uniformly.
    """
    # Extract key fields; every lookup is defensive because enrichment
    # sections may be absent from a given custodian file.
    name = custodian.get('custodian_name', {}).get('claim_value', 'Unknown')
    wikidata = custodian.get('wikidata_enrichment', {})
    google_maps = custodian.get('google_maps_enrichment', {})
    ghcid = custodian.get('ghcid', {}).get('ghcid_current', 'Unknown')
    inst_type = custodian.get('custodian_type', {}).get('claim_value', 'Unknown')
    # Build compact verification prompt
    prompt = f"""Verify heritage institution data. Return ONLY valid JSON.
INSTITUTION: {name}
GHCID: {ghcid}
TYPE: {inst_type}
WIKIDATA:
- ID: {wikidata.get('wikidata_entity_id', 'N/A')}
- Label: {wikidata.get('wikidata_label_en', 'N/A')}
- Inception: {wikidata.get('wikidata_inception', 'N/A')}
- Country: {wikidata.get('wikidata_country', 'N/A')}
GOOGLE_MAPS:
- Name: {google_maps.get('name', 'N/A')}
- Address: {google_maps.get('formatted_address', 'N/A')}
- Rating: {google_maps.get('rating', 'N/A')}
Respond with JSON:
{{"name_consistency": 0.0-1.0, "location_accuracy": 0.0-1.0, "type_consistency": 0.0-1.0, "temporal_plausibility": 0.0-1.0, "overall_confidence": 0.0-1.0, "ch_annotator_type": "GRP.HER.XXX", "issues": [], "verified": true}}"""
    try:
        response = httpx.post(
            ZAI_API_URL,
            headers={
                "Authorization": f"Bearer {ZAI_API_TOKEN}",
                "Content-Type": "application/json"
            },
            json={
                "model": "glm-4.6",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
                "max_tokens": 4000  # GLM-4.6 needs tokens for reasoning
            },
            timeout=180.0  # Allow more time for reasoning model
        )
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content']
        # Strip markdown fences / prose before parsing the verdict.
        return json.loads(_extract_json_payload(content))
    except json.JSONDecodeError as e:
        # Model replied, but the payload was not parseable JSON.
        return {
            "error": f"JSON parse error: {str(e)}",
            "verified": False,
            "overall_confidence": 0.0
        }
    except Exception as e:
        # Network errors, HTTP status errors, or unexpected response shape.
        return {
            "error": str(e),
            "verified": False,
            "overall_confidence": 0.0
        }
def main():
    """Main verification routine.

    Samples up to 10 custodian YAML files (2 per country pattern), runs
    GLM-4.6 verification on each, prints a per-file and summary report,
    and writes the results to reports/enrichment_verification_glm.json.
    Exits with status 1 when the API token or input files are missing.
    """
    if not ZAI_API_TOKEN:
        print("ERROR: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=<your_token>")
        sys.exit(1)
    # Find files with enrichment
    custodian_dir = Path("data/custodian")
    # Get a diverse sample: files with wikidata enrichment
    sample_files = []
    # Sample from different countries. glob() order is filesystem-dependent,
    # so sort for a reproducible sample across runs.
    for pattern in ["NL-*.yaml", "FR-*.yaml", "JP-*.yaml", "BR-*.yaml", "DE-*.yaml"]:
        files = sorted(custodian_dir.glob(pattern))[:2]  # 2 from each country
        sample_files.extend(files)
    if not sample_files:
        print("No custodian files found!")
        sys.exit(1)
    sample_files = sample_files[:10]  # Limit to 10 files
    print("=" * 70)
    print("GLM-4.6 Enrichment Verification (CH-Annotator v1.7.0)")
    print("=" * 70)
    print(f"Timestamp: {datetime.now().isoformat()}")
    print(f"Files to verify: {len(sample_files)}")
    print()
    results = []
    for filepath in sample_files:
        print(f"[VERIFY] {filepath.name}")
        try:
            custodian = load_custodian_file(filepath)
        except Exception as e:
            # Unreadable/invalid YAML: report and keep going with the rest.
            print(f"  ERROR loading: {e}")
            continue
        name = custodian.get('custodian_name', {}).get('claim_value', 'Unknown')
        ghcid = custodian.get('ghcid', {}).get('ghcid_current', 'Unknown')
        print(f"  Name: {name}")
        print(f"  GHCID: {ghcid}")
        verification = verify_with_glm(custodian)
        if 'error' in verification:
            print(f"  ERROR: {verification['error']}")
        else:
            print(f"  Overall Confidence: {verification.get('overall_confidence', 0):.2f}")
            print(f"  CH-Annotator Type: {verification.get('ch_annotator_type', 'N/A')}")
            print(f"  Verified: {'Y' if verification.get('verified') else 'N'}")
        results.append({
            "file": str(filepath),
            "name": name,
            "ghcid": ghcid,
            "verification": verification
        })
        print()
    # Summary
    print("=" * 70)
    print("VERIFICATION SUMMARY")
    print("=" * 70)
    verified_count = sum(1 for r in results if r['verification'].get('verified', False))
    # Guard against division by zero when every file failed to load.
    avg_confidence = sum(r['verification'].get('overall_confidence', 0) for r in results) / len(results) if results else 0
    print(f"Files Verified: {verified_count}/{len(results)}")
    print(f"Average Confidence: {avg_confidence:.2f}")
    # Save results. parents=True so a missing reports/ tree is created in full;
    # explicit UTF-8 + ensure_ascii=False keep non-ASCII institution names readable.
    output_file = Path("reports/enrichment_verification_glm.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({
            "verification_timestamp": datetime.now().isoformat(),
            "model": "glm-4.6",
            "convention": "ch_annotator-v1_7_0",
            "results": results,
            "summary": {
                "files_verified": verified_count,
                "total_files": len(results),
                "average_confidence": avg_confidence
            }
        }, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to: {output_file}")


if __name__ == "__main__":
    main()