#!/usr/bin/env python3
"""
Verify enrichment quality using GLM-4.6 with CH-Annotator entity verification.

Uses the Z.AI Coding Plan API to validate:
1. Entity identity consistency across sources
2. Wikidata property accuracy
3. Cross-source verification (Wikidata <-> Google Maps)
"""

import os
import sys
import json
import yaml
import httpx
from pathlib import Path
from datetime import datetime

# Z.AI Coding Plan API configuration (per AGENTS.md Rule 11)
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_API_TOKEN = os.environ.get("ZAI_API_TOKEN")


def load_custodian_file(filepath: Path) -> dict:
    """Load and parse a YAML custodian file.

    Args:
        filepath: Path to the custodian YAML document.

    Returns:
        The parsed YAML root (expected to be a mapping/dict).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def _extract_json_payload(content: str) -> str:
    """Strip an optional Markdown code fence from a model reply.

    GLM replies sometimes wrap the JSON in ```json ... ``` or plain
    ``` ... ``` fences; return only the fenced interior when present.
    """
    if '```json' in content:
        return content.split('```json', 1)[1].split('```', 1)[0].strip()
    if '```' in content:
        # Take the text between the first pair of fences; if there is only
        # one stray fence, fall back to everything after it rather than
        # raising IndexError.
        parts = content.split('```')
        return (parts[1] if len(parts) > 1 else parts[0]).strip()
    return content


def verify_with_glm(custodian: dict) -> dict:
    """Use GLM-4.6 to verify entity consistency for one custodian record.

    Builds a compact prompt from the custodian's Wikidata and Google Maps
    enrichment blocks and asks the model for a JSON scorecard.

    Args:
        custodian: Parsed custodian YAML document.

    Returns:
        The model's verification dict on success; on any failure a dict
        with keys ``error``, ``verified`` (False) and
        ``overall_confidence`` (0.0) so callers can handle it uniformly.
    """
    # Extract key fields (all are optional in the source YAML, hence .get chains)
    name = custodian.get('custodian_name', {}).get('claim_value', 'Unknown')
    wikidata = custodian.get('wikidata_enrichment', {})
    google_maps = custodian.get('google_maps_enrichment', {})
    ghcid = custodian.get('ghcid', {}).get('ghcid_current', 'Unknown')
    inst_type = custodian.get('custodian_type', {}).get('claim_value', 'Unknown')

    # Build compact verification prompt
    prompt = f"""Verify heritage institution data. Return ONLY valid JSON.

INSTITUTION: {name}
GHCID: {ghcid}
TYPE: {inst_type}

WIKIDATA:
- ID: {wikidata.get('wikidata_entity_id', 'N/A')}
- Label: {wikidata.get('wikidata_label_en', 'N/A')}
- Inception: {wikidata.get('wikidata_inception', 'N/A')}
- Country: {wikidata.get('wikidata_country', 'N/A')}

GOOGLE_MAPS:
- Name: {google_maps.get('name', 'N/A')}
- Address: {google_maps.get('formatted_address', 'N/A')}
- Rating: {google_maps.get('rating', 'N/A')}

Respond with JSON: {{"name_consistency": 0.0-1.0, "location_accuracy": 0.0-1.0, "type_consistency": 0.0-1.0, "temporal_plausibility": 0.0-1.0, "overall_confidence": 0.0-1.0, "ch_annotator_type": "GRP.HER.XXX", "issues": [], "verified": true}}"""

    try:
        response = httpx.post(
            ZAI_API_URL,
            headers={
                "Authorization": f"Bearer {ZAI_API_TOKEN}",
                "Content-Type": "application/json"
            },
            json={
                "model": "glm-4.6",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
                "max_tokens": 4000  # GLM-4.6 needs tokens for reasoning
            },
            timeout=180.0  # Allow more time for reasoning model
        )
        response.raise_for_status()

        result = response.json()
        content = result['choices'][0]['message']['content']

        return json.loads(_extract_json_payload(content))

    except json.JSONDecodeError as e:
        return {
            "error": f"JSON parse error: {str(e)}",
            "verified": False,
            "overall_confidence": 0.0
        }
    except Exception as e:
        # Broad by design: network errors, HTTP errors, and malformed
        # responses are all reported as a failed verification, not a crash.
        return {
            "error": str(e),
            "verified": False,
            "overall_confidence": 0.0
        }


def main():
    """Main verification routine.

    Samples up to 10 custodian files across countries, verifies each with
    GLM-4.6, prints a per-file report, and writes a JSON summary to
    ``reports/enrichment_verification_glm.json``.
    """
    if not ZAI_API_TOKEN:
        print("ERROR: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=")
        sys.exit(1)

    # Find files with enrichment
    custodian_dir = Path("data/custodian")

    # Get a diverse sample: files with wikidata enrichment
    sample_files = []

    # Sample from different countries
    for pattern in ["NL-*.yaml", "FR-*.yaml", "JP-*.yaml", "BR-*.yaml", "DE-*.yaml"]:
        files = list(custodian_dir.glob(pattern))[:2]  # 2 from each country
        sample_files.extend(files)

    if not sample_files:
        print("No custodian files found!")
        sys.exit(1)

    sample_files = sample_files[:10]  # Limit to 10 files

    print("=" * 70)
    print("GLM-4.6 Enrichment Verification (CH-Annotator v1.7.0)")
    print("=" * 70)
    print(f"Timestamp: {datetime.now().isoformat()}")
    print(f"Files to verify: {len(sample_files)}")
    print()

    results = []

    for filepath in sample_files:
        print(f"[VERIFY] {filepath.name}")

        try:
            custodian = load_custodian_file(filepath)
        except Exception as e:
            print(f"  ERROR loading: {e}")
            continue

        name = custodian.get('custodian_name', {}).get('claim_value', 'Unknown')
        ghcid = custodian.get('ghcid', {}).get('ghcid_current', 'Unknown')

        print(f"  Name: {name}")
        print(f"  GHCID: {ghcid}")

        verification = verify_with_glm(custodian)

        if 'error' in verification:
            print(f"  ERROR: {verification['error']}")
        else:
            print(f"  Overall Confidence: {verification.get('overall_confidence', 0):.2f}")
            print(f"  CH-Annotator Type: {verification.get('ch_annotator_type', 'N/A')}")
            print(f"  Verified: {'Y' if verification.get('verified') else 'N'}")

        results.append({
            "file": str(filepath),
            "name": name,
            "ghcid": ghcid,
            "verification": verification
        })
        print()

    # Summary
    print("=" * 70)
    print("VERIFICATION SUMMARY")
    print("=" * 70)

    verified_count = sum(1 for r in results if r['verification'].get('verified', False))
    avg_confidence = sum(r['verification'].get('overall_confidence', 0) for r in results) / len(results) if results else 0

    print(f"Files Verified: {verified_count}/{len(results)}")
    print(f"Average Confidence: {avg_confidence:.2f}")

    # Save results
    output_file = Path("reports/enrichment_verification_glm.json")
    # parents=True: create the full directory chain, not just the last level
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({
            "verification_timestamp": datetime.now().isoformat(),
            "model": "glm-4.6",
            "convention": "ch_annotator-v1_7_0",
            "results": results,
            "summary": {
                "files_verified": verified_count,
                "total_files": len(results),
                "average_confidence": avg_confidence
            }
        }, f, indent=2, ensure_ascii=False)  # keep non-ASCII institution names readable

    print(f"\nResults saved to: {output_file}")


if __name__ == "__main__":
    main()