#!/usr/bin/env python3
|
|
"""
|
|
Verify enrichment quality using GLM-4.6 with CH-Annotator entity verification.
|
|
|
|
Uses the Z.AI Coding Plan API to validate:
|
|
1. Entity identity consistency across sources
|
|
2. Wikidata property accuracy
|
|
3. Cross-source verification (Wikidata <-> Google Maps)
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import yaml
|
|
import httpx
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
# Z.AI Coding Plan API configuration (per AGENTS.md Rule 11)
|
|
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
|
|
ZAI_API_TOKEN = os.environ.get("ZAI_API_TOKEN")
|
|
|
|
def load_custodian_file(filepath: Path) -> dict:
    """Read *filepath* as UTF-8 and return the parsed YAML custodian record."""
    text = filepath.read_text(encoding='utf-8')
    return yaml.safe_load(text)
def _extract_json(content: str) -> str:
    """Strip markdown code fences and surrounding prose from a model reply.

    Returns the best-effort JSON substring of *content*: fenced payloads are
    unfenced, then the text is sliced from the first '{' to the last '}' so
    leading/trailing prose around unfenced JSON does not break json.loads.
    """
    if '```json' in content:
        content = content.split('```json')[1].split('```')[0].strip()
    elif '```' in content:
        content = content.split('```')[1].split('```')[0].strip()
    # Fallback for replies that wrap bare JSON in prose; a no-op for clean JSON.
    start, end = content.find('{'), content.rfind('}')
    if start != -1 and end > start:
        content = content[start:end + 1]
    return content


def verify_with_glm(custodian: dict) -> dict:
    """Ask GLM-4.6 to score cross-source consistency of one custodian record.

    Args:
        custodian: Parsed custodian YAML. Missing or empty sections are
            tolerated and rendered as 'Unknown'/'N/A' in the prompt.

    Returns:
        The model's verification dict (name_consistency, overall_confidence,
        ch_annotator_type, issues, verified, ...) on success, or a dict with
        an "error" key, "verified": False and "overall_confidence": 0.0 on
        any failure (network/HTTP error, unparseable model output).
    """
    # Extract key fields. `or {}` guards against sections that are present
    # but empty in the YAML, which PyYAML loads as None rather than {}.
    name = (custodian.get('custodian_name') or {}).get('claim_value', 'Unknown')
    wikidata = custodian.get('wikidata_enrichment') or {}
    google_maps = custodian.get('google_maps_enrichment') or {}
    ghcid = (custodian.get('ghcid') or {}).get('ghcid_current', 'Unknown')
    inst_type = (custodian.get('custodian_type') or {}).get('claim_value', 'Unknown')

    # Build compact verification prompt
    prompt = f"""Verify heritage institution data. Return ONLY valid JSON.

INSTITUTION: {name}
GHCID: {ghcid}
TYPE: {inst_type}

WIKIDATA:
- ID: {wikidata.get('wikidata_entity_id', 'N/A')}
- Label: {wikidata.get('wikidata_label_en', 'N/A')}
- Inception: {wikidata.get('wikidata_inception', 'N/A')}
- Country: {wikidata.get('wikidata_country', 'N/A')}

GOOGLE_MAPS:
- Name: {google_maps.get('name', 'N/A')}
- Address: {google_maps.get('formatted_address', 'N/A')}
- Rating: {google_maps.get('rating', 'N/A')}

Respond with JSON:
{{"name_consistency": 0.0-1.0, "location_accuracy": 0.0-1.0, "type_consistency": 0.0-1.0, "temporal_plausibility": 0.0-1.0, "overall_confidence": 0.0-1.0, "ch_annotator_type": "GRP.HER.XXX", "issues": [], "verified": true}}"""

    try:
        response = httpx.post(
            ZAI_API_URL,
            headers={
                "Authorization": f"Bearer {ZAI_API_TOKEN}",
                "Content-Type": "application/json"
            },
            json={
                "model": "glm-4.6",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
                "max_tokens": 4000  # GLM-4.6 needs tokens for reasoning
            },
            timeout=180.0  # Allow more time for reasoning model
        )
        response.raise_for_status()

        result = response.json()
        content = result['choices'][0]['message']['content']

        return json.loads(_extract_json(content))

    except json.JSONDecodeError as e:
        return {
            "error": f"JSON parse error: {str(e)}",
            "verified": False,
            "overall_confidence": 0.0
        }
    except Exception as e:
        # Boundary catch-all: any network/HTTP/shape failure is reported as
        # an unverified result rather than crashing the batch run.
        return {
            "error": str(e),
            "verified": False,
            "overall_confidence": 0.0
        }
def main():
    """Main verification routine.

    Samples up to 10 custodian YAML files (2 per country pattern), asks
    GLM-4.6 to verify each, prints a per-file report and a summary, and
    writes full results to reports/enrichment_verification_glm.json.
    Exits with status 1 when the API token is missing or no files match.
    """
    if not ZAI_API_TOKEN:
        print("ERROR: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=<your_token>")
        sys.exit(1)

    # Find files with enrichment
    custodian_dir = Path("data/custodian")

    # Get a diverse sample: files with wikidata enrichment
    sample_files = []

    # Sample from different countries
    for pattern in ["NL-*.yaml", "FR-*.yaml", "JP-*.yaml", "BR-*.yaml", "DE-*.yaml"]:
        files = list(custodian_dir.glob(pattern))[:2]  # 2 from each country
        sample_files.extend(files)

    if not sample_files:
        print("No custodian files found!")
        sys.exit(1)

    sample_files = sample_files[:10]  # Limit to 10 files

    print("=" * 70)
    print("GLM-4.6 Enrichment Verification (CH-Annotator v1.7.0)")
    print("=" * 70)
    print(f"Timestamp: {datetime.now().isoformat()}")
    print(f"Files to verify: {len(sample_files)}")
    print()

    results = []

    for filepath in sample_files:
        print(f"[VERIFY] {filepath.name}")

        try:
            custodian = load_custodian_file(filepath)
        except Exception as e:
            print(f" ERROR loading: {e}")
            continue

        # `or {}` guards against sections that are present but empty in the
        # YAML, which PyYAML loads as None rather than {}.
        name = (custodian.get('custodian_name') or {}).get('claim_value', 'Unknown')
        ghcid = (custodian.get('ghcid') or {}).get('ghcid_current', 'Unknown')

        print(f" Name: {name}")
        print(f" GHCID: {ghcid}")

        verification = verify_with_glm(custodian)

        if 'error' in verification:
            print(f" ERROR: {verification['error']}")
        else:
            print(f" Overall Confidence: {verification.get('overall_confidence', 0):.2f}")
            print(f" CH-Annotator Type: {verification.get('ch_annotator_type', 'N/A')}")
            print(f" Verified: {'Y' if verification.get('verified') else 'N'}")

        results.append({
            "file": str(filepath),
            "name": name,
            "ghcid": ghcid,
            "verification": verification
        })
        print()

    # Summary
    print("=" * 70)
    print("VERIFICATION SUMMARY")
    print("=" * 70)

    verified_count = sum(1 for r in results if r['verification'].get('verified', False))
    avg_confidence = sum(r['verification'].get('overall_confidence', 0) for r in results) / len(results) if results else 0

    print(f"Files Verified: {verified_count}/{len(results)}")
    print(f"Average Confidence: {avg_confidence:.2f}")

    # Save results. parents=True so a missing reports/ tree is created in
    # full; UTF-8 + ensure_ascii=False keeps international institution
    # names readable in the JSON output.
    output_file = Path("reports/enrichment_verification_glm.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({
            "verification_timestamp": datetime.now().isoformat(),
            "model": "glm-4.6",
            "convention": "ch_annotator-v1_7_0",
            "results": results,
            "summary": {
                "files_verified": verified_count,
                "total_files": len(results),
                "average_confidence": avg_confidence
            }
        }, f, indent=2, ensure_ascii=False)

    print(f"\nResults saved to: {output_file}")
if __name__ == "__main__":
|
|
main()
|