glam/scripts/test_pico_arabic_waqf.py
kempersc 505c12601a Add test script for PiCo extraction from Arabic waqf documents
- Implemented a new script `test_pico_arabic_waqf.py` to test the GLM annotator's ability to extract person observations from Arabic historical documents.
- The script includes environment variable handling for API token, structured prompts for the GLM API, and validation of extraction results.
- Added comprehensive logging for API responses, extraction results, and validation errors.
- Included a sample Arabic waqf text for testing purposes, following the PiCo ontology pattern.
2025-12-12 17:50:17 +01:00

472 lines
17 KiB
Python

#!/usr/bin/env python3
"""
Test PiCo extraction with Arabic waqf (endowment) document example.
This script tests the GLM annotator's ability to extract person observations
from Arabic historical documents following the PiCo ontology pattern.
Usage:
python scripts/test_pico_arabic_waqf.py
Environment Variables:
ZAI_API_TOKEN - Required for Z.AI GLM-4.6 API
"""
import asyncio
import json
import os
import sys
from pathlib import Path
from datetime import datetime, timezone
import httpx
# Load environment variables from a .env file at the project root so the
# ZAI_API_TOKEN secret can be kept out of the shell environment.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
try:
    from dotenv import load_dotenv
    load_dotenv(project_root / ".env")
except ImportError:
    pass  # dotenv not required if env vars set directly
# Z.AI API configuration (per AGENTS.md Rule 11)
# GLM-4.6 uses reasoning mode - essential for complex historical document extraction
# Requires higher max_tokens to accommodate reasoning + output
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_MODEL = "glm-4.6"
# Arabic waqf document example (from pico.yaml).
# Gist of the Arabic text below: the late al-Hajj Ahmad ibn Muhammad
# al-'Umari, a merchant of Aleppo, son of the late Muhammad ibn Abd Allah
# al-'Umari, endows his house in the al-Jadida quarter to his descendants
# (and, should they die out, to the poor of the Muslims); witnessed by
# al-Hajj Ibrahim ibn Yusuf al-Turkmani and al-Sayyid Ali ibn Husayn
# al-Halabi; written in Rajab 1225 AH.
ARABIC_WAQF_TEXT = """بسم الله الرحمن الرحيم
هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة
حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري. وقف جميع داره الكائنة
بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح
الله فعلى فقراء المسلمين. وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف
التركماني، والسيد علي بن حسين الحلبي. وكتب في شهر رجب سنة ألف ومائتين
وخمس وعشرين هجرية."""
# PiCo extraction system prompt (abbreviated version for testing).
# NOTE: the JSON skeleton inside this string is prompt data, not code — keep
# its keys in sync with what validate_extraction() checks for.
PICO_SYSTEM_PROMPT = """You are a historical document annotator following the PiCo (Person in Context) ontology.
Extract ALL persons mentioned in the source text, capturing:
1. Names using PNV (Person Name Vocabulary) structure
2. Roles in the source document
3. Biographical information
4. Family relationships between persons in THIS source
5. For Arabic texts: include both original script AND romanized versions
### Arabic Naming Conventions
- ابن/بن (ibn/bin): son of (patronymic)
- بنت (bint): daughter of
- الحاج (al-Hajj): honorific for pilgrimage completer
- السيد (al-Sayyid): honorific (descendant of Prophet)
- المرحوم (al-marhum): the late (deceased male)
- آل (Al): family of
### Family Relationship Keys
- parent: array of person references (person_index + target_name)
- children: array of person references
- spouse: array of person references
### Output Format
Return ONLY valid JSON:
{
"pico_observation": {
"observation_id": "<source-derived-id>",
"observed_at": "<ISO-timestamp>",
"source_type": "<category>",
"source_reference": "<identifier>"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"literalName": "Name in original script",
"literalName_romanized": "Romanized name",
"givenName": "Given name",
"givenName_romanized": "Romanized given name",
"patronym": "Father's name",
"patronym_romanized": "Romanized patronym",
"baseSurname": "Family/tribal name",
"baseSurname_romanized": "Romanized surname",
"honorificPrefix": "Title/honorific",
"honorificPrefix_romanized": "Romanized honorific"
},
"roles": [
{
"role_title": "Role as stated",
"role_title_romanized": "Romanized role",
"role_in_source": "founder|witness|beneficiary|null"
}
],
"biographical": {
"deceased": true/false/null,
"address": "Location if mentioned"
},
"family_relationships": {
"parent": [{"person_index": N, "target_name": "Name"}],
"children": [{"person_index": N, "target_name": "Name"}]
},
"context": "Brief description of person's role"
}
],
"temporal_references": [
{
"expression": "Original text",
"expression_romanized": "Romanized",
"normalized": "ISO date or approximate",
"calendar": "Hijri|Gregorian",
"type": "DATE"
}
],
"locations_mentioned": [
{
"name": "Original name",
"name_romanized": "Romanized",
"type": "city|neighborhood"
}
]
}"""
def _strip_code_fences(content: str) -> str:
    """Return the JSON payload embedded in a model reply, minus markdown fences.

    GLM replies often wrap the JSON in ```json ... ``` or bare ``` ... ```.
    The previous parsing matched only a lowercase "```json" tag, so a variant
    like "```JSON" left the language tag glued to the payload and broke
    json.loads; this skips the tag case-insensitively.
    """
    fence = content.find("```")
    if fence == -1:
        return content  # no fences; assume the whole reply is JSON
    start = fence + 3
    # Skip an optional "json" language tag right after the opening fence.
    if content[start:start + 4].lower() == "json":
        start += 4
    closing = content.find("```", start)
    # If the closing fence was truncated away, take everything after the tag.
    return content[start:closing] if closing != -1 else content[start:]


async def call_glm_api(system_prompt: str, user_content: str) -> dict:
    """Call the Z.AI GLM-4.6 chat-completions API and return parsed JSON.

    Args:
        system_prompt: System message steering the extraction.
        user_content: User message containing the document to annotate.

    Returns:
        The model reply parsed as a dict, after stripping markdown code
        fences and, if needed, repairing truncated JSON.

    Raises:
        ValueError: ZAI_API_TOKEN is not set in the environment.
        httpx.HTTPStatusError: the API returned a non-2xx status.
        json.JSONDecodeError: the reply is unparseable even after repair.

    Side effect: saves the raw model output under
    data/entity_annotation/test_outputs/ for debugging.
    """
    api_token = os.environ.get("ZAI_API_TOKEN")
    if not api_token:
        raise ValueError("ZAI_API_TOKEN not set in environment")
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": ZAI_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        "temperature": 0.1,  # Low temperature for consistent extraction
        "max_tokens": 16000,  # High limit for GLM-4.6 reasoning mode + output
    }
    # 5 min timeout: GLM-4.6 reasoning mode can take well over a minute.
    async with httpx.AsyncClient(timeout=300.0) as client:
        response = await client.post(ZAI_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        result = response.json()
    content = result["choices"][0]["message"]["content"]
    # Save raw response for debugging
    raw_output_path = project_root / "data/entity_annotation/test_outputs"
    raw_output_path.mkdir(parents=True, exist_ok=True)
    raw_file = raw_output_path / f"raw_response_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
    raw_file.write_text(content, encoding='utf-8')
    print(f" Raw response saved to: {raw_file.name}")
    # Parse JSON from response (handle markdown code blocks), with a
    # best-effort repair pass for replies cut off by the max_tokens limit.
    json_content = _strip_code_fences(content).strip()
    try:
        return json.loads(json_content)
    except json.JSONDecodeError as e:
        print(f"\n⚠️ JSON parse error at position {e.pos}, attempting repair...")
        return json.loads(repair_truncated_json(json_content))
def repair_truncated_json(json_str: str) -> str:
    """Attempt to repair JSON truncated by a max_tokens cutoff.

    Strategy:
      1. Scan the text with a string/escape-aware tokenizer, keeping a stack
         of unclosed '{'/'[' openers.  (The previous str.count() approach
         miscounted braces appearing inside string values, and closing with
         flat counts produced wrongly ordered closers for mixed nesting,
         e.g. '{"a": [{"b": 1' became the invalid '...1]}}'.)
      2. Close a dangling string literal if the cutoff landed inside one.
      3. If the cutoff landed mid-value (a ':' after the last ','), drop back
         to the last complete element — same heuristic as before.
      4. Append the missing closers in correct (reverse-nesting) order.

    Best-effort only: pathological cut points can still yield invalid JSON,
    in which case json.loads raises at the call site.
    """

    def _scan(s: str) -> tuple[list, bool]:
        """Return (stack of unclosed openers, ends-inside-string flag)."""
        stack = []
        in_string = escaped = False
        for ch in s:
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch in '{[':
                stack.append(ch)
            elif ch in '}]':
                if stack:
                    stack.pop()
        return stack, in_string

    json_str = json_str.rstrip()
    stack, in_string = _scan(json_str)
    if in_string:
        json_str += '"'  # close the dangling string literal
    # Truncated mid-value? Fall back to the last complete key/element.
    last_comma = json_str.rfind(',')
    last_colon = json_str.rfind(':')
    if last_colon > last_comma > 0:
        json_str = json_str[:last_comma]
        stack, _ = _scan(json_str)
    json_str = json_str.rstrip()
    # Remove trailing comma if present
    if json_str.endswith(','):
        json_str = json_str[:-1]
    # Close whatever is still open, innermost first.
    json_str += ''.join('}' if opener == '{' else ']' for opener in reversed(stack))
    return json_str
def validate_extraction(result: dict) -> tuple[bool, list[str]]:
    """Check an extraction result against the expected waqf-document shape.

    Returns (is_valid, errors): is_valid is True when every check passed,
    and errors holds one human-readable message per failed check.
    """
    errors: list[str] = []

    # Required top-level keys.
    for required in ("pico_observation", "persons"):
        if required not in result:
            errors.append(f"Missing '{required}' field")

    if "persons" in result:
        persons = result["persons"]
        # The waqf names at least four people: founder, his father, two witnesses.
        if len(persons) < 4:
            errors.append(f"Expected at least 4 persons, got {len(persons)}")
        # Per-person structural checks.
        for i, person in enumerate(persons):
            if "person_index" not in person:
                errors.append(f"Person {i}: missing 'person_index'")
            if "pnv_name" not in person:
                errors.append(f"Person {i}: missing 'pnv_name'")
            elif "literalName" not in person["pnv_name"]:
                errors.append(f"Person {i}: missing 'literalName' in pnv_name")
        # Romanized names must include the document's key actors.
        romanized = [
            p.get("pnv_name", {}).get("literalName_romanized", "").lower()
            for p in persons
        ]
        expected_actors = (
            ("ahmad", "Missing founder: Ahmad ibn Muhammad al-'Umari"),
            ("ibrahim", "Missing witness: Ibrahim ibn Yusuf al-Turkmani"),
            ("ali", "Missing witness: Ali ibn Husayn al-Halabi"),
        )
        for needle, message in expected_actors:
            if not any(needle in name for name in romanized):
                errors.append(message)

    # The document is dated in the Islamic (Hijri) calendar.
    if "temporal_references" in result and result["temporal_references"]:
        first_ref = result["temporal_references"][0]
        if "calendar" in first_ref and first_ref["calendar"] != "Hijri":
            errors.append(f"Expected Hijri calendar, got {first_ref.get('calendar')}")

    # Aleppo (حلب) must appear among the romanized location names.
    if "locations_mentioned" in result:
        place_names = [
            loc.get("name_romanized", "").lower()
            for loc in result["locations_mentioned"]
        ]
        if not any("aleppo" in n or "halab" in n for n in place_names):
            errors.append("Missing location: Aleppo (حلب)")

    return not errors, errors
async def test_arabic_waqf_extraction():
    """Test PiCo extraction from Arabic waqf document.

    Returns:
        None  if skipped (ZAI_API_TOKEN not set),
        False if the API call or JSON parsing failed,
        bool  otherwise: True when all validations passed.

    Side effects: prints a full report to stdout and saves the parsed
    extraction JSON under data/entity_annotation/test_outputs/.
    """
    print("\n" + "=" * 70)
    print("TEST: PiCo Arabic Waqf Document Extraction")
    print("=" * 70)
    # Check API token — skip (not fail) when it is absent.
    if not os.environ.get("ZAI_API_TOKEN"):
        print("\n⚠️ SKIPPED: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=<your_token>")
        return None
    print(f"\nModel: {ZAI_MODEL}")
    print(f"API: {ZAI_API_URL}")
    # Prepare user prompt (the waqf text is interpolated into the message).
    user_prompt = f"""Extract all persons, relationships, dates, and locations from this Arabic waqf (endowment) document:
{ARABIC_WAQF_TEXT}
This is a historical Islamic endowment document from Aleppo. Extract all information following the PiCo ontology pattern."""
    print("\n" + "-" * 40)
    print("SOURCE TEXT (Arabic Waqf Document)")
    print("-" * 40)
    print(ARABIC_WAQF_TEXT[:200] + "...")
    # Call API
    print("\n⏳ Calling GLM-4.6 API (this may take 30-60 seconds)...")
    try:
        start_time = datetime.now(timezone.utc)
        result = await call_glm_api(PICO_SYSTEM_PROMPT, user_prompt)
        end_time = datetime.now(timezone.utc)
        duration = (end_time - start_time).total_seconds()
        print(f"✅ API call completed in {duration:.1f}s")
    except httpx.HTTPStatusError as e:
        print(f"\n❌ API Error: {e.response.status_code}")
        print(f"Response: {e.response.text[:500]}")
        return False
    except json.JSONDecodeError as e:
        print(f"\n❌ JSON Parse Error: {e}")
        return False
    except Exception as e:
        # Broad catch is deliberate in this test harness: any failure should
        # be reported as a test result rather than crash the runner.
        print(f"\n❌ Error: {type(e).__name__}: {e}")
        return False
    # Display results
    print("\n" + "-" * 40)
    print("EXTRACTION RESULTS")
    print("-" * 40)
    # PiCo observation metadata
    if "pico_observation" in result:
        obs = result["pico_observation"]
        print(f"\n📋 Observation ID: {obs.get('observation_id', 'N/A')}")
        print(f" Source Type: {obs.get('source_type', 'N/A')}")
        print(f" Source Ref: {obs.get('source_reference', 'N/A')}")
    # Persons extracted — one report section per person.
    persons = result.get("persons", [])
    print(f"\n👥 Persons Extracted: {len(persons)}")
    for person in persons:
        idx = person.get("person_index", "?")
        name = person.get("pnv_name", {})
        lit_name = name.get("literalName", "")
        rom_name = name.get("literalName_romanized", "")
        print(f"\n [{idx}] {lit_name}")
        if rom_name:
            print(f" Romanized: {rom_name}")
        # Honorific (e.g. al-Hajj, al-Sayyid)
        if name.get("honorificPrefix"):
            hon = name.get("honorificPrefix", "")
            hon_rom = name.get("honorificPrefix_romanized", "")
            print(f" Honorific: {hon} ({hon_rom})")
        # Patronym (ibn/bin — father's name)
        if name.get("patronym"):
            pat = name.get("patronym", "")
            pat_rom = name.get("patronym_romanized", "")
            print(f" Patronym: {pat} ({pat_rom})")
        # Roles
        roles = person.get("roles", [])
        for role in roles:
            role_title = role.get("role_title", "")
            role_rom = role.get("role_title_romanized", "")
            role_in_src = role.get("role_in_source", "")
            if role_title or role_in_src:
                print(f" Role: {role_title} ({role_rom}) - {role_in_src}")
        # Biographical
        bio = person.get("biographical", {})
        if bio.get("deceased"):
            print(f" Status: Deceased (المرحوم)")
        if bio.get("address"):
            print(f" Address: {bio.get('address')}")
        # Family relationships
        fam = person.get("family_relationships", {})
        if fam.get("parent"):
            parents = [p.get("target_name", "") for p in fam["parent"]]
            print(f" Parents: {', '.join(parents)}")
        if fam.get("children"):
            children = [c.get("target_name", "") for c in fam["children"]]
            print(f" Children: {', '.join(children)}")
        # Context
        if person.get("context"):
            print(f" Context: {person.get('context')}")
    # Temporal references
    temps = result.get("temporal_references", [])
    if temps:
        print(f"\n📅 Temporal References: {len(temps)}")
        for temp in temps:
            expr = temp.get("expression", "")
            expr_rom = temp.get("expression_romanized", "")
            norm = temp.get("normalized", "")
            cal = temp.get("calendar", "")
            print(f" {expr}")
            if expr_rom:
                print(f"{expr_rom}")
            print(f" → Normalized: {norm} ({cal})")
    # Locations
    locs = result.get("locations_mentioned", [])
    if locs:
        print(f"\n📍 Locations: {len(locs)}")
        for loc in locs:
            name = loc.get("name", "")
            name_rom = loc.get("name_romanized", "")
            loc_type = loc.get("type", "")
            print(f" {name} ({name_rom}) - {loc_type}")
    # Validate results
    print("\n" + "-" * 40)
    print("VALIDATION")
    print("-" * 40)
    is_valid, errors = validate_extraction(result)
    if is_valid:
        print("\n✅ All validations passed!")
    else:
        print(f"\n⚠️ Validation issues ({len(errors)}):")
        for error in errors:
            print(f" - {error}")
    # Save result to file for inspection
    output_path = project_root / "data/entity_annotation/test_outputs"
    output_path.mkdir(parents=True, exist_ok=True)
    output_file = output_path / f"arabic_waqf_extraction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"\n💾 Full result saved to: {output_file.relative_to(project_root)}")
    # Final verdict
    print("\n" + "=" * 70)
    if is_valid:
        print("✅ TEST PASSED: Arabic waqf extraction successful")
    else:
        print("⚠️ TEST COMPLETED WITH WARNINGS: Check validation issues above")
    print("=" * 70)
    return is_valid
async def main():
    """Entry point: run the waqf extraction test and map it to an exit code."""
    banner = "#" * 70
    print("\n" + banner)
    print("# PiCo ARABIC WAQF EXTRACTION TEST")
    print("# Testing GLM-4.6 reasoning mode with historical Arabic document")
    print(banner)
    outcome = await test_arabic_waqf_extraction()
    # Skip (None) and pass (True) both exit 0; only an explicit False
    # (API/parse error or failed validation) is a non-zero exit.
    return 1 if outcome is False else 0
if __name__ == "__main__":
    # asyncio.run drives the async test; exit code 0 = passed or skipped
    # (no API token), 1 = failure.
    exit_code = asyncio.run(main())
    sys.exit(exit_code)