- Implemented a new script `test_pico_arabic_waqf.py` to test the GLM annotator's ability to extract person observations from Arabic historical documents. - The script includes environment-variable handling for the API token, structured prompts for the GLM API, and validation of extraction results. - Added comprehensive logging of API responses, extraction results, and validation errors. - Included a sample Arabic waqf text for testing purposes, following the PiCo ontology pattern.
472 lines · 17 KiB · Python
#!/usr/bin/env python3
"""
Test PiCo extraction with an Arabic waqf (endowment) document example.

This script tests the GLM annotator's ability to extract person observations
from Arabic historical documents following the PiCo ontology pattern.

Usage:
    python scripts/test_pico_arabic_waqf.py

Environment Variables:
    ZAI_API_TOKEN - Required for Z.AI GLM-4.6 API

Exit code is 0 on pass or skip (no token) and 1 on failure.
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
import httpx
|
|
|
|
# Load environment variables from .env file and make the project root
# importable (the script lives in scripts/, so root is two levels up).
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

try:
    from dotenv import load_dotenv

    # Best-effort: pick up ZAI_API_TOKEN (and friends) from <project_root>/.env.
    load_dotenv(project_root / ".env")
except ImportError:
    pass  # dotenv not required if env vars set directly
|
|
|
|
|
|
# Z.AI API configuration (per AGENTS.md Rule 11)
# GLM-4.6 uses reasoning mode - essential for complex historical document extraction
# Requires higher max_tokens to accommodate reasoning + output
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_MODEL = "glm-4.6"


# Arabic waqf document example (from pico.yaml).
# An endowment deed from Aleppo dated 1225 AH naming a founder, his late
# father, beneficiaries, and two witnesses — the persons validate_extraction()
# expects to find.
ARABIC_WAQF_TEXT = """بسم الله الرحمن الرحيم
هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة
حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري. وقف جميع داره الكائنة
بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح
الله فعلى فقراء المسلمين. وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف
التركماني، والسيد علي بن حسين الحلبي. وكتب في شهر رجب سنة ألف ومائتين
وخمس وعشرين هجرية."""
|
|
|
|
|
|
# PiCo extraction system prompt (abbreviated version for testing).
# The "Return ONLY valid JSON" instruction plus the template below is what
# call_glm_api()'s fence-stripping and JSON parsing rely on.
PICO_SYSTEM_PROMPT = """You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons mentioned in the source text, capturing:
1. Names using PNV (Person Name Vocabulary) structure
2. Roles in the source document
3. Biographical information
4. Family relationships between persons in THIS source
5. For Arabic texts: include both original script AND romanized versions

### Arabic Naming Conventions
- ابن/بن (ibn/bin): son of (patronymic)
- بنت (bint): daughter of
- الحاج (al-Hajj): honorific for pilgrimage completer
- السيد (al-Sayyid): honorific (descendant of Prophet)
- المرحوم (al-marhum): the late (deceased male)
- آل (Al): family of

### Family Relationship Keys
- parent: array of person references (person_index + target_name)
- children: array of person references
- spouse: array of person references

### Output Format
Return ONLY valid JSON:

{
  "pico_observation": {
    "observation_id": "<source-derived-id>",
    "observed_at": "<ISO-timestamp>",
    "source_type": "<category>",
    "source_reference": "<identifier>"
  },
  "persons": [
    {
      "person_index": 0,
      "pnv_name": {
        "literalName": "Name in original script",
        "literalName_romanized": "Romanized name",
        "givenName": "Given name",
        "givenName_romanized": "Romanized given name",
        "patronym": "Father's name",
        "patronym_romanized": "Romanized patronym",
        "baseSurname": "Family/tribal name",
        "baseSurname_romanized": "Romanized surname",
        "honorificPrefix": "Title/honorific",
        "honorificPrefix_romanized": "Romanized honorific"
      },
      "roles": [
        {
          "role_title": "Role as stated",
          "role_title_romanized": "Romanized role",
          "role_in_source": "founder|witness|beneficiary|null"
        }
      ],
      "biographical": {
        "deceased": true/false/null,
        "address": "Location if mentioned"
      },
      "family_relationships": {
        "parent": [{"person_index": N, "target_name": "Name"}],
        "children": [{"person_index": N, "target_name": "Name"}]
      },
      "context": "Brief description of person's role"
    }
  ],
  "temporal_references": [
    {
      "expression": "Original text",
      "expression_romanized": "Romanized",
      "normalized": "ISO date or approximate",
      "calendar": "Hijri|Gregorian",
      "type": "DATE"
    }
  ],
  "locations_mentioned": [
    {
      "name": "Original name",
      "name_romanized": "Romanized",
      "type": "city|neighborhood"
    }
  ]
}"""
|
|
|
|
|
|
async def call_glm_api(system_prompt: str, user_content: str) -> dict:
    """Send one chat completion to Z.AI GLM-4.6 and return the parsed JSON payload.

    The raw model output is also written to
    data/entity_annotation/test_outputs/ for debugging.

    Raises:
        ValueError: if ZAI_API_TOKEN is not set.
        httpx.HTTPStatusError: on a non-2xx API response.
        json.JSONDecodeError: if even the repaired output is unparseable.
    """
    token = os.environ.get("ZAI_API_TOKEN")
    if not token:
        raise ValueError("ZAI_API_TOKEN not set in environment")

    request_headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }

    request_body = {
        "model": ZAI_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        "temperature": 0.1,  # Low temperature for consistent extraction
        "max_tokens": 16000,  # High limit for GLM-4.6 reasoning mode + output
    }

    # 5 min timeout: GLM-4.6 reasoning mode can take a while.
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(ZAI_API_URL, headers=request_headers, json=request_body)
        resp.raise_for_status()
        api_result = resp.json()

    text = api_result["choices"][0]["message"]["content"]

    # Save raw response for debugging
    debug_dir = project_root / "data/entity_annotation/test_outputs"
    debug_dir.mkdir(parents=True, exist_ok=True)
    debug_file = debug_dir / f"raw_response_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
    with open(debug_file, 'w', encoding='utf-8') as fh:
        fh.write(text)
    print(f" Raw response saved to: {debug_file.name}")

    # Strip a markdown code fence if the model wrapped its JSON in one.
    candidate = text
    if "```json" in text:
        candidate = text.split("```json")[1].split("```")[0]
    elif "```" in text:
        fenced = text.split("```")
        if len(fenced) >= 2:
            candidate = fenced[1]

    # Parse, falling back to bracket repair for truncated output.
    try:
        return json.loads(candidate.strip())
    except json.JSONDecodeError as exc:
        print(f"\n⚠️ JSON parse error at position {exc.pos}, attempting repair...")
        return json.loads(repair_truncated_json(candidate.strip()))
|
|
|
|
|
|
def repair_truncated_json(json_str: str) -> str:
    """Attempt to repair truncated JSON by closing open brackets.

    Heuristic, not a parser: if the text appears to end mid-value (the last
    ':' comes after the last ','), everything after the last complete
    element is dropped; then any unbalanced '[' / '{' are appended.  Bracket
    counts do not account for brackets inside string values, so pathological
    inputs may still fail to parse — the caller's json.loads will raise.

    Args:
        json_str: Possibly-truncated JSON text (code fences already removed).

    Returns:
        A best-effort completed JSON string.
    """
    # Net count of unclosed brackets of each kind.
    open_braces = json_str.count('{') - json_str.count('}')
    open_brackets = json_str.count('[') - json_str.count(']')

    # Locate the last complete key-value pair / array element.
    last_comma = json_str.rfind(',')
    last_colon = json_str.rfind(':')

    if last_colon > last_comma:
        # We're in the middle of a value: cut back to the last comma,
        # then recount what is still open after the truncation.
        if last_comma > 0:
            json_str = json_str[:last_comma]
            open_braces = json_str.count('{') - json_str.count('}')
            open_brackets = json_str.count('[') - json_str.count(']')

    json_str = json_str.rstrip()

    # A trailing comma would make the closed-up JSON invalid.
    if json_str.endswith(','):
        json_str = json_str[:-1]

    # Close remaining open arrays first, then objects.
    json_str += ']' * open_brackets
    json_str += '}' * open_braces

    return json_str
|
|
|
|
|
|
def validate_extraction(result: dict) -> tuple[bool, list[str]]:
    """Validate the extraction result against expected structure.

    Performs structural checks (pico_observation / persons fields, PNV name
    shape) and content checks specific to the sample waqf text: at least 4
    persons, the founder and both witnesses present (matched as substrings
    of romanized names), a Hijri calendar on the first temporal reference,
    and Aleppo among the mentioned locations.

    Args:
        result: Parsed JSON object returned by the GLM API.

    Returns:
        (is_valid, errors): is_valid is True iff errors is empty; errors
        lists every failed expectation.
    """
    errors = []

    # Check top-level structure
    if "pico_observation" not in result:
        errors.append("Missing 'pico_observation' field")
    if "persons" not in result:
        errors.append("Missing 'persons' field")

    if "persons" in result:
        persons = result["persons"]

        # Check minimum person count (should be at least 4: founder, father, 2 witnesses)
        if len(persons) < 4:
            errors.append(f"Expected at least 4 persons, got {len(persons)}")

        # Check person structure
        for i, person in enumerate(persons):
            if "person_index" not in person:
                errors.append(f"Person {i}: missing 'person_index'")
            if "pnv_name" not in person:
                errors.append(f"Person {i}: missing 'pnv_name'")
            elif "literalName" not in person["pnv_name"]:
                errors.append(f"Person {i}: missing 'literalName' in pnv_name")

        # Check for specific expected persons (case-insensitive substring match)
        names = [p.get("pnv_name", {}).get("literalName_romanized", "") for p in persons]
        names_lower = [n.lower() for n in names]

        if not any("ahmad" in n for n in names_lower):
            errors.append("Missing founder: Ahmad ibn Muhammad al-'Umari")
        if not any("ibrahim" in n for n in names_lower):
            errors.append("Missing witness: Ibrahim ibn Yusuf al-Turkmani")
        if not any("ali" in n for n in names_lower):
            errors.append("Missing witness: Ali ibn Husayn al-Halabi")

    # Check temporal reference (only the first entry is inspected)
    if "temporal_references" in result and result["temporal_references"]:
        temp = result["temporal_references"][0]
        if "calendar" in temp and temp["calendar"] != "Hijri":
            errors.append(f"Expected Hijri calendar, got {temp.get('calendar')}")

    # Check locations (renamed loop var from ambiguous 'l', lint E741)
    if "locations_mentioned" in result:
        loc_names = [loc.get("name_romanized", "").lower() for loc in result["locations_mentioned"]]
        if not any("aleppo" in n or "halab" in n for n in loc_names):
            errors.append("Missing location: Aleppo (حلب)")

    return len(errors) == 0, errors
|
|
|
|
|
|
async def test_arabic_waqf_extraction() -> "bool | None":
    """Run the end-to-end PiCo extraction test on the sample waqf document.

    Flow: check token -> build prompt -> call GLM-4.6 -> pretty-print the
    extraction -> validate_extraction() -> save JSON to disk.

    Returns:
        None if skipped (ZAI_API_TOKEN unset), True if validation passed,
        False on API/parse errors or failed validation.
    """
    print("\n" + "=" * 70)
    print("TEST: PiCo Arabic Waqf Document Extraction")
    print("=" * 70)

    # Check API token; skip (not fail) when it is absent.
    if not os.environ.get("ZAI_API_TOKEN"):
        print("\n⚠️ SKIPPED: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=<your_token>")
        return None

    print(f"\nModel: {ZAI_MODEL}")
    print(f"API: {ZAI_API_URL}")

    # Prepare user prompt (system prompt carries the schema; this carries the text)
    user_prompt = f"""Extract all persons, relationships, dates, and locations from this Arabic waqf (endowment) document:

{ARABIC_WAQF_TEXT}

This is a historical Islamic endowment document from Aleppo. Extract all information following the PiCo ontology pattern."""

    print("\n" + "-" * 40)
    print("SOURCE TEXT (Arabic Waqf Document)")
    print("-" * 40)
    print(ARABIC_WAQF_TEXT[:200] + "...")

    # Call API
    print("\n⏳ Calling GLM-4.6 API (this may take 30-60 seconds)...")

    try:
        start_time = datetime.now(timezone.utc)
        result = await call_glm_api(PICO_SYSTEM_PROMPT, user_prompt)
        end_time = datetime.now(timezone.utc)
        duration = (end_time - start_time).total_seconds()

        print(f"✅ API call completed in {duration:.1f}s")

    except httpx.HTTPStatusError as e:
        print(f"\n❌ API Error: {e.response.status_code}")
        print(f"Response: {e.response.text[:500]}")
        return False
    except json.JSONDecodeError as e:
        # Raised when even repair_truncated_json() could not salvage the output.
        print(f"\n❌ JSON Parse Error: {e}")
        return False
    except Exception as e:
        # Broad catch is intentional here: this is a top-level test boundary.
        print(f"\n❌ Error: {type(e).__name__}: {e}")
        return False

    # Display results
    print("\n" + "-" * 40)
    print("EXTRACTION RESULTS")
    print("-" * 40)

    # PiCo observation metadata
    if "pico_observation" in result:
        obs = result["pico_observation"]
        print(f"\n📋 Observation ID: {obs.get('observation_id', 'N/A')}")
        print(f" Source Type: {obs.get('source_type', 'N/A')}")
        print(f" Source Ref: {obs.get('source_reference', 'N/A')}")

    # Persons extracted
    persons = result.get("persons", [])
    print(f"\n👥 Persons Extracted: {len(persons)}")

    for person in persons:
        idx = person.get("person_index", "?")
        name = person.get("pnv_name", {})
        lit_name = name.get("literalName", "")
        rom_name = name.get("literalName_romanized", "")

        print(f"\n [{idx}] {lit_name}")
        if rom_name:
            print(f" Romanized: {rom_name}")

        # Honorific (e.g. al-Hajj, al-Sayyid)
        if name.get("honorificPrefix"):
            hon = name.get("honorificPrefix", "")
            hon_rom = name.get("honorificPrefix_romanized", "")
            print(f" Honorific: {hon} ({hon_rom})")

        # Patronym (ibn/bin chain)
        if name.get("patronym"):
            pat = name.get("patronym", "")
            pat_rom = name.get("patronym_romanized", "")
            print(f" Patronym: {pat} ({pat_rom})")

        # Roles in the source document
        roles = person.get("roles", [])
        for role in roles:
            role_title = role.get("role_title", "")
            role_rom = role.get("role_title_romanized", "")
            role_in_src = role.get("role_in_source", "")
            if role_title or role_in_src:
                print(f" Role: {role_title} ({role_rom}) - {role_in_src}")

        # Biographical
        bio = person.get("biographical", {})
        if bio.get("deceased"):
            print(f" Status: Deceased (المرحوم)")
        if bio.get("address"):
            print(f" Address: {bio.get('address')}")

        # Family relationships (person_index + target_name references)
        fam = person.get("family_relationships", {})
        if fam.get("parent"):
            parents = [p.get("target_name", "") for p in fam["parent"]]
            print(f" Parents: {', '.join(parents)}")
        if fam.get("children"):
            children = [c.get("target_name", "") for c in fam["children"]]
            print(f" Children: {', '.join(children)}")

        # Context
        if person.get("context"):
            print(f" Context: {person.get('context')}")

    # Temporal references
    temps = result.get("temporal_references", [])
    if temps:
        print(f"\n📅 Temporal References: {len(temps)}")
        for temp in temps:
            expr = temp.get("expression", "")
            expr_rom = temp.get("expression_romanized", "")
            norm = temp.get("normalized", "")
            cal = temp.get("calendar", "")
            print(f" {expr}")
            if expr_rom:
                print(f" → {expr_rom}")
            print(f" → Normalized: {norm} ({cal})")

    # Locations
    locs = result.get("locations_mentioned", [])
    if locs:
        print(f"\n📍 Locations: {len(locs)}")
        for loc in locs:
            name = loc.get("name", "")
            name_rom = loc.get("name_romanized", "")
            loc_type = loc.get("type", "")
            print(f" {name} ({name_rom}) - {loc_type}")

    # Validate results
    print("\n" + "-" * 40)
    print("VALIDATION")
    print("-" * 40)

    is_valid, errors = validate_extraction(result)

    if is_valid:
        print("\n✅ All validations passed!")
    else:
        print(f"\n⚠️ Validation issues ({len(errors)}):")
        for error in errors:
            print(f" - {error}")

    # Save result to file for inspection
    output_path = project_root / "data/entity_annotation/test_outputs"
    output_path.mkdir(parents=True, exist_ok=True)
    output_file = output_path / f"arabic_waqf_extraction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n💾 Full result saved to: {output_file.relative_to(project_root)}")

    # Final verdict
    print("\n" + "=" * 70)
    if is_valid:
        print("✅ TEST PASSED: Arabic waqf extraction successful")
    else:
        print("⚠️ TEST COMPLETED WITH WARNINGS: Check validation issues above")
    print("=" * 70)

    return is_valid
|
|
|
|
|
|
async def main():
    """Entry point: print a banner, run the waqf test, map its outcome to an exit code."""
    banner = "#" * 70
    print("\n" + banner)
    print("# PiCo ARABIC WAQF EXTRACTION TEST")
    print("# Testing GLM-4.6 reasoning mode with historical Arabic document")
    print(banner)

    outcome = await test_arabic_waqf_extraction()

    # None means skipped (no API key) and counts as success; only an
    # explicit failure yields a non-zero exit code.
    if outcome is False:
        return 1
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate the test outcome as the process exit code
    # (0 = pass or skip, 1 = fail).
    exit_code = asyncio.run(main())
    sys.exit(exit_code)
|