#!/usr/bin/env python3
"""Test PiCo extraction with Arabic waqf (endowment) document example.

This script tests the GLM annotator's ability to extract person observations
from Arabic historical documents following the PiCo ontology pattern.

Usage:
    python scripts/test_pico_arabic_waqf.py

Environment Variables:
    ZAI_API_TOKEN - Required for Z.AI GLM-4.6 API
"""

import asyncio
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path

# NOTE: httpx is imported lazily inside the functions that need it, so the
# module (and its pure helpers) can be imported without httpx installed.

# Load environment variables from .env file
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

try:
    from dotenv import load_dotenv
    load_dotenv(project_root / ".env")
except ImportError:
    pass  # dotenv not required if env vars set directly

# Z.AI API configuration (per AGENTS.md Rule 11)
# GLM-4.6 uses reasoning mode - essential for complex historical document extraction
# Requires higher max_tokens to accommodate reasoning + output
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_MODEL = "glm-4.6"

# Arabic waqf document example (from pico.yaml)
ARABIC_WAQF_TEXT = """بسم الله الرحمن الرحيم
هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري.
وقف جميع داره الكائنة بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح الله فعلى فقراء المسلمين.
وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف التركماني، والسيد علي بن حسين الحلبي.
وكتب في شهر رجب سنة ألف ومائتين وخمس وعشرين هجرية."""

# PiCo extraction system prompt (abbreviated version for testing)
PICO_SYSTEM_PROMPT = """You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons mentioned in the source text, capturing:
1. Names using PNV (Person Name Vocabulary) structure
2. Roles in the source document
3. Biographical information
4. Family relationships between persons in THIS source
5. For Arabic texts: include both original script AND romanized versions

### Arabic Naming Conventions
- ابن/بن (ibn/bin): son of (patronymic)
- بنت (bint): daughter of
- الحاج (al-Hajj): honorific for pilgrimage completer
- السيد (al-Sayyid): honorific (descendant of Prophet)
- المرحوم (al-marhum): the late (deceased male)
- آل (Al): family of

### Family Relationship Keys
- parent: array of person references (person_index + target_name)
- children: array of person references
- spouse: array of person references

### Output Format
Return ONLY valid JSON:
{
  "pico_observation": {
    "observation_id": "",
    "observed_at": "",
    "source_type": "",
    "source_reference": ""
  },
  "persons": [
    {
      "person_index": 0,
      "pnv_name": {
        "literalName": "Name in original script",
        "literalName_romanized": "Romanized name",
        "givenName": "Given name",
        "givenName_romanized": "Romanized given name",
        "patronym": "Father's name",
        "patronym_romanized": "Romanized patronym",
        "baseSurname": "Family/tribal name",
        "baseSurname_romanized": "Romanized surname",
        "honorificPrefix": "Title/honorific",
        "honorificPrefix_romanized": "Romanized honorific"
      },
      "roles": [
        {
          "role_title": "Role as stated",
          "role_title_romanized": "Romanized role",
          "role_in_source": "founder|witness|beneficiary|null"
        }
      ],
      "biographical": {
        "deceased": true/false/null,
        "address": "Location if mentioned"
      },
      "family_relationships": {
        "parent": [{"person_index": N, "target_name": "Name"}],
        "children": [{"person_index": N, "target_name": "Name"}]
      },
      "context": "Brief description of person's role"
    }
  ],
  "temporal_references": [
    {
      "expression": "Original text",
      "expression_romanized": "Romanized",
      "normalized": "ISO date or approximate",
      "calendar": "Hijri|Gregorian",
      "type": "DATE"
    }
  ],
  "locations_mentioned": [
    {
      "name": "Original name",
      "name_romanized": "Romanized",
      "type": "city|neighborhood"
    }
  ]
}"""


async def call_glm_api(system_prompt: str, user_content: str) -> dict:
    """Call Z.AI GLM-4.6 API and return parsed JSON response.

    The raw model reply is saved under data/entity_annotation/test_outputs/
    for debugging before any parsing is attempted. Markdown code fences are
    stripped, and truncated JSON is repaired via repair_truncated_json().

    Args:
        system_prompt: System instructions (the PiCo extraction prompt).
        user_content: User message containing the document to annotate.

    Returns:
        The parsed JSON payload from the model's reply.

    Raises:
        ValueError: If ZAI_API_TOKEN is not set in the environment.
        httpx.HTTPStatusError: On a non-2xx API response.
        json.JSONDecodeError: If the reply cannot be parsed even after repair.
    """
    # Deferred import: keeps the module importable (and the pure helpers
    # unit-testable) when httpx is not installed.
    import httpx

    api_token = os.environ.get("ZAI_API_TOKEN")
    if not api_token:
        raise ValueError("ZAI_API_TOKEN not set in environment")

    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": ZAI_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        "temperature": 0.1,  # Low temperature for consistent extraction
        "max_tokens": 16000,  # High limit for GLM-4.6 reasoning mode + output
    }

    async with httpx.AsyncClient(timeout=300.0) as client:  # 5 min timeout for GLM-4.6 reasoning
        response = await client.post(ZAI_API_URL, headers=headers, json=payload)
        response.raise_for_status()

    result = response.json()
    content = result["choices"][0]["message"]["content"]

    # Save raw response for debugging
    raw_output_path = project_root / "data/entity_annotation/test_outputs"
    raw_output_path.mkdir(parents=True, exist_ok=True)
    raw_file = raw_output_path / f"raw_response_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
    with open(raw_file, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f" Raw response saved to: {raw_file.name}")

    # Parse JSON from response (handle markdown code blocks)
    json_content = content
    if "```json" in content:
        json_content = content.split("```json")[1].split("```")[0]
    elif "```" in content:
        parts = content.split("```")
        if len(parts) >= 2:
            json_content = parts[1]

    # Try to parse, with fallback for truncated JSON
    try:
        return json.loads(json_content.strip())
    except json.JSONDecodeError as e:
        print(f"\n⚠️ JSON parse error at position {e.pos}, attempting repair...")
        # Try to repair truncated JSON by closing brackets
        repaired = repair_truncated_json(json_content.strip())
        return json.loads(repaired)


def repair_truncated_json(json_str: str) -> str:
    """Attempt to repair truncated JSON by closing open brackets.

    Heuristic: if the text was cut off in the middle of a value (the last
    ':' appears after the last ','), truncate back to the last complete
    element first; then strip any trailing comma and append the missing
    ']' and '}' characters.

    NOTE(review): bracket counting does not skip braces inside string
    literals, so repair can still produce invalid JSON — callers must be
    prepared for json.loads to raise on the result.
    """
    # Count open/close brackets
    open_braces = json_str.count('{') - json_str.count('}')
    open_brackets = json_str.count('[') - json_str.count(']')

    # Find position of last complete key-value pair
    last_comma = json_str.rfind(',')
    last_colon = json_str.rfind(':')

    if last_colon > last_comma:
        # We're in the middle of a value, try to find a safe truncation point
        # (the last complete object or array element).
        safe_pos = last_comma
        if safe_pos > 0:
            json_str = json_str[:safe_pos]
            # Recount brackets after truncation
            open_braces = json_str.count('{') - json_str.count('}')
            open_brackets = json_str.count('[') - json_str.count(']')

    json_str = json_str.rstrip()
    # Remove trailing comma if present
    if json_str.endswith(','):
        json_str = json_str[:-1]

    # Add closing brackets (innermost arrays first, then objects)
    json_str += ']' * open_brackets
    json_str += '}' * open_braces
    return json_str


def validate_extraction(result: dict) -> tuple[bool, list[str]]:
    """Validate the extraction result against expected structure.

    Checks top-level shape, per-person required fields, the presence of the
    known founder/witnesses from the sample document, the calendar of the
    first temporal reference, and the Aleppo location.

    Returns:
        (is_valid, errors) — is_valid is True when errors is empty.
    """
    errors = []

    # Check top-level structure
    if "pico_observation" not in result:
        errors.append("Missing 'pico_observation' field")
    if "persons" not in result:
        errors.append("Missing 'persons' field")

    if "persons" in result:
        persons = result["persons"]
        # Check minimum person count (should be at least 4: founder, father, 2 witnesses)
        if len(persons) < 4:
            errors.append(f"Expected at least 4 persons, got {len(persons)}")

        # Check person structure
        for i, person in enumerate(persons):
            if "person_index" not in person:
                errors.append(f"Person {i}: missing 'person_index'")
            if "pnv_name" not in person:
                errors.append(f"Person {i}: missing 'pnv_name'")
            elif "literalName" not in person["pnv_name"]:
                errors.append(f"Person {i}: missing 'literalName' in pnv_name")

        # Check for specific expected persons (substring match on romanized names)
        names = [p.get("pnv_name", {}).get("literalName_romanized", "") for p in persons]
        names_lower = [n.lower() for n in names]
        if not any("ahmad" in n for n in names_lower):
            errors.append("Missing founder: Ahmad ibn Muhammad al-'Umari")
        if not any("ibrahim" in n for n in names_lower):
            errors.append("Missing witness: Ibrahim ibn Yusuf al-Turkmani")
        if not any("ali" in n for n in names_lower):
            errors.append("Missing witness: Ali ibn Husayn al-Halabi")

    # Check temporal reference (only the first one is inspected)
    if "temporal_references" in result and result["temporal_references"]:
        temp = result["temporal_references"][0]
        if "calendar" in temp and temp["calendar"] != "Hijri":
            errors.append(f"Expected Hijri calendar, got {temp.get('calendar')}")

    # Check locations
    if "locations_mentioned" in result:
        loc_names = [l.get("name_romanized", "").lower() for l in result["locations_mentioned"]]
        if not any("aleppo" in n or "halab" in n for n in loc_names):
            errors.append("Missing location: Aleppo (حلب)")

    return len(errors) == 0, errors


def _print_person(person: dict) -> None:
    """Pretty-print one extracted person record to stdout."""
    idx = person.get("person_index", "?")
    name = person.get("pnv_name", {})
    lit_name = name.get("literalName", "")
    rom_name = name.get("literalName_romanized", "")
    print(f"\n [{idx}] {lit_name}")
    if rom_name:
        print(f" Romanized: {rom_name}")

    # Honorific
    if name.get("honorificPrefix"):
        hon = name.get("honorificPrefix", "")
        hon_rom = name.get("honorificPrefix_romanized", "")
        print(f" Honorific: {hon} ({hon_rom})")

    # Patronym
    if name.get("patronym"):
        pat = name.get("patronym", "")
        pat_rom = name.get("patronym_romanized", "")
        print(f" Patronym: {pat} ({pat_rom})")

    # Roles
    for role in person.get("roles", []):
        role_title = role.get("role_title", "")
        role_rom = role.get("role_title_romanized", "")
        role_in_src = role.get("role_in_source", "")
        if role_title or role_in_src:
            print(f" Role: {role_title} ({role_rom}) - {role_in_src}")

    # Biographical
    bio = person.get("biographical", {})
    if bio.get("deceased"):
        print(" Status: Deceased (المرحوم)")
    if bio.get("address"):
        print(f" Address: {bio.get('address')}")

    # Family relationships
    fam = person.get("family_relationships", {})
    if fam.get("parent"):
        parents = [p.get("target_name", "") for p in fam["parent"]]
        print(f" Parents: {', '.join(parents)}")
    if fam.get("children"):
        children = [c.get("target_name", "") for c in fam["children"]]
        print(f" Children: {', '.join(children)}")

    # Context
    if person.get("context"):
        print(f" Context: {person.get('context')}")


def _print_temporal_references(temps: list) -> None:
    """Pretty-print extracted temporal references, if any."""
    if not temps:
        return
    print(f"\n📅 Temporal References: {len(temps)}")
    for temp in temps:
        expr = temp.get("expression", "")
        expr_rom = temp.get("expression_romanized", "")
        norm = temp.get("normalized", "")
        cal = temp.get("calendar", "")
        print(f" {expr}")
        if expr_rom:
            print(f" → {expr_rom}")
        print(f" → Normalized: {norm} ({cal})")


def _print_locations(locs: list) -> None:
    """Pretty-print extracted locations, if any."""
    if not locs:
        return
    print(f"\n📍 Locations: {len(locs)}")
    for loc in locs:
        name = loc.get("name", "")
        name_rom = loc.get("name_romanized", "")
        loc_type = loc.get("type", "")
        print(f" {name} ({name_rom}) - {loc_type}")


async def test_arabic_waqf_extraction():
    """Test PiCo extraction from Arabic waqf document.

    Returns:
        True when all validations pass, False on API/parse failure or
        validation warnings, None when skipped (no API token set).
    """
    # Deferred import: httpx is only needed here for exception handling
    # around the API call.
    import httpx

    print("\n" + "=" * 70)
    print("TEST: PiCo Arabic Waqf Document Extraction")
    print("=" * 70)

    # Check API token
    if not os.environ.get("ZAI_API_TOKEN"):
        print("\n⚠️ SKIPPED: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=")
        return None

    print(f"\nModel: {ZAI_MODEL}")
    print(f"API: {ZAI_API_URL}")

    # Prepare user prompt
    user_prompt = f"""Extract all persons, relationships, dates, and locations from this Arabic waqf (endowment) document:

{ARABIC_WAQF_TEXT}

This is a historical Islamic endowment document from Aleppo. Extract all information following the PiCo ontology pattern."""

    print("\n" + "-" * 40)
    print("SOURCE TEXT (Arabic Waqf Document)")
    print("-" * 40)
    print(ARABIC_WAQF_TEXT[:200] + "...")

    # Call API
    print("\n⏳ Calling GLM-4.6 API (this may take 30-60 seconds)...")
    try:
        start_time = datetime.now(timezone.utc)
        result = await call_glm_api(PICO_SYSTEM_PROMPT, user_prompt)
        end_time = datetime.now(timezone.utc)
        duration = (end_time - start_time).total_seconds()
        print(f"✅ API call completed in {duration:.1f}s")
    except httpx.HTTPStatusError as e:
        print(f"\n❌ API Error: {e.response.status_code}")
        print(f"Response: {e.response.text[:500]}")
        return False
    except json.JSONDecodeError as e:
        print(f"\n❌ JSON Parse Error: {e}")
        return False
    except Exception as e:
        print(f"\n❌ Error: {type(e).__name__}: {e}")
        return False

    # Display results
    print("\n" + "-" * 40)
    print("EXTRACTION RESULTS")
    print("-" * 40)

    # PiCo observation metadata
    if "pico_observation" in result:
        obs = result["pico_observation"]
        print(f"\n📋 Observation ID: {obs.get('observation_id', 'N/A')}")
        print(f" Source Type: {obs.get('source_type', 'N/A')}")
        print(f" Source Ref: {obs.get('source_reference', 'N/A')}")

    # Persons extracted
    persons = result.get("persons", [])
    print(f"\n👥 Persons Extracted: {len(persons)}")
    for person in persons:
        _print_person(person)

    _print_temporal_references(result.get("temporal_references", []))
    _print_locations(result.get("locations_mentioned", []))

    # Validate results
    print("\n" + "-" * 40)
    print("VALIDATION")
    print("-" * 40)
    is_valid, errors = validate_extraction(result)
    if is_valid:
        print("\n✅ All validations passed!")
    else:
        print(f"\n⚠️ Validation issues ({len(errors)}):")
        for error in errors:
            print(f" - {error}")

    # Save result to file for inspection
    output_path = project_root / "data/entity_annotation/test_outputs"
    output_path.mkdir(parents=True, exist_ok=True)
    output_file = output_path / f"arabic_waqf_extraction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"\n💾 Full result saved to: {output_file.relative_to(project_root)}")

    # Final verdict
    print("\n" + "=" * 70)
    if is_valid:
        print("✅ TEST PASSED: Arabic waqf extraction successful")
    else:
        print("⚠️ TEST COMPLETED WITH WARNINGS: Check validation issues above")
    print("=" * 70)
    return is_valid


async def main():
    """Run the test."""
    print("\n" + "#" * 70)
    print("# PiCo ARABIC WAQF EXTRACTION TEST")
    print("# Testing GLM-4.6 reasoning mode with historical Arabic document")
    print("#" * 70)
    result = await test_arabic_waqf_extraction()
    if result is None:
        return 0  # Skipped (no API key)
    return 0 if result else 1


if __name__ == "__main__":
    exit_code = asyncio.run(main())
    sys.exit(exit_code)