- Implemented a new script `test_pico_arabic_waqf.py` to test the GLM annotator's ability to extract person observations from Arabic historical documents. - The script includes environment-variable handling for the API token, structured prompts for the GLM API, and validation of extraction results. - Added comprehensive logging of API responses, extraction results, and validation errors. - Included a sample Arabic waqf text for testing purposes, following the PiCo ontology pattern.
472 lines · 17 KiB · Python
#!/usr/bin/env python3
"""
Test PiCo extraction with an Arabic waqf (endowment) document example.

This script tests the GLM annotator's ability to extract person observations
from Arabic historical documents following the PiCo ontology pattern.

Usage:
    python scripts/test_pico_arabic_waqf.py

Environment Variables:
    ZAI_API_TOKEN - Required for Z.AI GLM-4.6 API

Exit code is 0 on pass or skip (no token) and 1 on failure.
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
import httpx
|
|
|
|
# Load environment variables from .env file and make the project root
# importable (the script lives in scripts/, so root is two levels up).
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

try:
    from dotenv import load_dotenv

    # Best-effort: pick up ZAI_API_TOKEN (and friends) from <project_root>/.env.
    load_dotenv(project_root / ".env")
except ImportError:
    pass  # dotenv not required if env vars set directly
|
|
|
|
|
|
# Z.AI API configuration (per AGENTS.md Rule 11)
# GLM-4.6 uses reasoning mode - essential for complex historical document extraction
# Requires higher max_tokens to accommodate reasoning + output
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_MODEL = "glm-4.6"


# Arabic waqf document example (from pico.yaml).
# An endowment deed from Aleppo dated 1225 AH naming a founder, his late
# father, beneficiaries, and two witnesses — the persons validate_extraction()
# expects to find.
ARABIC_WAQF_TEXT = """بسم الله الرحمن الرحيم
هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة
حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري. وقف جميع داره الكائنة
بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح
الله فعلى فقراء المسلمين. وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف
التركماني، والسيد علي بن حسين الحلبي. وكتب في شهر رجب سنة ألف ومائتين
وخمس وعشرين هجرية."""
|
|
|
|
|
|
# PiCo extraction system prompt (abbreviated version for testing).
# The "Return ONLY valid JSON" instruction plus the template below is what
# call_glm_api()'s fence-stripping and JSON parsing rely on.
PICO_SYSTEM_PROMPT = """You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons mentioned in the source text, capturing:
1. Names using PNV (Person Name Vocabulary) structure
2. Roles in the source document
3. Biographical information
4. Family relationships between persons in THIS source
5. For Arabic texts: include both original script AND romanized versions

### Arabic Naming Conventions
- ابن/بن (ibn/bin): son of (patronymic)
- بنت (bint): daughter of
- الحاج (al-Hajj): honorific for pilgrimage completer
- السيد (al-Sayyid): honorific (descendant of Prophet)
- المرحوم (al-marhum): the late (deceased male)
- آل (Al): family of

### Family Relationship Keys
- parent: array of person references (person_index + target_name)
- children: array of person references
- spouse: array of person references

### Output Format
Return ONLY valid JSON:

{
  "pico_observation": {
    "observation_id": "<source-derived-id>",
    "observed_at": "<ISO-timestamp>",
    "source_type": "<category>",
    "source_reference": "<identifier>"
  },
  "persons": [
    {
      "person_index": 0,
      "pnv_name": {
        "literalName": "Name in original script",
        "literalName_romanized": "Romanized name",
        "givenName": "Given name",
        "givenName_romanized": "Romanized given name",
        "patronym": "Father's name",
        "patronym_romanized": "Romanized patronym",
        "baseSurname": "Family/tribal name",
        "baseSurname_romanized": "Romanized surname",
        "honorificPrefix": "Title/honorific",
        "honorificPrefix_romanized": "Romanized honorific"
      },
      "roles": [
        {
          "role_title": "Role as stated",
          "role_title_romanized": "Romanized role",
          "role_in_source": "founder|witness|beneficiary|null"
        }
      ],
      "biographical": {
        "deceased": true/false/null,
        "address": "Location if mentioned"
      },
      "family_relationships": {
        "parent": [{"person_index": N, "target_name": "Name"}],
        "children": [{"person_index": N, "target_name": "Name"}]
      },
      "context": "Brief description of person's role"
    }
  ],
  "temporal_references": [
    {
      "expression": "Original text",
      "expression_romanized": "Romanized",
      "normalized": "ISO date or approximate",
      "calendar": "Hijri|Gregorian",
      "type": "DATE"
    }
  ],
  "locations_mentioned": [
    {
      "name": "Original name",
      "name_romanized": "Romanized",
      "type": "city|neighborhood"
    }
  ]
}"""
|
|
|
|
|
|
async def call_glm_api(system_prompt: str, user_content: str) -> dict:
    """Send one chat completion to Z.AI GLM-4.6 and return the parsed JSON payload.

    The raw model output is also written to
    data/entity_annotation/test_outputs/ for debugging.

    Raises:
        ValueError: if ZAI_API_TOKEN is not set.
        httpx.HTTPStatusError: on a non-2xx API response.
        json.JSONDecodeError: if even the repaired output is unparseable.
    """
    token = os.environ.get("ZAI_API_TOKEN")
    if not token:
        raise ValueError("ZAI_API_TOKEN not set in environment")

    request_headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }

    request_body = {
        "model": ZAI_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        "temperature": 0.1,  # Low temperature for consistent extraction
        "max_tokens": 16000,  # High limit for GLM-4.6 reasoning mode + output
    }

    # 5 min timeout: GLM-4.6 reasoning mode can take a while.
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(ZAI_API_URL, headers=request_headers, json=request_body)
        resp.raise_for_status()
        api_result = resp.json()

    text = api_result["choices"][0]["message"]["content"]

    # Save raw response for debugging
    debug_dir = project_root / "data/entity_annotation/test_outputs"
    debug_dir.mkdir(parents=True, exist_ok=True)
    debug_file = debug_dir / f"raw_response_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
    with open(debug_file, 'w', encoding='utf-8') as fh:
        fh.write(text)
    print(f" Raw response saved to: {debug_file.name}")

    # Strip a markdown code fence if the model wrapped its JSON in one.
    candidate = text
    if "```json" in text:
        candidate = text.split("```json")[1].split("```")[0]
    elif "```" in text:
        fenced = text.split("```")
        if len(fenced) >= 2:
            candidate = fenced[1]

    # Parse, falling back to bracket repair for truncated output.
    try:
        return json.loads(candidate.strip())
    except json.JSONDecodeError as exc:
        print(f"\n⚠️ JSON parse error at position {exc.pos}, attempting repair...")
        return json.loads(repair_truncated_json(candidate.strip()))
|
|
|
|
|
|
def repair_truncated_json(json_str: str) -> str:
    """Attempt to repair truncated JSON by closing open brackets.

    Heuristic, not a parser: if the text appears to end mid-value (the last
    ':' comes after the last ','), everything after the last complete
    element is dropped; then any unbalanced '[' / '{' are appended.  Bracket
    counts do not account for brackets inside string values, so pathological
    inputs may still fail to parse — the caller's json.loads will raise.

    Args:
        json_str: Possibly-truncated JSON text (code fences already removed).

    Returns:
        A best-effort completed JSON string.
    """
    # Net count of unclosed brackets of each kind.
    open_braces = json_str.count('{') - json_str.count('}')
    open_brackets = json_str.count('[') - json_str.count(']')

    # Locate the last complete key-value pair / array element.
    last_comma = json_str.rfind(',')
    last_colon = json_str.rfind(':')

    if last_colon > last_comma:
        # We're in the middle of a value: cut back to the last comma,
        # then recount what is still open after the truncation.
        if last_comma > 0:
            json_str = json_str[:last_comma]
            open_braces = json_str.count('{') - json_str.count('}')
            open_brackets = json_str.count('[') - json_str.count(']')

    json_str = json_str.rstrip()

    # A trailing comma would make the closed-up JSON invalid.
    if json_str.endswith(','):
        json_str = json_str[:-1]

    # Close remaining open arrays first, then objects.
    json_str += ']' * open_brackets
    json_str += '}' * open_braces

    return json_str
|
|
|
|
|
|
def validate_extraction(result: dict) -> tuple[bool, list[str]]:
    """Validate the extraction result against expected structure.

    Performs structural checks (pico_observation / persons fields, PNV name
    shape) and content checks specific to the sample waqf text: at least 4
    persons, the founder and both witnesses present (matched as substrings
    of romanized names), a Hijri calendar on the first temporal reference,
    and Aleppo among the mentioned locations.

    Args:
        result: Parsed JSON object returned by the GLM API.

    Returns:
        (is_valid, errors): is_valid is True iff errors is empty; errors
        lists every failed expectation.
    """
    errors = []

    # Check top-level structure
    if "pico_observation" not in result:
        errors.append("Missing 'pico_observation' field")
    if "persons" not in result:
        errors.append("Missing 'persons' field")

    if "persons" in result:
        persons = result["persons"]

        # Check minimum person count (should be at least 4: founder, father, 2 witnesses)
        if len(persons) < 4:
            errors.append(f"Expected at least 4 persons, got {len(persons)}")

        # Check person structure
        for i, person in enumerate(persons):
            if "person_index" not in person:
                errors.append(f"Person {i}: missing 'person_index'")
            if "pnv_name" not in person:
                errors.append(f"Person {i}: missing 'pnv_name'")
            elif "literalName" not in person["pnv_name"]:
                errors.append(f"Person {i}: missing 'literalName' in pnv_name")

        # Check for specific expected persons (case-insensitive substring match)
        names = [p.get("pnv_name", {}).get("literalName_romanized", "") for p in persons]
        names_lower = [n.lower() for n in names]

        if not any("ahmad" in n for n in names_lower):
            errors.append("Missing founder: Ahmad ibn Muhammad al-'Umari")
        if not any("ibrahim" in n for n in names_lower):
            errors.append("Missing witness: Ibrahim ibn Yusuf al-Turkmani")
        if not any("ali" in n for n in names_lower):
            errors.append("Missing witness: Ali ibn Husayn al-Halabi")

    # Check temporal reference (only the first entry is inspected)
    if "temporal_references" in result and result["temporal_references"]:
        temp = result["temporal_references"][0]
        if "calendar" in temp and temp["calendar"] != "Hijri":
            errors.append(f"Expected Hijri calendar, got {temp.get('calendar')}")

    # Check locations (renamed loop var from ambiguous 'l', lint E741)
    if "locations_mentioned" in result:
        loc_names = [loc.get("name_romanized", "").lower() for loc in result["locations_mentioned"]]
        if not any("aleppo" in n or "halab" in n for n in loc_names):
            errors.append("Missing location: Aleppo (حلب)")

    return len(errors) == 0, errors
|
|
|
|
|
|
async def test_arabic_waqf_extraction() -> "bool | None":
    """Run the end-to-end PiCo extraction test on the sample waqf document.

    Flow: check token -> build prompt -> call GLM-4.6 -> pretty-print the
    extraction -> validate_extraction() -> save JSON to disk.

    Returns:
        None if skipped (ZAI_API_TOKEN unset), True if validation passed,
        False on API/parse errors or failed validation.
    """
    print("\n" + "=" * 70)
    print("TEST: PiCo Arabic Waqf Document Extraction")
    print("=" * 70)

    # Check API token; skip (not fail) when it is absent.
    if not os.environ.get("ZAI_API_TOKEN"):
        print("\n⚠️ SKIPPED: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=<your_token>")
        return None

    print(f"\nModel: {ZAI_MODEL}")
    print(f"API: {ZAI_API_URL}")

    # Prepare user prompt (system prompt carries the schema; this carries the text)
    user_prompt = f"""Extract all persons, relationships, dates, and locations from this Arabic waqf (endowment) document:

{ARABIC_WAQF_TEXT}

This is a historical Islamic endowment document from Aleppo. Extract all information following the PiCo ontology pattern."""

    print("\n" + "-" * 40)
    print("SOURCE TEXT (Arabic Waqf Document)")
    print("-" * 40)
    print(ARABIC_WAQF_TEXT[:200] + "...")

    # Call API
    print("\n⏳ Calling GLM-4.6 API (this may take 30-60 seconds)...")

    try:
        start_time = datetime.now(timezone.utc)
        result = await call_glm_api(PICO_SYSTEM_PROMPT, user_prompt)
        end_time = datetime.now(timezone.utc)
        duration = (end_time - start_time).total_seconds()

        print(f"✅ API call completed in {duration:.1f}s")

    except httpx.HTTPStatusError as e:
        print(f"\n❌ API Error: {e.response.status_code}")
        print(f"Response: {e.response.text[:500]}")
        return False
    except json.JSONDecodeError as e:
        # Raised when even repair_truncated_json() could not salvage the output.
        print(f"\n❌ JSON Parse Error: {e}")
        return False
    except Exception as e:
        # Broad catch is intentional here: this is a top-level test boundary.
        print(f"\n❌ Error: {type(e).__name__}: {e}")
        return False

    # Display results
    print("\n" + "-" * 40)
    print("EXTRACTION RESULTS")
    print("-" * 40)

    # PiCo observation metadata
    if "pico_observation" in result:
        obs = result["pico_observation"]
        print(f"\n📋 Observation ID: {obs.get('observation_id', 'N/A')}")
        print(f" Source Type: {obs.get('source_type', 'N/A')}")
        print(f" Source Ref: {obs.get('source_reference', 'N/A')}")

    # Persons extracted
    persons = result.get("persons", [])
    print(f"\n👥 Persons Extracted: {len(persons)}")

    for person in persons:
        idx = person.get("person_index", "?")
        name = person.get("pnv_name", {})
        lit_name = name.get("literalName", "")
        rom_name = name.get("literalName_romanized", "")

        print(f"\n [{idx}] {lit_name}")
        if rom_name:
            print(f" Romanized: {rom_name}")

        # Honorific (e.g. al-Hajj, al-Sayyid)
        if name.get("honorificPrefix"):
            hon = name.get("honorificPrefix", "")
            hon_rom = name.get("honorificPrefix_romanized", "")
            print(f" Honorific: {hon} ({hon_rom})")

        # Patronym (ibn/bin chain)
        if name.get("patronym"):
            pat = name.get("patronym", "")
            pat_rom = name.get("patronym_romanized", "")
            print(f" Patronym: {pat} ({pat_rom})")

        # Roles in the source document
        roles = person.get("roles", [])
        for role in roles:
            role_title = role.get("role_title", "")
            role_rom = role.get("role_title_romanized", "")
            role_in_src = role.get("role_in_source", "")
            if role_title or role_in_src:
                print(f" Role: {role_title} ({role_rom}) - {role_in_src}")

        # Biographical
        bio = person.get("biographical", {})
        if bio.get("deceased"):
            print(f" Status: Deceased (المرحوم)")
        if bio.get("address"):
            print(f" Address: {bio.get('address')}")

        # Family relationships (person_index + target_name references)
        fam = person.get("family_relationships", {})
        if fam.get("parent"):
            parents = [p.get("target_name", "") for p in fam["parent"]]
            print(f" Parents: {', '.join(parents)}")
        if fam.get("children"):
            children = [c.get("target_name", "") for c in fam["children"]]
            print(f" Children: {', '.join(children)}")

        # Context
        if person.get("context"):
            print(f" Context: {person.get('context')}")

    # Temporal references
    temps = result.get("temporal_references", [])
    if temps:
        print(f"\n📅 Temporal References: {len(temps)}")
        for temp in temps:
            expr = temp.get("expression", "")
            expr_rom = temp.get("expression_romanized", "")
            norm = temp.get("normalized", "")
            cal = temp.get("calendar", "")
            print(f" {expr}")
            if expr_rom:
                print(f" → {expr_rom}")
            print(f" → Normalized: {norm} ({cal})")

    # Locations
    locs = result.get("locations_mentioned", [])
    if locs:
        print(f"\n📍 Locations: {len(locs)}")
        for loc in locs:
            name = loc.get("name", "")
            name_rom = loc.get("name_romanized", "")
            loc_type = loc.get("type", "")
            print(f" {name} ({name_rom}) - {loc_type}")

    # Validate results
    print("\n" + "-" * 40)
    print("VALIDATION")
    print("-" * 40)

    is_valid, errors = validate_extraction(result)

    if is_valid:
        print("\n✅ All validations passed!")
    else:
        print(f"\n⚠️ Validation issues ({len(errors)}):")
        for error in errors:
            print(f" - {error}")

    # Save result to file for inspection
    output_path = project_root / "data/entity_annotation/test_outputs"
    output_path.mkdir(parents=True, exist_ok=True)
    output_file = output_path / f"arabic_waqf_extraction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n💾 Full result saved to: {output_file.relative_to(project_root)}")

    # Final verdict
    print("\n" + "=" * 70)
    if is_valid:
        print("✅ TEST PASSED: Arabic waqf extraction successful")
    else:
        print("⚠️ TEST COMPLETED WITH WARNINGS: Check validation issues above")
    print("=" * 70)

    return is_valid
|
|
|
|
|
|
async def main():
    """Entry point: print a banner, run the waqf test, map its outcome to an exit code."""
    banner = "#" * 70
    print("\n" + banner)
    print("# PiCo ARABIC WAQF EXTRACTION TEST")
    print("# Testing GLM-4.6 reasoning mode with historical Arabic document")
    print(banner)

    outcome = await test_arabic_waqf_extraction()

    # None means skipped (no API key) and counts as success; only an
    # explicit failure yields a non-zero exit code.
    if outcome is False:
        return 1
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate the test outcome as the process exit code
    # (0 = pass or skip, 1 = fail).
    exit_code = asyncio.run(main())
    sys.exit(exit_code)
|