glam/scripts/test_pico_batch.py

#!/usr/bin/env python3
"""
Batch test runner for PiCo (Person in Context) extraction across multiple document types.

This script tests GLM-4.6 reasoning mode extraction from various historical document types:
1. Arabic Waqf (Islamic endowment)
2. Hebrew Ketubah (Jewish marriage contract)
3. Spanish Colonial Baptism
4. Dutch Marriage Certificate
5. Latin Notarial Protocol

Usage:
    python scripts/test_pico_batch.py [--test-name NAME] [--all] [--list]

Examples:
    python scripts/test_pico_batch.py --all              # Run all tests
    python scripts/test_pico_batch.py --test-name arabic # Run only Arabic waqf test
    python scripts/test_pico_batch.py --list             # List available tests

Environment Variables:
    ZAI_API_TOKEN - Required for Z.AI GLM-4.6 API
"""

import asyncio
import argparse
import json
import os
import sys
from pathlib import Path
from datetime import datetime, timezone
from dataclasses import dataclass
from typing import Optional

import httpx

# Load environment variables from .env file
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

try:
    from dotenv import load_dotenv
    load_dotenv(project_root / ".env")
except ImportError:
    pass


# =============================================================================
# API Configuration
# =============================================================================

ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_MODEL = "glm-4.6"
MAX_TOKENS = 16000  # High limit for GLM-4.6 reasoning mode
TIMEOUT = 300  # 5 minutes for complex reasoning


# =============================================================================
# Test Document Definitions
# =============================================================================

@dataclass
class TestDocument:
    """A historical document for PiCo extraction testing."""
    name: str
    language: str
    script: str
    date_period: str
    source_type: str
    source_text: str
    system_prompt: str
    expected_persons: int
    expected_locations: int
    validation_names: list[str]  # Names that should appear in extraction


# Arabic Waqf Document
ARABIC_WAQF = TestDocument(
    name="arabic_waqf",
    language="Arabic",
    script="Arabic",
    date_period="1225 AH (1810 CE)",
    source_type="waqf_document",
    source_text="""بسم الله الرحمن الرحيم
هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة
حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري. وقف جميع داره الكائنة
بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح
الله فعلى فقراء المسلمين. وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف
التركماني، والسيد علي بن حسين الحلبي. وكتب في شهر رجب سنة ألف ومائتين
وخمس وعشرين هجرية.""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Arabic waqf (endowment) document:
1. Names using PNV structure with both Arabic script AND romanized versions
2. Patronymics (ابن/بن = son of)
3. Honorifics (الحاج = pilgrim, السيد = sayyid, المرحوم = the late)
4. Family relationships between persons
5. Roles in the document (founder, witness)
6. Biographical info (deceased status, occupation, address)

Return ONLY valid JSON with this structure:
{
  "pico_observation": {"observation_id": "...", "source_type": "...", "source_reference": "..."},
  "persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
  "temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "..."}],
  "locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=4,
    expected_locations=2,
    validation_names=["ahmad", "ibrahim", "ali"]
)


# Hebrew Ketubah
HEBREW_KETUBAH = TestDocument(
    name="hebrew_ketubah",
    language="Hebrew/Aramaic",
    script="Hebrew",
    date_period="5645 AM (1885 CE)",
    source_type="ketubah",
    source_text="""בס״ד

ביום שלישי בשבת, שנים עשר יום לחודש אייר שנת חמשת אלפים שש מאות
וארבעים וחמש לבריאת עולם למנין שאנו מונין בו פה ווילנא

איך החתן הבחור יצחק בן הר״ר אברהם הכהן ז״ל אמר לה להדא בתולתא
מרים בת הר״ר משה הלוי: הוי לי לאנתו כדת משה וישראל ואנא אפלח
ואוקיר ואיזון ואפרנס יתיכי כהלכות גוברין יהודאין

ונתרצית מרת מרים בתולתא דא והות ליה לאנתו

עדים:
שמעון בן יעקב הכהן
דוד בן אליהו""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Hebrew ketubah (Jewish marriage contract):
1. Names using PNV structure with both Hebrew script AND romanized versions
2. Patronymics (בן/בת = son/daughter of)
3. Tribal affiliations (הכהן = the priest/Kohen, הלוי = the Levite)
4. Honorifics (הר״ר = Rabbi, מרת = Mrs., ז״ל = of blessed memory)
5. Family relationships between persons
6. Roles in document (groom/חתן, bride/כלה, witness/עד)
7. Deceased markers (ז״ל)

Return ONLY valid JSON with this structure:
{
  "pico_observation": {"observation_id": "...", "source_type": "ketubah", "source_reference": "..."},
  "persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
  "temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Hebrew"}],
  "locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=6,  # groom, bride, 2 fathers, 2 witnesses (fathers implicit)
    expected_locations=1,
    validation_names=["yitzchak", "miriam", "shimon", "david"]
)


# Spanish Colonial Baptism
SPANISH_BAPTISM = TestDocument(
    name="spanish_colonial_baptism",
    language="Spanish",
    script="Latin",
    date_period="1742 CE",
    source_type="baptismal_register",
    source_text="""En la ciudad de México, a veinte y tres días del mes de febrero de mil
setecientos cuarenta y dos años, yo el Br. Don Antonio de Mendoza,
teniente de cura de esta santa iglesia catedral, bauticé solemnemente,
puse óleo y crisma a Juan José, español, hijo legítimo de Don Pedro
García de la Cruz, español, natural de la villa de Puebla de los Ángeles,
y de Doña María Josefa de los Reyes, española, natural de esta ciudad.

Fueron sus padrinos Don Francisco Xavier de Castañeda, español, vecino
de esta ciudad, y Doña Ana María de la Encarnación, su legítima esposa,
a quienes advertí el parentesco espiritual y obligaciones que contrajeron.

Y lo firmé.
Br. Don Antonio de Mendoza""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Spanish colonial baptismal record:
1. Names using PNV structure (given name, surname with particles like "de")
2. Casta (racial/social) designations (español, mestizo, mulato, indio, etc.)
3. Legitimacy markers (hijo legítimo, hijo natural)
4. Place of origin (natural de, vecino de)
5. Family relationships (parents, godparents/padrinos)
6. Compadrazgo relationships (spiritual kinship between parents and godparents)
7. Ecclesiastical roles (priest, teniente de cura)
8. Honorifics (Don, Doña, Br./Bachiller)

Return ONLY valid JSON with this structure:
{
  "pico_observation": {"observation_id": "...", "source_type": "baptismal_register", "source_reference": "..."},
  "persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
  "temporal_references": [{"expression": "...", "normalized": "...", "calendar": "Gregorian"}],
  "locations_mentioned": [{"name": "...", "type": "..."}]
}""",
    expected_persons=6,  # infant, father, mother, godfather, godmother, priest
    expected_locations=3,
    validation_names=["juan", "pedro", "maria", "francisco", "antonio"]
)


# Dutch Marriage Certificate
DUTCH_MARRIAGE = TestDocument(
    name="dutch_marriage",
    language="Dutch",
    script="Latin",
    date_period="1885 CE",
    source_type="marriage_certificate",
    source_text="""Heden den vierden Maart achttien honderd vijf en tachtig, compareerden
voor mij, Ambtenaar van den Burgerlijken Stand der Gemeente Haarlem:

Johannes Petrus van der Berg, oud dertig jaren, koopman, geboren te
Amsterdam, wonende alhier, meerderjarige zoon van wijlen Pieter van der
Berg, in leven koopman, en van Maria Johanna Bakker, zonder beroep,
wonende te Amsterdam;

en

Cornelia Wilhelmina de Groot, oud vijf en twintig jaren, zonder beroep,
geboren te Haarlem, wonende alhier, meerderjarige dochter van Hendrik
de Groot, timmerman, en van wijlen Elisabeth van Dijk.

De getuigen waren:
Willem Frederik Smit, oud veertig jaren, notaris
Jacobus Hendrikus Jansen, oud vijf en dertig jaren, klerk""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Dutch marriage certificate (huwelijksakte):
1. Names using PNV structure with Dutch naming conventions
2. Patronymics and tussenvoegsels (van der, de, etc.)
3. Ages, occupations, birthplaces, residences
4. Family relationships (parents identified with "zoon van" / "dochter van")
5. Deceased markers ("wijlen" = the late)
6. Roles in document (groom, bride, witnesses/getuigen)
7. Civil status terminology

Return ONLY valid JSON with this structure:
{
  "pico_observation": {"observation_id": "...", "source_type": "marriage_certificate", "source_reference": "..."},
  "persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
  "temporal_references": [{"expression": "...", "normalized": "...", "calendar": "Gregorian"}],
  "locations_mentioned": [{"name": "...", "type": "..."}]
}""",
    expected_persons=8,  # groom, bride, 4 parents (2 deceased), 2 witnesses
    expected_locations=2,
    validation_names=["johannes", "cornelia", "willem", "jacobus"]
)


# Russian Metrical Book Entry
RUSSIAN_METRICAL = TestDocument(
    name="russian_metrical",
    language="Russian",
    script="Cyrillic",
    date_period="1892 CE",
    source_type="metrical_book",
    source_text="""Метрическая книга Троицкой церкви села Покровского за 1892 год

О родившихся

Марта 15 дня родился, 17 дня крещён Иван.

Родители: крестьянин деревни Ивановки Пётр Иванович Сидоров и законная
жена его Анна Фёдоровна, оба православного вероисповедания.

Восприемники: крестьянин той же деревни Николай Петрович Кузнецов
и крестьянская дочь девица Мария Ивановна Сидорова.""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Russian metrical book (метрическая книга) entry:
1. Names using Russian naming conventions: given name + patronymic (отчество) + surname
2. Patronymic patterns (-ович/-евич for males, -овна/-евна for females)
3. Estate/class designations (крестьянин = peasant, мещанин = townsman, дворянин = noble)
4. Family relationships
5. Roles (родители = parents, восприемники = godparents)
6. Religious denomination (православный = Orthodox)
7. Include both Cyrillic AND romanized versions

Return ONLY valid JSON with this structure:
{
  "pico_observation": {"observation_id": "...", "source_type": "metrical_book", "source_reference": "..."},
  "persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
  "temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Gregorian/Julian"}],
  "locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=5,  # infant, father, mother, godfather, godmother
    expected_locations=2,
    validation_names=["ivan", "petr", "anna", "nikolai", "maria"]
)


# Italian Notarial Act
ITALIAN_NOTARIAL = TestDocument(
    name="italian_notarial",
    language="Italian",
    script="Latin",
    date_period="1654 CE",
    source_type="notarial_act",
    source_text="""Adì 15 Marzo 1654, in Venetia.

Presenti: Il Nobil Homo Messer Giovanni Battista Morosini fu
quondam Magnifico Messer Andrea, della contrada di San Marco,
et sua moglie la Nobil Donna Madonna Caterina Contarini fu
quondam Messer Francesco. Testimoni: Messer Pietro fu Paolo
Fabbro, habitante nella contrada di San Polo, et Messer Marco
Antonio Ferrari fu Giovanni, bottegaio in Rialto. Rogato io
Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico
di Venetia.""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Italian notarial act:
1. Names using PNV structure (given name, surname)
2. Venetian nobility titles (Nobil Homo, Magnifico Messer, Nobil Donna Madonna)
3. Deceased father markers ("fu", "quondam" = the late)
4. Family relationships (spouses, children of)
5. Occupations (bottegaio = shopkeeper, notaro = notary)
6. Roles in document (party, witness/testimone, notary)
7. Residence/contrada information

Return ONLY valid JSON with this structure:
{
  "pico_observation": {"observation_id": "...", "source_type": "notarial_act", "source_reference": "..."},
  "persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
  "temporal_references": [{"expression": "...", "normalized": "...", "calendar": "Gregorian"}],
  "locations_mentioned": [{"name": "...", "type": "..."}]
}""",
    expected_persons=6,  # Giovanni, Caterina, 2 witnesses, notary, plus fathers
    expected_locations=4,
    validation_names=["giovanni", "caterina", "pietro", "antonio"]
)


# Greek Orthodox Baptismal Register
GREEK_BAPTISMAL = TestDocument(
    name="greek_baptismal",
    language="Greek",
    script="Greek",
    date_period="1875 CE",
    source_type="baptismal_register",
    source_text="""Ἐν Θεσσαλονίκῃ, τῇ δεκάτῃ πέμπτῃ Μαρτίου τοῦ ἔτους 1875.

Ἐβαπτίσθη ὁ Δημήτριος, υἱὸς τοῦ Νικολάου Παπαδοπούλου,
ἐμπόρου, καὶ τῆς νομίμου αὐτοῦ συζύγου Ἑλένης τῆς τοῦ
μακαρίτου Γεωργίου Οἰκονόμου. Νονὸς ὁ Κωνσταντῖνος
Καρατζᾶς τοῦ Ἰωάννου, ἰατρός. Ἱερεύς: ὁ Πρωτοπρεσβύτερος
Ἀθανάσιος Χρυσοστόμου.""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Greek Orthodox baptismal register:
1. Names with BOTH Greek script AND romanized versions
2. Greek patronymics ("τοῦ" + genitive = son/daughter of)
3. Deceased markers (μακαρίτης/μακαρίτισσα = the late)
4. Family relationships (υἱός = son, σύζυγος = wife)
5. Godparent (νονός/νονά)
6. Occupations (ἔμπορος = merchant, ἰατρός = physician)
7. Ecclesiastical titles (Πρωτοπρεσβύτερος = Archpriest)
8. Roles in document (baptized, parents, godparent, priest)

Return ONLY valid JSON with this structure:
{
  "pico_observation": {"observation_id": "...", "source_type": "baptismal_register", "source_reference": "..."},
  "persons": [{"person_index": 0, "pnv_name": {"literalName": "...", "literalName_romanized": "..."}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
  "temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Julian"}],
  "locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=6,  # infant, father, mother, maternal grandfather, godfather, priest
    expected_locations=1,
    validation_names=["dimitrios", "nikolaos", "eleni", "konstantinos"]
)


# Ottoman Turkish Court Record (Sijill)
OTTOMAN_SIJILL = TestDocument(
    name="ottoman_sijill",
    language="Ottoman Turkish",
    script="Arabic",
    date_period="1258 AH (1842 CE)",
    source_type="sijill",
    source_text="""بسم الله الرحمن الرحيم

مجلس شرع شريفده محمد آغا بن عبد الله مرحوم قصبه دميرجی‌کوی
ساکنلرندن محمد بن احمد افندی و زوجه‌سی فاطمه خاتون بنت علی‌اوغلو
حاضر اولوب محمد آغا طرفندن یکری بش غروش بدل معلوم ایله صاتیلدی

شهود الحال: حسن افندی بن عمر، ابراهیم چلبی بن مصطفی

فی اوائل شهر رجب سنة ١٢٥٨""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Ottoman Turkish sijill (court record):
1. Names with both Arabic script AND romanized versions
2. Ottoman honorifics (آغا/Ağa, افندی/Efendi, چلبی/Çelebi, خاتون/Hatun)
3. Patronymics (بن/bin = son of, بنت/bint = daughter of)
4. Deceased markers (مرحوم/merhum)
5. Family relationships (زوجه/zevce = wife)
6. Roles in document (buyer, seller, witnesses)
7. Residence information

Note: Ottoman Turkish uses Arabic script with Turkish vocabulary and grammatical structures.

Return ONLY valid JSON with this structure:
{
  "pico_observation": {"observation_id": "...", "source_type": "sijill", "source_reference": "..."},
  "persons": [{"person_index": 0, "pnv_name": {"literalName": "...", "literalName_romanized": "..."}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
  "temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Hijri"}],
  "locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=6,  # Mehmed Ağa, Mehmed bin Ahmed, Fatma Hatun, 2 witnesses + fathers
    expected_locations=1,
    validation_names=["mehmed", "fatma", "hasan", "ibrahim"]
)


# All available tests
ALL_TESTS = {
    "arabic": ARABIC_WAQF,
    "hebrew": HEBREW_KETUBAH,
    "spanish": SPANISH_BAPTISM,
    "dutch": DUTCH_MARRIAGE,
    "russian": RUSSIAN_METRICAL,
    "italian": ITALIAN_NOTARIAL,
    "greek": GREEK_BAPTISMAL,
    "ottoman": OTTOMAN_SIJILL,
}


# =============================================================================
# API Functions
# =============================================================================

async def call_glm_api(system_prompt: str, user_content: str) -> tuple[dict, float]:
    """Call Z.AI GLM-4.6 API and return parsed JSON response with timing."""
    api_token = os.environ.get("ZAI_API_TOKEN")
    if not api_token:
        raise ValueError("ZAI_API_TOKEN not set in environment")

    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }

    payload = {
        "model": ZAI_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        "temperature": 0.1,
        "max_tokens": MAX_TOKENS,
    }

    start_time = datetime.now(timezone.utc)

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        response = await client.post(ZAI_API_URL, headers=headers, json=payload)
        response.raise_for_status()

        result = response.json()
        content = result["choices"][0]["message"]["content"]

        end_time = datetime.now(timezone.utc)
        duration = (end_time - start_time).total_seconds()

        # Parse JSON from response
        json_content = content
        if "```json" in content:
            json_content = content.split("```json")[1].split("```")[0]
        elif "```" in content:
            parts = content.split("```")
            if len(parts) >= 2:
                json_content = parts[1]

        return json.loads(json_content.strip()), duration


def extract_all_strings_recursive(obj, strings: list[str]) -> None:
    """Recursively extract all string values from nested dicts/lists."""
    if isinstance(obj, str):
        strings.append(obj.lower())
    elif isinstance(obj, dict):
        for value in obj.values():
            extract_all_strings_recursive(value, strings)
    elif isinstance(obj, list):
        for item in obj:
            extract_all_strings_recursive(item, strings)


def normalize_name_variant(name: str) -> list[str]:
    """Generate common spelling variants for a name.

    Handles cross-script romanization differences like:
    - mehmed/muhammad/mohammed
    - fatma/fatima
    - dimitrios/demetrios
    - yitzchak/isaac
    """
    variants = [name.lower()]

    # Arabic/Turkish name variants
    variant_map = {
        'mehmed': ['muhammad', 'mohammed', 'mehmet'],
        'fatma': ['fatima', 'fatmah'],
        'ahmed': ['ahmad'],
        'ibrahim': ['abraham', 'ibrahim'],
        'hasan': ['hassan'],
        'hussein': ['husayn', 'huseyin'],
        # Greek variants
        'dimitrios': ['demetrios', 'dimitris', 'dēmētrios'],
        'nikolaos': ['nicholas', 'nikolas'],
        'konstantinos': ['constantine', 'constantinos'],
        'georgios': ['george', 'geōrgios'],
        'eleni': ['helen', 'elena', 'elenē'],
        'athanasios': ['athanasius'],
        # Hebrew variants
        'yitzchak': ['isaac', 'itzhak', 'yitzhak'],
        'miriam': ['mirjam', 'myriam'],
        'shimon': ['simon', 'shimeon'],
        'avraham': ['abraham'],
        'moshe': ['moses'],
        'david': ['dovid'],
        'yaakov': ['jacob', 'jakob'],
        # Russian variants
        'petr': ['peter', 'pyotr', 'piotr'],
        'ivan': ['john', 'ioann'],
        'nikolai': ['nicholas', 'nikolay'],
        'maria': ['mary', 'mariya'],
    }

    for key, values in variant_map.items():
        if name.lower() == key:
            variants.extend(values)
        elif name.lower() in values:
            variants.append(key)
            variants.extend(v for v in values if v != name.lower())

    return variants


def validate_extraction(result: dict, test: TestDocument) -> tuple[bool, list[str]]:
    """Validate extraction result against expected values."""
    errors = []
    warnings = []

    # Check structure
    if "persons" not in result:
        errors.append("Missing 'persons' field")
        return False, errors

    persons = result.get("persons", [])

    # Check person count
    if len(persons) < test.expected_persons:
        warnings.append(f"Expected at least {test.expected_persons} persons, got {len(persons)}")

    # Extract ALL string values from persons recursively for comprehensive name matching
    all_name_strings = []
    for person in persons:
        # Get pnv_name - could be nested structure
        pnv = person.get("pnv_name", {})
        extract_all_strings_recursive(pnv, all_name_strings)
        # Also check context field which often contains the original text
        if person.get("context"):
            all_name_strings.append(str(person["context"]).lower())

    # Check for expected names with variant support
    for expected_name in test.validation_names:
        variants = normalize_name_variant(expected_name)
        found = False
        for variant in variants:
            if any(variant in name_str for name_str in all_name_strings):
                found = True
                break
        if not found:
            warnings.append(f"Expected name '{expected_name}' (variants: {variants[:3]}) not found")

    # Check locations
    locations = result.get("locations_mentioned", [])
    if len(locations) < test.expected_locations:
        warnings.append(f"Expected at least {test.expected_locations} locations, got {len(locations)}")

    # Combine errors and warnings
    is_valid = len(errors) == 0
    all_issues = errors + warnings

    return is_valid, all_issues


# =============================================================================
# Test Runner
# =============================================================================

async def run_single_test(test: TestDocument) -> dict:
    """Run extraction test for a single document type."""
    print(f"\n{'='*70}")
    print(f"TEST: {test.name.upper()}")
    print(f"Language: {test.language} | Script: {test.script} | Period: {test.date_period}")
    print(f"{'='*70}")

    # Prepare user prompt
    user_prompt = f"""Extract all persons, relationships, dates, and locations from this {test.source_type}:

{test.source_text}

Follow the PiCo ontology pattern for person observations."""

    print(f"\n📄 Source: {test.source_type}")
    print(f"   Text length: {len(test.source_text)} chars")

    # Call API
    print(f"\n⏳ Calling GLM-4.6 API...")

    try:
        result, duration = await call_glm_api(test.system_prompt, user_prompt)
        print(f"✅ API call completed in {duration:.1f}s")

    except httpx.HTTPStatusError as e:
        print(f"❌ API Error: {e.response.status_code}")
        return {"test": test.name, "status": "error", "error": str(e)}
    except json.JSONDecodeError as e:
        print(f"❌ JSON Parse Error: {e}")
        return {"test": test.name, "status": "error", "error": str(e)}
    except Exception as e:
        print(f"❌ Error: {type(e).__name__}: {e}")
        return {"test": test.name, "status": "error", "error": str(e)}

    # Display summary
    persons = result.get("persons", [])
    locations = result.get("locations_mentioned", [])
    temporal = result.get("temporal_references", [])

    print(f"\n📊 Extraction Summary:")
    print(f"   Persons: {len(persons)}")
    print(f"   Locations: {len(locations)}")
    print(f"   Temporal refs: {len(temporal)}")

    # Show persons
    print(f"\n👥 Persons:")
    for person in persons[:5]:  # Show first 5
        idx = person.get("person_index", "?")
        name = person.get("pnv_name", {})
        if isinstance(name, str):
            lit_name = name
        else:
            lit_name = name.get("literalName_romanized") or name.get("literalName", "?")

        # Handle roles - could be list of dicts, list of strings, or string
        roles_raw = person.get("roles", [])
        if isinstance(roles_raw, str):
            role = roles_raw
        elif isinstance(roles_raw, list) and len(roles_raw) > 0:
            first_role = roles_raw[0]
            if isinstance(first_role, dict):
                role = first_role.get("role_in_source", "-")
            else:
                role = str(first_role)
        else:
            role = "-"

        print(f"   [{idx}] {str(lit_name)[:50]} ({role})")

    if len(persons) > 5:
        print(f"   ... and {len(persons) - 5} more")

    # Validate
    is_valid, issues = validate_extraction(result, test)

    print(f"\n🔍 Validation: {'✅ PASSED' if is_valid else '⚠️ ISSUES'}")
    if issues:
        for issue in issues:
            print(f"   - {issue}")

    # Save result
    output_dir = project_root / "data/entity_annotation/test_outputs"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{test.name}_extraction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n💾 Saved: {output_file.name}")

    return {
        "test": test.name,
        "status": "passed" if is_valid else "warning",
        "persons_extracted": len(persons),
        "locations_extracted": len(locations),
        "duration_seconds": duration,
        "issues": issues,
        "output_file": str(output_file)
    }


async def run_all_tests() -> list[dict]:
    """Run all extraction tests sequentially."""
    results = []

    for name, test in ALL_TESTS.items():
        result = await run_single_test(test)
        results.append(result)

    return results


def print_summary(results: list[dict]):
    """Print summary of all test results."""
    print("\n" + "=" * 70)
    print("BATCH TEST SUMMARY")
    print("=" * 70)

    passed = sum(1 for r in results if r["status"] == "passed")
    warnings = sum(1 for r in results if r["status"] == "warning")
    errors = sum(1 for r in results if r["status"] == "error")

    print(f"\n📊 Results: {passed} passed, {warnings} warnings, {errors} errors")
    print(f"   Total tests: {len(results)}")

    print(f"\n📋 Test Details:")
    for r in results:
        status_icon = {"passed": "✅", "warning": "⚠️", "error": "❌"}.get(r["status"], "?")
        print(f"   {status_icon} {r['test']}: {r.get('persons_extracted', 0)} persons, {r.get('duration_seconds', 0):.1f}s")
        if r.get("issues"):
            for issue in r["issues"][:2]:
                print(f"      - {issue}")

    print("\n" + "=" * 70)
    if errors == 0:
        print("✅ ALL TESTS COMPLETED SUCCESSFULLY")
    else:
        print(f"⚠️ {errors} TESTS FAILED - Check details above")
    print("=" * 70)


# =============================================================================
# Main
# =============================================================================

async def main():
    parser = argparse.ArgumentParser(description="Batch test PiCo extraction")
    parser.add_argument("--test-name", "-t", choices=list(ALL_TESTS.keys()),
                        help="Run specific test by name")
    parser.add_argument("--all", "-a", action="store_true",
                        help="Run all tests")
    parser.add_argument("--list", "-l", action="store_true",
                        help="List available tests")

    args = parser.parse_args()

    # Check API token
    if not os.environ.get("ZAI_API_TOKEN"):
        print("❌ Error: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=<your_token>")
        print("Or add to .env file in project root")
        return 1

    print("\n" + "#" * 70)
    print("# PiCo BATCH EXTRACTION TEST")
    print(f"# Model: {ZAI_MODEL} (reasoning mode)")
    print(f"# Max tokens: {MAX_TOKENS}")
    print("#" * 70)

    if args.list:
        print("\n📋 Available tests:")
        for name, test in ALL_TESTS.items():
            print(f"   {name}: {test.language} {test.source_type} ({test.date_period})")
        return 0

    if args.test_name:
        test = ALL_TESTS[args.test_name]
        result = await run_single_test(test)
        return 0 if result["status"] != "error" else 1

    if args.all:
        results = await run_all_tests()
        print_summary(results)
        errors = sum(1 for r in results if r["status"] == "error")
        return 0 if errors == 0 else 1

    # Default: show help
    parser.print_help()
    return 0


if __name__ == "__main__":
    exit_code = asyncio.run(main())
    sys.exit(exit_code)