- Implemented a new script `test_pico_arabic_waqf.py` to test the GLM annotator's ability to extract person observations from Arabic historical documents. - The script includes environment variable handling for API token, structured prompts for the GLM API, and validation of extraction results. - Added comprehensive logging for API responses, extraction results, and validation errors. - Included a sample Arabic waqf text for testing purposes, following the PiCo ontology pattern.
786 lines
32 KiB
Python
786 lines
32 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Batch test runner for PiCo (Person in Context) extraction across multiple document types.
|
||
|
||
This script tests GLM-4.6 reasoning mode extraction from various historical document types:
|
||
1. Arabic Waqf (Islamic endowment)
|
||
2. Hebrew Ketubah (Jewish marriage contract)
|
||
3. Spanish Colonial Baptism
|
||
4. Dutch Marriage Certificate
|
||
5. Latin Notarial Protocol
|
||
|
||
Usage:
|
||
python scripts/test_pico_batch.py [--test-name NAME] [--all] [--list]
|
||
|
||
Examples:
|
||
python scripts/test_pico_batch.py --all # Run all tests
|
||
python scripts/test_pico_batch.py --test-name arabic # Run only Arabic waqf test
|
||
python scripts/test_pico_batch.py --list # List available tests
|
||
|
||
Environment Variables:
|
||
ZAI_API_TOKEN - Required for Z.AI GLM-4.6 API
|
||
"""
|
||
|
||
import asyncio
|
||
import argparse
|
||
import json
|
||
import os
|
||
import sys
|
||
from pathlib import Path
|
||
from datetime import datetime, timezone
|
||
from dataclasses import dataclass
|
||
from typing import Optional
|
||
|
||
import httpx
|
||
|
||
# Project root: the parent of the directory that contains this file. It is
# put first on sys.path.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# Load environment variables from the .env file in the project root.
# python-dotenv is optional: if it is not installed the import raises
# ImportError and the .env file is simply not read.
try:
    from dotenv import load_dotenv
    load_dotenv(project_root / ".env")
except ImportError:
    pass
|
||
|
||
|
||
# =============================================================================
|
||
# API Configuration
|
||
# =============================================================================
|
||
|
||
# Endpoint that call_glm_api() POSTs the chat-completions payload to.
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
# Sent as the "model" field of the request payload.
ZAI_MODEL = "glm-4.6"
# Sent as the "max_tokens" field of the request payload.
MAX_TOKENS = 16000 # High limit for GLM-4.6 reasoning mode
# Passed as the httpx.AsyncClient timeout.
TIMEOUT = 300 # 5 minutes for complex reasoning
|
||
|
||
|
||
# =============================================================================
|
||
# Test Document Definitions
|
||
# =============================================================================
|
||
|
||
@dataclass
class TestDocument:
    """A historical document for PiCo extraction testing.

    Bundles the text sent to the model, the system prompt that accompanies
    it, and the expectations that validate_extraction() checks the model's
    reply against.
    """

    # The class name starts with "Test", so pytest would otherwise try to
    # collect it as a test class and emit a collection warning. The attribute
    # is deliberately un-annotated so that it is NOT a dataclass field.
    __test__ = False

    name: str  # Identifier; printed in the test header and used in the output file name
    language: str  # Printed in the test header and by --list
    script: str  # Printed in the test header
    date_period: str  # Printed in the test header and by --list
    source_type: str  # Interpolated into the user prompt; printed as the source
    source_text: str  # The document text embedded in the user prompt
    system_prompt: str  # Sent as the "system" message of the API request
    expected_persons: int  # Fewer extracted persons than this produces a warning
    expected_locations: int  # Fewer extracted locations than this produces a warning
    validation_names: list[str]  # Names that should appear in extraction
|
||
|
||
|
||
# Arabic Waqf Document
# Fixture: source_text is sent to the model together with system_prompt; the
# expected_* counts and validation_names are what validate_extraction()
# compares the model's reply against.
ARABIC_WAQF = TestDocument(
    name="arabic_waqf",
    language="Arabic",
    script="Arabic",
    date_period="1225 AH (1810 CE)",
    source_type="waqf_document",
    source_text="""بسم الله الرحمن الرحيم
هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة
حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري. وقف جميع داره الكائنة
بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح
الله فعلى فقراء المسلمين. وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف
التركماني، والسيد علي بن حسين الحلبي. وكتب في شهر رجب سنة ألف ومائتين
وخمس وعشرين هجرية.""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Arabic waqf (endowment) document:
1. Names using PNV structure with both Arabic script AND romanized versions
2. Patronymics (ابن/بن = son of)
3. Honorifics (الحاج = pilgrim, السيد = sayyid, المرحوم = the late)
4. Family relationships between persons
5. Roles in the document (founder, witness)
6. Biographical info (deceased status, occupation, address)

Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "...", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "..."}],
"locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    # NOTE(review): presumably the founder, his late father and the two
    # witnesses - confirm against the source text.
    expected_persons=4,
    expected_locations=2,
    validation_names=["ahmad", "ibrahim", "ali"]
)
|
||
|
||
|
||
# Hebrew Ketubah
# Fixture: source_text is sent to the model together with system_prompt; the
# expected_* counts and validation_names are what validate_extraction()
# compares the model's reply against.
HEBREW_KETUBAH = TestDocument(
    name="hebrew_ketubah",
    language="Hebrew/Aramaic",
    script="Hebrew",
    date_period="5645 AM (1885 CE)",
    source_type="ketubah",
    source_text="""בס״ד

ביום שלישי בשבת, שנים עשר יום לחודש אייר שנת חמשת אלפים שש מאות
וארבעים וחמש לבריאת עולם למנין שאנו מונין בו פה ווילנא

איך החתן הבחור יצחק בן הר״ר אברהם הכהן ז״ל אמר לה להדא בתולתא
מרים בת הר״ר משה הלוי: הוי לי לאנתו כדת משה וישראל ואנא אפלח
ואוקיר ואיזון ואפרנס יתיכי כהלכות גוברין יהודאין

ונתרצית מרת מרים בתולתא דא והות ליה לאנתו

עדים:
שמעון בן יעקב הכהן
דוד בן אליהו""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Hebrew ketubah (Jewish marriage contract):
1. Names using PNV structure with both Hebrew script AND romanized versions
2. Patronymics (בן/בת = son/daughter of)
3. Tribal affiliations (הכהן = the priest/Kohen, הלוי = the Levite)
4. Honorifics (הר״ר = Rabbi, מרת = Mrs., ז״ל = of blessed memory)
5. Family relationships between persons
6. Roles in document (groom/חתן, bride/כלה, witness/עד)
7. Deceased markers (ז״ל)

Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "ketubah", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Hebrew"}],
"locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=6, # groom, bride, 2 fathers, 2 witnesses (fathers implicit)
    expected_locations=1,
    validation_names=["yitzchak", "miriam", "shimon", "david"]
)
|
||
|
||
|
||
# Spanish Colonial Baptism
# Fixture: source_text is sent to the model together with system_prompt; the
# expected_* counts and validation_names are what validate_extraction()
# compares the model's reply against.
SPANISH_BAPTISM = TestDocument(
    name="spanish_colonial_baptism",
    language="Spanish",
    script="Latin",
    date_period="1742 CE",
    source_type="baptismal_register",
    source_text="""En la ciudad de México, a veinte y tres días del mes de febrero de mil
setecientos cuarenta y dos años, yo el Br. Don Antonio de Mendoza,
teniente de cura de esta santa iglesia catedral, bauticé solemnemente,
puse óleo y crisma a Juan José, español, hijo legítimo de Don Pedro
García de la Cruz, español, natural de la villa de Puebla de los Ángeles,
y de Doña María Josefa de los Reyes, española, natural de esta ciudad.

Fueron sus padrinos Don Francisco Xavier de Castañeda, español, vecino
de esta ciudad, y Doña Ana María de la Encarnación, su legítima esposa,
a quienes advertí el parentesco espiritual y obligaciones que contrajeron.

Y lo firmé.
Br. Don Antonio de Mendoza""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Spanish colonial baptismal record:
1. Names using PNV structure (given name, surname with particles like "de")
2. Casta (racial/social) designations (español, mestizo, mulato, indio, etc.)
3. Legitimacy markers (hijo legítimo, hijo natural)
4. Place of origin (natural de, vecino de)
5. Family relationships (parents, godparents/padrinos)
6. Compadrazgo relationships (spiritual kinship between parents and godparents)
7. Ecclesiastical roles (priest, teniente de cura)
8. Honorifics (Don, Doña, Br./Bachiller)

Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "baptismal_register", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "normalized": "...", "calendar": "Gregorian"}],
"locations_mentioned": [{"name": "...", "type": "..."}]
}""",
    expected_persons=6, # infant, father, mother, godfather, godmother, priest
    expected_locations=3,
    validation_names=["juan", "pedro", "maria", "francisco", "antonio"]
)
|
||
|
||
|
||
# Dutch Marriage Certificate
# Fixture: source_text is sent to the model together with system_prompt; the
# expected_* counts and validation_names are what validate_extraction()
# compares the model's reply against.
DUTCH_MARRIAGE = TestDocument(
    name="dutch_marriage",
    language="Dutch",
    script="Latin",
    date_period="1885 CE",
    source_type="marriage_certificate",
    source_text="""Heden den vierden Maart achttien honderd vijf en tachtig, compareerden
voor mij, Ambtenaar van den Burgerlijken Stand der Gemeente Haarlem:

Johannes Petrus van der Berg, oud dertig jaren, koopman, geboren te
Amsterdam, wonende alhier, meerderjarige zoon van wijlen Pieter van der
Berg, in leven koopman, en van Maria Johanna Bakker, zonder beroep,
wonende te Amsterdam;

en

Cornelia Wilhelmina de Groot, oud vijf en twintig jaren, zonder beroep,
geboren te Haarlem, wonende alhier, meerderjarige dochter van Hendrik
de Groot, timmerman, en van wijlen Elisabeth van Dijk.

De getuigen waren:
Willem Frederik Smit, oud veertig jaren, notaris
Jacobus Hendrikus Jansen, oud vijf en dertig jaren, klerk""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Dutch marriage certificate (huwelijksakte):
1. Names using PNV structure with Dutch naming conventions
2. Patronymics and tussenvoegsels (van der, de, etc.)
3. Ages, occupations, birthplaces, residences
4. Family relationships (parents identified with "zoon van" / "dochter van")
5. Deceased markers ("wijlen" = the late)
6. Roles in document (groom, bride, witnesses/getuigen)
7. Civil status terminology

Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "marriage_certificate", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "normalized": "...", "calendar": "Gregorian"}],
"locations_mentioned": [{"name": "...", "type": "..."}]
}""",
    expected_persons=8, # groom, bride, 4 parents (2 deceased), 2 witnesses
    expected_locations=2,
    validation_names=["johannes", "cornelia", "willem", "jacobus"]
)
|
||
|
||
|
||
# Russian Metrical Book Entry
# Fixture: source_text is sent to the model together with system_prompt; the
# expected_* counts and validation_names are what validate_extraction()
# compares the model's reply against.
RUSSIAN_METRICAL = TestDocument(
    name="russian_metrical",
    language="Russian",
    script="Cyrillic",
    date_period="1892 CE",
    source_type="metrical_book",
    source_text="""Метрическая книга Троицкой церкви села Покровского за 1892 год

О родившихся

Марта 15 дня родился, 17 дня крещён Иван.

Родители: крестьянин деревни Ивановки Пётр Иванович Сидоров и законная
жена его Анна Фёдоровна, оба православного вероисповедания.

Восприемники: крестьянин той же деревни Николай Петрович Кузнецов
и крестьянская дочь девица Мария Ивановна Сидорова.""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Russian metrical book (метрическая книга) entry:
1. Names using Russian naming conventions: given name + patronymic (отчество) + surname
2. Patronymic patterns (-ович/-евич for males, -овна/-евна for females)
3. Estate/class designations (крестьянин = peasant, мещанин = townsman, дворянин = noble)
4. Family relationships
5. Roles (родители = parents, восприемники = godparents)
6. Religious denomination (православный = Orthodox)
7. Include both Cyrillic AND romanized versions

Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "metrical_book", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Gregorian/Julian"}],
"locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=5, # infant, father, mother, godfather, godmother
    expected_locations=2,
    validation_names=["ivan", "petr", "anna", "nikolai", "maria"]
)
|
||
|
||
|
||
# Italian Notarial Act
# Fixture: source_text is sent to the model together with system_prompt; the
# expected_* counts and validation_names are what validate_extraction()
# compares the model's reply against.
ITALIAN_NOTARIAL = TestDocument(
    name="italian_notarial",
    language="Italian",
    script="Latin",
    date_period="1654 CE",
    source_type="notarial_act",
    source_text="""Adì 15 Marzo 1654, in Venetia.

Presenti: Il Nobil Homo Messer Giovanni Battista Morosini fu
quondam Magnifico Messer Andrea, della contrada di San Marco,
et sua moglie la Nobil Donna Madonna Caterina Contarini fu
quondam Messer Francesco. Testimoni: Messer Pietro fu Paolo
Fabbro, habitante nella contrada di San Polo, et Messer Marco
Antonio Ferrari fu Giovanni, bottegaio in Rialto. Rogato io
Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico
di Venetia.""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Italian notarial act:
1. Names using PNV structure (given name, surname)
2. Venetian nobility titles (Nobil Homo, Magnifico Messer, Nobil Donna Madonna)
3. Deceased father markers ("fu", "quondam" = the late)
4. Family relationships (spouses, children of)
5. Occupations (bottegaio = shopkeeper, notaro = notary)
6. Roles in document (party, witness/testimone, notary)
7. Residence/contrada information

Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "notarial_act", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "normalized": "...", "calendar": "Gregorian"}],
"locations_mentioned": [{"name": "...", "type": "..."}]
}""",
    expected_persons=6, # Giovanni, Caterina, 2 witnesses, notary, plus fathers
    expected_locations=4,
    validation_names=["giovanni", "caterina", "pietro", "antonio"]
)
|
||
|
||
|
||
# Greek Orthodox Baptismal Register
# Fixture: source_text is sent to the model together with system_prompt; the
# expected_* counts and validation_names are what validate_extraction()
# compares the model's reply against.
GREEK_BAPTISMAL = TestDocument(
    name="greek_baptismal",
    language="Greek",
    script="Greek",
    date_period="1875 CE",
    source_type="baptismal_register",
    source_text="""Ἐν Θεσσαλονίκῃ, τῇ δεκάτῃ πέμπτῃ Μαρτίου τοῦ ἔτους 1875.

Ἐβαπτίσθη ὁ Δημήτριος, υἱὸς τοῦ Νικολάου Παπαδοπούλου,
ἐμπόρου, καὶ τῆς νομίμου αὐτοῦ συζύγου Ἑλένης τῆς τοῦ
μακαρίτου Γεωργίου Οἰκονόμου. Νονὸς ὁ Κωνσταντῖνος
Καρατζᾶς τοῦ Ἰωάννου, ἰατρός. Ἱερεύς: ὁ Πρωτοπρεσβύτερος
Ἀθανάσιος Χρυσοστόμου.""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Greek Orthodox baptismal register:
1. Names with BOTH Greek script AND romanized versions
2. Greek patronymics ("τοῦ" + genitive = son/daughter of)
3. Deceased markers (μακαρίτης/μακαρίτισσα = the late)
4. Family relationships (υἱός = son, σύζυγος = wife)
5. Godparent (νονός/νονά)
6. Occupations (ἔμπορος = merchant, ἰατρός = physician)
7. Ecclesiastical titles (Πρωτοπρεσβύτερος = Archpriest)
8. Roles in document (baptized, parents, godparent, priest)

Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "baptismal_register", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {"literalName": "...", "literalName_romanized": "..."}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Julian"}],
"locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=6, # infant, father, mother, maternal grandfather, godfather, priest
    expected_locations=1,
    validation_names=["dimitrios", "nikolaos", "eleni", "konstantinos"]
)
|
||
|
||
|
||
# Ottoman Turkish Court Record (Sijill)
# Fixture: source_text is sent to the model together with system_prompt; the
# expected_* counts and validation_names are what validate_extraction()
# compares the model's reply against.
OTTOMAN_SIJILL = TestDocument(
    name="ottoman_sijill",
    language="Ottoman Turkish",
    script="Arabic",
    date_period="1258 AH (1842 CE)",
    source_type="sijill",
    source_text="""بسم الله الرحمن الرحيم

مجلس شرع شريفده محمد آغا بن عبد الله مرحوم قصبه دميرجیکوی
ساکنلرندن محمد بن احمد افندی و زوجهسی فاطمه خاتون بنت علیاوغلو
حاضر اولوب محمد آغا طرفندن یکری بش غروش بدل معلوم ایله صاتیلدی

شهود الحال: حسن افندی بن عمر، ابراهیم چلبی بن مصطفی

فی اوائل شهر رجب سنة ١٢٥٨""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.

Extract ALL persons from this Ottoman Turkish sijill (court record):
1. Names with both Arabic script AND romanized versions
2. Ottoman honorifics (آغا/Ağa, افندی/Efendi, چلبی/Çelebi, خاتون/Hatun)
3. Patronymics (بن/bin = son of, بنت/bint = daughter of)
4. Deceased markers (مرحوم/merhum)
5. Family relationships (زوجه/zevce = wife)
6. Roles in document (buyer, seller, witnesses)
7. Residence information

Note: Ottoman Turkish uses Arabic script with Turkish vocabulary and grammatical structures.

Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "sijill", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {"literalName": "...", "literalName_romanized": "..."}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Hijri"}],
"locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=6, # Mehmed Ağa, Mehmed bin Ahmed, Fatma Hatun, 2 witnesses + fathers
    expected_locations=1,
    validation_names=["mehmed", "fatma", "hasan", "ibrahim"]
)
|
||
|
||
|
||
# All available tests
# The keys are the values accepted by the --test-name option (main() passes
# them to argparse as `choices`); run_all_tests() runs the values in this
# insertion order.
ALL_TESTS = {
    "arabic": ARABIC_WAQF,
    "hebrew": HEBREW_KETUBAH,
    "spanish": SPANISH_BAPTISM,
    "dutch": DUTCH_MARRIAGE,
    "russian": RUSSIAN_METRICAL,
    "italian": ITALIAN_NOTARIAL,
    "greek": GREEK_BAPTISMAL,
    "ottoman": OTTOMAN_SIJILL,
}
|
||
|
||
|
||
# =============================================================================
|
||
# API Functions
|
||
# =============================================================================
|
||
|
||
async def call_glm_api(system_prompt: str, user_content: str) -> tuple[dict, float]:
    """Call Z.AI GLM-4.6 API and return parsed JSON response with timing.

    Args:
        system_prompt: Text sent as the "system" message.
        user_content: Text sent as the "user" message.

    Returns:
        A ``(parsed_json, duration_seconds)`` tuple. The duration covers the
        HTTP request and the decoding of the response envelope, not the
        parsing of the model's JSON payload.

    Raises:
        ValueError: If ZAI_API_TOKEN is not set, or the reply carries no
            text content.
        httpx.HTTPStatusError: If the API answers with an error status.
        json.JSONDecodeError: If no valid JSON can be parsed from the reply.
    """
    import time  # local import: only needed for the monotonic request timer

    api_token = os.environ.get("ZAI_API_TOKEN")
    if not api_token:
        raise ValueError("ZAI_API_TOKEN not set in environment")

    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }

    payload = {
        "model": ZAI_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        "temperature": 0.1,
        "max_tokens": MAX_TOKENS,
    }

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments during a multi-minute request.
    started = time.perf_counter()

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        response = await client.post(ZAI_API_URL, headers=headers, json=payload)
        response.raise_for_status()

        result = response.json()
        content = result["choices"][0]["message"]["content"]

    duration = time.perf_counter() - started

    if not isinstance(content, str) or not content.strip():
        # Fail with a clear message instead of a TypeError from the
        # substring checks below.
        raise ValueError("API response contained no message content")

    return json.loads(_extract_json_text(content)), duration


def _extract_json_text(content: str) -> str:
    """Return the part of a model reply that should be parsed as JSON."""
    text = content
    if "```json" in text:
        # Fenced block explicitly tagged as JSON.
        text = text.split("```json", 1)[1].split("```", 1)[0]
    elif "```" in text:
        # Untagged (or differently tagged) fenced block: take its body.
        parts = text.split("```")
        if len(parts) >= 2:
            text = parts[1]
    text = text.strip()

    # Fallback: the payload is preceded/followed by prose or a fence language
    # tag. Keep everything from the first "{" to the last "}".
    if text and not text.startswith(("{", "[")):
        start, end = text.find("{"), text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]
    return text
|
||
|
||
|
||
def extract_all_strings_recursive(obj, strings: list[str]) -> None:
    """Recursively extract all string values from nested dicts/lists.

    Every string found is appended, lower-cased, to *strings* (which is
    mutated in place). Dict keys are not collected, only dict values.
    Tuples are walked like lists; any other non-container value (numbers,
    None, ...) is ignored.

    Args:
        obj: The value to walk.
        strings: Output list that receives the lower-cased strings in
            traversal order.
    """
    if isinstance(obj, str):
        strings.append(obj.lower())
    elif isinstance(obj, dict):
        for value in obj.values():
            extract_all_strings_recursive(value, strings)
    elif isinstance(obj, (list, tuple)):
        for item in obj:
            extract_all_strings_recursive(item, strings)
|
||
|
||
|
||
# Canonical spelling -> alternative romanizations. Built once at import time
# instead of on every normalize_name_variant() call.
_NAME_VARIANTS = {
    # Arabic/Turkish name variants
    'mehmed': ['muhammad', 'mohammed', 'mehmet'],
    'fatma': ['fatima', 'fatmah'],
    'ahmed': ['ahmad'],
    'ibrahim': ['abraham'],
    'hasan': ['hassan'],
    'hussein': ['husayn', 'huseyin'],
    # Greek variants
    'dimitrios': ['demetrios', 'dimitris', 'dēmētrios'],
    'nikolaos': ['nicholas', 'nikolas'],
    'konstantinos': ['constantine', 'constantinos'],
    'georgios': ['george', 'geōrgios'],
    'eleni': ['helen', 'elena', 'elenē'],
    'athanasios': ['athanasius'],
    # Hebrew variants
    'yitzchak': ['isaac', 'itzhak', 'yitzhak'],
    'miriam': ['mirjam', 'myriam'],
    'shimon': ['simon', 'shimeon'],
    'avraham': ['abraham'],
    'moshe': ['moses'],
    'david': ['dovid'],
    'yaakov': ['jacob', 'jakob'],
    # Russian variants
    'petr': ['peter', 'pyotr', 'piotr'],
    'ivan': ['john', 'ioann'],
    'nikolai': ['nicholas', 'nikolay'],
    'maria': ['mary', 'mariya'],
}


def normalize_name_variant(name: str) -> list[str]:
    """Generate common spelling variants for a name.

    Handles cross-script romanization differences like:
    - mehmed/muhammad/mohammed
    - fatma/fatima
    - dimitrios/demetrios
    - yitzchak/isaac

    Args:
        name: The name to expand; matching is case-insensitive.

    Returns:
        The lower-cased name followed by its known variants, without
        duplicates. A name with no known variants yields a one-element list.
    """
    lowered = name.lower()
    variants = [lowered]

    for canonical, alternates in _NAME_VARIANTS.items():
        if lowered == canonical:
            variants.extend(alternates)
        elif lowered in alternates:
            variants.append(canonical)
            variants.extend(alternates)

    # A spelling can be reachable through several entries (e.g. "abraham",
    # "nicholas"); dict.fromkeys() drops repeats while keeping first-seen
    # order, so callers that slice the list see distinct variants.
    return list(dict.fromkeys(variants))
|
||
|
||
|
||
def validate_extraction(result: dict, test: TestDocument) -> tuple[bool, list[str]]:
    """Validate extraction result against expected values.

    Only structural problems (no usable 'persons' list) make the result
    invalid. Too few persons/locations and missing expected names are
    reported as warnings that do not affect validity.

    Names are matched as case-insensitive substrings of every string inside
    each person's ``pnv_name`` plus the person's ``context``. Both sides are
    also compared with diacritics stripped, so a romanization such as
    "Ibrāhīm" satisfies the expected name "ibrahim". Because this is a
    substring match, short names can match inside longer words.

    Args:
        result: Parsed JSON returned by the model.
        test: The document definition holding the expectations.

    Returns:
        ``(is_valid, issues)`` where *issues* lists errors first, then
        warnings.
    """
    import unicodedata  # local import: only needed for diacritic folding

    def fold(text: str) -> str:
        """Lower-case *text* and drop combining marks."""
        decomposed = unicodedata.normalize("NFKD", text.lower())
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    errors = []
    warnings = []

    # Check structure
    if not isinstance(result, dict) or "persons" not in result:
        errors.append("Missing 'persons' field")
        return False, errors

    persons = result["persons"]
    if not isinstance(persons, list):
        # e.g. "persons": null - report it instead of crashing on len()/iteration.
        errors.append(f"'persons' must be a list, got {type(persons).__name__}")
        return False, errors

    # Check person count
    if len(persons) < test.expected_persons:
        warnings.append(f"Expected at least {test.expected_persons} persons, got {len(persons)}")

    # Extract ALL string values from persons recursively for comprehensive name matching
    all_name_strings = []
    for person in persons:
        if not isinstance(person, dict):
            # The model returned a bare value (e.g. a name string) instead of
            # a person object; still use whatever text it carries.
            extract_all_strings_recursive(person, all_name_strings)
            continue
        # Get pnv_name - could be nested structure
        extract_all_strings_recursive(person.get("pnv_name", {}), all_name_strings)
        # Also check context field which often contains the original text
        if person.get("context"):
            all_name_strings.append(str(person["context"]).lower())

    # Search the raw strings and their diacritic-free forms.
    searchable = set(all_name_strings)
    searchable.update(fold(text) for text in all_name_strings)

    # Check for expected names with variant support
    for expected_name in test.validation_names:
        variants = normalize_name_variant(expected_name)
        candidates = set(variants) | {fold(variant) for variant in variants}
        found = any(candidate in text for candidate in candidates for text in searchable)
        if not found:
            warnings.append(f"Expected name '{expected_name}' (variants: {variants[:3]}) not found")

    # Check locations ("locations_mentioned": null counts as no locations)
    locations = result.get("locations_mentioned") or []
    if len(locations) < test.expected_locations:
        warnings.append(f"Expected at least {test.expected_locations} locations, got {len(locations)}")

    # Combine errors and warnings
    is_valid = not errors
    return is_valid, errors + warnings
|
||
|
||
|
||
# =============================================================================
|
||
# Test Runner
|
||
# =============================================================================
|
||
|
||
async def run_single_test(test: TestDocument) -> dict:
    """Run extraction test for a single document type.

    Sends the document to the model, prints a summary of the reply,
    validates it and saves the raw reply as JSON under
    ``data/entity_annotation/test_outputs`` in the project root.

    Args:
        test: The document definition to run.

    Returns:
        A result dict. On failure it holds ``test``, ``status`` ("error") and
        ``error``. Otherwise it holds ``test``, ``status`` ("passed" or
        "warning"), ``persons_extracted``, ``locations_extracted``,
        ``duration_seconds``, ``issues`` and ``output_file``.
    """
    print(f"\n{'='*70}")
    print(f"TEST: {test.name.upper()}")
    print(f"Language: {test.language} | Script: {test.script} | Period: {test.date_period}")
    print(f"{'='*70}")

    # Prepare user prompt
    user_prompt = f"""Extract all persons, relationships, dates, and locations from this {test.source_type}:

{test.source_text}

Follow the PiCo ontology pattern for person observations."""

    print(f"\n📄 Source: {test.source_type}")
    print(f" Text length: {len(test.source_text)} chars")

    # Call API
    print("\n⏳ Calling GLM-4.6 API...")

    try:
        result, duration = await call_glm_api(test.system_prompt, user_prompt)
        print(f"✅ API call completed in {duration:.1f}s")

    except httpx.HTTPStatusError as e:
        print(f"❌ API Error: {e.response.status_code}")
        return {"test": test.name, "status": "error", "error": str(e)}
    except json.JSONDecodeError as e:
        print(f"❌ JSON Parse Error: {e}")
        return {"test": test.name, "status": "error", "error": str(e)}
    except Exception as e:
        # Boundary handler: one failing document must not abort a batch run.
        print(f"❌ Error: {type(e).__name__}: {e}")
        return {"test": test.name, "status": "error", "error": str(e)}

    # Valid JSON is not necessarily an object (the model may return a list or
    # a scalar); everything below needs a dict.
    if not isinstance(result, dict):
        message = f"Expected a JSON object, got {type(result).__name__}"
        print(f"❌ Error: {message}")
        return {"test": test.name, "status": "error", "error": message}

    # Display summary (null or non-list fields are shown as empty)
    persons = result.get("persons")
    if not isinstance(persons, list):
        persons = []
    locations = result.get("locations_mentioned") or []
    temporal = result.get("temporal_references") or []

    print("\n📊 Extraction Summary:")
    print(f" Persons: {len(persons)}")
    print(f" Locations: {len(locations)}")
    print(f" Temporal refs: {len(temporal)}")

    # Show persons
    print("\n👥 Persons:")
    for person in persons[:5]: # Show first 5
        print(_format_person_line(person))

    if len(persons) > 5:
        print(f" ... and {len(persons) - 5} more")

    # Validate
    is_valid, issues = validate_extraction(result, test)

    print(f"\n🔍 Validation: {'✅ PASSED' if is_valid else '⚠️ ISSUES'}")
    for issue in issues:
        print(f" - {issue}")

    # Save result
    output_dir = project_root / "data/entity_annotation/test_outputs"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{test.name}_extraction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n💾 Saved: {output_file.name}")

    return {
        "test": test.name,
        "status": "passed" if is_valid else "warning",
        "persons_extracted": len(persons),
        "locations_extracted": len(locations),
        "duration_seconds": duration,
        "issues": issues,
        "output_file": str(output_file)
    }


def _format_person_line(person) -> str:
    """Build the one-line console summary for one extracted person entry."""
    if not isinstance(person, dict):
        # The model returned something other than a person object.
        return f" [?] {str(person)[:50]} (-)"

    idx = person.get("person_index", "?")
    name = person.get("pnv_name", {})
    if isinstance(name, dict):
        lit_name = name.get("literalName_romanized") or name.get("literalName", "?")
    elif name is None:
        lit_name = "?"
    else:
        # Plain string (or any other shape): show it as-is.
        lit_name = name

    # Handle roles - could be list of dicts, list of strings, or string
    roles_raw = person.get("roles", [])
    if isinstance(roles_raw, str):
        role = roles_raw
    elif isinstance(roles_raw, list) and len(roles_raw) > 0:
        first_role = roles_raw[0]
        if isinstance(first_role, dict):
            role = first_role.get("role_in_source", "-")
        else:
            role = str(first_role)
    else:
        role = "-"

    return f" [{idx}] {str(lit_name)[:50]} ({role})"
|
||
|
||
|
||
async def run_all_tests() -> list[dict]:
    """Run all extraction tests sequentially.

    Returns:
        One result dict per entry of ALL_TESTS, in registry order.
    """
    results = []

    # Only the documents are needed; the registry keys are CLI names.
    for test in ALL_TESTS.values():
        results.append(await run_single_test(test))

    return results
|
||
|
||
|
||
def print_summary(results: list[dict]):
    """Print summary of all test results.

    Prints the pass/warning/error counts, then one line per test. A failed
    test additionally shows its error message, and any test shows at most
    its first two validation issues.

    Args:
        results: Result dicts as returned by run_single_test(); each must
            contain "test" and "status".
    """
    from collections import Counter  # local import: used only for the tally

    print("\n" + "=" * 70)
    print("BATCH TEST SUMMARY")
    print("=" * 70)

    status_counts = Counter(r["status"] for r in results)
    passed = status_counts["passed"]
    warnings = status_counts["warning"]
    errors = status_counts["error"]

    print(f"\n📊 Results: {passed} passed, {warnings} warnings, {errors} errors")
    print(f" Total tests: {len(results)}")

    print("\n📋 Test Details:")
    status_icons = {"passed": "✅", "warning": "⚠️", "error": "❌"}
    for r in results:
        status_icon = status_icons.get(r["status"], "?")
        print(f" {status_icon} {r['test']}: {r.get('persons_extracted', 0)} persons, {r.get('duration_seconds', 0):.1f}s")
        # Error results carry no "issues"; without this line the summary
        # would not say why a test failed.
        if r.get("error"):
            print(f" - error: {r['error']}")
        for issue in (r.get("issues") or [])[:2]:
            print(f" - {issue}")

    print("\n" + "=" * 70)
    if errors == 0:
        print("✅ ALL TESTS COMPLETED SUCCESSFULLY")
    else:
        print(f"⚠️ {errors} TESTS FAILED - Check details above")
    print("=" * 70)
|
||
|
||
|
||
# =============================================================================
|
||
# Main
|
||
# =============================================================================
|
||
|
||
async def main() -> int:
    """Parse the command line and run the requested tests.

    ``--list`` takes precedence over ``--test-name``, which takes precedence
    over ``--all``; with none of them the help text is printed.

    Returns:
        The process exit status: 1 when a run was requested but
        ZAI_API_TOKEN is not set, or when any executed test ended with
        status "error"; 0 otherwise.
    """
    parser = argparse.ArgumentParser(description="Batch test PiCo extraction")
    parser.add_argument("--test-name", "-t", choices=list(ALL_TESTS.keys()),
                        help="Run specific test by name")
    parser.add_argument("--all", "-a", action="store_true",
                        help="Run all tests")
    parser.add_argument("--list", "-l", action="store_true",
                        help="List available tests")

    args = parser.parse_args()

    # Check API token - only when a test will actually be executed, so that
    # --list and the default help output work without credentials.
    will_run_tests = not args.list and bool(args.test_name or args.all)
    if will_run_tests and not os.environ.get("ZAI_API_TOKEN"):
        print("❌ Error: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=<your_token>")
        print("Or add to .env file in project root")
        return 1

    print("\n" + "#" * 70)
    print("# PiCo BATCH EXTRACTION TEST")
    print(f"# Model: {ZAI_MODEL} (reasoning mode)")
    print(f"# Max tokens: {MAX_TOKENS}")
    print("#" * 70)

    if args.list:
        print("\n📋 Available tests:")
        for name, test in ALL_TESTS.items():
            print(f" {name}: {test.language} {test.source_type} ({test.date_period})")
        return 0

    if args.test_name:
        test = ALL_TESTS[args.test_name]
        result = await run_single_test(test)
        return 0 if result["status"] != "error" else 1

    if args.all:
        results = await run_all_tests()
        print_summary(results)
        errors = sum(1 for r in results if r["status"] == "error")
        return 0 if errors == 0 else 1

    # Default: show help
    parser.print_help()
    return 0
|
||
|
||
|
||
if __name__ == "__main__":
    # Run the async entry point and hand its integer result to the shell as
    # the process exit status.
    sys.exit(asyncio.run(main()))
|