# Commit message captured alongside this file:
# - Implemented a new script `test_pico_arabic_waqf.py` to test the GLM
#   annotator's ability to extract person observations from Arabic historical
#   documents.
# - The script includes environment variable handling for API token,
#   structured prompts for the GLM API, and validation of extraction results.
# - Added comprehensive logging for API responses, extraction results, and
#   validation errors.
# - Included a sample Arabic waqf text for testing purposes, following the
#   PiCo ontology pattern.
#
# File viewer metadata: 439 lines, 13 KiB, YAML
# =============================================================================
# PiCo Integration Module: Person Name Vocabulary (PNV)
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/
# Parent: _index.yaml
#
# Description: Person Name Vocabulary (PNV) provides structured name components.
# This enables proper parsing of complex name structures across cultures.
#
# References:
# - PNV: https://w3id.org/pnv
# - PNV Specification: https://w3id.org/pnv/doc/v2
#
# Last Updated: 2025-01-13
# =============================================================================

# -----------------------------------------------------------------------------
# Person Name Vocabulary (PNV)
# -----------------------------------------------------------------------------

# Describes the PNV PersonName class and lists its name-part properties, each
# with a short description, example values and a usage note.
pnv_name_structure:
  description: |
    Person Name Vocabulary (PNV) provides structured name components.
    This enables proper parsing of complex name structures across cultures.

  class: "pnv:PersonName"
  # NOTE(review): the published PNV namespace appears to use the hash form
  # (https://w3id.org/pnv#) - confirm this slash-style URI is what consumers
  # expect.
  class_uri: "https://w3id.org/pnv/PersonName"

  # One entry per name part; `property` holds the prefixed PNV term.
  components:
    - property: "pnv:literalName"
      description: "Full name as single string"
      examples:
        - "Dr. Maria van den Berg"
        - "Rembrandt Harmenszoon van Rijn"
        - "Queen Elizabeth II"
      note: "Original string before parsing"

    - property: "pnv:givenName"
      description: "First/given name"
      examples:
        - "Rembrandt"
        - "Maria"
        - "Jan"
        - "Elizabeth"
      note: "Personal name, not surname"

    - property: "pnv:patronym"
      description: "Patronymic name component"
      examples:
        - "Harmenszoon"
        - "Janszoon"
        - "Pietersdochter"
      note: "Common in Dutch, Scandinavian, Slavic names"

    - property: "pnv:surnamePrefix"
      description: "Prefix to surname (tussenvoegsel)"
      examples:
        - "van"
        - "de"
        - "van den"
        - "van der"
        - "op de"
        - "'t"
        - "von"
        - "di"
      note: "Language-specific, affects sorting"

    - property: "pnv:baseSurname"
      description: "Core surname without prefix"
      examples:
        - "Rijn"
        - "Berg"
        - "Velde"
        - "Gogh"
      note: "Primary sorting component in Dutch"

    # NOTE(review): the PNV specification appears to name this property
    # `pnv:prefix`; `honorificPrefix` looks like the schema.org spelling -
    # confirm before emitting RDF with this term.
    - property: "pnv:honorificPrefix"
      description: "Title or honorific before name"
      examples:
        - "Dr."
        - "Prof."
        - "Prof. dr."
        - "Sir"
        - "Queen"
        - "Mr."
        - "Drs."
        - "Ir."
      note: "May indicate role - link to ROL"

    - property: "pnv:honorificSuffix"
      description: "Title or honorific after name"
      examples:
        - "PhD"
        - "Jr."
        - "III"
        - "MD"
        - "RA"
        - "MSc"
      note: "Credentials and generational markers"

    - property: "pnv:infixTitle"
      description: "Title within name structure"
      examples:
        - "graaf van"
        - "baron de"
        - "duke of"
      note: "Nobility titles embedded in name"

    - property: "pnv:initials"
      description: "Initials of given name(s)"
      examples:
        - "P.R."
        - "C.Joh."
        - "H.A.F.M.O."
      note: "Each initial followed by period"

# -----------------------------------------------------------------------------
# Dutch Name Conventions
# -----------------------------------------------------------------------------

# Lists the Dutch surname prefixes (tussenvoegsels) and records, as free text,
# the sorting and capitalization rules that apply to them.
dutch_name_patterns:
  description: |
    Special handling for Dutch names with tussenvoegsels (surname prefixes).
    Dutch sorting rules differ from other languages.

  tussenvoegsel_list:
    - "van"
    - "van de"
    - "van den"
    - "van der"
    - "de"
    - "den"
    - "het"
    - "'t"
    - "ter"
    - "ten"
    - "op de"
    - "op den"
    - "in 't"
    - "in de"

  sorting_rule: |
    In Dutch, surnames sort by baseSurname, ignoring tussenvoegsel.
    "Vincent van Gogh" sorts under "G" not "V".
    "Maria van den Berg" sorts under "B" not "V".

  capitalization_rule: |
    Tussenvoegsel lowercase when preceded by given name:
    - "Vincent van Gogh" (not "Vincent Van Gogh")
    - "Van Gogh" (surname alone, capitalized)
    - "de heer Van Gogh" (formal, capitalized)

# -----------------------------------------------------------------------------
# Arabic Name Conventions
# -----------------------------------------------------------------------------

# Describes the four Arabic name components named in the description below
# (nasab, nisba, kunya, laqab) and gives a worked example of their order.
arabic_name_patterns:
  description: |
    Arabic names follow complex conventions with multiple components:
    nasab (patronymic), nisba (geographic/tribal), kunya (teknonym), laqab (title/epithet).

  components:
    nasab:
      description: "Patronymic chain using ibn/bin (son) or bint (daughter)"
      examples:
        - "محمد بن علي بن حسن"
        - "Muhammad ibn Ali ibn Hasan"
      note: "Can extend multiple generations"

    nisba:
      description: "Geographic or tribal affiliation (adjective form, ends in -i)"
      examples:
        - "البغدادي (al-Baghdadi)"
        - "المصري (al-Misri)"
        - "الهاشمي (al-Hashimi)"

    kunya:
      description: "Teknonym (Abu/Umm + child's name)"
      examples:
        - "أبو عبد الله (Abu Abdullah)"
        - "أم كلثوم (Umm Kulthum)"
      note: "Often used as primary form of address"

    laqab:
      description: "Title, epithet, or nickname"
      examples:
        - "الرشيد (al-Rashid - the rightly guided)"
        - "المأمون (al-Ma'mun - the trustworthy)"

  # NOTE(review): placed beside `components` (the nesting level was lost in
  # the captured text) - confirm against the consumer of this file.
  parsing_order: |
    Traditional order: kunya - ism - nasab - laqab - nisba
    Example: Abu Bakr Muhammad ibn Zakariyya al-Razi
    - Kunya: Abu Bakr (father of Bakr)
    - Ism: Muhammad (given name)
    - Nasab: ibn Zakariyya (son of Zakariyya)
    - Nisba: al-Razi (from Ray, city in Persia)

# -----------------------------------------------------------------------------
# Hebrew Name Conventions
# -----------------------------------------------------------------------------

# Describes Hebrew given names, ben/bat patronymics and honorific
# abbreviations, plus notes on naming in marriage contracts (ketubot).
hebrew_name_patterns:
  description: |
    Hebrew names, especially in religious and historical documents, follow
    specific conventions with patronymics and honorifics.

  components:
    given_name:
      description: "First name (shem)"
      examples:
        - "משה (Moshe/Moses)"
        - "רבקה (Rivkah/Rebecca)"

    patronymic:
      description: "Son/daughter of (ben/bat)"
      examples:
        - "משה בן אברהם (Moshe ben Avraham)"
        - "רבקה בת יעקב (Rivkah bat Ya'akov)"
      note: "ben for males, bat for females"

    honorifics:
      # The abbreviations contain a gershayim written as an escaped double
      # quote, which is why these entries must stay double-quoted.
      examples:
        - "ר' (Rabbi)"
        - "הרב (HaRav - the Rabbi)"
        - "מו\"ר (Morenu - our teacher)"
        - "ז\"ל (zikhrono livrakha - of blessed memory)"
        - "ע\"ה (alav hashalom - peace be upon him)"

  # NOTE(review): placed beside `components` (the nesting level was lost in
  # the captured text) - confirm against the consumer of this file.
  ketubah_conventions:
    description: "Special naming in marriage contracts"
    notes:
      - "Full patronymics required for both parties"
      # NOTE(review): the gloss below identifies the two terms as the groom
      # and the bride, which does not match "Honorifics for fathers" - the
      # wording of this note probably needs correcting.
      - "Honorifics for fathers (החתן = the groom, הכלה = the bride)"
      - "Geographic origin often included"
      - "Hebrew date format (day of month, month, year from creation)"

# -----------------------------------------------------------------------------
# Spanish Colonial Name Conventions
# -----------------------------------------------------------------------------

# Describes Spanish given names, the paternal/maternal double surname,
# particles and titles, plus notes on what colonial-era records state.
spanish_name_patterns:
  description: |
    Spanish naming conventions, including colonial-era patterns with
    double surnames and titles.

  components:
    given_names:
      description: "First and middle names (often religious)"
      examples:
        - "María Guadalupe"
        - "José Antonio"
        - "Juan Pablo"

    paternal_surname:
      description: "Father's family name (apellido paterno)"
      note: "Listed first in double surname"

    maternal_surname:
      description: "Mother's maiden family name (apellido materno)"
      note: "Listed second in double surname"

    particles:
      examples:
        - "de"
        - "de la"
        - "del"
      note: "May indicate nobility or geographic origin"

    titles:
      examples:
        - "Don/Doña"
        - "Señor/Señora"
        - "Fray (friar)"
        - "Sor (sister)"

  # NOTE(review): placed beside `components` (the nesting level was lost in
  # the captured text) - confirm against the consumer of this file.
  colonial_patterns:
    notes:
      - "Racial designations (español, mestizo, indio, mulato) often recorded"
      - "Parish affiliation important"
      - "Godparents (padrinos) always named"
      - "Legitimacy (hijo legítimo/natural) specified"

# -----------------------------------------------------------------------------
# Italian Name Conventions
# -----------------------------------------------------------------------------

# Describes Italian given names, surnames, particles and honorifics, plus
# notes on how notarial documents identify a person.
italian_name_patterns:
  description: |
    Italian naming conventions with notarial and nobility elements.

  components:
    given_name:
      description: "Nome proprio"
      note: "Often saints' names"

    surname:
      description: "Cognome"
      note: "May derive from patronymics, locations, or professions"

    particles:
      examples:
        - "di"
        - "del"
        - "della"
        - "dei"
        - "da"
      note: "May indicate origin or noble lineage"

    honorifics:
      examples:
        - "Signore/Signora"
        - "Messer (medieval)"
        - "Ser (notarial)"
        - "Conte/Contessa"
        - "Marchese/Marchesa"

  # NOTE(review): placed beside `components` (the nesting level was lost in
  # the captured text) - confirm against the consumer of this file.
  notarial_conventions:
    notes:
      - "Father's name in genitive: 'figlio di Giovanni'"
      - "Profession often stated: 'mercante', 'notaio'"
      - "Parish or neighborhood: 'della parrocchia di San Marco'"
      - "Legal capacity: 'maggiore d'età' (of legal age)"

# -----------------------------------------------------------------------------
# Greek Name Conventions
# -----------------------------------------------------------------------------

# Describes Greek given names, genitive patronymics, surnames and honorifics,
# plus notes on Greek Orthodox naming customs.
greek_name_patterns:
  description: |
    Greek Orthodox naming conventions with genitive patronymics.

  components:
    given_name:
      description: "First name (often saint's name)"
      examples:
        - "Κωνσταντίνος (Konstantinos)"
        - "Μαρία (Maria)"

    patronymic:
      description: "Father's name in genitive case"
      examples:
        - "του Νικολάου (tou Nikolaou - son of Nikolaos)"
        - "του Δημητρίου (tou Dimitriou)"
      note: "Genitive case indicates 'of' or 'belonging to'"

    surname:
      description: "Family name"
      examples:
        - "Παπαδόπουλος (Papadopoulos)"
        - "Αντωνίου (Antoniou)"
      note: "May be patronymic origin (-opoulos, -ou, -ides)"

    honorifics:
      examples:
        - "Κύριος/Κυρία (Kyrios/Kyria - Mr./Mrs.)"
        - "Πατήρ (Patir - Father, for clergy)"
        - "Παπα- (Papa- - prefix for priests)"

  # NOTE(review): placed beside `components` (the nesting level was lost in
  # the captured text) - confirm against the consumer of this file.
  orthodox_conventions:
    notes:
      - "Name day (onomastics) important in Greek culture"
      - "Multiple given names common"
      - "Grandparents' names often passed down"

# -----------------------------------------------------------------------------
# Russian/Cyrillic Name Conventions
# -----------------------------------------------------------------------------

# Describes Russian given names, suffix-based patronymics and gendered
# surnames, plus notes on which combination is used in which register.
russian_name_patterns:
  description: |
    Russian naming conventions with formal patronymics.

  components:
    given_name:
      description: "First name (имя)"
      examples:
        - "Иван (Ivan)"
        - "Мария (Maria)"

    patronymic:
      description: "Father's name + suffix (отчество)"
      examples:
        - "Петрович (Petrovich - son of Pyotr)"
        - "Петровна (Petrovna - daughter of Pyotr)"
      note: "-ovich/-evich for males, -ovna/-evna for females"

    surname:
      description: "Family name (фамилия)"
      note: "Gendered: -ov/-ova, -in/-ina, -sky/-skaya"

  # NOTE(review): placed beside `components` (the nesting level was lost in
  # the captured text) - confirm against the consumer of this file.
  formal_usage:
    notes:
      - "Formal address: given name + patronymic"
      - "Informal: given name or diminutive"
      - "Full official: surname, given name, patronymic"

# -----------------------------------------------------------------------------
# Ottoman Turkish Name Conventions
# -----------------------------------------------------------------------------

# Describes Ottoman given names, oğlu/kızı patronymics, epithets and nisba
# forms, plus notes on how court records (sicil) identify people.
ottoman_name_patterns:
  description: |
    Ottoman Turkish naming conventions blending Arabic and Turkish elements.

  components:
    given_name:
      description: "Primary name (often Arabic origin)"
      examples:
        - "Mehmed"
        - "Ahmed"
        - "Fatma"

    patronymic:
      description: "Father's name with 'oğlu' (son of) or 'kızı' (daughter of)"
      examples:
        - "Ali oğlu Mehmed"
        - "Hasan oğlu Ahmed"

    epithet:
      description: "Title or descriptor (laqab)"
      examples:
        - "Paşa (Pasha)"
        - "Efendi"
        - "Ağa"
        - "Bey"
        - "Hatun/Hanım (for women)"

    nisba:
      description: "Geographic origin or profession"
      examples:
        - "Kayserili (from Kayseri)"
        - "Bakkal (grocer)"

  # NOTE(review): placed beside `components` (the nesting level was lost in
  # the captured text) - confirm against the consumer of this file.
  sijill_conventions:
    notes:
      - "Court records (sicil) use formal full names"
      - "Witnesses identified by profession and address"
      - "Deceased marked as 'merhum/merhume'"
      - "Non-Muslims identified by religious community (millet)"