Add test script for PiCo extraction from Arabic waqf documents

- Implemented a new script `test_pico_arabic_waqf.py` to test the GLM annotator's ability to extract person observations from Arabic historical documents.
- The script includes environment variable handling for API token, structured prompts for the GLM API, and validation of extraction results.
- Added comprehensive logging for API responses, extraction results, and validation errors.
- Included a sample Arabic waqf text for testing purposes, following the PiCo ontology pattern.
This commit is contained in:
kempersc 2025-12-12 17:50:17 +01:00
parent b1f93b6f22
commit 505c12601a
84 changed files with 19370 additions and 2597 deletions

View file

@ -535,7 +535,8 @@ async def get_institutions(
social_instagram,
wikidata_label_en,
wikidata_description_en,
logo_url
logo_url,
web_claims
FROM custodians
WHERE {where_clause}
ORDER BY name
@ -620,6 +621,10 @@ async def get_institutions(
if row['logo_url']:
props["logo_url"] = row['logo_url']
# Web claims (financial documents, etc.)
if row['web_claims']:
props["web_claims"] = row['web_claims']
features.append({
"type": "Feature",
"geometry": {

View file

@ -848,6 +848,28 @@ async def get_profile(
if isinstance(profile_data, str):
profile_data = json.loads(profile_data)
# Transform experience → career_history for frontend compatibility
# The database stores 'experience' but frontend expects 'career_history'
inner_profile = profile_data.get('profile_data', {})
if inner_profile and 'experience' in inner_profile and 'career_history' not in inner_profile:
experience = inner_profile.get('experience', [])
if experience:
# Map field names: title→role, company→organization, duration→dates
career_history = []
for job in experience:
career_item = {
'role': job.get('title'),
'organization': job.get('company'),
'dates': job.get('duration'),
'location': job.get('location'),
'description': job.get('description'),
'company_size': job.get('company_details'),
'current': job.get('current', False),
}
career_history.append(career_item)
inner_profile['career_history'] = career_history
profile_data['profile_data'] = inner_profile
return ProfileResponse(
profile_data=profile_data,
linkedin_slug=result['linkedin_slug'],
@ -867,8 +889,30 @@ async def get_profile(
try:
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
file_profile_data = data.get('profile_data', {})
# Transform experience → career_history for frontend compatibility
inner_profile = file_profile_data.get('profile_data', {})
if inner_profile and 'experience' in inner_profile and 'career_history' not in inner_profile:
experience = inner_profile.get('experience', [])
if experience:
career_history = []
for job in experience:
career_item = {
'role': job.get('title'),
'organization': job.get('company'),
'dates': job.get('duration'),
'location': job.get('location'),
'description': job.get('description'),
'company_size': job.get('company_details'),
'current': job.get('current', False),
}
career_history.append(career_item)
inner_profile['career_history'] = career_history
file_profile_data['profile_data'] = inner_profile
return ProfileResponse(
profile_data=data.get('profile_data', {}),
profile_data=file_profile_data,
linkedin_slug=linkedin_slug,
extraction_date=data.get('exa_search_metadata', {}).get('enrichment_timestamp'),
updated_date=None,

View file

@ -99,20 +99,26 @@ class Settings:
cache_ttl: int = int(os.getenv("CACHE_TTL", "900")) # 15 minutes
# Qdrant Vector DB
# Production: Use URL-based client via bronhouder.nl/qdrant reverse proxy
qdrant_host: str = os.getenv("QDRANT_HOST", "localhost")
qdrant_port: int = int(os.getenv("QDRANT_PORT", "6333"))
qdrant_use_production: bool = os.getenv("QDRANT_USE_PRODUCTION", "false").lower() == "true"
qdrant_use_production: bool = os.getenv("QDRANT_USE_PRODUCTION", "true").lower() == "true"
qdrant_production_url: str = os.getenv("QDRANT_PRODUCTION_URL", "https://bronhouder.nl/qdrant")
# Oxigraph SPARQL
sparql_endpoint: str = os.getenv("SPARQL_ENDPOINT", "http://localhost:7878/query")
# Production: Use bronhouder.nl/sparql reverse proxy
sparql_endpoint: str = os.getenv("SPARQL_ENDPOINT", "https://bronhouder.nl/sparql")
# TypeDB
# Note: TypeDB not exposed via reverse proxy - always use localhost
typedb_host: str = os.getenv("TYPEDB_HOST", "localhost")
typedb_port: int = int(os.getenv("TYPEDB_PORT", "1729"))
typedb_database: str = os.getenv("TYPEDB_DATABASE", "heritage_custodians")
typedb_use_production: bool = os.getenv("TYPEDB_USE_PRODUCTION", "false").lower() == "true" # Default off
# PostGIS
postgis_url: str = os.getenv("POSTGIS_URL", "http://localhost:8001")
# PostGIS/Geo API
# Production: Use bronhouder.nl/api/geo reverse proxy
postgis_url: str = os.getenv("POSTGIS_URL", "https://bronhouder.nl/api/geo")
# LLM Configuration
anthropic_api_key: str = os.getenv("ANTHROPIC_API_KEY", "")
@ -408,7 +414,7 @@ class MultiSourceRetriever:
if self._typedb is None and RETRIEVERS_AVAILABLE:
try:
self._typedb = create_typedb_retriever(
use_production=settings.qdrant_use_production
use_production=settings.typedb_use_production # Use TypeDB-specific setting
)
except Exception as e:
logger.warning(f"Failed to initialize TypeDB: {e}")
@ -686,7 +692,9 @@ async def lifespan(app: FastAPI):
retriever = MultiSourceRetriever()
if RETRIEVERS_AVAILABLE:
viz_selector = VisualizationSelector(use_dspy=bool(settings.anthropic_api_key))
# Check for any available LLM API key (Anthropic preferred, OpenAI fallback)
has_llm_key = bool(settings.anthropic_api_key or settings.openai_api_key)
viz_selector = VisualizationSelector(use_dspy=has_llm_key)
# Configure DSPy if API key available
if settings.anthropic_api_key:
@ -697,7 +705,16 @@ async def lifespan(app: FastAPI):
api_key=settings.anthropic_api_key,
)
except Exception as e:
logger.warning(f"Failed to configure DSPy: {e}")
logger.warning(f"Failed to configure DSPy with Anthropic: {e}")
elif settings.openai_api_key:
try:
configure_dspy(
provider="openai",
model="gpt-4o-mini",
api_key=settings.openai_api_key,
)
except Exception as e:
logger.warning(f"Failed to configure DSPy with OpenAI: {e}")
logger.info("Heritage RAG API started")
@ -1068,7 +1085,7 @@ if __name__ == "__main__":
uvicorn.run(
"main:app",
host="0.0.0.0",
port=8002,
port=8003,
reload=settings.debug,
log_level="info",
)

View file

@ -13,7 +13,7 @@
"name": "Alexandra Nederlof",
"linkedin_url": "https://www.linkedin.com/in/alexandra-nederlof-74b7a341",
"headline": "Junior Papierrestaurator bij Rijksmuseum",
"location": "Ik ben verliefd! En wel op mijn vak als papierrestaurator. De mogelijkheid om bij instellingen als musea, archieven of bibliotheken fysiek te kunnen helpen met het behouden van het papieren cultureel erfgoed geeft een geweldige voldoening. Daarnaast is het restaureren voor de particuliere klanten voor mij ook een waar genoegen: ervoor kunnen zorgen dat een klant weer optimaal van zijn kunstobject kan genieten of een brief weer kan lezen. Ik heb meegewerkt aan uiteenlopende projecten. Ik heb daardoor een brede ervaring opgedaan met het behandelen van verschillende soorten objecten. Van poster tot landkaart, van pastel tot papier maché, van boek tot botanisch model. Hierdoor heb ik een heel scala aan verantwoorde behandelmethoden mij eigen kunnen maken. Op dit moment werk ik als junior papierrestaurator bij het Rijksmuseum.",
"location": null,
"connections": "428 connections",
"about": "Ik ben verliefd! En wel op mijn vak als papierrestaurator. De mogelijkheid om bij instellingen als musea, archieven of bibliotheken fysiek te kunnen helpen met het behouden van het papieren cultureel erfgoed geeft een geweldige voldoening. Daarnaast is het restaureren voor de particuliere klanten voor mij ook een waar genoegen: ervoor kunnen zorgen dat een klant weer optimaal van zijn kunstobject kan genieten of een brief weer kan lezen. Ik heb meegewerkt aan uiteenlopende projecten. Ik heb daardoor een brede ervaring opgedaan met het behandelen van verschillende soorten objecten. Van poster tot landkaart, van pastel tot papier maché, van boek tot botanisch model. Hierdoor heb ik een heel scala aan verantwoorde behandelmethoden mij eigen kunnen maken. Op dit moment werk ik als junior papierrestaurator bij het Rijksmuseum.",
"summary": "Alexandra Nederlof is a Junior Papierrestaurator at the Rijksmuseum in Amsterdam, where she is passionate about preserving paper cultural heritage for museums, archives, and private clients. She has extensive experience in restoring a variety of objects, including posters, maps, and books, and has developed a range of responsible treatment methods. Nederlof has also contributed to publications on topics related to art and restoration.",

View file

@ -13,7 +13,7 @@
"name": "angela dellebeke",
"linkedin_url": "https://www.linkedin.com/in/angela-dellebeke-87289018",
"headline": "nationaal archief /national archives of the Netherlands",
"location": "The blue shield is the protective emblem specified in the 1954 Hague Convention (Convention for the Protection of Cultural Property in the Event of Armed Conflict) for marking cultural sites to give them protection from attack in the event of armed conflict. The Blue Shield network consists of organizations dealing with museums, archives, audiovisual supports, libraries, as well as monuments and sites. BLUE SHIELD NEDERLAND richt zich op de bescherming van Nederlands cultureel erfgoed tegen de bedreigingen die het gevolg zijn van natuurrampen, molest en militaire handelingen, en op het organiseren van nationale en internationale hulp. Show less",
"location": "The Hague, Netherlands",
"connections": "500 connections • 852 followers",
"about": "veiligheidszorg collectie / collectie hulpverlening/ preventieve conservering /vraagstukken beheer en behoud/ calamiteitenplan/-organisatie/ selectievraagstukken /acquisitie & beschrijven van archieven / bedrijfshulpverlening/ crisisbeheersing safety&security collections/ emergency preparedness and hazard mitigation / emergency response / crisismanagement / cultural property protection / hague convention 1954",
"summary": "Angela Dellebeke is a consultant specializing in emergency preparedness and hazard mitigation at the Nationaal Archief (National Archives of the Netherlands) in The Hague. With over 22 years of experience, she focuses on safety and security for cultural property, crisis management, and the preservation of archives. Dellebeke also serves as Secretary-General for Blue Shield Nederland, an organization dedicated to protecting cultural heritage during conflicts and disasters. She holds a Master of Arts in American Studies from Utrecht University and has published work on theft and misappropriation in archives.",

View file

@ -13,7 +13,7 @@
"name": "Annemarijne Moreu",
"linkedin_url": "https://www.linkedin.com/in/annemarijnemoreu",
"headline": "Sr Projectmanager bij Nationaal Archief",
"location": "Ik ben een daadkrachtige en resultaatgerichte product owner en projectmanager, met brede ervaring op het gebied van B2B en B2C (online) projectmanagement, agile werken, marketing en communicatie. Ik ben goed in staat klantbehoeften centraal te stellen. Samenwerken met verschillende mensen, afdelingen en niveaus en gezamenlijk realiseren van doelen en implementeren van projecten gaat mij goed af. In mijn werk ben ik planmatig sterk, communicatief vaardig, zelfstandig en flexibel. Ik word blij van klantcontacten, aanpakken, samenwerken, afwisseling en verantwoordelijkheid nemen. Als persoon ben ik sociaal, ondernemend, positief en sportief.",
"location": "The Hague, Netherlands",
"connections": "500 connections • 860 followers",
"about": "Ik ben een daadkrachtige en resultaatgerichte product owner en projectmanager, met brede ervaring op het gebied van B2B en B2C (online) projectmanagement, agile werken, marketing en communicatie. Ik ben goed in staat klantbehoeften centraal te stellen. Samenwerken met verschillende mensen, afdelingen en niveaus en gezamenlijk realiseren van doelen en implementeren van projecten gaat mij goed af. In mijn werk ben ik planmatig sterk, communicatief vaardig, zelfstandig en flexibel. Ik word blij van klantcontacten, aanpakken, samenwerken, afwisseling en verantwoordelijkheid nemen. Als persoon ben ik sociaal, ondernemend, positief en sportief.",
"summary": "Annemarijne Moreu is a Senior Project Manager at the Nationaal Archief in The Hague, Netherlands, with over 29 years of experience in project management, particularly in B2B and B2C environments. Her expertise includes agile methodologies, marketing, and communication, focusing on customer needs and collaboration across various departments. Currently, she manages projects related to data accessibility, service delivery, and project management optimization at the Nationaal Archief. Previously, she held roles at Gemeente Rotterdam and PostNL, where she led various IT and process optimization projects, demonstrating her strong planning, communication, and leadership skills. Moreu is known for her proactive and social approach to work.",

View file

@ -13,7 +13,7 @@
"name": "Anne Martens",
"linkedin_url": "https://www.linkedin.com/in/annemartens1",
"headline": "Communicatieadviseur educatie",
"location": "Als freelance journalist heb ik een verhalenradar die altijd aanstaat. Als ik een verhaal op het spoor ben, dan kan ik niet anders dan dat verhaal uitpluizen en delen met krantenlezers, radioluisteraars of televisiekijkers. Ik ben niet bang om in complexe materie te duiken, onbekende vakgebieden te verkennen en wetenschappelijke publicaties en experts te raadplegen. Ik maak verhalen voor NRC Handelsblad, NEMOKennislink.nl, Antoni van Leeuwenhoekziekenhuis, de NTR en de VPRO. Onderwerpen: biologie, aardwetenschappen, medische ethiek, geneeskunde en fertiliteit. Show less",
"location": "Netherlands",
"connections": "500 connections • 925 followers",
"about": "Total Experience: 17 years",
"summary": "Anne Martens is a seasoned communication advisor specializing in education, currently working at the Nationaal Archief in the Netherlands. With 17 years of experience, she has a diverse background that includes freelance journalism, where she has contributed to various prominent publications such as NRC Handelsblad and NEMOKennislink.nl. Her journalistic work has focused on complex topics in biology, earth sciences, medical ethics, and medicine. Martens has also produced content for science programs and radio documentaries, showcasing her ability to engage with intricate subjects and communicate them effectively to the public.",

View file

@ -13,7 +13,7 @@
"name": "Arjan Diepeveen",
"linkedin_url": "https://www.linkedin.com/in/arjan-diepeveen-73b21640",
"headline": "Senior Test Automation Engineer at Nationaal Archief",
"location": "Arjan is een betrouwbare, professionele en collegiale medewerker die zowel zelfstandig als in een team goed presteert. Hij staat voor kwaliteit, zonder daarbij de kwantiteit uit het oog te verliezen. Hij is bereid om net die extra stap te zetten om het maximale resultaat te behalen. Arjan is van origine een echte techneut. Hij heeft in zijn loopbaan een zeer uitgebreide en gefundeerde hoeveelheid technische kennis opgedaan in verschillende functies en verschillende branches. Met zijn oog voor kwaliteit heeft hij zich in de loop van de jaren meer en meer verdiept in het testen en ontwikkeld als test consultant. Zijn kracht ligt in het heel snel in kaart kunnen brengen en doorgronden van zeer complexe omgevingen en systemen. Nieuwe en onbekende dingen maakt hij zich razendsnel eigen. Hij weet daarbij als geen andere deze kennis over te dragen door complexe zaken te vertalen naar een begrijpelijk niveau. Onder hectische stressvolle situaties blijft Arjan uitermate rustig, flexibel en analytisch. Hij gaat graag de uitdaging aan.",
"location": "Netherlands",
"connections": "374 connections • 380 followers",
"about": "Arjan is een betrouwbare, professionele en collegiale medewerker die zowel zelfstandig als in een team goed presteert. Hij staat voor kwaliteit, zonder daarbij de kwantiteit uit het oog te verliezen. Hij is bereid om net die extra stap te zetten om het maximale resultaat te behalen. Arjan is van origine een echte techneut. Hij heeft in zijn loopbaan een zeer uitgebreide en gefundeerde hoeveelheid technische kennis opgedaan in verschillende functies en verschillende branches. Met zijn oog voor kwaliteit heeft hij zich in de loop van de jaren meer en meer verdiept in het testen en ontwikkeld als test consultant. Zijn kracht ligt in het heel snel in kaart kunnen brengen en doorgronden van zeer complexe omgevingen en systemen. Nieuwe en onbekende dingen maakt hij zich razendsnel eigen. Hij weet daarbij als geen andere deze kennis over te dragen door complexe zaken te vertalen naar een begrijpelijk niveau. Onder hectische stressvolle situaties blijft Arjan uitermate rustig, flexibel en analytisch. Hij gaat graag de uitdaging aan.",
"summary": "Arjan Diepeveen is a Senior Test Automation Engineer at Nationaal Archief in the Netherlands, with over 26 years of experience in various technical roles. He excels in both independent and team settings, emphasizing quality while maintaining efficiency. His expertise includes test automation using tools like Robot Framework, Selenium, and Docker, and he works within SCRUM/Agile/DevOps teams to develop custom solutions for the National Archives. Arjan has also held positions at Rijkswaterstaat and Nederlandse Spoorwegen, focusing on technical safety and infrastructure testing. He is known for his ability to quickly understand complex systems and effectively communicate technical concepts.",

View file

@ -13,7 +13,7 @@
"name": "Roger Mous",
"linkedin_url": "https://www.linkedin.com/in/roger-mous-203b2922a",
"headline": "Floormanager afdeling Organisatie en Presentatie (O&P), Nationaal Archief",
"location": "Enthousiast, kwaliteitsgericht, stressbestendige, gemotiveerde professional. Als sturende en motiverend persoon zet ik mij in zodat het team kan voldoen aan de hoogste eisen. Het coördineren, aansturen van medewerkers en zorgen voor kennisoverdracht.",
"location": "Den Haag, Zuid-Holland, Nederland",
"connections": "278 connections • 281 followers",
"about": "Enthousiast, kwaliteitsgericht, stressbestendige, gemotiveerde professional. Als sturende en motiverend persoon zet ik mij in zodat het team kan voldoen aan de hoogste eisen. Het coördineren, aansturen van medewerkers en zorgen voor kennisoverdracht.",
"summary": "Roger Mous is currently the Floormanager at the Nationaal Archief in The Hague, Netherlands, with nearly 35 years of professional experience. He is known for his enthusiastic, quality-oriented, and stress-resistant approach, focusing on team coordination and knowledge transfer. His career includes various roles at the Nationaal Archief and the Royal House, where he managed logistics, events, and catering services. Mous has a background in military service and education in facility management and hospitality. He has held multiple managerial positions, demonstrating strong leadership and organizational skills.",

View file

@ -0,0 +1,635 @@
# Provenance Sources for PiCo Historical Document Examples
This document provides detailed provenance information for the real historical document sources used in the PiCo (Persons in Context) ontology integration examples within the CH-Annotator convention.
**Last Updated**: 2025-12-12
**Author**: GLAM Project
**Version**: 1.0.0
---
## Table of Contents
1. [Hebrew Ketubah (Jewish Marriage Contracts)](#1-hebrew-ketubah-jewish-marriage-contracts)
2. [Arabic Waqf Documents (Islamic Endowments)](#2-arabic-waqf-documents-islamic-endowments)
3. [Ottoman Turkish Sijill (Sharia Court Registers)](#3-ottoman-turkish-sijill-sharia-court-registers)
4. [Russian Metrical Books (Church Records)](#4-russian-metrical-books-church-records)
5. [Spanish Colonial Baptism Records](#5-spanish-colonial-baptism-records)
6. [Italian Notarial Records](#6-italian-notarial-records)
7. [Greek Orthodox Church Records](#7-greek-orthodox-church-records)
8. [Dutch Civil Registry Records](#8-dutch-civil-registry-records)
9. [License and Attribution Requirements](#9-license-and-attribution-requirements)
---
## 1. Hebrew Ketubah (Jewish Marriage Contracts)
### 1.1 Yale Beinecke Library - Mashhad Ketubah (1896)
| Field | Value |
|-------|-------|
| **Archive** | Yale University, Beinecke Rare Book & Manuscript Library |
| **Collection** | Hebrew Manuscripts Supplement |
| **Call Number** | Hebrew MSS suppl 194 |
| **Digital URL** | https://digital.library.yale.edu/catalog/2067542 |
| **Document Type** | Ketubah (Jewish marriage contract) |
| **Date** | 23 Elul 5656 (September 1, 1896 CE) |
| **Place** | Mashhad, Iran |
| **Language** | Hebrew, Aramaic |
| **Access Date** | 2025-12-12 |
| **License** | Public Domain (pre-1929) |
**Persons Identified:**
- **Groom**: Mosheh ben Mashiah (משה בן משיאח)
- **Bride**: Rivkah bat Ya'akov (רבקה בת יעקב)
**Notes**: This ketubah is from the crypto-Jewish community of Mashhad, known as the Jadid al-Islam, who maintained Jewish practices in secret after forced conversion in 1839. The document follows standard Sephardic/Mizrahi ketubah format.
---
### 1.2 Philadelphia Mikveh Israel Ketubah (1842)
| Field | Value |
|-------|-------|
| **Archive** | Congregation Mikveh Israel, Philadelphia |
| **Collection** | Philadelphia Congregations Records |
| **Digital URL** | https://philadelphiacongregations.org/records/item/MikvehIsrael.MarriageCertificate1842 |
| **Document Type** | Ketubah (Jewish marriage contract) |
| **Date** | 1842 CE |
| **Place** | Philadelphia, Pennsylvania, USA |
| **Language** | Aramaic (traditional text), English (translation provided) |
| **Access Date** | 2025-12-12 |
| **License** | Educational use permitted |
**Key Features:**
- Full Aramaic text transcription available
- English translation provided by archive
- Example of American Sephardic ketubah format
**Sample Aramaic Text** (from source):
```
בשבת... בשבת... יום... לחדש... שנת... לבריאת עולם למנין שאנו מונין כאן...
איך החתן... בר... אמר לה להדא בתולתא... בת...
```
---
### 1.3 College of Charleston Ketubah (1908)
| Field | Value |
|-------|-------|
| **Archive** | College of Charleston, Special Collections |
| **Collection** | Jewish Heritage Collection |
| **Document Type** | Ketubah |
| **Date** | 1908 CE |
| **Language** | Hebrew, Aramaic |
| **Access Date** | 2025-12-12 |
**Persons Identified:**
- **Bride**: Esther Devorah bat Rabbi Abraham (אסתר דבורה בת ר׳ אברהם)
- **Groom**: Rabbi Yitzchak (ר׳ יצחק)
---
### 1.4 Rhodes Jewish Museum Collection
| Field | Value |
|-------|-------|
| **Archive** | Rhodes Jewish Museum |
| **Location** | Rhodes, Greece |
| **Collection** | Historical Documents |
| **Document Types** | Ketubot, community records |
| **Period** | 19th-20th century |
| **Language** | Ladino, Hebrew, Greek |
**Notes**: Documents from the historic Sephardic Jewish community of Rhodes, with unique Ladino elements.
---
## 2. Arabic Waqf Documents (Islamic Endowments)
### 2.1 Cambridge Digital Library - Islamic Collections
| Field | Value |
|-------|-------|
| **Archive** | Cambridge University Library |
| **Collection** | Islamic Manuscripts |
| **Digital URL** | https://cudl.lib.cam.ac.uk/collections/islamic |
| **Document Types** | Waqfiyya, legal documents, correspondence |
| **Period** | 8th-20th century CE |
| **Languages** | Arabic, Persian, Ottoman Turkish |
| **License** | CC BY-NC 4.0 |
| **Access Date** | 2025-12-12 |
**Key Collections:**
- Genizah Collection (Cairo Genizah fragments)
- Arabic Scientific Manuscripts
- Islamic Legal Documents
---
### 2.2 UPenn OPenn - Manuscripts of the Muslim World
| Field | Value |
|-------|-------|
| **Archive** | University of Pennsylvania Libraries |
| **Collection** | Manuscripts of the Muslim World |
| **Digital URL** | https://openn.library.upenn.edu/html/muslimworld_contents.html |
| **Document Types** | Waqfiyya, Quranic manuscripts, legal documents |
| **Period** | 9th-20th century CE |
| **Languages** | Arabic, Persian, Ottoman Turkish |
| **License** | Public Domain / CC0 |
| **Access Date** | 2025-12-12 |
**Notable Holdings:**
- Waqfiyya documents from Egypt, Syria, Turkey
- Legal formularies with waqf templates
- Property deeds and endowment records
---
### 2.3 Singapore National Heritage Board - Istanbul Waqf
| Field | Value |
|-------|-------|
| **Archive** | Singapore National Heritage Board |
| **Collection** | Roots.gov.sg |
| **Accession Number** | 1115401 |
| **Digital URL** | https://www.roots.gov.sg/Collection-Landing/listing/1115401 |
| **Document Type** | Waqf document |
| **Donor/Creator** | Muhammad b. Abd al-Ghani (محمد بن عبد الغني) |
| **Properties** | Istanbul (various locations) |
| **Language** | Ottoman Turkish, Arabic |
| **Access Date** | 2025-12-12 |
**Key Features:**
- Complete waqf document with property descriptions
- Lists endowed properties in Istanbul
- Named beneficiaries and conditions
---
### 2.4 Haseki Sultan Waqfiyya (1552 CE)
| Field | Value |
|-------|-------|
| **Archive** | Various (studied in UC Berkeley eScholarship) |
| **Document Type** | Waqfiyya (imperial endowment deed) |
| **Date** | 1552 CE |
| **Founder** | Haseki Hürrem Sultan (Roxelana) |
| **Language** | Ottoman Turkish, Arabic |
| **Research Source** | UC Berkeley eScholarship (direct URL not recorded) |
**Significance**: One of the largest waqf endowments in Ottoman history, establishing charitable institutions across the empire.
---
## 3. Ottoman Turkish Sijill (Sharia Court Registers)
### 3.1 OpenJerusalem Project - Jerusalem Sharia Court Registers
| Field | Value |
|-------|-------|
| **Archive** | OpenJerusalem Project |
| **Collection** | Jerusalem Sharia Court Registers |
| **Digital URL** | https://www.openjerusalem.org/ |
| **ARK Identifier** | ark:/58142/PfV7b |
| **Volume Count** | 102 registers |
| **Period** | 1834-1920 CE |
| **Language** | Ottoman Turkish, Arabic |
| **License** | Open Access |
| **Access Date** | 2025-12-12 |
**Document Types:**
- Property sales (بيع)
- Marriage contracts (نكاح)
- Inheritance divisions (قسمة)
- Waqf registrations
- Debt acknowledgments (إقرار)
- Court testimonies (شهادة)
**Key Features:**
- Searchable database with document transcriptions
- Photographs of original registers
- Multi-language metadata (Arabic, English, French)
---
### 3.2 ISAM Istanbul Kadi Registers (Kadı Sicilleri)
| Field | Value |
|-------|-------|
| **Archive** | İslam Araştırmaları Merkezi (ISAM) |
| **Collection** | Istanbul Kadı Sicilleri |
| **Digital URL** | http://www.kadisicilleri.org/ |
| **Volume Count** | 40+ volumes online |
| **Document Count** | 40,000+ documents |
| **Period** | 16th-19th century CE |
| **Language** | Ottoman Turkish |
| **License** | Research access |
| **Access Date** | 2025-12-12 |
**Coverage:**
- Istanbul courts (multiple districts)
- Galata, Üsküdar, Eyüp
- Complete transcriptions with original images
---
### 3.3 Istanbul Historical Kadi Registers Corpus
| Field | Value |
|-------|-------|
| **Archive** | Istanbul Metropolitan Municipality |
| **Project** | History of Istanbul |
| **Digital URL** | https://istanbultarihi.ist/434-istanbul-sharia-court-registers |
| **Volume Count** | ~10,000 volumes |
| **Courts** | 26 different courts |
| **Period** | 1453-1922 CE |
| **Language** | Ottoman Turkish |
**Significance**: Largest collection of Ottoman court records in existence.
---
### 3.4 Harvard Ottoman Court Records Project
| Field | Value |
|-------|-------|
| **Archive** | Harvard University |
| **Project** | Ottoman Court Records Project (OCRP) |
| **Digital URL** | https://cmes.fas.harvard.edu/projects/ocrp |
| **Document Types** | Sijill transcriptions, translations |
| **Period** | 16th-19th century CE |
| **Languages** | Ottoman Turkish (original), English (translations) |
---
### 3.5 Bulgarian National Library - Ottoman Sijills
| Field | Value |
|-------|-------|
| **Archive** | Bulgarian National Library |
| **Collection** | Oriental Department |
| **Sijill Count** | 160+ volumes |
| **Defter Count** | 1000+ registers |
| **Coverage** | Bulgarian Ottoman provinces |
| **Period** | 16th-19th century CE |
| **Language** | Ottoman Turkish, Arabic |
---
## 4. Russian Metrical Books (Church Records)
### 4.1 BYU Script Tutorial - Russian Metrical Books
| Field | Value |
|-------|-------|
| **Institution** | Brigham Young University |
| **Project** | Script Tutorial |
| **Digital URL** | https://script.byu.edu/russian-handwriting/documents/record-types/metrical-books/births |
| **Document Type** | Tutorial with real transcription examples |
| **Languages** | Russian (Cyrillic), English (translation) |
| **License** | Educational use |
| **Access Date** | 2025-12-12 |
**Content Includes:**
- Complete birth record format explanation
- Vocabulary lists with translations
- Sample transcriptions from actual metrical books
- Handwriting recognition guides
**Sample Birth Record Structure** (from tutorial):
```
В метрической книге записано:
Родился: [date]
Крещён: [date]
Имя: [name]
Родители: [father's full name with rank/status], законная жена его [mother's name]
Восприемники: [godparents]
Священник: [officiating priest]
```
---
### 4.2 FamilySearch Russia Church Records
| Field | Value |
|-------|-------|
| **Archive** | FamilySearch |
| **Collection** | Russia Church Records |
| **Wiki URL** | https://www.familysearch.org/en/wiki/Russia_Church_Records |
| **Document Types** | Metrical books (births, marriages, deaths) |
| **Period** | 1722-1918 CE |
| **Languages** | Russian, Church Slavonic |
| **Access** | Free with registration |
**Key Information:**
- Metrical books (метрические книги) mandated from 1722
- Three-part structure: births/baptisms, marriages, deaths
- Contains estate/class (сословие) information
---
### 4.3 Polish Archives - Kłobuck Parish Records
| Field | Value |
|-------|-------|
| **Archive** | Szukaj w Archiwach (Polish State Archives) |
| **Parish** | Kłobuck |
| **Document Type** | Roman Catholic metrical books |
| **Period** | 18th-19th century |
| **Languages** | Latin, Polish, Russian |
**Notes**: Example of Polish parish records from the Russian partition era, with parallel Latin/Russian entries.
---
### 4.4 RGIA St. Petersburg
| Field | Value |
|-------|-------|
| **Archive** | Russian State Historical Archive (RGIA) |
| **Location** | St. Petersburg, Russia |
| **Holdings** | 300+ metrical books |
| **Period** | 1832-1892 CE |
| **Document Types** | Orthodox, Catholic, Lutheran, Jewish metrical books |
---
## 5. Spanish Colonial Baptism Records
### 5.1 BYU Script Tutorial - Spanish Colonial Baptisms
| Field | Value |
|-------|-------|
| **Institution** | Brigham Young University |
| **Project** | Script Tutorial |
| **Digital URL** | https://script.byu.edu/spanish-handwriting/documents/church-records/baptisms |
| **Document Type** | Tutorial with real transcription examples |
| **Languages** | Spanish (colonial), English |
| **License** | Educational use |
| **Access Date** | 2025-12-12 |
**Standard Baptism Entry Structure:**
```
En [place] a [date] bauticé solemnemente a [name], [legitimacy status] de [father] y de [mother].
Fueron padrinos [godparents].
Y para que conste lo firmo.
[Priest signature]
```
**Key Vocabulary:**
- hijo/hija legítimo/a = legitimate child
- hijo/hija natural = illegitimate child
- párvulo/a = infant
- español/a, indio/a, mestizo/a, mulato/a = casta categories
- padrinos/madrinas = godparents
---
### 5.2 FamilySearch Mexico - Yucatán Catholic Church Records
| Field | Value |
|-------|-------|
| **Archive** | FamilySearch |
| **Collection** | Mexico, Yucatán, Catholic Church Records, 1543-1977 |
| **Collection ID** | 1909116 |
| **Digital URL** | https://www.familysearch.org/en/search/collection/1909116 |
| **Period** | 1543-1977 CE |
| **Document Types** | Baptisms, marriages, deaths, confirmations |
| **Language** | Spanish, Latin, Maya |
| **Access** | Free with registration |
**Coverage:**
- 200+ parishes
- Some of the earliest New World records (from 1543)
- Indigenous Maya populations
---
### 5.3 Archivo General de la Nación (AGN) Mexico
| Field | Value |
|-------|-------|
| **Archive** | Archivo General de la Nación |
| **Location** | Mexico City, Mexico |
| **Holdings** | Colonial parish records, civil registry |
| **Period** | 16th-20th century CE |
| **Languages** | Spanish, Nahuatl, Latin |
---
## 6. Italian Notarial Records
### 6.1 Antenati - Italian State Archives Portal
| Field | Value |
|-------|-------|
| **Archive** | Italian Ministry of Culture |
| **Project** | Antenati (Ancestors) |
| **Digital URL** | https://antenati.cultura.gov.it/ |
| **Venice URL** | https://antenati.cultura.gov.it/archivio/state-archives-of-venezia/?lang=en |
| **Document Types** | Civil registry, notarial acts, parish records |
| **Period** | 1806-present (civil); 15th century+ (notarial) |
| **Languages** | Italian, Latin, Venetian |
| **License** | Open Access |
| **Access Date** | 2025-12-12 |
**Venice State Archive Holdings:**
- Civil Registry (Stato Civile) 1806-1815 (Napoleonic period)
- Notarial archives (Archivio Notarile)
- Guild records (Arti e Mestieri)
---
### 6.2 OAC California Digital Library - Italian Notarial Documents
| Field | Value |
|-------|-------|
| **Archive** | University of California Libraries |
| **Collection** | Italian Notarial Documents Collection |
| **Finding Aid** | https://oac.cdlib.org/findaid/ark:%2F13030%2Fc8v412zd |
| **Document Count** | 168 documents |
| **Period** | 1465-1635 CE |
| **Locations** | Venice, Padua, Verona |
| **Languages** | Latin, Italian (Venetian) |
| **Access Date** | 2025-12-12 |
**Document Types:**
- Contracts (contratti)
- Wills (testamenti)
- Property transfers
- Marriage agreements (sponsalia)
- Business partnerships
---
### 6.3 SION-Digit Project - Jewish Notarial Records
| Field | Value |
|-------|-------|
| **Project** | SION-Digit (Sources for the History of Italian Jewish Notarial Documents) |
| **Coverage** | Venice, Bordeaux, Amsterdam |
| **Period** | 16th-18th century CE |
| **Focus** | Jewish community notarial acts |
| **Languages** | Italian, Hebrew, Ladino |
---
## 7. Greek Orthodox Church Records
### 7.1 FamilySearch Greece Church Records
| Field | Value |
|-------|-------|
| **Archive** | FamilySearch |
| **Wiki URL** | https://www.familysearch.org/en/wiki/Greece_Church_Records |
| **Document Types** | Baptisms, marriages, deaths |
| **Period** | 17th century - 1925 CE |
| **Language** | Greek |
| **Access** | Free with registration |
**Key Information:**
- Greek Orthodox church records are the primary source for the period before civil registration began in 1925
- Male registers (μητρώα αρρένων) for military service
- Some records in Ottoman Turkish for pre-independence period
---
### 7.2 General State Archives of Greece (GAK)
| Field | Value |
|-------|-------|
| **Archive** | Γενικά Αρχεία του Κράτους (GAK) |
| **Document Types** | Church records, civil registry, Ottoman-era documents |
| **Period** | 15th century - present |
| **Languages** | Greek, Ottoman Turkish |
---
### 7.3 Greek Ancestry Resources
| Field | Value |
|-------|-------|
| **Resource** | Greek Ancestry |
| **Coverage** | Village church records guide |
| **Document Types** | Baptismal registers, marriage registers |
| **Key Features** | Guides to accessing island and mainland records |
---
## 8. Dutch Civil Registry Records
### 8.1 WieWasWie (Dutch Genealogical Database)
| Field | Value |
|-------|-------|
| **Archive** | Centraal Bureau voor Genealogie (CBG) |
| **Project** | WieWasWie |
| **Digital URL** | https://www.wiewaswie.nl/ |
| **Document Types** | Birth, marriage, death certificates |
| **Period** | 1811-present (civil); 1600s+ (church) |
| **Languages** | Dutch |
| **Access** | Subscription / Free at archives |
---
### 8.2 Dutch Provincial Archives
| Province | Archive | Holdings |
|----------|---------|----------|
| Noord-Holland | Noord-Hollands Archief | Civil registry from 1811, church records from 1600s |
| Zuid-Holland | Nationaal Archief | Central government records |
| Gelderland | Gelders Archief | Regional archives |
| Noord-Brabant | Brabants Historisch Informatie Centrum | Catholic parish records |
---
### 8.3 Dutch Marriage Certificate Format
**Standard 19th-Century Format:**
```
Heden den [date] compareerden voor ons [official name],
Ambtenaar van den Burgerlijken Stand der Gemeente [municipality]:
De Bruidegom: [groom's name], oud [age] jaren, [occupation],
geboren te [birthplace], wonende te [residence],
zoon van [father] en van [mother];
De Bruid: [bride's name], oud [age] jaren,
geboren te [birthplace], wonende te [residence],
dochter van [father] en van [mother];
Getuigen: [4 witnesses with ages, occupations, relationships]
En hebben wij dit huwelijk voltrokken in tegenwoordigheid van voornoemde getuigen.
```
---
## 9. License and Attribution Requirements
### Open Access Resources
| Source | License | Attribution Required |
|--------|---------|---------------------|
| Cambridge Digital Library | CC BY-NC 4.0 | Yes |
| UPenn OPenn | Public Domain / CC0 | No (but encouraged) |
| OpenJerusalem | Open Access | Yes |
| Antenati | Open Access | Yes |
| FamilySearch | Terms of Service | Yes |
| BYU Script Tutorial | Educational Use | Yes |
### Recommended Citation Format
For PiCo extraction examples, use the following provenance block in YAML:
```yaml
provenance:
source_url: "https://example.org/document/12345"
archive_name: "Example Archive"
collection: "Collection Name"
document_id: "Document Identifier"
access_date: "2025-12-12"
license: "CC BY-NC 4.0"
attribution: "Courtesy of Example Archive. Used under CC BY-NC 4.0 license."
notes: "Transcription verified against original digital image."
```
### Data Fabrication Prohibition
**CRITICAL**: Per project rules (AGENTS.md Rule 21), all extraction examples MUST use real data from these verified sources. No fabrication of person names, dates, relationships, or document content is permitted.
The only exception: when real data is not available from a source, a synthetic example may be used, but it MUST be explicitly marked as synthetic in its provenance block:
```yaml
provenance:
source_url: null
data_status: "SYNTHETIC_EXAMPLE"
notes: "This example uses synthetic data for demonstration purposes only. Do not cite as historical evidence."
```
---
## Document Type Coverage Summary
| Document Type | Real Sources Available | Examples with Provenance |
|--------------|------------------------|--------------------------|
| Hebrew Ketubah | 4+ archives | Yale (1896), Philadelphia (1842) |
| Arabic Waqf | 3+ archives | Cambridge, UPenn, Singapore |
| Ottoman Sijill | 5+ archives | OpenJerusalem, ISAM, Harvard |
| Russian Metrical | 4+ archives | BYU Tutorial, RGIA |
| Spanish Colonial Baptism | 3+ archives | BYU Tutorial, FamilySearch |
| Italian Notarial | 3+ archives | Antenati, OAC/CDL |
| Greek Orthodox | 3+ archives | FamilySearch, GAK |
| Dutch Civil Registry | 3+ archives | WieWasWie, Provincial |
---
## Changelog
| Date | Version | Changes |
|------|---------|---------|
| 2025-12-12 | 1.0.0 | Initial compilation of provenance sources |

View file

@ -152,6 +152,28 @@ modules:
- path: "integrations/nif_nerd.yaml"
description: "NIF/NERD/Open Annotation compatibility layer with GLAM-NER mappings"
# ---------------------------------------------------------------------------
# RELATIONSHIP MODULES - Family and social relationship patterns
# ---------------------------------------------------------------------------
relationships:
- path: "relationships/family.yaml"
description: "Family relationship properties and historical source patterns (34 relationship types, 13 languages)"
line_count: 1503
languages:
- "Dutch"
- "Latin"
- "German"
- "Arabic"
- "French"
- "Ottoman Turkish"
- "Hebrew"
- "Persian/Farsi"
- "Spanish"
- "Portuguese"
- "Italian"
- "Greek"
- "Russian"
# ---------------------------------------------------------------------------
# ADVANCED MODULES - Complex annotation patterns
# ---------------------------------------------------------------------------

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,228 @@
# =============================================================================
# PiCo Integration Module - Index
# =============================================================================
# Part of: GLAM-NER Entity Annotation Convention v1.7.0
# Module: integrations/pico/
#
# Description:
# PiCo (Person in Context Ontology) integration for person observation modeling.
# Enables tracking provenance of person mentions and linking to formal records.
#
# Key concepts:
# - PersonObservation: A textual mention of a person (source-bound)
# - PersonName (PNV): Structured name components
# - Person (CIDOC-CRM E21): Reconstructed person entity
#
# References:
# - PiCo Ontology: https://w3id.org/pico
# - Person Name Vocabulary (PNV): https://w3id.org/pnv
# - CIDOC-CRM: https://www.cidoc-crm.org/
#
# Module Structure:
# pico/
# ├── _index.yaml # This file - module manifest
# ├── schema/
# │ ├── observation.yaml # Core PiCo observation pattern
# │ ├── pnv_components.yaml # Person Name Vocabulary
# │ ├── relationships.yaml # Family and social relationships
# │ ├── temporal.yaml # Date and calendar systems
# │ └── locations.yaml # Location type definitions
# ├── examples/
# │ ├── _examples_index.yaml # Examples overview
# │ ├── 01_dutch_marriage.yaml # Example 1: Dutch civil registration
# │ ├── 02_notarial_protocol.yaml
# │ ├── 03_church_baptismal.yaml
# │ ├── 04_linkedin_profile.yaml
# │ ├── 05_arabic_waqf.yaml
# │ ├── 06_hebrew_ketubah.yaml # REAL DATA: Yale Mashhad 1896
# │ ├── 07_spanish_colonial.yaml
# │ ├── 08_italian_notarial.yaml
# │ ├── 09_greek_orthodox.yaml
# │ ├── 10_russian_metrical.yaml # REAL DATA: BYU Osiek 1894
# │ └── 11_ottoman_sijill.yaml
# └── naming_conventions/
# ├── dutch.yaml # Dutch naming rules
# ├── arabic.yaml # Arabic naming rules
# ├── hebrew.yaml # Hebrew naming rules
# └── ... # Other language conventions
#
# Last Updated: 2025-12-12
# Version: 1.7.0
# =============================================================================
module:
id: "pico_integration"
name: "PiCo Integration Module"
version: "1.7.0"
parent: "ch_annotator-v1_7_0"
description: |
PiCo (Person in Context Ontology) models textual observations of persons
as distinct from reconstructed person entities. This enables:
- Tracking provenance of person mentions
- Handling name variations across sources
- Linking observations to formal person records
The observation/reconstruction pattern separates:
1. What was OBSERVED in text (PersonObservation) - source-bound, exact
2. What was RECONSTRUCTED as entity (E21_Person) - inferred, normalized
This is critical for heritage data where the same person may appear with
different name forms, titles, or spellings across sources.
# -----------------------------------------------------------------------------
# Module Components
# -----------------------------------------------------------------------------
components:
schema:
description: "Core schema definitions for PiCo model"
files:
- path: "schema/observation.yaml"
description: "PersonObservation class and properties"
classes:
- "picom:PersonObservation"
- path: "schema/pnv_components.yaml"
description: "Person Name Vocabulary (PNV) components"
classes:
- "pnv:PersonName"
- path: "schema/relationships.yaml"
description: "Family and social relationship types"
properties:
- "sdo:parent"
- "sdo:children"
- "sdo:spouse"
- "sdo:sibling"
- "godparent"
- "witness"
- path: "schema/temporal.yaml"
description: "Date formats, calendar systems, temporal modeling"
- path: "schema/locations.yaml"
description: "Location types for biographical data"
examples:
description: "Complete extraction examples demonstrating PiCo patterns"
index_file: "examples/_examples_index.yaml"
real_data_examples:
- id: "06_hebrew_ketubah"
data_status: "REAL_HISTORICAL_DATA"
source: "Yale University Beinecke Library"
call_number: "Hebrew MSS suppl 194"
- id: "10_russian_metrical"
data_status: "REAL_HISTORICAL_DATA"
source: "Archiwum Państwowe w Poznaniu Oddział w Koninie"
reference: "54/792/0/6.1/140"
synthetic_examples:
- "01_dutch_marriage"
- "02_notarial_protocol"
- "03_church_baptismal"
- "04_linkedin_profile"
- "05_arabic_waqf"
- "07_spanish_colonial"
- "08_italian_notarial"
- "09_greek_orthodox"
- "11_ottoman_sijill"
naming_conventions:
description: "Language-specific naming rules and patterns"
files:
- path: "naming_conventions/dutch.yaml"
language: "nl"
covers: ["tussenvoegsels", "patronymics", "sorting rules"]
- path: "naming_conventions/arabic.yaml"
language: "ar"
covers: ["nasab", "nisba", "kunya", "laqab"]
- path: "naming_conventions/hebrew.yaml"
language: "he"
covers: ["ben/bat patronymics", "ketubah conventions"]
- path: "naming_conventions/spanish.yaml"
language: "es"
covers: ["double surnames", "colonial titles"]
- path: "naming_conventions/italian.yaml"
language: "it"
covers: ["notarial conventions", "nobility particles"]
- path: "naming_conventions/greek.yaml"
language: "el"
covers: ["Orthodox naming", "genitive forms"]
- path: "naming_conventions/russian.yaml"
language: "ru"
covers: ["patronymics", "metrical book conventions"]
- path: "naming_conventions/ottoman.yaml"
language: "ota"
covers: ["Ottoman Turkish", "Arabic-Ottoman blend"]
# -----------------------------------------------------------------------------
# GLM-4.6 Annotator Configuration
# -----------------------------------------------------------------------------
glm_annotator_config:
model: "glm-4.6"
api_endpoint: "https://api.z.ai/api/coding/paas/v4/chat/completions"
temperature: 0.1
max_tokens: 4000
system_prompt_file: "schema/observation.yaml" # Contains extraction instructions
# -----------------------------------------------------------------------------
# Hypernym Mapping (GLAM-NER v1.7.0)
# -----------------------------------------------------------------------------
hypernym_mapping:
description: "How PiCo concepts map to GLAM-NER v1.7.0 hypernyms"
mappings:
- pico_class: "picom:PersonObservation"
glam_hypernym: "AGT.PER"
note: "Person observations create AGT.PER entities"
- pico_class: "picom:PersonObservation"
glam_hypernym: "AGT.STF"
condition: "When observed with organizational role"
note: "Staff members with role context"
- pico_class: "pnv:PersonName"
glam_hypernym: "APP.NAM"
note: "Name strings as appellations"
- pico_class: "picom:hasRole"
glam_hypernym: "ROL"
note: "Extracted roles link to ROL hypernym"
# -----------------------------------------------------------------------------
# Usage Notes
# -----------------------------------------------------------------------------
usage:
loading: |
Since YAML does not have native imports, applications should load
module files individually or use a custom loader. Example:
```python
import yaml
from pathlib import Path
def load_pico_module(base_path: Path) -> dict:
module = {}
module['index'] = yaml.safe_load((base_path / '_index.yaml').read_text())
module['observation'] = yaml.safe_load((base_path / 'schema/observation.yaml').read_text())
module['pnv'] = yaml.safe_load((base_path / 'schema/pnv_components.yaml').read_text())
# ... load other components as needed
return module
```
validation: |
Each YAML file is valid standalone. Validate with:
```bash
python3 -c "import yaml; yaml.safe_load(open('path/to/file.yaml'))"
```

View file

@ -0,0 +1,285 @@
# =============================================================================
# PiCo Example 1: Dutch Marriage Certificate (Burgerlijke Stand)
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/examples/
# Parent: _examples_index.yaml
#
# DATA STATUS: SYNTHETIC_EXAMPLE
#
# Demonstrates extraction from a Dutch civil registry (Burgerlijke Stand)
# marriage certificate showing:
# - Full family network extraction (8 persons)
# - Dutch naming conventions (tussenvoegsel: "de")
# - Occupation and residence data
# - Witness relationships (siblings of bride/groom)
# - Deceased parent markers ("wijlen")
#
# Language: Dutch
# Period: 19th century (1885 CE)
# Source Type: Civil Registration (Burgerlijke Stand)
#
# Last Updated: 2025-12-12
# =============================================================================
example_id: "example_01_dutch_marriage"
example_title: "Dutch Marriage Certificate - Burgerlijke Stand (1885)"
data_status: "SYNTHETIC_EXAMPLE"
source_language: "Dutch"
source_type: "civil_registration"
description: |
This example demonstrates extraction from a Dutch civil registry (Burgerlijke
Stand) marriage certificate from 1885. The document contains rich genealogical
data including the bride and groom, their parents (living and deceased), and
witnesses who are siblings of the couple.
Key extraction features:
- 8 persons with full family relationship mapping
- Occupation data (schilder, koopman, timmerman)
- Place of birth and residence
- Deceased parent markers ("wijlen")
- Age at marriage
- Witness-to-party relationships (brothers of bride/groom)
source_text: |
Heden den elfden November achttien honderd vijf en tachtig, zijn voor ons
Ambtenaar van den Burgerlijken Stand der gemeente Haarlem, verschenen:
Cornelis Johannes Koppen, oud dertig jaren, schilder, geboren te Haarlem,
wonende alhier, meerderjarige zoon van wijlen Pieter Koppen en van
Anna Maria Brouwer, zonder beroep, wonende alhier;
en Anna Maria Visser, oud zeven en twintig jaren, zonder beroep, geboren
te Amsterdam, wonende alhier, meerderjarige dochter van Jan Visser,
koopman, en van wijlen Cornelia de Vries.
Als getuigen waren tegenwoordig: Hendrik Koppen, oud vijf en dertig jaren,
schilder, broeder van den bruidegom; en Willem Visser, oud twee en dertig
jaren, timmerman, broeder van de bruid.
expected_extraction:
pico_observation:
observation_id: "bs_haarlem_1885_marriage_321"
observed_at: "2025-12-12T10:00:00Z"
source_type: "civil_registration"
source_reference: "BS Marriage Haarlem, November 11, 1885, certificate 321"
persons:
- person_index: 0
pnv_name:
literalName: "Cornelis Johannes Koppen"
givenName: "Cornelis Johannes"
baseSurname: "Koppen"
roles:
- role_title: "schilder"
role_in_source: "groom"
biographical:
age: "30"
birth_place: "Haarlem"
address: "Haarlem"
family_relationships:
parent:
- person_index: 2
target_name: "Pieter Koppen"
- person_index: 3
target_name: "Anna Maria Brouwer"
spouse:
- person_index: 1
target_name: "Anna Maria Visser"
sibling:
- person_index: 6
target_name: "Hendrik Koppen"
- person_index: 1
pnv_name:
literalName: "Anna Maria Visser"
givenName: "Anna Maria"
baseSurname: "Visser"
roles:
- role_in_source: "bride"
biographical:
age: "27"
birth_place: "Amsterdam"
address: "Haarlem"
family_relationships:
parent:
- person_index: 4
target_name: "Jan Visser"
- person_index: 5
target_name: "Cornelia de Vries"
spouse:
- person_index: 0
target_name: "Cornelis Johannes Koppen"
sibling:
- person_index: 7
target_name: "Willem Visser"
- person_index: 2
pnv_name:
literalName: "Pieter Koppen"
givenName: "Pieter"
baseSurname: "Koppen"
biographical:
deceased: true
family_relationships:
children:
- person_index: 0
target_name: "Cornelis Johannes Koppen"
- person_index: 6
target_name: "Hendrik Koppen"
spouse:
- person_index: 3
target_name: "Anna Maria Brouwer"
- person_index: 3
pnv_name:
literalName: "Anna Maria Brouwer"
givenName: "Anna Maria"
baseSurname: "Brouwer"
roles:
- role_title: "zonder beroep"
biographical:
address: "Haarlem"
family_relationships:
children:
- person_index: 0
target_name: "Cornelis Johannes Koppen"
- person_index: 6
target_name: "Hendrik Koppen"
widow_of:
person_index: 2
target_name: "Pieter Koppen"
- person_index: 4
pnv_name:
literalName: "Jan Visser"
givenName: "Jan"
baseSurname: "Visser"
roles:
- role_title: "koopman"
family_relationships:
children:
- person_index: 1
target_name: "Anna Maria Visser"
- person_index: 7
target_name: "Willem Visser"
spouse:
- person_index: 5
target_name: "Cornelia de Vries"
- person_index: 5
pnv_name:
literalName: "Cornelia de Vries"
givenName: "Cornelia"
surnamePrefix: "de"
baseSurname: "Vries"
biographical:
deceased: true
family_relationships:
children:
- person_index: 1
target_name: "Anna Maria Visser"
- person_index: 7
target_name: "Willem Visser"
spouse:
- person_index: 4
target_name: "Jan Visser"
- person_index: 6
pnv_name:
literalName: "Hendrik Koppen"
givenName: "Hendrik"
baseSurname: "Koppen"
roles:
- role_title: "schilder"
role_in_source: "witness"
biographical:
age: "35"
family_relationships:
sibling:
- person_index: 0
target_name: "Cornelis Johannes Koppen"
parent:
- person_index: 2
target_name: "Pieter Koppen"
- person_index: 3
target_name: "Anna Maria Brouwer"
- person_index: 7
pnv_name:
literalName: "Willem Visser"
givenName: "Willem"
baseSurname: "Visser"
roles:
- role_title: "timmerman"
role_in_source: "witness"
biographical:
age: "32"
family_relationships:
sibling:
- person_index: 1
target_name: "Anna Maria Visser"
parent:
- person_index: 4
target_name: "Jan Visser"
- person_index: 5
target_name: "Cornelia de Vries"
temporal_references:
- expression: "den elfden November achttien honderd vijf en tachtig"
normalized: "1885-11-11"
type: "DATE"
locations_mentioned:
- name: "Haarlem"
type: "city"
- name: "Amsterdam"
type: "city"
naming_conventions_notes: |
Dutch civil registration naming conventions demonstrated:
1. TUSSENVOEGSEL (surname prefix):
- "de Vries" - "de" is the tussenvoegsel
- Lowercase when preceded by a given name or initials ("Cornelia de Vries"); capitalized in Netherlands usage when the surname stands alone ("mevrouw De Vries")
- Inherited through family line
2. DECEASED MARKER:
- "wijlen" = the late/deceased
- Placed before the full name
3. OCCUPATION TERMS:
- "schilder" = painter
- "koopman" = merchant
- "timmerman" = carpenter
- "zonder beroep" = without profession/occupation
4. RESIDENCE MARKERS:
- "wonende alhier" = residing here (in the registration municipality)
- "geboren te" = born in
5. RELATIONSHIP TERMS:
- "meerderjarige zoon van" = adult son of
- "meerderjarige dochter van" = adult daughter of
- "broeder van" = brother of
provenance:
data_status: "SYNTHETIC_EXAMPLE"
notes: |
This example uses synthetic data based on authentic Dutch civil
registry (Burgerlijke Stand) marriage certificate formulae for
demonstration purposes. Names, dates, and locations are fictional
but follow authentic 19th-century patterns.
For real examples, see PROVENANCE_SOURCES.md.
related_real_sources:
- archive: "Centraal Bureau voor Genealogie (CBG)"
project: "WieWasWie"
digital_url: "https://www.wiewaswie.nl/"
document_type: "Birth, marriage, death certificates"
period: "1811-present (civil); 1600s+ (church)"
language: "Dutch"
license: "Subscription / Free at archives"
- archive: "Noord-Hollands Archief"
coverage: "Civil registry from 1811, church records from 1600s"
location: "Haarlem, Netherlands"
document_types: "Dutch civil registry records"

View file

@ -0,0 +1,263 @@
# =============================================================================
# PiCo Example 2: Early Modern Notarial Protocol Index Entry
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/examples/
# Parent: _examples_index.yaml
#
# DATA STATUS: SYNTHETIC_EXAMPLE
#
# Demonstrates extraction from a 17th-century Dutch notarial protocol showing:
# - Early modern Dutch naming conventions (patronymics: Janszoon, Claesdr)
# - Guardianship (voogd) relationships
# - Orphan identification
# - Notarial act structure
# - Tussenvoegsel patterns (van der)
#
# Language: Early Modern Dutch
# Period: 17th century (1680 CE)
# Source Type: Notarial Archives
#
# Last Updated: 2025-12-12
# =============================================================================
example_id: "example_02_notarial_protocol"
example_title: "Early Modern Notarial Protocol Index Entry (1680)"
data_status: "SYNTHETIC_EXAMPLE"
source_language: "Early Modern Dutch"
source_type: "historical_indices"
description: |
This example demonstrates extraction from an early modern Dutch notarial
protocol index entry from 1680. Notarial protocols are rich sources for
genealogical research, containing contracts, testaments, and guardianship
appointments.
Key extraction features:
- 9 persons with complex relationships
- Patronymic naming system (Janszoon, Claesdr)
- Guardianship (voogd) relationships
- Orphan children identification
- Deceased parent markers
- Notary and witness identification
- Early modern Dutch occupation terms
source_text: |
Notarial Archive Amsterdam, inv. 5075/1234
30 January 1680
Before notary Pieter van der Meer appeared:
Jacob Janszoon van der Hoeven, merchant of this city,
with his wife Maritgen Claes, for themselves and as
guardians (voogden) of the minor children of the late
Claes Jacobsz and Aeltgen Pieters, namely:
- Jan Claeszoon, aged about 16 years
- Trijntgen Claesdr, aged about 12 years
Witnesses: Hendrick Jansz, baker, and Cornelis Pietersz,
schoolmaster, both of this city.
expected_extraction:
pico_observation:
observation_id: "na_amsterdam_5075_1234"
observed_at: "2025-12-12T10:00:00Z"
source_type: "historical_indices"
source_reference: "Notarial Archive Amsterdam, inv. 5075/1234, 30 January 1680"
persons:
- person_index: 0
pnv_name:
literalName: "Jacob Janszoon van der Hoeven"
givenName: "Jacob"
patronym: "Janszoon"
surnamePrefix: "van der"
baseSurname: "Hoeven"
roles:
- role_title: "merchant"
role_in_source: "declarant"
- role_title: "voogd"
role_in_source: null
biographical:
address: "Amsterdam"
family_relationships:
spouse:
- person_index: 1
target_name: "Maritgen Claes"
- person_index: 1
pnv_name:
literalName: "Maritgen Claes"
givenName: "Maritgen"
patronym: "Claes"
roles:
- role_in_source: "declarant"
- role_title: "voogd"
family_relationships:
spouse:
- person_index: 0
target_name: "Jacob Janszoon van der Hoeven"
- person_index: 2
pnv_name:
literalName: "Claes Jacobsz"
givenName: "Claes"
patronym: "Jacobsz"
biographical:
deceased: true
family_relationships:
spouse:
- person_index: 3
target_name: "Aeltgen Pieters"
children:
- person_index: 4
target_name: "Jan Claeszoon"
- person_index: 5
target_name: "Trijntgen Claesdr"
- person_index: 3
pnv_name:
literalName: "Aeltgen Pieters"
givenName: "Aeltgen"
patronym: "Pieters"
biographical:
deceased: true
family_relationships:
spouse:
- person_index: 2
target_name: "Claes Jacobsz"
children:
- person_index: 4
target_name: "Jan Claeszoon"
- person_index: 5
target_name: "Trijntgen Claesdr"
- person_index: 4
pnv_name:
literalName: "Jan Claeszoon"
givenName: "Jan"
patronym: "Claeszoon"
roles:
- role_in_source: "child"
biographical:
age: "about 16"
family_relationships:
parent:
- person_index: 2
target_name: "Claes Jacobsz"
- person_index: 3
target_name: "Aeltgen Pieters"
sibling:
- person_index: 5
target_name: "Trijntgen Claesdr"
- person_index: 5
pnv_name:
literalName: "Trijntgen Claesdr"
givenName: "Trijntgen"
patronym: "Claesdr"
roles:
- role_in_source: "child"
biographical:
age: "about 12"
gender: "Female"
family_relationships:
parent:
- person_index: 2
target_name: "Claes Jacobsz"
- person_index: 3
target_name: "Aeltgen Pieters"
sibling:
- person_index: 4
target_name: "Jan Claeszoon"
- person_index: 6
pnv_name:
literalName: "Pieter van der Meer"
givenName: "Pieter"
surnamePrefix: "van der"
baseSurname: "Meer"
roles:
- role_title: "notary"
- person_index: 7
pnv_name:
literalName: "Hendrick Jansz"
givenName: "Hendrick"
patronym: "Jansz"
roles:
- role_title: "baker"
role_in_source: "witness"
biographical:
address: "Amsterdam"
- person_index: 8
pnv_name:
literalName: "Cornelis Pietersz"
givenName: "Cornelis"
patronym: "Pietersz"
roles:
- role_title: "schoolmaster"
role_in_source: "witness"
biographical:
address: "Amsterdam"
temporal_references:
- expression: "30 January 1680"
normalized: "1680-01-30"
type: "DATE"
locations_mentioned:
- name: "Amsterdam"
type: "city"
naming_conventions_notes: |
Early modern Dutch naming conventions demonstrated:
1. PATRONYMIC SYSTEM:
- Male: -zoon, -szoon, -sz, -z (son of)
Examples: Janszoon, Jacobsz, Jansz, Pietersz
- Female: -dr, -dochter (daughter of)
Examples: Claesdr (= Claesdochter)
- Patronyms derived from father's given name
2. TRANSITION TO SURNAMES:
- Some families adopted fixed surnames (van der Hoeven, van der Meer)
- Others still used pure patronymics (Hendrick Jansz)
- Mixed patterns common in this period
3. TUSSENVOEGSEL:
- "van der" = "from the" (preposition "van" + inflected definite article "der")
- Often indicates geographic origin
- Hoeven = farmstead/court
- Meer = lake
4. GENDERED DIMINUTIVES:
- Female names often end in -gen, -tgen, -tje
- Maritgen, Trijntgen, Aeltgen
- Male names typically unmodified
5. LEGAL TERMINOLOGY:
- "voogd" (plural: voogden) = guardian
- Used for orphaned minors
- Appointed by family or court
provenance:
data_status: "SYNTHETIC_EXAMPLE"
notes: |
This example uses synthetic data based on authentic early modern
notarial protocol index entry formulae for demonstration purposes.
Names, dates, and locations are fictional but follow authentic
17th-century Dutch notarial patterns.
For real examples, see PROVENANCE_SOURCES.md.
related_real_sources:
- archive: "Stadsarchief Amsterdam"
collection: "Notarial Archives (Notariële Archieven)"
document_type: "Notarial protocols, contracts, testaments"
period: "1578-1915"
language: "Dutch, Latin"
notes: "Largest notarial archive in the Netherlands"
- project: "TICCLAT (Transliteration of Early Modern Dutch Notarial Archives)"
coverage: "Amsterdam notarial indices"
period: "17th-18th century"
notes: "Machine-readable indices to notarial protocols"

View file

@ -0,0 +1,202 @@
# =============================================================================
# PiCo Example 3: Dutch Church Baptismal Record with Godparents
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/examples/
# Parent: _examples_index.yaml
#
# DATA STATUS: SYNTHETIC_EXAMPLE
#
# Dutch Reformed Church (Nederlandse Hervormde Kerk) baptismal register entry.
# Demonstrates godparent relationships, Dutch patronymic naming, and
# pre-civil registration church records (DTB - Doop-, Trouw- en Begraafregisters).
#
# Language: Dutch (Early Modern)
# Period: 1702 CE
# Source Type: Church baptismal register (DTB)
#
# Last Updated: 2025-12-12
# =============================================================================
example_id: "example_03_church_baptism"
example_title: "Dutch Church Baptismal Record with Godparents (1702)"
data_status: "SYNTHETIC_EXAMPLE"
source_language: "Dutch"
source_type: "church_records"
description: |
Example of a Dutch Reformed Church (Nederlandse Hervormde Kerk) baptismal
register entry demonstrating:
- Godparent (getuigen) relationships creating spiritual kinship
- Dutch patronymic naming conventions (Hendriksen, Jans, Anthonisz)
- Aristocratic naming (surnamePrefix: van)
- Honorific titles (E. Heer, Juffrou)
- Pre-civil registration church records (before 1811)
source_text: |
Den 15en Meij 1702 is gedoopt
Johanna, dochter van Willem Hendriksen en Geertruijd Jans,
getuijgen waren de E. Heer Jan Willem van Beverwijck
ende Juffrou Maria van Loon, huijsvrouw van de heer
Pieter Anthonisz Verschoor.
expected_extraction:
pico_observation:
observation_id: "dtb_amsterdam_1702_baptism_johanna"
observed_at: "2025-12-12T10:00:00Z"
source_type: "church_records"
source_reference: "DTB Amsterdam, 15 May 1702"
persons:
- person_index: 0
pnv_name:
literalName: "Johanna"
givenName: "Johanna"
roles:
- role_in_source: "child"
biographical:
gender: "Female"
family_relationships:
parent:
- person_index: 1
target_name: "Willem Hendriksen"
- person_index: 2
target_name: "Geertruijd Jans"
godparent:
- person_index: 3
target_name: "Jan Willem van Beverwijck"
- person_index: 4
target_name: "Maria van Loon"
- person_index: 1
pnv_name:
literalName: "Willem Hendriksen"
givenName: "Willem"
patronym: "Hendriksen"
biographical:
gender: "Male"
family_relationships:
children:
- person_index: 0
target_name: "Johanna"
spouse:
- person_index: 2
target_name: "Geertruijd Jans"
- person_index: 2
pnv_name:
literalName: "Geertruijd Jans"
givenName: "Geertruijd"
patronym: "Jans"
biographical:
gender: "Female"
family_relationships:
children:
- person_index: 0
target_name: "Johanna"
spouse:
- person_index: 1
target_name: "Willem Hendriksen"
- person_index: 3
pnv_name:
literalName: "Jan Willem van Beverwijck"
givenName: "Jan Willem"
surnamePrefix: "van"
baseSurname: "Beverwijck"
honorificPrefix: "de E. Heer"
roles:
- role_in_source: "witness"
biographical:
gender: "Male"
family_relationships:
godchild:
- person_index: 0
target_name: "Johanna"
- person_index: 4
pnv_name:
literalName: "Maria van Loon"
givenName: "Maria"
surnamePrefix: "van"
baseSurname: "Loon"
honorificPrefix: "Juffrou"
roles:
- role_in_source: "witness"
biographical:
gender: "Female"
family_relationships:
godchild:
- person_index: 0
target_name: "Johanna"
spouse:
- person_index: 5
target_name: "Pieter Anthonisz Verschoor"
- person_index: 5
pnv_name:
literalName: "Pieter Anthonisz Verschoor"
givenName: "Pieter"
patronym: "Anthonisz"
baseSurname: "Verschoor"
honorificPrefix: "de heer"
biographical:
gender: "Male"
family_relationships:
spouse:
- person_index: 4
target_name: "Maria van Loon"
temporal_references:
- expression: "Den 15en Meij 1702"
normalized: "1702-05-15"
type: "DATE"
naming_conventions_notes: |
Dutch naming conventions demonstrated in this example:
PATRONYMICS:
- Hendriksen: son of Hendrik (-sen = son)
- Jans: daughter/child of Jan (feminine form without -sen common for women)
- Anthonisz: son of Anthonis (-z = zoon = son, abbreviated)
ARISTOCRATIC NAMING:
- "van" prefix: toponymic ("from" a place); common among patrician families, but unlike German "von" it does not by itself indicate nobility
- "van Beverwijck": from the Beverwijck region
- "van Loon": from the Loon region (Limburg)
HONORIFIC TITLES:
- "de E. Heer": de Eerbare Heer (the Honorable Sir) - used for gentlemen
- "Juffrou": Juffrouw (Miss/Madam) - used for unmarried or married respectable women
- "de heer": "Mr." (literally "the gentleman") - standard respectful address
GODPARENT TERMINOLOGY:
- "getuijgen": witnesses (in baptismal context = godparents)
- Godparents created spiritual kinship (geestelijke verwantschap)
PRE-CIVIL REGISTRATION:
- DTB records (Doop-, Trouw- en Begraafregisters) were church records
- Civil registration (Burgerlijke Stand) started in the Netherlands in 1811
- Before 1811, churches maintained vital records
provenance:
data_status: "SYNTHETIC_EXAMPLE"
notes: |
This example uses synthetic data based on authentic Dutch Reformed
Church (Nederlandse Hervormde Kerk) baptismal register formulae for
demonstration purposes. Names, dates, and locations are fictional
but follow authentic early 18th-century patterns.
For real examples, see PROVENANCE_SOURCES.md.
related_real_sources:
- archive: "Various Dutch Regional Archives"
collection: "Doop-, Trouw- en Begraafregisters (DTB)"
document_type: "Church baptism, marriage, burial records"
period: "1600s-1811 (before civil registration)"
language: "Dutch"
notes: "Pre-1811 vital records maintained by churches"
- archive: "FamilySearch"
collection: "Netherlands, Church Records"
wiki_url: "https://www.familysearch.org/en/wiki/Netherlands_Church_Records"
document_type: "Dutch church baptisms"
license: "Free with registration"

View file

@ -0,0 +1,146 @@
# =============================================================================
# PiCo Example 4: Modern LinkedIn Staff Profile
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/examples/
# Parent: _examples_index.yaml
#
# DATA STATUS: SYNTHETIC_EXAMPLE
#
# Demonstrates modern digital source extraction, contrasting with historical
# document examples. Shows heritage sector professional career tracking.
#
# Language: English
# Period: Contemporary (2025)
# Source Type: Modern digital (LinkedIn profile)
#
# Last Updated: 2025-12-12
# =============================================================================
example_id: "example_04_linkedin_profile"
example_title: "Modern LinkedIn Staff Profile - Heritage Professional"
data_status: "SYNTHETIC_EXAMPLE"
source_language: "English"
source_type: "modern_digital"
description: |
Example of a modern LinkedIn profile for a heritage sector professional.
Demonstrates PiCo extraction patterns for contemporary digital sources,
contrasting with historical document examples.
Key features:
- Modern professional networking profile format
- Career trajectory across heritage institutions
- Educational background with dates
- Dutch naming conventions in modern context (van den Berg)
- GLAMORCUBESFIXPHDNT heritage type classification
source_text: |
Dr. Maria van den Berg
Director of Collections | Rijksmuseum
Amsterdam, Netherlands
About:
Leading the collections management team at the Rijksmuseum since 2018.
Previously Head Curator at the Van Gogh Museum (2012-2018).
PhD in Art History, University of Amsterdam.
Experience:
- Director of Collections, Rijksmuseum (2018-present)
- Head Curator, Van Gogh Museum (2012-2018)
- Assistant Curator, Stedelijk Museum (2008-2012)
Education:
- PhD Art History, University of Amsterdam (2008)
- MA Museum Studies, University of Amsterdam (2003)
expected_extraction:
pico_observation:
observation_id: "linkedin_maria_van_den_berg_2025"
observed_at: "2025-12-12T10:00:00Z"
source_type: "modern_digital"
source_reference: "https://linkedin.com/in/mariavandenberg"
persons:
- person_index: 0
pnv_name:
literalName: "Dr. Maria van den Berg"
givenName: "Maria"
surnamePrefix: "van den"
baseSurname: "Berg"
honorificPrefix: "Dr."
roles:
- role_title: "Director of Collections"
organization: "Rijksmuseum"
period: "2018-present"
heritage_relevant: true
heritage_type: "M"
- role_title: "Head Curator"
organization: "Van Gogh Museum"
period: "2012-2018"
heritage_relevant: true
heritage_type: "M"
- role_title: "Assistant Curator"
organization: "Stedelijk Museum"
period: "2008-2012"
heritage_relevant: true
heritage_type: "M"
biographical:
address: "Amsterdam, Netherlands"
family_relationships: {}
context: "Heritage sector professional with museum career"
organizations_mentioned:
- name: "Rijksmuseum"
type: "M"
role_in_source: "employer"
- name: "Van Gogh Museum"
type: "M"
role_in_source: "employer"
- name: "Stedelijk Museum"
type: "M"
role_in_source: "employer"
- name: "University of Amsterdam"
type: "E"
role_in_source: "education"
locations_mentioned:
- name: "Amsterdam"
type: "city"
- name: "Netherlands"
type: "country"
naming_conventions_notes: |
Modern Dutch naming conventions demonstrated:
SURNAME PREFIX:
- "van den" is a tussenvoegsel (insertion) common in Dutch surnames
- In alphabetical sorting, Dutch convention uses the base surname: "Berg, Maria van den"
- Without a preceding given name or initial: "Dr. Van den Berg" (prefix capitalized, Netherlands convention)
- With a preceding given name or initial: "Dr. Maria van den Berg" (lowercase tussenvoegsel)
ACADEMIC TITLE:
- "Dr." indicates doctorate (PhD) - placed before name
- In the Netherlands, this is an academic degree, not a medical title (physicians are designated "arts")
CONTRAST WITH HISTORICAL EXAMPLES:
- LinkedIn profiles are emic (insider, self-authored) descriptions, whereas the historical documents are etic (recorded by a third-party observer such as a notary or clerk)
- Structured data format vs. narrative historical documents
- Self-reported information vs. third-party recording
- Modern standardized naming vs. evolving historical conventions
provenance:
data_status: "SYNTHETIC_EXAMPLE"
notes: |
This example uses synthetic data based on modern LinkedIn profile
formats for demonstration purposes. The person and biographical details
are entirely fictional; the institutions named are real but are used for
illustration only. LinkedIn profiles
represent a modern source type for person-in-context observations,
contrasting with the historical document examples in this module.
source_context:
platform: "LinkedIn"
data_type: "Modern professional networking profile"
privacy_note: |
When extracting real LinkedIn data, ensure compliance with
LinkedIn Terms of Service, GDPR, and applicable privacy laws.
This synthetic example demonstrates extraction patterns only.

View file

@ -0,0 +1,215 @@
# =============================================================================
# PiCo Example 5: Arabic Waqf Document (Endowment Record)
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/examples/
# Parent: _examples_index.yaml
#
# DATA STATUS: SYNTHETIC_EXAMPLE
#
# Example of a waqf (religious endowment) document from an Islamic archive.
# Waqf documents record property endowments for religious/charitable purposes
# and typically name the founder, beneficiaries, and witnesses.
#
# Language: Arabic
# Period: 1225 AH (1810 CE)
# Source Type: Archival descriptions (waqfiyya)
#
# Last Updated: 2025-12-12
# =============================================================================
example_id: "example_05_arabic_waqf"
example_title: "Arabic Waqf Document - Aleppo Endowment (1810 CE)"
data_status: "SYNTHETIC_EXAMPLE"
source_language: "Arabic"
source_type: "archival_descriptions"
description: |
Example of a waqf (وقف) document from an Islamic archive. Waqf documents
record property endowments for religious/charitable purposes and typically
name the founder (واقف), beneficiaries, and witnesses.
Key features demonstrated:
- Arabic patronymic system (ابن/بن - ibn/bin = son of)
- Honorific titles (الحاج, السيد)
- Nisba (geographic/tribal surnames)
- Deceased markers (المرحوم)
- Hijri calendar dating
- Romanization alongside Arabic script
source_text: |
بسم الله الرحمن الرحيم
هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة
حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري. وقف جميع داره الكائنة
بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح
الله فعلى فقراء المسلمين. وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف
التركماني، والسيد علي بن حسين الحلبي. وكتب في شهر رجب سنة ألف ومائتين
وخمس وعشرين هجرية.
source_text_english: |
In the name of God, the Compassionate, the Merciful.
This is what the late al-Hajj Ahmad ibn Muhammad al-'Umari, merchant
in the city of Aleppo, son of the late Muhammad ibn Abdullah al-'Umari,
has endowed, dedicated, and perpetuated. He endowed his entire house
located in the al-Jadida neighborhood for his children and grandchildren,
male and female. If they cease to exist, God forbid, then for the poor
Muslims. Witnessed by: al-Hajj Ibrahim ibn Yusuf al-Turkmani, and
al-Sayyid Ali ibn Husayn al-Halabi. Written in the month of Rajab,
year 1225 Hijri (1810 CE).
expected_extraction:
pico_observation:
observation_id: "waqf_aleppo_1225h_ahmad_umari"
observed_at: "2025-12-12T10:00:00Z"
source_type: "archival_descriptions"
source_reference: "Waqf document, Aleppo, Rajab 1225 AH (1810 CE)"
persons:
- person_index: 0
pnv_name:
literalName: "الحاج أحمد بن محمد العمري"
literalName_romanized: "al-Hajj Ahmad ibn Muhammad al-'Umari"
givenName: "أحمد"
givenName_romanized: "Ahmad"
patronym: "محمد"
patronym_romanized: "Muhammad"
baseSurname: "العمري"
baseSurname_romanized: "al-'Umari"
honorificPrefix: "الحاج"
honorificPrefix_romanized: "al-Hajj"
roles:
- role_title: "تاجر"
role_title_romanized: "merchant"
role_in_source: "founder"
biographical:
deceased: true
address: "حلب الشهباء (Aleppo)"
family_relationships:
parent:
- person_index: 1
target_name: "محمد بن عبد الله العمري"
context: "Waqf founder (واقف)"
- person_index: 1
pnv_name:
literalName: "محمد بن عبد الله العمري"
literalName_romanized: "Muhammad ibn Abdullah al-'Umari"
givenName: "محمد"
givenName_romanized: "Muhammad"
patronym: "عبد الله"
patronym_romanized: "Abdullah"
baseSurname: "العمري"
baseSurname_romanized: "al-'Umari"
honorificPrefix: "المرحوم"
honorificPrefix_romanized: "the late"
biographical:
deceased: true
family_relationships:
children:
- person_index: 0
target_name: "أحمد بن محمد العمري"
context: "Father of the founder"
- person_index: 2
pnv_name:
literalName: "الحاج إبراهيم بن يوسف التركماني"
literalName_romanized: "al-Hajj Ibrahim ibn Yusuf al-Turkmani"
givenName: "إبراهيم"
givenName_romanized: "Ibrahim"
patronym: "يوسف"
patronym_romanized: "Yusuf"
baseSurname: "التركماني"
baseSurname_romanized: "al-Turkmani"
honorificPrefix: "الحاج"
honorificPrefix_romanized: "al-Hajj"
roles:
- role_in_source: "witness"
family_relationships: {}
context: "Witness to the endowment"
- person_index: 3
pnv_name:
literalName: "السيد علي بن حسين الحلبي"
literalName_romanized: "al-Sayyid Ali ibn Husayn al-Halabi"
givenName: "علي"
givenName_romanized: "Ali"
patronym: "حسين"
patronym_romanized: "Husayn"
baseSurname: "الحلبي"
baseSurname_romanized: "al-Halabi"
honorificPrefix: "السيد"
honorificPrefix_romanized: "al-Sayyid"
roles:
- role_in_source: "witness"
family_relationships: {}
context: "Witness to the endowment"
temporal_references:
- expression: "شهر رجب سنة ألف ومائتين وخمس وعشرين هجرية"
expression_romanized: "month of Rajab, year 1225 Hijri"
normalized: "1810-08"
calendar: "Hijri"
type: "DATE"
locations_mentioned:
- name: "حلب الشهباء"
name_romanized: "Aleppo"
type: "city"
- name: "محلة الجديدة"
name_romanized: "al-Jadida neighborhood"
type: "neighborhood"
arabic_naming_notes: |
Arabic naming conventions demonstrated:
PATRONYMICS:
- ابن/بن (ibn/bin): "son of" - connects given name to father's name
- Full chain: Ahmad ibn Muhammad ibn Abdullah = Ahmad son of Muhammad son of Abdullah
HONORIFIC TITLES:
- الحاج (al-Hajj): honorific for one who completed the Hajj pilgrimage to Mecca
- السيد (al-Sayyid): honorific denoting descent from Prophet Muhammad
- المرحوم (al-marhum): "the late" - marker for deceased person (masculine)
- المرحومة (al-marhuma): "the late" - feminine form
NISBA (نسبة):
Geographic or tribal surname indicating origin:
- العمري (al-'Umari): descendant of 'Umar or from 'Umar tribe
- التركماني (al-Turkmani): of Turkman origin
- الحلبي (al-Halabi): from Aleppo (حلب = Halab)
WAQF TERMINOLOGY:
- واقف (waqif): founder/endower
- وقف (waqf): the endowment itself
- شهود (shuhud): witnesses
HIJRI CALENDAR:
- رجب (Rajab): 7th month of Islamic lunar calendar
- سنة هجرية: Hijri year (from Prophet's migration to Medina, 622 CE)
provenance:
data_status: "SYNTHETIC_EXAMPLE"
notes: |
This example uses synthetic data based on standard waqf document formulae
for demonstration purposes. Names, dates, and property details are fictional.
For real examples, see PROVENANCE_SOURCES.md.
related_real_sources:
- archive: "Cambridge University Library"
collection: "Islamic Manuscripts"
digital_url: "https://cudl.lib.cam.ac.uk/collections/islamic"
document_types: "Waqfiyya, legal documents"
period: "8th-20th century CE"
license: "CC BY-NC 4.0"
- archive: "University of Pennsylvania Libraries"
collection: "Manuscripts of the Muslim World"
digital_url: "https://openn.library.upenn.edu/html/muslimworld_contents.html"
document_types: "Waqfiyya, Quranic manuscripts, legal documents"
license: "Public Domain / CC0"
- archive: "Singapore National Heritage Board"
accession_number: "1115401"
digital_url: "https://www.roots.gov.sg/Collection-Landing/listing/1115401"
document_type: "Waqf document"
donor: "Muhammad b. Abd al-Ghani"
properties: "Istanbul (various locations)"

View file

@ -0,0 +1,325 @@
# =============================================================================
# PiCo Example 6: Hebrew Ketubah - Marriage of Mosheh & Rivkah
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/examples/
# Parent: _examples_index.yaml
#
# DATA STATUS: REAL HISTORICAL DATA
#
# Source: Yale University Beinecke Rare Book & Manuscript Library
# Call Number: Hebrew MSS suppl 194 (Broadside)
# Object ID: 2067542
# Document Date: 23 Elul 5656 AM (September 1, 1896 CE)
# Location: Mashhad, Iran
#
# This is a REAL ketubah with verified provenance from Yale's digital collection.
# The Mashhad Jewish community had a unique history as "crypto-Jews" after
# forced conversion in 1839, making this document culturally significant.
#
# Last Updated: 2025-12-12
# =============================================================================
example_id: "example_06_hebrew_ketubah"
example_title: "Hebrew Ketubah - Marriage of Mosheh & Rivkah (Mashhad, Iran, 1896)"
data_status: "REAL_HISTORICAL_DATA"
source_language: "Hebrew/Aramaic"
source_script: "Hebrew square script"
# -----------------------------------------------------------------------------
# Document Description
# -----------------------------------------------------------------------------
description: |
A ketubah is a Jewish marriage contract written in Aramaic with Hebrew
elements. This REAL example from Mashhad, Iran demonstrates Persian Jewish
traditions with elaborate decorative elements.
Historical context: The Jewish community of Mashhad was unique - after forced
conversion to Islam in 1839 (the Allahdad pogrom), many continued practicing
Judaism in secret as "Jadid al-Islam" (new Muslims). By 1896, some families
were more openly practicing Judaism, as evidenced by this elaborate ketubah.
Key features documented:
- Groom and bride names with patronymics (ben/bat - son/daughter of)
- Persian Jewish artistic traditions (floral patterns, colored rules)
- Hebrew date with month, day, and year from Creation
- Isaiah 61:10 verse as blessing
- Physical dimensions: 53 x 37 cm
# -----------------------------------------------------------------------------
# Source Text
# -----------------------------------------------------------------------------
source_text:
note: "Full text not transcribed from manuscript. Key readable elements provided."
hebrew_text: |
בס״ד
שנת חמשת אלפים שש מאות וחמישים ושש לבריאת עולם
עשרים ושלשה לחודש אלול
במשהד
החתן משה בן משיאח
הכלה רבקה בת יעקב
שוש אשיש בה׳ תגל נפשי באלהי כי הלבישני בגדי ישע מעיל צדקה יעטני
כחתן יכהן פאר וככלה תעדה כליה
romanized_text: |
B'siyata d'shmaya (With Heaven's help)
In the year five thousand six hundred and fifty-six from the Creation of the world,
the twenty-third day of the month of Elul,
in Mashhad.
The groom: Mosheh son of Mashiah
The bride: Rivkah daughter of Ya'akov
[Isaiah 61:10 - decorative header blessing:]
"I will greatly rejoice in the LORD, my soul shall be joyful in my God.
For he has clothed me with the garments of salvation, he has covered me
with the robe of righteousness, as a bridegroom decks himself with a garland,
and as a bride adorns herself with her jewels."
# -----------------------------------------------------------------------------
# Expected Extraction Output
# -----------------------------------------------------------------------------
expected_extraction:
pico_observation:
observation_id: "ketubah_mashhad_5656_mosheh_rivkah"
observed_at: "2025-01-13T12:00:00Z"
source_type: "ketubah"
source_reference: "Ketubah, Mashhad, 23 Elul 5656 (September 1, 1896 CE), Yale Beinecke Hebrew MSS suppl 194"
archive: "Yale University, Beinecke Rare Book & Manuscript Library"
persons:
# Person 0: Groom
- person_index: 0
pnv_name:
literalName: "משה בן משיאח"
literalName_romanized: "Mosheh ben Mashiah"
givenName: "משה"
givenName_romanized: "Mosheh"
patronym: "משיאח"
patronym_romanized: "Mashiah"
roles:
- role_title: "חתן"
role_title_romanized: "chatan"
role_in_source: "groom"
biographical:
sex: "male"
religion: "Jewish"
community: "Mashhad Jewish community (Mashhadis)"
family_relationships:
father:
- person_index: 1
target_name: "משיאח"
spouse:
- person_index: 2
target_name: "רבקה בת יעקב"
context: "Groom (chatan) - the bridegroom in the marriage contract"
# Person 1: Father of Groom
- person_index: 1
pnv_name:
literalName: "משיאח"
literalName_romanized: "Mashiah"
givenName: "משיאח"
givenName_romanized: "Mashiah"
biographical:
sex: "male"
note: "Name meaning 'Messiah' - common Persian Jewish name"
family_relationships:
child:
- person_index: 0
target_name: "משה"
context: "Father of the groom (implicit from patronymic)"
# Person 2: Bride
- person_index: 2
pnv_name:
literalName: "רבקה בת יעקב"
literalName_romanized: "Rivkah bat Ya'akov"
givenName: "רבקה"
givenName_romanized: "Rivkah"
givenName_english: "Rebecca"
patronym: "יעקב"
patronym_romanized: "Ya'akov"
roles:
- role_title: "כלה"
role_title_romanized: "kallah"
role_in_source: "bride"
biographical:
sex: "female"
religion: "Jewish"
community: "Mashhad Jewish community (Mashhadis)"
family_relationships:
father:
- person_index: 3
target_name: "יעקב"
spouse:
- person_index: 0
target_name: "משה בן משיאח"
context: "Bride (kallah) - daughter of Ya'akov"
# Person 3: Father of Bride
- person_index: 3
pnv_name:
literalName: "יעקב"
literalName_romanized: "Ya'akov"
givenName: "יעקב"
givenName_romanized: "Ya'akov"
givenName_english: "Jacob"
biographical:
sex: "male"
note: "Biblical patriarch name - common in Jewish communities"
family_relationships:
child:
- person_index: 2
target_name: "רבקה"
context: "Father of the bride (implicit from patronymic)"
temporal_references:
- expression: "עשרים ושלשה לחודש אלול שנת חמשת אלפים שש מאות וחמישים ושש לבריאת עולם"
expression_romanized: "23rd day of the month of Elul, year 5656 from Creation"
normalized_gregorian: "1896-09-01"
calendar: "Hebrew"
type: "DATE"
components:
day: 23
month: "אלול (Elul)"
month_number: 6
year_hebrew: 5656
year_gregorian: 1896
era: "לבריאת עולם (from Creation)"
notes: "Elul is the 6th month of the ecclesiastical year (counted from Nisan), 12th of the civil year (counted from Tishrei)"
locations_mentioned:
- name: "משהד"
name_romanized: "Mashhad"
name_persian: "مشهد"
type: "city"
country: "Iran (then Qajar Persia)"
modern_country: "Iran"
coordinates: "36.2972, 59.6067"
historical_context: |
Mashhad is a major city in northeastern Iran, holy city of Shia Islam
(shrine of Imam Reza). The Jewish community dated to ancient times but
faced forced conversion in 1839. By 1896, some families openly practiced
Judaism while others remained crypto-Jews.
# -----------------------------------------------------------------------------
# Physical Description
# -----------------------------------------------------------------------------
physical_description:
dimensions: "53 x 37 cm"
material: "ink and paint on paper"
decoration: |
- Red and green rules divide the paper into rectangular sections
- Middle section contains the ketubah text
- Top and sides filled with elaborate arch and floral patterns
- Colors: blue, gold, and silver paint
- Strips of red paper pasted on all four sides as frame
condition: "Some damage to the text containing the Isaiah quote and to the borders"
script: "Hebrew square script"
# -----------------------------------------------------------------------------
# Hebrew Naming Conventions Demonstrated
# -----------------------------------------------------------------------------
naming_conventions_notes: |
Hebrew/Jewish naming conventions demonstrated in this REAL document:
1. PATRONYMIC SYSTEM:
- בן (ben): "son of" - used for males
- בת (bat): "daughter of" - used for females
- Example: משה בן משיאח = "Mosheh son of Mashiah"
2. PERSIAN JEWISH NAMES:
- משיאח (Mashiah/Messiah): Common Persian Jewish given name
- רבקה (Rivkah/Rebecca): Biblical matriarch name
- יעקב (Ya'akov/Jacob): Biblical patriarch name
3. KETUBAH STRUCTURE:
- Opening: בס״ד (B'siyata d'Shmaya - With Heaven's help)
- Date: Hebrew calendar from Creation (anno mundi)
- Location: City name in Hebrew transliteration
- Parties: Groom (חתן) and Bride (כלה) with patronymics
- Blessing: Often biblical verses (here Isaiah 61:10)
4. MASHHAD JEWISH CONTEXT:
- Community known as "Mashhadis" or "Jadid al-Islam"
- After 1839 pogrom, many practiced Judaism secretly
- Unique artistic traditions in ketubah decoration
- Persian influences in ornamentation style
# -----------------------------------------------------------------------------
# Provenance
# -----------------------------------------------------------------------------
provenance:
data_status: "REAL_HISTORICAL_DATA"
archive:
name: "Yale University, Beinecke Rare Book & Manuscript Library"
collection: "Hebrew Manuscripts Supplement"
call_number: "Hebrew MSS suppl 194 (Broadside)"
catalog_record: "8574921"
object_id: "2067542"
digital_access:
url: "https://digital.library.yale.edu/catalog/2067542"
iiif_manifest: "https://digital.library.yale.edu/manifests/2067542"
pdf_url: "https://digital.library.yale.edu/pdfs/2067542.pdf"
document_metadata:
date_hebrew: "23 Elul 5656"
date_gregorian: "1896-09-01"
place: "Mashhad, Iran"
groom: "Mosheh ben Mashiah"
bride: "Rivkah bat Ya'akov"
physical_extent: "1 leaf, 53 x 37 cm, color illustrations"
languages:
- "Hebrew"
- "Official Aramaic (700-300 BCE); Imperial Aramaic (700-300 BCE)"
subjects:
geographic: "Mashhad (Iran) -- Religious life and customs"
topical:
- "Ketubah -- Iran -- Mashhad"
- "Prenuptial agreements (Jewish law)"
genres:
- "Autographs"
- "Illustrations"
- "Ketubahs"
- "Manuscripts"
- "Marginalia"
rights: |
The use of this image may be subject to the copyright law of the
United States (Title 17, United States Code) or to site license or
other rights management terms and conditions. The person using the
image is liable for any infringement.
access_date: "2025-01-13"
citation: |
"Ketubah : Mashhad, Iran, 1896, September 1," Yale University Library,
Beinecke Rare Book and Manuscript Library, Hebrew MSS suppl 194 (Broadside),
Object ID 2067542. Digital Collections, https://digital.library.yale.edu/catalog/2067542
(accessed January 13, 2025).
verification_notes: |
This is a REAL historical document with verified provenance:
- Held at Yale University Beinecke Rare Book & Manuscript Library
- Fully digitized and publicly accessible
- Catalog record #8574921 with complete metadata
- Both principal parties (groom and bride) are named in Yale's catalog
- Physical dimensions and condition documented
- High-resolution images available via IIIF manifest
- Document represents unique Mashhad Jewish community traditions

View file

@ -0,0 +1,263 @@
# =============================================================================
# PiCo Example 7: Spanish Colonial Baptism Record
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/examples/
# Parent: _examples_index.yaml
#
# DATA STATUS: SYNTHETIC_EXAMPLE
#
# Spanish colonial baptismal records from New Spain (Mexico) with rich
# genealogical data including casta (racial/social classification)
# designations and compadrazgo (godparent) relationships.
#
# Language: Spanish
# Period: 1742 CE
# Source Type: Baptismal register (Libro de bautismos)
# Location: Mexico City, New Spain
#
# Last Updated: 2025-12-12
# =============================================================================
example_id: "example_07_spanish_colonial"
example_title: "Spanish Colonial Baptism Record - Mexico City (1742)"
data_status: "SYNTHETIC_EXAMPLE"
source_language: "Spanish"
source_type: "baptismal_register"
description: |
Spanish colonial baptismal record from New Spain (Mexico) demonstrating
the casta system and compadrazgo relationships.
Key features:
- Casta designations (español, mestizo, mulato, indio, etc.)
- Legitimacy markers (hijo legítimo vs hijo natural)
- Compadrazgo (godparent relationships creating spiritual kinship)
- Place of origin (vecino de, natural de)
- Ecclesiastical formulae and clerical titles (Br., teniente de cura)
source_text: |
En la ciudad de México, a veinte y tres días del mes de febrero de mil
setecientos cuarenta y dos años, yo el Br. Don Antonio de Mendoza,
teniente de cura de esta santa iglesia catedral, bauticé solemnemente,
puse óleo y crisma a Juan José, español, hijo legítimo de Don Pedro
García de la Cruz, español, natural de la villa de Puebla de los Ángeles,
y de Doña María Josefa de los Reyes, española, natural de esta ciudad.
Fueron sus padrinos Don Francisco Xavier de Castañeda, español, vecino
de esta ciudad, y Doña Ana María de la Encarnación, su legítima esposa,
a quienes advertí el parentesco espiritual y obligaciones que contrajeron.
Y lo firmé.
Br. Don Antonio de Mendoza
expected_extraction:
description: "Spanish colonial baptism demonstrating casta system and compadrazgo"
pico_observation:
observation_id: "bautismo_mexico_1742_juan_jose_garcia"
observed_at: "2025-12-12T12:00:00Z"
source_type: "baptismal_register"
source_reference: "Libro de Bautismos, Catedral de México, 23 Feb 1742"
persons:
- person_index: 0
pnv_name:
literalName: "Juan José"
givenName: "Juan José"
roles:
- role_title: "bautizado"
role_in_source: "baptized"
biographical:
casta: "español"
legitimacy: "hijo legítimo"
religion: "Catholic"
family_relationships:
parent:
- person_index: 1
target_name: "Don Pedro García de la Cruz"
- person_index: 2
target_name: "Doña María Josefa de los Reyes"
godparent:
- person_index: 3
target_name: "Don Francisco Xavier de Castañeda"
- person_index: 4
target_name: "Doña Ana María de la Encarnación"
context: "Infant being baptized"
- person_index: 1
pnv_name:
literalName: "Don Pedro García de la Cruz"
givenName: "Pedro"
surnamePrefix: "García de"
baseSurname: "la Cruz"
honorificPrefix: "Don"
biographical:
casta: "español"
origin: "natural de la villa de Puebla de los Ángeles"
family_relationships:
spouse:
- person_index: 2
target_name: "Doña María Josefa de los Reyes"
children:
- person_index: 0
target_name: "Juan José"
context: "Father of the baptized child"
- person_index: 2
pnv_name:
literalName: "Doña María Josefa de los Reyes"
givenName: "María Josefa"
surnamePrefix: "de"
baseSurname: "los Reyes"
honorificPrefix: "Doña"
biographical:
casta: "española"
origin: "natural de esta ciudad"
family_relationships:
spouse:
- person_index: 1
target_name: "Don Pedro García de la Cruz"
children:
- person_index: 0
target_name: "Juan José"
context: "Mother of the baptized child"
- person_index: 3
pnv_name:
literalName: "Don Francisco Xavier de Castañeda"
givenName: "Francisco Xavier"
surnamePrefix: "de"
baseSurname: "Castañeda"
honorificPrefix: "Don"
roles:
- role_title: "padrino"
role_in_source: "godfather"
biographical:
casta: "español"
residence: "vecino de esta ciudad"
family_relationships:
spouse:
- person_index: 4
target_name: "Doña Ana María de la Encarnación"
godchildren:
- person_index: 0
target_name: "Juan José"
compadre:
- person_index: 1
target_name: "Don Pedro García de la Cruz"
context: "Godfather (padrino)"
- person_index: 4
pnv_name:
literalName: "Doña Ana María de la Encarnación"
givenName: "Ana María"
surnamePrefix: "de"
baseSurname: "la Encarnación"
honorificPrefix: "Doña"
roles:
- role_title: "madrina"
role_in_source: "godmother"
biographical:
marital_status: "legítima esposa"
family_relationships:
spouse:
- person_index: 3
target_name: "Don Francisco Xavier de Castañeda"
godchildren:
- person_index: 0
target_name: "Juan José"
comadre:
- person_index: 2
target_name: "Doña María Josefa de los Reyes"
context: "Godmother (madrina)"
- person_index: 5
pnv_name:
literalName: "Br. Don Antonio de Mendoza"
givenName: "Antonio"
surnamePrefix: "de"
baseSurname: "Mendoza"
honorificPrefix: "Br. Don"
roles:
- role_title: "teniente de cura"
role_in_source: "officiant"
biographical:
ecclesiastical_position: "teniente de cura de esta santa iglesia catedral"
family_relationships: {}
context: "Priest who performed the baptism"
temporal_references:
- expression: "a veinte y tres días del mes de febrero de mil setecientos cuarenta y dos años"
normalized: "1742-02-23"
calendar: "Gregorian"
type: "DATE"
locations_mentioned:
- name: "ciudad de México"
type: "city"
administrative_entity: "New Spain"
- name: "santa iglesia catedral"
type: "church"
full_name: "Catedral Metropolitana de la Asunción de la Santísima Virgen María"
- name: "villa de Puebla de los Ángeles"
type: "city"
modern_name: "Puebla"
administrative_entity: "New Spain"
colonial_naming_notes: |
Spanish colonial naming conventions demonstrated:
HONORIFIC TITLES:
- Don/Doña: honorific indicating Spanish (peninsular or criollo) status
- Br. (Bachiller): academic degree, often held by clergy
CASTA SYSTEM:
- español/española: persons of Spanish descent (peninsular or criollo)
- mestizo: Spanish + Indigenous ancestry
- mulato: Spanish + African ancestry
- indio: Indigenous person
- (Many other classifications existed in the sistema de castas)
PLACE INDICATORS:
- "natural de": indicates place of birth
- "vecino de": indicates place of residence
LEGITIMACY MARKERS:
- "hijo legítimo": legitimate child (parents married in Church)
- "hijo natural": illegitimate child (parents not married)
COMPADRAZGO (Spiritual Kinship):
- Padrino/madrina: godfather/godmother
- Compadre/comadre: relationship between godparents and parents
- "parentesco espiritual": spiritual kinship with religious obligations
- Created lifelong obligations between families
provenance:
data_status: "SYNTHETIC_EXAMPLE"
notes: |
This example uses synthetic data based on standard Spanish colonial
baptismal formulae for demonstration purposes. Names, dates, and
locations are fictional but follow authentic 18th-century patterns.
For real examples, see PROVENANCE_SOURCES.md.
related_real_sources:
- archive: "Brigham Young University"
collection: "Script Tutorial - Spanish Colonial Baptisms"
digital_url: "https://script.byu.edu/spanish-handwriting/documents/church-records/baptisms"
document_type: "Tutorial with real transcription examples"
license: "Educational use"
- archive: "FamilySearch"
collection: "Mexico, Yucatán, Catholic Church Records, 1543-1977"
collection_id: "1909116"
digital_url: "https://www.familysearch.org/en/search/collection/1909116"
document_type: "Baptisms, marriages, deaths"
license: "Free with registration"
notes: "Contains some of the earliest New World records (from 1543)"
- archive: "Archivo General de la Nación (AGN)"
location: "Mexico City, Mexico"
collection: "Colonial parish records"
document_type: "Spanish colonial baptismal records"
period: "16th-20th century CE"
languages: "Spanish, Nahuatl, Latin"

View file

@ -0,0 +1,315 @@
# =============================================================================
# PiCo Example 8: Italian Notarial Act (Venice, 1654 CE)
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/examples/
# Parent: _examples_index.yaml
#
# DATA STATUS: SYNTHETIC_EXAMPLE
#
# Demonstrates extraction from an Italian notarial act showing:
# - Italian naming conventions (patronymic "fu", "quondam")
# - Venetian nobility titles (Nobil Homo, Magnifico)
# - Profession-based surnames (Fabbro, Ferrari)
# - Parish-based location (contrada, sestiere)
#
# Language: Italian (Venetian)
# Period: 1654 CE
# Source Type: Notarial act
#
# Last Updated: 2025-12-12
# =============================================================================
example_id: "example_08_italian_notarial"
example_title: "Italian Notarial Act - Venice (1654)"
data_status: "SYNTHETIC_EXAMPLE"
source_language: "Italian"
source_script: "Latin"
source_type: "notarial_act"
description: |
Example of a 17th-century Venetian notarial act demonstrating:
- Italian naming conventions with Latin survivals
- Venetian nobility titles and social hierarchy
- Deceased father markers (fu, quondam)
- Profession-based surnames
- Parish-based location system (contrada)
Notarial acts were legal documents recording contracts, wills, property
transfers, and other legal transactions. They provide rich genealogical
and social history data.
source_text: |
Adì 15 Marzo 1654, in Venetia.
Presenti: Il Nobil Homo Messer Giovanni Battista Morosini fu
quondam Magnifico Messer Andrea, della contrada di San Marco,
et sua moglie la Nobil Donna Madonna Caterina Contarini fu
quondam Messer Francesco. Testimoni: Messer Pietro fu Paolo
Fabbro, habitante nella contrada di San Polo, et Messer Marco
Antonio Ferrari fu Giovanni, bottegaio in Rialto. Rogato io
Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico
di Venetia.
expected_extraction:
pico_observation:
observation_id: "notarial_venice_1654-03-15_morosini"
source_type: "notarial_act"
source_reference: "Notarial act, Venice, March 15, 1654"
persons:
- person_index: 0
pnv_name:
literalName: "Il Nobil Homo Messer Giovanni Battista Morosini"
givenName: "Giovanni Battista"
baseSurname: "Morosini"
honorificPrefix: "Il Nobil Homo Messer"
roles:
- role_title: "principal party"
role_in_source: "party to act"
biographical:
social_status: "Venetian nobility"
patronymic: "fu quondam Magnifico Messer Andrea"
father_status: "deceased (quondam)"
family_relationships:
father:
- person_index: 1
target_name: "Magnifico Messer Andrea Morosini"
spouse:
- person_index: 2
target_name: "Nobil Donna Madonna Caterina Contarini"
context: "Principal party, Venetian noble"
- person_index: 1
pnv_name:
literalName: "Magnifico Messer Andrea Morosini"
givenName: "Andrea"
baseSurname: "Morosini"
honorificPrefix: "Magnifico Messer"
roles: []
biographical:
social_status: "Venetian nobility"
deceased: true
deceased_marker: "quondam"
family_relationships:
child:
- person_index: 0
target_name: "Giovanni Battista Morosini"
context: "Father of Giovanni Battista, deceased"
- person_index: 2
pnv_name:
literalName: "Nobil Donna Madonna Caterina Contarini"
givenName: "Caterina"
baseSurname: "Contarini"
honorificPrefix: "Nobil Donna Madonna"
roles:
- role_title: "moglie"
role_in_source: "wife"
biographical:
social_status: "Venetian nobility"
patronymic: "fu quondam Messer Francesco"
family_relationships:
father:
- person_index: 3
target_name: "Messer Francesco Contarini"
spouse:
- person_index: 0
target_name: "Giovanni Battista Morosini"
context: "Wife of Giovanni Battista"
- person_index: 3
pnv_name:
literalName: "Messer Francesco Contarini"
givenName: "Francesco"
baseSurname: "Contarini"
honorificPrefix: "Messer"
roles: []
biographical:
deceased: true
deceased_marker: "quondam"
family_relationships:
child:
- person_index: 2
target_name: "Caterina Contarini"
context: "Father of Caterina, deceased"
- person_index: 4
pnv_name:
literalName: "Messer Pietro fu Paolo Fabbro"
givenName: "Pietro"
baseSurname: "Fabbro"
honorificPrefix: "Messer"
roles:
- role_title: "testimone"
role_in_source: "witness"
biographical:
patronymic: "fu Paolo"
residence: "contrada di San Polo"
family_relationships:
father:
- person_index: 5
target_name: "Paolo Fabbro"
context: "First witness"
- person_index: 5
pnv_name:
literalName: "Paolo Fabbro"
givenName: "Paolo"
baseSurname: "Fabbro"
roles: []
biographical:
deceased: true
family_relationships:
child:
- person_index: 4
target_name: "Pietro Fabbro"
context: "Father of witness Pietro, deceased"
- person_index: 6
pnv_name:
literalName: "Messer Marco Antonio Ferrari fu Giovanni"
givenName: "Marco Antonio"
baseSurname: "Ferrari"
honorificPrefix: "Messer"
roles:
- role_title: "testimone"
role_in_source: "witness"
biographical:
patronymic: "fu Giovanni"
occupation: "bottegaio"
workplace: "Rialto"
family_relationships:
father:
- person_index: 7
target_name: "Giovanni Ferrari"
context: "Second witness, shopkeeper"
- person_index: 7
pnv_name:
literalName: "Giovanni Ferrari"
givenName: "Giovanni"
baseSurname: "Ferrari"
roles: []
biographical:
deceased: true
family_relationships:
child:
- person_index: 6
target_name: "Marco Antonio Ferrari"
context: "Father of witness Marco Antonio, deceased"
- person_index: 8
pnv_name:
literalName: "Notaro Antonio Zen fu quondam Messer Giacomo"
givenName: "Antonio"
baseSurname: "Zen"
honorificPrefix: "Notaro"
roles:
- role_title: "notaro"
role_in_source: "notary"
biographical:
patronymic: "fu quondam Messer Giacomo"
occupation: "Notaro publico di Venetia"
family_relationships:
father:
- person_index: 9
target_name: "Messer Giacomo Zen"
context: "Notary who drafted the act"
- person_index: 9
pnv_name:
literalName: "Messer Giacomo Zen"
givenName: "Giacomo"
baseSurname: "Zen"
honorificPrefix: "Messer"
roles: []
biographical:
deceased: true
deceased_marker: "quondam"
family_relationships:
child:
- person_index: 8
target_name: "Antonio Zen"
context: "Father of notary, deceased"
temporal_references:
- expression: "Adì 15 Marzo 1654"
normalized: "1654-03-15"
calendar: "Gregorian"
type: "DATE"
locations_mentioned:
- name: "Venetia"
name_modern: "Venice"
type: "city"
- name: "contrada di San Marco"
type: "parish/district"
parent: "Venice"
- name: "contrada di San Polo"
type: "parish/district"
parent: "Venice"
- name: "Rialto"
type: "district/market"
parent: "Venice"
italian_naming_notes: |
Italian notarial naming conventions demonstrated:
DECEASED FATHER MARKERS:
- "fu": Italian for "was" - indicates deceased father
- "quondam": Latin survival meaning "formerly/the late"
- Often combined: "fu quondam" for emphasis
VENETIAN NOBILITY TITLES:
- "Magnifico Messer": high honorific for nobility
- "Il Nobil Homo" / "N.H.": Venetian noble title (male)
- "Nobil Donna" / "N.D.": Venetian noble title (female)
- "Madonna": honorific for married noble women
COMMONER TITLES:
- "Messer": general respectful address (Mister)
PROFESSION-BASED SURNAMES:
- Fabbro: smith (from Latin faber)
- Ferrari: ironworker (from Latin ferrarius)
LOCATION AND OCCUPATION INDICATORS:
- "habitante in/nella": residence indicator
- "bottegaio": shopkeeper
- Contrada: parish neighborhood system of Venice
- Sestiere: one of six districts of Venice
NOTARIAL TERMINOLOGY:
- "Rogato": drafted/witnessed (by notary)
- "Notaro publico": public notary (licensed)
provenance:
data_status: "SYNTHETIC_EXAMPLE"
notes: |
This example uses synthetic data based on authentic 17th-century
Venetian notarial document formulae for demonstration purposes.
Names, dates, and locations are fictional but follow period-accurate
conventions. For real examples, see PROVENANCE_SOURCES.md.
related_real_sources:
- archive: "Italian Ministry of Culture"
project: "Antenati (Ancestors)"
digital_url: "https://antenati.cultura.gov.it/"
venice_url: "https://antenati.cultura.gov.it/archivio/state-archives-of-venezia/?lang=en"
document_type: "Civil registry, notarial acts, parish records"
period: "15th century+"
license: "Open Access"
- archive: "University of California Libraries"
collection: "Italian Notarial Documents Collection"
finding_aid: "https://oac.cdlib.org/findaid/ark:%2F13030%2Fc8v412zd"
document_count: "168 documents"
period: "1465-1635 CE"
locations: "Venice, Padua, Verona"
languages: "Latin, Italian (Venetian)"
- project: "SION-Digit (Sources for the History of Italian Jewish Notarial Documents)"
coverage: "Venice, Bordeaux, Amsterdam"
period: "16th-18th century CE"
focus: "Jewish community notarial acts"
languages: "Italian, Hebrew, Ladino"

View file

@ -0,0 +1,259 @@
# =============================================================================
# PiCo Example 9: Greek Orthodox Parish Register (1875 CE, Thessaloniki)
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/examples/
# Parent: _examples_index.yaml
#
# DATA STATUS: SYNTHETIC_EXAMPLE
#
# Demonstrates extraction from a Greek Orthodox baptismal register showing:
# - Greek script with romanization
# - Greek patronymics (του + genitive)
# - Godparent system (νονός/νονά)
# - Orthodox naming conventions
# - Deceased marker (μακαρίτης/μακαρίτισσα)
#
# Language: Greek (polytonic)
# Period: 1875 CE
# Source Type: Baptismal register
# Calendar: Julian (Orthodox Church)
#
# Last Updated: 2025-12-12
# =============================================================================
example_id: "example_09_greek_baptismal_register"
example_title: "Greek Orthodox Baptismal Register - Thessaloniki 1875"
data_status: "SYNTHETIC_EXAMPLE"
source_language: "Greek"
source_script: "Greek (polytonic)"
source_period: "1875 CE"
source_type: "baptismal_register"
description: |
This example demonstrates extraction from a 19th-century Greek Orthodox
baptismal register, illustrating key features of Greek naming conventions
and ecclesiastical record-keeping during the Ottoman period.
Key features demonstrated:
- Polytonic Greek orthography (common in 19th century)
- Patronymic formation with του + genitive case
- Godparent (νονός/νονά) relationships
- Deceased marker μακαρίτης/μακαρίτισσα ("the late")
- Surnames derived from occupations (Παπαδόπουλος, Οἰκονόμος)
- Ecclesiastical titles (Πρωτοπρεσβύτερος = Archpriest)
- Julian calendar dating (Greek Orthodox tradition)
source_text: |
Ἐν Θεσσαλονίκῃ, τῇ δεκάτῃ πέμπτῃ Μαρτίου τοῦ ἔτους 1875.
Ἐβαπτίσθη ὁ Δημήτριος, υἱὸς τοῦ Νικολάου Παπαδοπούλου,
ἐμπόρου, καὶ τῆς νομίμου αὐτοῦ συζύγου Ἑλένης τῆς τοῦ
μακαρίτου Γεωργίου Οἰκονόμου. Νονὸς ὁ Κωνσταντῖνος
Καρατζᾶς τοῦ Ἰωάννου, ἰατρός. Ἱερεύς: ὁ Πρωτοπρεσβύτερος
Ἀθανάσιος Χρυσοστόμου.
expected_extraction:
pico_observation:
observation_id: "baptism_thessaloniki_1875-03-15_papadopoulos"
source_type: "baptismal_register"
source_reference: "Greek Orthodox baptismal register, Thessaloniki, March 15, 1875"
persons:
- person_index: 0
pnv_name:
literalName: "Δημήτριος"
literalName_romanized: "Dimitrios"
givenName: "Δημήτριος"
givenName_romanized: "Dimitrios"
roles:
- role_title: "βαπτισθείς"
role_in_source: "baptized infant"
biographical:
sex: "male"
religion: "Greek Orthodox"
family_relationships:
father:
- person_index: 1
target_name: "Νικόλαος Παπαδόπουλος"
mother:
- person_index: 2
target_name: "Ἑλένη"
godfather:
- person_index: 4
target_name: "Κωνσταντῖνος Καρατζᾶς"
context: "Baptized infant"
- person_index: 1
pnv_name:
literalName: "Νικόλαος Παπαδόπουλος"
literalName_romanized: "Nikolaos Papadopoulos"
givenName: "Νικόλαος"
givenName_romanized: "Nikolaos"
baseSurname: "Παπαδόπουλος"
baseSurname_romanized: "Papadopoulos"
roles:
- role_title: "πατήρ"
role_in_source: "father"
biographical:
occupation: "ἔμπορος (merchant)"
family_relationships:
child:
- person_index: 0
target_name: "Δημήτριος"
spouse:
- person_index: 2
target_name: "Ἑλένη"
context: "Father of the baptized, merchant"
- person_index: 2
pnv_name:
literalName: "Ἑλένη τῆς τοῦ μακαρίτου Γεωργίου Οἰκονόμου"
literalName_romanized: "Eleni tis tou makaritou Georgiou Oikonomou"
givenName: "Ἑλένη"
givenName_romanized: "Eleni"
roles:
- role_title: "μήτηρ"
role_in_source: "mother"
biographical:
marital_status: "νομίμη σύζυγος (lawful wife)"
patronymic: "τῆς τοῦ μακαρίτου Γεωργίου Οἰκονόμου"
family_relationships:
father:
- person_index: 3
target_name: "Γεώργιος Οἰκονόμος"
child:
- person_index: 0
target_name: "Δημήτριος"
spouse:
- person_index: 1
target_name: "Νικόλαος Παπαδόπουλος"
context: "Mother of the baptized"
- person_index: 3
pnv_name:
literalName: "μακαρίτης Γεώργιος Οἰκονόμος"
literalName_romanized: "makaritis Georgios Oikonomos"
givenName: "Γεώργιος"
givenName_romanized: "Georgios"
baseSurname: "Οἰκονόμος"
baseSurname_romanized: "Oikonomos"
roles: []
biographical:
deceased: true
deceased_marker: "μακαρίτης"
family_relationships:
child:
- person_index: 2
target_name: "Ἑλένη"
context: "Maternal grandfather, deceased"
- person_index: 4
pnv_name:
literalName: "Κωνσταντῖνος Καρατζᾶς τοῦ Ἰωάννου"
literalName_romanized: "Konstantinos Karatzas tou Ioannou"
givenName: "Κωνσταντῖνος"
givenName_romanized: "Konstantinos"
baseSurname: "Καρατζᾶς"
baseSurname_romanized: "Karatzas"
roles:
- role_title: "νονός"
role_in_source: "godfather"
biographical:
occupation: "ἰατρός (physician)"
patronymic: "τοῦ Ἰωάννου"
family_relationships:
father:
- person_index: 5
target_name: "Ἰωάννης Καρατζᾶς"
godchild:
- person_index: 0
target_name: "Δημήτριος"
context: "Godfather, physician"
- person_index: 5
pnv_name:
literalName: "Ἰωάννης Καρατζᾶς"
literalName_romanized: "Ioannis Karatzas"
givenName: "Ἰωάννης"
givenName_romanized: "Ioannis"
baseSurname: "Καρατζᾶς"
baseSurname_romanized: "Karatzas"
roles: []
biographical: {}
family_relationships:
child:
- person_index: 4
target_name: "Κωνσταντῖνος Καρατζᾶς"
context: "Father of godfather"
- person_index: 6
pnv_name:
literalName: "Πρωτοπρεσβύτερος Ἀθανάσιος Χρυσοστόμου"
literalName_romanized: "Protopresbyteros Athanasios Chrysostomou"
givenName: "Ἀθανάσιος"
givenName_romanized: "Athanasios"
patronymic: "Χρυσοστόμου"
patronymic_romanized: "Chrysostomou"
honorificPrefix: "Πρωτοπρεσβύτερος"
roles:
- role_title: "ἱερεύς"
role_in_source: "priest"
biographical:
ecclesiastical_rank: "Πρωτοπρεσβύτερος (Protopresbyter/Archpriest)"
family_relationships: {}
context: "Officiating priest"
temporal_references:
- expression: "τῇ δεκάτῃ πέμπτῃ Μαρτίου τοῦ ἔτους 1875"
expression_romanized: "ti dekati pempti Martiou tou etous 1875"
normalized: "1875-03-15"
calendar: "Julian"
type: "DATE"
note: "Greek Orthodox used Julian calendar; Gregorian equivalent: March 27, 1875"
locations_mentioned:
- name: "Θεσσαλονίκη"
name_romanized: "Thessaloniki"
type: "city"
modern_country: "Greece"
historical_context: "Ottoman Empire (Selanik vilayet)"
greek_naming_notes: |
Greek Orthodox naming conventions demonstrated:
- "τοῦ" + genitive: patronymic marker ("son/daughter of")
- "μακαρίτης/μακαρίτισσα": deceased marker ("the late")
- "νομίμη σύζυγος": lawful wife
- "νονός/νονά": godfather/godmother
- Surnames from occupations: Παπαδόπουλος (priest's son), Οἰκονόμος (steward)
- Ecclesiastical titles: Πρωτοπρεσβύτερος (Archpriest)
- Polytonic Greek orthography common in 19th century
- Julian calendar used by Greek Orthodox Church
provenance:
data_status: "SYNTHETIC_EXAMPLE"
notes: |
This example uses synthetic data based on authentic Greek Orthodox
baptismal register formulae for demonstration purposes. Names, dates,
and locations are fictional but follow 19th-century conventions.
For real examples, see PROVENANCE_SOURCES.md.
related_real_sources:
- archive: "FamilySearch"
wiki_url: "https://www.familysearch.org/en/wiki/Greece_Church_Records"
document_type: "Baptisms, marriages, deaths"
period: "17th century - 1925 CE"
language: "Greek"
license: "Free with registration"
notes: "Greek Orthodox records are primary source before 1925 civil registration"
- archive: "Γενικά Αρχεία του Κράτους (General State Archives of Greece)"
abbreviation: "GAK"
document_type: "Church records, civil registry, Ottoman-era documents"
period: "15th century - present"
languages: "Greek, Ottoman Turkish"
notes: "National archive with records from all Greek regions"
- resource: "Greek Ancestry"
coverage: "Village church records guide"
document_type: "Baptismal registers, marriage registers"
notes: "Guides to accessing island and mainland records"

View file

@ -0,0 +1,489 @@
# =============================================================================
# PiCo Example 10: Russian Imperial Metrical Book - Birth of Stefan Nowicki
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/examples/
# Parent: _examples_index.yaml
#
# DATA STATUS: REAL HISTORICAL DATA
#
# Source: Archiwum Panstwowe w Poznaniu Oddzial w Koninie
# Reference Code: 54/792/0/6.1/140
# Scan: 4 of 76
# Document Date: 27 December 1893 (Julian) / 8 January 1894 (Gregorian)
# Location: Osiek Wielki, Congress Poland, Russian Empire
#
# Demonstrates extraction from a Russian Imperial metrical book showing:
# - Cyrillic script with romanization
# - Polish names recorded in Russian (Congress Poland context)
# - Pre-revolutionary orthography
# - Julian/Gregorian calendar dual dating
# - Godparents (vospriemniki)
# - Village-level vital records
#
# Transcription verified by BYU Script Tutorial paleographers.
#
# Last Updated: 2025-12-12
# =============================================================================
example_id: "example_10_russian_metrical_book"
example_title: "Russian Imperial Metrical Book - Birth of Stefan Nowicki (1894)"
data_status: "REAL_HISTORICAL_DATA"
source_language: "Russian"
source_script: "Cyrillic (pre-1918 orthography)"
source_period: "1894 CE (Gregorian) / 1893 CE (Julian)"
source_type: "metrical_book"
document_subtype: "birth_record"
# -----------------------------------------------------------------------------
# Source Text
# -----------------------------------------------------------------------------
source_text:
russian_original: |
Любины
Состаялосъ въ деревнѣ осѣкъ велькій двадцать седьмаго Декабря
/:восьмаго Января:/ тысяча восемьсоть девяносто третяго (четвертаго) года
въ одинадцать часовъ утра Явился Янъ Новицкій /:Jan Nowicki:/
сорока лѣтъ отъ роду земледѣлецъ изъ Любинъ, въ присутствіи
Францишка Новицкаго сорока лѣтъ, и Михаила Влодарчика
шестидесяти лѣтъ отъ роду, обоихъ земледѣльцевъ изъ Любинъ
и предьявилъ намъ младенца мужскаго пола, объявляя
что онъ родился въ Любинахъ двадцать пятаго Декабря
/:шестаго Января:/ текущаго года, въ четыре часа вечеромъ
отъ законной его жены Маріанны изъ Адамковъ /:Mary-
anny z Adamkow:/ тридцати лѣтъ отъ роду, младенцу
этому при святомъ крещеніи совершенномъ сего
числа дано имя Стефанъ /:Stefan:/ а воспріемниками
его были Войцех Гаудынъ, и Катаржина Гембка.
Актъ сей объявляющему и свидѣтелямъ негра-
мотнымъ прочитанъ нами только подписанъ
Ксндзъ Павелъ Выборскій
romanized: |
Lyubiny
Sostoyalos' v derevne Osek Vel'kiy dvadtsat' sed'mago Dekabrya
/:vos'mago Yanvarya:/ tysyacha vosem'sot' devyanosto tret'yago (chetvertago) goda
v odinnadtsat' chasov utra Yavilsya Yan Novitskiy /:Jan Nowicki:/
soroka let ot rodu zemledelets iz Lyubin, v prisutstvii
Frantsishka Novitskago soroka let, i Mikhaila Vlodarchika
shestidesyati let ot rodu, oboikh zemledeltsev iz Lyubin
i pred'yavil nam mladentsa muzhskago pola, ob'yavlyaya
chto on rodilsya v Lyubinakh dvadtsat' pyatago Dekabrya
/:shestago Yanvarya:/ tekushchago goda, v chetyre chasa vecherom
ot zakonnoy ego zheny Marianny iz Adamkov /:Mary-
anny z Adamkow:/ tridtsati let ot rodu, mladentsu
etomu pri svyatom kreshchenii sovershennom sego
chisla dano imya Stefan /:Stefan:/ a vospriyemnikami
ego byli Voytsekh Gaudyn, i Katarzhina Gembka.
Akt sey ob'yavlyayushchemu i svidetel'yam negra-
motnym prochitan nami tol'ko podpisan
Ksndz Pavel Vyborskiy
english_translation: |
Lubin
It happened in the village of Osiek Wielki on the twenty-seventh of December
/:eighth of January:/ in the year one thousand eight hundred ninety-three (four)
at eleven o'clock in the morning. Appeared Jan Nowicki /:Jan Nowicki:/
forty years of age, farmer from Lubin, in the presence of
Franciszek Nowicki, forty years old, and Michal Wlodarczyk
sixty years of age, both farmers from Lubin
and presented to us an infant of the male sex, declaring
that he was born in Lubin on the twenty-fifth of December
/:sixth of January:/ of the current year, at four o'clock in the evening
of his lawful wife Marianna nee Adamkow /:Mary-
anna z Adamkow:/ thirty years of age. To this infant,
at the holy baptism performed on this
date, was given the name Stefan /:Stefan:/ and his godparents
were Wojciech Gaudyn and Katarzyna Gembka.
This act, to the declarant and to the illiterate witnesses,
was read by us and only signed.
Priest Pawel Wyborski
# -----------------------------------------------------------------------------
# Expected Extraction Output
# -----------------------------------------------------------------------------
expected_extraction:
pico_observation:
observation_id: "birth_osiek_wielki_1894_stefan_nowicki"
source_type: "metrical_book"
source_reference: "Akta stanu cywilnego Parafii Rzymskokatolickiej Osiek Wielki, Reference Code 54/792/0/6.1/140, scan 4/76"
archive: "Archiwum Panstwowe w Poznaniu Oddzial w Koninie"
persons:
# Person 0: The Infant (Stefan Nowicki)
- person_index: 0
pnv_name:
literalName: "Стефанъ Новицкій"
literalName_romanized: "Stefan Novitskiy"
literalName_polish: "Stefan Nowicki"
givenName: "Стефанъ"
givenName_romanized: "Stefan"
baseSurname: "Новицкій"
baseSurname_romanized: "Novitskiy"
baseSurname_polish: "Nowicki"
roles:
- role_title: "младенецъ"
role_in_source: "infant"
biographical:
sex: "male"
religion: "Roman Catholic"
birth_date_julian: "1893-12-25"
birth_date_gregorian: "1894-01-06"
baptism_date_julian: "1893-12-27"
baptism_date_gregorian: "1894-01-08"
birth_place: "Любины (Lubin)"
birth_time: "4 o'clock in the evening"
family_relationships:
father:
- person_index: 1
target_name: "Янъ Новицкій"
mother:
- person_index: 2
target_name: "Маріанна изъ Адамковъ"
godfather:
- person_index: 5
target_name: "Войцех Гаудынъ"
godmother:
- person_index: 6
target_name: "Катаржина Гембка"
context: "Newborn infant, subject of the birth registration"
# Person 1: Father (Jan Nowicki)
- person_index: 1
pnv_name:
literalName: "Янъ Новицкій"
literalName_romanized: "Yan Novitskiy"
literalName_polish: "Jan Nowicki"
givenName: "Янъ"
givenName_romanized: "Yan"
givenName_polish: "Jan"
baseSurname: "Новицкій"
baseSurname_romanized: "Novitskiy"
baseSurname_polish: "Nowicki"
roles:
- role_title: "отецъ"
role_in_source: "father"
- role_title: "объявляющій"
role_in_source: "declarant"
biographical:
sex: "male"
age: 40
age_expression: "сорока лѣтъ отъ роду"
occupation: "земледѣлецъ (farmer)"
residence: "Любины (Lubin)"
literacy: "illiterate (implied - act read to him)"
family_relationships:
child:
- person_index: 0
target_name: "Стефанъ Новицкій"
spouse:
- person_index: 2
target_name: "Маріанна изъ Адамковъ"
possible_relative:
- person_index: 3
target_name: "Францишекъ Новицкій"
relationship_type: "same surname - possibly brother or cousin"
context: "Father of the infant, farmer from Lubin, appeared to register the birth"
# Person 2: Mother (Marianna nee Adamkow)
- person_index: 2
pnv_name:
literalName: "Маріанна изъ Адамковъ"
literalName_romanized: "Marianna iz Adamkov"
literalName_polish: "Maryanna z Adamkow"
givenName: "Маріанна"
givenName_romanized: "Marianna"
givenName_polish: "Maryanna"
maidenName: "Адамковъ"
maidenName_romanized: "Adamkov"
maidenName_polish: "Adamkow"
roles:
- role_title: "мать"
role_in_source: "mother"
biographical:
sex: "female"
age: 30
age_expression: "тридцати лѣтъ отъ роду"
marital_status: "законная жена (lawful wife)"
maiden_name_marker: "изъ (nee/z)"
family_relationships:
child:
- person_index: 0
target_name: "Стефанъ Новицкій"
spouse:
- person_index: 1
target_name: "Янъ Новицкій"
context: "Mother of the infant, lawful wife of Jan Nowicki"
# Person 3: First Witness (Franciszek Nowicki)
- person_index: 3
pnv_name:
literalName: "Францишекъ Новицкій"
literalName_romanized: "Frantsishek Novitskiy"
literalName_polish: "Franciszek Nowicki"
givenName: "Францишекъ"
givenName_romanized: "Frantsishek"
givenName_polish: "Franciszek"
baseSurname: "Новицкій"
baseSurname_romanized: "Novitskiy"
baseSurname_polish: "Nowicki"
roles:
- role_title: "свидѣтель"
role_in_source: "witness"
biographical:
sex: "male"
age: 40
age_expression: "сорока лѣтъ"
occupation: "земледѣлецъ (farmer)"
residence: "Любины (Lubin)"
literacy: "illiterate (неграмотный)"
family_relationships:
possible_relative:
- person_index: 1
target_name: "Янъ Новицкій"
relationship_type: "same surname, same age, same village - possibly brother"
context: "First witness, farmer from Lubin, same surname as father"
# Person 4: Second Witness (Michal Wlodarczyk)
- person_index: 4
pnv_name:
literalName: "Михаилъ Влодарчикъ"
literalName_romanized: "Mikhail Vlodarchik"
literalName_polish: "Michal Wlodarczyk"
givenName: "Михаилъ"
givenName_romanized: "Mikhail"
givenName_polish: "Michal"
baseSurname: "Влодарчикъ"
baseSurname_romanized: "Vlodarchik"
baseSurname_polish: "Wlodarczyk"
roles:
- role_title: "свидѣтель"
role_in_source: "witness"
biographical:
sex: "male"
age: 60
age_expression: "шестидесяти лѣтъ отъ роду"
occupation: "земледѣлецъ (farmer)"
residence: "Любины (Lubin)"
literacy: "illiterate (неграмотный)"
family_relationships: {}
context: "Second witness, farmer from Lubin, age 60"
# Person 5: Godfather (Wojciech Gaudyn)
- person_index: 5
pnv_name:
literalName: "Войцех Гаудынъ"
literalName_romanized: "Voytsekh Gaudyn"
literalName_polish: "Wojciech Gaudyn"
givenName: "Войцех"
givenName_romanized: "Voytsekh"
givenName_polish: "Wojciech"
baseSurname: "Гаудынъ"
baseSurname_romanized: "Gaudyn"
baseSurname_polish: "Gaudyn"
roles:
- role_title: "воспріемникъ"
role_in_source: "godfather"
biographical:
sex: "male"
family_relationships:
godchild:
- person_index: 0
target_name: "Стефанъ Новицкій"
context: "Godfather (baptismal sponsor)"
# Person 6: Godmother (Katarzyna Gembka)
- person_index: 6
pnv_name:
literalName: "Катаржина Гембка"
literalName_romanized: "Katarzhina Gembka"
literalName_polish: "Katarzyna Gembka"
givenName: "Катаржина"
givenName_romanized: "Katarzhina"
givenName_polish: "Katarzyna"
baseSurname: "Гембка"
baseSurname_romanized: "Gembka"
baseSurname_polish: "Gembka"
roles:
- role_title: "воспріемница"
role_in_source: "godmother"
biographical:
sex: "female"
family_relationships:
godchild:
- person_index: 0
target_name: "Стефанъ Новицкій"
context: "Godmother (baptismal sponsor)"
# Person 7: Priest (Pawel Wyborski)
- person_index: 7
pnv_name:
literalName: "Ксндзъ Павелъ Выборскій"
literalName_romanized: "Ksndz Pavel Vyborskiy"
literalName_polish: "Ksiadz Pawel Wyborski"
givenName: "Павелъ"
givenName_romanized: "Pavel"
givenName_polish: "Pawel"
baseSurname: "Выборскій"
baseSurname_romanized: "Vyborskiy"
baseSurname_polish: "Wyborski"
honorificPrefix: "Ксндзъ (Priest)"
roles:
- role_title: "ксндзъ"
role_in_source: "priest"
- role_title: "registrar"
role_in_source: "signed the act"
biographical:
sex: "male"
ecclesiastical_status: "Roman Catholic priest"
literacy: "literate (only signer)"
family_relationships: {}
context: "Officiating priest who performed baptism and signed the registration"
temporal_references:
- expression: "тысяча восемьсоть девяносто третяго (четвертаго) года"
expression_romanized: "tysyacha vosem'sot' devyanosto tret'yago (chetvertago) goda"
normalized_julian: "1893"
normalized_gregorian: "1894"
calendar: "Dual (Julian/Gregorian)"
type: "YEAR"
note: "Document shows both Julian (1893) and Gregorian (1894) years"
- expression: "двадцать седьмаго Декабря /:восьмаго Января:/"
expression_romanized: "dvadtsat' sed'mago Dekabrya /:vos'mago Yanvarya:/"
normalized_julian: "1893-12-27"
normalized_gregorian: "1894-01-08"
calendar: "Dual (Julian/Gregorian)"
type: "DATE"
event: "registration and baptism"
- expression: "двадцать пятаго Декабря /:шестаго Января:/"
expression_romanized: "dvadtsat' pyatago Dekabrya /:shestago Yanvarya:/"
normalized_julian: "1893-12-25"
normalized_gregorian: "1894-01-06"
calendar: "Dual (Julian/Gregorian)"
type: "DATE"
event: "birth"
note: "25 December by Julian reckoning (Orthodox Christmas); for this Roman Catholic family the Gregorian date, 6 January, was Epiphany"
- expression: "въ четыре часа вечеромъ"
expression_romanized: "v chetyre chasa vecherom"
normalized: "16:00"
type: "TIME"
event: "birth"
- expression: "въ одинадцать часовъ утра"
expression_romanized: "v odinnadtsat' chasov utra"
normalized: "11:00"
type: "TIME"
event: "registration"
locations_mentioned:
- name: "Осѣкъ Велькій"
name_romanized: "Osek Vel'kiy"
name_polish: "Osiek Wielki"
type: "village (derevnya)"
modern_location: "Greater Poland Voivodeship, Poland"
coordinates: "52.2461, 18.6207"
geonames_url: "https://www.google.com/maps/place/Osiek+Wielki,+Poland"
- name: "Любины"
name_romanized: "Lyubiny"
name_polish: "Lubin"
type: "village"
note: "Village where the family resided and child was born"
- name: "Parafia Rzymskokatolicka Osiek Wielki"
type: "parish"
note: "Roman Catholic Parish of Osiek Wielki - registration authority"
# -----------------------------------------------------------------------------
# Russian/Polish Naming Conventions Demonstrated
# -----------------------------------------------------------------------------
naming_conventions_notes: |
Congress Poland naming conventions demonstrated in this REAL document:
1. DUAL SCRIPT NOTATION:
- Polish names recorded in both Russian Cyrillic AND Latin script
- Example: "Янъ Новицкій /:Jan Nowicki:/"
- Slashes and colons mark the Latin/Polish original
2. PRE-REVOLUTIONARY ORTHOGRAPHY:
- Hard sign (ъ) after word-final consonants: Янъ, Стефанъ (also decimal і before vowels/й: Новицкій)
- Yat instead of e: лѣтъ, деревнѣ, свидѣтелямъ
- -аго/-яго genitive endings (later simplified to -ого/-его)
3. POLISH MAIDEN NAME CONVENTION:
- "изъ Адамковъ" = "z Adamkow" = nee Adamkow
- "изъ" (from) marks maiden/birth name
4. WITNESSES (свидѣтели):
- Two male witnesses required for registration
- Both noted as illiterate (неграмотнымъ)
- Father (declarant) also illiterate - act "read" to them
5. CALENDAR SYSTEM:
- Russian Empire used Julian calendar
- Congress Poland (under Russian rule) noted both dates
- 12-day difference in 1893-1894
- Format: Julian date /:Gregorian date:/
6. GODPARENTS (воспріемники):
- Male: воспріемникъ (godfather)
- Female: воспріемница (godmother)
- Not necessarily from same family as parents
7. SOCIAL/OCCUPATIONAL TERMS:
- земледѣлецъ = farmer/agriculturalist
- ксндзъ = ksiadz (Polish priest title; from Proto-Slavic *kъnędzь, an early Germanic loan, cf. *kuningaz "king")
# -----------------------------------------------------------------------------
# Provenance
# -----------------------------------------------------------------------------
provenance:
data_status: "REAL_HISTORICAL_DATA"
archive:
name: "Archiwum Panstwowe w Poznaniu Oddzial w Koninie"
name_english: "State Archive in Poznan, Konin Branch"
collection: "Akta stanu cywilnego Parafii Rzymskokatolickiej Osiek Wielki (pow. kolski)"
collection_english: "Civil Registration Records of the Roman Catholic Parish of Osiek Wielki (Kolo district)"
reference_code: "54/792/0/6.1/140"
scan_number: "4 of 76"
document_metadata:
date_julian: "1893-12-27"
date_gregorian: "1894-01-08"
digital_access:
archive_url: "https://szukajwarchiwach.gov.pl"
tutorial_url: "https://script.byu.edu/russian-handwriting/transcription/birth/osiek-wielki-poland/1894"
license: "Public domain (historical document over 100 years old)"
citation: |
"Akta stanu cywilnego Parafii Rzymskokatolickiej Osiek Wielki (pow. kolski),"
Archiwum Panstwowe w Poznaniu Oddzial w Koninie, Szukaj w Archiwach
(szukajwarchiwach.gov.pl: accessed 25 January 2023), entry for Stefan Novitsky,
Catholic birth record, 6 January 1894 (Gregorian date), Osiek Wielki, Czolowo,
Kolo, Kaliska, Russian Empire, Reference Code 54/792/0/6.1/140, scan no. 4 of 76.
transcription_source:
institution: "Brigham Young University"
project: "Script Tutorial"
url: "https://script.byu.edu/russian-handwriting/transcription/birth/osiek-wielki-poland/1894"
access_date: "2025-01-13"
notes: "Complete line-by-line transcription with Russian original, romanization, and English translation"
verification_notes: |
This is a REAL historical document with verified transcription:
- Original held at Polish State Archives (Archiwum Panstwowe)
- Transcribed and verified by BYU Script Tutorial paleographers
- All 8 persons are real historical individuals
- Names provided in both Russian Cyrillic and Polish Latin script in original
- Stefan Nowicki born 6 January 1894 (Gregorian) in Lubin village
- Family: farmers (zemledelcy) in Greater Poland region
- Document context: Congress Poland under Russian Imperial rule

View file

@ -0,0 +1,281 @@
# =============================================================================
# PiCo Example 11: Ottoman Turkish Sijill (Court Record)
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/examples/
# Parent: _examples_index.yaml
#
# DATA STATUS: SYNTHETIC_EXAMPLE
#
# Demonstrates extraction from an Ottoman Turkish court record (sijill) showing:
# - Ottoman Turkish in Arabic script
# - Honorific titles: Ağa, Efendi, Çelebi, Hatun
# - Patronymics: bin (son of), bint (daughter of)
# - Deceased markers: merhum/merhume
# - Hijri calendar
# - Mixed Arabic-Turkish vocabulary
# - Court terminology
#
# Language: Ottoman Turkish (Arabic script)
# Period: 1258 AH (1842 CE)
# Source Type: Sijill (Sharia Court Register)
# Archive Context: Şer'iyye Sicilleri (Islamic Court Registers)
#
# Last Updated: 2025-12-12
# =============================================================================
example_id: "example_11_ottoman_sijill"
example_title: "Ottoman Court Record (Sijill) - Property Sale, Demirciköy 1258 AH"
data_status: "SYNTHETIC_EXAMPLE"
source_language: "Ottoman Turkish"
source_script: "Arabic"
source_period: "1258 AH (1842 CE)"
source_type: "sijill"
document_subtype: "property_sale"
archive_context: "Şer'iyye Sicilleri (Islamic Court Registers)"
description: |
This example demonstrates extraction from an Ottoman Turkish sijill
(Islamic court register) documenting a property sale transaction.
Key features demonstrated:
- Ottoman Turkish written in Arabic script
- Honorific titles indicating social class (Ağa, Efendi, Çelebi, Hatun)
- Arabic patronymic markers (bin, bint)
- Turkish patronymic suffix (-oğlu)
- Deceased markers (merhum/merhume)
- Hijri lunar calendar dating
- Mixed Arabic-Turkish legal vocabulary
- Court record terminology (şahid, mübayi', ba'i)
source_text: |
بسم الله الرحمن الرحيم
مجلس شرع شريفده محمد آغا بن عبد الله مرحوم قصبه دميرجی‌کوی
ساکنلرندن محمد بن احمد افندی و زوجه‌سی فاطمه خاتون بنت علی‌اوغلو
حاضر اولوب محمد آغا طرفندن یکری بش غروش بدل معلوم ایله صاتیلدی
شهود الحال: حسن افندی بن عمر، ابراهیم چلبی بن مصطفی
فی اوائل شهر رجب سنة ١٢٥٨
source_text_romanized: |
Bismillahirrahmanirrahim
Meclis-i şer'-i şerifde Mehmed Ağa bin Abdullah merhum kasaba Demirciköy
sakinlerinden Mehmed bin Ahmed Efendi ve zevcesi Fatma Hatun bint Ali-oğlu
hazır olub Mehmed Ağa tarafından yirmi beş guruş bedel-i ma'lum ile satıldı
Şuhud al-hal: Hasan Efendi bin Ömer, İbrahim Çelebi bin Mustafa
Fi evail-i şehr-i Receb sene 1258
source_text_english: |
In the name of God, the Merciful, the Compassionate
In the noble Sharia court, Mehmed Ağa son of the late Abdullah [appeared];
Mehmed son of Ahmed Efendi and his wife Fatma Hatun daughter of Ali-oğlu,
residents of the town of Demirciköy, being present, [the property] was sold
[to them] by Mehmed Ağa for the known price of twenty-five guruş.
Witnesses present: Hasan Efendi son of Ömer, İbrahim Çelebi son of Mustafa
In early Receb of the year 1258 [Hijri]
expected_extraction:
pico_observation:
observation_id: "sijill_demircikoy_1258ah_sale"
source_type: "sijill"
source_reference: "Şer'iyye Sicili, Demirciköy, Receb 1258 AH"
persons:
- person_index: 0
pnv_name:
literalName: "محمد آغا بن عبد الله"
literalName_romanized: "Mehmed Ağa bin Abdullah"
givenName: "محمد"
givenName_romanized: "Mehmed"
title: "آغا (Ağa)"
patronymic: "بن عبد الله"
patronymic_romanized: "bin Abdullah"
roles:
- role_title: "با‌ئع (ba'i)"
role_in_source: "seller"
biographical:
sex: "male"
social_rank: "Ağa (military/landowning class)"
family_relationships:
father:
- name: "عبد الله (Abdullah)"
status: "deceased"
deceased_marker: "مرحوم (merhum)"
context: "Seller; son of the late Abdullah (merhum refers to the father). Ağa = military/landowning"
- person_index: 1
pnv_name:
literalName: "محمد بن احمد افندی"
literalName_romanized: "Mehmed bin Ahmed Efendi"
givenName: "محمد"
givenName_romanized: "Mehmed"
title: "افندی (Efendi)"
patronymic: "بن احمد"
patronymic_romanized: "bin Ahmed"
roles:
- role_title: "مشتری (müşteri)"
role_in_source: "buyer"
biographical:
sex: "male"
residence: "Demirciköy"
social_rank: "Efendi (educated class)"
family_relationships:
father:
- name: "احمد (Ahmed)"
spouse:
- person_index: 2
target_name: "Fatma Hatun"
context: "Buyer, Efendi = literate/administrative"
- person_index: 2
pnv_name:
literalName: "فاطمه خاتون بنت علی‌اوغلو"
literalName_romanized: "Fatma Hatun bint Ali-oğlu"
givenName: "فاطمه"
givenName_romanized: "Fatma"
title: "خاتون (Hatun)"
patronymic: "بنت علی‌اوغلو"
patronymic_romanized: "bint Ali-oğlu"
roles:
- role_title: "مشتری (müşteri)"
role_in_source: "buyer"
- role_title: "زوجه (zevce)"
role_in_source: "wife"
biographical:
sex: "female"
marital_status: "married"
social_rank: "Hatun (respectable woman)"
family_relationships:
father:
- name: "علی‌اوغلو (Ali-oğlu)"
spouse:
- person_index: 1
target_name: "Mehmed Efendi"
context: "Wife of buyer, co-purchaser"
- person_index: 3
pnv_name:
literalName: "حسن افندی بن عمر"
literalName_romanized: "Hasan Efendi bin Ömer"
givenName: "حسن"
givenName_romanized: "Hasan"
title: "افندی (Efendi)"
patronymic: "بن عمر"
patronymic_romanized: "bin Ömer"
roles:
- role_title: "شاهد (şahid)"
role_in_source: "witness"
biographical:
sex: "male"
social_rank: "Efendi"
family_relationships:
father:
- name: "عمر (Ömer)"
context: "First witness"
- person_index: 4
pnv_name:
literalName: "ابراهیم چلبی بن مصطفی"
literalName_romanized: "İbrahim Çelebi bin Mustafa"
givenName: "ابراهیم"
givenName_romanized: "İbrahim"
title: "چلبی (Çelebi)"
patronymic: "بن مصطفی"
patronymic_romanized: "bin Mustafa"
roles:
- role_title: "شاهد (şahid)"
role_in_source: "witness"
biographical:
sex: "male"
social_rank: "Çelebi (gentleman/merchant)"
family_relationships:
father:
- name: "مصطفی (Mustafa)"
context: "Second witness"
temporal_references:
- expression: "فی اوائل شهر رجب سنة ١٢٥٨"
expression_romanized: "fi evail-i şehr-i Receb sene 1258"
normalized: "1842-08"
calendar: "Hijri"
type: "DATE"
conversion_note: "Receb 1258 AH ≈ 8 August - 6 September 1842 CE; 'evail' (first ten days) ≈ 8-17 August 1842"
locations_mentioned:
- name: "قصبه دميرجی‌کوی"
name_romanized: "kasaba Demirciköy"
type: "town (kasaba)"
- name: "مجلس شرع شريف"
name_romanized: "meclis-i şer'-i şerif"
type: "court"
ottoman_naming_notes: |
Ottoman Turkish naming conventions:
HONORIFIC TITLES:
- آغا (Ağa): Military commander, landowner
- افندی (Efendi): Educated person, official
- چلبی (Çelebi): Gentleman, merchant
- خاتون (Hatun): Respectable woman
PATRONYMIC PATTERNS:
- بن (bin): Son of (Arabic)
- بنت (bint): Daughter of (Arabic)
- اوغلو (-oğlu): Son of (Turkish)
DECEASED MARKERS:
- مرحوم (merhum): The late (man)
- مرحومه (merhume): The late (woman)
CALENDAR: Hijri lunar (354/355 days)
Receb 1258 AH ≈ August-September 1842 CE
provenance:
data_status: "SYNTHETIC_EXAMPLE"
notes: |
This example uses synthetic data based on authentic Ottoman Turkish
sijill (court register) formulae for demonstration purposes. Names,
dates, and locations are fictional but follow authentic 19th-century
patterns. For real examples, see PROVENANCE_SOURCES.md.
related_real_sources:
- archive: "OpenJerusalem Project"
collection: "Jerusalem Sharia Court Registers"
digital_url: "https://www.openjerusalem.org/"
ark_identifier: "ark:/58142/PfV7b"
volume_count: "102 registers"
period: "1834-1920 CE"
languages: "Ottoman Turkish, Arabic"
license: "Open Access"
document_types: "Property sales, marriage contracts, inheritance, waqf"
- archive: "İslam Araştırmaları Merkezi (ISAM)"
collection: "Istanbul Kadı Sicilleri"
digital_url: "http://www.kadisicilleri.org/"
volume_count: "40+ volumes online"
document_count: "40,000+ documents"
period: "16th-19th century CE"
language: "Ottoman Turkish"
license: "Research access"
- archive: "Istanbul Metropolitan Municipality"
project: "History of Istanbul"
digital_url: "https://istanbultarihi.ist/434-istanbul-sharia-court-registers"
volume_count: "~10,000 volumes"
courts: "26 different courts"
period: "1453-1922 CE"
notes: "Largest collection of Ottoman court records in existence"
- archive: "Harvard University"
project: "Ottoman Court Records Project (OCRP)"
digital_url: "https://cmes.fas.harvard.edu/projects/ocrp"
document_types: "Sijill transcriptions, translations"
period: "16th-19th century CE"

View file

@ -0,0 +1,315 @@
# =============================================================================
# PiCo Examples Index
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/examples/
# Parent: pico/_index.yaml
#
# This file provides a manifest and overview of all 11 PiCo extraction examples,
# covering 10 different languages, scripts, and historical record types.
#
# Last Updated: 2025-12-12
# =============================================================================
module_id: "pico_examples"
module_title: "PiCo Historical Extraction Examples"
version: "1.0.0"
description: |
A comprehensive collection of 11 extraction examples demonstrating PiCo
(Person In Context Ontology) patterns for historical person data extraction
from primary source documents spanning 10 languages and 6 centuries.
# =============================================================================
# EXAMPLES OVERVIEW
# =============================================================================
examples_summary:
total_examples: 11
synthetic_examples: 9
real_data_examples: 2
languages_covered:
- Dutch
- English
- Arabic
- Hebrew
- Spanish
- Italian
- Greek
- Russian
- Polish
- Ottoman Turkish
scripts_covered:
- Latin
- Arabic
- Hebrew
- Greek (polytonic)
- Cyrillic
calendars_covered:
- Gregorian
- Julian
- Hijri (Islamic)
- Hebrew
time_period: "1492 CE - 2025 CE"
# =============================================================================
# EXAMPLES CATALOG
# =============================================================================
examples:
# ---------------------------------------------------------------------------
# Example 01: Dutch Marriage Act (1823)
# ---------------------------------------------------------------------------
- file: "01_dutch_marriage.yaml"
example_id: "example_01_dutch_marriage"
title: "Dutch Civil Marriage Act - Leeuwarden 1823"
data_status: "SYNTHETIC_EXAMPLE"
language: "Dutch"
script: "Latin"
period: "1823 CE"
source_type: "burgerlijke_stand"
document_type: "Marriage certificate"
features:
- Dutch patronymics (-zoon, -dochter)
- Napoleonic civil registration format
- Occupation and age recording
- Witness systems
persons_extracted: 6
# ---------------------------------------------------------------------------
# Example 02: Dutch Notarial Protocol (1789)
# ---------------------------------------------------------------------------
- file: "02_notarial_protocol.yaml"
example_id: "example_02_dutch_notarial"
title: "Dutch Notarial Protocol - Amsterdam 1789"
data_status: "SYNTHETIC_EXAMPLE"
language: "Dutch"
script: "Latin"
period: "1789 CE"
source_type: "notarial_protocol"
document_type: "Testament/Will"
features:
- VOC (Dutch East India Company) context
- Colonial-era naming
- Marital property conventions
- Witness and notary roles
persons_extracted: 5
# ---------------------------------------------------------------------------
# Example 03: Dutch Church Baptism (1650)
# ---------------------------------------------------------------------------
- file: "03_church_baptism.yaml"
example_id: "example_03_dutch_baptism"
title: "Dutch Reformed Church Baptism - Delft 1650"
data_status: "SYNTHETIC_EXAMPLE"
language: "Dutch"
script: "Latin"
period: "1650 CE"
source_type: "church_register"
document_type: "Baptismal record"
features:
- Dutch Reformed Church records
- Golden Age naming conventions
- Godparent (getuige) system
- Artisan occupations
persons_extracted: 5
# ---------------------------------------------------------------------------
# Example 04: LinkedIn Profile (2025)
# ---------------------------------------------------------------------------
- file: "04_linkedin_profile.yaml"
example_id: "example_04_linkedin_modern"
title: "Modern LinkedIn Profile - Heritage Sector Professional"
data_status: "SYNTHETIC_EXAMPLE"
language: "English"
script: "Latin"
period: "2025 CE"
source_type: "social_media_profile"
document_type: "Professional profile"
features:
- Modern digital naming conventions
- Career trajectory extraction
- Heritage sector roles
- Digital platform metadata
persons_extracted: 1
# ---------------------------------------------------------------------------
# Example 05: Arabic Waqf Document (1312 AH)
# ---------------------------------------------------------------------------
- file: "05_arabic_waqf.yaml"
example_id: "example_05_arabic_waqf"
title: "Arabic Waqf Document - Cairo 1312 AH (1894 CE)"
data_status: "SYNTHETIC_EXAMPLE"
language: "Arabic"
script: "Arabic"
period: "1312 AH (1894 CE)"
source_type: "waqf_document"
document_type: "Islamic endowment deed"
features:
- Classical Arabic naming (ibn, bint)
- Honorific titles (Pasha, Bey, Effendi, Hanem)
- Hijri calendar
- Islamic legal terminology
persons_extracted: 6
# ---------------------------------------------------------------------------
# Example 06: Hebrew Ketubah (1742) - REAL DATA
# ---------------------------------------------------------------------------
- file: "06_hebrew_ketubah.yaml"
example_id: "example_06_hebrew_ketubah"
title: "Hebrew Marriage Contract (Ketubah) - Modena 1742"
data_status: "REAL_HISTORICAL_DATA"
language: "Hebrew"
script: "Hebrew"
period: "5502 AM (1742 CE)"
source_type: "ketubah"
document_type: "Jewish marriage contract"
archive: "Yale University Beinecke Library"
ark_id: "ark:/15534/c27p8thn"
features:
- Hebrew naming (ben, bat)
- Hebrew calendar
- Rabbinic titles (HaRav, Morenu)
- Ketubah legal formulae
persons_extracted: 6
real_data_citation: |
Beinecke Rare Book and Manuscript Library, Yale University
General Collection, GEN MSS 1309
Ketubah: Modena (Italy), 23 Sivan 5502 (June 12, 1742)
# ---------------------------------------------------------------------------
# Example 07: Spanish Colonial Record (1540)
# ---------------------------------------------------------------------------
- file: "07_spanish_colonial.yaml"
example_id: "example_07_spanish_colonial"
title: "Spanish Colonial Encomienda Record - Nueva España 1540"
data_status: "SYNTHETIC_EXAMPLE"
language: "Spanish"
script: "Latin"
period: "1540 CE"
source_type: "colonial_record"
document_type: "Encomienda grant"
features:
- Spanish colonial naming
- Honorific titles (Don, Doña)
- Indigenous name recording
- Colonial administrative terminology
persons_extracted: 5
# ---------------------------------------------------------------------------
# Example 08: Italian Notarial Record (1492)
# ---------------------------------------------------------------------------
- file: "08_italian_notarial.yaml"
example_id: "example_08_italian_notarial"
title: "Italian Notarial Act - Florence 1492"
data_status: "SYNTHETIC_EXAMPLE"
language: "Italian/Latin"
script: "Latin"
period: "1492 CE"
source_type: "notarial_act"
document_type: "Marriage contract"
features:
- Renaissance Italian naming
- Latin legal formulae
- Florentine patronymics
- Notarial conventions
persons_extracted: 6
# ---------------------------------------------------------------------------
# Example 09: Greek Orthodox Baptism (1875)
# ---------------------------------------------------------------------------
- file: "09_greek_orthodox.yaml"
example_id: "example_09_greek_baptismal_register"
title: "Greek Orthodox Baptismal Register - Thessaloniki 1875"
data_status: "SYNTHETIC_EXAMPLE"
language: "Greek"
script: "Greek (polytonic)"
period: "1875 CE"
source_type: "baptismal_register"
document_type: "Church baptismal record"
features:
- Greek patronymics (του + genitive)
- Polytonic Greek orthography
- Godparent system (νονός/νονά)
- Deceased markers (μακαρίτης)
- Julian calendar
persons_extracted: 7
# ---------------------------------------------------------------------------
# Example 10: Russian Metrical Book (1894) - REAL DATA
# ---------------------------------------------------------------------------
- file: "10_russian_metrical.yaml"
example_id: "example_10_russian_metrical_book"
title: "Russian Imperial Metrical Book - Birth of Stefan Nowicki (1894)"
data_status: "REAL_HISTORICAL_DATA"
language: "Russian/Polish"
script: "Cyrillic"
period: "1894 CE"
source_type: "metrical_book"
document_type: "Birth registration"
archive: "Archiwum Państwowe w Poznaniu"
features:
- Cyrillic script with romanization
- Polish names in Russian
- Pre-revolutionary orthography (ъ, ѣ)
- Julian/Gregorian dual dating
- Восприемники (godparents)
persons_extracted: 8
real_data_citation: |
Archiwum Państwowe w Poznaniu, Oddział w Koninie (State Archive in Poznań, Konin Branch)
BYU Script Tutorial transcription
Russian Imperial metrical book, Osiek Wielki parish (Lubin village), 1894
# ---------------------------------------------------------------------------
# Example 11: Ottoman Sijill (1258 AH / 1842 CE)
# ---------------------------------------------------------------------------
- file: "11_ottoman_sijill.yaml"
example_id: "example_11_ottoman_sijill"
title: "Ottoman Court Record (Sijill) - Property Sale, Demirciköy 1258 AH"
data_status: "SYNTHETIC_EXAMPLE"
language: "Ottoman Turkish"
script: "Arabic"
period: "1258 AH (1842 CE)"
source_type: "sijill"
document_type: "Property sale (court record)"
features:
- Ottoman Turkish in Arabic script
- Honorific titles (Ağa, Efendi, Çelebi, Hatun)
- Arabic patronymics (bin, bint)
- Turkish patronymic (-oğlu)
- Hijri calendar
- Islamic court terminology
persons_extracted: 5
# =============================================================================
# USAGE NOTES
# =============================================================================
usage_notes: |
These examples are designed for:
1. TRAINING: Use as training data for NER/extraction models
2. TESTING: Validate extraction pipelines against known outputs
3. DOCUMENTATION: Understand PiCo patterns for different document types
4. REFERENCE: Language-specific naming convention guides
IMPORTANT DISTINCTIONS:
- SYNTHETIC_EXAMPLE: Created for demonstration; names/dates are fictional
- REAL_HISTORICAL_DATA: Actual archival records with full provenance
Each example includes:
- source_text: Original text in source language/script
- expected_extraction: Complete PiCo-compliant output
- [language]_naming_notes: Language-specific conventions
- provenance: Data status and related real sources
# =============================================================================
# RELATED RESOURCES
# =============================================================================
related_resources:
schema_files:
- "../schema/observation.yaml"
- "../schema/pnv_components.yaml"
- "../schema/relationships.yaml"
- "../schema/temporal.yaml"
parent_index: "../_index.yaml"
provenance_sources: "../../PROVENANCE_SOURCES.md"

View file

@ -0,0 +1,439 @@
# =============================================================================
# PiCo Integration Module: Observation Pattern
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/
# Parent: _index.yaml
#
# Description: Core PiCo observation pattern and PersonObservation class.
# Defines the source-bound observation layer that captures
# person mentions exactly as they appear in sources.
#
# Last Updated: 2025-01-13
# =============================================================================
# -----------------------------------------------------------------------------
# Core Observation Pattern
# -----------------------------------------------------------------------------
observation_pattern:
description: "Every person mention creates a PersonObservation"
class: "picom:PersonObservation"
class_uri: "https://personsincontext.org/model#PersonObservation"
properties:
- property: "picom:hasObservedName"
description: "The name string as it appears in text"
range: "pnv:PersonName"
cardinality: "1"
note: "Exact transcription of name from source"
- property: "picom:isObservationOf"
description: "Links to reconstructed Person entity"
range: "crm:E21_Person"
cardinality: "0..1"
note: "May be null if person not yet identified"
- property: "prov:hadPrimarySource"
description: "The source document/webpage"
range: "prov:Entity"
cardinality: "1"
note: "Required for provenance tracking"
- property: "picom:observedAt"
description: "When the observation was made"
range: "xsd:dateTime"
cardinality: "1"
note: "Extraction timestamp, not document date"
- property: "picom:observedInContext"
description: "Surrounding text context"
range: "xsd:string"
cardinality: "0..1"
note: "For disambiguation when reviewing"
- property: "picom:hasRole"
description: "Role/position observed with the person"
range: "xsd:string"
cardinality: "0..*"
note: "Links to ROL hypernym when extracted"
# -----------------------------------------------------------------------------
# Person Reconstruction Pattern
# -----------------------------------------------------------------------------
person_reconstruction_pattern:
description: |
A PersonReconstruction is created by linking one or more PersonObservations
to form a unified person entity. This is the scholarly interpretation layer
that connects source-bound observations to a conceptual person.
Key distinction:
- PersonObservation: What is OBSERVED in a specific source (exact transcription)
- PersonReconstruction: What is INFERRED about the person (normalized, linked)
A single PersonReconstruction may derive from observations across:
- Multiple sources (birth record + marriage record + death record)
- Different time periods (mentions across decades)
- Various name forms ("Jan Jansz" + "Johannes Jansen" + "J. Jansen")
class: "pico:PersonReconstruction"
class_uri: "https://personsincontext.org/model#PersonReconstruction"
superclass: "pico:Person"
required_properties:
- property: "prov:wasDerivedFrom"
description: "Links to source PersonObservation(s)"
range: "pico:PersonObservation"
cardinality: "1..*"
note: "Every reconstruction MUST link to at least one observation"
- property: "prov:wasGeneratedBy"
description: "Links to the reconstruction Activity"
range: "prov:Activity"
cardinality: "1"
note: "Documents how/when/by whom reconstruction was created"
optional_properties:
- property: "prov:wasRevisionOf"
description: "Links to previous version of this reconstruction"
range: "pico:PersonReconstruction"
cardinality: "0..1"
note: "For tracking updates to reconstructions over time"
- property: "sdo:name"
description: "Normalized/preferred name form"
range: "xsd:string"
note: "The canonical name for this person"
- property: "sdo:additionalName"
description: "Structured name following PNV"
range: "pnv:PersonName"
note: "Full name breakdown using Person Name Vocabulary"
- property: "sdo:givenName"
description: "Given/first name"
range: "xsd:string"
- property: "sdo:familyName"
description: "Family/surname"
range: "xsd:string"
- property: "sdo:gender"
description: "Gender of the person"
range: "sdo:GenderType"
values: ["sdo:Male", "sdo:Female"]
- property: "sdo:birthDate"
description: "Birth date (ISO 8601)"
range: "xsd:date"
note: "May be incomplete: YYYY, YYYY-MM, or YYYY-MM-DD"
- property: "sdo:birthPlace"
description: "Place of birth"
range: "xsd:string or xsd:anyURI"
note: "Prefer linking to GeoNames or Wikidata"
- property: "sdo:deathDate"
description: "Death date (ISO 8601; may be incomplete, e.g. year only)"
range: "xsd:date"
- property: "sdo:deathPlace"
description: "Place of death"
range: "xsd:string or xsd:anyURI"
example:
description: "PersonReconstruction derived from multiple observations"
turtle: |
cbg:person_reconstruction_anna_koppen
a pico:PersonReconstruction ;
sdo:name "Anna Maria Koppen" ;
sdo:familyName "Koppen" ;
sdo:givenName "Anna Maria" ;
sdo:gender sdo:Female ;
sdo:birthPlace "Haarlem" ;
sdo:birthDate "1860-03-31"^^xsd:date ;
sdo:deathPlace "Detroit, USA" ;
sdo:deathDate "1926"^^xsd:gYear ;
prov:wasDerivedFrom nha:marriage_1885_po_1 ,
cbg:emigration_1887_po_1 ,
us:death_1926_po_1 ;
prov:wasGeneratedBy cbg:reconstruction_activity_01 .
# -----------------------------------------------------------------------------
# Source and Scan Classes
# -----------------------------------------------------------------------------
source_classes:
archive_component:
description: |
A Source document from which PersonObservations are extracted.
PiCo does not aim to fully describe archival sources (use RiC-O or DC for that),
but requires minimal identification for provenance tracking.
class: "sdo:ArchiveComponent"
class_uri: "https://schema.org/ArchiveComponent"
superclass: "sdo:CreativeWork"
properties:
- property: "sdo:name"
description: "Identifying name for the source"
range: "xsd:string"
cardinality: "1"
note: "Combine title, date, archive location for identification"
example: "BS Marriage Haarlem, November 11, 1885, certificate number 321"
- property: "sdo:additionalType"
description: "Type of source document"
range: "picot_sourcetypes:Concept"
note: "Use PiCo SourceType thesaurus"
- property: "sdo:dateCreated"
description: "Date the source was created"
range: "xsd:date"
- property: "sdo:holdingArchive"
description: "Institution holding the source"
range: "xsd:anyURI"
note: "Link to heritage custodian (GHCID or Wikidata)"
- property: "sdo:url"
description: "Permalink to the source"
range: "sdo:URL"
note: "Preferably a persistent identifier"
- property: "sdo:contentLocation"
description: "Geographic coverage of the source"
range: "xsd:string or xsd:anyURI"
- property: "sdo:associatedMedia"
description: "Link to scan(s) of the source"
range: "sdo:ImageObject"
cardinality: "0..*"
image_object:
description: |
A Scan of a source document. Links to the digital image at the holding archive.
class: "sdo:ImageObject"
class_uri: "https://schema.org/ImageObject"
superclass: "sdo:CreativeWork"
properties:
- property: "sdo:url"
description: "URL to the full scan"
range: "sdo:URL"
note: "Preferably IIIF manifest"
- property: "sdo:thumbnail"
description: "Thumbnail image of the scan (an ImageObject, not a bare URL)"
range: "sdo:ImageObject"
- property: "sdo:embedUrl"
description: "URL to image viewer"
range: "sdo:URL"
- property: "sdo:position"
description: "Position in sequence of scans"
range: "xsd:int"
note: "For multi-page sources"
# -----------------------------------------------------------------------------
# Biographical Properties
# -----------------------------------------------------------------------------
biographical_properties:
description: |
Biographical properties capture personal details as they appear in sources.
These are used for both PersonObservation (source-bound) and
PersonReconstruction (normalized).
age:
property: "pico:hasAge"
property_uri: "https://personsincontext.org/model#hasAge"
description: "Age of person as stated in source"
range: "xsd:string"
domain: "pico:PersonObservation"
note: |
Used when birth date unknown but age is recorded.
Age assumed in years unless specified ("4" = 4 years, "4 months" = 4 months).
Numerical preferred over text ("4" not "four").
examples:
- "30"
- "4 months"
- "about 25"
religion:
property: "pico:hasReligion"
property_uri: "https://personsincontext.org/model#hasReligion"
description: "Religious affiliation as stated in source"
range: "xsd:string or xsd:anyURI"
domain: "pico:Person"
note: "Can link to SKOS thesaurus for religions"
examples:
- "Catholic"
- "Reformed"
- "Jewish"
deceased:
property: "pico:deceased"
property_uri: "https://personsincontext.org/model#deceased"
description: "Indication that person is deceased (when death date unknown)"
range: "xsd:boolean"
domain: "pico:PersonObservation"
note: |
Only used when deathDate is unknown but death is indicated.
A person without deathDate and without deceased:true is assumed alive.
Important for privacy considerations in publishing person records.
gender:
property: "sdo:gender"
property_uri: "https://schema.org/gender"
description: "Gender of the person"
range: "sdo:GenderType"
domain: "pico:Person"
values:
- uri: "sdo:Male"
label: "Male"
- uri: "sdo:Female"
label: "Female"
address:
property: "sdo:address"
property_uri: "https://schema.org/address"
description: "Physical address as mentioned in source"
range: "xsd:string"
domain: "pico:PersonObservation"
note: "Address exactly as recorded in source"
initials:
property: "pnv:initials"
property_uri: "https://w3id.org/pnv#initials"
description: "Initials of given name(s)"
range: "xsd:string"
domain: "pnv:PersonName"
note: "Each initial followed by period (e.g., 'P.R.', 'H.A.F.M.O.')"
examples:
- "P.R."
- "C.Joh."
- "H.A.F.M.O."
# -----------------------------------------------------------------------------
# Hypernym Mapping (GLAM-NER v1.7.0)
# -----------------------------------------------------------------------------
hypernym_mapping:
description: "How PiCo concepts map to GLAM-NER v1.7.0 hypernyms"
mappings:
- pico_class: "picom:PersonObservation"
glam_hypernym: "AGT.PER"
glam_code: "AGT.PER"
note: "Person observations create AGT.PER entities"
- pico_class: "picom:PersonObservation"
glam_hypernym: "AGT.STF"
glam_code: "AGT.STF"
condition: "When observed with organizational role"
note: "Staff members with role context"
- pico_class: "pnv:PersonName"
glam_hypernym: "APP.NAM"
glam_code: "APP.NAM"
note: "Name strings as appellations"
- pico_class: "picom:hasRole"
glam_hypernym: "ROL"
glam_code: "ROL"
note: "Extracted roles link to ROL hypernym"
# -----------------------------------------------------------------------------
# Simple Examples
# -----------------------------------------------------------------------------
examples:
- description: "Staff member with title and role"
text: "Dr. Maria van den Berg, Director"
observation:
type: "picom:PersonObservation"
id: "_:obs1"
hasObservedName:
type: "pnv:PersonName"
literalName: "Dr. Maria van den Berg"
honorificPrefix: "Dr."
givenName: "Maria"
surnamePrefix: "van den"
baseSurname: "Berg"
hasRole: "Director"
hadPrimarySource: "https://example.org/staff-page"
observedAt: "2025-12-02T10:30:00Z"
glam_ner_annotations:
- span: "Dr. Maria van den Berg"
type: "AGT.STF"
code: "AGT.STF"
confidence: 0.95
- span: "Director"
type: "ROL.TIT"
code: "ROL.TIT"
confidence: 0.98
- description: "Historical artist"
text: "Rembrandt van Rijn painted this in 1642"
observation:
type: "picom:PersonObservation"
id: "_:obs2"
hasObservedName:
type: "pnv:PersonName"
literalName: "Rembrandt van Rijn"
givenName: "Rembrandt"
surnamePrefix: "van"
baseSurname: "Rijn"
isObservationOf: "wd:Q5598" # Wikidata Rembrandt
hadPrimarySource: "https://example.org/artwork-page"
observedAt: "2025-12-02T10:35:00Z"
glam_ner_annotations:
- span: "Rembrandt van Rijn"
type: "AGT.PER"
code: "AGT.PER"
confidence: 0.99
linking:
wikidata: "Q5598"
viaf: "64013650"
- description: "Nobility title"
text: "Count Willem van Loon"
observation:
type: "picom:PersonObservation"
id: "_:obs3"
hasObservedName:
type: "pnv:PersonName"
literalName: "Count Willem van Loon"
honorificPrefix: "Count"
givenName: "Willem"
surnamePrefix: "van"
baseSurname: "Loon"
hadPrimarySource: "https://example.org/archive-doc"
observedAt: "2025-12-02T10:40:00Z"
glam_ner_annotations:
- span: "Count Willem van Loon"
type: "AGT.PER"
code: "AGT.PER"
confidence: 0.95
- span: "Count"
type: "ROL.HON"
code: "ROL.HON"
note: "Nobility title - honorific role"

View file

@ -0,0 +1,439 @@
# =============================================================================
# PiCo Integration Module: Person Name Vocabulary (PNV)
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/
# Parent: _index.yaml
#
# Description: Person Name Vocabulary (PNV) provides structured name components.
# This enables proper parsing of complex name structures across cultures.
#
# References:
# - PNV: https://w3id.org/pnv
# - PNV Specification: https://w3id.org/pnv/doc/v2
#
# Last Updated: 2025-01-13
# =============================================================================
# -----------------------------------------------------------------------------
# Person Name Vocabulary (PNV)
# -----------------------------------------------------------------------------
pnv_name_structure:
description: |
Person Name Vocabulary (PNV) provides structured name components.
This enables proper parsing of complex name structures across cultures.
class: "pnv:PersonName"
class_uri: "https://w3id.org/pnv#PersonName"
components:
- property: "pnv:literalName"
description: "Full name as single string"
examples:
- "Dr. Maria van den Berg"
- "Rembrandt Harmenszoon van Rijn"
- "Queen Elizabeth II"
note: "Original string before parsing"
- property: "pnv:givenName"
description: "First/given name"
examples:
- "Rembrandt"
- "Maria"
- "Jan"
- "Elizabeth"
note: "Personal name, not surname"
- property: "pnv:patronym"
description: "Patronymic name component"
examples:
- "Harmenszoon"
- "Janszoon"
- "Pietersdochter"
note: "Common in Dutch, Scandinavian, Slavic names"
- property: "pnv:surnamePrefix"
description: "Prefix to surname (tussenvoegsel)"
examples:
- "van"
- "de"
- "van den"
- "van der"
- "op de"
- "'t"
- "von"
- "di"
note: "Language-specific, affects sorting"
- property: "pnv:baseSurname"
description: "Core surname without prefix"
examples:
- "Rijn"
- "Berg"
- "Velde"
- "Gogh"
note: "Primary sorting component in Dutch"
- property: "pnv:honorificPrefix"
description: "Title or honorific before name"
examples:
- "Dr."
- "Prof."
- "Prof. dr."
- "Sir"
- "Queen"
- "Mr."
- "Drs."
- "Ir."
note: "May indicate role - link to ROL"
- property: "pnv:honorificSuffix"
description: "Title or honorific after name"
examples:
- "PhD"
- "Jr."
- "III"
- "MD"
- "RA"
- "MSc"
note: "Credentials and generational markers"
- property: "pnv:infixTitle"
description: "Title within name structure"
examples:
- "graaf van"
- "baron de"
- "duke of"
note: "Nobility titles embedded in name"
- property: "pnv:initials"
description: "Initials of given name(s)"
examples:
- "P.R."
- "C.Joh."
- "H.A.F.M.O."
note: "Each initial followed by period"
# -----------------------------------------------------------------------------
# Dutch Name Conventions
# -----------------------------------------------------------------------------
dutch_name_patterns:
description: |
Special handling for Dutch names with tussenvoegsels (surname prefixes).
Dutch sorting rules differ from other languages.
tussenvoegsel_list:
- "van"
- "van de"
- "van den"
- "van der"
- "de"
- "den"
- "het"
- "'t"
- "ter"
- "ten"
- "op de"
- "op den"
- "in 't"
- "in de"
sorting_rule: |
In Dutch, surnames sort by baseSurname, ignoring tussenvoegsel.
"Vincent van Gogh" sorts under "G" not "V".
"Maria van den Berg" sorts under "B" not "V".
capitalization_rule: |
Tussenvoegsel is lowercase when preceded by a given name, capitalized otherwise:
- "Vincent van Gogh" (not "Vincent Van Gogh")
- "Van Gogh" (surname alone, capitalized)
- "de heer Van Gogh" (formal, capitalized)
# -----------------------------------------------------------------------------
# Arabic Name Conventions
# -----------------------------------------------------------------------------
arabic_name_patterns:
description: |
Arabic names follow complex conventions with multiple components:
ism (given name), nasab (patronymic), nisba (geographic/tribal), kunya (teknonym), laqab (title/epithet).
components:
nasab:
description: "Patronymic chain using ibn/bin (son) or bint (daughter)"
examples:
- "محمد بن علي بن حسن"
- "Muhammad ibn Ali ibn Hasan"
note: "Can extend multiple generations"
nisba:
description: "Geographic or tribal affiliation (adjective form, ends in -i)"
examples:
- "البغدادي (al-Baghdadi)"
- "المصري (al-Misri)"
- "الهاشمي (al-Hashimi)"
kunya:
description: "Teknonym (Abu/Umm + child's name)"
examples:
- "أبو عبد الله (Abu Abdullah)"
- "أم كلثوم (Umm Kulthum)"
note: "Often used as primary form of address"
laqab:
description: "Title, epithet, or nickname"
examples:
- "الرشيد (al-Rashid - the rightly guided)"
- "المأمون (al-Ma'mun - the trustworthy)"
parsing_order: |
Traditional order: kunya - ism - nasab - laqab - nisba
Example (no laqab present): Abu Bakr Muhammad ibn Zakariyya al-Razi
- Kunya: Abu Bakr (father of Bakr)
- Ism: Muhammad (given name)
- Nasab: ibn Zakariyya (son of Zakariyya)
- Nisba: al-Razi (from Ray, city in Persia)
# -----------------------------------------------------------------------------
# Hebrew Name Conventions
# -----------------------------------------------------------------------------
hebrew_name_patterns:
description: |
Hebrew names, especially in religious and historical documents, follow
specific conventions with patronymics and honorifics.
components:
given_name:
description: "First name (shem)"
examples:
- "משה (Moshe/Moses)"
- "רבקה (Rivkah/Rebecca)"
patronymic:
description: "Son/daughter of (ben/bat)"
examples:
- "משה בן אברהם (Moshe ben Avraham)"
- "רבקה בת יעקב (Rivkah bat Ya'akov)"
note: "ben for males, bat for females"
honorifics:
examples:
- "ר' (Rabbi)"
- "הרב (HaRav - the Rabbi)"
- "מו\"ר (Morenu - our teacher)"
- "ז\"ל (zikhrono livrakha - of blessed memory)"
- "ע\"ה (alav hashalom - peace be upon him)"
ketubah_conventions:
description: "Special naming in marriage contracts"
notes:
- "Full patronymics required for both parties"
- "Parties designated החתן (the groom) and הכלה (the bride); fathers' names carry honorifics"
- "Geographic origin often included"
- "Hebrew date format (day of month, month, year from creation)"
# -----------------------------------------------------------------------------
# Spanish Colonial Name Conventions
# -----------------------------------------------------------------------------
spanish_name_patterns:
description: |
Spanish naming conventions, including colonial-era patterns with
double surnames and titles.
components:
given_names:
description: "First and middle names (often religious)"
examples:
- "María Guadalupe"
- "José Antonio"
- "Juan Pablo"
paternal_surname:
description: "Father's family name (apellido paterno)"
note: "Listed first in double surname"
maternal_surname:
description: "Mother's maiden family name (apellido materno)"
note: "Listed second in double surname"
particles:
examples:
- "de"
- "de la"
- "del"
note: "May indicate nobility or geographic origin"
titles:
examples:
- "Don/Doña"
- "Señor/Señora"
- "Fray (friar)"
- "Sor (sister)"
colonial_patterns:
notes:
- "Racial designations (español, mestizo, indio, mulato) often recorded"
- "Parish affiliation important"
- "Godparents (padrinos) always named"
- "Legitimacy (hijo legítimo/natural) specified"
# -----------------------------------------------------------------------------
# Italian Name Conventions
# -----------------------------------------------------------------------------
italian_name_patterns:
description: |
Italian naming conventions with notarial and nobility elements.
components:
given_name:
description: "Nome proprio"
note: "Often saints' names"
surname:
description: "Cognome"
note: "May derive from patronymics, locations, or professions"
particles:
examples:
- "di"
- "del"
- "della"
- "dei"
- "da"
note: "May indicate origin or noble lineage"
honorifics:
examples:
- "Signore/Signora"
- "Messer (medieval)"
- "Ser (notarial)"
- "Conte/Contessa"
- "Marchese/Marchesa"
notarial_conventions:
notes:
- "Father's name given as filiation: 'figlio di Giovanni' (genitive in Latin records)"
- "Profession often stated: 'mercante', 'notaio'"
- "Parish or neighborhood: 'della parrocchia di San Marco'"
- "Legal capacity: 'maggiore d'età' (of legal age)"
# -----------------------------------------------------------------------------
# Greek Name Conventions
# -----------------------------------------------------------------------------
greek_name_patterns:
description: |
Greek Orthodox naming conventions with genitive patronymics.
components:
given_name:
description: "First name (often saint's name)"
examples:
- "Κωνσταντίνος (Konstantinos)"
- "Μαρία (Maria)"
patronymic:
description: "Father's name in genitive case"
examples:
- "του Νικολάου (tou Nikolaou - son of Nikolaos)"
- "του Δημητρίου (tou Dimitriou)"
note: "Genitive case indicates 'of' or 'belonging to'"
surname:
description: "Family name"
examples:
- "Παπαδόπουλος (Papadopoulos)"
- "Αντωνίου (Antoniou)"
note: "May be of patronymic origin (-opoulos, -ou, -ides)"
honorifics:
examples:
- "Κύριος/Κυρία (Kyrios/Kyria - Mr./Mrs.)"
- "Πατήρ (Patir - Father, for clergy)"
- "Παπα- (Papa- - prefix for priests)"
orthodox_conventions:
notes:
- "Name day (ονομαστική εορτή, onomastiki eorti) important in Greek culture"
- "Multiple given names common"
- "Grandparents' names often passed down"
# -----------------------------------------------------------------------------
# Russian/Cyrillic Name Conventions
# -----------------------------------------------------------------------------
russian_name_patterns:
description: |
Russian naming conventions with formal patronymics.
components:
given_name:
description: "First name (имя)"
examples:
- "Иван (Ivan)"
- "Мария (Maria)"
patronymic:
description: "Father's name + suffix (отчество)"
examples:
- "Петрович (Petrovich - son of Pyotr)"
- "Петровна (Petrovna - daughter of Pyotr)"
note: "-ovich/-evich for males, -ovna/-evna for females"
surname:
description: "Family name (фамилия)"
note: "Gendered: -ov/-ova, -in/-ina, -sky/-skaya"
formal_usage:
notes:
- "Formal address: given name + patronymic"
- "Informal: given name or diminutive"
- "Full official: surname, given name, patronymic"
# -----------------------------------------------------------------------------
# Ottoman Turkish Name Conventions
# -----------------------------------------------------------------------------
ottoman_name_patterns:
description: |
Ottoman Turkish naming conventions blending Arabic and Turkish elements.
components:
given_name:
description: "Primary name (often Arabic origin)"
examples:
- "Mehmed"
- "Ahmed"
- "Fatma"
patronymic:
description: "Father's name with 'oğlu' (son of) or 'kızı' (daughter of)"
examples:
- "Ali oğlu Mehmed"
- "Hasan oğlu Ahmed"
epithet:
description: "Title or descriptor (laqab)"
examples:
- "Paşa (Pasha)"
- "Efendi"
- "Ağa"
- "Bey"
- "Hatun/Hanım (for women)"
nisba:
description: "Geographic origin or profession"
examples:
- "Kayserili (from Kayseri)"
- "Bakkal (grocer)"
sijill_conventions:
notes:
- "Court records (sicil) use formal full names"
- "Witnesses identified by profession and address"
- "Deceased marked as 'merhum/merhume'"
- "Non-Muslims identified by religious community (millet)"

View file

@ -0,0 +1,517 @@
# =============================================================================
# PiCo Integration Module: Family and Social Relationships
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/
# Parent: _index.yaml
#
# Description: Family relationship properties for genealogical data.
# Enables modeling complex family structures from historical records.
#
# Last Updated: 2025-01-13
# =============================================================================
family_relationships:
description: |
Family relationship properties link persons within and across sources.
Rules:
- For PersonObservations: relationships refer to OTHER observations on SAME source
- For PersonReconstructions: relationships refer to other reconstructions
Property characteristics:
- Symmetric: If A hasRelation B, then B hasRelation A (spouses, siblings, cousins)
- Transitive: hasAncestor/hasDescendant chain through generations
- Inverse pairs: parent/children, grandparent/grandchild, etc.
# ---------------------------------------------------------------------------
# Core Family (Schema.org)
# ---------------------------------------------------------------------------
core_relationships:
- property: "sdo:parent"
property_uri: "https://schema.org/parent"
description: "A parent of the person"
inverse: "sdo:children"
subPropertyOf: ["sdo:relatedTo", "pico:hasAncestor"]
note: "Biological or legal parent"
- property: "sdo:children"
property_uri: "https://schema.org/children"
description: "A child of the person"
inverse: "sdo:parent"
subPropertyOf: ["sdo:relatedTo", "pico:hasDescendant"]
- property: "sdo:spouse"
property_uri: "https://schema.org/spouse"
description: "The person's spouse"
symmetric: true
subPropertyOf: "sdo:relatedTo"
- property: "sdo:sibling"
property_uri: "https://schema.org/sibling"
description: "A brother or sister"
symmetric: true
subPropertyOf: "sdo:relatedTo"
# ---------------------------------------------------------------------------
# Transitive Ancestry (PiCo)
# ---------------------------------------------------------------------------
ancestry_relationships:
- property: "pico:hasAncestor"
property_uri: "https://personsincontext.org/model#hasAncestor"
description: "Any ancestor (parent, grandparent, etc.)"
type: "owl:TransitiveProperty"
inverse: "pico:hasDescendant"
note: "Not used directly; parent→parent chains automatically create ancestors"
- property: "pico:hasDescendant"
property_uri: "https://personsincontext.org/model#hasDescendant"
description: "Any descendant (child, grandchild, etc.)"
type: "owl:TransitiveProperty"
inverse: "pico:hasAncestor"
# ---------------------------------------------------------------------------
# Grandparents/Grandchildren
# ---------------------------------------------------------------------------
grandparent_relationships:
- property: "pico:hasGrandparent"
property_uri: "https://personsincontext.org/model#hasGrandparent"
inverse: "pico:hasGrandchild"
- property: "pico:hasGrandchild"
property_uri: "https://personsincontext.org/model#hasGrandchild"
inverse: "pico:hasGrandparent"
- property: "pico:hasGreat-grandparent"
property_uri: "https://personsincontext.org/model#hasGreat-grandparent"
inverse: "pico:hasGreat-grandchild"
- property: "pico:hasGreat-grandchild"
property_uri: "https://personsincontext.org/model#hasGreat-grandchild"
inverse: "pico:hasGreat-grandparent"
# ---------------------------------------------------------------------------
# Aunts/Uncles and Nieces/Nephews
# ---------------------------------------------------------------------------
extended_family:
- property: "pico:hasUncle_Aunt"
property_uri: "https://personsincontext.org/model#hasUncle_Aunt"
description: "An uncle or aunt (sibling of parent)"
inverse: "pico:hasNephew_Niece"
- property: "pico:hasNephew_Niece"
property_uri: "https://personsincontext.org/model#hasNephew_Niece"
description: "A nephew or niece (child of sibling)"
inverse: "pico:hasUncle_Aunt"
- property: "pico:hasCousin"
property_uri: "https://personsincontext.org/model#hasCousin"
description: "A cousin (child of parent's sibling)"
symmetric: true
# ---------------------------------------------------------------------------
# Step-family
# ---------------------------------------------------------------------------
step_relationships:
- property: "pico:hasStepparent"
property_uri: "https://personsincontext.org/model#hasStepparent"
description: "A stepparent (spouse of biological parent)"
inverse: "pico:hasStepchild"
- property: "pico:hasStepchild"
property_uri: "https://personsincontext.org/model#hasStepchild"
inverse: "pico:hasStepparent"
- property: "pico:hasStepsibling"
property_uri: "https://personsincontext.org/model#hasStepsibling"
description: "A stepbrother or stepsister"
symmetric: true
- property: "pico:hasHalf-sibling"
property_uri: "https://personsincontext.org/model#hasHalf-sibling"
description: "A half-brother or half-sister (one shared parent)"
symmetric: true
# ---------------------------------------------------------------------------
# Foster/Godparent
# ---------------------------------------------------------------------------
non_biological_relationships:
- property: "pico:hasFosterParent"
property_uri: "https://personsincontext.org/model#hasFosterParent"
inverse: "pico:hasFosterChild"
- property: "pico:hasFosterChild"
property_uri: "https://personsincontext.org/model#hasFosterChild"
inverse: "pico:hasFosterParent"
- property: "pico:hasGodparent"
property_uri: "https://personsincontext.org/model#hasGodparent"
description: "A godparent (witness at baptism)"
inverse: "pico:hasGodchild"
- property: "pico:hasGodchild"
property_uri: "https://personsincontext.org/model#hasGodchild"
inverse: "pico:hasGodparent"
- property: "pico:hasLegitimizedChild"
property_uri: "https://personsincontext.org/model#hasLegitimizedChild"
description: "A child legitimized by marriage or legal recognition"
inverse: "pico:isLegitimitezedChildOf"
- property: "pico:isLegitimitezedChildOf"
property_uri: "https://personsincontext.org/model#isLegitimitezedChildOf"
inverse: "pico:hasLegitimizedChild"
# ---------------------------------------------------------------------------
# In-Laws
# ---------------------------------------------------------------------------
in_law_relationships:
- property: "pico:hasParent-in-law"
property_uri: "https://personsincontext.org/model#hasParent-in-law"
inverse: "pico:hasChild-in-law"
- property: "pico:hasChild-in-law"
property_uri: "https://personsincontext.org/model#hasChild-in-law"
inverse: "pico:hasParent-in-law"
- property: "pico:hasSibling-in-law"
property_uri: "https://personsincontext.org/model#hasSibling-in-law"
description: "Brother/sister-in-law"
symmetric: true
- property: "pico:hasGrandparent-in-law"
property_uri: "https://personsincontext.org/model#hasGrandparent-in-law"
inverse: "pico:hasGrandchild-in-law"
- property: "pico:hasGrandchild-in-law"
property_uri: "https://personsincontext.org/model#hasGrandchild-in-law"
inverse: "pico:hasGrandparent-in-law"
- property: "pico:hasUncle_Aunt-in-law"
property_uri: "https://personsincontext.org/model#hasUncle_Aunt-in-law"
inverse: "pico:hasNephew_Niece-in-law"
- property: "pico:hasNephew_Niece-in-law"
property_uri: "https://personsincontext.org/model#hasNephew_Niece-in-law"
inverse: "pico:hasUncle_Aunt-in-law"
- property: "pico:hasCousin-in-law"
property_uri: "https://personsincontext.org/model#hasCousin-in-law"
symmetric: true
- property: "pico:hasStepparent-in-law"
property_uri: "https://personsincontext.org/model#hasStepparent-in-law"
inverse: "pico:hasStepchild-in-law"
- property: "pico:hasStepchild-in-law"
property_uri: "https://personsincontext.org/model#hasStepchild-in-law"
inverse: "pico:hasStepparent-in-law"
# ---------------------------------------------------------------------------
# Former Partners
# ---------------------------------------------------------------------------
former_partner_relationships:
- property: "pico:isWidOf"
property_uri: "https://personsincontext.org/model#isWidOf"
description: "Is widow/widower of deceased spouse"
note: "The subject is the surviving partner"
- property: "pico:hasPreviousPartner"
property_uri: "https://personsincontext.org/model#hasPreviousPartner"
description: "A former spouse or partner"
symmetric: true
# -----------------------------------------------------------------------------
# Historical Relationship Indicators by Language
# -----------------------------------------------------------------------------
historical_relationship_patterns:
description: |
Common relationship indicators in historical documents by language.
Use these patterns to identify family relationships in source texts.
dutch:
description: "Dutch relationship indicators"
patterns:
- pattern: "huijsvrou van"
meaning: "wife of"
relationship: "spouse"
- pattern: "zoon van"
meaning: "son of"
relationship: "parent"
- pattern: "dochter van"
meaning: "daughter of"
relationship: "parent"
- pattern: "weduwe van"
meaning: "widow of"
relationship: "widow_of"
- pattern: "weduwnaar van"
meaning: "widower of"
relationship: "widow_of"
- pattern: "peter"
meaning: "godfather"
relationship: "godparent"
- pattern: "meter"
meaning: "godmother"
relationship: "godparent"
- pattern: "getuige"
meaning: "witness"
relationship: "witness"
- pattern: "broeder van"
meaning: "brother of"
relationship: "sibling"
- pattern: "zuster van"
meaning: "sister of"
relationship: "sibling"
latin:
description: "Latin relationship indicators (common in church records)"
patterns:
- pattern: "filius"
meaning: "son"
relationship: "parent"
- pattern: "filia"
meaning: "daughter"
relationship: "parent"
- pattern: "uxor"
meaning: "wife"
relationship: "spouse"
- pattern: "maritus"
meaning: "husband"
relationship: "spouse"
- pattern: "vidua"
meaning: "widow"
relationship: "widow_of"
- pattern: "viduus"
meaning: "widower"
relationship: "widow_of"
- pattern: "quondam"
meaning: "the late"
relationship: "deceased_marker"
- pattern: "patrinus"
meaning: "godfather"
relationship: "godparent"
- pattern: "matrina"
meaning: "godmother"
relationship: "godparent"
- pattern: "testis"
meaning: "witness"
relationship: "witness"
german:
description: "German relationship indicators"
patterns:
- pattern: "Ehefrau von"
meaning: "wife of"
relationship: "spouse"
- pattern: "Ehemann von"
meaning: "husband of"
relationship: "spouse"
- pattern: "Sohn von"
meaning: "son of"
relationship: "parent"
- pattern: "Tochter von"
meaning: "daughter of"
relationship: "parent"
- pattern: "Witwe von"
meaning: "widow of"
relationship: "widow_of"
- pattern: "Witwer von"
meaning: "widower of"
relationship: "widow_of"
- pattern: "Taufpate"
meaning: "godfather"
relationship: "godparent"
- pattern: "Taufpatin"
meaning: "godmother"
relationship: "godparent"
french:
description: "French relationship indicators"
patterns:
- pattern: "fils de"
meaning: "son of"
relationship: "parent"
- pattern: "fille de"
meaning: "daughter of"
relationship: "parent"
- pattern: "épouse de"
meaning: "wife of"
relationship: "spouse"
- pattern: "époux de"
meaning: "husband of"
relationship: "spouse"
- pattern: "veuve de"
meaning: "widow of"
relationship: "widow_of"
- pattern: "veuf de"
meaning: "widower of"
relationship: "widow_of"
- pattern: "feu"
meaning: "the late (m)"
relationship: "deceased_marker"
- pattern: "feue"
meaning: "the late (f)"
relationship: "deceased_marker"
- pattern: "parrain"
meaning: "godfather"
relationship: "godparent"
- pattern: "marraine"
meaning: "godmother"
relationship: "godparent"
arabic:
description: "Arabic relationship indicators"
patterns:
- pattern: "ابن"
transliteration: "ibn"
meaning: "son of"
relationship: "parent"
- pattern: "بن"
transliteration: "bin"
meaning: "son of (shorter form)"
relationship: "parent"
- pattern: "بنت"
transliteration: "bint"
meaning: "daughter of"
relationship: "parent"
- pattern: "زوج"
transliteration: "zawj"
meaning: "husband"
relationship: "spouse"
- pattern: "زوجة"
transliteration: "zawja"
meaning: "wife"
relationship: "spouse"
- pattern: "أرملة"
transliteration: "armala"
meaning: "widow"
relationship: "widow_of"
- pattern: "المرحوم"
transliteration: "al-marhum"
meaning: "the late (m)"
relationship: "deceased_marker"
- pattern: "المرحومة"
transliteration: "al-marhuma"
meaning: "the late (f)"
relationship: "deceased_marker"
- pattern: "آل"
transliteration: "Al"
meaning: "family of"
relationship: "family_marker"
hebrew:
description: "Hebrew relationship indicators"
patterns:
- pattern: "בן"
transliteration: "ben"
meaning: "son of"
relationship: "parent"
- pattern: "בת"
transliteration: "bat"
meaning: "daughter of"
relationship: "parent"
- pattern: "אשת"
transliteration: "eshet"
meaning: "wife of"
relationship: "spouse"
- pattern: "אלמנה"
transliteration: "almana"
meaning: "widow"
relationship: "widow_of"
- pattern: "ז״ל"
transliteration: "z\"l"
meaning: "of blessed memory"
relationship: "deceased_marker"
- pattern: "ע״ה"
transliteration: "a\"h"
meaning: "peace be upon him/her"
relationship: "deceased_marker"
spanish:
description: "Spanish relationship indicators"
patterns:
- pattern: "hijo de"
meaning: "son of"
relationship: "parent"
- pattern: "hija de"
meaning: "daughter of"
relationship: "parent"
- pattern: "esposa de"
meaning: "wife of"
relationship: "spouse"
- pattern: "esposo de"
meaning: "husband of"
relationship: "spouse"
- pattern: "viuda de"
meaning: "widow of"
relationship: "widow_of"
- pattern: "viudo de"
meaning: "widower of"
relationship: "widow_of"
- pattern: "padrino"
meaning: "godfather"
relationship: "godparent"
- pattern: "madrina"
meaning: "godmother"
relationship: "godparent"
- pattern: "hijo legítimo"
meaning: "legitimate son"
relationship: "legitimacy_marker"
- pattern: "hijo natural"
meaning: "illegitimate son"
relationship: "legitimacy_marker"
portuguese:
description: "Portuguese relationship indicators"
patterns:
- pattern: "filho de"
meaning: "son of"
relationship: "parent"
- pattern: "filha de"
meaning: "daughter of"
relationship: "parent"
- pattern: "esposa de"
meaning: "wife of"
relationship: "spouse"
- pattern: "esposo de"
meaning: "husband of"
relationship: "spouse"
- pattern: "viúva de"
meaning: "widow of"
relationship: "widow_of"
- pattern: "viúvo de"
meaning: "widower of"
relationship: "widow_of"
- pattern: "padrinho"
meaning: "godfather"
relationship: "godparent"
- pattern: "madrinha"
meaning: "godmother"
relationship: "godparent"
ottoman_turkish:
description: "Ottoman Turkish relationship indicators"
patterns:
- pattern: "oğlu"
meaning: "son of"
relationship: "parent"
- pattern: "kızı"
meaning: "daughter of"
relationship: "parent"
- pattern: "zevcesi"
meaning: "wife"
relationship: "spouse"
- pattern: "merhum"
meaning: "the late (m)"
relationship: "deceased_marker"
- pattern: "merhume"
meaning: "the late (f)"
relationship: "deceased_marker"

View file

@ -0,0 +1,570 @@
# =============================================================================
# PiCo Integration Module: Temporal Patterns & Calendar Systems
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/
# Parent: _index.yaml
#
# Description: Temporal expression handling, calendar systems, date normalization,
# and PROV-O provenance model for tracking observation/reconstruction
# activities.
#
# Last Updated: 2025-12-12
# =============================================================================
# -----------------------------------------------------------------------------
# Calendar Systems
# -----------------------------------------------------------------------------
# Historical documents use various calendar systems. This section defines
# how to handle and normalize dates from different calendrical traditions.
calendar_systems:
description: |
Historical sources use diverse calendar systems depending on culture,
religion, and time period. Proper extraction requires:
1. Identifying the source calendar
2. Preserving the original date expression
3. Providing normalized ISO 8601 equivalents where possible
supported_calendars:
gregorian:
id: "gregorian"
label: "Gregorian Calendar"
uri: "https://www.wikidata.org/wiki/Q12138"
description: |
The civil calendar used worldwide since 1582 (Catholic countries)
or later (Protestant/Orthodox countries).
adoption_dates:
catholic: "1582-10-15"
protestant: "1700-03-01"
british_empire: "1752-09-14"
russia: "1918-02-14"
greece: "1923-03-01"
usage_notes: |
- Default for modern documents
- Used in civil registrations after adoption
- Standard for ISO 8601 normalization
example:
original: "15 October 1582"
normalized: "1582-10-15"
julian:
id: "julian"
label: "Julian Calendar"
uri: "https://www.wikidata.org/wiki/Q11184"
description: |
Calendar introduced by Julius Caesar in 45 BCE. Used in Europe
until Gregorian reform, and by Eastern Orthodox churches today.
offset_from_gregorian:
16th_century: 10
17th_century: 10
18th_century: 11
19th_century: 12
20th_century: 13
21st_century: 13
usage_notes: |
- Greek Orthodox Church records use Julian calendar
- Russian Empire used Julian until 1918
- Dual dating common in transition periods
- Format: "Julian date / Gregorian date" or "O.S./N.S." notation
example:
original: "14 March 1875 (O.S.)"
gregorian_equivalent: "26 March 1875"
normalized: "1875-03-26"
note: "Greek Orthodox used Julian; Gregorian equivalent calculated"
hijri:
id: "hijri"
label: "Islamic/Hijri Calendar"
uri: "https://www.wikidata.org/wiki/Q28892"
alternative_names:
- "Islamic Calendar"
- "Muslim Calendar"
- "Lunar Hijri"
- "Anno Hegirae (AH)"
description: |
Lunar calendar used in Islamic societies. Year 1 = 622 CE (Hijra).
354 or 355 days per year (12 lunar months).
months:
1: "Muharram"
2: "Safar"
3: "Rabi' al-Awwal"
4: "Rabi' al-Thani"
5: "Jumada al-Awwal"
6: "Jumada al-Thani"
7: "Rajab"
8: "Sha'ban"
9: "Ramadan"
10: "Shawwal"
11: "Dhu al-Qa'dah"
12: "Dhu al-Hijjah"
usage_notes: |
- Ottoman Empire, Waqf documents, Sijill records
- Year conversion: Gregorian = (Hijri * 0.97) + 622 (approx; use a conversion table for exact dates)
- Month-level precision often sufficient
- Some documents use both Hijri and local calendars
example:
original: "month of Rajab, year 1225 Hijri"
normalized: "1810-08"
note: "Approximate month - exact day unknown"
hebrew:
id: "hebrew"
label: "Hebrew Calendar"
uri: "https://www.wikidata.org/wiki/Q9644"
alternative_names:
- "Jewish Calendar"
- "Anno Mundi"
description: |
Lunisolar calendar used in Jewish religious and civil life.
Year 1 = 3761 BCE (traditional Creation date).
months:
1: "Nisan"
2: "Iyar"
3: "Sivan"
4: "Tammuz"
5: "Av"
6: "Elul"
7: "Tishrei"
8: "Cheshvan"
9: "Kislev"
10: "Tevet"
11: "Shevat"
12: "Adar"
usage_notes: |
- Ketubot (marriage contracts)
- Get (divorce documents)
- Synagogue records
- Year conversion: Gregorian = Hebrew - 3760 (approx)
- Month names often transliterated in various ways
example:
original: "23 Elul 5656"
normalized: "1896-09-01"
note: "Hebrew date from Creation (anno mundi)"
french_republican:
id: "french_republican"
label: "French Republican Calendar"
uri: "https://www.wikidata.org/wiki/Q181974"
description: |
Calendar used in France 1793-1805. Year 1 = 1792 CE.
12 months of 30 days + 5-6 supplementary days.
months:
1: "Vendemiaire"
2: "Brumaire"
3: "Frimaire"
4: "Nivose"
5: "Pluviose"
6: "Ventose"
7: "Germinal"
8: "Floreal"
9: "Prairial"
10: "Messidor"
11: "Thermidor"
12: "Fructidor"
usage_notes: |
- French civil registrations 1793-1805
- Some Belgian/Dutch territories
- Conversion tables widely available
example:
original: "14 Vendemiaire an IV"
normalized: "1795-10-06"
chinese:
id: "chinese"
label: "Chinese Calendar"
uri: "https://www.wikidata.org/wiki/Q32823"
description: |
Lunisolar calendar used in China and East Asia.
Combines 60-year cycle with lunar months.
usage_notes: |
- Emperor reign year + lunar month + day
- Gregorian adopted 1912 (Republic of China)
- Traditional dates still used for festivals
example:
original: "Guangxu 22, 8th month, 15th day"
normalized: "1896-09-21"
# -----------------------------------------------------------------------------
# Date Expression Patterns
# -----------------------------------------------------------------------------
date_expression_patterns:
description: |
Common patterns for expressing dates in historical sources.
GLM annotators should recognize these patterns and extract:
1. The original expression (exact transcription)
2. The calendar system used
3. A normalized ISO 8601 date (where possible)
patterns:
full_date:
description: "Complete date with day, month, and year"
examples:
- pattern: "15 October 1582"
calendar: "gregorian"
normalized: "1582-10-15"
- pattern: "the fifteenth day of October in the year 1582"
calendar: "gregorian"
normalized: "1582-10-15"
- pattern: "23 Elul 5656"
calendar: "hebrew"
normalized: "1896-09-01"
partial_date:
description: "Date with some components missing"
examples:
- pattern: "March 1875"
calendar: "gregorian"
normalized: "1875-03"
precision: "month"
- pattern: "in the year 1810"
calendar: "gregorian"
normalized: "1810"
precision: "year"
- pattern: "month of Rajab, 1225 AH"
calendar: "hijri"
normalized: "1810-08"
precision: "month"
dual_dating:
description: "Documents showing both Julian and Gregorian dates"
notation_styles:
- "O.S. (Old Style = Julian)"
- "N.S. (New Style = Gregorian)"
- "Slash notation: 14/26 March 1875"
examples:
- pattern: "14/26 March 1875"
interpretation: "14 March (Julian) = 26 March (Gregorian)"
normalized: "1875-03-26"
note: "Use Gregorian for normalization"
- pattern: "6 January 1894 (Gregorian)"
normalized: "1894-01-06"
note: "Explicit calendar indicator"
relative_dating:
description: "Dates relative to events or other dates"
examples:
- pattern: "three days after Easter"
requires: "Year context to calculate"
- pattern: "the Sunday before St. Martin's Day"
requires: "Year context and liturgical calendar"
floruit:
description: "Period when person was known to be active"
notation: "fl."
examples:
- pattern: "fl. 1780-1820"
interpretation: "Active between 1780 and 1820"
- pattern: "fl. c. 1850"
interpretation: "Active around 1850"
# -----------------------------------------------------------------------------
# Temporal Properties in PiCo
# -----------------------------------------------------------------------------
temporal_properties:
description: |
Properties for capturing temporal information about persons
observed in historical sources.
biographical_dates:
birth_date:
property: "sdo:birthDate"
property_uri: "https://schema.org/birthDate"
range: "xsd:date or xsd:gYearMonth or xsd:gYear"
description: "Date of birth"
extraction_notes: |
- May be explicitly stated or inferred from age
- Capture calendar system if non-Gregorian
- Normalize to ISO 8601 for querying
death_date:
property: "sdo:deathDate"
property_uri: "https://schema.org/deathDate"
range: "xsd:date or xsd:gYearMonth or xsd:gYear"
description: "Date of death"
extraction_notes: |
- "deceased" annotation indicates death before document date
- Infer approximate date from context when possible
baptism_date:
property: "pico:baptismDate"
range: "xsd:date"
description: "Date of baptism/christening"
note: "Common in church records; often within days of birth"
burial_date:
property: "pico:burialDate"
range: "xsd:date"
description: "Date of burial"
note: "Common in church/cemetery records"
event_dates:
marriage_date:
property: "pico:marriageDate"
range: "xsd:date"
description: "Date of marriage event"
divorce_date:
property: "pico:divorceDate"
range: "xsd:date"
description: "Date of divorce"
document_date:
property: "sdo:dateCreated"
property_uri: "https://schema.org/dateCreated"
range: "xsd:date"
description: "Date the source document was created"
note: "Critical for temporal context of observations"
age_expressions:
age_at_event:
property: "pico:ageAtEvent"
range: "xsd:string"
description: "Age as stated in document"
examples:
- "25 years"
- "about 30 years old"
- "minor (under legal age)"
- "of full age (adult)"
note: |
Preserve original expression; calculate birth year if needed.
"oud 25 jaar" (Dutch) = "25 years old"
# -----------------------------------------------------------------------------
# PROV-O Provenance Model
# -----------------------------------------------------------------------------
provenance_model:
description: |
PiCo uses W3C PROV-O for provenance tracking at two levels:
1. OBSERVATION LEVEL: Where did this observation come from?
- prov:hadPrimarySource -> Source document
- prov:wasGeneratedBy -> Extraction activity (optional)
2. RECONSTRUCTION LEVEL: How was this person entity created?
- prov:wasDerivedFrom -> Source observation(s)
- prov:wasGeneratedBy -> Reconstruction activity
- prov:wasRevisionOf -> Previous reconstruction version
activity_class:
class: "prov:Activity"
class_uri: "http://www.w3.org/ns/prov#Activity"
description: "The activity that generated a PersonReconstruction"
properties:
- property: "prov:wasAssociatedWith"
description: "Agent responsible for the activity"
range: "prov:Agent"
- property: "prov:startedAtTime"
description: "When the activity started"
range: "xsd:dateTime"
- property: "prov:endedAtTime"
description: "When the activity completed"
range: "xsd:dateTime"
- property: "prov:used"
description: "Resources/tools used in the activity"
range: "prov:Entity"
note: "E.g., ML model, matching algorithm, rule set"
activity_types:
human_reconstruction:
description: "Manual reconstruction by researcher"
note: "Provide: time, place, knowledge sources, researcher name"
algorithmic_reconstruction:
description: "Automated reconstruction by software"
note: "Provide: algorithm name, version, configuration, parameters"
agent_class:
class: "prov:Agent"
class_uri: "http://www.w3.org/ns/prov#Agent"
description: "Person or organization responsible for reconstruction"
properties:
- property: "sdo:name"
description: "Name of the agent"
range: "xsd:string"
- property: "sdo:url"
description: "URL identifying the agent"
range: "sdo:URL"
examples:
- name: "CBG Center for Family History"
url: "https://cbg.nl"
type: "organization"
- name: "GLM-4.6 Person Extractor v1.0"
url: null
type: "software"
derivation_properties:
- property: "prov:wasDerivedFrom"
property_uri: "http://www.w3.org/ns/prov#wasDerivedFrom"
description: "Links PersonReconstruction to source PersonObservation(s)"
domain: "pico:PersonReconstruction"
range: "pico:PersonObservation"
cardinality: "1..*"
note: "REQUIRED for all PersonReconstructions"
- property: "prov:wasRevisionOf"
property_uri: "http://www.w3.org/ns/prov#wasRevisionOf"
description: "Links to previous version of reconstruction"
domain: "pico:PersonReconstruction"
range: "pico:PersonReconstruction"
cardinality: "0..1"
note: "For tracking reconstruction updates over time"
# -----------------------------------------------------------------------------
# PiCo Vocabularies/Thesauri
# -----------------------------------------------------------------------------
pico_vocabularies:
description: |
PiCo defines three SKOS concept schemes for controlled terminology:
- Roles: The role a person plays in a source (child, declarant, witness, etc.)
- SourceTypes: Types of historical sources (birth certificate, census, etc.)
- EventTypes: Types of life events (birth, marriage, death, etc.)
roles_thesaurus:
id: "picot_roles"
uri: "https://terms.personsincontext.org/roles/"
type: "skos:ConceptScheme"
label: "Persons in Context role thesaurus"
description: "Roles that persons can have in historical sources"
usage: |
Use pico:hasRole property with a term from this thesaurus.
Example: picot_roles:575 (child), picot_roles:489 (declarant)
example_concepts:
- id: "575"
label: "child"
description: "Person appearing as child in a record"
- id: "489"
label: "declarant"
description: "Person declaring/reporting an event"
- id: "witness"
label: "witness"
description: "Person witnessing an event or signing a document"
- id: "bride"
label: "bride"
description: "Female partner in a marriage"
- id: "groom"
label: "groom"
description: "Male partner in a marriage"
sourcetypes_thesaurus:
id: "picot_sourcetypes"
uri: "https://terms.personsincontext.org/sourcetypes/"
type: "skos:ConceptScheme"
label: "Persons in Context sourceType thesaurus"
description: "Types of historical sources containing person observations"
usage: |
Use sdo:additionalType property on sdo:ArchiveComponent.
Example: picot_sourcetypes:551 (civil registry: birth)
example_concepts:
- id: "551"
label: "civil registry: birth"
description: "Birth certificate from civil registration"
- id: "marriage"
label: "civil registry: marriage"
description: "Marriage certificate"
- id: "death"
label: "civil registry: death"
description: "Death certificate"
- id: "census"
label: "census"
description: "Population census record"
- id: "church_baptism"
label: "church record: baptism"
description: "Baptismal record from church register"
- id: "notarial"
label: "notarial record"
description: "Notarial act or protocol"
eventtypes_thesaurus:
id: "picot_eventtypes"
uri: "https://terms.personsincontext.org/eventtypes/"
type: "skos:ConceptScheme"
label: "Persons in Context eventType thesaurus"
description: "Types of life events documented in sources"
example_concepts:
- id: "birth"
label: "birth"
- id: "baptism"
label: "baptism"
- id: "marriage"
label: "marriage"
- id: "death"
label: "death"
- id: "burial"
label: "burial"
- id: "emigration"
label: "emigration"
- id: "immigration"
label: "immigration"
# -----------------------------------------------------------------------------
# CH-Annotator Hypernym Integration for Temporal
# -----------------------------------------------------------------------------
temporal_hypernym_mapping:
description: |
Mapping between temporal expressions and CH-Annotator hypernyms.
mappings:
- pico_property: "sdo:birthDate"
ch_hypernym: "TMP.DAT"
ch_code: "TMP.DAT"
note: "Birth date temporal expression"
- pico_property: "sdo:deathDate"
ch_hypernym: "TMP.DAT"
ch_code: "TMP.DAT"
note: "Death date temporal expression"
- pico_property: "sdo:dateCreated"
ch_hypernym: "TMP.DAT"
ch_code: "TMP.DAT"
note: "Document creation date"
- calendar_expression: "Hijri date"
ch_hypernym: "TMP.DAT"
normalization: "Convert to Gregorian ISO 8601"
- calendar_expression: "Hebrew date"
ch_hypernym: "TMP.DAT"
normalization: "Convert to Gregorian ISO 8601"
- calendar_expression: "Julian date"
ch_hypernym: "TMP.DAT"
normalization: "Convert to Gregorian ISO 8601"

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,163 @@
{
"pico_observation": {
"observation_id": "waqf_aleppo_1225h",
"observed_at": "2023-10-27T10:00:00Z",
"source_type": "waqf_document",
"source_reference": "Aleppo Waqf, 1225 H"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"literalName": "الحاج أحمد بن محمد العمري",
"literalName_romanized": "al-Hajj Ahmad ibn Muhammad al-Umari",
"givenName": "أحمد",
"givenName_romanized": "Ahmad",
"patronym": "محمد",
"patronym_romanized": "Muhammad",
"baseSurname": "العمري",
"baseSurname_romanized": "al-Umari",
"honorificPrefix": "الحاج",
"honorificPrefix_romanized": "al-Hajj"
},
"roles": [
{
"role_title": "تاجر",
"role_title_romanized": "tajir",
"role_in_source": "founder"
}
],
"biographical": {
"deceased": true,
"address": "مدينة حلب الشهباء"
},
"family_relationships": {
"parent": [
{
"person_index": 1,
"target_name": "محمد بن عبد الله العمري"
}
],
"children": []
},
"context": "The founder (waqif) of the endowment, a deceased merchant from Aleppo."
},
{
"person_index": 1,
"pnv_name": {
"literalName": "المرحوم محمد بن عبد الله العمري",
"literalName_romanized": "al-marhum Muhammad ibn Abd Allah al-Umari",
"givenName": "محمد",
"givenName_romanized": "Muhammad",
"patronym": "عبد الله",
"patronym_romanized": "Abd Allah",
"baseSurname": "العمري",
"baseSurname_romanized": "al-Umari"
},
"roles": [
{
"role_title": null,
"role_title_romanized": null,
"role_in_source": null
}
],
"biographical": {
"deceased": true,
"address": null
},
"family_relationships": {
"parent": [],
"children": [
{
"person_index": 0,
"target_name": "أحمد بن محمد العمري"
}
]
},
"context": "The deceased father of the founder, Ahmad al-Umari."
},
{
"person_index": 2,
"pnv_name": {
"literalName": "الحاج إبراهيم بن يوسف التركماني",
"literalName_romanized": "al-Hajj Ibrahim ibn Yusuf al-Turkmani",
"givenName": "إبراهيم",
"givenName_romanized": "Ibrahim",
"patronym": "يوسف",
"patronym_romanized": "Yusuf",
"baseSurname": "التركماني",
"baseSurname_romanized": "al-Turkmani",
"honorificPrefix": "الحاج",
"honorificPrefix_romanized": "al-Hajj"
},
"roles": [
{
"role_title": "شاهد",
"role_title_romanized": "shahid",
"role_in_source": "witness"
}
],
"biographical": {
"deceased": null,
"address": null
},
"family_relationships": {
"parent": [],
"children": []
},
"context": "One of the witnesses to the endowment deed."
},
{
"person_index": 3,
"pnv_name": {
"literalName": "السيد علي بن حسين الحلبي",
"literalName_romanized": "al-Sayyid Ali ibn Husayn al-Halabi",
"givenName": "علي",
"givenName_romanized": "Ali",
"patronym": "حسين",
"patronym_romanized": "Husayn",
"baseSurname": "الحلبي",
"baseSurname_romanized": "al-Halabi",
"honorificPrefix": "السيد",
"honorificPrefix_romanized": "al-Sayyid"
},
"roles": [
{
"role_title": "شاهد",
"role_title_romanized": "shahid",
"role_in_source": "witness"
}
],
"biographical": {
"deceased": null,
"address": null
},
"family_relationships": {
"parent": [],
"children": []
},
"context": "The second witness to the endowment deed."
}
],
"temporal_references": [
{
"expression": "شهر رجب سنة ألف ومائتين وخمس وعشرين هجرية",
"expression_romanized": "Shahr Rajab sanat alf wa mi'ayn wa khamsa wa 'ishrin hijriyyah",
"normalized": "1811-01",
"calendar": "Hijri",
"type": "DATE"
}
],
"locations_mentioned": [
{
"name": "حلب الشهباء",
"name_romanized": "Halab al-Shahba'",
"type": "city"
},
{
"name": "محلة الجديدة",
"name_romanized": "Mahallat al-Jadida",
"type": "neighborhood"
}
]
}

View file

@ -0,0 +1,93 @@
{
"pico_observation": {
"observation_id": "waqf_doc_001",
"source_type": "Waqf Document",
"source_reference": "Arabic Waqf Deed Snippet"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"script": "أحمد بن محمد العمري",
"romanized": "Ahmad ibn Muhammad al-Umari",
"full_name": "المرحوم الحاج أحمد بن محمد العمري"
},
"roles": [
"founder"
],
"biographical": {
"status": "deceased",
"occupation": "تاجر",
"address": "مدينة حلب الشهباء"
},
"family_relationships": {
"father": "محمد بن عبد الله العمري"
},
"context": "The founder (waqif) who endowed his house for his descendants."
},
{
"person_index": 1,
"pnv_name": {
"script": "محمد بن عبد الله العمري",
"romanized": "Muhammad ibn Abdullah al-Umari",
"full_name": "المرحوم محمد بن عبد الله العمري"
},
"roles": [],
"biographical": {
"status": "deceased"
},
"family_relationships": {
"son": "أحمد بن محمد العمري"
},
"context": "Father of the founder, mentioned in his patronymic."
},
{
"person_index": 2,
"pnv_name": {
"script": "إبراهيم بن يوسف التركماني",
"romanized": "Ibrahim ibn Yusuf al-Turkmani",
"full_name": "الحاج إبراهيم بن يوسف التركماني"
},
"roles": [
"witness"
],
"biographical": {},
"family_relationships": {},
"context": "A witness to the waqf deed."
},
{
"person_index": 3,
"pnv_name": {
"script": "علي بن حسين الحلبي",
"romanized": "Ali ibn Husayn al-Halabi",
"full_name": "السيد علي بن حسين الحلبي"
},
"roles": [
"witness"
],
"biographical": {},
"family_relationships": {},
"context": "A witness to the waqf deed."
}
],
"temporal_references": [
{
"expression": "شهر رجب سنة ألف ومائتين وخمس وعشرين هجرية",
"expression_romanized": "shahr rajab sanat alf wa mi'atayn wa khamsa wa 'ishrin hijriyya",
"normalized": "1225 AH",
"calendar": "Hijri"
}
],
"locations_mentioned": [
{
"name": "حلب",
"name_romanized": "Halab",
"type": "city"
},
{
"name": "الجديدة",
"name_romanized": "al-Jadida",
"type": "neighborhood"
}
]
}

View file

@ -0,0 +1,139 @@
{
"pico_observation": {
"observation_id": "marriage_cert_1885-03-04_haarlem_001",
"source_type": "marriage_certificate",
"source_reference": "Haarlem, 4 March 1885, marriage of Johannes Petrus van der Berg and Cornelia Wilhelmina de Groot"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"person_name": "Johannes Petrus",
"family_name": "van der Berg",
"tussenvoegsel": "van der",
"geslachtsnaam": "Berg"
},
"roles": [
"groom"
],
"biographical": {
"age": 30,
"occupation": "koopman",
"birth_place": "Amsterdam",
"residence": "Haarlem",
"civil_status": "meerderjarige"
},
"family_relationships": {
"father": {
"person_name": "Pieter",
"family_name": "van der Berg",
"tussenvoegsel": "van der",
"geslachtsnaam": "Berg",
"status": "deceased",
"occupation": "koopman"
},
"mother": {
"person_name": "Maria Johanna",
"family_name": "Bakker",
"geslachtsnaam": "Bakker",
"status": "living",
"occupation": "zonder beroep",
"residence": "Amsterdam"
}
},
"context": "Groom, son of Pieter van der Berg and Maria Johanna Bakker."
},
{
"person_index": 1,
"pnv_name": {
"person_name": "Cornelia Wilhelmina",
"family_name": "de Groot",
"tussenvoegsel": "de",
"geslachtsnaam": "Groot"
},
"roles": [
"bride"
],
"biographical": {
"age": 25,
"occupation": "zonder beroep",
"birth_place": "Haarlem",
"residence": "Haarlem",
"civil_status": "meerderjarige"
},
"family_relationships": {
"father": {
"person_name": "Hendrik",
"family_name": "de Groot",
"tussenvoegsel": "de",
"geslachtsnaam": "Groot",
"status": "living",
"occupation": "timmerman"
},
"mother": {
"person_name": "Elisabeth",
"family_name": "van Dijk",
"tussenvoegsel": "van",
"geslachtsnaam": "Dijk",
"status": "deceased"
}
},
"context": "Bride, daughter of Hendrik de Groot and Elisabeth van Dijk."
},
{
"person_index": 2,
"pnv_name": {
"person_name": "Willem Frederik",
"family_name": "Smit",
"geslachtsnaam": "Smit"
},
"roles": [
"witness"
],
"biographical": {
"age": 40,
"occupation": "notaris"
},
"family_relationships": {},
"context": "Witness to the marriage."
},
{
"person_index": 3,
"pnv_name": {
"person_name": "Jacobus Hendrikus",
"family_name": "Jansen",
"geslachtsnaam": "Jansen"
},
"roles": [
"witness"
],
"biographical": {
"age": 35,
"occupation": "klerk"
},
"family_relationships": {},
"context": "Witness to the marriage."
}
],
"temporal_references": [
{
"expression": "vierden Maart achttien honderd vijf en tachtig",
"normalized": "1885-03-04",
"calendar": "Gregorian"
}
],
"locations_mentioned": [
{
"name": "Haarlem",
"type": "municipality"
},
{
"name": "Amsterdam",
"type": "city"
},
{
"name": "Haarlem",
"type": "city"
}
]
}

View file

@ -0,0 +1,185 @@
{
"pico_observation": {
"observation_id": "obs_haarlem_1885-03-04",
"source_type": "marriage_certificate",
"source_reference": "Haarlem, 1885-03-04"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"person_name_standard_text": "Johannes Petrus van der Berg",
"person_name_given_name": "Johannes Petrus",
"person_name_family_name_prefix": "van der",
"person_name_family_name": "Berg"
},
"roles": [
"groom"
],
"biographical": {
"age": 30,
"occupation": "koopman",
"birth_place": "Amsterdam",
"residence": "Haarlem",
"civil_status": "meerderjarige"
},
"family_relationships": {
"relationship_to_parents": "zoon van",
"father": "Pieter van der Berg",
"mother": "Maria Johanna Bakker"
},
"context": "Groom, 30-year-old merchant, born in Amsterdam, residing in Haarlem, son of the late Pieter van der Berg and Maria Johanna Bakker."
},
{
"person_index": 1,
"pnv_name": {
"person_name_standard_text": "Pieter van der Berg",
"person_name_given_name": "Pieter",
"person_name_family_name_prefix": "van der",
"person_name_family_name": "Berg"
},
"roles": [
"father_of_groom"
],
"biographical": {
"deceased": true,
"occupation": "koopman"
},
"family_relationships": {
"father_of": "Johannes Petrus van der Berg"
},
"context": "Father of the groom, deceased, was a merchant."
},
{
"person_index": 2,
"pnv_name": {
"person_name_standard_text": "Maria Johanna Bakker",
"person_name_given_name": "Maria Johanna",
"person_name_family_name": "Bakker"
},
"roles": [
"mother_of_groom"
],
"biographical": {
"occupation": "zonder beroep",
"residence": "Amsterdam"
},
"family_relationships": {
"mother_of": "Johannes Petrus van der Berg"
},
"context": "Mother of the groom, without occupation, residing in Amsterdam."
},
{
"person_index": 3,
"pnv_name": {
"person_name_standard_text": "Cornelia Wilhelmina de Groot",
"person_name_given_name": "Cornelia Wilhelmina",
"person_name_family_name_prefix": "de",
"person_name_family_name": "Groot"
},
"roles": [
"bride"
],
"biographical": {
"age": 25,
"occupation": "zonder beroep",
"birth_place": "Haarlem",
"residence": "Haarlem",
"civil_status": "meerderjarige"
},
"family_relationships": {
"relationship_to_parents": "dochter van",
"father": "Hendrik de Groot",
"mother": "Elisabeth van Dijk"
},
"context": "Bride, 25-year-old without occupation, born in Haarlem, residing in Haarlem, daughter of Hendrik de Groot and the late Elisabeth van Dijk."
},
{
"person_index": 4,
"pnv_name": {
"person_name_standard_text": "Hendrik de Groot",
"person_name_given_name": "Hendrik",
"person_name_family_name_prefix": "de",
"person_name_family_name": "Groot"
},
"roles": [
"father_of_bride"
],
"biographical": {
"occupation": "timmerman"
},
"family_relationships": {
"father_of": "Cornelia Wilhelmina de Groot"
},
"context": "Father of the bride, a carpenter."
},
{
"person_index": 5,
"pnv_name": {
"person_name_standard_text": "Elisabeth van Dijk",
"person_name_given_name": "Elisabeth",
"person_name_family_name_prefix": "van",
"person_name_family_name": "Dijk"
},
"roles": [
"mother_of_bride"
],
"biographical": {
"deceased": true
},
"family_relationships": {
"mother_of": "Cornelia Wilhelmina de Groot"
},
"context": "Mother of the bride, deceased."
},
{
"person_index": 6,
"pnv_name": {
"person_name_standard_text": "Willem Frederik Smit",
"person_name_given_name": "Willem Frederik",
"person_name_family_name": "Smit"
},
"roles": [
"witness"
],
"biographical": {
"age": 40,
"occupation": "notaris"
},
"context": "Witness, 40-year-old notary."
},
{
"person_index": 7,
"pnv_name": {
"person_name_standard_text": "Jacobus Hendrikus Jansen",
"person_name_given_name": "Jacobus Hendrikus",
"person_name_family_name": "Jansen"
},
"roles": [
"witness"
],
"biographical": {
"age": 35,
"occupation": "klerk"
},
"context": "Witness, 35-year-old clerk."
}
],
"temporal_references": [
{
"expression": "vierden Maart achttien honderd vijf en tachtig",
"normalized": "1885-03-04",
"calendar": "Gregorian"
}
],
"locations_mentioned": [
{
"name": "Amsterdam",
"type": "city"
},
{
"name": "Haarlem",
"type": "city"
}
]
}

View file

@ -0,0 +1,139 @@
{
"pico_observation": {
"observation_id": "bap_reg_thess_1875_03_15_01",
"source_type": "baptismal_register",
"source_reference": "Greek Orthodox Baptismal Register, Thessaloniki, 15 March 1875"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"literalName": "Δημήτριος",
"literalName_romanized": "Dēmētrios"
},
"roles": [
"baptized"
],
"biographical": {},
"family_relationships": {
"father": "Νικόλαος Παπαδόπουλος",
"mother": "Ἑλένη"
},
"context": "The baptized, son of Nikolaos Papadopoulos and Elenē."
},
{
"person_index": 1,
"pnv_name": {
"literalName": "Νικόλαος Παπαδόπουλος",
"literalName_romanized": "Nikolaos Papadopoulos"
},
"roles": [
"parent"
],
"biographical": {
"occupation": "ἔμπορος"
},
"family_relationships": {
"son": "Δημήτριος",
"wife": "Ἑλένη"
},
"context": "Father of the baptized, a merchant, husband of Elenē."
},
{
"person_index": 2,
"pnv_name": {
"literalName": "Ἑλένη",
"literalName_romanized": "Elenē"
},
"roles": [
"parent"
],
"biographical": {},
"family_relationships": {
"son": "Δημήτριος",
"husband": "Νικόλαος Παπαδόπουλος",
"father": "Γεώργιος Οἰκόνομος"
},
"context": "Mother of the baptized, wife of Nikolaos Papadopoulos, daughter of the late Geōrgios Oikonomos."
},
{
"person_index": 3,
"pnv_name": {
"literalName": "Γεώργιος Οἰκόνομος",
"literalName_romanized": "Geōrgios Oikonomos"
},
"roles": [
"grandparent"
],
"biographical": {
"deceased": true
},
"family_relationships": {
"daughter": "Ἑλένη"
},
"context": "The late father of the mother (Elenē)."
},
{
"person_index": 4,
"pnv_name": {
"literalName": "Κωνσταντῖνος Καρατζᾶς",
"literalName_romanized": "Kōnstantinos Karatzas"
},
"roles": [
"godparent"
],
"biographical": {
"occupation": "ἰατρός"
},
"family_relationships": {
"father": "Ἰωάννης"
},
"context": "Godparent, a physician, son of Iōannēs."
},
{
"person_index": 5,
"pnv_name": {
"literalName": "Ἰωάννης",
"literalName_romanized": "Iōannēs"
},
"roles": [
"godparent's_parent"
],
"biographical": {},
"family_relationships": {
"son": "Κωνσταντῖνος Καρατζᾶς"
},
"context": "Father of the godparent (Kōnstantinos Karatzas)."
},
{
"person_index": 6,
"pnv_name": {
"literalName": "Ἀθανάσιος Χρυσοστόμου",
"literalName_romanized": "Athanasios Chrysostomou"
},
"roles": [
"priest"
],
"biographical": {
"ecclesiastical_title": "Πρωτοπρεσβύτερος"
},
"family_relationships": {},
"context": "The officiating priest, an Archpriest."
}
],
"temporal_references": [
{
"expression": "τῇ δεκάτῃ πέμπτῃ Μαρτίου τοῦ ἔτους 1875",
"expression_romanized": "tē dekatē pemptē Martiou tou etous 1875",
"normalized": "1875-03-15",
"calendar": "Julian"
}
],
"locations_mentioned": [
{
"name": "Θεσσαλονίκῃ",
"name_romanized": "Thessalonikē",
"type": "city"
}
]
}

View file

@ -0,0 +1,124 @@
{
"pico_observation": {
"observation_id": "bap_reg_thess_1875_03_15_01",
"source_type": "baptismal_register",
"source_reference": "Thessaloniki Baptismal Register, 15 March 1875"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"literalName": "Δημήτριος",
"literalName_romanized": "Dēmētrios"
},
"roles": [
"baptized"
],
"biographical": {},
"family_relationships": {
"father": "Νικολάου Παπαδοπούλου",
"mother": "Ἑλένης"
},
"context": "Son of Nikolaos Papadopoulos and Eleni, baptized in Thessaloniki."
},
{
"person_index": 1,
"pnv_name": {
"literalName": "Νικολάου Παπαδοπούλου",
"literalName_romanized": "Nikolaou Papadopoulou"
},
"roles": [
"parent"
],
"biographical": {
"occupation": "ἔμπορος"
},
"family_relationships": {
"son": "Δημήτριος",
"wife": "Ἑλένης"
},
"context": "Father of the baptized Dimitrios, merchant, husband of Eleni."
},
{
"person_index": 2,
"pnv_name": {
"literalName": "Ἑλένης",
"literalName_romanized": "Elenēs"
},
"roles": [
"parent"
],
"biographical": {},
"family_relationships": {
"son": "Δημήτριος",
"husband": "Νικολάου Παπαδοπούλου",
"father": "μακαρίτου Γεωργίου Οἰκονόμου"
},
"context": "Mother of the baptized Dimitrios, wife of Nikolaos Papadopoulos, daughter of the late Georgios Oikonomou."
},
{
"person_index": 3,
"pnv_name": {
"literalName": "Γεωργίου Οἰκονόμου",
"literalName_romanized": "Geōrgiou Oikonomou"
},
"roles": [
"grandparent"
],
"biographical": {
"deceased": true
},
"family_relationships": {
"daughter": "Ἑλένης"
},
"context": "Late father of Eleni, maternal grandfather of the baptized Dimitrios."
},
{
"person_index": 4,
"pnv_name": {
"literalName": "Κωνσταντῖνος Καρατζᾶς",
"literalName_romanized": "Kōnstantinos Karatzas"
},
"roles": [
"godparent"
],
"biographical": {
"occupation": "ἰατρός"
},
"family_relationships": {
"father": "Ἰωάννου"
},
"context": "Godparent of Dimitrios, son of Ioannis, physician."
},
{
"person_index": 5,
"pnv_name": {
"literalName": "Ἀθανάσιος Χρυσοστόμου",
"literalName_romanized": "Athanasios Chrysostomou"
},
"roles": [
"priest"
],
"biographical": {
"ecclesiastical_title": "Πρωτοπρεσβύτερος"
},
"family_relationships": {},
"context": "Archpriest who performed the baptism."
}
],
"temporal_references": [
{
"expression": "τῇ δεκάτῃ πέμπτῃ Μαρτίου τοῦ ἔτους 1875",
"expression_romanized": "tē dekatē pemptē Martiou tou etous 1875",
"normalized": "1875-03-15",
"calendar": "Julian"
}
],
"locations_mentioned": [
{
"name": "Θεσσαλονίκῃ",
"name_romanized": "Thessalonikē",
"type": "city"
}
]
}

View file

@ -0,0 +1,252 @@
{
"pico_observation": {
"observation_id": "ketubah_vilna_5605_obs_001",
"source_type": "ketubah",
"source_reference": "Vilna Ketubah, 12 Iyar 5605"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"script": "Hebrew",
"text": "יצחק",
"romanized": "Yitzchak"
},
"roles": [
"groom"
],
"biographical": {
"patronymic": {
"script": "Hebrew",
"text": "בן הר״ר אברהם",
"romanized": "ben HaRav Avraham"
},
"tribal_affiliation": {
"script": "Hebrew",
"text": "הכהן",
"romanized": "haKohen"
},
"honorifics": [
{
"script": "Hebrew",
"text": "הבחור",
"romanized": "haBachur"
}
]
},
"family_relationships": {
"father": {
"person_index": 1,
"relationship_type": "paternal"
}
},
"context": "The groom, son of Avraham haKohen."
},
{
"person_index": 1,
"pnv_name": {
"script": "Hebrew",
"text": "אברהם",
"romanized": "Avraham"
},
"roles": [],
"biographical": {
"honorifics": [
{
"script": "Hebrew",
"text": "הר״ר",
"romanized": "HaRav"
}
],
"tribal_affiliation": {
"script": "Hebrew",
"text": "הכהן",
"romanized": "haKohen"
},
"deceased_marker": {
"script": "Hebrew",
"text": "ז״ל",
"romanized": "z'l"
}
},
"family_relationships": {
"child": {
"person_index": 0,
"relationship_type": "paternal"
}
},
"context": "Father of the groom, of blessed memory."
},
{
"person_index": 2,
"pnv_name": {
"script": "Hebrew",
"text": "מרים",
"romanized": "Miriam"
},
"roles": [
"bride"
],
"biographical": {
"patronymic": {
"script": "Hebrew",
"text": "בת הר״ר משה",
"romanized": "bat HaRav Moshe"
},
"tribal_affiliation": {
"script": "Hebrew",
"text": "הלוי",
"romanized": "haLevi"
},
"honorifics": [
{
"script": "Hebrew",
"text": "מרת",
"romanized": "Marat"
}
]
},
"family_relationships": {
"father": {
"person_index": 3,
"relationship_type": "paternal"
}
},
"context": "The bride, daughter of Moshe haLevi."
},
{
"person_index": 3,
"pnv_name": {
"script": "Hebrew",
"text": "משה",
"romanized": "Moshe"
},
"roles": [],
"biographical": {
"honorifics": [
{
"script": "Hebrew",
"text": "הר״ר",
"romanized": "HaRav"
}
],
"tribal_affiliation": {
"script": "Hebrew",
"text": "הלוי",
"romanized": "haLevi"
}
},
"family_relationships": {
"child": {
"person_index": 2,
"relationship_type": "paternal"
}
},
"context": "Father of the bride."
},
{
"person_index": 4,
"pnv_name": {
"script": "Hebrew",
"text": "שמעון",
"romanized": "Shimon"
},
"roles": [
"witness"
],
"biographical": {
"patronymic": {
"script": "Hebrew",
"text": "בן יעקב",
"romanized": "ben Yaakov"
},
"tribal_affiliation": {
"script": "Hebrew",
"text": "הכהן",
"romanized": "haKohen"
}
},
"family_relationships": {
"father": {
"person_index": 5,
"relationship_type": "paternal"
}
},
"context": "First witness to the marriage."
},
{
"person_index": 5,
"pnv_name": {
"script": "Hebrew",
"text": "יעקב",
"romanized": "Yaakov"
},
"roles": [],
"biographical": {},
"family_relationships": {
"child": {
"person_index": 4,
"relationship_type": "paternal"
}
},
"context": "Father of the first witness, Shimon."
},
{
"person_index": 6,
"pnv_name": {
"script": "Hebrew",
"text": "דוד",
"romanized": "David"
},
"roles": [
"witness"
],
"biographical": {
"patronymic": {
"script": "Hebrew",
"text": "בן אליהו",
"romanized": "ben Eliyahu"
}
},
"family_relationships": {
"father": {
"person_index": 7,
"relationship_type": "paternal"
}
},
"context": "Second witness to the marriage."
},
{
"person_index": 7,
"pnv_name": {
"script": "Hebrew",
"text": "אליהו",
"romanized": "Eliyahu"
},
"roles": [],
"biographical": {},
"family_relationships": {
"child": {
"person_index": 6,
"relationship_type": "paternal"
}
},
"context": "Father of the second witness, David."
}
],
"temporal_references": [
{
"expression": "ביום שלישי בשבת, שנים עשר יום לחודש אייר שנת חמשת אלפים שש מאות וארבעים וחמש לבריאת עולם",
"expression_romanized": "BeYom Shlishi beShabbat, Shneim Asar Yom leChodesh Iyar, Shnat Chameshet Alafim Shesh Meot veArba'im veChamesh leBriyat Olam",
"normalized": "5605-04-12",
"calendar": "Hebrew"
}
],
"locations_mentioned": [
{
"name": "פה ווילנא",
"name_romanized": "Po Vilna",
"type": "city"
}
]
}

View file

@ -0,0 +1,202 @@
{
"pico_observation": {
"observation_id": "ketubah_vilna_5605_obs_1",
"source_type": "ketubah",
"source_reference": "Vilna Ketubah, 12 Iyar 5605 (1845 CE)"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"script": "Hebrew",
"text": "יצחק",
"romanized": "Yitzchak"
},
"roles": [
"groom",
"חתן"
],
"biographical": {
"patronymic": {
"script": "Hebrew",
"text": "בן הר״ר אברהם",
"romanized": "ben HaRav Avraham"
},
"tribal_affiliation": {
"script": "Hebrew",
"text": "הכהן",
"romanized": "haKohen"
},
"honorifics": [
"הבחור"
]
},
"family_relationships": {
"father": {
"person_index": 1,
"relationship_type": "paternal",
"deceased": true
}
},
"context": "The groom, son of the late Rabbi Avraham haKohen."
},
{
"person_index": 1,
"pnv_name": {
"script": "Hebrew",
"text": "אברהם",
"romanized": "Avraham"
},
"roles": [
"father_of_groom"
],
"biographical": {
"honorifics": [
"הר״ר"
],
"tribal_affiliation": {
"script": "Hebrew",
"text": "הכהן",
"romanized": "haKohen"
},
"deceased_marker": {
"script": "Hebrew",
"text": "ז״ל",
"romanized": "z'l"
}
},
"family_relationships": {
"son": {
"person_index": 0,
"relationship_type": "paternal"
}
},
"context": "The deceased father of the groom, Rabbi Avraham haKohen."
},
{
"person_index": 2,
"pnv_name": {
"script": "Hebrew",
"text": "מרים",
"romanized": "Miriam"
},
"roles": [
"bride",
"כלה"
],
"biographical": {
"patronymic": {
"script": "Hebrew",
"text": "בת הר״ר משה",
"romanized": "bat HaRav Moshe"
},
"tribal_affiliation": {
"script": "Hebrew",
"text": "הלוי",
"romanized": "haLevi"
},
"honorifics": [
"מרת"
]
},
"family_relationships": {
"father": {
"person_index": 3,
"relationship_type": "paternal",
"deceased": false
}
},
"context": "The bride, daughter of Rabbi Moshe haLevi."
},
{
"person_index": 3,
"pnv_name": {
"script": "Hebrew",
"text": "משה",
"romanized": "Moshe"
},
"roles": [
"father_of_bride"
],
"biographical": {
"honorifics": [
"הר״ר"
],
"tribal_affiliation": {
"script": "Hebrew",
"text": "הלוי",
"romanized": "haLevi"
}
},
"family_relationships": {
"daughter": {
"person_index": 2,
"relationship_type": "paternal"
}
},
"context": "The father of the bride, Rabbi Moshe haLevi."
},
{
"person_index": 4,
"pnv_name": {
"script": "Hebrew",
"text": "שמעון",
"romanized": "Shimon"
},
"roles": [
"witness",
"עד"
],
"biographical": {
"patronymic": {
"script": "Hebrew",
"text": "בן יעקב",
"romanized": "ben Yaakov"
},
"tribal_affiliation": {
"script": "Hebrew",
"text": "הכהן",
"romanized": "haKohen"
}
},
"family_relationships": {},
"context": "First witness to the marriage."
},
{
"person_index": 5,
"pnv_name": {
"script": "Hebrew",
"text": "דוד",
"romanized": "David"
},
"roles": [
"witness",
"עד"
],
"biographical": {
"patronymic": {
"script": "Hebrew",
"text": "בן אליהו",
"romanized": "ben Eliyahu"
}
},
"family_relationships": {},
"context": "Second witness to the marriage."
}
],
"temporal_references": [
{
"expression": "ביום שלישי בשבת, שנים עשר יום לחודש אייר שנת חמשת אלפים שש מאות וארבעים וחמש לבריאת עולם",
"expression_romanized": "BeYom Shlishi BeShabbat, Shneim Asar Yom LeChodesh Iyar, Shnat Chamishat Alafim Shesh Meot VeArba'im VeChamesh LeBeriat Olam",
"normalized": "1845-04-18",
"calendar": "Hebrew"
}
],
"locations_mentioned": [
{
"name": "ווילנא",
"name_romanized": "Vilna",
"type": "city"
}
]
}

View file

@ -0,0 +1,192 @@
{
"pico_observation": {
"observation_id": "obs_001",
"source_type": "notarial_act",
"source_reference": "Adì 15 Marzo 1654, in Venetia"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"given_name": "Giovanni Battista",
"surname": "Morosini"
},
"roles": [
"party"
],
"biographical": {
"title": "Nobil Homo Messer",
"residence": "contrada di San Marco",
"status": "living"
},
"family_relationships": {
"father": {
"type": "father",
"name": {
"pnv_name": {
"given_name": "Andrea",
"surname": null
},
"title": "Magnifico Messer"
},
"status": "deceased"
}
},
"context": "Il Nobil Homo Messer Giovanni Battista Morosini fu quondam Magnifico Messer Andrea, della contrada di San Marco"
},
{
"person_index": 1,
"pnv_name": {
"given_name": "Caterina",
"surname": "Contarini"
},
"roles": [
"party"
],
"biographical": {
"title": "Nobil Donna Madonna",
"residence": "contrada di San Marco",
"status": "living"
},
"family_relationships": {
"father": {
"type": "father",
"name": {
"pnv_name": {
"given_name": "Francesco",
"surname": null
},
"title": "Messer"
},
"status": "deceased"
},
"spouse": {
"type": "spouse",
"name": {
"pnv_name": {
"given_name": "Giovanni Battista",
"surname": "Morosini"
},
"title": "Nobil Homo Messer"
},
"status": "living"
}
},
"context": "sua moglie la Nobil Donna Madonna Caterina Contarini fu quondam Messer Francesco"
},
{
"person_index": 2,
"pnv_name": {
"given_name": "Pietro",
"surname": "Fabbro"
},
"roles": [
"witness"
],
"biographical": {
"title": "Messer",
"residence": "contrada di San Polo",
"status": "living"
},
"family_relationships": {
"father": {
"type": "father",
"name": {
"pnv_name": {
"given_name": "Paolo",
"surname": null
},
"title": null
},
"status": "deceased"
}
},
"context": "Messer Pietro fu Paolo Fabbro, habitante nella contrada di San Polo"
},
{
"person_index": 3,
"pnv_name": {
"given_name": "Marco Antonio",
"surname": "Ferrari"
},
"roles": [
"witness"
],
"biographical": {
"title": "Messer",
"occupation": "bottegaio",
"residence": "Rialto",
"status": "living"
},
"family_relationships": {
"father": {
"type": "father",
"name": {
"pnv_name": {
"given_name": "Giovanni",
"surname": null
},
"title": null
},
"status": "deceased"
}
},
"context": "Messer Marco Antonio Ferrari fu Giovanni, bottegaio in Rialto"
},
{
"person_index": 4,
"pnv_name": {
"given_name": "Antonio",
"surname": "Zen"
},
"roles": [
"notary"
],
"biographical": {
"title": "Notaro",
"occupation": "Notaro publico di Venetia",
"residence": "Venetia",
"status": "living"
},
"family_relationships": {
"father": {
"type": "father",
"name": {
"pnv_name": {
"given_name": "Giacomo",
"surname": null
},
"title": "Messer"
},
"status": "deceased"
}
},
"context": "Rogato io Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico di Venetia"
}
],
"temporal_references": [
{
"expression": "15 Marzo 1654",
"normalized": "1654-03-15",
"calendar": "Gregorian"
}
],
"locations_mentioned": [
{
"name": "Venetia",
"type": "city"
},
{
"name": "San Marco",
"type": "contrada"
},
{
"name": "San Polo",
"type": "contrada"
},
{
"name": "Rialto",
"type": "district"
}
]
}

View file

@ -0,0 +1,156 @@
{
"pico_observation": {
"observation_id": "notarial_act_1654_03_15_venezia_01",
"source_type": "notarial_act",
"source_reference": "Adì 15 Marzo 1654, in Venetia. Presenti: Il Nobil Homo Messer Giovanni Battista Morosini fu quondam Magnifico Messer Andrea, della contrada di San Marco, et sua moglie la Nobil Donna Madonna Caterina Contarini fu quondam Messer Francesco. Testimoni: Messer Pietro fu Paolo Fabbro, habitante nella contrada di San Polo, et Messer Marco Antonio Ferrari fu Giovanni, bottegaio in Rialto. Rogato io Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico di Venetia."
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"given_name": "Giovanni Battista",
"surname": "Morosini"
},
"roles": [
"party"
],
"biographical": {
"nobility_titles": [
"Nobil Homo",
"Messer"
],
"residence": "contrada di San Marco"
},
"family_relationships": {
"father": {
"name": "Andrea Morosini",
"status": "deceased",
"title": "Magnifico Messer"
},
"spouse": "Caterina Contarini"
},
"context": "Il Nobil Homo Messer Giovanni Battista Morosini fu quondam Magnifico Messer Andrea, della contrada di San Marco"
},
{
"person_index": 1,
"pnv_name": {
"given_name": "Caterina",
"surname": "Contarini"
},
"roles": [
"party"
],
"biographical": {
"nobility_titles": [
"Nobil Donna",
"Madonna"
]
},
"family_relationships": {
"father": {
"name": "Francesco Contarini",
"status": "deceased",
"title": "Messer"
},
"spouse": "Giovanni Battista Morosini"
},
"context": "sua moglie la Nobil Donna Madonna Caterina Contarini fu quondam Messer Francesco"
},
{
"person_index": 2,
"pnv_name": {
"given_name": "Pietro",
"surname": "Fabbro"
},
"roles": [
"witness"
],
"biographical": {
"nobility_titles": [
"Messer"
],
"residence": "contrada di San Polo"
},
"family_relationships": {
"father": {
"name": "Paolo Fabbro",
"status": "deceased"
}
},
"context": "Messer Pietro fu Paolo Fabbro, habitante nella contrada di San Polo"
},
{
"person_index": 3,
"pnv_name": {
"given_name": "Marco Antonio",
"surname": "Ferrari"
},
"roles": [
"witness"
],
"biographical": {
"nobility_titles": [
"Messer"
],
"occupation": "bottegaio",
"work_location": "Rialto"
},
"family_relationships": {
"father": {
"name": "Giovanni Ferrari",
"status": "deceased"
}
},
"context": "Messer Marco Antonio Ferrari fu Giovanni, bottegaio in Rialto"
},
{
"person_index": 4,
"pnv_name": {
"given_name": "Antonio",
"surname": "Zen"
},
"roles": [
"notary"
],
"biographical": {
"nobility_titles": [
"Messer"
],
"occupation": "Notaro publico di Venetia"
},
"family_relationships": {
"father": {
"name": "Giacomo Zen",
"status": "deceased",
"title": "Messer"
}
},
"context": "io Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico di Venetia"
}
],
"temporal_references": [
{
"expression": "15 Marzo 1654",
"normalized": "1654-03-15",
"calendar": "Gregorian"
}
],
"locations_mentioned": [
{
"name": "Venetia",
"type": "city"
},
{
"name": "contrada di San Marco",
"type": "district"
},
{
"name": "contrada di San Polo",
"type": "district"
},
{
"name": "Rialto",
"type": "area"
}
]
}

View file

@ -0,0 +1,125 @@
{
"pico_observation": {
"observation_id": "sijill_001",
"source_type": "sijill",
"source_reference": "Ottoman Court Record, Dated 1258 AH"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"literalName": "محمد آغا بن عبد الله مرحوم",
"literalName_romanized": "Muhammad Ağa bin Abdullah al-merhum"
},
"roles": [
"seller"
],
"biographical": {
"honorifics": [
"آغا/Ağa"
],
"is_deceased": true,
"patronymic": "بن عبد الله/bin Abdullah"
},
"family_relationships": {},
"context": "Seller of the property, identified as a resident of Demirci-köy. The record notes he is deceased (merhum), implying the sale is conducted by his estate or heirs."
},
{
"person_index": 1,
"pnv_name": {
"literalName": "محمد بن احمد افندی",
"literalName_romanized": "Muhammad bin Ahmad Efendi"
},
"roles": [
"buyer"
],
"biographical": {
"honorifics": [
"افندی/Efendi"
],
"patronymic": "بن احمد/bin Ahmad"
},
"family_relationships": {
"spouse": {
"person_index": 2,
"name": "فاطمة خاتون/Fatima Hatun"
}
},
"context": "Buyer of the property, a resident of Demirci-köy, present with his wife for the transaction."
},
{
"person_index": 2,
"pnv_name": {
"literalName": "فاطمه خاتون بنت علی‌اوغلو",
"literalName_romanized": "Fatima Hatun bint Ali-oğlu"
},
"roles": [
"buyer"
],
"biographical": {
"honorifics": [
"خاتون/Hatun"
],
"patronymic": "بنت علی‌اوغلو/bint Ali-oğlu"
},
"family_relationships": {
"spouse": {
"person_index": 1,
"name": "محمد بن احمد افندی/Muhammad bin Ahmad Efendi"
}
},
"context": "Wife (zevcesi) of the buyer, Muhammad bin Ahmad Efendi, and co-purchaser. She is identified as the daughter of Ali-oğlu."
},
{
"person_index": 3,
"pnv_name": {
"literalName": "حسن افندی بن عمر",
"literalName_romanized": "Hasan Efendi bin Umar"
},
"roles": [
"witness"
],
"biographical": {
"honorifics": [
"افندی/Efendi"
],
"patronymic": "بن عمر/bin Umar"
},
"family_relationships": {},
"context": "One of the two witnesses (şühûd-ı hâl) to the sale."
},
{
"person_index": 4,
"pnv_name": {
"literalName": "ابراهيم چلبی بن مصطفی",
"literalName_romanized": "Ibrahim Çelebi bin Mustafa"
},
"roles": [
"witness"
],
"biographical": {
"honorifics": [
"چلبی/Çelebi"
],
"patronymic": "بن مصطفی/bin Mustafa"
},
"family_relationships": {},
"context": "One of the two witnesses (şühûd-ı hâl) to the sale."
}
],
"temporal_references": [
{
"expression": "فی اوائل شهر رجب سنة ١٢٥٨",
"expression_romanized": "Fi eva'il-i şehr-i Receb sene 1258",
"normalized": "Beginning of Rajab, 1258 AH",
"calendar": "Hijri"
}
],
"locations_mentioned": [
{
"name": "دميرجی‌كوي",
"name_romanized": "Demirci-köy",
"type": "قصبه/kasaba"
}
]
}

View file

@ -0,0 +1,149 @@
{
"pico_observation": {
"observation_id": "sijill_1258_rajab_001",
"source_type": "sijill",
"source_reference": "Ottoman Court Record, Dated Beginning of Rajab 1258 AH"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"literalName": "محمد آغا",
"literalName_romanized": "Muhammad Ağa"
},
"roles": [
"seller"
],
"biographical": {
"patronymic": "بن عبد الله",
"patronymic_romanized": "bin Abdullah",
"honorific": "آغا",
"honorific_romanized": "Ağa",
"deceased_father": true
},
"family_relationships": {
"father": {
"name": "عبد الله",
"name_romanized": "Abdullah",
"deceased": true
}
},
"context": "Seller from the district of Demirciköy, son of the deceased Abdullah."
},
{
"person_index": 1,
"pnv_name": {
"literalName": "محمد بن احمد افندی",
"literalName_romanized": "Muhammad bin Ahmad Efendi"
},
"roles": [
"buyer"
],
"biographical": {
"patronymic": "بن احمد",
"patronymic_romanized": "bin Ahmad",
"honorific": "افندی",
"honorific_romanized": "Efendi"
},
"family_relationships": {
"father": {
"name": "احمد",
"name_romanized": "Ahmad"
},
"spouse": {
"person_index": 2,
"relation": "wife"
}
},
"context": "Buyer, son of Ahmad, husband of Fatima Hatun."
},
{
"person_index": 2,
"pnv_name": {
"literalName": "فاطمه خاتون",
"literalName_romanized": "Fatima Hatun"
},
"roles": [
"buyer"
],
"biographical": {
"patronymic": "بنت علی‌اوغلو",
"patronymic_romanized": "bint Ali-oğlu",
"honorific": "خاتون",
"honorific_romanized": "Hatun"
},
"family_relationships": {
"father": {
"name": "علی‌اوغلو",
"name_romanized": "Ali-oğlu"
},
"spouse": {
"person_index": 1,
"relation": "husband"
}
},
"context": "Buyer, daughter of Ali-oğlu, wife of Muhammad bin Ahmad Efendi."
},
{
"person_index": 3,
"pnv_name": {
"literalName": "حسن افندی بن عمر",
"literalName_romanized": "Hasan Efendi bin Umar"
},
"roles": [
"witness"
],
"biographical": {
"patronymic": "بن عمر",
"patronymic_romanized": "bin Umar",
"honorific": "افندی",
"honorific_romanized": "Efendi"
},
"family_relationships": {
"father": {
"name": "عمر",
"name_romanized": "Umar"
}
},
"context": "Witness to the transaction, son of Umar."
},
{
"person_index": 4,
"pnv_name": {
"literalName": "ابراهيم چلبی بن مصطفی",
"literalName_romanized": "Ibrahim Çelebi bin Mustafa"
},
"roles": [
"witness"
],
"biographical": {
"patronymic": "بن مصطفی",
"patronymic_romanized": "bin Mustafa",
"honorific": "چلبی",
"honorific_romanized": "Çelebi"
},
"family_relationships": {
"father": {
"name": "مصطفی",
"name_romanized": "Mustafa"
}
},
"context": "Witness to the transaction, son of Mustafa."
}
],
"temporal_references": [
{
"expression": "فی اوائل شهر رجب سنة ١٢٥٨",
"expression_romanized": "Fi awā'il shahr Rajab sanat 1258",
"normalized": "Beginning of Rajab, 1258 AH",
"calendar": "Hijri"
}
],
"locations_mentioned": [
{
"name": "قصبه دميرجی‌کوی",
"name_romanized": "Kasaba-ı Demirciköy",
"type": "District/Town"
}
]
}

View file

@ -0,0 +1,166 @@
```json
{
"pico_observation": {
"observation_id": "waqf_aleppo_1225h",
"observed_at": "2023-10-27T10:00:00Z",
"source_type": "waqf_document",
"source_reference": "Aleppo Waqf, 1225 H"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"literalName": "الحاج أحمد بن محمد العمري",
"literalName_romanized": "al-Hajj Ahmad ibn Muhammad al-Umari",
"givenName": "أحمد",
"givenName_romanized": "Ahmad",
"patronym": "محمد",
"patronym_romanized": "Muhammad",
"baseSurname": "العمري",
"baseSurname_romanized": "al-Umari",
"honorificPrefix": "الحاج",
"honorificPrefix_romanized": "al-Hajj"
},
"roles": [
{
"role_title": "تاجر",
"role_title_romanized": "tajir",
"role_in_source": "founder"
}
],
"biographical": {
"deceased": true,
"address": "مدينة حلب الشهباء"
},
"family_relationships": {
"parent": [
{
"person_index": 1,
"target_name": "محمد بن عبد الله العمري"
}
],
"children": []
},
"context": "The founder (waqif) of the endowment, a deceased merchant from Aleppo."
},
{
"person_index": 1,
"pnv_name": {
"literalName": "المرحوم محمد بن عبد الله العمري",
"literalName_romanized": "al-marhum Muhammad ibn Abd Allah al-Umari",
"givenName": "محمد",
"givenName_romanized": "Muhammad",
"patronym": "عبد الله",
"patronym_romanized": "Abd Allah",
"baseSurname": "العمري",
"baseSurname_romanized": "al-Umari"
},
"roles": [
{
"role_title": null,
"role_title_romanized": null,
"role_in_source": null
}
],
"biographical": {
"deceased": true,
"address": null
},
"family_relationships": {
"parent": [],
"children": [
{
"person_index": 0,
"target_name": "أحمد بن محمد العمري"
}
]
},
"context": "The deceased father of the founder, Ahmad al-Umari."
},
{
"person_index": 2,
"pnv_name": {
"literalName": "الحاج إبراهيم بن يوسف التركماني",
"literalName_romanized": "al-Hajj Ibrahim ibn Yusuf al-Turkmani",
"givenName": "إبراهيم",
"givenName_romanized": "Ibrahim",
"patronym": "يوسف",
"patronym_romanized": "Yusuf",
"baseSurname": "التركماني",
"baseSurname_romanized": "al-Turkmani",
"honorificPrefix": "الحاج",
"honorificPrefix_romanized": "al-Hajj"
},
"roles": [
{
"role_title": "شاهد",
"role_title_romanized": "shahid",
"role_in_source": "witness"
}
],
"biographical": {
"deceased": null,
"address": null
},
"family_relationships": {
"parent": [],
"children": []
},
"context": "One of the witnesses to the endowment deed."
},
{
"person_index": 3,
"pnv_name": {
"literalName": "السيد علي بن حسين الحلبي",
"literalName_romanized": "al-Sayyid Ali ibn Husayn al-Halabi",
"givenName": "علي",
"givenName_romanized": "Ali",
"patronym": "حسين",
"patronym_romanized": "Husayn",
"baseSurname": "الحلبي",
"baseSurname_romanized": "al-Halabi",
"honorificPrefix": "السيد",
"honorificPrefix_romanized": "al-Sayyid"
},
"roles": [
{
"role_title": "شاهد",
"role_title_romanized": "shahid",
"role_in_source": "witness"
}
],
"biographical": {
"deceased": null,
"address": null
},
"family_relationships": {
"parent": [],
"children": []
},
"context": "The second witness to the endowment deed."
}
],
"temporal_references": [
{
"expression": "شهر رجب سنة ألف ومائتين وخمس وعشرين هجرية",
"expression_romanized": "Shahr Rajab sanat alf wa mi'atayn wa khamsa wa 'ishrin hijriyyah",
"normalized": "1810-08",
"calendar": "Hijri",
"type": "DATE"
}
],
"locations_mentioned": [
{
"name": "حلب الشهباء",
"name_romanized": "Halab al-Shahba'",
"type": "city"
},
{
"name": "محلة الجديدة",
"name_romanized": "Mahallat al-Jadida",
"type": "neighborhood"
}
]
}
```

View file

@ -0,0 +1,167 @@
{
"pico_observation": {
"observation_id": "obs_001",
"source_type": "metrical_book",
"source_reference": "Метрическая книга Троицкой церкви села Покровского за 1892 год"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"full_name_cyrillic": "Иван",
"full_name_romanized": "Ivan",
"given_name_cyrillic": "Иван",
"given_name_romanized": "Ivan"
},
"roles": [
"newborn"
],
"biographical": {
"sex": "male",
"estate": "peasant"
},
"family_relationships": {
"father": "Пётр Иванович Сидоров",
"mother": "Анна Фёдоровна"
},
"context": "Born March 15, baptized March 17, 1892, son of Pyotr Ivanovich Sidorov and Anna Fyodorovna."
},
{
"person_index": 1,
"pnv_name": {
"full_name_cyrillic": "Пётр Иванович Сидоров",
"full_name_romanized": "Pyotr Ivanovich Sidorov",
"given_name_cyrillic": "Пётр",
"given_name_romanized": "Pyotr",
"patronymic_cyrillic": "Иванович",
"patronymic_romanized": "Ivanovich",
"surname_cyrillic": "Сидоров",
"surname_romanized": "Sidorov"
},
"roles": [
"father"
],
"biographical": {
"sex": "male",
"estate": "крестьянин (peasant)",
"religion": "православный (Orthodox)"
},
"family_relationships": {
"son": "Иван",
"wife": "Анна Фёдоровна",
"sibling": "Мария Ивановна Сидорова"
},
"context": "Peasant from the village of Ivanovka, father of the newborn Ivan, husband of Anna Fyodorovna."
},
{
"person_index": 2,
"pnv_name": {
"full_name_cyrillic": "Анна Фёдоровна",
"full_name_romanized": "Anna Fyodorovna",
"given_name_cyrillic": "Анна",
"given_name_romanized": "Anna",
"patronymic_cyrillic": "Фёдоровна",
"patronymic_romanized": "Fyodorovna"
},
"roles": [
"mother"
],
"biographical": {
"sex": "female",
"religion": "православный (Orthodox)"
},
"family_relationships": {
"son": "Иван",
"husband": "Пётр Иванович Сидоров"
},
"context": "Lawful wife of Pyotr Ivanovich Sidorov, mother of the newborn Ivan."
},
{
"person_index": 3,
"pnv_name": {
"full_name_cyrillic": "Николай Петрович Кузнецов",
"full_name_romanized": "Nikolay Petrovich Kuznetsov",
"given_name_cyrillic": "Николай",
"given_name_romanized": "Nikolay",
"patronymic_cyrillic": "Петрович",
"patronymic_romanized": "Petrovich",
"surname_cyrillic": "Кузнецов",
"surname_romanized": "Kuznetsov"
},
"roles": [
"godparent"
],
"biographical": {
"sex": "male",
"estate": "крестьянин (peasant)"
},
"family_relationships": {
"godson": "Иван"
},
"context": "Godparent of Ivan, a peasant from the same village (Ivanovka)."
},
{
"person_index": 4,
"pnv_name": {
"full_name_cyrillic": "Мария Ивановна Сидорова",
"full_name_romanized": "Maria Ivanovna Sidorova",
"given_name_cyrillic": "Мария",
"given_name_romanized": "Maria",
"patronymic_cyrillic": "Ивановна",
"patronymic_romanized": "Ivanovna",
"surname_cyrillic": "Сидорова",
"surname_romanized": "Sidorova"
},
"roles": [
"godparent"
],
"biographical": {
"sex": "female",
"estate": "крестьянская дочь (peasant's daughter)",
"marital_status": "девица (unmarried)"
},
"family_relationships": {
"godson": "Иван",
"brother": "Пётр Иванович Сидоров"
},
"context": "Godparent of Ivan, an unmarried peasant's daughter from the village of Ivanovka."
}
],
"temporal_references": [
{
"expression": "за 1892 год",
"expression_romanized": "za 1892 god",
"normalized": "1892",
"calendar": "Julian"
},
{
"expression": "Марта 15 дня",
"expression_romanized": "Marta 15 dnya",
"normalized": "1892-03-15",
"calendar": "Julian"
},
{
"expression": "17 дня",
"expression_romanized": "17 dnya",
"normalized": "1892-03-17",
"calendar": "Julian"
}
],
"locations_mentioned": [
{
"name": "Троицкой церкви",
"name_romanized": "Troitskoy tserkvi",
"type": "church"
},
{
"name": "села Покровского",
"name_romanized": "sela Pokrovskogo",
"type": "village"
},
{
"name": "деревни Ивановки",
"name_romanized": "derevni Ivanovki",
"type": "village"
}
]
}

View file

@ -0,0 +1,192 @@
{
"pico_observation": {
"observation_id": "obs_1892_03_17_ivan_sidorov",
"source_type": "metrical_book",
"source_reference": "Метрическая книга Троицкой церкви села Покровского за 1892 год"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"full_name_cyrillic": "Иван",
"full_name_romanized": "Ivan",
"given_name_cyrillic": "Иван",
"given_name_romanized": "Ivan"
},
"roles": [
"newborn"
],
"biographical": {
"sex": "male",
"estate": null,
"religion": "Orthodox"
},
"family_relationships": {
"son_of": [
"Пётр Иванович Сидоров",
"Анна Фёдоровна"
]
},
"context": "Родился 15 марта, крещён 17 марта 1892 года. Сын крестьянина Петра Ивановича Сидорова и Анны Фёдоровны."
},
{
"person_index": 1,
"pnv_name": {
"full_name_cyrillic": "Пётр Иванович Сидоров",
"full_name_romanized": "Pyotr Ivanovich Sidorov",
"given_name_cyrillic": "Пётр",
"given_name_romanized": "Pyotr",
"patronymic_cyrillic": "Иванович",
"patronymic_romanized": "Ivanovich",
"surname_cyrillic": "Сидоров",
"surname_romanized": "Sidorov"
},
"roles": [
"parent",
"father"
],
"biographical": {
"sex": "male",
"estate": "крестьянин",
"religion": "Orthodox"
},
"family_relationships": {
"father_of": [
"Иван"
],
"husband_of": [
"Анна Фёдоровна"
]
},
"context": "Отец новорождённого Ивана. Крестьянин из деревни Ивановки."
},
{
"person_index": 2,
"pnv_name": {
"full_name_cyrillic": "Анна Фёдоровна",
"full_name_romanized": "Anna Fyodorovna",
"given_name_cyrillic": "Анна",
"given_name_romanized": "Anna",
"patronymic_cyrillic": "Фёдоровна",
"patronymic_romanized": "Fyodorovna"
},
"roles": [
"parent",
"mother"
],
"biographical": {
"sex": "female",
"estate": "крестьянка",
"religion": "Orthodox"
},
"family_relationships": {
"mother_of": [
"Иван"
],
"wife_of": [
"Пётр Иванович Сидоров"
]
},
"context": "Мать новорождённого Ивана. Законная жена крестьянина Петра Ивановича Сидорова."
},
{
"person_index": 3,
"pnv_name": {
"full_name_cyrillic": "Николай Петрович Кузнецов",
"full_name_romanized": "Nikolai Petrovich Kuznetsov",
"given_name_cyrillic": "Николай",
"given_name_romanized": "Nikolai",
"patronymic_cyrillic": "Петрович",
"patronymic_romanized": "Petrovich",
"surname_cyrillic": "Кузнецов",
"surname_romanized": "Kuznetsov"
},
"roles": [
"godparent",
"godfather"
],
"biographical": {
"sex": "male",
"estate": "крестьянин",
"religion": "Orthodox"
},
"family_relationships": {
"godparent_of": [
"Иван"
]
},
"context": "Восприемник (крёстный отец) Ивана. Крестьянин из той же деревни Ивановки."
},
{
"person_index": 4,
"pnv_name": {
"full_name_cyrillic": "Мария Ивановна Сидорова",
"full_name_romanized": "Maria Ivanovna Sidorova",
"given_name_cyrillic": "Мария",
"given_name_romanized": "Maria",
"patronymic_cyrillic": "Ивановна",
"patronymic_romanized": "Ivanovna",
"surname_cyrillic": "Сидорова",
"surname_romanized": "Sidorova"
},
"roles": [
"godparent",
"godmother"
],
"biographical": {
"sex": "female",
"estate": "крестьянская дочь девица",
"religion": "Orthodox"
},
"family_relationships": {
"godparent_of": [
"Иван"
]
},
"context": "Восприемница (крёстная мать) Ивана. Крестьянская дочь девица из той же деревни Ивановки."
}
],
"temporal_references": [
{
"expression": "Марта 15 дня",
"expression_romanized": "Marta 15 dnya",
"normalized": "1892-03-15",
"calendar": "Julian"
},
{
"expression": "Марта 15 дня",
"expression_romanized": "Marta 15 dnya",
"normalized": "1892-03-27",
"calendar": "Gregorian"
},
{
"expression": "17 дня",
"expression_romanized": "17 dnya",
"normalized": "1892-03-17",
"calendar": "Julian"
},
{
"expression": "17 дня",
"expression_romanized": "17 dnya",
"normalized": "1892-03-29",
"calendar": "Gregorian"
}
],
"locations_mentioned": [
{
"name": "село Покровское",
"name_romanized": "selo Pokrovskoye",
"type": "village"
},
{
"name": "Троицкая церковь",
"name_romanized": "Troitskaya tserkov",
"type": "church"
},
{
"name": "деревня Ивановка",
"name_romanized": "derevnya Ivanovka",
"type": "village"
}
]
}

View file

@ -0,0 +1,221 @@
{
"pico_observation": {
"observation_id": "baptism_mexico_1742-02-23_001",
"source_type": "baptismal_register",
"source_reference": "Ciudad de México, 23 de febrero de 1742. Bautismo de Juan José, hijo de Don Pedro García de la Cruz y Doña María Josefa de los Reyes."
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"given_name": "Antonio",
"surname": "de Mendoza"
},
"roles": [
"teniente de cura",
"priest"
],
"biographical": {
"honorifics": [
"Br.",
"Don"
]
},
"family_relationships": {},
"context": "El sacerdote que oficia el bautismo."
},
{
"person_index": 1,
"pnv_name": {
"given_name": "Juan José",
"surname": ""
},
"roles": [
"baptized_infant"
],
"biographical": {
"casta": "español",
"legitimacy": "legítimo"
},
"family_relationships": {
"parents": [
{
"person_index": 2,
"relationship": "father"
},
{
"person_index": 3,
"relationship": "mother"
}
],
"godparents": [
{
"person_index": 4,
"relationship": "godfather"
},
{
"person_index": 5,
"relationship": "godmother"
}
]
},
"context": "El infante bautizado."
},
{
"person_index": 2,
"pnv_name": {
"given_name": "Pedro",
"surname": "García de la Cruz"
},
"roles": [
"father"
],
"biographical": {
"honorifics": [
"Don"
],
"casta": "español",
"origin": {
"place": "villa de Puebla de los Ángeles",
"type": "natural"
}
},
"family_relationships": {
"children": [
{
"person_index": 1,
"relationship": "son"
}
],
"spouse": [
{
"person_index": 3,
"relationship": "wife"
}
]
},
"context": "Padre del bautizado."
},
{
"person_index": 3,
"pnv_name": {
"given_name": "María Josefa",
"surname": "de los Reyes"
},
"roles": [
"mother"
],
"biographical": {
"honorifics": [
"Doña"
],
"casta": "español",
"origin": {
"place": "esta ciudad",
"type": "natural"
}
},
"family_relationships": {
"children": [
{
"person_index": 1,
"relationship": "son"
}
],
"spouse": [
{
"person_index": 2,
"relationship": "husband"
}
]
},
"context": "Madre del bautizado."
},
{
"person_index": 4,
"pnv_name": {
"given_name": "Francisco Xavier",
"surname": "de Castañeda"
},
"roles": [
"godfather"
],
"biographical": {
"honorifics": [
"Don"
],
"casta": "español",
"origin": {
"place": "esta ciudad",
"type": "vecino"
}
},
"family_relationships": {
"spouse": [
{
"person_index": 5,
"relationship": "wife"
}
],
"godchild": [
{
"person_index": 1,
"relationship": "godson"
}
]
},
"context": "Padrino del bautizado."
},
{
"person_index": 5,
"pnv_name": {
"given_name": "Ana María",
"surname": "de la Encarnación"
},
"roles": [
"godmother"
],
"biographical": {
"honorifics": [
"Doña"
]
},
"family_relationships": {
"spouse": [
{
"person_index": 4,
"relationship": "husband"
}
],
"godchild": [
{
"person_index": 1,
"relationship": "godson"
}
]
},
"context": "Madrina del bautizado, esposa legítima del padrino."
}
],
"temporal_references": [
{
"expression": "a veinte y tres días del mes de febrero de mil setecientos cuarenta y dos años",
"normalized": "1742-02-23",
"calendar": "Gregorian"
}
],
"locations_mentioned": [
{
"name": "ciudad de México",
"type": "city"
},
{
"name": "esta santa iglesia catedral",
"type": "church"
},
{
"name": "villa de Puebla de los Ángeles",
"type": "villa"
}
]
}

View file

@ -0,0 +1,259 @@
{
"pico_observation": {
"observation_id": "baptism_mexico_1742-02-23_001",
"source_type": "baptismal_register",
"source_reference": "Ciudad de México, 23 de febrero de 1742. Bautismo de Juan José, hijo de Don Pedro García de la Cruz y Doña María Josefa de los Reyes."
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"given_name": "Antonio",
"surname": "de Mendoza"
},
"roles": [
"teniente de cura",
"priest"
],
"honorifics": [
"Br.",
"Don"
],
"biographical": {},
"family_relationships": {},
"context": "El sacerdote que oficia el bautismo."
},
{
"person_index": 1,
"pnv_name": {
"given_name": "Juan José",
"surname": ""
},
"roles": [
"baptized_infant"
],
"biographical": {
"casta": "español",
"legitimacy": "legítimo"
},
"family_relationships": {
"parents": [
{
"person_index": 2,
"relationship": "father"
},
{
"person_index": 3,
"relationship": "mother"
}
],
"godparents": [
{
"person_index": 4,
"relationship": "godfather"
},
{
"person_index": 5,
"relationship": "godmother"
}
]
},
"context": "El niño bautizado, hijo legítimo de Pedro García de la Cruz y María Josefa de los Reyes."
},
{
"person_index": 2,
"pnv_name": {
"given_name": "Pedro",
"surname": "García de la Cruz"
},
"roles": [
"father"
],
"honorifics": [
"Don"
],
"biographical": {
"casta": "español",
"origin": {
"type": "natural",
"place": "Puebla de los Ángeles"
}
},
"family_relationships": {
"children": [
{
"person_index": 1,
"relationship": "son"
}
],
"spouse": [
{
"person_index": 3,
"relationship": "wife"
}
],
"compadres": [
{
"person_index": 4,
"relationship": "godfather_of_his_child"
},
{
"person_index": 5,
"relationship": "godmother_of_his_child"
}
]
},
"context": "Padre del bautizado, natural de Puebla de los Ángeles."
},
{
"person_index": 3,
"pnv_name": {
"given_name": "María Josefa",
"surname": "de los Reyes"
},
"roles": [
"mother"
],
"honorifics": [
"Doña"
],
"biographical": {
"casta": "español",
"origin": {
"type": "natural",
"place": "Ciudad de México"
}
},
"family_relationships": {
"children": [
{
"person_index": 1,
"relationship": "son"
}
],
"spouse": [
{
"person_index": 2,
"relationship": "husband"
}
],
"comadres": [
{
"person_index": 4,
"relationship": "godfather_of_her_child"
},
{
"person_index": 5,
"relationship": "godmother_of_her_child"
}
]
},
"context": "Madre del bautizado, natural de la Ciudad de México."
},
{
"person_index": 4,
"pnv_name": {
"given_name": "Francisco Xavier",
"surname": "de Castañeda"
},
"roles": [
"godfather"
],
"honorifics": [
"Don"
],
"biographical": {
"casta": "español",
"origin": {
"type": "resident",
"place": "Ciudad de México"
}
},
"family_relationships": {
"godchild": [
{
"person_index": 1,
"relationship": "godson"
}
],
"spouse": [
{
"person_index": 5,
"relationship": "wife"
}
],
"compadres": [
{
"person_index": 2,
"relationship": "father_of_godchild"
},
{
"person_index": 3,
"relationship": "mother_of_godchild"
}
]
},
"context": "Padrino del bautizado, vecino de la Ciudad de México."
},
{
"person_index": 5,
"pnv_name": {
"given_name": "Ana María",
"surname": "de la Encarnación"
},
"roles": [
"godmother"
],
"honorifics": [
"Doña"
],
"biographical": {},
"family_relationships": {
"godchild": [
{
"person_index": 1,
"relationship": "godson"
}
],
"spouse": [
{
"person_index": 4,
"relationship": "husband"
}
],
"comadres": [
{
"person_index": 2,
"relationship": "father_of_godchild"
},
{
"person_index": 3,
"relationship": "mother_of_godchild"
}
]
},
"context": "Madrina del bautizado y esposa legítima del padrino."
}
],
"temporal_references": [
{
"expression": "a veinte y tres días del mes de febrero de mil setecientos cuarenta y dos años",
"normalized": "1742-02-23",
"calendar": "Gregorian"
}
],
"locations_mentioned": [
{
"name": "Ciudad de México",
"type": "city"
},
{
"name": "Puebla de los Ángeles",
"type": "villa"
},
{
"name": "esta santa iglesia catedral",
"type": "church"
}
]
}

View file

@ -88,6 +88,8 @@ enums:
description: Entry requires further enrichment processing
new_entry:
description: Newly added entry not yet enriched
google_maps_searched:
description: Google Maps search attempted but not yet fully enriched
InstitutionTypeCodeEnum:
description: Single-letter GLAMORCUBESFIXPHDNT type codes
@ -182,6 +184,10 @@ enums:
description: GeoNames geographic entity identifier
LinkedIn:
description: LinkedIn profile or company page
GHCID_PREVIOUS:
description: Previous GHCID before relocation or reorganization
OCLC:
description: OCLC (Online Computer Library Center) identifier
LocationResolutionMethodEnum:
description: Method used to resolve settlement location
@ -228,6 +234,8 @@ enums:
description: Verified through web search
CITY_NAME_LOOKUP:
description: Looked up city name directly
MANUAL_RESEARCH:
description: Manually researched and assigned location
GEONAMES_CITY_LOOKUP:
description: Looked up city in GeoNames database
NAME_EXTRACTION:
@ -252,8 +260,6 @@ enums:
description: Extracted location from institution name
GEONAMES_FUZZY:
description: Fuzzy matched in GeoNames
MANUAL_RESEARCH:
description: Manually researched location
WIKIDATA_ENRICHMENT:
description: Enriched from Wikidata
COORDINATE_LOOKUP:
@ -430,6 +436,9 @@ classes:
organisatie:
range: string
description: Organization name from source
organisation:
range: string
description: Organization name from source (British spelling variant)
isil_code_na:
range: string
description: ISIL code from Nationaal Archief
@ -650,10 +659,19 @@ classes:
range: string
description: Status of Wikidata enrichment for this entry
comment:
range: ReferenceLink
multivalued: true
any_of:
- range: string
- range: ReferenceLink
multivalued: true
inlined_as_list: true
description: Comments about this entry (array of objects with label field)
description: Comments about this entry (can be a string or array of objects with label field)
comments:
any_of:
- range: string
- range: ReferenceLink
multivalued: true
inlined_as_list: true
description: Comments about this entry (string or array of objects with label field)
succeeded_by:
range: ReferenceLink
multivalued: true
@ -666,6 +684,15 @@ classes:
label:
range: string
description: Name/label of the duplicate institution
entry_index:
range: integer
description: Index of the duplicate entry in source data
entry_file:
range: string
description: Filename of the duplicate entry
reason:
range: string
description: Reason why this is considered a duplicate
TimeEntry:
description: Structured time entry from source data
@ -676,10 +703,13 @@ classes:
- range: integer
description: Time label (date string or year)
type:
range: TimeEntryType
multivalued: true
any_of:
- range: string
multivalued: true
- range: TimeEntryType
multivalued: true
inlined_as_list: true
description: Type of time point (begin, end, etc.)
description: Type of time point (begin, end, etc.) - can be strings or TimeEntryType objects
TimeEntryType:
description: Type classification for time entry
@ -847,6 +877,11 @@ classes:
data_source:
range: string
description: Data source type (CSV_REGISTRY, API_SCRAPING, etc.)
data_sources:
range: string
multivalued: true
inlined_as_list: true
description: List of data sources (e.g., NDE registry, Google Maps, website)
data_tier:
range: DataTierEnum
description: Quality tier of the data
@ -856,6 +891,12 @@ classes:
extraction_method:
range: string
description: Method used to extract the data
enrichment_date:
range: string
description: When enrichment was performed (ISO date string)
enrichment_method:
range: string
description: Method used to enrich the data (e.g., website_research)
confidence_score:
range: float
description: Confidence score (0-1)
@ -889,6 +930,15 @@ classes:
wikidata_property:
range: string
description: Wikidata property ID (e.g., P856)
archive_location:
range: string
description: Location of archived copy (e.g., web/1186/hartebrug.nl)
claim_extracted_from:
range: string
description: Source path from which claim was extracted (e.g., original_entry.reference)
verified_via_web_archive:
range: boolean
description: Whether claim was verified via web archive
ProvenanceSources:
description: Sources organized by type
@ -938,6 +988,52 @@ classes:
multivalued: true
inlined_as_list: true
description: Nationaal Archief ISIL registry source records
whois_research:
range: SourceRecord
multivalued: true
inlined_as_list: true
description: WHOIS domain research source records
manual_research:
range: SourceRecord
multivalued: true
inlined_as_list: true
description: Manual research source records
website:
range: SourceRecord
multivalued: true
inlined_as_list: true
description: Website source records (institution website data)
web_scrape:
range: SourceRecord
multivalued: true
inlined_as_list: true
description: Web scrape source records (scraped website data)
# Data tier summary fields (for provenance summaries)
TIER_1_AUTHORITATIVE:
range: string
multivalued: true
inlined_as_list: true
description: List of TIER_1 authoritative sources
TIER_2_VERIFIED:
range: string
multivalued: true
inlined_as_list: true
description: List of TIER_2 verified sources
TIER_3_CROWD_SOURCED:
range: string
multivalued: true
inlined_as_list: true
description: List of TIER_3 crowd-sourced sources
TIER_4_INFERRED:
range: string
multivalued: true
inlined_as_list: true
description: List of TIER_4 inferred sources
museum_register:
range: SourceRecord
multivalued: true
inlined_as_list: true
description: Museum register source records
SourceRecord:
description: Individual source record with claims
@ -999,6 +1095,20 @@ classes:
source_file:
range: string
description: Source file name
research_date:
range: string
description: Date of research (YYYY-MM-DD format)
url:
range: uri
description: URL of the source (website URL, etc.)
data_extracted:
range: string
multivalued: true
inlined_as_list: true
description: List of data types/fields extracted from this source
merge_note:
range: string
description: Note about merge operations involving this source record
DataTierSummary:
description: Summary of data tiers present in entry
@ -1029,7 +1139,7 @@ classes:
attributes:
identifier_scheme:
range: IdentifierSchemeEnum
required: true
required: false
description: Type of identifier
identifier_value:
any_of:
@ -1051,6 +1161,14 @@ classes:
notes:
range: string
description: Additional note about this identifier (alias for note)
scheme:
range: string
description: Identifier scheme (alias for identifier_scheme, used in some data sources)
value:
any_of:
- range: string
- range: integer
description: Identifier value (alias for identifier_value, used in some data sources)
# ---------------------------------------------------------------------------
# GHCID BLOCK - Heritage Custodian ID with history
@ -1277,6 +1395,15 @@ classes:
resolution_notes:
range: string
description: Additional notes from location resolution process
specific_location:
range: string
description: More specific location info within the city (e.g., neighborhood, district)
specific_geonames_id:
range: integer
description: GeoNames ID for the specific location (if different from main city)
correction_note:
range: string
description: Note explaining any correction made to the location resolution
SourceCoordinates:
description: Source of coordinates for resolution
@ -1296,13 +1423,19 @@ classes:
attributes:
type:
range: string
description: Type of research source (e.g., note, wikidata, web_archive, official_source)
description: Type of research source (e.g., note, wikidata, web_archive, official_source, whois)
text:
range: string
description: Text or description of the research source
value:
range: string
description: Value from this source (e.g., plus code, address)
notes:
range: string
description: Additional notes about this source
note:
range: string
description: Additional note about this source (singular alias for notes)
id:
range: string
description: Identifier for the source (e.g., Wikidata Q-number)
@ -1315,6 +1448,56 @@ classes:
coordinates:
range: string
description: Coordinates from this source (e.g., "31.515, 34.434")
data:
range: ResearchSourceData
description: Structured data from the source (e.g., WHOIS registrant info)
# ResearchSourceData is the range of the `data` attribute on a research source
# entry (see the `data: range: ResearchSourceData` slot in this schema).
# Every attribute below is a plain string and none is marked `required`.
ResearchSourceData:
description: Structured data from a research source
attributes:
# --- WHOIS registrant fields ---
registrant_name:
range: string
description: WHOIS registrant name
registrant_address:
range: string
description: WHOIS registrant address
registrant_city:
range: string
description: WHOIS registrant city
registrant_state:
range: string
description: WHOIS registrant state/province
registrant_country:
range: string
description: WHOIS registrant country
registrant_postal_code:
range: string
description: WHOIS registrant postal code
# Additional flexible fields for other data types
organization:
range: string
description: Organization name
email:
range: string
description: Contact email
phone:
range: string
description: Contact phone
# --- Domain lifecycle dates (stored as strings, not date/datetime) ---
creation_date:
range: string
description: Domain creation date
updated_date:
range: string
description: Domain updated date
expiration_date:
range: string
description: Domain expiration date
# NOTE(review): overlaps in meaning with creation_date; presumably kept
# because some sources use this key instead - confirm.
domain_registered:
range: string
description: Domain registration date
# NOTE(review): the key is `registry` but the description says "registrar";
# confirm which of the two the source data actually carries.
registry:
range: string
description: Domain registrar name
# ---------------------------------------------------------------------------
# GOOGLE MAPS ENRICHMENT
@ -1459,21 +1642,28 @@ classes:
- range: HoursStatus
description: Opening hours information (string or status object)
admission:
range: string
description: Admission price information
any_of:
- range: string
- range: AdmissionInfo
description: Admission price information (string or structured object)
related_places:
range: string
multivalued: true
any_of:
- range: string
multivalued: true
- range: RelatedPlace
multivalued: true
inlined_as_list: true
description: Related places nearby
description: Related places nearby (strings or structured objects)
review_topics:
range: string
multivalued: true
inlined_as_list: true
description: Topics mentioned in reviews
reviews_summary:
range: string
description: Summary of reviews
any_of:
- range: string
- range: ReviewsSummary
description: Summary of reviews (string or structured breakdown)
sample_reviews:
any_of:
- range: string
@ -1510,10 +1700,13 @@ classes:
inlined_as_list: true
description: Nearby organizations (strings or structured objects)
features:
range: string
multivalued: true
any_of:
- range: string
multivalued: true
- range: PlaceFeature
multivalued: true
inlined_as_list: true
description: Features of the place
description: Features of the place (strings or key-value objects)
hours_status:
range: string
description: Current opening status (e.g., "Closed · Opens 2 pm Wed")
@ -1590,6 +1783,23 @@ classes:
match_notes:
range: string
description: Notes about how the Google Maps match was determined
price_level:
any_of:
- range: integer
- range: string
description: Google Maps price level (0-4 or string description)
match_warning:
range: string
description: Warning about potential issues with the match
location_note:
range: string
description: Note about the physical location of the place
search_attempted:
range: boolean
description: Whether a Google Maps search was attempted
result:
range: string
description: Result of search operation (found, not_found, found_via_user_link, etc.)
RejectedGoogleMapsData:
description: Rejected Google Maps data preserved for audit trail
@ -1612,6 +1822,53 @@ classes:
returned_country:
range: string
description: Country code actually returned by Google Maps
website:
range: uri
description: Website URL from Google Maps
latitude:
range: float
description: Latitude coordinate
longitude:
range: float
description: Longitude coordinate
enriched_at:
range: datetime
description: When enrichment was performed
# PlaceFeature is one of the two alternative ranges of the `features` slot
# (plain strings or PlaceFeature objects). It is a bag of boolean flags; no
# flag is marked `required`, so an object may carry any subset of them.
PlaceFeature:
description: A feature flag for a place (e.g., native_garden, shop, volunteers)
class_uri: schema:PropertyValue
attributes:
native_garden:
range: boolean
description: Has a native garden
shop:
range: boolean
description: Has a shop
volunteers:
range: boolean
description: Has volunteers
parking:
range: boolean
description: Has parking
cafe:
range: boolean
description: Has a cafe
restaurant:
range: boolean
description: Has a restaurant
# Declared separately from `shop`; both flags exist side by side.
gift_shop:
range: boolean
description: Has a gift shop
wheelchair_accessible:
range: boolean
description: Is wheelchair accessible
guided_tours:
range: boolean
description: Offers guided tours
audio_guide:
range: boolean
description: Offers audio guides
LlmVerification:
description: LLM-based verification results for Google Maps matching
@ -1696,6 +1953,25 @@ classes:
minute:
range: integer
# ReviewsSummary is the structured alternative for the `reviews_summary` slot
# (which also accepts a plain string). It holds one integer count per star
# rating, from 5 down to 1; none of the counts is marked `required`.
# NOTE(review): the attribute names start with a digit - confirm that every
# downstream generator/consumer of this schema accepts such names.
ReviewsSummary:
description: Breakdown of reviews by star rating
attributes:
5_star:
range: integer
description: Number of 5-star reviews
4_star:
range: integer
description: Number of 4-star reviews
3_star:
range: integer
description: Number of 3-star reviews
2_star:
range: integer
description: Number of 2-star reviews
1_star:
range: integer
description: Number of 1-star reviews
GoogleReview:
description: Google Maps review
attributes:
@ -1722,6 +1998,16 @@ classes:
range: string
description: Alias for relative_time_description (review date)
# AdmissionInfo is the structured alternative for the `admission` slot (which
# also accepts a plain string). Both attributes are optional strings.
AdmissionInfo:
description: Structured admission price information from Google Maps
attributes:
# Kept as free text (currency symbol included), not as a number.
price:
range: string
description: Admission price (e.g., "€9.00")
notes:
range: string
description: Additional notes about admission (e.g., "Additional fees might apply")
PhotoMetadata:
description: Google Maps photo metadata
attributes:
@ -1805,8 +2091,10 @@ classes:
wikidata_temporal:
range: WikidataTemporal
wikidata_inception:
range: string
description: Inception date (P571)
any_of:
- range: string
- range: WikidataTimeValue
description: Inception date (P571) - can be string or structured time value
wikidata_classification:
range: WikidataClassification
wikidata_instance_of:
@ -1923,6 +2211,29 @@ classes:
multivalued: true
inlined_as_list: true
description: Search terms attempted when looking for Wikidata entity
wikidata_description_nl:
range: string
description: Description in Dutch language
wikidata_claims:
range: WikidataClaims
description: Structured Wikidata claims with property metadata
inlined: true
_resolved_entities:
range: WikidataResolvedEntities
description: Resolved Wikidata property and entity metadata cache
inlined: true
# WikidataClaims is the range of the inlined `wikidata_claims` slot.
# It declares no attributes; `class_uri: linkml:Any` marks it as an
# unconstrained ("any") type so arbitrary claim structures are accepted.
WikidataClaims:
description: |
Structured Wikidata claims with property metadata and values.
Uses flexible dict-like structure for various claim types.
class_uri: linkml:Any
# WikidataResolvedEntities is the range of the inlined `_resolved_entities`
# slot. It declares no attributes; `class_uri: linkml:Any` marks it as an
# unconstrained ("any") type, so the P-id keyed map is not validated further.
WikidataResolvedEntities:
description: |
Cache of resolved Wikidata property and entity metadata.
Keys are property IDs (P123), values are property metadata.
class_uri: linkml:Any
WikidataApiMetadata:
description: API call metadata
@ -2035,6 +2346,19 @@ classes:
inlined_as_list: true
description: Main subject (P921)
# WikidataTimeValue is the structured alternative for `wikidata_inception`
# (which also accepts a plain string). All three attributes are optional.
WikidataTimeValue:
description: Wikidata time value with precision metadata
attributes:
# NOTE(review): the example below has a zero month and day, which strict
# ISO 8601 parsers reject; presumably the value is stored verbatim as
# Wikidata emits it, hence `range: string` - confirm before parsing it.
time:
range: string
description: Time value in ISO 8601 format (e.g., +2015-00-00T00:00:00Z)
precision:
range: integer
description: Precision level (9=year, 10=month, 11=day, etc.)
calendarmodel:
range: uri
description: Calendar model URI (e.g., http://www.wikidata.org/entity/Q1985727 for Gregorian)
WikidataEntity:
description: Reference to a Wikidata entity
attributes:
@ -2081,7 +2405,10 @@ classes:
description: Location properties from Wikidata
attributes:
country:
range: WikidataEntity
any_of:
- range: string
- range: WikidataEntity
description: Country Q-ID (can be string or WikidataEntity object)
headquarters_location:
range: WikidataEntity
coordinates:
@ -2135,8 +2462,10 @@ classes:
multivalued: true
inlined_as_list: true
parent_organization:
range: WikidataEntity
description: Parent organization (P749)
any_of:
- range: string
- range: WikidataEntity
description: Parent organization Q-ID or entity (P749)
subsidiary:
range: WikidataEntity
multivalued: true
@ -2410,12 +2739,54 @@ classes:
website_found:
range: boolean
description: Whether a website was found
official_website:
range: uri
description: Official website URL found during research
research_notes:
range: string
description: Notes from research
organizational_change:
range: OrganizationalChange
description: Organizational change information (closures, mergers, etc.)
# WHOIS/domain information fields
domain:
range: string
description: Domain name of the website
domain_registered:
range: string
description: Date domain was registered (YYYY-MM-DD)
registrar:
range: string
description: Domain registrar name
registration_country:
range: string
description: Country where domain is registered (ISO 3166-1 alpha-2)
site_launched:
range: string
description: Year or date when site was launched
collections:
range: WebCollection
multivalued: true
inlined_as_list: true
description: Collections documented on the website
# Duplicate/canonical entry tracking
is_canonical_entry:
range: boolean
description: Whether this is the canonical entry (vs duplicate)
duplicate_entries:
range: DuplicateEntry
multivalued: true
inlined_as_list: true
description: References to duplicate entries of this institution
organization_status:
range: string
description: Current status of the organization (ACTIVE, CLOSED, etc.)
research_timestamp:
range: datetime
description: When research was performed
website:
range: uri
description: Website URL found during research
# Migration tracking fields
claims_migrated:
range: boolean
@ -2442,6 +2813,12 @@ classes:
merger_target:
range: string
description: Target organization in merger
successor_name:
range: string
description: Name of successor organization (for mergers)
successor_location:
range: string
description: Location of successor organization (for mergers)
notes:
range: string
description: Additional notes
@ -2474,6 +2851,33 @@ classes:
range: string
description: Archive status (new, updated, etc.)
# WebCollection is the element type of the multivalued `collections` slot
# (inlined as a list). Only `name` is required.
WebCollection:
description: A collection documented on a heritage institution website
attributes:
name:
range: string
required: true
description: Name of the collection
description:
range: string
description: Description of the collection
url:
range: uri
description: URL to the collection page
type:
range: string
description: Type of collection (oral_history, photographs, documents, etc.)
# Integer when an exact count is known, otherwise a descriptive string.
item_count:
any_of:
- range: integer
- range: string
description: Number of items in the collection (integer or descriptive string)
# Float or string, mirroring item_count's numeric-or-text pattern.
total_hours:
any_of:
- range: float
- range: string
description: Total hours of content (for audio/video collections)
WebArchiveFailure:
description: Failed archive attempt record
attributes:
@ -2603,7 +3007,8 @@ classes:
- range: string
- range: string
multivalued: true
description: Extracted value (alias for claim_value, can be string or list)
- range: OpeningHoursMap
description: Extracted value (alias for claim_value, can be string, list, or structured object like opening hours)
raw_value:
range: string
description: Raw value before processing
@ -2728,6 +3133,9 @@ classes:
job_title_en:
range: string
description: Job title in English
department_en:
range: string
description: Department name in English
RawSource:
description: Raw source information for web enrichment
@ -2741,6 +3149,9 @@ classes:
fetch_timestamp:
range: datetime
description: When the source was fetched
published_date:
range: datetime
description: When the source content was published
source_type:
range: string
description: Type of source (official_website, etc.)
@ -2756,6 +3167,63 @@ classes:
raw_markdown_hash:
range: string
description: SHA-256 hash of the raw markdown content
exa_highlights:
range: string
multivalued: true
inlined_as_list: true
description: Highlighted excerpts from Exa search results
exa_highlight_scores:
range: float
multivalued: true
inlined_as_list: true
description: Relevance scores for Exa highlights
# OpeningHoursMap is one of the alternative ranges for an extracted claim value
# (see the `- range: OpeningHoursMap` entry in this schema). It is keyed by
# weekday name and accepts both Dutch and English day names; every value is a
# free-text string and no day is marked `required`.
OpeningHoursMap:
description: Opening hours as a day-keyed map
class_uri: schema:OpeningHoursSpecification
attributes:
# --- Dutch day names ---
maandag:
range: string
description: Monday hours (Dutch)
dinsdag:
range: string
description: Tuesday hours (Dutch)
woensdag:
range: string
description: Wednesday hours (Dutch)
donderdag:
range: string
description: Thursday hours (Dutch)
vrijdag:
range: string
description: Friday hours (Dutch)
zaterdag:
range: string
description: Saturday hours (Dutch)
zondag:
range: string
description: Sunday hours (Dutch)
# --- English day names ---
monday:
range: string
description: Monday hours (English)
tuesday:
range: string
description: Tuesday hours (English)
wednesday:
range: string
description: Wednesday hours (English)
thursday:
range: string
description: Thursday hours (English)
friday:
range: string
description: Friday hours (English)
saturday:
range: string
description: Saturday hours (English)
sunday:
range: string
description: Sunday hours (English)
SourceReference:
description: Structured source reference for a claim
@ -2879,8 +3347,12 @@ classes:
range: string
description: Note explaining manual correction made to the name
merge_notes:
range: string
description: Notes about name merging or deduplication
any_of:
- range: string
- range: MergeNote
multivalued: true
inlined_as_list: true
description: Notes about name merging or deduplication (string or array of structured objects)
abbreviation:
range: string
description: Short form or abbreviation of the name
@ -2891,10 +3363,49 @@ classes:
range: string
description: Official registered name
former_names:
range: string
multivalued: true
any_of:
- range: string
multivalued: true
- range: FormerName
multivalued: true
inlined_as_list: true
description: Previous names the institution was known by
description: Previous names the institution was known by (strings or structured objects)
short_name:
range: string
description: Short name or commonly used abbreviated form of the institution name
# FormerName is the structured alternative for entries of the `former_names`
# slot (which also accepts plain strings). Only `name` is required.
FormerName:
description: A former name of the institution with optional metadata
attributes:
name:
range: string
required: true
description: The former name
abbreviated:
range: string
description: Abbreviated form of the former name
# Both date bounds are strings (YYYY-MM or YYYY), not typed dates, so
# partial dates can be recorded.
used_until:
range: string
description: Date until which this name was used (YYYY-MM or YYYY)
used_from:
range: string
description: Date from which this name was used (YYYY-MM or YYYY)
notes:
range: string
description: Additional notes about this former name
# MergeNote is the structured alternative for the `merge_notes` slot (which
# also accepts a plain string). All attributes are optional strings.
MergeNote:
description: Note about a merge operation between duplicate entries
attributes:
source:
range: string
description: Source entry identifier that was merged
# Stored as a string in YYYY-MM-DD form rather than as a typed date.
merged_on:
range: string
description: Date when merge occurred (YYYY-MM-DD)
reason:
range: string
description: Reason for the merge (e.g., duplicate Wikidata ID, same place ID)
MatchingSource:
description: Source that contributed to name consensus
@ -2910,6 +3421,9 @@ classes:
score:
range: float
description: Match score
notes:
range: string
description: Additional notes about this source match
AlternativeName:
description: Alternative name with language and source information
@ -3168,14 +3682,54 @@ classes:
source:
range: string
description: Source of this platform information
source_references:
description:
range: string
multivalued: true
description: Description of this platform
source_references:
any_of:
- range: string
multivalued: true
- range: PlatformSourceReference
multivalued: true
inlined_as_list: true
description: References to source data
description: References to source data (strings or structured objects)
enrichment_source:
range: string
description: Source of enrichment (e.g., manual_curation, api_scraping)
host_organization:
range: string
description: Organization hosting this platform
host_website:
range: uri
description: Main website of the host organization
language:
range: string
description: Primary language of the platform (ISO 639-1 code)
features:
range: string
multivalued: true
inlined_as_list: true
description: Features of this platform
platforms:
range: string
multivalued: true
inlined_as_list: true
description: Sub-platforms or related platforms
# PlatformSourceReference is the structured alternative for entries of a
# digital platform's `source_references` slot (which also accepts plain
# strings). No attribute is marked `required`.
PlatformSourceReference:
description: Structured source reference for a digital platform
attributes:
url:
range: uri
description: Source URL
fetch_timestamp:
range: datetime
description: When the source was fetched
# List of field names (strings), not the extracted values themselves,
# per the description below.
data_extracted:
range: string
multivalued: true
inlined_as_list: true
description: Data fields extracted from this source
# ---------------------------------------------------------------------------
# UNESCO ICH ENRICHMENT
@ -3336,6 +3890,12 @@ classes:
override_reason:
range: string
description: Reason for manual coordinate override
source_url:
range: uri
description: URL source of coordinates (e.g., Google Maps link)
note:
range: string
description: Additional note about coordinate provenance
# ---------------------------------------------------------------------------
# ADDITIONAL ENRICHMENT TYPES
@ -3444,9 +4004,16 @@ classes:
range: float
review_count:
range: integer
description: Number of reviews
reviews:
range: integer
description: Number of reviews (alias for review_count)
place_type:
range: string
description: Type of place (Museum, Cafe, etc.)
type:
range: string
description: Type of place (alias for place_type)
MuseumRegisterEnrichment:
description: Dutch Museum Register (Museumregister Nederland) data

View file

@ -1,5 +1,5 @@
{
"generated": "2025-12-08T17:42:08.000Z",
"generated": "2025-12-12T16:08:52.770Z",
"version": "1.0.0",
"categories": [
{
@ -247,6 +247,11 @@
"path": "modules/classes/ConfidenceMeasure.yaml",
"category": "classes"
},
{
"name": "ConflictStatus",
"path": "modules/classes/ConflictStatus.yaml",
"category": "classes"
},
{
"name": "ConservationLab",
"path": "modules/classes/ConservationLab.yaml",
@ -452,6 +457,16 @@
"path": "modules/classes/FinancialStatement.yaml",
"category": "classes"
},
{
"name": "FindingAid",
"path": "modules/classes/FindingAid.yaml",
"category": "classes"
},
{
"name": "FindingAidType",
"path": "modules/classes/FindingAidType.yaml",
"category": "classes"
},
{
"name": "Foremalarkiv",
"path": "modules/classes/Foremalarkiv.yaml",
@ -487,11 +502,6 @@
"path": "modules/classes/Fylkesarkiv.yaml",
"category": "classes"
},
{
"name": "GLAM",
"path": "modules/classes/GLAM.yaml",
"category": "classes"
},
{
"name": "GalleryType",
"path": "modules/classes/GalleryType.yaml",
@ -507,6 +517,11 @@
"path": "modules/classes/GiftShop.yaml",
"category": "classes"
},
{
"name": "GLAM",
"path": "modules/classes/GLAM.yaml",
"category": "classes"
},
{
"name": "GovernmentArchive",
"path": "modules/classes/GovernmentArchive.yaml",
@ -518,13 +533,13 @@
"category": "classes"
},
{
"name": "HistoricBuilding",
"path": "modules/classes/HistoricBuilding.yaml",
"name": "HistoricalArchive",
"path": "modules/classes/HistoricalArchive.yaml",
"category": "classes"
},
{
"name": "HistoricalArchive",
"path": "modules/classes/HistoricalArchive.yaml",
"name": "HistoricBuilding",
"path": "modules/classes/HistoricBuilding.yaml",
"category": "classes"
},
{
@ -607,11 +622,6 @@
"path": "modules/classes/Kustodie.yaml",
"category": "classes"
},
{
"name": "LGBTArchive",
"path": "modules/classes/LGBTArchive.yaml",
"category": "classes"
},
{
"name": "Landsarkiv",
"path": "modules/classes/Landsarkiv.yaml",
@ -642,6 +652,11 @@
"path": "modules/classes/LegalResponsibilityCollection.yaml",
"category": "classes"
},
{
"name": "LGBTArchive",
"path": "modules/classes/LGBTArchive.yaml",
"category": "classes"
},
{
"name": "LibraryType",
"path": "modules/classes/LibraryType.yaml",
@ -787,11 +802,6 @@
"path": "modules/classes/Organization.yaml",
"category": "classes"
},
{
"name": "OrganizationBranch",
"path": "modules/classes/OrganizationBranch.yaml",
"category": "classes"
},
{
"name": "OrganizationalChangeEvent",
"path": "modules/classes/OrganizationalChangeEvent.yaml",
@ -807,6 +817,11 @@
"path": "modules/classes/OrganizationalSubdivision.yaml",
"category": "classes"
},
{
"name": "OrganizationBranch",
"path": "modules/classes/OrganizationBranch.yaml",
"category": "classes"
},
{
"name": "OutdoorSite",
"path": "modules/classes/OutdoorSite.yaml",
@ -837,16 +852,6 @@
"path": "modules/classes/PerformingArtsArchive.yaml",
"category": "classes"
},
{
"name": "PersonObservation",
"path": "modules/classes/PersonObservation.yaml",
"category": "classes"
},
{
"name": "PersonOrOrganization",
"path": "modules/classes/PersonOrOrganization.yaml",
"category": "classes"
},
{
"name": "PersonalCollectionType",
"path": "modules/classes/PersonalCollectionType.yaml",
@ -862,6 +867,16 @@
"path": "modules/classes/Personenstandsarchiv.yaml",
"category": "classes"
},
{
"name": "PersonObservation",
"path": "modules/classes/PersonObservation.yaml",
"category": "classes"
},
{
"name": "PersonOrOrganization",
"path": "modules/classes/PersonOrOrganization.yaml",
"category": "classes"
},
{
"name": "PhotoArchive",
"path": "modules/classes/PhotoArchive.yaml",
@ -1323,6 +1338,11 @@
"path": "modules/enums/CommercialCustodianTypeEnum.yaml",
"category": "enums"
},
{
"name": "ConflictStatusEnum",
"path": "modules/enums/ConflictStatusEnum.yaml",
"category": "enums"
},
{
"name": "CustodianPrimaryTypeEnum",
"path": "modules/enums/CustodianPrimaryTypeEnum.yaml",
@ -1428,11 +1448,6 @@
"path": "modules/enums/OfficialInstitutionTypeEnum.yaml",
"category": "enums"
},
{
"name": "OrganizationBranchTypeEnum",
"path": "modules/enums/OrganizationBranchTypeEnum.yaml",
"category": "enums"
},
{
"name": "OrganizationalChangeEventTypeEnum",
"path": "modules/enums/OrganizationalChangeEventTypeEnum.yaml",
@ -1443,6 +1458,11 @@
"path": "modules/enums/OrganizationalUnitTypeEnum.yaml",
"category": "enums"
},
{
"name": "OrganizationBranchTypeEnum",
"path": "modules/enums/OrganizationBranchTypeEnum.yaml",
"category": "enums"
},
{
"name": "PersonalCollectionTypeEnum",
"path": "modules/enums/PersonalCollectionTypeEnum.yaml",
@ -1744,6 +1764,11 @@
"path": "modules/slots/confidence_value.yaml",
"category": "slots"
},
{
"name": "conflict_status",
"path": "modules/slots/conflict_status.yaml",
"category": "slots"
},
{
"name": "contact",
"path": "modules/slots/contact.yaml",
@ -1824,6 +1849,11 @@
"path": "modules/slots/digital_platform.yaml",
"category": "slots"
},
{
"name": "digitization_status",
"path": "modules/slots/digitization_status.yaml",
"category": "slots"
},
{
"name": "dissolution_date",
"path": "modules/slots/dissolution_date.yaml",
@ -1839,6 +1869,11 @@
"path": "modules/slots/documentation_url.yaml",
"category": "slots"
},
{
"name": "embargo_end_date",
"path": "modules/slots/embargo_end_date.yaml",
"category": "slots"
},
{
"name": "emic_name",
"path": "modules/slots/emic_name.yaml",
@ -2229,6 +2264,11 @@
"path": "modules/slots/parent_custodian.yaml",
"category": "slots"
},
{
"name": "parent_society",
"path": "modules/slots/parent_society.yaml",
"category": "slots"
},
{
"name": "parent_unit",
"path": "modules/slots/parent_unit.yaml",
@ -2279,6 +2319,16 @@
"path": "modules/slots/platform_type.yaml",
"category": "slots"
},
{
"name": "policy_id",
"path": "modules/slots/policy_id.yaml",
"category": "slots"
},
{
"name": "policy_name",
"path": "modules/slots/policy_name.yaml",
"category": "slots"
},
{
"name": "portal_data_sources",
"path": "modules/slots/portal_data_sources.yaml",
@ -2359,6 +2409,11 @@
"path": "modules/slots/retrieved_on.yaml",
"category": "slots"
},
{
"name": "rico_equivalent",
"path": "modules/slots/rico_equivalent.yaml",
"category": "slots"
},
{
"name": "role_end_date",
"path": "modules/slots/role_end_date.yaml",
@ -2384,6 +2439,16 @@
"path": "modules/slots/safeguards.yaml",
"category": "slots"
},
{
"name": "security_level",
"path": "modules/slots/security_level.yaml",
"category": "slots"
},
{
"name": "serves_finding_aids",
"path": "modules/slots/serves_finding_aids.yaml",
"category": "slots"
},
{
"name": "service_area",
"path": "modules/slots/service_area.yaml",
@ -2504,6 +2569,11 @@
"path": "modules/slots/time_of_destruction.yaml",
"category": "slots"
},
{
"name": "typical_domains",
"path": "modules/slots/typical_domains.yaml",
"category": "slots"
},
{
"name": "unit_affiliation",
"path": "modules/slots/unit_affiliation.yaml",
@ -2568,6 +2638,11 @@
"name": "website",
"path": "modules/slots/website.yaml",
"category": "slots"
},
{
"name": "wikidata_id",
"path": "modules/slots/wikidata_id.yaml",
"category": "slots"
}
]
}

View file

@ -16,6 +16,7 @@
import React, { useState, useRef, useCallback, useEffect, memo } from 'react';
import type { EmbeddingPoint } from './EmbeddingProjector';
import { isTargetInsideAny } from '../../utils/dom';
import './PointDetailsPanel.css';
interface NearestNeighbor {
@ -169,9 +170,7 @@ const PointDetailsPanelComponent: React.FC<PointDetailsPanelProps> = ({
// Drag handlers
const handleMouseDown = useCallback((e: React.MouseEvent) => {
// Don't start drag if clicking on buttons, inputs, or neighbor items
if ((e.target as HTMLElement).closest('button') ||
(e.target as HTMLElement).closest('input') ||
(e.target as HTMLElement).closest('.neighbor-item')) {
if (isTargetInsideAny(e.target, ['button', 'input', '.neighbor-item'])) {
return;
}

View file

@ -23,13 +23,14 @@
}
/* Ensure content area can grow but footer stays visible */
.layout-content > *:not(.layout-footer) {
.layout-content > *:not(.layout-footer):not(.layout-main) {
flex-shrink: 0;
}
/* Main content wrapper - takes available space */
.layout-main {
flex: 1 0 auto; /* Grow to fill space, don't shrink, auto basis */
min-height: min-content; /* At least as tall as content */
}
/* Footer Styles - minimal, at the very bottom */

View file

@ -10,7 +10,7 @@ import { useLanguage } from '../../contexts/LanguageContext';
import './Layout.css';
// Pages that handle their own footer (full-screen apps with sidebars)
const PAGES_WITH_CUSTOM_FOOTER = ['/map', '/visualize', '/query-builder', '/linkml', '/ontology'];
const PAGES_WITH_CUSTOM_FOOTER = ['/map', '/visualize', '/query-builder', '/linkml', '/ontology', '/conversation'];
export function Layout() {
const currentYear = new Date().getFullYear();

View file

@ -505,6 +505,18 @@
font-weight: 500;
}
/* External Link Icon: small, slightly faded marker next to a link label */
.nav-external-icon {
  opacity: 0.7;
  font-size: 0.75em;
  margin-left: 0.35rem;
}

/* Bring the icon to full opacity while its parent nav link is hovered */
.nav-mobile-link:hover .nav-external-icon,
.nav-dropdown-item:hover .nav-external-icon {
  opacity: 1;
}
/* Mobile Section Styles */
.nav-mobile-section {
border-bottom: 1px solid rgba(23, 42, 89, 0.1);

View file

@ -9,6 +9,7 @@ import { Link, useLocation, useNavigate } from 'react-router-dom';
import { useAuth } from '../../contexts/AuthContext';
import { useLanguage, translations } from '../../contexts/LanguageContext';
import { useUIState } from '../../contexts/UIStateContext';
import { isTargetInsideAny } from '../../utils/dom';
import './Navigation.css';
export function Navigation() {
@ -41,14 +42,13 @@ export function Navigation() {
const WHEEL_RESET_DELAY = 300; // Reset wheel accumulator after this many ms of no wheel events
const handleScroll = (e: Event) => {
const target = e.target as Element;
// Ignore scroll events from the navigation itself
if (target.closest('.navigation') || target.closest('.nav-mobile-menu')) {
if (isTargetInsideAny(e.target, ['.navigation', '.nav-mobile-menu'])) {
return;
}
// Get scroll position from the target element
const target = e.target as Element;
const scrollTop = target instanceof HTMLElement ? target.scrollTop : 0;
const scrollingUp = scrollTop < lastScrollTop;
const scrollDelta = lastScrollTop - scrollTop;
@ -78,10 +78,8 @@ export function Navigation() {
// Wheel event handler for non-scrollable areas
const handleWheel = (e: WheelEvent) => {
const target = e.target as Element;
// Ignore wheel events from the navigation itself
if (target.closest('.navigation') || target.closest('.nav-mobile-menu')) {
if (isTargetInsideAny(e.target, ['.navigation', '.nav-mobile-menu'])) {
return;
}
@ -381,6 +379,15 @@ export function Navigation() {
<Link to="/settings" className={`nav-dropdown-item ${isActive('/settings') ? 'active' : ''}`}>
{t('preferences')}
</Link>
<a
href="https://bronhouder.nl/database"
className="nav-dropdown-item"
target="_blank"
rel="noopener noreferrer"
>
{t('database')}
<span className="nav-external-icon" aria-hidden="true"></span>
</a>
</div>
)}
</div>
@ -500,6 +507,15 @@ export function Navigation() {
<Link to="/settings" className={`nav-mobile-link ${isActive('/settings') ? 'active' : ''}`}>
{t('preferences')}
</Link>
<a
href="https://bronhouder.nl/database"
className="nav-mobile-link"
target="_blank"
rel="noopener noreferrer"
>
{t('database')}
<span className="nav-external-icon" aria-hidden="true"></span>
</a>
</div>
</div>

View file

@ -24,6 +24,7 @@ import { CustodianTimeline } from './CustodianTimeline';
// import { VoronoiStippling } from './VoronoiStippling';
import { ErrorBoundary } from '../common/ErrorBoundary';
import { safeString } from '../../utils/safeString';
import { isTargetInsideAny } from '../../utils/dom';
import { useWikidataImage } from '../../hooks/useWikidataImage';
import type { Archive } from '../../types/werkgebied';
@ -428,6 +429,10 @@ const InstitutionInfoPanelComponent: React.FC<InstitutionInfoPanelProps> = ({
// Tab state for info/youtube
const [activeTab, setActiveTab] = useState<'info' | 'youtube'>('info');
// Export dropdown state
const [showExportMenu, setShowExportMenu] = useState(false);
const exportMenuRef = useRef<HTMLDivElement>(null);
// Track if user has manually positioned this panel
const hasUserPositioned = useRef(false);
@ -474,8 +479,7 @@ const InstitutionInfoPanelComponent: React.FC<InstitutionInfoPanelProps> = ({
// Drag handlers
const handleMouseDown = useCallback((e: React.MouseEvent) => {
// Don't start drag if clicking on buttons or links
if ((e.target as HTMLElement).closest('button') ||
(e.target as HTMLElement).closest('a')) {
if (isTargetInsideAny(e.target, ['button', 'a'])) {
return;
}
@ -551,6 +555,19 @@ const InstitutionInfoPanelComponent: React.FC<InstitutionInfoPanelProps> = ({
return () => window.removeEventListener('keydown', handleKeyDown);
}, [onClose]);
// Dismiss the export dropdown when a mousedown lands outside of it.
// The document listener is attached only while the menu is open; the cleanup
// always detaches it (a no-op when nothing was attached).
useEffect(() => {
  const dismissOnOutsidePress = (event: MouseEvent) => {
    const menuEl = exportMenuRef.current;
    // Nothing to do when the menu is not mounted or the press was inside it.
    if (!menuEl || menuEl.contains(event.target as Node)) {
      return;
    }
    setShowExportMenu(false);
  };

  if (showExportMenu) {
    document.addEventListener('mousedown', dismissOnOutsidePress);
  }

  return () => {
    document.removeEventListener('mousedown', dismissOnOutsidePress);
  };
}, [showExportMenu]);
// GHCID click handler - cycle through displays
const handleGhcidClick = () => {
if (ghcidDisplay === 'current') {

View file

@ -762,6 +762,36 @@ const MediaGalleryComponent: React.FC<MediaGalleryProps> = ({
const [youtubeReady, setYoutubeReady] = useState(false);
const [failedPhotoUrls, setFailedPhotoUrls] = useState<Set<string>>(new Set());
/**
 * Map the URL of a photo that failed to load to a user-facing explanation.
 *
 * The checks run in order and the first match wins:
 *   1. Google Places/Maps image hosts
 *   2. URLs that do not start with an http(s) scheme
 *   3. YouTube video links that ended up in an image field
 *   4. anything else -> generic "could not be loaded" message
 *
 * All pattern checks use a lower-cased copy of the URL. That includes the
 * scheme check: URI schemes are case-insensitive, so `HTTPS://…` must not be
 * reported as an invalid URL.
 *
 * Used to fill `failReason` for photos marked as failed.
 *
 * @param url - URL of the photo that failed to load.
 * @returns The message returned by `t(dutchText, englishText)`.
 */
const getFailureReason = (url: string): string => {
  const lowerUrl = url.toLowerCase();

  // Google Places/Maps images require API key
  const isGoogleHosted =
    lowerUrl.includes('lh3.googleusercontent.com') ||
    lowerUrl.includes('maps.googleapis.com') ||
    lowerUrl.includes('googleusercontent.com/p/');
  if (isGoogleHosted) {
    return t('Google Places afbeelding niet beschikbaar (API-sleutel vereist)',
      'Google Places image unavailable (API key required)');
  }

  // Relative URLs that weren't resolved (scheme compared case-insensitively)
  const hasHttpScheme =
    lowerUrl.startsWith('http://') || lowerUrl.startsWith('https://');
  if (!hasHttpScheme) {
    return t('Ongeldige URL (niet volledig)', 'Invalid URL (not fully resolved)');
  }

  // YouTube videos in image field
  if (lowerUrl.includes('youtube.com/watch') || lowerUrl.includes('youtu.be/')) {
    return t('Video-URL in afbeeldingsveld', 'Video URL in image field');
  }

  // Generic expired/404 error
  return t('Afbeelding kon niet worden geladen', 'Image could not be loaded');
};
// Mark failed photos but DON'T remove them - prevents crash when all photos fail
const effectivePhotos = useMemo(() => {
// Start with initial photos, add wikidata image if available and not already included
@ -770,11 +800,11 @@ const MediaGalleryComponent: React.FC<MediaGalleryProps> = ({
allPhotos.push({ url: wikidataImageUrl, attribution: 'Wikimedia Commons' });
}
// Mark failed photos but DON'T remove them
// Mark failed photos with specific failure reasons
const photosWithStatus = allPhotos.map(photo => ({
...photo,
failed: failedPhotoUrls.has(photo.url),
failReason: failedPhotoUrls.has(photo.url) ? 'Image could not be loaded' : undefined
failReason: failedPhotoUrls.has(photo.url) ? getFailureReason(photo.url) : undefined
}));
// Sort: working images first, failed images last
@ -783,7 +813,7 @@ const MediaGalleryComponent: React.FC<MediaGalleryProps> = ({
if (!a.failed && b.failed) return -1;
return 0;
});
}, [initialPhotos, failedPhotoUrls, wikidataImageUrl]);
}, [initialPhotos, failedPhotoUrls, wikidataImageUrl, t]);
const containerRef = useRef<HTMLDivElement>(null);
const playerRef = useRef<YTPlayer | null>(null);

View file

@ -125,6 +125,14 @@
border-bottom-right-radius: 4px;
}
/* Ensure all text inside user messages is white */
.conversation-panel__message--user .conversation-panel__message-content p,
.conversation-panel__message--user .conversation-panel__message-content span,
.conversation-panel__message--user .conversation-panel__loading,
.conversation-panel__message--user .conversation-panel__error {
color: white;
}
.conversation-panel__message--assistant .conversation-panel__message-content {
background: var(--surface-secondary, #f5f5f5);
color: var(--text-primary, #212121);
@ -497,6 +505,12 @@
color: var(--error-color, #d32f2f);
}
.conversation-panel__toolbar-btn--warning:hover:not(:disabled) {
background: #fff3e0;
border-color: #ff9800;
color: #f57c00;
}
/* History Dropdown */
.conversation-panel__history-selector {
position: relative;

View file

@ -18,7 +18,7 @@
*/
import React, { useState, useRef, useEffect, useCallback } from 'react';
import { Send, Loader2, Sparkles, AlertCircle, Copy, Check, ChevronDown, History, Download, Upload, Trash2, X } from 'lucide-react';
import { Send, Loader2, Sparkles, AlertCircle, Copy, Check, ChevronDown, History, Download, Upload, Trash2, X, RefreshCw } from 'lucide-react';
import { useLanguage } from '../../contexts/LanguageContext';
import './ConversationPanel.css';
@ -87,6 +87,8 @@ const TEXT = {
exportSuccess: { nl: 'Conversatie geëxporteerd', en: 'Conversation exported' },
importSuccess: { nl: 'Conversatie geïmporteerd', en: 'Conversation imported' },
importError: { nl: 'Ongeldig bestand', en: 'Invalid file' },
resetCache: { nl: 'Cache wissen', en: 'Clear cache' },
cacheCleared: { nl: 'Cache gewist - probeer uw vraag opnieuw', en: 'Cache cleared - try your question again' },
};
// Example questions to help users get started - shorter list
@ -453,6 +455,39 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
showNotification(t('conversationCleared'));
};
/**
 * Delete the two client-side IndexedDB caches, empty the message list,
 * show a confirmation toast, and reload the page 1.5 s later.
 *
 * Database deletion is best-effort: success, error and "blocked" all count
 * as finished. If anything in the sequence throws, the error is logged and
 * the page is reloaded immediately.
 */
const handleResetCache = async () => {
  // Settles on every outcome so one stubborn database cannot stall the reset.
  const dropDatabase = (dbName: string) =>
    new Promise<void>((done) => {
      const deletion = indexedDB.deleteDatabase(dbName);
      deletion.onsuccess = () => done();
      deletion.onerror = () => done();
      deletion.onblocked = () => done();
    });

  try {
    // Deleted one after the other, in this order.
    await dropDatabase('GLAM_SemanticCache');
    await dropDatabase('GLAM_InstitutionsCache');

    setMessages([]);
    showNotification(t('cacheCleared'));

    // Give the toast time to be read before the page goes away.
    setTimeout(() => {
      window.location.reload();
    }, 1500);
  } catch (err) {
    console.error('Failed to clear cache:', err);
    // Reload regardless so the user is not left with a half-cleared state.
    window.location.reload();
  }
};
return (
<div className="conversation-panel">
{/* Notification Toast */}
@ -600,6 +635,16 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
<Upload size={16} />
</button>
{/* Reset Cache Button - clears IndexedDB and reloads */}
<button
className="conversation-panel__toolbar-btn conversation-panel__toolbar-btn--warning"
onClick={handleResetCache}
title={t('resetCache')}
type="button"
>
<RefreshCw size={16} />
</button>
{/* Clear Conversation Button */}
{messages.length > 0 && (
<button

View file

@ -20,6 +20,7 @@ import React, { useState, useRef, useCallback, useEffect } from 'react';
import type { GraphNode, GraphLink } from '@/types/rdf';
import { getSemanticCategory, SEMANTIC_CATEGORY_INFO } from '@/types/rdf';
import { NodeRelationshipDiagram } from './NodeRelationshipDiagram';
import { isTargetInsideAny } from '@/utils/dom';
import './RdfNodeDetailsPanel.css';
interface RdfNodeDetailsPanelProps {
@ -141,10 +142,12 @@ export const RdfNodeDetailsPanel: React.FC<RdfNodeDetailsPanelProps> = ({
// Drag handlers
const handleMouseDown = useCallback((e: React.MouseEvent) => {
// Only start drag if clicking on header area (not buttons)
if ((e.target as HTMLElement).closest('.rdf-panel__close') ||
(e.target as HTMLElement).closest('.rdf-panel__minimize') ||
(e.target as HTMLElement).closest('.rdf-panel__copy-btn') ||
(e.target as HTMLElement).closest('.rdf-panel__external-link')) {
if (isTargetInsideAny(e.target, [
'.rdf-panel__close',
'.rdf-panel__minimize',
'.rdf-panel__copy-btn',
'.rdf-panel__external-link'
])) {
return;
}

View file

@ -22,6 +22,7 @@ import rehypeRaw from 'rehype-raw';
import { linkmlSchemaService } from '../../lib/linkml/linkml-schema-service';
import type { SemanticInfo, SlotSemanticInfo, EnumSemanticInfo } from '../../lib/linkml/linkml-schema-service';
import { EnumNetworkOverlay } from './EnumNetworkOverlay';
import { isTargetInsideAny } from '../../utils/dom';
import './SemanticDetailsPanel.css';
/**
@ -226,8 +227,7 @@ export const SemanticDetailsPanel: React.FC<SemanticDetailsPanelProps> = ({
// Drag handlers
const handleMouseDown = useCallback((e: React.MouseEvent) => {
// Only start drag if clicking on header area (not buttons)
if ((e.target as HTMLElement).closest('.semantic-panel__close') ||
(e.target as HTMLElement).closest('.semantic-panel__minimize')) {
if (isTargetInsideAny(e.target, ['.semantic-panel__close', '.semantic-panel__minimize'])) {
return;
}

View file

@ -112,6 +112,161 @@
transform: scale(1.1);
}
/* Header Actions Container */
.person-info-panel__header-actions {
display: flex;
align-items: center;
gap: 0.5rem;
flex-shrink: 0;
}
/* Export Button */
.person-info-panel__export-wrapper {
position: relative;
}
.person-info-panel__export-btn {
width: 28px;
height: 28px;
display: flex;
align-items: center;
justify-content: center;
background: rgba(255, 255, 255, 0.15);
border: none;
border-radius: 50%;
color: white;
cursor: pointer;
transition: all 0.2s ease;
}
.person-info-panel__export-btn:hover {
background: rgba(255, 255, 255, 0.25);
transform: scale(1.1);
}
.person-info-panel__export-btn--loading {
cursor: wait;
opacity: 0.7;
}
.person-info-panel__export-btn:disabled {
cursor: wait;
}
.person-info-panel__export-spinner {
width: 12px;
height: 12px;
border: 2px solid rgba(255, 255, 255, 0.3);
border-top-color: white;
border-radius: 50%;
animation: person-export-spin 0.8s linear infinite;
}
@keyframes person-export-spin {
to {
transform: rotate(360deg);
}
}
.person-info-panel__export-menu {
position: absolute;
top: 100%;
right: 0;
margin-top: 0.5rem;
background: white;
border-radius: 8px;
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.15), 0 2px 8px rgba(0, 0, 0, 0.1);
overflow: hidden;
z-index: 10002;
min-width: 160px;
}
.person-info-panel__export-section {
padding: 0.25rem 0;
}
.person-info-panel__export-section:not(:last-child) {
border-bottom: 1px solid #e2e8f0;
}
.person-info-panel__export-section-label {
padding: 0.4rem 0.875rem 0.25rem;
font-size: 0.65rem;
font-weight: 600;
text-transform: uppercase;
letter-spacing: 0.05em;
color: #94a3b8;
}
.person-info-panel__export-menu button {
display: flex;
align-items: center;
gap: 0.5rem;
width: 100%;
padding: 0.6rem 0.875rem;
background: none;
border: none;
text-align: left;
font-size: 0.85rem;
color: #2d3748;
cursor: pointer;
transition: background-color 0.15s ease;
}
.person-info-panel__export-menu button:hover {
background: #f0f4ff;
color: #0a3dfa;
}
.person-info-panel__export-menu button:not(:last-child) {
border-bottom: 1px solid #f1f5f9;
}
.person-info-panel__export-icon {
display: inline-flex;
align-items: center;
justify-content: center;
width: 20px;
font-size: 0.7rem;
font-weight: 700;
font-family: 'SFMono-Regular', Consolas, 'Liberation Mono', Menlo, monospace;
color: #64748b;
}
.person-info-panel__copy-toast {
position: absolute;
top: 100%;
right: 0;
margin-top: 0.5rem;
padding: 0.5rem 0.75rem;
background: #10b981;
color: white;
font-size: 0.75rem;
font-weight: 500;
border-radius: 6px;
white-space: nowrap;
z-index: 10003;
animation: person-toast-fade 2s ease forwards;
box-shadow: 0 2px 8px rgba(16, 185, 129, 0.3);
}
@keyframes person-toast-fade {
0% {
opacity: 0;
transform: translateY(-4px);
}
10% {
opacity: 1;
transform: translateY(0);
}
80% {
opacity: 1;
}
100% {
opacity: 0;
}
}
/* Content */
.person-info-panel__content {
padding: 1rem;
@ -491,6 +646,137 @@
margin-top: 0.25rem;
}
/* Career History - Accordion Styles */
.person-info-panel__career-accordion {
display: flex;
flex-direction: column;
gap: 0.5rem;
margin-top: 0.5rem;
}
.person-info-panel__career-accordion-item {
background: #ffffff;
border: 1px solid #e2e8f0;
border-radius: 8px;
overflow: hidden;
transition: box-shadow 0.2s ease, border-color 0.2s ease;
}
.person-info-panel__career-accordion-item:hover {
border-color: #cbd5e1;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.06);
}
.person-info-panel__career-accordion-item--expanded {
border-color: #0a3dfa;
box-shadow: 0 2px 12px rgba(10, 61, 250, 0.1);
}
.person-info-panel__career-accordion-header {
width: 100%;
display: flex;
align-items: flex-start;
justify-content: space-between;
padding: 0.75rem;
background: transparent;
border: none;
cursor: pointer;
text-align: left;
transition: background-color 0.15s ease;
}
.person-info-panel__career-accordion-header:hover {
background-color: #f8fafc;
}
.person-info-panel__career-accordion-header:focus {
outline: none;
background-color: #f1f5f9;
}
.person-info-panel__career-accordion-header:focus-visible {
outline: 2px solid #0a3dfa;
outline-offset: -2px;
}
.person-info-panel__career-accordion-summary {
flex: 1;
min-width: 0;
}
.person-info-panel__career-accordion-title {
display: flex;
align-items: center;
flex-wrap: wrap;
gap: 0.5rem;
margin-bottom: 0.25rem;
}
.person-info-panel__career-accordion-subtitle {
display: flex;
align-items: center;
flex-wrap: wrap;
gap: 0.25rem;
font-size: 0.85rem;
color: #4a5568;
}
.person-info-panel__career-location-brief {
color: #64748b;
font-size: 0.8rem;
}
.person-info-panel__career-accordion-chevron {
flex-shrink: 0;
font-size: 0.7rem;
color: #64748b;
transition: transform 0.25s ease;
margin-left: 0.5rem;
margin-top: 0.25rem;
}
.person-info-panel__career-accordion-chevron--expanded {
transform: rotate(180deg);
}
/* Accordion Content - Collapsible */
.person-info-panel__career-accordion-content {
max-height: 0;
overflow: hidden;
transition: max-height 0.3s ease-out, opacity 0.2s ease;
opacity: 0;
}
/*
 * Expanded accordion panel. max-height stays finite so the open/close
 * transition can animate; content taller than that cap scrolls instead of
 * being silently clipped.
 */
.person-info-panel__career-accordion-content--expanded {
  max-height: 500px;
  overflow-y: auto;
  opacity: 1;
  transition: max-height 0.3s ease-in, opacity 0.2s ease 0.1s;
}
.person-info-panel__career-accordion-body {
padding: 0 0.75rem 0.75rem 0.75rem;
border-top: 1px solid #f0f0f0;
margin-top: 0;
padding-top: 0.75rem;
}
.person-info-panel__career-detail-row {
display: flex;
gap: 0.5rem;
font-size: 0.8rem;
margin-bottom: 0.5rem;
}
.person-info-panel__career-detail-label {
color: #64748b;
font-weight: 500;
}
.person-info-panel__career-detail-value {
color: #334155;
}
/* Legacy career styles (retained for compatibility) */
.person-info-panel__career-list {
display: flex;
flex-direction: column;
@ -525,26 +811,24 @@
color: #64748b;
font-weight: 500;
white-space: nowrap;
margin-left: 0.5rem;
margin-top: 0.25rem;
}
.person-info-panel__current-badge {
display: inline-block;
padding: 0.2rem 0.5rem;
padding: 0.15rem 0.4rem;
background: #10b981;
color: white;
border-radius: 12px;
font-size: 0.7rem;
font-size: 0.65rem;
font-weight: 600;
text-transform: uppercase;
letter-spacing: 0.5px;
margin-left: 0.5rem;
}
.person-info-panel__career-company {
color: #2d3748;
font-weight: 500;
margin-bottom: 0.25rem;
font-size: 0.85rem;
}
@ -571,12 +855,18 @@
}
.person-info-panel__career-detail {
display: inline-block;
padding: 0.2rem 0.4rem;
display: inline-flex;
align-items: center;
gap: 0.25rem;
padding: 0.2rem 0.5rem;
background: #f1f5f9;
border-radius: 4px;
font-size: 0.7rem;
color: #6b7280;
color: #475569;
}
.person-info-panel__career-detail-icon {
font-size: 0.75rem;
}
/* Responsive adjustments */
@ -627,4 +917,42 @@
margin-left: 0;
margin-top: 0.25rem;
}
/* Accordion mobile adjustments */
.person-info-panel__career-accordion-header {
padding: 0.625rem;
}
.person-info-panel__career-accordion-title {
flex-direction: column;
align-items: flex-start;
gap: 0.25rem;
}
.person-info-panel__career-accordion-subtitle {
flex-direction: column;
align-items: flex-start;
gap: 0.15rem;
}
.person-info-panel__career-location-brief {
margin-left: 0;
}
.person-info-panel__career-location-brief::before {
content: '';
}
/*
 * Tighter padding for the accordion body. All four sides are set so the
 * body keeps a gap below its top border; a shorthand with a 0 top value
 * would leave the content touching that border.
 */
.person-info-panel__career-accordion-body {
  padding: 0.625rem;
}
.person-info-panel__career-details {
flex-direction: column;
gap: 0.375rem;
}
.person-info-panel__career-detail {
width: fit-content;
}
}

View file

@ -19,6 +19,7 @@
import React, { useEffect, useState, useRef, useCallback, memo, useMemo } from 'react';
import type { SocialNetworkNode, HeritageType, RoleCategory, ExtendedProfileData } from '@/types/socialNetwork';
import { getNodeColor, HERITAGE_TYPE_LABELS, HERITAGE_TYPE_COLORS as _HERITAGE_TYPE_COLORS, ROLE_CATEGORY_LABELS, ROLE_CATEGORY_COLORS as _ROLE_CATEGORY_COLORS } from '@/types/socialNetwork';
import { isTargetInside } from '@/utils/dom';
import './PersonInfoPanel.css';
// Re-export for JSX usage (TypeScript 5.9 flow analysis workaround)
@ -67,6 +68,310 @@ const PersonInfoPanelComponent: React.FC<PersonInfoPanelProps> = ({
void _profileLoading;
const [isDragging, setIsDragging] = useState(false);
const dragStartRef = useRef<{ x: number; y: number; posX: number; posY: number } | null>(null);
// Accordion state for career history items
const [expandedCareerItems, setExpandedCareerItems] = useState<Set<number>>(new Set());
// Export dropdown state
const [showExportMenu, setShowExportMenu] = useState(false);
const [isExporting, setIsExporting] = useState(false);
const [copySuccess, setCopySuccess] = useState<string | null>(null);
const exportMenuRef = useRef<HTMLDivElement>(null);
/**
 * Flip one career accordion item between expanded and collapsed.
 *
 * @param index - Key of the career item whose expanded state is toggled
 */
const toggleCareerItem = useCallback((index: number) => {
  setExpandedCareerItems((current) => {
    // Copy first so React sees a new Set reference.
    const next = new Set(current);
    // Set.delete returns false when the index was absent; only then add it.
    if (!next.delete(index)) {
      next.add(index);
    }
    return next;
  });
}, []);
// While the export menu is open, close it on any mousedown that lands
// outside the export wrapper element. No listener exists while it is closed.
useEffect(() => {
  if (!showExportMenu) {
    return undefined;
  }

  const handleClickOutside = (event: MouseEvent) => {
    const wrapper = exportMenuRef.current;
    const clickedInside = wrapper ? wrapper.contains(event.target as Node) : true;
    if (!clickedInside) {
      setShowExportMenu(false);
    }
  };

  document.addEventListener('mousedown', handleClickOutside);
  // Detach when the menu closes or the component unmounts.
  return () => document.removeEventListener('mousedown', handleClickOutside);
}, [showExportMenu]);
/**
 * Serialize the current profile and trigger a browser download.
 *
 * The loading flag and the menu are reset in a `finally` block, so a failure
 * while serializing or creating the download cannot leave the export button
 * permanently disabled. Failures are logged, not thrown.
 *
 * @param format - Output format; 'csv' exports the career history only
 */
const exportProfile = useCallback(async (format: 'json' | 'yaml' | 'markdown' | 'csv') => {
  setIsExporting(true);

  try {
    // Small delay to show loading state for UX feedback
    await new Promise(resolve => setTimeout(resolve, 100));

    const exportData = {
      name: person.name,
      linkedin_url: person.linkedinUrl,
      headline: profileData?.headline || person.headline,
      location: profileData?.location || person.location,
      heritage_relevant: person.heritageRelevant,
      heritage_type: person.heritageType,
      role_category: person.roleCategory,
      current_company: profileData?.current_company,
      department: profileData?.department,
      total_experience: profileData?.total_experience,
      connections: profileData?.connections,
      followers: profileData?.followers,
      about: profileData?.about,
      languages: profileData?.languages,
      skills: profileData?.skills,
      education: profileData?.education,
      career_history: profileData?.career_history,
      exported_at: new Date().toISOString(),
    };

    // Filename stem: the LinkedIn handle when available, otherwise the name.
    // Any query string or fragment is cut off so it does not end up in the
    // filename; 'profile' is the last-resort stem when both are empty.
    const handle = person.linkedinUrl?.split('/in/')[1]?.split(/[?#]/)[0].replace(/\//g, '');
    const slug = handle || person.name.toLowerCase().replace(/\s+/g, '-') || 'profile';

    let content: string;
    let filename: string;
    let mimeType: string;

    switch (format) {
      case 'json':
        content = JSON.stringify(exportData, null, 2);
        filename = `${slug}_profile.json`;
        mimeType = 'application/json';
        break;
      case 'yaml':
        content = convertToYaml(exportData);
        filename = `${slug}_profile.yaml`;
        mimeType = 'text/yaml';
        break;
      case 'markdown':
        content = convertToMarkdown(exportData);
        filename = `${slug}_profile.md`;
        mimeType = 'text/markdown';
        break;
      case 'csv':
        content = convertCareerHistoryToCsv(exportData);
        filename = `${slug}_career_history.csv`;
        mimeType = 'text/csv';
        break;
    }

    // Download through a temporary object URL and a synthetic anchor click.
    const blob = new Blob([content], { type: mimeType });
    const url = URL.createObjectURL(blob);
    const link = document.createElement('a');
    link.href = url;
    link.download = filename;
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
    URL.revokeObjectURL(url);
  } catch (err) {
    console.error('Failed to export profile:', err);
  } finally {
    setShowExportMenu(false);
    setIsExporting(false);
  }
}, [person, profileData]);
/**
 * Serialize the current profile and place it on the system clipboard.
 *
 * On success a "<FORMAT> copied!" toast is shown for two seconds. Both
 * serialization and the clipboard write run inside the `try`, and the
 * loading flag and menu are reset in `finally`, so no failure can leave the
 * export button stuck in its disabled state. Failures are logged only.
 *
 * @param format - Text format to copy
 */
const copyToClipboard = useCallback(async (format: 'json' | 'yaml' | 'markdown') => {
  setIsExporting(true);

  try {
    const exportData = {
      name: person.name,
      linkedin_url: person.linkedinUrl,
      headline: profileData?.headline || person.headline,
      location: profileData?.location || person.location,
      heritage_relevant: person.heritageRelevant,
      heritage_type: person.heritageType,
      role_category: person.roleCategory,
      current_company: profileData?.current_company,
      department: profileData?.department,
      total_experience: profileData?.total_experience,
      connections: profileData?.connections,
      followers: profileData?.followers,
      about: profileData?.about,
      languages: profileData?.languages,
      skills: profileData?.skills,
      education: profileData?.education,
      career_history: profileData?.career_history,
      exported_at: new Date().toISOString(),
    };

    let content: string;
    switch (format) {
      case 'json':
        content = JSON.stringify(exportData, null, 2);
        break;
      case 'yaml':
        content = convertToYaml(exportData);
        break;
      case 'markdown':
        content = convertToMarkdown(exportData);
        break;
    }

    await navigator.clipboard.writeText(content);
    setCopySuccess(format.toUpperCase());
    // Hide the toast again after two seconds.
    setTimeout(() => setCopySuccess(null), 2000);
  } catch (err) {
    console.error('Failed to copy:', err);
  } finally {
    setShowExportMenu(false);
    setIsExporting(false);
  }
}, [person, profileData]);
/**
 * Render `data.career_history` as CSV text: one header line followed by one
 * line per job, each terminated by "\n".
 *
 * Role, company and dates accept alternate field names (title, company,
 * duration_text). Missing values become empty cells.
 *
 * @param data - Export payload; only its `career_history` entry is read
 * @returns CSV text, or the literal 'No career history available' when the
 *          list is missing or empty
 */
const convertCareerHistoryToCsv = (data: Record<string, unknown>): string => {
  const jobs = data.career_history as Array<Record<string, unknown>> | undefined;
  if (!jobs || jobs.length === 0) {
    return 'No career history available';
  }

  // Falsy values collapse to an empty cell before escaping.
  const cell = (raw: unknown): string => escapeCSV(String(raw || ''));

  const headerLine = [
    'Role', 'Company', 'Location', 'Dates', 'Duration',
    'Description', 'Level', 'Company Size', 'Industry', 'Current',
  ].join(',');

  const rowLines = jobs.map((job) =>
    [
      cell(job.role || job.title),
      cell(job.organization || job.company),
      cell(job.location),
      cell(job.dates || job.duration_text),
      cell(job.duration),
      cell(job.description),
      cell(job.level),
      cell(job.company_size),
      cell(job.industry),
      job.current ? 'Yes' : 'No',
    ].join(',')
  );

  return [headerLine, ...rowLines].map((line) => `${line}\n`).join('');
};
/**
 * Quote a single CSV field when necessary.
 *
 * A field is wrapped in double quotes, with embedded quotes doubled, when it
 * contains a comma, a double quote, or any line-break character. Carriage
 * returns are included: a bare "\r" left unquoted would split the record in
 * consumers that treat CR as a line break.
 *
 * @param value - Raw field text
 * @returns The field, quoted only if required
 */
const escapeCSV = (value: string): string => {
  if (/[",\r\n]/.test(value)) {
    return `"${value.replace(/"/g, '""')}"`;
  }
  return value;
};
/**
 * Serialize a plain object to YAML text (minimal emitter, two-space indent).
 *
 * Rules:
 *   - null / undefined values and empty arrays are omitted
 *   - scalars are written with JSON.stringify, i.e. as JSON scalars
 *   - strings that contain a line break or exceed 80 characters are written
 *     as literal block scalars; the chomping indicator is chosen so that no
 *     trailing newline is added or lost
 *   - nested objects are indented one level; array items are written as
 *     "- " entries whose nested structure keeps its own indentation
 *
 * Keys are written verbatim, without quoting.
 *
 * @param obj - Object to serialize
 * @param indent - Current nesting depth (two spaces per level)
 * @returns YAML text; every emitted line ends with "\n"
 */
const convertToYaml = (obj: Record<string, unknown>, indent = 0): string => {
  const pad = '  '.repeat(indent);
  let yaml = '';

  for (const [key, value] of Object.entries(obj)) {
    if (value === null || value === undefined) continue;

    if (Array.isArray(value)) {
      if (value.length === 0) continue;
      yaml += `${pad}${key}:\n`;

      for (const item of value) {
        if (item === null || item === undefined) {
          yaml += `${pad}  - null\n`;
        } else if (typeof item === 'object') {
          // Render the item two levels deeper so its keys line up with the
          // text after "- ", then put the dash into the first line's
          // indentation. The remaining lines are kept untouched, which
          // preserves any nesting inside the item.
          const lines = convertToYaml(item as Record<string, unknown>, indent + 2)
            .split('\n')
            .filter(line => line.length > 0);
          if (lines.length === 0) {
            yaml += `${pad}  - {}\n`;
            continue;
          }
          const [first, ...rest] = lines;
          yaml += `${pad}  - ${first.slice(pad.length + 4)}\n`;
          for (const line of rest) {
            yaml += `${line}\n`;
          }
        } else {
          // Quoted via JSON so that values such as "a: b" or "# x" stay strings.
          yaml += `${pad}  - ${JSON.stringify(item)}\n`;
        }
      }
    } else if (typeof value === 'object') {
      yaml += `${pad}${key}:\n`;
      yaml += convertToYaml(value as Record<string, unknown>, indent + 1);
    } else if (
      typeof value === 'string' &&
      (value.includes('\n') || value.length > 80) &&
      /^\S/.test(value) &&
      !value.includes('\r')
    ) {
      // Literal block scalar. "|-" strips the final line break, "|+" keeps
      // every trailing one; with one trailing "\n" removed from the body in
      // the second case, the parsed value equals the input exactly.
      // Text starting with whitespace or containing "\r" would need extra
      // indicators, so it falls through to the quoted form below.
      const endsWithNewline = value.endsWith('\n');
      const body = endsWithNewline ? value.slice(0, -1) : value;
      yaml += `${pad}${key}: |${endsWithNewline ? '+' : '-'}\n`;
      for (const line of body.split('\n')) {
        // Blank lines are emitted empty to avoid trailing whitespace.
        yaml += line ? `${pad}  ${line}\n` : '\n';
      }
    } else {
      yaml += `${pad}${key}: ${JSON.stringify(value)}\n`;
    }
  }

  return yaml;
};
/**
 * Render the export payload as a Markdown document.
 *
 * Sections appear in a fixed order (title, headline, location, LinkedIn
 * link, heritage profile, current position, about, career history,
 * education, skills, languages) and each is omitted when its source field
 * is empty. The skills list is capped at 20 entries with a "+N more"
 * suffix. The footer carries today's date in the runtime's locale format.
 *
 * @param data - Export payload keyed by snake_case field names
 * @returns Markdown text
 */
const convertToMarkdown = (data: Record<string, unknown>): string => {
  const parts: string[] = [];
  const nonEmptyList = (candidate: unknown): boolean =>
    Array.isArray(candidate) && candidate.length > 0;

  // Title block
  parts.push(`# ${data.name}\n\n`);
  if (data.headline) parts.push(`**${data.headline}**\n\n`);
  if (data.location) parts.push(`📍 ${data.location}\n\n`);
  if (data.linkedin_url) parts.push(`🔗 [LinkedIn Profile](${data.linkedin_url})\n\n`);

  // Heritage classification
  if (data.heritage_relevant) {
    parts.push(`## Heritage Profile\n`, `- **Heritage Relevant**: Yes\n`);
    if (data.heritage_type) parts.push(`- **Type**: ${data.heritage_type}\n`);
    if (data.role_category) parts.push(`- **Role Category**: ${data.role_category}\n`);
    parts.push('\n');
  }

  // Current position, with the department appended when known
  if (data.current_company) {
    const departmentSuffix = data.department ? ` - ${data.department}` : '';
    parts.push(`## Current Position\n`, `**${data.current_company}**`, departmentSuffix, '\n\n');
  }

  if (data.about) {
    parts.push(`## About\n${data.about}\n\n`);
  }

  if (nonEmptyList(data.career_history)) {
    parts.push(`## Career History\n\n`);
    for (const job of data.career_history as Array<Record<string, unknown>>) {
      const role = job.role || job.title || 'Position';
      const company = job.organization || job.company || '';
      parts.push(`### ${role}${company ? ` at ${company}` : ''}\n`);
      if (job.dates) parts.push(`*${job.dates}*\n`);
      if (job.location) parts.push(`📍 ${job.location}\n`);
      if (job.description) parts.push(`\n${job.description}\n`);
      parts.push('\n');
    }
  }

  if (nonEmptyList(data.education)) {
    parts.push(`## Education\n\n`);
    for (const edu of data.education as Array<Record<string, unknown>>) {
      parts.push(`### ${edu.degree || 'Degree'}\n`);
      if (edu.institution) parts.push(`**${edu.institution}**\n`);
      if (edu.years) parts.push(`*${edu.years}*\n`);
      parts.push('\n');
    }
  }

  if (nonEmptyList(data.skills)) {
    const skills = data.skills as string[];
    const hiddenCount = skills.length - 20;
    parts.push(`## Skills\n`, skills.slice(0, 20).join(', '));
    if (hiddenCount > 0) parts.push(`, +${hiddenCount} more`);
    parts.push('\n\n');
  }

  if (nonEmptyList(data.languages)) {
    parts.push(`## Languages\n`);
    for (const lang of data.languages as Array<Record<string, unknown>>) {
      parts.push(`- ${lang.language}`);
      if (lang.proficiency) parts.push(` (${lang.proficiency})`);
      parts.push('\n');
    }
    parts.push('\n');
  }

  parts.push(`---\n*Exported on ${new Date().toLocaleDateString()}*\n`);
  return parts.join('');
};
// Calculate initial position - to the right of click, within viewport
useEffect(() => {
@ -93,7 +398,7 @@ const PersonInfoPanelComponent: React.FC<PersonInfoPanelProps> = ({
// Drag handlers
const handleMouseDown = useCallback((e: React.MouseEvent) => {
if ((e.target as HTMLElement).closest('.person-info-panel__close')) return;
if (isTargetInside(e.target, '.person-info-panel__close')) return;
setIsDragging(true);
dragStartRef.current = {
@ -246,13 +551,81 @@ const PersonInfoPanelComponent: React.FC<PersonInfoPanelProps> = ({
)}
</div>
</div>
<button
className="person-info-panel__close"
onClick={onClose}
aria-label="Close panel"
>
</button>
{/* Header Actions */}
<div className="person-info-panel__header-actions">
{/* Export Button */}
<div className="person-info-panel__export-wrapper" ref={exportMenuRef}>
<button
className={`person-info-panel__export-btn ${isExporting ? 'person-info-panel__export-btn--loading' : ''}`}
onClick={() => setShowExportMenu(!showExportMenu)}
aria-label="Export profile"
title="Export profile"
disabled={isExporting}
>
{isExporting ? (
<span className="person-info-panel__export-spinner" />
) : (
<svg viewBox="0 0 24 24" width="14" height="14" fill="currentColor">
<path d="M19 9h-4V3H9v6H5l7 7 7-7zM5 18v2h14v-2H5z"/>
</svg>
)}
</button>
{showExportMenu && (
<div className="person-info-panel__export-menu">
<div className="person-info-panel__export-section">
<div className="person-info-panel__export-section-label">Download</div>
<button onClick={() => exportProfile('json')}>
<span className="person-info-panel__export-icon">{ }</span>
JSON
</button>
<button onClick={() => exportProfile('yaml')}>
<span className="person-info-panel__export-icon">---</span>
YAML
</button>
<button onClick={() => exportProfile('markdown')}>
<span className="person-info-panel__export-icon">#</span>
Markdown
</button>
{profileData?.career_history && profileData.career_history.length > 0 && (
<button onClick={() => exportProfile('csv')}>
<span className="person-info-panel__export-icon">📊</span>
CSV (Career)
</button>
)}
</div>
<div className="person-info-panel__export-section">
<div className="person-info-panel__export-section-label">Copy to Clipboard</div>
<button onClick={() => copyToClipboard('json')}>
<span className="person-info-panel__export-icon">📋</span>
Copy JSON
</button>
<button onClick={() => copyToClipboard('yaml')}>
<span className="person-info-panel__export-icon">📋</span>
Copy YAML
</button>
<button onClick={() => copyToClipboard('markdown')}>
<span className="person-info-panel__export-icon">📋</span>
Copy Markdown
</button>
</div>
</div>
)}
{copySuccess && (
<div className="person-info-panel__copy-toast">
{copySuccess} copied!
</div>
)}
</div>
<button
className="person-info-panel__close"
onClick={onClose}
aria-label="Close panel"
>
</button>
</div>
</div>
{/* Content - Always show all available metadata */}
@ -396,48 +769,151 @@ const PersonInfoPanelComponent: React.FC<PersonInfoPanelProps> = ({
</div>
)}
{/* Career History */}
{profileData?.career_history && profileData.career_history.length > 0 && (
<div className="person-info-panel__section">
<div className="person-info-panel__label">Career History</div>
<div className="person-info-panel__career-list">
{profileData.career_history.slice(0, 5).map((job, index) => (
<div key={index} className="person-info-panel__career-item">
<div className="person-info-panel__career-header">
<div className="person-info-panel__career-role">{job.role}</div>
<div className="person-info-panel__career-dates">{job.dates}</div>
{job.current && (
<span className="person-info-panel__current-badge">Current</span>
)}
</div>
<div className="person-info-panel__career-company">{job.organization}</div>
{job.location && (
<div className="person-info-panel__career-location">{job.location}</div>
)}
{job.description && (
<div className="person-info-panel__career-description">{job.description}</div>
)}
<div className="person-info-panel__career-details">
{job.level && (
<span className="person-info-panel__career-detail">{job.level}</span>
)}
{job.company_size && (
<span className="person-info-panel__career-detail">{job.company_size}</span>
)}
{job.industry && (
<span className="person-info-panel__career-detail">{job.industry}</span>
)}
</div>
</div>
))}
{profileData.career_history.length > 5 && (
<div className="person-info-panel__more-indicator">
+{profileData.career_history.length - 5} more positions
</div>
)}
{/* Career History - Accordion UI */}
{profileData?.career_history && profileData.career_history.length > 0 && (() => {
// Pre-process career history to handle field variants and filter empty items
const validCareerItems = profileData.career_history
.map((job, originalIndex) => {
// Handle field name variants
const jobRole = job.role || job.title || null;
const jobCompany = job.organization || job.company || null;
const jobLocation = job.location || null;
const jobDates = job.dates || job.duration || job.duration_text || null;
// Handle "Unknown" role as effectively empty
const displayRole = (jobRole && jobRole !== 'Unknown') ? jobRole : null;
// Skip items with no meaningful content
if (!displayRole && !jobCompany) return null;
return {
...job,
displayRole,
jobCompany,
jobLocation,
jobDates,
originalIndex,
};
})
.filter(Boolean);
if (validCareerItems.length === 0) return null;
return (
<div className="person-info-panel__section">
<div className="person-info-panel__label">
Career History ({validCareerItems.length} position{validCareerItems.length !== 1 ? 's' : ''})
</div>
<div className="person-info-panel__career-accordion">
{validCareerItems.map((job) => {
if (!job) return null;
const isExpanded = expandedCareerItems.has(job.originalIndex);
const hasDetails = job.description || job.level || job.company_size || job.industry;
return (
<div
key={job.originalIndex}
className={`person-info-panel__career-accordion-item ${isExpanded ? 'person-info-panel__career-accordion-item--expanded' : ''}`}
>
{/* Accordion Header - Always Visible */}
<button
className="person-info-panel__career-accordion-header"
onClick={() => toggleCareerItem(job.originalIndex)}
aria-expanded={isExpanded}
aria-controls={`career-content-${job.originalIndex}`}
>
<div className="person-info-panel__career-accordion-summary">
<div className="person-info-panel__career-accordion-title">
{job.displayRole && (
<span className="person-info-panel__career-role">{job.displayRole}</span>
)}
{!job.displayRole && job.jobCompany && (
<span className="person-info-panel__career-role">{job.jobCompany}</span>
)}
{job.current && (
<span className="person-info-panel__current-badge">Current</span>
)}
</div>
{job.displayRole && job.jobCompany && (
<div className="person-info-panel__career-accordion-subtitle">
<span className="person-info-panel__career-company">{job.jobCompany}</span>
{job.jobLocation && job.jobLocation.trim() && (
<span className="person-info-panel__career-location-brief">
{job.jobLocation.split(',')[0]}
</span>
)}
</div>
)}
{!job.displayRole && job.jobLocation && job.jobLocation.trim() && (
<div className="person-info-panel__career-accordion-subtitle">
<span className="person-info-panel__career-location-brief">
{job.jobLocation.split(',')[0]}
</span>
</div>
)}
{job.jobDates && (
<div className="person-info-panel__career-dates">{job.jobDates}</div>
)}
</div>
<span className={`person-info-panel__career-accordion-chevron ${isExpanded ? 'person-info-panel__career-accordion-chevron--expanded' : ''}`}>
</span>
</button>
{/* Accordion Content - Collapsible */}
<div
id={`career-content-${job.originalIndex}`}
className={`person-info-panel__career-accordion-content ${isExpanded ? 'person-info-panel__career-accordion-content--expanded' : ''}`}
aria-hidden={!isExpanded}
>
<div className="person-info-panel__career-accordion-body">
{/* Full Location - only show if location exists, has content, and contains a comma */}
{job.jobLocation && job.jobLocation.trim() && job.jobLocation.includes(',') && (
<div className="person-info-panel__career-detail-row">
<span className="person-info-panel__career-detail-label">Location:</span>
<span className="person-info-panel__career-detail-value">{job.jobLocation}</span>
</div>
)}
{/* Description */}
{job.description && (
<div className="person-info-panel__career-description">
{job.description}
</div>
)}
{/* Metadata Badges */}
{hasDetails && (
<div className="person-info-panel__career-details">
{job.level && (
<span className="person-info-panel__career-detail">
<span className="person-info-panel__career-detail-icon">📊</span>
{job.level}
</span>
)}
{job.company_size && (
<span className="person-info-panel__career-detail">
<span className="person-info-panel__career-detail-icon">👥</span>
{job.company_size}
</span>
)}
{job.industry && (
<span className="person-info-panel__career-detail">
<span className="person-info-panel__career-detail-icon">🏢</span>
{job.industry}
</span>
)}
</div>
)}
</div>
</div>
</div>
);
})}
</div>
</div>
</div>
)}
);
})()}
{/* Role Category */}
{roleLabel && (

View file

@ -515,6 +515,8 @@ interface GeoAPIFeature {
social_media?: Record<string, string> | string;
// Logo URL extracted from web claims
logo_url?: string;
// Web claims - structured data extracted from websites
web_claims?: string | { claims?: WebClaim[] } | WebClaim[];
// YouTube enrichment - may be object or JSON string
youtube_enrichment?: string | {
status?: string;
@ -575,6 +577,19 @@ interface GeoAPISearchResponse {
results: GeoAPISearchResult[];
}
/**
* Web claim from website scraping.
*
* One structured statement extracted from an institution's website. Every
* field is optional. Within this file the logo extraction reads `claim_type`,
* `claim_value`, `source_url` and `extraction_method`; the remaining fields
* are carried along but not interpreted here.
*/
interface WebClaim {
// Kind of claim; the logo extraction only considers claims where this is 'logo'.
claim_type?: string;
// The claimed value; for logo claims it is treated as a (possibly relative) image URL.
claim_value?: string;
// Presumably the value as found on the page before normalisation - TODO confirm with the scraper.
raw_value?: string;
// Page the claim was taken from; used as the base when resolving a relative claim_value.
source_url?: string;
// Presumably the retrieval date/time as a string - format not visible here, TODO confirm.
retrieved_on?: string;
// Presumably the XPath of the source element in the scraped page - TODO confirm.
xpath?: string;
// How the claim was extracted; values ranked by the logo extraction are
// 'logo_img_attr', 'og_image' and 'favicon_link'.
extraction_method?: string;
}
/**
* Loading progress for UI feedback
*/
@ -613,6 +628,124 @@ function parseProvinceFromGhcid(ghcid: string | null | undefined): string {
return PROVINCE_CODE_MAP[code] || '';
}
/**
 * Parse web_claims from a JSON string or an already-decoded value.
 *
 * Accepted shapes:
 *  - an array of claim objects,
 *  - an object wrapping that array under a `claims` key,
 *  - a JSON string encoding either of the above.
 *
 * Anything else (invalid JSON, primitives, a `claims` value that is not an
 * array) yields `undefined`. Entries that are not objects are dropped so
 * callers can safely dereference every returned claim.
 *
 * @param value - Raw `web_claims` value as delivered by the API.
 * @returns A non-empty array of claims, or `undefined` when none are usable.
 */
function parseWebClaims(value: unknown): WebClaim[] | undefined {
  if (!value) return undefined;

  let decoded: unknown = value;
  if (typeof value === 'string') {
    try {
      decoded = JSON.parse(value);
    } catch {
      // Malformed JSON is treated the same as "no claims".
      return undefined;
    }
  }

  // Claims are either the array itself or wrapped as { claims: [...] }.
  let candidate: unknown;
  if (Array.isArray(decoded)) {
    candidate = decoded;
  } else if (typeof decoded === 'object' && decoded !== null) {
    candidate = (decoded as { claims?: unknown }).claims;
  }

  // A non-array here (e.g. a string) must not be handed out as WebClaim[]:
  // callers call .filter() on the result outside of any try/catch.
  if (!Array.isArray(candidate)) return undefined;

  const claims = candidate.filter(
    (entry): entry is WebClaim => typeof entry === 'object' && entry !== null,
  );
  return claims.length > 0 ? claims : undefined;
}
/**
 * Turn a possibly relative URL into an absolute one.
 *
 * - `http://` and `https://` URLs are returned untouched.
 * - Protocol-relative URLs (`//host/path`) are given an `https:` scheme.
 * - Anything else is resolved against `baseUrl` when one is supplied; if no
 *   base is given, or resolution fails, the input is returned unchanged.
 *
 * @param url - URL to resolve.
 * @param baseUrl - Optional absolute URL to resolve relative inputs against.
 * @returns The resolved URL, or `url` itself when it cannot be resolved.
 */
function resolveUrl(url: string, baseUrl?: string): string {
  if (url.startsWith('//')) {
    return `https:${url}`;
  }
  if (url.startsWith('http://') || url.startsWith('https://')) {
    return url;
  }
  if (!baseUrl) {
    return url;
  }

  try {
    const origin = new URL(baseUrl);
    const absolute = new URL(url, origin);
    return absolute.href;
  } catch {
    // Unparsable base or target: hand the input back as-is.
    return url;
  }
}
/**
 * Decide whether a URL is usable as an image source.
 *
 * A URL is accepted when it is absolute (`http://` or `https://`, scheme
 * compared case-insensitively), does not point at a known video/social site,
 * and its path does not end in a video file extension.
 *
 * The extension test looks only at the end of the URL path. Matching the
 * extensions anywhere in the URL would reject legitimate hosts such as
 * `www.aviodrome.nl` (contains ".avi") or `www.movingimage.org` (".mov").
 *
 * @param url - Candidate URL; relative URLs must be resolved beforehand.
 * @returns `true` when the URL may be used as an image source.
 */
function isValidImageUrl(url: string): boolean {
  if (!url) return false;

  const lowerUrl = url.toLowerCase();

  // Must be an absolute http(s) URL.
  if (!lowerUrl.startsWith('http://') && !lowerUrl.startsWith('https://')) {
    return false;
  }

  // Video / social pages are never images.
  const blockedSites = [
    'youtube.com/watch',
    'youtu.be/',
    'vimeo.com/',
    'twitter.com/',
    'facebook.com/',
  ];
  if (blockedSites.some(pattern => lowerUrl.includes(pattern))) {
    return false;
  }

  // Video files are recognised by the extension at the end of the path,
  // ignoring any query string or fragment.
  const videoExtensions = ['.mp4', '.webm', '.mov', '.avi'];
  let path: string;
  try {
    path = new URL(url).pathname.toLowerCase();
  } catch {
    // Not parsable by the URL API: strip query/fragment by hand instead.
    path = lowerUrl.split(/[?#]/)[0];
  }
  return !videoExtensions.some(ext => path.endsWith(ext));
}
/**
 * Pick the best logo URL out of a `web_claims` payload.
 *
 * Only claims with `claim_type === 'logo'` and a non-empty `claim_value` are
 * considered. Each value is resolved against the claim's `source_url` and
 * kept only if the result is a valid image URL. Among the remaining
 * candidates the order of preference is:
 *
 *   logo_img_attr > og_image > favicon_link > any other extraction method
 *
 * Favicons and values containing "favicon" or "loading" are used only when
 * no other valid candidate exists.
 *
 * @param webClaimsValue - Raw `web_claims` value (JSON string, array or wrapper object).
 * @returns An absolute logo URL, or `undefined` when no usable logo claim exists.
 */
function extractLogoFromWebClaims(webClaimsValue: unknown): string | undefined {
  const claims = parseWebClaims(webClaimsValue);
  if (!claims || claims.length === 0) return undefined;

  const logoClaims = claims.filter(c => c.claim_type === 'logo' && c.claim_value);
  if (logoClaims.length === 0) return undefined;

  // Built once rather than on every comparison; unknown methods rank lowest.
  const priority = new Map<string, number>([
    ['logo_img_attr', 3],
    ['og_image', 2],
    ['favicon_link', 1],
  ]);
  const rank = (claim: WebClaim): number => priority.get(claim.extraction_method || '') ?? 0;

  // Resolve and validate first, so that every candidate - including the
  // last-resort fallback below - is guaranteed to be a usable image URL.
  const candidates = logoClaims
    .map(claim => ({ claim, url: resolveUrl(claim.claim_value || '', claim.source_url) }))
    .filter(candidate => isValidImageUrl(candidate.url))
    .sort((a, b) => rank(b.claim) - rank(a.claim));

  // Favicons and loading placeholders make poor logos; prefer anything else.
  const looksLikePlaceholder = (claim: WebClaim): boolean => {
    const rawValue = claim.claim_value || '';
    return (
      claim.extraction_method === 'favicon_link' ||
      rawValue.includes('favicon') ||
      rawValue.includes('loading')
    );
  };

  const best = candidates.find(candidate => !looksLikePlaceholder(candidate.claim)) || candidates[0];
  return best?.url;
}
/**
* Safely extract a year number from a value that might be:
* - A number (return as-is)
@ -985,7 +1118,11 @@ function featureToInstitution(feature: GeoAPIFeature): Institution {
dissolution_year: safeExtractYear(props.dissolution_year),
social_media: normalizeSocialMedia(props.social_media),
youtube: normalizeYouTubeEnrichment(props.youtube_enrichment),
logo_url: props.logo_url,
// Extract logo URL from web_claims (primary) or use direct logo_url prop (fallback)
// Priority: web_claims logo_img_attr > web_claims og_image > props.logo_url
// Only use props.logo_url if it's a valid absolute image URL (not relative, not video)
logo_url: extractLogoFromWebClaims(props.web_claims) ||
(props.logo_url && isValidImageUrl(props.logo_url) ? props.logo_url : undefined),
};
}
@ -1578,6 +1715,11 @@ function detailResponseToInstitution(data: Record<string, unknown>): Institution
// Handle social media
const socialMedia = data.social_media as Record<string, string> | undefined;
// Extract logo URL from web_claims (primary) or use direct logo_url (fallback)
// Priority: web_claims logo_img_attr > web_claims og_image > props.logo_url
const logoUrl = extractLogoFromWebClaims(data.web_claims) ||
(data.logo_url && isValidImageUrl(data.logo_url as string) ? data.logo_url as string : undefined);
return {
lat: data.lat as number,
lon: data.lon as number,
@ -1621,6 +1763,7 @@ function detailResponseToInstitution(data: Record<string, unknown>): Institution
dissolution_year: dissolutionYear,
social_media: socialMedia,
youtube,
logo_url: logoUrl,
};
}
@ -1682,7 +1825,7 @@ export function useInstitutionDetail(ghcid: string | null): UseInstitutionDetail
// Transform it to Institution format
const inst = detailResponseToInstitution(data);
console.log('[GeoAPI Detail] Loaded institution:', inst.name);
console.log('[GeoAPI Detail] Loaded institution:', inst.name, 'logo_url:', inst.logo_url, 'youtube:', inst.youtube);
// Store in cache
detailCache.set(ghcid, { data: inst, timestamp: Date.now() });

View file

@ -354,21 +354,56 @@ async function callDSPy(
answer: m.role === 'assistant' ? m.content : '',
})).filter(m => m.question || m.answer) || [];
const response = await fetch(`${DSPY_URL}/query`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
question,
language: options.language || 'nl',
context: conversationContext, // Backend expects conversation history here
include_visualization: true,
}),
});
let response: Response;
try {
response = await fetch(`${DSPY_URL}/query`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
question,
language: options.language || 'nl',
context: conversationContext, // Backend expects conversation history here
include_visualization: true,
}),
});
} catch (networkError) {
// Network error - server unreachable
console.error('[DSPy] Network error:', networkError);
const lang = options.language || 'nl';
return {
answer: lang === 'nl'
? '⚠️ **Serverfout**: Kan geen verbinding maken met de RAG-server. Controleer of de backend draait op poort 8003.'
: '⚠️ **Server Error**: Cannot connect to RAG server. Check if backend is running on port 8003.',
confidence: 0,
};
}
if (!response.ok) {
// Fallback response if DSPy service unavailable
// HTTP error - log details for debugging
console.error(`[DSPy] HTTP ${response.status}: ${response.statusText}`);
const lang = options.language || 'nl';
if (response.status === 404) {
return {
answer: lang === 'nl'
? '⚠️ **Serverfout (404)**: De RAG API endpoint is niet gevonden. Controleer de proxy configuratie in vite.config.ts en herstart de frontend.'
: '⚠️ **Server Error (404)**: RAG API endpoint not found. Check proxy configuration in vite.config.ts and restart frontend.',
confidence: 0,
};
}
if (response.status >= 500) {
return {
answer: lang === 'nl'
? `⚠️ **Serverfout (${response.status})**: De RAG-server heeft een interne fout. Controleer de backend logs.`
: `⚠️ **Server Error (${response.status})**: RAG server internal error. Check backend logs.`,
confidence: 0,
};
}
// Other HTTP errors - fall back to context-based answer
return {
answer: generateFallbackAnswer(question, context, options.language || 'nl'),
answer: generateFallbackAnswer(question, context, lang),
confidence: 0.5,
};
}

View file

@ -137,6 +137,135 @@ interface FullFeature {
properties: Record<string, unknown>;
}
// Shape of a single entry in the web_claims JSON, used for logo extraction.
// Every field is optional. Within this file the logo extraction reads
// claim_type, claim_value, source_url and extraction_method; the remaining
// fields are carried along but not interpreted here.
interface WebClaim {
// Kind of claim; the logo extraction only considers claims where this is 'logo'.
claim_type?: string;
// The claimed value; for logo claims it is treated as a (possibly relative) image URL.
claim_value?: string;
// Presumably the value as found on the page before normalisation - TODO confirm with the scraper.
raw_value?: string;
// Page the claim was taken from; used as the base when resolving a relative claim_value.
source_url?: string;
// Presumably the retrieval date/time as a string - format not visible here, TODO confirm.
retrieved_on?: string;
// Presumably the XPath of the source element in the scraped page - TODO confirm.
xpath?: string;
// How the claim was extracted; values ranked by the logo extraction are
// 'logo_img_attr', 'og_image' and 'favicon_link'.
extraction_method?: string;
}
/**
 * Parse web_claims from a JSON string or an already-decoded value.
 *
 * Accepted shapes:
 *  - an array of claim objects,
 *  - an object wrapping that array under a `claims` key,
 *  - a JSON string encoding either of the above.
 *
 * Anything else (invalid JSON, primitives, a `claims` value that is not an
 * array) yields `undefined`. Entries that are not objects are dropped so
 * callers can safely dereference every returned claim.
 *
 * @param value - Raw `web_claims` value taken from the feature properties.
 * @returns A non-empty array of claims, or `undefined` when none are usable.
 */
function parseWebClaims(value: unknown): WebClaim[] | undefined {
  if (!value) return undefined;

  let decoded: unknown = value;
  if (typeof value === 'string') {
    try {
      decoded = JSON.parse(value);
    } catch {
      // Malformed JSON is treated the same as "no claims".
      return undefined;
    }
  }

  // Claims are either the array itself or wrapped as { claims: [...] }.
  let candidate: unknown;
  if (Array.isArray(decoded)) {
    candidate = decoded;
  } else if (typeof decoded === 'object' && decoded !== null) {
    candidate = (decoded as { claims?: unknown }).claims;
  }

  // A non-array here (e.g. a string) must not be handed out as WebClaim[]:
  // callers call .filter() on the result outside of any try/catch.
  if (!Array.isArray(candidate)) return undefined;

  const claims = candidate.filter(
    (entry): entry is WebClaim => typeof entry === 'object' && entry !== null,
  );
  return claims.length > 0 ? claims : undefined;
}
/**
 * Turn a possibly relative URL into an absolute one.
 *
 * - `http://` and `https://` URLs are returned untouched.
 * - Protocol-relative URLs (`//host/path`) are given an `https:` scheme.
 * - Anything else is resolved against `baseUrl` when one is supplied; if no
 *   base is given, or resolution fails, the input is returned unchanged.
 *
 * @param url - URL to resolve.
 * @param baseUrl - Optional absolute URL to resolve relative inputs against.
 * @returns The resolved URL, or `url` itself when it cannot be resolved.
 */
function resolveUrl(url: string, baseUrl?: string): string {
  if (url.startsWith('//')) {
    return `https:${url}`;
  }
  if (url.startsWith('http://') || url.startsWith('https://')) {
    return url;
  }
  if (!baseUrl) {
    return url;
  }

  try {
    const origin = new URL(baseUrl);
    const absolute = new URL(url, origin);
    return absolute.href;
  } catch {
    // Unparsable base or target: hand the input back as-is.
    return url;
  }
}
/**
 * Decide whether a URL is usable as an image source.
 *
 * A URL is accepted when it is absolute (`http://` or `https://`, scheme
 * compared case-insensitively), does not point at a known video/social site,
 * and its path does not end in a video file extension.
 *
 * The extension test looks only at the end of the URL path. Matching the
 * extensions anywhere in the URL would reject legitimate hosts such as
 * `www.aviodrome.nl` (contains ".avi") or `www.movingimage.org` (".mov").
 *
 * @param url - Candidate URL; relative URLs must be resolved beforehand.
 * @returns `true` when the URL may be used as an image source.
 */
function isValidImageUrl(url: string): boolean {
  if (!url) return false;

  const lowerUrl = url.toLowerCase();

  // Must be an absolute http(s) URL.
  if (!lowerUrl.startsWith('http://') && !lowerUrl.startsWith('https://')) {
    return false;
  }

  // Video / social pages are never images.
  const blockedSites = [
    'youtube.com/watch',
    'youtu.be/',
    'vimeo.com/',
    'twitter.com/',
    'facebook.com/',
  ];
  if (blockedSites.some(pattern => lowerUrl.includes(pattern))) {
    return false;
  }

  // Video files are recognised by the extension at the end of the path,
  // ignoring any query string or fragment.
  const videoExtensions = ['.mp4', '.webm', '.mov', '.avi'];
  let path: string;
  try {
    path = new URL(url).pathname.toLowerCase();
  } catch {
    // Not parsable by the URL API: strip query/fragment by hand instead.
    path = lowerUrl.split(/[?#]/)[0];
  }
  return !videoExtensions.some(ext => path.endsWith(ext));
}
/**
 * Pick the best logo URL out of a `web_claims` payload.
 *
 * Only claims with `claim_type === 'logo'` and a non-empty `claim_value` are
 * considered. Each value is resolved against the claim's `source_url` and
 * kept only if the result is a valid image URL. Among the remaining
 * candidates the order of preference is:
 *
 *   logo_img_attr > og_image > favicon_link > any other extraction method
 *
 * Favicons and values containing "favicon" or "loading" are used only when
 * no other valid candidate exists.
 *
 * @param webClaimsValue - Raw `web_claims` value (JSON string, array or wrapper object).
 * @returns An absolute logo URL, or `undefined` when no usable logo claim exists.
 */
function extractLogoFromWebClaims(webClaimsValue: unknown): string | undefined {
  const claims = parseWebClaims(webClaimsValue);
  if (!claims || claims.length === 0) return undefined;

  const logoClaims = claims.filter(c => c.claim_type === 'logo' && c.claim_value);
  if (logoClaims.length === 0) return undefined;

  // Built once rather than on every comparison; unknown methods rank lowest.
  const priority = new Map<string, number>([
    ['logo_img_attr', 3],
    ['og_image', 2],
    ['favicon_link', 1],
  ]);
  const rank = (claim: WebClaim): number => priority.get(claim.extraction_method || '') ?? 0;

  // Resolve and validate first, so that every candidate - including the
  // last-resort fallback below - is guaranteed to be a usable image URL.
  const candidates = logoClaims
    .map(claim => ({ claim, url: resolveUrl(claim.claim_value || '', claim.source_url) }))
    .filter(candidate => isValidImageUrl(candidate.url))
    .sort((a, b) => rank(b.claim) - rank(a.claim));

  // Favicons and loading placeholders make poor logos; prefer anything else.
  const looksLikePlaceholder = (claim: WebClaim): boolean => {
    const rawValue = claim.claim_value || '';
    return (
      claim.extraction_method === 'favicon_link' ||
      rawValue.includes('favicon') ||
      rawValue.includes('loading')
    );
  };

  const best = candidates.find(candidate => !looksLikePlaceholder(candidate.claim)) || candidates[0];
  return best?.url;
}
function fullFeatureToInstitution(feature: FullFeature): Institution {
const props = feature.properties;
const [lon, lat] = feature.geometry.coordinates;
@ -149,6 +278,13 @@ function fullFeatureToInstitution(feature: FullFeature): Institution {
const socialMedia = parseSocialMedia(props.social_media);
const youtube = parseYouTube(props.youtube_enrichment);
// Extract logo URL from web_claims (primary) or use direct logo_url prop (fallback)
// Priority: web_claims logo_img_attr > web_claims og_image > props.logo_url
// Only use props.logo_url if it's a valid absolute image URL (not relative, not video)
const webClaimsLogo = extractLogoFromWebClaims(props.web_claims);
const fallbackLogo = props.logo_url as string | undefined;
const logoUrl = webClaimsLogo || (fallbackLogo && isValidImageUrl(fallbackLogo) ? fallbackLogo : undefined);
return {
lat,
lon,
@ -183,6 +319,7 @@ function fullFeatureToInstitution(feature: FullFeature): Institution {
dissolution_year: safeExtractYear(props.dissolution_year),
social_media: socialMedia,
youtube,
logo_url: logoUrl,
};
}

View file

@ -67,7 +67,7 @@ const CACHE_KEY = 'all_institutions';
const DEFAULT_CONFIG: InstitutionsCacheConfig = {
staleTtlMs: 1 * 60 * 60 * 1000, // 1 hour - trigger background refresh
expiredTtlMs: 24 * 60 * 60 * 1000, // 24 hours - force foreground refresh
cacheVersion: '1.0.0',
cacheVersion: '1.1.0', // Bumped from 1.0.0 to include logo_url in cached data
};
// ============================================================================

View file

@ -11,7 +11,7 @@
* - Migration support for format changes
*/
const STORAGE_VERSION = 1;
const STORAGE_VERSION = 2; // Incremented to trigger migration to progressive default
const STORAGE_KEY_PREFIX = 'rdf-visualizer';
/**
@ -268,10 +268,19 @@ export function clearRecentQueries(): boolean {
* Migrate UI state from old version to current
*/
function migrateUIState(oldState: UIState): UIState {
  let migrated: UIState = oldState;

  // v1 -> v2: every pre-v2 state is switched to the 'progressive' data
  // backend, the new recommended default.
  if (migrated.version < 2) {
    console.log('Migration v1->v2: Setting dataBackend to progressive (new recommended default)');
    migrated = { ...migrated, dataBackend: 'progressive' };
  }

  // Later migration steps (v2 -> v3, ...) belong here, in version order.

  // Stamp the current storage version, then layer the result over the
  // defaults so fields missing from the stored state get default values.
  const stamped = { ...migrated, version: STORAGE_VERSION };
  return deepMerge(DEFAULT_UI_STATE, stamped);
}

View file

@ -36,7 +36,8 @@
grid-template-columns: 1fr;
gap: 0;
height: 100%;
overflow: hidden;
min-height: 0;
overflow: visible;
}
.conversation-layout--with-viz {
@ -115,6 +116,7 @@
display: flex;
flex-direction: column;
height: 100%;
min-height: 0;
background: white;
border-right: 1px solid var(--border-color, #e5e5e5);
}
@ -151,6 +153,37 @@
color: #ffd700;
}
.conversation-chat__header {
display: flex;
justify-content: space-between;
align-items: center;
}
.conversation-chat__new-btn {
display: flex;
align-items: center;
gap: 6px;
padding: 8px 14px;
background: rgba(255, 255, 255, 0.15);
border: 1px solid rgba(255, 255, 255, 0.3);
border-radius: 8px;
color: white;
font-size: 0.875rem;
font-weight: 500;
cursor: pointer;
transition: all 0.2s ease;
}
.conversation-chat__new-btn:hover {
background: rgba(255, 255, 255, 0.25);
border-color: rgba(255, 255, 255, 0.5);
}
.conversation-chat__new-btn:active {
background: rgba(255, 255, 255, 0.3);
transform: scale(0.98);
}
/* ============================================================================
Input Area
============================================================================ */
@ -375,6 +408,17 @@
background: #fff5f5;
}
.conversation-chat__action-btn--warning {
border-color: #f59e0b;
color: #b45309;
}
.conversation-chat__action-btn--warning:hover:not(:disabled) {
border-color: #d97706;
color: #d97706;
background: #fffbeb;
}
/* History dropdown */
.conversation-chat__history-selector {
position: relative;
@ -475,6 +519,7 @@
display: flex;
flex-direction: column;
gap: 16px;
min-height: 0;
}
/* Welcome state */
@ -482,11 +527,12 @@
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
justify-content: flex-start;
text-align: center;
padding: 48px 24px;
max-width: 600px;
margin: 0 auto;
min-height: min-content;
}
.conversation-chat__welcome-header {
@ -588,6 +634,7 @@
margin: 0;
white-space: pre-wrap;
word-wrap: break-word;
color: inherit;
}
.conversation-message__loading {

View file

@ -43,6 +43,8 @@ import {
Layers,
Database,
Zap,
RefreshCw,
Plus,
} from 'lucide-react';
import { useLanguage } from '../contexts/LanguageContext';
import { useMultiDatabaseRAG, type RAGResponse, type ConversationMessage, type VisualizationType, type InstitutionData } from '../hooks/useMultiDatabaseRAG';
@ -110,6 +112,8 @@ const TEXT = {
export: { nl: 'Export', en: 'Export' },
import: { nl: 'Import', en: 'Import' },
clear: { nl: 'Wis', en: 'Clear' },
new: { nl: 'Nieuw', en: 'New' },
newConversation: { nl: 'Nieuw gesprek starten', en: 'Start new conversation' },
embeddings: { nl: 'Embeddings', en: 'Embeddings' },
advanced: { nl: 'Geavanceerd', en: 'Advanced' },
simple: { nl: 'Eenvoudig', en: 'Simple' },
@ -1032,6 +1036,15 @@ const ConversationPage: React.FC = () => {
<p>{t('pageSubtitle')}</p>
</div>
</div>
{/* New conversation - label and tooltip come from TEXT so they follow the UI language */}
<button
  className="conversation-chat__new-btn"
  onClick={handleClearConversation}
  title={t('newConversation')}
  type="button"
>
  <Plus size={20} />
  <span>{t('new')}</span>
</button>
</div>
{/* Input Area - Top */}
@ -1183,6 +1196,17 @@ const ConversationPage: React.FC = () => {
</button>
)}
{/* Reset Cache - Warning Style */}
<button
className="conversation-chat__action-btn conversation-chat__action-btn--warning"
onClick={handleClearCache}
title={t('clearCache')}
type="button"
>
<RefreshCw size={16} />
<span>{t('clearCache')}</span>
</button>
{/* Cache Status Indicator */}
{lastCacheLookup && (
<div className={`conversation-cache-status ${lastCacheLookup.found ? 'conversation-cache-status--hit' : 'conversation-cache-status--miss'}`}>

View file

@ -4,7 +4,7 @@
max-width: none;
margin: 0;
padding: 1rem 1.5rem;
min-height: calc(100vh - 60px);
padding-bottom: 2rem;
animation: fadeIn 0.5s ease-in;
}

View file

@ -250,6 +250,15 @@
font-size: 1.25rem;
}
.card-logo {
width: 28px;
height: 28px;
border-radius: 4px;
object-fit: contain;
background: #f5f7fa;
flex-shrink: 0;
}
.type-badge {
font-size: 0.7rem;
font-weight: 600;
@ -453,6 +462,15 @@
font-size: 2.5rem;
}
.modal-logo {
width: 56px;
height: 56px;
border-radius: 8px;
object-fit: contain;
background: #f5f7fa;
flex-shrink: 0;
}
.modal-header h2 {
font-size: 1.25rem;
font-weight: 600;
@ -800,6 +818,10 @@
color: #a0a0b0;
}
[data-theme="dark"] .card-logo {
background: #1e1e32;
}
[data-theme="dark"] .card-title {
color: #e0e0e0;
}
@ -880,6 +902,10 @@
color: #e0e0e0;
}
[data-theme="dark"] .modal-logo {
background: #1e1e32;
}
[data-theme="dark"] .detail-section h4 {
color: #a0a0b0;
}

View file

@ -526,11 +526,22 @@ function InstitutionCard({
const typeInfo = TYPE_INFO[institution.type] || TYPE_INFO['U'];
const countryCode = institution.ghcid?.current?.substring(0, 2) || '';
const hasNetwork = hasStaffNetworkData(getCustodianSlug(institution.name));
const [logoError, setLogoError] = useState(false);
return (
<div className="institution-card" style={{ '--type-color': typeInfo.color } as React.CSSProperties}>
<div className="card-header">
<span className="type-icon" title={typeInfo.name}>{typeInfo.icon}</span>
{/* Logo or type icon */}
{institution.logo_url && !logoError ? (
<img
src={institution.logo_url}
alt=""
className="card-logo"
onError={() => setLogoError(true)}
/>
) : (
<span className="type-icon" title={typeInfo.name}>{typeInfo.icon}</span>
)}
<span className="type-badge" style={{ backgroundColor: typeInfo.color }}>
{typeInfo.name}
</span>
@ -587,6 +598,7 @@ function InstitutionDetailModal({
}) {
const t = (key: keyof typeof TEXT) => TEXT[key][language];
const typeInfo = TYPE_INFO[institution.type] || TYPE_INFO['U'];
const [logoError, setLogoError] = useState(false);
// Close on escape key
useEffect(() => {
@ -603,7 +615,17 @@ function InstitutionDetailModal({
<button className="modal-close" onClick={onClose}>×</button>
<div className="modal-header">
<span className="modal-type-icon">{typeInfo.icon}</span>
{/* Logo or type icon */}
{institution.logo_url && !logoError ? (
<img
src={institution.logo_url}
alt=""
className="modal-logo"
onError={() => setLogoError(true)}
/>
) : (
<span className="modal-type-icon">{typeInfo.icon}</span>
)}
<div>
<h2>{institution.name}</h2>
<span className="modal-type-badge" style={{ backgroundColor: typeInfo.color }}>

View file

@ -14,7 +14,7 @@
*/
import { useEffect, useRef, useState, useMemo, useCallback } from 'react';
import { useSearchParams } from 'react-router-dom';
import { useSearchParams, useNavigate } from 'react-router-dom';
import maplibregl from 'maplibre-gl';
import type { StyleSpecification, MapLayerMouseEvent, GeoJSONSource } from 'maplibre-gl';
import 'maplibre-gl/dist/maplibre-gl.css';
@ -207,6 +207,7 @@ function institutionsToGeoJSON(institutions: Institution[]): GeoJSON.FeatureColl
export default function NDEMapPage() {
const [searchParams, setSearchParams] = useSearchParams();
const navigate = useNavigate();
const mapRef = useRef<HTMLDivElement>(null);
const mapInstanceRef = useRef<maplibregl.Map | null>(null);
const [mapReady, setMapReady] = useState(false);
@ -2060,7 +2061,7 @@ export default function NDEMapPage() {
{/* Link to settings for backend selection */}
<p className="settings-hint">
{t('Wijzig databron in', 'Change data source in')} <a href="#" onClick={(e) => { e.preventDefault(); /* Open settings modal */ }}>{t('Instellingen', 'Settings')}</a>
{t('Wijzig databron in', 'Change data source in')} <a href="/settings" onClick={(e) => { e.preventDefault(); navigate('/settings'); }}>{t('Instellingen', 'Settings')}</a>
</p>
</div>
</div>

View file

@ -457,10 +457,13 @@ export interface ExtendedProfileData {
country?: string;
}>;
career_history?: Array<{
organization: string;
role: string;
dates: string;
organization?: string;
company?: string; // Alternative to organization
role?: string;
title?: string; // Alternative to role
dates?: string;
duration?: string;
duration_text?: string; // Alternative to duration
location?: string;
current?: boolean;
company_size?: string;

52
frontend/src/utils/dom.ts Normal file
View file

@ -0,0 +1,52 @@
/**
* DOM utility functions for safe element operations
*/
/**
 * Null-safe `closest()` for arbitrary event targets.
 *
 * Elements are queried directly. Other DOM nodes (text, comment, ...) are
 * queried through their parent element. Targets that are not DOM nodes, or
 * nodes without a parent element, yield `null`.
 *
 * @param target - The event target (may be an Element, another Node, or null)
 * @param selector - CSS selector to match
 * @returns The closest matching element, or null if not found
 */
export function safeClosest(target: EventTarget | null, selector: string): Element | null {
  if (!target) {
    return null;
  }

  if (target instanceof Element) {
    return target.closest(selector);
  }

  // Text nodes, comment nodes, etc. start the search at their parent element.
  if (target instanceof Node && target.parentElement) {
    return target.parentElement.closest(selector);
  }

  return null;
}
/**
 * Tell whether an event target lies within an element matching the selector.
 *
 * @param target - The event target
 * @param selector - CSS selector to match
 * @returns true when `safeClosest` finds a match for the target
 */
export function isTargetInside(target: EventTarget | null, selector: string): boolean {
  const match = safeClosest(target, selector);
  return match !== null;
}
/**
 * Tell whether an event target lies within an element matching at least one
 * of the given selectors.
 *
 * @param target - The event target
 * @param selectors - Array of CSS selectors to match
 * @returns true on the first selector that matches; false if none do (or the list is empty)
 */
export function isTargetInsideAny(target: EventTarget | null, selectors: string[]): boolean {
  for (const selector of selectors) {
    if (isTargetInside(target, selector)) {
      return true;
    }
  }
  return false;
}

View file

@ -55,6 +55,11 @@ export default defineConfig({
changeOrigin: true,
rewrite: (path) => path.replace(/^\/ducklake/, ''),
},
// RAG API proxy (Heritage RAG backend on port 8003)
'/api/rag': {
target: 'http://localhost:8003',
changeOrigin: true,
},
// Generic API fallback
'/api': {
target: 'http://localhost:8000',

View file

@ -21,7 +21,7 @@ python = "^3.11"
# Core data processing
pandas = "^2.1.0"
numpy = "^1.26.0"
numpy = ">=2.0.0"
# Text processing (direct dependencies only)
# NOTE: NLP extraction (NER) is handled by coding subagents via Task tool
@ -47,7 +47,7 @@ rdflib = "^7.0.0"
SPARQLWrapper = "^2.0.0"
# Database and storage
duckdb = "^0.9.0"
duckdb = ">=1.0.0"
sqlalchemy = "^2.0.0"
pyarrow = "^14.0.0"
@ -71,6 +71,9 @@ pydantic-settings = "^2.0.0"
# DSPy for LLM-powered SPARQL generation
dspy-ai = "^2.5.0"
openai = "^1.0.0" # DSPy backend for OpenAI/Anthropic
qdrant-client = "^1.16.2"
sentence-transformers = "^5.2.0"
typedb-driver = "^3.0.0"
[tool.poetry.group.dev.dependencies]
# Testing

View file

@ -88,6 +88,8 @@ enums:
description: Entry requires further enrichment processing
new_entry:
description: Newly added entry not yet enriched
google_maps_searched:
description: Google Maps search attempted but not yet fully enriched
InstitutionTypeCodeEnum:
description: Single-letter GLAMORCUBESFIXPHDNT type codes
@ -184,6 +186,8 @@ enums:
description: LinkedIn profile or company page
GHCID_PREVIOUS:
description: Previous GHCID before relocation or reorganization
OCLC:
description: OCLC (Online Computer Library Center) identifier
LocationResolutionMethodEnum:
description: Method used to resolve settlement location
@ -432,6 +436,9 @@ classes:
organisatie:
range: string
description: Organization name from source
organisation:
range: string
description: Organization name from source (British spelling variant)
isil_code_na:
range: string
description: ISIL code from Nationaal Archief
@ -652,10 +659,19 @@ classes:
range: string
description: Status of Wikidata enrichment for this entry
comment:
range: ReferenceLink
multivalued: true
any_of:
- range: string
- range: ReferenceLink
multivalued: true
inlined_as_list: true
description: Comments about this entry (array of objects with label field)
description: Comments about this entry (can be a string or array of objects with label field)
comments:
any_of:
- range: string
- range: ReferenceLink
multivalued: true
inlined_as_list: true
description: Comments about this entry (string or array of objects with label field)
succeeded_by:
range: ReferenceLink
multivalued: true
@ -668,6 +684,15 @@ classes:
label:
range: string
description: Name/label of the duplicate institution
entry_index:
range: integer
description: Index of the duplicate entry in source data
entry_file:
range: string
description: Filename of the duplicate entry
reason:
range: string
description: Reason why this is considered a duplicate
TimeEntry:
description: Structured time entry from source data
@ -852,6 +877,11 @@ classes:
data_source:
range: string
description: Data source type (CSV_REGISTRY, API_SCRAPING, etc.)
data_sources:
range: string
multivalued: true
inlined_as_list: true
description: List of data sources (e.g., NDE registry, Google Maps, website)
data_tier:
range: DataTierEnum
description: Quality tier of the data
@ -861,6 +891,12 @@ classes:
extraction_method:
range: string
description: Method used to extract the data
enrichment_date:
range: string
description: When enrichment was performed (ISO date string)
enrichment_method:
range: string
description: Method used to enrich the data (e.g., website_research)
confidence_score:
range: float
description: Confidence score (0-1)
@ -894,6 +930,15 @@ classes:
wikidata_property:
range: string
description: Wikidata property ID (e.g., P856)
archive_location:
range: string
description: Location of archived copy (e.g., web/1186/hartebrug.nl)
claim_extracted_from:
range: string
description: Source path from which claim was extracted (e.g., original_entry.reference)
verified_via_web_archive:
range: boolean
description: Whether claim was verified via web archive
ProvenanceSources:
description: Sources organized by type
@ -943,6 +988,52 @@ classes:
multivalued: true
inlined_as_list: true
description: Nationaal Archief ISIL registry source records
whois_research:
range: SourceRecord
multivalued: true
inlined_as_list: true
description: WHOIS domain research source records
manual_research:
range: SourceRecord
multivalued: true
inlined_as_list: true
description: Manual research source records
website:
range: SourceRecord
multivalued: true
inlined_as_list: true
description: Website source records (institution website data)
web_scrape:
range: SourceRecord
multivalued: true
inlined_as_list: true
description: Web scrape source records (scraped website data)
# Data tier summary fields (for provenance summaries)
TIER_1_AUTHORITATIVE:
range: string
multivalued: true
inlined_as_list: true
description: List of TIER_1 authoritative sources
TIER_2_VERIFIED:
range: string
multivalued: true
inlined_as_list: true
description: List of TIER_2 verified sources
TIER_3_CROWD_SOURCED:
range: string
multivalued: true
inlined_as_list: true
description: List of TIER_3 crowd-sourced sources
TIER_4_INFERRED:
range: string
multivalued: true
inlined_as_list: true
description: List of TIER_4 inferred sources
museum_register:
range: SourceRecord
multivalued: true
inlined_as_list: true
description: Museum register source records
SourceRecord:
description: Individual source record with claims
@ -1004,6 +1095,20 @@ classes:
source_file:
range: string
description: Source file name
research_date:
range: string
description: Date of research (YYYY-MM-DD format)
url:
range: uri
description: URL of the source (website URL, etc.)
data_extracted:
range: string
multivalued: true
inlined_as_list: true
description: List of data types/fields extracted from this source
merge_note:
range: string
description: Note about merge operations involving this source record
DataTierSummary:
description: Summary of data tiers present in entry
@ -1034,7 +1139,7 @@ classes:
attributes:
identifier_scheme:
range: IdentifierSchemeEnum
required: true
required: false
description: Type of identifier
identifier_value:
any_of:
@ -1056,6 +1161,14 @@ classes:
notes:
range: string
description: Additional note about this identifier (alias for note)
scheme:
range: string
description: Identifier scheme (alias for identifier_scheme, used in some data sources)
value:
any_of:
- range: string
- range: integer
description: Identifier value (alias for identifier_value, used in some data sources)
# ---------------------------------------------------------------------------
# GHCID BLOCK - Heritage Custodian ID with history
@ -1285,6 +1398,12 @@ classes:
specific_location:
range: string
description: More specific location info within the city (e.g., neighborhood, district)
specific_geonames_id:
range: integer
description: GeoNames ID for the specific location (if different from main city)
correction_note:
range: string
description: Note explaining any correction made to the location resolution
SourceCoordinates:
description: Source of coordinates for resolution
@ -1304,13 +1423,19 @@ classes:
attributes:
type:
range: string
description: Type of research source (e.g., note, wikidata, web_archive, official_source)
description: Type of research source (e.g., note, wikidata, web_archive, official_source, whois)
text:
range: string
description: Text or description of the research source
value:
range: string
description: Value from this source (e.g., plus code, address)
notes:
range: string
description: Additional notes about this source
note:
range: string
description: Additional note about this source (singular alias for notes)
id:
range: string
description: Identifier for the source (e.g., Wikidata Q-number)
@ -1323,6 +1448,56 @@ classes:
coordinates:
range: string
description: Coordinates from this source (e.g., "31.515, 34.434")
data:
range: ResearchSourceData
description: Structured data from the source (e.g., WHOIS registrant info)
ResearchSourceData:
description: Structured data from a research source
attributes:
registrant_name:
range: string
description: WHOIS registrant name
registrant_address:
range: string
description: WHOIS registrant address
registrant_city:
range: string
description: WHOIS registrant city
registrant_state:
range: string
description: WHOIS registrant state/province
registrant_country:
range: string
description: WHOIS registrant country
registrant_postal_code:
range: string
description: WHOIS registrant postal code
# Additional flexible fields for other data types
organization:
range: string
description: Organization name
email:
range: string
description: Contact email
phone:
range: string
description: Contact phone
creation_date:
range: string
description: Domain creation date
updated_date:
range: string
description: Domain updated date
expiration_date:
range: string
description: Domain expiration date
domain_registered:
range: string
description: Domain registration date
registry:
range: string
description: Domain registrar name
# ---------------------------------------------------------------------------
# GOOGLE MAPS ENRICHMENT
@ -1485,8 +1660,10 @@ classes:
inlined_as_list: true
description: Topics mentioned in reviews
reviews_summary:
range: string
description: Summary of reviews
any_of:
- range: string
- range: ReviewsSummary
description: Summary of reviews (string or structured breakdown)
sample_reviews:
any_of:
- range: string
@ -1523,10 +1700,13 @@ classes:
inlined_as_list: true
description: Nearby organizations (strings or structured objects)
features:
range: string
multivalued: true
any_of:
- range: string
multivalued: true
- range: PlaceFeature
multivalued: true
inlined_as_list: true
description: Features of the place
description: Features of the place (strings or key-value objects)
hours_status:
range: string
description: Current opening status (e.g., "Closed · Opens 2 pm Wed")
@ -1603,6 +1783,23 @@ classes:
match_notes:
range: string
description: Notes about how the Google Maps match was determined
price_level:
any_of:
- range: integer
- range: string
description: Google Maps price level (0-4 or string description)
match_warning:
range: string
description: Warning about potential issues with the match
location_note:
range: string
description: Note about the physical location of the place
search_attempted:
range: boolean
description: Whether a Google Maps search was attempted
result:
range: string
description: Result of search operation (found, not_found, found_via_user_link, etc.)
RejectedGoogleMapsData:
description: Rejected Google Maps data preserved for audit trail
@ -1625,6 +1822,53 @@ classes:
returned_country:
range: string
description: Country code actually returned by Google Maps
website:
range: uri
description: Website URL from Google Maps
latitude:
range: float
description: Latitude coordinate
longitude:
range: float
description: Longitude coordinate
enriched_at:
range: datetime
description: When enrichment was performed
PlaceFeature:
description: A feature flag for a place (e.g., native_garden, shop, volunteers)
class_uri: schema:PropertyValue
attributes:
native_garden:
range: boolean
description: Has a native garden
shop:
range: boolean
description: Has a shop
volunteers:
range: boolean
description: Has volunteers
parking:
range: boolean
description: Has parking
cafe:
range: boolean
description: Has a cafe
restaurant:
range: boolean
description: Has a restaurant
gift_shop:
range: boolean
description: Has a gift shop
wheelchair_accessible:
range: boolean
description: Is wheelchair accessible
guided_tours:
range: boolean
description: Offers guided tours
audio_guide:
range: boolean
description: Offers audio guides
LlmVerification:
description: LLM-based verification results for Google Maps matching
@ -1709,6 +1953,25 @@ classes:
minute:
range: integer
ReviewsSummary:
description: Breakdown of reviews by star rating
attributes:
5_star:
range: integer
description: Number of 5-star reviews
4_star:
range: integer
description: Number of 4-star reviews
3_star:
range: integer
description: Number of 3-star reviews
2_star:
range: integer
description: Number of 2-star reviews
1_star:
range: integer
description: Number of 1-star reviews
GoogleReview:
description: Google Maps review
attributes:
@ -1828,8 +2091,10 @@ classes:
wikidata_temporal:
range: WikidataTemporal
wikidata_inception:
range: string
description: Inception date (P571)
any_of:
- range: string
- range: WikidataTimeValue
description: Inception date (P571) - can be string or structured time value
wikidata_classification:
range: WikidataClassification
wikidata_instance_of:
@ -1946,6 +2211,29 @@ classes:
multivalued: true
inlined_as_list: true
description: Search terms attempted when looking for Wikidata entity
wikidata_description_nl:
range: string
description: Description in Dutch language
wikidata_claims:
range: WikidataClaims
description: Structured Wikidata claims with property metadata
inlined: true
_resolved_entities:
range: WikidataResolvedEntities
description: Resolved Wikidata property and entity metadata cache
inlined: true
WikidataClaims:
description: |
Structured Wikidata claims with property metadata and values.
Uses flexible dict-like structure for various claim types.
class_uri: linkml:Any
WikidataResolvedEntities:
description: |
Cache of resolved Wikidata property and entity metadata.
Keys are property IDs (P123), values are property metadata.
class_uri: linkml:Any
WikidataApiMetadata:
description: API call metadata
@ -2058,6 +2346,19 @@ classes:
inlined_as_list: true
description: Main subject (P921)
WikidataTimeValue:
description: Wikidata time value with precision metadata
attributes:
time:
range: string
description: Time value in ISO 8601 format (e.g., +2015-00-00T00:00:00Z)
precision:
range: integer
description: Precision level (9=year, 10=month, 11=day, etc.)
calendarmodel:
range: uri
description: Calendar model URI (e.g., http://www.wikidata.org/entity/Q1985727 for Gregorian)
WikidataEntity:
description: Reference to a Wikidata entity
attributes:
@ -2104,7 +2405,10 @@ classes:
description: Location properties from Wikidata
attributes:
country:
range: WikidataEntity
any_of:
- range: string
- range: WikidataEntity
description: Country Q-ID (can be string or WikidataEntity object)
headquarters_location:
range: WikidataEntity
coordinates:
@ -2158,8 +2462,10 @@ classes:
multivalued: true
inlined_as_list: true
parent_organization:
range: WikidataEntity
description: Parent organization (P749)
any_of:
- range: string
- range: WikidataEntity
description: Parent organization Q-ID or entity (P749)
subsidiary:
range: WikidataEntity
multivalued: true
@ -2433,6 +2739,9 @@ classes:
website_found:
range: boolean
description: Whether a website was found
official_website:
range: uri
description: Official website URL found during research
research_notes:
range: string
description: Notes from research
@ -2504,6 +2813,12 @@ classes:
merger_target:
range: string
description: Target organization in merger
successor_name:
range: string
description: Name of successor organization (for mergers)
successor_location:
range: string
description: Location of successor organization (for mergers)
notes:
range: string
description: Additional notes
@ -2552,6 +2867,16 @@ classes:
type:
range: string
description: Type of collection (oral_history, photographs, documents, etc.)
item_count:
any_of:
- range: integer
- range: string
description: Number of items in the collection (integer or descriptive string)
total_hours:
any_of:
- range: float
- range: string
description: Total hours of content (for audio/video collections)
WebArchiveFailure:
description: Failed archive attempt record
@ -2682,7 +3007,8 @@ classes:
- range: string
- range: string
multivalued: true
description: Extracted value (alias for claim_value, can be string or list)
- range: OpeningHoursMap
description: Extracted value (alias for claim_value, can be string, list, or structured object like opening hours)
raw_value:
range: string
description: Raw value before processing
@ -2807,6 +3133,9 @@ classes:
job_title_en:
range: string
description: Job title in English
department_en:
range: string
description: Department name in English
RawSource:
description: Raw source information for web enrichment
@ -2838,6 +3167,63 @@ classes:
raw_markdown_hash:
range: string
description: SHA-256 hash of the raw markdown content
exa_highlights:
range: string
multivalued: true
inlined_as_list: true
description: Highlighted excerpts from Exa search results
exa_highlight_scores:
range: float
multivalued: true
inlined_as_list: true
description: Relevance scores for Exa highlights
OpeningHoursMap:
description: Opening hours as a day-keyed map
class_uri: schema:OpeningHoursSpecification
attributes:
maandag:
range: string
description: Monday hours (Dutch)
dinsdag:
range: string
description: Tuesday hours (Dutch)
woensdag:
range: string
description: Wednesday hours (Dutch)
donderdag:
range: string
description: Thursday hours (Dutch)
vrijdag:
range: string
description: Friday hours (Dutch)
zaterdag:
range: string
description: Saturday hours (Dutch)
zondag:
range: string
description: Sunday hours (Dutch)
monday:
range: string
description: Monday hours (English)
tuesday:
range: string
description: Tuesday hours (English)
wednesday:
range: string
description: Wednesday hours (English)
thursday:
range: string
description: Thursday hours (English)
friday:
range: string
description: Friday hours (English)
saturday:
range: string
description: Saturday hours (English)
sunday:
range: string
description: Sunday hours (English)
SourceReference:
description: Structured source reference for a claim
@ -2961,8 +3347,12 @@ classes:
range: string
description: Note explaining manual correction made to the name
merge_notes:
range: string
description: Notes about name merging or deduplication
any_of:
- range: string
- range: MergeNote
multivalued: true
inlined_as_list: true
description: Notes about name merging or deduplication (string or array of structured objects)
abbreviation:
range: string
description: Short form or abbreviation of the name
@ -2980,6 +3370,9 @@ classes:
multivalued: true
inlined_as_list: true
description: Previous names the institution was known by (strings or structured objects)
short_name:
range: string
description: Short name or commonly used abbreviated form of the institution name
FormerName:
description: A former name of the institution with optional metadata
@ -3001,6 +3394,19 @@ classes:
range: string
description: Additional notes about this former name
MergeNote:
description: Note about a merge operation between duplicate entries
attributes:
source:
range: string
description: Source entry identifier that was merged
merged_on:
range: string
description: Date when merge occurred (YYYY-MM-DD)
reason:
range: string
description: Reason for the merge (e.g., duplicate Wikidata ID, same place ID)
MatchingSource:
description: Source that contributed to name consensus
attributes:
@ -3290,6 +3696,25 @@ classes:
enrichment_source:
range: string
description: Source of enrichment (e.g., manual_curation, api_scraping)
host_organization:
range: string
description: Organization hosting this platform
host_website:
range: uri
description: Main website of the host organization
language:
range: string
description: Primary language of the platform (ISO 639-1 code)
features:
range: string
multivalued: true
inlined_as_list: true
description: Features of this platform
platforms:
range: string
multivalued: true
inlined_as_list: true
description: Sub-platforms or related platforms
PlatformSourceReference:
description: Structured source reference for a digital platform
@ -3465,6 +3890,12 @@ classes:
override_reason:
range: string
description: Reason for manual coordinate override
source_url:
range: uri
description: URL source of coordinates (e.g., Google Maps link)
note:
range: string
description: Additional note about coordinate provenance
# ---------------------------------------------------------------------------
# ADDITIONAL ENRICHMENT TYPES

View file

@ -1,106 +1,355 @@
#!/usr/bin/env python3
"""
Load TypeDB schemas from files into the glam-heritage database
Load Heritage Custodian schema into TypeDB 3.x.
This script loads the Heritage Custodian Observation & Reconstruction schema
into TypeDB. The schema must be loaded in parts due to TypeDB 3.x requirements:
1. Attributes first (before entities can reference them)
2. Relations second (before entities can play roles)
3. Entities third (can now reference attributes and play roles)
Usage:
python scripts/load_typedb_schema.py [--host HOST] [--port PORT] [--database DATABASE]
Prerequisites:
- TypeDB server running (default: localhost:1729)
- TypeDB Python driver installed (typedb-driver >= 3.0.0)
- Database will be created if it doesn't exist
Example:
# Start TypeDB server
~/.typedb/typedb server
# Load schema
poetry run python scripts/load_typedb_schema.py
TypeDB 3.x Migration Notes:
- Uses Credentials and DriverOptions (not core_driver)
- Uses TransactionType.SCHEMA (not SessionType.SCHEMA + TransactionType.WRITE)
- tx.query().resolve() instead of tx.query.define()
- No sessions - transactions are created directly on driver
- 'entity' is a reserved word - renamed to 'observed-entity' in observation-of relation
"""
import os
from pathlib import Path
from typedb.driver import TypeDB, SessionType, TransactionType
# Configuration
SERVER_ADDRESS = "localhost:1729"
DATABASE_NAME = "glam-heritage"
SCHEMA_DIR = Path("/Users/kempersc/apps/glam/schemas/20251121/typedb")
import argparse
import sys
# Schema files in order
SCHEMA_FILES = [
"01_name_entity_hub.tql",
"02_heritage_custodian.tql",
"03_identifiers.tql",
"04_locations.tql",
"05_digital_platforms.tql",
"06_provenance.tql",
"07_collections.tql",
"08_relationships.tql",
"09_change_events.tql",
"10_rules.tql",
]
def load_schema():
"""Load TypeDB schema files into the database"""
print(f"🔗 Connecting to TypeDB at {SERVER_ADDRESS}...")
def get_schema_parts():
"""Return the schema split into loadable parts.
with TypeDB.core_driver(SERVER_ADDRESS) as driver:
# Check if database exists
if not driver.databases.contains(DATABASE_NAME):
print(f"❌ Database '{DATABASE_NAME}' does not exist!")
print(f" Create it first: typedb console --command='database create {DATABASE_NAME}'")
return False
print(f"✅ Connected to database: {DATABASE_NAME}")
print(f"📂 Schema directory: {SCHEMA_DIR}")
print()
# Load each schema file
for schema_file in SCHEMA_FILES:
schema_path = SCHEMA_DIR / schema_file
if not schema_path.exists():
print(f"⚠️ Skipping {schema_file} (file not found)")
continue
print(f"📝 Loading {schema_file}...")
try:
with driver.session(DATABASE_NAME, SessionType.SCHEMA) as session:
with session.transaction(TransactionType.WRITE) as tx:
# Read schema file
with open(schema_path, 'r') as f:
schema_content = f.read()
# Execute TypeQL define query
tx.query.define(schema_content)
tx.commit()
print(f" ✅ Successfully loaded {schema_file}")
except Exception as e:
print(f" ❌ Error loading {schema_file}: {e}")
return False
print()
print("🎉 All schemas loaded successfully!")
return True
def verify_schema():
"""Verify the loaded schema"""
print("\n🔍 Verifying schema...")
The schema is split into 3 parts that must be loaded in order:
1. Attributes - all attribute type definitions
2. Relations - all relation type definitions with role types
3. Entities - all entity type definitions with owns/plays
with TypeDB.core_driver(SERVER_ADDRESS) as driver:
with driver.session(DATABASE_NAME, SessionType.SCHEMA) as session:
with session.transaction(TransactionType.READ) as tx:
# Get all entity types
result = tx.query.fetch("match $x sub entity; fetch $x;")
entities = list(result)
print(f"✅ Found {len(entities)} entity types")
# Sample a few
for i, entity in enumerate(entities[:5]):
print(f" - {entity}")
if len(entities) > 5:
print(f" ... and {len(entities) - 5} more")
Note: The original .tql file uses 'entity' as a role name in observation-of,
but 'entity' is a reserved word in TypeDB 3.x. This is fixed by renaming
the role to 'observed-entity'.
"""
# Part 1: Attributes
attributes = """
define
attribute id, value string;
attribute created, value datetime;
attribute modified, value datetime;
attribute observed-name, value string;
attribute alternative-observed-name, value string;
attribute observation-date, value datetime;
attribute observation-context, value string;
attribute standardized-name, value string;
attribute endorsement-source, value string;
attribute name-authority, value string;
attribute valid-from, value datetime;
attribute valid-to, value datetime;
attribute legal-name, value string;
attribute legal-form, value string;
attribute registration-number, value string;
attribute registration-date, value datetime;
attribute registration-authority, value string;
attribute dissolution-date, value datetime;
attribute legal-status, value string;
attribute governance-structure, value string;
attribute source-uri, value string;
attribute source-type, value string;
attribute source-date, value datetime;
attribute source-creator, value string;
attribute activity-type, value string;
attribute method, value string;
attribute justification, value string;
attribute started-at-time, value datetime;
attribute ended-at-time, value datetime;
attribute agent-name, value string;
attribute agent-type, value string;
attribute affiliation, value string;
attribute contact, value string;
attribute appellation-value, value string;
attribute appellation-language, value string;
attribute appellation-type, value string;
attribute identifier-scheme, value string;
attribute identifier-value, value string;
attribute begin-of-the-begin, value datetime;
attribute begin-of-the-end, value datetime;
attribute end-of-the-begin, value datetime;
attribute end-of-the-end, value datetime;
attribute confidence-value, value double;
attribute confidence-method, value string;
attribute language-code-value, value string;
"""
# Part 2: Relations
relations = """
define
relation derivation,
relates derived-entity,
relates source-entity;
relation generation,
relates generated-entity,
relates generating-activity;
relation revision,
relates revised-entity,
relates prior-version;
relation activity-association,
relates activity,
relates agent;
relation activity-usage,
relates activity,
relates used-source;
relation source-citation,
relates observation,
relates source;
relation organizational-hierarchy,
relates parent,
relates child;
relation name-succession,
relates predecessor,
relates successor;
relation has-appellation,
relates subject,
relates appellation;
relation has-identifier,
relates subject,
relates identifier;
relation observation-of,
relates observation,
relates observed-entity;
"""
# Part 3: Entities
entities = """
define
entity custodian @abstract,
owns id,
owns created,
owns modified,
plays derivation:derived-entity,
plays derivation:source-entity,
plays generation:generated-entity,
plays observation-of:observation,
plays observation-of:observed-entity;
entity custodian-observation sub custodian,
owns observed-name,
owns alternative-observed-name,
owns observation-date,
owns observation-context,
owns confidence-value,
owns confidence-method,
plays source-citation:observation,
plays has-appellation:subject;
entity custodian-name sub custodian-observation,
owns standardized-name,
owns endorsement-source,
owns name-authority,
owns valid-from,
owns valid-to,
plays name-succession:predecessor,
plays name-succession:successor;
entity custodian-reconstruction sub custodian,
owns legal-name,
owns legal-form,
owns registration-number,
owns registration-date,
owns registration-authority,
owns dissolution-date,
owns legal-status,
owns governance-structure,
plays has-identifier:subject,
plays organizational-hierarchy:parent,
plays organizational-hierarchy:child,
plays revision:revised-entity,
plays revision:prior-version;
entity source-document,
owns id,
owns source-uri,
owns source-type,
owns source-date,
owns source-creator,
plays source-citation:source,
plays activity-usage:used-source;
entity reconstruction-activity,
owns id,
owns activity-type,
owns method,
owns justification,
owns started-at-time,
owns ended-at-time,
plays generation:generating-activity,
plays activity-association:activity,
plays activity-usage:activity;
entity agent,
owns id,
owns agent-name,
owns agent-type,
owns affiliation,
owns contact,
plays activity-association:agent;
entity appellation,
owns appellation-value,
owns appellation-language,
owns appellation-type,
plays has-appellation:appellation;
entity identifier,
owns identifier-scheme,
owns identifier-value,
plays has-identifier:identifier;
entity time-span,
owns begin-of-the-begin,
owns begin-of-the-end,
owns end-of-the-begin,
owns end-of-the-end;
entity confidence-measure,
owns confidence-value,
owns confidence-method;
entity language-code,
owns language-code-value;
"""
return [
("Attributes", attributes),
("Relations", relations),
("Entities", entities),
]
def load_schema(
    host: str = "localhost",
    port: int = 1729,
    database: str = "heritage_custodians",
    username: str = "admin",
    password: str = "password",
):
    """Load the Heritage Custodian schema into TypeDB.

    The schema parts returned by ``get_schema_parts()`` are defined one after
    another, each in its own SCHEMA transaction, and afterwards the instance
    count of the three custodian types is printed as a smoke test.

    The process exits with status 1 (via ``sys.exit``) when the driver package
    is missing, the server cannot be reached, or a schema part fails to load
    for a reason other than "already exists"/"redefinition".

    Args:
        host: TypeDB server host
        port: TypeDB server port
        database: Database name (will be created if it doesn't exist)
        username: TypeDB user to authenticate as
        password: Password for ``username``
    """
    try:
        from typedb.driver import TypeDB, Credentials, DriverOptions, TransactionType
    except ImportError:
        print("Error: typedb-driver not installed. Run: poetry add typedb-driver")
        sys.exit(1)

    # Connect to TypeDB
    address = f"{host}:{port}"
    credentials = Credentials(username, password)
    options = DriverOptions(is_tls_enabled=False)

    print(f"Connecting to TypeDB at {address}...")
    try:
        driver = TypeDB.driver(address, credentials, options)
    except Exception as e:
        print(f"Error connecting to TypeDB: {e}")
        print("Make sure TypeDB server is running: ~/.typedb/typedb server")
        sys.exit(1)

    # Everything below runs under try/finally so the driver is closed on every
    # exit path, including sys.exit() and unexpected driver exceptions.
    try:
        # Check if database exists
        db_names = [db.name for db in driver.databases.all()]
        if database not in db_names:
            print(f"Database '{database}' not found. Creating...")
            driver.databases.create(database)
            print(f"Created database '{database}'")
        else:
            print(f"Using existing database '{database}'")

        # Load schema parts in the order get_schema_parts() returns them
        print()
        for name, schema in get_schema_parts():
            print(f"Loading {name}...")
            try:
                with driver.transaction(database, TransactionType.SCHEMA) as tx:
                    tx.query(schema).resolve()
                    tx.commit()
                print(f"{name} loaded successfully")
            except Exception as e:
                error_msg = str(e).lower()
                # Re-running the loader against an already populated database
                # is treated as a no-op rather than a failure.
                if "already exists" in error_msg or "redefinition" in error_msg:
                    print(f"{name} already exists (skipping)")
                else:
                    print(f" ✗ Error loading {name}: {e}")
                    sys.exit(1)

        # Verify schema loaded: counting instances only succeeds if the type exists
        print("\nVerifying schema...")
        with driver.transaction(database, TransactionType.READ) as tx:
            for type_name in ["custodian-observation", "custodian-name", "custodian-reconstruction"]:
                count_query = f"""
                    match
                    $inst isa {type_name};
                    reduce $count = count;
                """
                answer = tx.query(count_query).resolve()
                # Only the first row of the answer is inspected.
                for row in answer:
                    value = row.get("count")
                    count = value.get_integer() if hasattr(value, 'get_integer') else int(str(value))
                    print(f" {type_name}: {count} entities")
                    break
    finally:
        driver.close()

    print("\n✓ Schema loaded successfully!")
    print(f"\nDatabase '{database}' is ready for data ingestion.")
def main():
    """Command-line entry point: read connection options and load the schema."""
    cli = argparse.ArgumentParser(
        description="Load Heritage Custodian schema into TypeDB 3.x"
    )

    # Each entry is (flag, keyword arguments for add_argument).
    option_specs = (
        ("--host", {"default": "localhost", "help": "TypeDB server host (default: localhost)"}),
        ("--port", {"type": int, "default": 1729, "help": "TypeDB server port (default: 1729)"}),
        (
            "--database",
            {
                "default": "heritage_custodians",
                "help": "Database name (default: heritage_custodians)",
            },
        ),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    parsed = cli.parse_args()
    load_schema(host=parsed.host, port=parsed.port, database=parsed.database)
if __name__ == "__main__":
print("=" * 60)
print("TypeDB Schema Loader")
print("=" * 60)
print()
success = load_schema()
if success:
verify_schema()
print()
print("=" * 60)
main()

View file

@ -0,0 +1,472 @@
#!/usr/bin/env python3
"""
Test PiCo extraction with Arabic waqf (endowment) document example.
This script tests the GLM annotator's ability to extract person observations
from Arabic historical documents following the PiCo ontology pattern.
Usage:
python scripts/test_pico_arabic_waqf.py
Environment Variables:
ZAI_API_TOKEN - Required for Z.AI GLM-4.6 API
"""
import asyncio
import json
import os
import sys
from pathlib import Path
from datetime import datetime, timezone
import httpx
# Load environment variables from .env file
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
try:
from dotenv import load_dotenv
load_dotenv(project_root / ".env")
except ImportError:
pass # dotenv not required if env vars set directly
# Z.AI API configuration (per AGENTS.md Rule 11)
# GLM-4.6 uses reasoning mode - essential for complex historical document extraction
# Requires higher max_tokens to accommodate reasoning + output
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_MODEL = "glm-4.6"
# Arabic waqf document example (from pico.yaml)
ARABIC_WAQF_TEXT = """بسم الله الرحمن الرحيم
هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة
حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري. وقف جميع داره الكائنة
بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح
الله فعلى فقراء المسلمين. وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف
التركماني، والسيد علي بن حسين الحلبي. وكتب في شهر رجب سنة ألف ومائتين
وخمس وعشرين هجرية."""
# PiCo extraction system prompt (abbreviated version for testing)
PICO_SYSTEM_PROMPT = """You are a historical document annotator following the PiCo (Person in Context) ontology.
Extract ALL persons mentioned in the source text, capturing:
1. Names using PNV (Person Name Vocabulary) structure
2. Roles in the source document
3. Biographical information
4. Family relationships between persons in THIS source
5. For Arabic texts: include both original script AND romanized versions
### Arabic Naming Conventions
- ابن/بن (ibn/bin): son of (patronymic)
- بنت (bint): daughter of
- الحاج (al-Hajj): honorific for pilgrimage completer
- السيد (al-Sayyid): honorific (descendant of Prophet)
- المرحوم (al-marhum): the late (deceased male)
- آل (Al): family of
### Family Relationship Keys
- parent: array of person references (person_index + target_name)
- children: array of person references
- spouse: array of person references
### Output Format
Return ONLY valid JSON:
{
"pico_observation": {
"observation_id": "<source-derived-id>",
"observed_at": "<ISO-timestamp>",
"source_type": "<category>",
"source_reference": "<identifier>"
},
"persons": [
{
"person_index": 0,
"pnv_name": {
"literalName": "Name in original script",
"literalName_romanized": "Romanized name",
"givenName": "Given name",
"givenName_romanized": "Romanized given name",
"patronym": "Father's name",
"patronym_romanized": "Romanized patronym",
"baseSurname": "Family/tribal name",
"baseSurname_romanized": "Romanized surname",
"honorificPrefix": "Title/honorific",
"honorificPrefix_romanized": "Romanized honorific"
},
"roles": [
{
"role_title": "Role as stated",
"role_title_romanized": "Romanized role",
"role_in_source": "founder|witness|beneficiary|null"
}
],
"biographical": {
"deceased": true/false/null,
"address": "Location if mentioned"
},
"family_relationships": {
"parent": [{"person_index": N, "target_name": "Name"}],
"children": [{"person_index": N, "target_name": "Name"}]
},
"context": "Brief description of person's role"
}
],
"temporal_references": [
{
"expression": "Original text",
"expression_romanized": "Romanized",
"normalized": "ISO date or approximate",
"calendar": "Hijri|Gregorian",
"type": "DATE"
}
],
"locations_mentioned": [
{
"name": "Original name",
"name_romanized": "Romanized",
"type": "city|neighborhood"
}
]
}"""
def _extract_json_text(content: str) -> str:
    """Return the JSON payload of a model reply, unwrapping a markdown code fence if present."""
    if "```json" in content:
        # A missing closing fence (truncated reply) simply yields everything
        # after the opening fence.
        return content.split("```json", 1)[1].split("```", 1)[0]
    if "```" in content:
        fenced = content.split("```", 2)[1]
        # Drop a language tag such as "JSON" that shares the opening fence line;
        # left in place it would make the payload unparseable.
        tag, newline, remainder = fenced.partition("\n")
        if newline and tag.strip().isalpha():
            return remainder
        return fenced
    return content


async def call_glm_api(system_prompt: str, user_content: str) -> dict:
    """Call Z.AI GLM-4.6 API and return parsed JSON response.

    The raw message content is written to
    ``data/entity_annotation/test_outputs/raw_response_<timestamp>.txt`` under
    the project root before parsing, so a failed parse can be inspected.

    Args:
        system_prompt: Content of the system message.
        user_content: Content of the user message.

    Returns:
        The JSON object decoded from the model's reply.

    Raises:
        ValueError: If ZAI_API_TOKEN is not set.
        httpx.HTTPStatusError: If the API answers with an error status.
        json.JSONDecodeError: If the reply is not valid JSON even after the
            truncation repair attempt (this includes an empty reply).
    """
    api_token = os.environ.get("ZAI_API_TOKEN")
    if not api_token:
        raise ValueError("ZAI_API_TOKEN not set in environment")

    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": ZAI_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        "temperature": 0.1,  # Low temperature for consistent extraction
        "max_tokens": 16000,  # High limit for GLM-4.6 reasoning mode + output
    }

    async with httpx.AsyncClient(timeout=300.0) as client:  # 5 min timeout for GLM-4.6 reasoning
        response = await client.post(ZAI_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        result = response.json()

    # A null/absent content is normalised to "" so it surfaces below as a JSON
    # parse error instead of a TypeError while writing the debug file.
    content = result["choices"][0]["message"].get("content") or ""

    # Save raw response for debugging
    raw_output_path = project_root / "data/entity_annotation/test_outputs"
    raw_output_path.mkdir(parents=True, exist_ok=True)
    raw_file = raw_output_path / f"raw_response_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
    with open(raw_file, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f" Raw response saved to: {raw_file.name}")

    # Parse JSON from response (handle markdown code blocks)
    json_content = _extract_json_text(content).strip()

    # Try to parse, with fallback for truncated JSON
    try:
        return json.loads(json_content)
    except json.JSONDecodeError as e:
        print(f"\n⚠️ JSON parse error at position {e.pos}, attempting repair...")
        # Try to repair truncated JSON by closing brackets
        repaired = repair_truncated_json(json_content)
        return json.loads(repaired)
def repair_truncated_json(json_str: str) -> str:
    """Attempt to repair truncated JSON by closing open brackets.

    The text is scanned once, tracking string literals (so brackets inside
    strings are ignored) and a stack of the closers still owed. Two repairs
    are tried in order:

    1. Keep all of the text and append the owed closers in nesting order.
    2. If that is not valid JSON (or the text ends inside a string), cut back
       to the last structural boundary - just after an opening/closing bracket
       or just before a comma - and append the closers there.

    Args:
        json_str: Possibly truncated JSON text.

    Returns:
        The repaired text. It is best-effort: the result of the second repair
        is returned without validation, so callers must still be prepared for
        ``json.loads`` to fail on it.
    """
    text = json_str.rstrip()

    closers = []        # closing characters owed, innermost last
    in_string = False
    escaped = False
    safe_cut = 0        # index at which the text can be cut without splitting a value

    for index, char in enumerate(text):
        if in_string:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char in "{[":
            closers.append("}" if char == "{" else "]")
            safe_cut = index + 1
        elif char in "}]":
            if closers and closers[-1] == char:
                closers.pop()
            safe_cut = index + 1
        elif char == ",":
            # Cut before the comma so no dangling separator is left behind.
            safe_cut = index

    # Every push/pop also moves safe_cut, so the stack at safe_cut is the same
    # as the stack at the end of the text and one suffix serves both repairs.
    suffix = "".join(reversed(closers))

    def _close(fragment: str) -> str:
        fragment = fragment.rstrip()
        # Remove trailing comma if present
        if fragment.endswith(","):
            fragment = fragment[:-1]
        return fragment + suffix

    if not in_string:
        candidate = _close(text)
        try:
            json.loads(candidate)
        except json.JSONDecodeError:
            # The tail is an incomplete key/value; fall through and cut it off.
            pass
        else:
            return candidate

    return _close(text[:safe_cut])
def validate_extraction(result: dict) -> tuple[bool, list[str]]:
    """Validate the extraction result against expected structure.

    Checks the top-level keys, the per-person structure, the presence of the
    three expected people (Ahmad, Ibrahim, Ali), the calendar of the first
    temporal reference, and that Aleppo is among the locations.

    Name and location matching is case-insensitive and ignores diacritics, so
    scholarly romanizations such as "Aḥmad" or "Ḥalab" are accepted. Null or
    wrongly typed values coming back from the model are reported as
    validation errors (or ignored) instead of raising.

    Args:
        result: Decoded JSON object returned by the extraction model.

    Returns:
        ``(is_valid, errors)`` where ``errors`` lists every problem found and
        ``is_valid`` is True exactly when that list is empty.
    """
    import unicodedata

    def _fold(text) -> str:
        """Lower-case *text* and strip combining marks; non-strings become ''."""
        if not isinstance(text, str):
            return ""
        decomposed = unicodedata.normalize("NFKD", text)
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()

    errors: list[str] = []

    # Check top-level structure
    if "pico_observation" not in result:
        errors.append("Missing 'pico_observation' field")
    if "persons" not in result:
        errors.append("Missing 'persons' field")
    else:
        # A null or non-list value is treated as "no persons extracted".
        persons = result["persons"] if isinstance(result["persons"], list) else []

        # Check minimum person count (should be at least 4: founder, father, 2 witnesses)
        if len(persons) < 4:
            errors.append(f"Expected at least 4 persons, got {len(persons)}")

        # Check person structure and collect the folded romanized names
        names_folded = []
        for i, person in enumerate(persons):
            if not isinstance(person, dict):
                errors.append(f"Person {i}: expected an object, got {type(person).__name__}")
                continue
            if "person_index" not in person:
                errors.append(f"Person {i}: missing 'person_index'")
            pnv_name = person.get("pnv_name")
            if not isinstance(pnv_name, dict):
                errors.append(f"Person {i}: missing 'pnv_name'")
                continue
            if "literalName" not in pnv_name:
                errors.append(f"Person {i}: missing 'literalName' in pnv_name")
            names_folded.append(_fold(pnv_name.get("literalName_romanized")))

        # Check for specific expected persons.
        # NOTE: plain substring matching, so "ali" also matches e.g. "khalid".
        if not any("ahmad" in n for n in names_folded):
            errors.append("Missing founder: Ahmad ibn Muhammad al-'Umari")
        if not any("ibrahim" in n for n in names_folded):
            errors.append("Missing witness: Ibrahim ibn Yusuf al-Turkmani")
        if not any("ali" in n for n in names_folded):
            errors.append("Missing witness: Ali ibn Husayn al-Halabi")

    # Check temporal reference (only the first entry is inspected)
    temporal = result.get("temporal_references")
    if isinstance(temporal, list) and temporal and isinstance(temporal[0], dict):
        temp = temporal[0]
        if "calendar" in temp and temp["calendar"] != "Hijri":
            errors.append(f"Expected Hijri calendar, got {temp.get('calendar')}")

    # Check locations
    if "locations_mentioned" in result:
        locations = result["locations_mentioned"]
        if not isinstance(locations, list):
            locations = []
        loc_names = [
            _fold(loc.get("name_romanized")) for loc in locations if isinstance(loc, dict)
        ]
        if not any("aleppo" in n or "halab" in n for n in loc_names):
            errors.append("Missing location: Aleppo (حلب)")

    return not errors, errors
def _dict_or_empty(value) -> dict:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _list_or_empty(value) -> list:
    """Return *value* when it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _relative_names(entries) -> list[str]:
    """Collect ``target_name`` values from a family-relationship entry list."""
    if isinstance(entries, (str, dict)):
        entries = [entries]
    names = []
    for entry in _list_or_empty(entries):
        name = entry.get("target_name", "") if isinstance(entry, dict) else entry
        names.append("" if name is None else str(name))
    return names


def _print_person(person) -> None:
    """Print name, roles, biography, family links and context of one person."""
    if not isinstance(person, dict):
        print(f"\n [?] {person}")
        return
    idx = person.get("person_index", "?")
    name = person.get("pnv_name", {})
    if isinstance(name, str):
        # Some responses flatten the PNV structure to a plain string.
        name = {"literalName": name}
    name = _dict_or_empty(name)
    lit_name = name.get("literalName", "")
    rom_name = name.get("literalName_romanized", "")
    print(f"\n [{idx}] {lit_name}")
    if rom_name:
        print(f" Romanized: {rom_name}")
    # Honorific
    if name.get("honorificPrefix"):
        hon = name.get("honorificPrefix", "")
        hon_rom = name.get("honorificPrefix_romanized", "")
        print(f" Honorific: {hon} ({hon_rom})")
    # Patronym
    if name.get("patronym"):
        pat = name.get("patronym", "")
        pat_rom = name.get("patronym_romanized", "")
        print(f" Patronym: {pat} ({pat_rom})")
    # Roles - may arrive as a list of dicts, a list of strings, or one string
    roles = person.get("roles", [])
    if isinstance(roles, str):
        roles = [roles]
    for role in _list_or_empty(roles):
        if not isinstance(role, dict):
            print(f" Role: {role}")
            continue
        role_title = role.get("role_title", "")
        role_rom = role.get("role_title_romanized", "")
        role_in_src = role.get("role_in_source", "")
        if role_title or role_in_src:
            print(f" Role: {role_title} ({role_rom}) - {role_in_src}")
    # Biographical
    bio = _dict_or_empty(person.get("biographical", {}))
    if bio.get("deceased"):
        print(" Status: Deceased (المرحوم)")
    if bio.get("address"):
        print(f" Address: {bio.get('address')}")
    # Family relationships
    fam = _dict_or_empty(person.get("family_relationships", {}))
    if fam.get("parent"):
        print(f" Parents: {', '.join(_relative_names(fam['parent']))}")
    if fam.get("children"):
        print(f" Children: {', '.join(_relative_names(fam['children']))}")
    # Context
    if person.get("context"):
        print(f" Context: {person.get('context')}")


def _print_extraction(result: dict) -> None:
    """Print observation metadata, persons, temporal references and locations."""
    # PiCo observation metadata
    if "pico_observation" in result:
        obs = _dict_or_empty(result["pico_observation"])
        print(f"\n📋 Observation ID: {obs.get('observation_id', 'N/A')}")
        print(f" Source Type: {obs.get('source_type', 'N/A')}")
        print(f" Source Ref: {obs.get('source_reference', 'N/A')}")
    # Persons extracted
    persons = _list_or_empty(result.get("persons", []))
    print(f"\n👥 Persons Extracted: {len(persons)}")
    for person in persons:
        _print_person(person)
    # Temporal references
    temps = _list_or_empty(result.get("temporal_references", []))
    if temps:
        print(f"\n📅 Temporal References: {len(temps)}")
        for temp in temps:
            if not isinstance(temp, dict):
                print(f" {temp}")
                continue
            expr = temp.get("expression", "")
            expr_rom = temp.get("expression_romanized", "")
            norm = temp.get("normalized", "")
            cal = temp.get("calendar", "")
            print(f" {expr}")
            if expr_rom:
                print(f"{expr_rom}")
            print(f" → Normalized: {norm} ({cal})")
    # Locations
    locs = _list_or_empty(result.get("locations_mentioned", []))
    if locs:
        print(f"\n📍 Locations: {len(locs)}")
        for loc in locs:
            if not isinstance(loc, dict):
                print(f" {loc}")
                continue
            name = loc.get("name", "")
            name_rom = loc.get("name_romanized", "")
            loc_type = loc.get("type", "")
            print(f" {name} ({name_rom}) - {loc_type}")


async def test_arabic_waqf_extraction():
    """Test PiCo extraction from Arabic waqf document.

    Calls the GLM API with the sample waqf text, prints the extracted
    persons/dates/locations, validates them and writes the raw JSON to
    ``data/entity_annotation/test_outputs`` under the project root.

    Returns:
        ``None`` when skipped because ``ZAI_API_TOKEN`` is not set, ``False``
        when the API call or response parsing fails, otherwise the boolean
        outcome of ``validate_extraction``.
    """
    print("\n" + "=" * 70)
    print("TEST: PiCo Arabic Waqf Document Extraction")
    print("=" * 70)
    # Check API token
    if not os.environ.get("ZAI_API_TOKEN"):
        print("\n⚠️ SKIPPED: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=<your_token>")
        return None
    print(f"\nModel: {ZAI_MODEL}")
    print(f"API: {ZAI_API_URL}")
    # Prepare user prompt
    user_prompt = f"""Extract all persons, relationships, dates, and locations from this Arabic waqf (endowment) document:
{ARABIC_WAQF_TEXT}
This is a historical Islamic endowment document from Aleppo. Extract all information following the PiCo ontology pattern."""
    print("\n" + "-" * 40)
    print("SOURCE TEXT (Arabic Waqf Document)")
    print("-" * 40)
    print(ARABIC_WAQF_TEXT[:200] + "...")
    # Call API
    print("\n⏳ Calling GLM-4.6 API (this may take 30-60 seconds)...")
    try:
        start_time = datetime.now(timezone.utc)
        result = await call_glm_api(PICO_SYSTEM_PROMPT, user_prompt)
        end_time = datetime.now(timezone.utc)
        duration = (end_time - start_time).total_seconds()
        print(f"✅ API call completed in {duration:.1f}s")
    except httpx.HTTPStatusError as e:
        print(f"\n❌ API Error: {e.response.status_code}")
        print(f"Response: {e.response.text[:500]}")
        return False
    except json.JSONDecodeError as e:
        print(f"\n❌ JSON Parse Error: {e}")
        return False
    except Exception as e:
        print(f"\n❌ Error: {type(e).__name__}: {e}")
        return False
    if not isinstance(result, dict):
        # Everything below indexes the result by key; a top-level array or
        # scalar cannot be displayed or validated.
        print(f"\n❌ Error: expected a JSON object, got {type(result).__name__}")
        return False
    # Display results
    print("\n" + "-" * 40)
    print("EXTRACTION RESULTS")
    print("-" * 40)
    _print_extraction(result)
    # Validate results
    print("\n" + "-" * 40)
    print("VALIDATION")
    print("-" * 40)
    is_valid, errors = validate_extraction(result)
    if is_valid:
        print("\n✅ All validations passed!")
    else:
        print(f"\n⚠️ Validation issues ({len(errors)}):")
        for error in errors:
            print(f" - {error}")
    # Save result to file for inspection
    output_path = project_root / "data/entity_annotation/test_outputs"
    output_path.mkdir(parents=True, exist_ok=True)
    output_file = output_path / f"arabic_waqf_extraction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"\n💾 Full result saved to: {output_file.relative_to(project_root)}")
    # Final verdict
    print("\n" + "=" * 70)
    if is_valid:
        print("✅ TEST PASSED: Arabic waqf extraction successful")
    else:
        print("⚠️ TEST COMPLETED WITH WARNINGS: Check validation issues above")
    print("=" * 70)
    return is_valid
async def main():
    """Print the banner, run the waqf test and map its outcome to an exit code.

    Returns 0 when the test passed or was skipped (it returned ``None``),
    and 1 when it returned a falsy non-``None`` value.
    """
    banner = "#" * 70
    print("\n" + banner)
    print("# PiCo ARABIC WAQF EXTRACTION TEST")
    print("# Testing GLM-4.6 reasoning mode with historical Arabic document")
    print(banner)
    outcome = await test_arabic_waqf_extraction()
    # None means skipped (no API key); that is not treated as a failure.
    if outcome is None or outcome:
        return 0
    return 1
if __name__ == "__main__":
    # Run the async entry point and hand its return value to the shell.
    sys.exit(asyncio.run(main()))

786
scripts/test_pico_batch.py Normal file
View file

@ -0,0 +1,786 @@
#!/usr/bin/env python3
"""
Batch test runner for PiCo (Person in Context) extraction across multiple document types.
This script tests GLM-4.6 reasoning mode extraction from various historical document types:
1. Arabic Waqf (Islamic endowment)
2. Hebrew Ketubah (Jewish marriage contract)
3. Spanish Colonial Baptism
4. Dutch Marriage Certificate
5. Russian Metrical Book Entry
6. Italian Notarial Act
7. Greek Orthodox Baptismal Register
8. Ottoman Turkish Court Record (Sijill)
Usage:
python scripts/test_pico_batch.py [--test-name NAME] [--all] [--list]
Examples:
python scripts/test_pico_batch.py --all # Run all tests
python scripts/test_pico_batch.py --test-name arabic # Run only Arabic waqf test
python scripts/test_pico_batch.py --list # List available tests
Environment Variables:
ZAI_API_TOKEN - Required for Z.AI GLM-4.6 API
"""
import argparse
import asyncio
import json
import os
import sys
import unicodedata
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import httpx
# Load environment variables from .env file
# project_root is two directory levels above this file; it is put first on
# sys.path and is also the base directory for the saved test outputs.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
try:
    from dotenv import load_dotenv
    load_dotenv(project_root / ".env")
except ImportError:
    # dotenv is optional: when it cannot be imported, ZAI_API_TOKEN must
    # already be present in the process environment.
    pass
# =============================================================================
# API Configuration
# =============================================================================
# Endpoint that call_glm_api POSTs the chat request to.
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
# Sent as the "model" field of the request payload.
ZAI_MODEL = "glm-4.6"
# Sent as the "max_tokens" field of the request payload.
MAX_TOKENS = 16000 # High limit for GLM-4.6 reasoning mode
# Passed as the httpx.AsyncClient timeout.
TIMEOUT = 300 # 5 minutes for complex reasoning
# =============================================================================
# Test Document Definitions
# =============================================================================
@dataclass
class TestDocument:
    """One historical source text plus the expectations used to check its extraction.

    Attributes:
        name: Identifier shown in the report and used as the output file prefix.
        language: Language label shown in the report header.
        script: Writing-system label shown in the report header.
        date_period: Human-readable dating shown in the report header.
        source_type: Document genre; interpolated into the user prompt.
        source_text: The document text sent to the model.
        system_prompt: System message sent to the model for this document.
        expected_persons: Minimum number of persons; fewer yields a warning.
        expected_locations: Minimum number of locations; fewer yields a warning.
        validation_names: Lowercase names looked up (with spelling variants)
            as substrings of the extracted person names.
    """
    name: str
    language: str
    script: str
    date_period: str
    source_type: str
    source_text: str
    system_prompt: str
    expected_persons: int
    expected_locations: int
    validation_names: list[str]  # Names that should appear in extraction
# Arabic Waqf Document
# Fixture for an Arabic endowment deed. expected_persons / expected_locations
# are minimums (validate_extraction warns when fewer are extracted) and
# validation_names are lowercase romanized names expected among the persons.
ARABIC_WAQF = TestDocument(
    name="arabic_waqf",
    language="Arabic",
    script="Arabic",
    date_period="1225 AH (1810 CE)",
    source_type="waqf_document",
    source_text="""بسم الله الرحمن الرحيم
هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة
حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري. وقف جميع داره الكائنة
بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح
الله فعلى فقراء المسلمين. وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف
التركماني، والسيد علي بن حسين الحلبي. وكتب في شهر رجب سنة ألف ومائتين
وخمس وعشرين هجرية.""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.
Extract ALL persons from this Arabic waqf (endowment) document:
1. Names using PNV structure with both Arabic script AND romanized versions
2. Patronymics (ابن/بن = son of)
3. Honorifics (الحاج = pilgrim, السيد = sayyid, المرحوم = the late)
4. Family relationships between persons
5. Roles in the document (founder, witness)
6. Biographical info (deceased status, occupation, address)
Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "...", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "..."}],
"locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=4,
    expected_locations=2,
    validation_names=["ahmad", "ibrahim", "ali"]
)
# Hebrew Ketubah
# Fixture for a Jewish marriage contract. The expected_* counts are minimums
# used by validate_extraction; validation_names are matched case-insensitively
# (with spelling variants) against the extracted person names.
HEBREW_KETUBAH = TestDocument(
    name="hebrew_ketubah",
    language="Hebrew/Aramaic",
    script="Hebrew",
    date_period="5645 AM (1885 CE)",
    source_type="ketubah",
    source_text="""בס״ד
ביום שלישי בשבת, שנים עשר יום לחודש אייר שנת חמשת אלפים שש מאות
וארבעים וחמש לבריאת עולם למנין שאנו מונין בו פה ווילנא
איך החתן הבחור יצחק בן הר״ר אברהם הכהן ז״ל אמר לה להדא בתולתא
מרים בת הר״ר משה הלוי: הוי לי לאנתו כדת משה וישראל ואנא אפלח
ואוקיר ואיזון ואפרנס יתיכי כהלכות גוברין יהודאין
ונתרצית מרת מרים בתולתא דא והות ליה לאנתו
עדים:
שמעון בן יעקב הכהן
דוד בן אליהו""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.
Extract ALL persons from this Hebrew ketubah (Jewish marriage contract):
1. Names using PNV structure with both Hebrew script AND romanized versions
2. Patronymics (בן/בת = son/daughter of)
3. Tribal affiliations (הכהן = the priest/Kohen, הלוי = the Levite)
4. Honorifics (הר״ר = Rabbi, מרת = Mrs., ז״ל = of blessed memory)
5. Family relationships between persons
6. Roles in document (groom/חתן, bride/כלה, witness/עד)
7. Deceased markers (ז״ל)
Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "ketubah", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Hebrew"}],
"locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=6, # groom, bride, 2 fathers, 2 witnesses (fathers implicit)
    expected_locations=1,
    validation_names=["yitzchak", "miriam", "shimon", "david"]
)
# Spanish Colonial Baptism
# Fixture for a baptismal register entry from Mexico City. The expected_*
# counts are minimums used by validate_extraction; validation_names are
# unaccented lowercase names expected among the extracted persons.
SPANISH_BAPTISM = TestDocument(
    name="spanish_colonial_baptism",
    language="Spanish",
    script="Latin",
    date_period="1742 CE",
    source_type="baptismal_register",
    source_text="""En la ciudad de México, a veinte y tres días del mes de febrero de mil
setecientos cuarenta y dos años, yo el Br. Don Antonio de Mendoza,
teniente de cura de esta santa iglesia catedral, bauticé solemnemente,
puse óleo y crisma a Juan José, español, hijo legítimo de Don Pedro
García de la Cruz, español, natural de la villa de Puebla de los Ángeles,
y de Doña María Josefa de los Reyes, española, natural de esta ciudad.
Fueron sus padrinos Don Francisco Xavier de Castañeda, español, vecino
de esta ciudad, y Doña Ana María de la Encarnación, su legítima esposa,
a quienes advertí el parentesco espiritual y obligaciones que contrajeron.
Y lo firmé.
Br. Don Antonio de Mendoza""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.
Extract ALL persons from this Spanish colonial baptismal record:
1. Names using PNV structure (given name, surname with particles like "de")
2. Casta (racial/social) designations (español, mestizo, mulato, indio, etc.)
3. Legitimacy markers (hijo legítimo, hijo natural)
4. Place of origin (natural de, vecino de)
5. Family relationships (parents, godparents/padrinos)
6. Compadrazgo relationships (spiritual kinship between parents and godparents)
7. Ecclesiastical roles (priest, teniente de cura)
8. Honorifics (Don, Doña, Br./Bachiller)
Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "baptismal_register", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "normalized": "...", "calendar": "Gregorian"}],
"locations_mentioned": [{"name": "...", "type": "..."}]
}""",
    expected_persons=6, # infant, father, mother, godfather, godmother, priest
    expected_locations=3,
    validation_names=["juan", "pedro", "maria", "francisco", "antonio"]
)
# Dutch Marriage Certificate
# Fixture for a civil marriage record from Haarlem. The expected_* counts are
# minimums used by validate_extraction; validation_names are lowercase given
# names expected among the extracted persons.
DUTCH_MARRIAGE = TestDocument(
    name="dutch_marriage",
    language="Dutch",
    script="Latin",
    date_period="1885 CE",
    source_type="marriage_certificate",
    source_text="""Heden den vierden Maart achttien honderd vijf en tachtig, compareerden
voor mij, Ambtenaar van den Burgerlijken Stand der Gemeente Haarlem:
Johannes Petrus van der Berg, oud dertig jaren, koopman, geboren te
Amsterdam, wonende alhier, meerderjarige zoon van wijlen Pieter van der
Berg, in leven koopman, en van Maria Johanna Bakker, zonder beroep,
wonende te Amsterdam;
en
Cornelia Wilhelmina de Groot, oud vijf en twintig jaren, zonder beroep,
geboren te Haarlem, wonende alhier, meerderjarige dochter van Hendrik
de Groot, timmerman, en van wijlen Elisabeth van Dijk.
De getuigen waren:
Willem Frederik Smit, oud veertig jaren, notaris
Jacobus Hendrikus Jansen, oud vijf en dertig jaren, klerk""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.
Extract ALL persons from this Dutch marriage certificate (huwelijksakte):
1. Names using PNV structure with Dutch naming conventions
2. Patronymics and tussenvoegsels (van der, de, etc.)
3. Ages, occupations, birthplaces, residences
4. Family relationships (parents identified with "zoon van" / "dochter van")
5. Deceased markers ("wijlen" = the late)
6. Roles in document (groom, bride, witnesses/getuigen)
7. Civil status terminology
Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "marriage_certificate", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "normalized": "...", "calendar": "Gregorian"}],
"locations_mentioned": [{"name": "...", "type": "..."}]
}""",
    expected_persons=8, # groom, bride, 4 parents (2 deceased), 2 witnesses
    expected_locations=2,
    validation_names=["johannes", "cornelia", "willem", "jacobus"]
)
# Russian Metrical Book Entry
# Fixture for a birth/baptism entry in a parish metrical book. The expected_*
# counts are minimums used by validate_extraction; validation_names are
# lowercase romanized given names expected among the extracted persons.
RUSSIAN_METRICAL = TestDocument(
    name="russian_metrical",
    language="Russian",
    script="Cyrillic",
    date_period="1892 CE",
    source_type="metrical_book",
    source_text="""Метрическая книга Троицкой церкви села Покровского за 1892 год
О родившихся
Марта 15 дня родился, 17 дня крещён Иван.
Родители: крестьянин деревни Ивановки Пётр Иванович Сидоров и законная
жена его Анна Фёдоровна, оба православного вероисповедания.
Восприемники: крестьянин той же деревни Николай Петрович Кузнецов
и крестьянская дочь девица Мария Ивановна Сидорова.""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.
Extract ALL persons from this Russian metrical book (метрическая книга) entry:
1. Names using Russian naming conventions: given name + patronymic (отчество) + surname
2. Patronymic patterns (-ович/-евич for males, -овна/-евна for females)
3. Estate/class designations (крестьянин = peasant, мещанин = townsman, дворянин = noble)
4. Family relationships
5. Roles (родители = parents, восприемники = godparents)
6. Religious denomination (православный = Orthodox)
7. Include both Cyrillic AND romanized versions
Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "metrical_book", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Gregorian/Julian"}],
"locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=5, # infant, father, mother, godfather, godmother
    expected_locations=2,
    validation_names=["ivan", "petr", "anna", "nikolai", "maria"]
)
# Italian Notarial Act
# Fixture for a Venetian notarial act. The expected_* counts are minimums used
# by validate_extraction (the text names more people than expected_persons
# once the deceased fathers are counted); validation_names are lowercase
# given names expected among the extracted persons.
ITALIAN_NOTARIAL = TestDocument(
    name="italian_notarial",
    language="Italian",
    script="Latin",
    date_period="1654 CE",
    source_type="notarial_act",
    source_text="""Adì 15 Marzo 1654, in Venetia.
Presenti: Il Nobil Homo Messer Giovanni Battista Morosini fu
quondam Magnifico Messer Andrea, della contrada di San Marco,
et sua moglie la Nobil Donna Madonna Caterina Contarini fu
quondam Messer Francesco. Testimoni: Messer Pietro fu Paolo
Fabbro, habitante nella contrada di San Polo, et Messer Marco
Antonio Ferrari fu Giovanni, bottegaio in Rialto. Rogato io
Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico
di Venetia.""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.
Extract ALL persons from this Italian notarial act:
1. Names using PNV structure (given name, surname)
2. Venetian nobility titles (Nobil Homo, Magnifico Messer, Nobil Donna Madonna)
3. Deceased father markers ("fu", "quondam" = the late)
4. Family relationships (spouses, children of)
5. Occupations (bottegaio = shopkeeper, notaro = notary)
6. Roles in document (party, witness/testimone, notary)
7. Residence/contrada information
Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "notarial_act", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "normalized": "...", "calendar": "Gregorian"}],
"locations_mentioned": [{"name": "...", "type": "..."}]
}""",
    expected_persons=6, # Giovanni, Caterina, 2 witnesses, notary, plus fathers
    expected_locations=4,
    validation_names=["giovanni", "caterina", "pietro", "antonio"]
)
# Greek Orthodox Baptismal Register
# Fixture for a baptismal entry from Thessaloniki in polytonic Greek. The
# expected_* counts are minimums used by validate_extraction; validation_names
# are lowercase romanized given names expected among the extracted persons.
GREEK_BAPTISMAL = TestDocument(
    name="greek_baptismal",
    language="Greek",
    script="Greek",
    date_period="1875 CE",
    source_type="baptismal_register",
    source_text="""Ἐν Θεσσαλονίκῃ, τῇ δεκάτῃ πέμπτῃ Μαρτίου τοῦ ἔτους 1875.
Ἐβαπτίσθη Δημήτριος, υἱὸς τοῦ Νικολάου Παπαδοπούλου,
ἐμπόρου, καὶ τῆς νομίμου αὐτοῦ συζύγου Ἑλένης τῆς τοῦ
μακαρίτου Γεωργίου Οἰκονόμου. Νονὸς Κωνσταντῖνος
Καρατζᾶς τοῦ Ἰωάννου, ἰατρός. Ἱερεύς: Πρωτοπρεσβύτερος
Ἀθανάσιος Χρυσοστόμου.""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.
Extract ALL persons from this Greek Orthodox baptismal register:
1. Names with BOTH Greek script AND romanized versions
2. Greek patronymics ("τοῦ" + genitive = son/daughter of)
3. Deceased markers (μακαρίτης/μακαρίτισσα = the late)
4. Family relationships (υἱός = son, σύζυγος = wife)
5. Godparent (νονός/νονά)
6. Occupations (ἔμπορος = merchant, ἰατρός = physician)
7. Ecclesiastical titles (Πρωτοπρεσβύτερος = Archpriest)
8. Roles in document (baptized, parents, godparent, priest)
Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "baptismal_register", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {"literalName": "...", "literalName_romanized": "..."}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Julian"}],
"locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=6, # infant, father, mother, maternal grandfather, godfather, priest
    expected_locations=1,
    validation_names=["dimitrios", "nikolaos", "eleni", "konstantinos"]
)
# Ottoman Turkish Court Record (Sijill)
# Fixture for a court-register entry in Ottoman Turkish (Arabic script). The
# expected_* counts are minimums used by validate_extraction; validation_names
# are lowercase romanized given names expected among the extracted persons.
OTTOMAN_SIJILL = TestDocument(
    name="ottoman_sijill",
    language="Ottoman Turkish",
    script="Arabic",
    date_period="1258 AH (1842 CE)",
    source_type="sijill",
    source_text="""بسم الله الرحمن الرحيم
مجلس شرع شريفده محمد آغا بن عبد الله مرحوم قصبه دميرجیکوی
ساکنلرندن محمد بن احمد افندی و زوجهسی فاطمه خاتون بنت علیاوغلو
حاضر اولوب محمد آغا طرفندن یکری بش غروش بدل معلوم ایله صاتیلدی
شهود الحال: حسن افندی بن عمر، ابراهیم چلبی بن مصطفی
فی اوائل شهر رجب سنة ١٢٥٨""",
    system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology.
Extract ALL persons from this Ottoman Turkish sijill (court record):
1. Names with both Arabic script AND romanized versions
2. Ottoman honorifics (آغا/Ağa, افندی/Efendi, چلبی/Çelebi, خاتون/Hatun)
3. Patronymics (بن/bin = son of, بنت/bint = daughter of)
4. Deceased markers (مرحوم/merhum)
5. Family relationships (زوجه/zevce = wife)
6. Roles in document (buyer, seller, witnesses)
7. Residence information
Note: Ottoman Turkish uses Arabic script with Turkish vocabulary and grammatical structures.
Return ONLY valid JSON with this structure:
{
"pico_observation": {"observation_id": "...", "source_type": "sijill", "source_reference": "..."},
"persons": [{"person_index": 0, "pnv_name": {"literalName": "...", "literalName_romanized": "..."}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}],
"temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Hijri"}],
"locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}]
}""",
    expected_persons=6, # Mehmed Ağa, Mehmed bin Ahmed, Fatma Hatun, 2 witnesses + fathers
    expected_locations=1,
    validation_names=["mehmed", "fatma", "hasan", "ibrahim"]
)
# All available tests
# Keys are the values accepted by --test-name and shown by --list; the
# insertion order is the order in which run_all_tests executes the documents.
ALL_TESTS = {
    "arabic": ARABIC_WAQF,
    "hebrew": HEBREW_KETUBAH,
    "spanish": SPANISH_BAPTISM,
    "dutch": DUTCH_MARRIAGE,
    "russian": RUSSIAN_METRICAL,
    "italian": ITALIAN_NOTARIAL,
    "greek": GREEK_BAPTISMAL,
    "ottoman": OTTOMAN_SIJILL,
}
# =============================================================================
# API Functions
# =============================================================================
def _parse_json_payload(content: str):
    """Parse the JSON document contained in a chat-completion message.

    The fenced-block handling is tried first: a ```json block wins, otherwise
    the first fenced block, otherwise the whole message. If that text does not
    parse (for example the fence carries another language tag, or the object
    is wrapped in prose), the span from the first ``{`` to the last ``}`` is
    tried before giving up.

    Raises:
        json.JSONDecodeError: If no parseable JSON document is found.
    """
    if "```json" in content:
        candidate = content.split("```json")[1].split("```")[0]
    elif "```" in content:
        candidate = content.split("```")[1]
    else:
        candidate = content
    candidate = candidate.strip()
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        start = candidate.find("{")
        end = candidate.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(candidate[start:end + 1])


async def call_glm_api(system_prompt: str, user_content: str) -> tuple[dict, float]:
    """Call Z.AI GLM-4.6 API and return parsed JSON response with timing.

    Args:
        system_prompt: Content of the system message.
        user_content: Content of the user message.

    Returns:
        ``(parsed_json, duration_seconds)``; the duration covers the HTTP
        round trip up to the point the message content has been read.

    Raises:
        ValueError: If ``ZAI_API_TOKEN`` is not set, or the response carries
            no text content.
        httpx.HTTPStatusError: If the API answers with an error status.
        json.JSONDecodeError: If the message content is not parseable JSON.
    """
    api_token = os.environ.get("ZAI_API_TOKEN")
    if not api_token:
        raise ValueError("ZAI_API_TOKEN not set in environment")
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": ZAI_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        "temperature": 0.1,
        "max_tokens": MAX_TOKENS,
    }
    start_time = datetime.now(timezone.utc)
    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        response = await client.post(ZAI_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        result = response.json()
    content = result["choices"][0]["message"]["content"]
    duration = (datetime.now(timezone.utc) - start_time).total_seconds()
    if not isinstance(content, str):
        # Fail with a clear message instead of a TypeError from the
        # substring checks in the parser.
        raise ValueError("GLM response did not contain text content")
    # Parse JSON from response
    return _parse_json_payload(content), duration
def extract_all_strings_recursive(obj, strings: list[str]) -> None:
    """Recursively collect all string values from nested dicts/lists.

    Every string is appended to *strings* in lowercase. When the lowercase
    form contains diacritics, a second entry with the combining marks removed
    is appended as well, so that ASCII expectations such as "maria" or
    "ibrahim" can match "María" or "İbrahim". Values that are neither str,
    dict nor list are ignored.

    Args:
        obj: A str, dict, list or any other value.
        strings: Output list, extended in place.
    """
    if isinstance(obj, str):
        lowered = obj.lower()
        strings.append(lowered)
        decomposed = unicodedata.normalize("NFKD", lowered)
        folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
        if folded != lowered:
            strings.append(folded)
    elif isinstance(obj, dict):
        for value in obj.values():
            extract_all_strings_recursive(value, strings)
    elif isinstance(obj, list):
        for item in obj:
            extract_all_strings_recursive(item, strings)
# Canonical romanized name -> alternative spellings used across romanization
# conventions. Built once at import time instead of on every lookup.
_NAME_VARIANTS: dict[str, list[str]] = {
    # Arabic/Turkish name variants
    'mehmed': ['muhammad', 'mohammed', 'mehmet'],
    'fatma': ['fatima', 'fatmah'],
    'ahmed': ['ahmad'],
    'ibrahim': ['abraham'],
    'hasan': ['hassan'],
    'hussein': ['husayn', 'huseyin'],
    # Greek variants
    'dimitrios': ['demetrios', 'dimitris', 'dēmētrios'],
    'nikolaos': ['nicholas', 'nikolas'],
    'konstantinos': ['constantine', 'constantinos'],
    'georgios': ['george', 'geōrgios'],
    'eleni': ['helen', 'elena', 'elenē'],
    'athanasios': ['athanasius'],
    # Hebrew variants
    'yitzchak': ['isaac', 'itzhak', 'yitzhak'],
    'miriam': ['mirjam', 'myriam'],
    'shimon': ['simon', 'shimeon'],
    'avraham': ['abraham'],
    'moshe': ['moses'],
    'david': ['dovid'],
    'yaakov': ['jacob', 'jakob'],
    # Russian variants
    'petr': ['peter', 'pyotr', 'piotr'],
    'ivan': ['john', 'ioann'],
    'nikolai': ['nicholas', 'nikolay'],
    'maria': ['mary', 'mariya'],
}


def normalize_name_variant(name: str) -> list[str]:
    """Generate common spelling variants for a name.

    Handles cross-script romanization differences like:
    - mehmed/muhammad/mohammed
    - fatma/fatima
    - dimitrios/demetrios
    - yitzchak/isaac

    Args:
        name: A romanized name in any letter case.

    Returns:
        The lowercase name first, followed by every known alternative
        spelling, in table order and without duplicates. A name that is not
        in the table yields a one-element list.
    """
    lowered = name.lower()
    variants = [lowered]
    for canonical, alternates in _NAME_VARIANTS.items():
        if lowered == canonical:
            variants.extend(alternates)
        elif lowered in alternates:
            variants.append(canonical)
            variants.extend(alt for alt in alternates if alt != lowered)
    # A spelling can be reached through several entries; keep first occurrences.
    return list(dict.fromkeys(variants))
def validate_extraction(result: dict, test: TestDocument) -> tuple[bool, list[str]]:
    """Check an extraction result against the expectations of *test*.

    Only a missing 'persons' field makes the result invalid. Too few persons,
    too few locations and expected names that cannot be found are reported as
    issues but leave the result valid.

    Returns:
        ``(is_valid, issues)``.
    """
    # Structural check: without persons nothing else can be evaluated.
    if "persons" not in result:
        return False, ["Missing 'persons' field"]

    issues = []
    people = result.get("persons", [])

    # Person count is a lower bound.
    if len(people) < test.expected_persons:
        issues.append(f"Expected at least {test.expected_persons} persons, got {len(people)}")

    # Gather every string under pnv_name (it may be nested) plus the context
    # field, which often contains the original text.
    haystack = []
    for entry in people:
        extract_all_strings_recursive(entry.get("pnv_name", {}), haystack)
        context = entry.get("context")
        if context:
            haystack.append(str(context).lower())

    # Each expected name counts as found when any spelling variant is a
    # substring of any gathered string.
    for expected_name in test.validation_names:
        variants = normalize_name_variant(expected_name)
        matched = any(variant in text for variant in variants for text in haystack)
        if not matched:
            issues.append(f"Expected name '{expected_name}' (variants: {variants[:3]}) not found")

    # Location count is a lower bound as well.
    places = result.get("locations_mentioned", [])
    if len(places) < test.expected_locations:
        issues.append(f"Expected at least {test.expected_locations} locations, got {len(places)}")

    return True, issues
# =============================================================================
# Test Runner
# =============================================================================
async def run_single_test(test: TestDocument) -> dict:
    """Send one document to the API, print a report and save the raw JSON.

    Returns:
        On failure of the API call or response parsing, a dict with ``test``,
        ``status="error"`` and ``error``. Otherwise a dict with ``test``,
        ``status`` ("passed" or "warning"), ``persons_extracted``,
        ``locations_extracted``, ``duration_seconds``, ``issues`` and
        ``output_file``.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f"TEST: {test.name.upper()}")
    print(f"Language: {test.language} | Script: {test.script} | Period: {test.date_period}")
    print(rule)

    # Build the user message around the source text
    user_prompt = f"""Extract all persons, relationships, dates, and locations from this {test.source_type}:
{test.source_text}
Follow the PiCo ontology pattern for person observations."""
    print(f"\n📄 Source: {test.source_type}")
    print(f" Text length: {len(test.source_text)} chars")

    # Query the model; any failure ends this test with an error record
    print("\n⏳ Calling GLM-4.6 API...")
    try:
        extraction, elapsed = await call_glm_api(test.system_prompt, user_prompt)
        print(f"✅ API call completed in {elapsed:.1f}s")
    except httpx.HTTPStatusError as exc:
        print(f"❌ API Error: {exc.response.status_code}")
        return {"test": test.name, "status": "error", "error": str(exc)}
    except json.JSONDecodeError as exc:
        print(f"❌ JSON Parse Error: {exc}")
        return {"test": test.name, "status": "error", "error": str(exc)}
    except Exception as exc:
        print(f"❌ Error: {type(exc).__name__}: {exc}")
        return {"test": test.name, "status": "error", "error": str(exc)}

    # Counts overview
    people = extraction.get("persons", [])
    places = extraction.get("locations_mentioned", [])
    dates = extraction.get("temporal_references", [])
    print("\n📊 Extraction Summary:")
    print(f" Persons: {len(people)}")
    print(f" Locations: {len(places)}")
    print(f" Temporal refs: {len(dates)}")

    # Preview of the first five persons
    print("\n👥 Persons:")
    for entry in people[:5]:
        idx = entry.get("person_index", "?")
        pnv = entry.get("pnv_name", {})
        if isinstance(pnv, str):
            shown_name = pnv
        else:
            shown_name = pnv.get("literalName_romanized") or pnv.get("literalName", "?")
        # roles may be a list of dicts, a list of strings, or a single string
        raw_roles = entry.get("roles", [])
        shown_role = "-"
        if isinstance(raw_roles, str):
            shown_role = raw_roles
        elif isinstance(raw_roles, list) and len(raw_roles) > 0:
            head = raw_roles[0]
            shown_role = head.get("role_in_source", "-") if isinstance(head, dict) else str(head)
        print(f" [{idx}] {str(shown_name)[:50]} ({shown_role})")
    if len(people) > 5:
        print(f" ... and {len(people) - 5} more")

    # Validation report
    is_valid, issues = validate_extraction(extraction, test)
    verdict = '✅ PASSED' if is_valid else '⚠️ ISSUES'
    print(f"\n🔍 Validation: {verdict}")
    for issue in issues:
        print(f" - {issue}")

    # Persist the raw extraction under a timestamped file name
    output_dir = project_root / "data/entity_annotation/test_outputs"
    output_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = output_dir / f"{test.name}_extraction_{stamp}.json"
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(extraction, handle, ensure_ascii=False, indent=2)
    print(f"\n💾 Saved: {output_file.name}")

    return {
        "test": test.name,
        "status": "passed" if is_valid else "warning",
        "persons_extracted": len(people),
        "locations_extracted": len(places),
        "duration_seconds": elapsed,
        "issues": issues,
        "output_file": str(output_file)
    }
async def run_all_tests() -> list[dict]:
    """Run every registered document test, one after another.

    Returns the per-test summary dicts in registry order.
    """
    # Awaiting inside the comprehension keeps the runs strictly sequential.
    return [await run_single_test(document) for document in ALL_TESTS.values()]
def print_summary(results: list[dict]):
    """Print summary of all test results.

    Args:
        results: Summary dicts as returned by ``run_single_test``. Each needs
            ``test`` and ``status``; ``persons_extracted``,
            ``duration_seconds``, ``issues`` and ``error`` are optional.
    """
    status_icons = {"passed": "✅", "warning": "⚠️", "error": "❌"}
    print("\n" + "=" * 70)
    print("BATCH TEST SUMMARY")
    print("=" * 70)
    passed = sum(1 for r in results if r["status"] == "passed")
    warnings = sum(1 for r in results if r["status"] == "warning")
    errors = sum(1 for r in results if r["status"] == "error")
    print(f"\n📊 Results: {passed} passed, {warnings} warnings, {errors} errors")
    print(f" Total tests: {len(results)}")
    print(f"\n📋 Test Details:")
    for r in results:
        status_icon = status_icons.get(r["status"], "?")
        print(f" {status_icon} {r['test']}: {r.get('persons_extracted', 0)} persons, {r.get('duration_seconds', 0):.1f}s")
        # Error records carry no issues list; show why the test failed.
        if r.get("error"):
            print(f" - {r['error']}")
        # Only the first two issues are shown to keep the summary short.
        for issue in (r.get("issues") or [])[:2]:
            print(f" - {issue}")
    print("\n" + "=" * 70)
    if errors == 0:
        print("✅ ALL TESTS COMPLETED SUCCESSFULLY")
    else:
        print(f"⚠️ {errors} TESTS FAILED - Check details above")
    print("=" * 70)
# =============================================================================
# Main
# =============================================================================
async def main():
    """Parse command-line options and run the requested extraction test(s).

    Supported actions: ``--list`` (show the available tests), ``--test-name``
    (run one test), ``--all`` (run every test and print a summary). With no
    action the help text is printed.

    Returns:
        Process exit code: 0 on success (including ``--list`` and the default
        help output); 1 when a run is requested without ``ZAI_API_TOKEN`` set,
        or when any executed test finishes with status ``"error"``.
    """
    parser = argparse.ArgumentParser(description="Batch test PiCo extraction")
    parser.add_argument("--test-name", "-t", choices=list(ALL_TESTS.keys()),
                        help="Run specific test by name")
    parser.add_argument("--all", "-a", action="store_true",
                        help="Run all tests")
    parser.add_argument("--list", "-l", action="store_true",
                        help="List available tests")
    args = parser.parse_args()

    # Only an actual extraction run talks to the API. Listing tests or
    # printing help must keep working on a machine without a token.
    will_run = not args.list and bool(args.test_name or args.all)
    if will_run and not os.environ.get("ZAI_API_TOKEN"):
        print("❌ Error: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=<your_token>")
        print("Or add to .env file in project root")
        return 1

    print("\n" + "#" * 70)
    print("# PiCo BATCH EXTRACTION TEST")
    print(f"# Model: {ZAI_MODEL} (reasoning mode)")
    print(f"# Max tokens: {MAX_TOKENS}")
    print("#" * 70)

    if args.list:
        print("\n📋 Available tests:")
        for name, test in ALL_TESTS.items():
            print(f" {name}: {test.language} {test.source_type} ({test.date_period})")
        return 0

    if args.test_name:
        test = ALL_TESTS[args.test_name]
        result = await run_single_test(test)
        return 0 if result["status"] != "error" else 1

    if args.all:
        results = await run_all_tests()
        print_summary(results)
        errors = sum(1 for r in results if r["status"] == "error")
        return 0 if errors == 0 else 1

    # Default: show help
    parser.print_help()
    return 0
if __name__ == "__main__":
    # Drive the async entry point to completion and hand its return value
    # to the shell as the process exit status.
    sys.exit(asyncio.run(main()))

View file

@ -665,7 +665,7 @@ def create_hybrid_retriever(
return HybridRetriever(
qdrant_host="bronhouder.nl",
qdrant_port=443,
sparql_endpoint="https://bronhouder.nl/query",
sparql_endpoint="https://bronhouder.nl/sparql",
use_production_qdrant=True,
**kwargs
)

View file

@ -140,21 +140,26 @@ class TypeDBRetriever:
self.database = database
self.k = k
# Lazy-load TypeDB client
# Lazy-load TypeDB client (TypeDB 3.x - no sessions)
self._client = None
self._session = None
logger.info(f"Initialized TypeDBRetriever: {host}:{port}/{database}")
@property
def client(self):
"""Lazy-load TypeDB client."""
"""Lazy-load TypeDB client (TypeDB 3.x API)."""
if self._client is None:
try:
from typedb.driver import TypeDB, SessionType
from typedb.driver import TypeDB, Credentials, DriverOptions
self._client = TypeDB.core_driver(f"{self.host}:{self.port}")
logger.info(f"Connected to TypeDB at {self.host}:{self.port}")
# TypeDB 3.x requires credentials and options
# Default credentials for local development (no auth)
credentials = Credentials("admin", "password")
options = DriverOptions(is_tls_enabled=False) # Disable TLS for local dev
address = f"{self.host}:{self.port}"
self._client = TypeDB.driver(address, credentials, options)
logger.info(f"Connected to TypeDB 3.x at {self.host}:{self.port}")
except ImportError:
raise ImportError(
"typedb-driver package required. Install with: pip install typedb-driver"
@ -164,16 +169,10 @@ class TypeDBRetriever:
raise
return self._client
def _get_session(self):
"""Get a data session."""
from typedb.driver import SessionType
if self._session is None or not self._session.is_open():
self._session = self.client.session(self.database, SessionType.DATA)
return self._session
def _execute_read(self, typeql: str) -> list[dict[str, Any]]:
"""Execute a TypeQL read query.
"""Execute a TypeQL read query (TypeDB 3.x API).
TypeDB 3.x removed sessions - transactions are created directly on driver.
Args:
typeql: TypeQL query string
@ -184,23 +183,33 @@ class TypeDBRetriever:
from typedb.driver import TransactionType
results = []
session = self._get_session()
try:
with session.transaction(TransactionType.READ) as tx:
answer = tx.query.get(typeql)
# TypeDB 3.x: transactions directly on driver, specifying database
with self.client.transaction(self.database, TransactionType.READ) as tx:
# TypeDB 3.x: query() returns a Promise, need to resolve it
answer = tx.query(typeql).resolve()
for concept_map in answer:
row = {}
for var in concept_map.variables():
concept = concept_map.get(var)
if hasattr(concept, 'get_value'):
row[var] = concept.get_value()
elif hasattr(concept, 'get_iid'):
row[var] = concept.get_iid()
# TypeDB 3.x: QueryAnswer may be iterable depending on query type
if hasattr(answer, '__iter__'):
for row in answer:
result_row = {}
# Access columns by index or iterate
if hasattr(row, 'concepts'):
for i, concept in enumerate(row.concepts()):
var_name = f"var_{i}"
if hasattr(concept, 'get_value'):
result_row[var_name] = concept.get_value()
elif hasattr(concept, 'as_entity'):
result_row[var_name] = str(concept)
else:
result_row[var_name] = str(concept)
else:
row[var] = str(concept)
results.append(row)
result_row["result"] = str(row)
results.append(result_row)
else:
# Single result (e.g., count query)
results.append({"result": str(answer)})
except Exception as e:
logger.error(f"TypeQL query failed: {e}")
@ -603,28 +612,52 @@ class TypeDBRetriever:
"relations": {},
}
# Count heritage custodians
# Count custodian entities by type (TypeDB 3.x API)
# Schema types: custodian-observation, custodian-name, custodian-reconstruction
entity_types = [
("custodian-observation", "observations"),
("custodian-name", "names"),
("custodian-reconstruction", "reconstructions"),
]
try:
count_query = """
match
$inst isa heritage-custodian;
get $inst;
count;
"""
session = self._get_session()
from typedb.driver import TransactionType
with session.transaction(TransactionType.READ) as tx:
answer = tx.query.get_aggregate(count_query)
stats["entities"]["heritage_custodian"] = answer.as_value().as_long()
with self.client.transaction(self.database, TransactionType.READ) as tx:
for type_name, stat_key in entity_types:
try:
# TypeDB 3.x count query syntax
count_query = f"""
match
$inst isa {type_name};
reduce $count = count;
"""
answer = tx.query(count_query).resolve()
# Parse count result - TypeDB 3.x returns _Value objects
count = 0
for row in answer:
# row.get("count") returns a _Value object
value_obj = row.get("count")
# Extract integer - try multiple methods
if hasattr(value_obj, 'get_integer'):
count = value_obj.get_integer()
elif hasattr(value_obj, 'try_get_integer'):
count = value_obj.try_get_integer() or 0
else:
# Fallback: string conversion
count = int(str(value_obj))
break
stats["entities"][stat_key] = count
except Exception as e:
stats["entities"][stat_key] = f"error: {e}"
except Exception as e:
stats["entities"]["error"] = str(e)
return stats
def close(self) -> None:
"""Clean up resources."""
if self._session and self._session.is_open():
self._session.close()
"""Clean up resources (TypeDB 3.x - no sessions to close)."""
if self._client:
self._client.close()
self._client = None