From 505c12601ad0dbdc40725b9a7e7f40d8c68c6f9a Mon Sep 17 00:00:00 2001 From: kempersc Date: Fri, 12 Dec 2025 17:50:17 +0100 Subject: [PATCH] Add test script for PiCo extraction from Arabic waqf documents - Implemented a new script `test_pico_arabic_waqf.py` to test the GLM annotator's ability to extract person observations from Arabic historical documents. - The script includes environment variable handling for API token, structured prompts for the GLM API, and validation of extraction results. - Added comprehensive logging for API responses, extraction results, and validation errors. - Included a sample Arabic waqf text for testing purposes, following the PiCo ontology pattern. --- backend/postgres/geo_api.py | 7 +- backend/postgres/main.py | 46 +- backend/rag/main.py | 33 +- ...ra-nederlof-74b7a341_20251210T234125Z.json | 2 +- ...a-dellebeke-87289018_20251210T230231Z.json | 2 +- .../annemarijnemoreu_20251210T234222Z.json | 2 +- .../entity/annemartens1_20251210T230220Z.json | 2 +- ...n-diepeveen-73b21640_20251210T234227Z.json | 2 +- ...roger-mous-203b2922a_20251210T230111Z.json | 2 +- .../docs/PROVENANCE_SOURCES.md | 635 +++ data/entity_annotation/modules/index.yaml | 22 + .../modules/integrations/pico.yaml | 2244 --------- .../modules/integrations/pico.yaml.bak | 4255 +++++++++++++++++ .../modules/integrations/pico/_index.yaml | 228 + .../pico/examples/01_dutch_marriage.yaml | 285 ++ .../pico/examples/02_notarial_protocol.yaml | 263 + .../pico/examples/03_church_baptism.yaml | 202 + .../pico/examples/04_linkedin_profile.yaml | 146 + .../pico/examples/05_arabic_waqf.yaml | 215 + .../pico/examples/06_hebrew_ketubah.yaml | 325 ++ .../pico/examples/07_spanish_colonial.yaml | 263 + .../pico/examples/08_italian_notarial.yaml | 315 ++ .../pico/examples/09_greek_orthodox.yaml | 259 + .../pico/examples/10_russian_metrical.yaml | 489 ++ .../pico/examples/11_ottoman_sijill.yaml | 281 ++ .../pico/examples/_examples_index.yaml | 315 ++ 
.../integrations/pico/schema/observation.yaml | 439 ++ .../pico/schema/pnv_components.yaml | 439 ++ .../pico/schema/relationships.yaml | 517 ++ .../integrations/pico/schema/temporal.yaml | 570 +++ .../modules/relationships/family.yaml | 1503 ++++++ ...rabic_waqf_extraction_20251212_132017.json | 163 + ...rabic_waqf_extraction_20251212_152524.json | 93 + ...h_marriage_extraction_20251212_145817.json | 139 + ...h_marriage_extraction_20251212_152853.json | 185 + ..._baptismal_extraction_20251212_152118.json | 139 + ..._baptismal_extraction_20251212_153159.json | 124 + ...ew_ketubah_extraction_20251212_133437.json | 252 + ...ew_ketubah_extraction_20251212_152634.json | 202 + ...n_notarial_extraction_20251212_152024.json | 192 + ...n_notarial_extraction_20251212_153104.json | 156 + ...man_sijill_extraction_20251212_152313.json | 125 + ...man_sijill_extraction_20251212_153300.json | 149 + .../raw_response_20251212_132017.txt | 166 + ...n_metrical_extraction_20251212_150120.json | 167 + ...n_metrical_extraction_20251212_153018.json | 192 + ...al_baptism_extraction_20251212_133618.json | 221 + ...al_baptism_extraction_20251212_152722.json | 259 + .../20251121/linkml/custodian_source.yaml | 635 ++- .../schemas/20251121/linkml/manifest.json | 145 +- .../components/database/PointDetailsPanel.tsx | 5 +- frontend/src/components/layout/Layout.css | 3 +- frontend/src/components/layout/Layout.tsx | 2 +- frontend/src/components/layout/Navigation.css | 12 + frontend/src/components/layout/Navigation.tsx | 28 +- .../components/map/InstitutionInfoPanel.tsx | 21 +- frontend/src/components/map/MediaGallery.tsx | 36 +- .../components/query/ConversationPanel.css | 14 + .../components/query/ConversationPanel.tsx | 47 +- .../components/rdf/RdfNodeDetailsPanel.tsx | 11 +- .../components/uml/SemanticDetailsPanel.tsx | 4 +- .../visualizations/PersonInfoPanel.css | 344 +- .../visualizations/PersonInfoPanel.tsx | 574 ++- frontend/src/hooks/useGeoApiInstitutions.ts | 147 +- 
frontend/src/hooks/useMultiDatabaseRAG.ts | 59 +- .../src/hooks/useProgressiveInstitutions.ts | 137 + .../src/lib/storage/institutions-cache.ts | 2 +- frontend/src/lib/storage/ui-state.ts | 17 +- frontend/src/pages/ConversationPage.css | 51 +- frontend/src/pages/ConversationPage.tsx | 24 + frontend/src/pages/Database.css | 2 +- frontend/src/pages/InstitutionBrowserPage.css | 26 + frontend/src/pages/InstitutionBrowserPage.tsx | 26 +- frontend/src/pages/NDEMapPageMapLibre.tsx | 5 +- frontend/src/types/socialNetwork.ts | 9 +- frontend/src/utils/dom.ts | 52 + frontend/vite.config.ts | 5 + pyproject.toml | 7 +- schemas/20251121/linkml/custodian_source.yaml | 467 +- scripts/load_typedb_schema.py | 441 +- scripts/test_pico_arabic_waqf.py | 472 ++ scripts/test_pico_batch.py | 786 +++ src/glam_extractor/api/hybrid_retriever.py | 2 +- src/glam_extractor/api/typedb_retriever.py | 117 +- 84 files changed, 19370 insertions(+), 2597 deletions(-) create mode 100644 data/entity_annotation/docs/PROVENANCE_SOURCES.md delete mode 100644 data/entity_annotation/modules/integrations/pico.yaml create mode 100644 data/entity_annotation/modules/integrations/pico.yaml.bak create mode 100644 data/entity_annotation/modules/integrations/pico/_index.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/examples/01_dutch_marriage.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/examples/02_notarial_protocol.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/examples/03_church_baptism.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/examples/04_linkedin_profile.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/examples/05_arabic_waqf.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/examples/06_hebrew_ketubah.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/examples/07_spanish_colonial.yaml create mode 100644 
data/entity_annotation/modules/integrations/pico/examples/08_italian_notarial.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/examples/09_greek_orthodox.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/examples/10_russian_metrical.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/examples/11_ottoman_sijill.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/examples/_examples_index.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/schema/observation.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/schema/pnv_components.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/schema/relationships.yaml create mode 100644 data/entity_annotation/modules/integrations/pico/schema/temporal.yaml create mode 100644 data/entity_annotation/modules/relationships/family.yaml create mode 100644 data/entity_annotation/test_outputs/arabic_waqf_extraction_20251212_132017.json create mode 100644 data/entity_annotation/test_outputs/arabic_waqf_extraction_20251212_152524.json create mode 100644 data/entity_annotation/test_outputs/dutch_marriage_extraction_20251212_145817.json create mode 100644 data/entity_annotation/test_outputs/dutch_marriage_extraction_20251212_152853.json create mode 100644 data/entity_annotation/test_outputs/greek_baptismal_extraction_20251212_152118.json create mode 100644 data/entity_annotation/test_outputs/greek_baptismal_extraction_20251212_153159.json create mode 100644 data/entity_annotation/test_outputs/hebrew_ketubah_extraction_20251212_133437.json create mode 100644 data/entity_annotation/test_outputs/hebrew_ketubah_extraction_20251212_152634.json create mode 100644 data/entity_annotation/test_outputs/italian_notarial_extraction_20251212_152024.json create mode 100644 data/entity_annotation/test_outputs/italian_notarial_extraction_20251212_153104.json create mode 100644 
data/entity_annotation/test_outputs/ottoman_sijill_extraction_20251212_152313.json create mode 100644 data/entity_annotation/test_outputs/ottoman_sijill_extraction_20251212_153300.json create mode 100644 data/entity_annotation/test_outputs/raw_response_20251212_132017.txt create mode 100644 data/entity_annotation/test_outputs/russian_metrical_extraction_20251212_150120.json create mode 100644 data/entity_annotation/test_outputs/russian_metrical_extraction_20251212_153018.json create mode 100644 data/entity_annotation/test_outputs/spanish_colonial_baptism_extraction_20251212_133618.json create mode 100644 data/entity_annotation/test_outputs/spanish_colonial_baptism_extraction_20251212_152722.json create mode 100644 frontend/src/utils/dom.ts create mode 100644 scripts/test_pico_arabic_waqf.py create mode 100644 scripts/test_pico_batch.py diff --git a/backend/postgres/geo_api.py b/backend/postgres/geo_api.py index c4e1e71e43..932f05dbab 100644 --- a/backend/postgres/geo_api.py +++ b/backend/postgres/geo_api.py @@ -535,7 +535,8 @@ async def get_institutions( social_instagram, wikidata_label_en, wikidata_description_en, - logo_url + logo_url, + web_claims FROM custodians WHERE {where_clause} ORDER BY name @@ -620,6 +621,10 @@ async def get_institutions( if row['logo_url']: props["logo_url"] = row['logo_url'] + # Web claims (financial documents, etc.) 
+ if row['web_claims']: + props["web_claims"] = row['web_claims'] + features.append({ "type": "Feature", "geometry": { diff --git a/backend/postgres/main.py b/backend/postgres/main.py index e9337288fa..0c93391a54 100644 --- a/backend/postgres/main.py +++ b/backend/postgres/main.py @@ -848,6 +848,28 @@ async def get_profile( if isinstance(profile_data, str): profile_data = json.loads(profile_data) + # Transform experience → career_history for frontend compatibility + # The database stores 'experience' but frontend expects 'career_history' + inner_profile = profile_data.get('profile_data', {}) + if inner_profile and 'experience' in inner_profile and 'career_history' not in inner_profile: + experience = inner_profile.get('experience', []) + if experience: + # Map field names: title→role, company→organization, duration→dates + career_history = [] + for job in experience: + career_item = { + 'role': job.get('title'), + 'organization': job.get('company'), + 'dates': job.get('duration'), + 'location': job.get('location'), + 'description': job.get('description'), + 'company_size': job.get('company_details'), + 'current': job.get('current', False), + } + career_history.append(career_item) + inner_profile['career_history'] = career_history + profile_data['profile_data'] = inner_profile + return ProfileResponse( profile_data=profile_data, linkedin_slug=result['linkedin_slug'], @@ -867,8 +889,30 @@ async def get_profile( try: with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) + file_profile_data = data.get('profile_data', {}) + + # Transform experience → career_history for frontend compatibility + inner_profile = file_profile_data.get('profile_data', {}) + if inner_profile and 'experience' in inner_profile and 'career_history' not in inner_profile: + experience = inner_profile.get('experience', []) + if experience: + career_history = [] + for job in experience: + career_item = { + 'role': job.get('title'), + 'organization': job.get('company'), + 'dates': 
job.get('duration'), + 'location': job.get('location'), + 'description': job.get('description'), + 'company_size': job.get('company_details'), + 'current': job.get('current', False), + } + career_history.append(career_item) + inner_profile['career_history'] = career_history + file_profile_data['profile_data'] = inner_profile + return ProfileResponse( - profile_data=data.get('profile_data', {}), + profile_data=file_profile_data, linkedin_slug=linkedin_slug, extraction_date=data.get('exa_search_metadata', {}).get('enrichment_timestamp'), updated_date=None, diff --git a/backend/rag/main.py b/backend/rag/main.py index e03cf0db29..4583e7f335 100644 --- a/backend/rag/main.py +++ b/backend/rag/main.py @@ -99,20 +99,26 @@ class Settings: cache_ttl: int = int(os.getenv("CACHE_TTL", "900")) # 15 minutes # Qdrant Vector DB + # Production: Use URL-based client via bronhouder.nl/qdrant reverse proxy qdrant_host: str = os.getenv("QDRANT_HOST", "localhost") qdrant_port: int = int(os.getenv("QDRANT_PORT", "6333")) - qdrant_use_production: bool = os.getenv("QDRANT_USE_PRODUCTION", "false").lower() == "true" + qdrant_use_production: bool = os.getenv("QDRANT_USE_PRODUCTION", "true").lower() == "true" + qdrant_production_url: str = os.getenv("QDRANT_PRODUCTION_URL", "https://bronhouder.nl/qdrant") # Oxigraph SPARQL - sparql_endpoint: str = os.getenv("SPARQL_ENDPOINT", "http://localhost:7878/query") + # Production: Use bronhouder.nl/sparql reverse proxy + sparql_endpoint: str = os.getenv("SPARQL_ENDPOINT", "https://bronhouder.nl/sparql") # TypeDB + # Note: TypeDB not exposed via reverse proxy - always use localhost typedb_host: str = os.getenv("TYPEDB_HOST", "localhost") typedb_port: int = int(os.getenv("TYPEDB_PORT", "1729")) typedb_database: str = os.getenv("TYPEDB_DATABASE", "heritage_custodians") + typedb_use_production: bool = os.getenv("TYPEDB_USE_PRODUCTION", "false").lower() == "true" # Default off - # PostGIS - postgis_url: str = os.getenv("POSTGIS_URL", 
"http://localhost:8001") + # PostGIS/Geo API + # Production: Use bronhouder.nl/api/geo reverse proxy + postgis_url: str = os.getenv("POSTGIS_URL", "https://bronhouder.nl/api/geo") # LLM Configuration anthropic_api_key: str = os.getenv("ANTHROPIC_API_KEY", "") @@ -408,7 +414,7 @@ class MultiSourceRetriever: if self._typedb is None and RETRIEVERS_AVAILABLE: try: self._typedb = create_typedb_retriever( - use_production=settings.qdrant_use_production + use_production=settings.typedb_use_production # Use TypeDB-specific setting ) except Exception as e: logger.warning(f"Failed to initialize TypeDB: {e}") @@ -686,7 +692,9 @@ async def lifespan(app: FastAPI): retriever = MultiSourceRetriever() if RETRIEVERS_AVAILABLE: - viz_selector = VisualizationSelector(use_dspy=bool(settings.anthropic_api_key)) + # Check for any available LLM API key (Anthropic preferred, OpenAI fallback) + has_llm_key = bool(settings.anthropic_api_key or settings.openai_api_key) + viz_selector = VisualizationSelector(use_dspy=has_llm_key) # Configure DSPy if API key available if settings.anthropic_api_key: @@ -697,7 +705,16 @@ async def lifespan(app: FastAPI): api_key=settings.anthropic_api_key, ) except Exception as e: - logger.warning(f"Failed to configure DSPy: {e}") + logger.warning(f"Failed to configure DSPy with Anthropic: {e}") + elif settings.openai_api_key: + try: + configure_dspy( + provider="openai", + model="gpt-4o-mini", + api_key=settings.openai_api_key, + ) + except Exception as e: + logger.warning(f"Failed to configure DSPy with OpenAI: {e}") logger.info("Heritage RAG API started") @@ -1068,7 +1085,7 @@ if __name__ == "__main__": uvicorn.run( "main:app", host="0.0.0.0", - port=8002, + port=8003, reload=settings.debug, log_level="info", ) diff --git a/data/custodian/person/entity/alexandra-nederlof-74b7a341_20251210T234125Z.json b/data/custodian/person/entity/alexandra-nederlof-74b7a341_20251210T234125Z.json index 4c574e278d..120e072660 100644 --- 
a/data/custodian/person/entity/alexandra-nederlof-74b7a341_20251210T234125Z.json +++ b/data/custodian/person/entity/alexandra-nederlof-74b7a341_20251210T234125Z.json @@ -13,7 +13,7 @@ "name": "Alexandra Nederlof", "linkedin_url": "https://www.linkedin.com/in/alexandra-nederlof-74b7a341", "headline": "Junior Papierrestaurator bij Rijksmuseum", - "location": "Ik ben verliefd! En wel op mijn vak als papierrestaurator. De mogelijkheid om bij instellingen als musea, archieven of bibliotheken fysiek te kunnen helpen met het behouden van het papieren cultureel erfgoed geeft een geweldige voldoening. Daarnaast is het restaureren voor de particuliere klanten voor mij ook een waar genoegen: ervoor kunnen zorgen dat een klant weer optimaal van zijn kunstobject kan genieten of een brief weer kan lezen. Ik heb meegewerkt aan uiteenlopende projecten. Ik heb daardoor een brede ervaring opgedaan met het behandelen van verschillende soorten objecten. Van poster tot landkaart, van pastel tot papier maché, van boek tot botanisch model. Hierdoor heb ik een heel scala aan verantwoorde behandelmethoden mij eigen kunnen maken. Op dit moment werk ik als junior papierrestaurator bij het Rijksmuseum.", + "location": null, "connections": "428 connections", "about": "Ik ben verliefd! En wel op mijn vak als papierrestaurator. De mogelijkheid om bij instellingen als musea, archieven of bibliotheken fysiek te kunnen helpen met het behouden van het papieren cultureel erfgoed geeft een geweldige voldoening. Daarnaast is het restaureren voor de particuliere klanten voor mij ook een waar genoegen: ervoor kunnen zorgen dat een klant weer optimaal van zijn kunstobject kan genieten of een brief weer kan lezen. Ik heb meegewerkt aan uiteenlopende projecten. Ik heb daardoor een brede ervaring opgedaan met het behandelen van verschillende soorten objecten. Van poster tot landkaart, van pastel tot papier maché, van boek tot botanisch model. 
Hierdoor heb ik een heel scala aan verantwoorde behandelmethoden mij eigen kunnen maken. Op dit moment werk ik als junior papierrestaurator bij het Rijksmuseum.", "summary": "Alexandra Nederlof is a Junior Papierrestaurator at the Rijksmuseum in Amsterdam, where she is passionate about preserving paper cultural heritage for museums, archives, and private clients. She has extensive experience in restoring a variety of objects, including posters, maps, and books, and has developed a range of responsible treatment methods. Nederlof has also contributed to publications on topics related to art and restoration.", diff --git a/data/custodian/person/entity/angela-dellebeke-87289018_20251210T230231Z.json b/data/custodian/person/entity/angela-dellebeke-87289018_20251210T230231Z.json index dc038cd4cd..a11ea3f468 100644 --- a/data/custodian/person/entity/angela-dellebeke-87289018_20251210T230231Z.json +++ b/data/custodian/person/entity/angela-dellebeke-87289018_20251210T230231Z.json @@ -13,7 +13,7 @@ "name": "angela dellebeke", "linkedin_url": "https://www.linkedin.com/in/angela-dellebeke-87289018", "headline": "nationaal archief /national archives of the Netherlands", - "location": "The blue shield is the protective emblem specified in the 1954 Hague Convention (Convention for the Protection of Cultural Property in the Event of Armed Conflict) for marking cultural sites to give them protection from attack in the event of armed conflict. The Blue Shield network consists of organizations dealing with museums, archives, audiovisual supports, libraries, as well as monuments and sites. BLUE SHIELD NEDERLAND richt zich op de bescherming van Nederlands cultureel erfgoed tegen de bedreigingen die het gevolg zijn van natuurrampen, molest en militaire handelingen, en op het organiseren van nationale en internationale hulp. 
Show less", + "location": "The Hague, Netherlands", "connections": "500 connections • 852 followers", "about": "veiligheidszorg collectie / collectie hulpverlening/ preventieve conservering /vraagstukken beheer en behoud/ calamiteitenplan/-organisatie/ selectievraagstukken /acquisitie & beschrijven van archieven / bedrijfshulpverlening/ crisisbeheersing safety&security collections/ emergency preparedness and hazard mitigation / emergency response / crisismanagement / cultural property protection / hague convention 1954", "summary": "Angela Dellebeke is a consultant specializing in emergency preparedness and hazard mitigation at the Nationaal Archief (National Archives of the Netherlands) in The Hague. With over 22 years of experience, she focuses on safety and security for cultural property, crisis management, and the preservation of archives. Dellebeke also serves as Secretary-General for Blue Shield Nederland, an organization dedicated to protecting cultural heritage during conflicts and disasters. She holds a Master of Arts in American Studies from Utrecht University and has published work on theft and misappropriation in archives.", diff --git a/data/custodian/person/entity/annemarijnemoreu_20251210T234222Z.json b/data/custodian/person/entity/annemarijnemoreu_20251210T234222Z.json index 9cf7b230b7..f1da1cd78b 100644 --- a/data/custodian/person/entity/annemarijnemoreu_20251210T234222Z.json +++ b/data/custodian/person/entity/annemarijnemoreu_20251210T234222Z.json @@ -13,7 +13,7 @@ "name": "Annemarijne Moreu", "linkedin_url": "https://www.linkedin.com/in/annemarijnemoreu", "headline": "Sr Projectmanager bij Nationaal Archief", - "location": "Ik ben een daadkrachtige en resultaatgerichte product owner en projectmanager, met brede ervaring op het gebied van B2B en B2C (online) projectmanagement, agile werken, marketing en communicatie. Ik ben goed in staat klantbehoeften centraal te stellen. 
Samenwerken met verschillende mensen, afdelingen en niveaus en gezamenlijk realiseren van doelen en implementeren van projecten gaat mij goed af. In mijn werk ben ik planmatig sterk, communicatief vaardig, zelfstandig en flexibel. Ik word blij van klantcontacten, aanpakken, samenwerken, afwisseling en verantwoordelijkheid nemen. Als persoon ben ik sociaal, ondernemend, positief en sportief.", + "location": "The Hague, Netherlands", "connections": "500 connections • 860 followers", "about": "Ik ben een daadkrachtige en resultaatgerichte product owner en projectmanager, met brede ervaring op het gebied van B2B en B2C (online) projectmanagement, agile werken, marketing en communicatie. Ik ben goed in staat klantbehoeften centraal te stellen. Samenwerken met verschillende mensen, afdelingen en niveaus en gezamenlijk realiseren van doelen en implementeren van projecten gaat mij goed af. In mijn werk ben ik planmatig sterk, communicatief vaardig, zelfstandig en flexibel. Ik word blij van klantcontacten, aanpakken, samenwerken, afwisseling en verantwoordelijkheid nemen. Als persoon ben ik sociaal, ondernemend, positief en sportief.", "summary": "Annemarijne Moreu is a Senior Project Manager at the Nationaal Archief in The Hague, Netherlands, with over 29 years of experience in project management, particularly in B2B and B2C environments. Her expertise includes agile methodologies, marketing, and communication, focusing on customer needs and collaboration across various departments. Currently, she manages projects related to data accessibility, service delivery, and project management optimization at the Nationaal Archief. Previously, she held roles at Gemeente Rotterdam and PostNL, where she led various IT and process optimization projects, demonstrating her strong planning, communication, and leadership skills. 
Moreu is known for her proactive and social approach to work.", diff --git a/data/custodian/person/entity/annemartens1_20251210T230220Z.json b/data/custodian/person/entity/annemartens1_20251210T230220Z.json index ddc8658ebe..65fde0b3d6 100644 --- a/data/custodian/person/entity/annemartens1_20251210T230220Z.json +++ b/data/custodian/person/entity/annemartens1_20251210T230220Z.json @@ -13,7 +13,7 @@ "name": "Anne Martens", "linkedin_url": "https://www.linkedin.com/in/annemartens1", "headline": "Communicatieadviseur educatie", - "location": "Als freelance journalist heb ik een verhalenradar die altijd aanstaat. Als ik een verhaal op het spoor ben, dan kan ik niet anders dan dat verhaal uitpluizen en delen met krantenlezers, radioluisteraars of televisiekijkers. Ik ben niet bang om in complexe materie te duiken, onbekende vakgebieden te verkennen en wetenschappelijke publicaties en experts te raadplegen. Ik maak verhalen voor NRC Handelsblad, NEMOKennislink.nl, Antoni van Leeuwenhoekziekenhuis, de NTR en de VPRO. Onderwerpen: biologie, aardwetenschappen, medische ethiek, geneeskunde en fertiliteit. Show less", + "location": "Netherlands", "connections": "500 connections • 925 followers", "about": "Total Experience: 17 years", "summary": "Anne Martens is a seasoned communication advisor specializing in education, currently working at the Nationaal Archief in the Netherlands. With 17 years of experience, she has a diverse background that includes freelance journalism, where she has contributed to various prominent publications such as NRC Handelsblad and NEMOKennislink.nl. Her journalistic work has focused on complex topics in biology, earth sciences, medical ethics, and medicine. 
Martens has also produced content for science programs and radio documentaries, showcasing her ability to engage with intricate subjects and communicate them effectively to the public.", diff --git a/data/custodian/person/entity/arjan-diepeveen-73b21640_20251210T234227Z.json b/data/custodian/person/entity/arjan-diepeveen-73b21640_20251210T234227Z.json index 938f58c1c7..c696deb4df 100644 --- a/data/custodian/person/entity/arjan-diepeveen-73b21640_20251210T234227Z.json +++ b/data/custodian/person/entity/arjan-diepeveen-73b21640_20251210T234227Z.json @@ -13,7 +13,7 @@ "name": "Arjan Diepeveen", "linkedin_url": "https://www.linkedin.com/in/arjan-diepeveen-73b21640", "headline": "Senior Test Automation Engineer at Nationaal Archief", - "location": "Arjan is een betrouwbare, professionele en collegiale medewerker die zowel zelfstandig als in een team goed presteert. Hij staat voor kwaliteit, zonder daarbij de kwantiteit uit het oog te verliezen. Hij is bereid om net die extra stap te zetten om het maximale resultaat te behalen. Arjan is van origine een echte techneut. Hij heeft in zijn loopbaan een zeer uitgebreide en gefundeerde hoeveelheid technische kennis opgedaan in verschillende functies en verschillende branches. Met zijn oog voor kwaliteit heeft hij zich in de loop van de jaren meer en meer verdiept in het testen en ontwikkeld als test consultant. Zijn kracht ligt in het heel snel in kaart kunnen brengen en doorgronden van zeer complexe omgevingen en systemen. Nieuwe en onbekende dingen maakt hij zich razendsnel eigen. Hij weet daarbij als geen andere deze kennis over te dragen door complexe zaken te vertalen naar een begrijpelijk niveau. Onder hectische stressvolle situaties blijft Arjan uitermate rustig, flexibel en analytisch. 
Hij gaat graag de uitdaging aan.", + "location": "Netherlands", "connections": "374 connections • 380 followers", "about": "Arjan is een betrouwbare, professionele en collegiale medewerker die zowel zelfstandig als in een team goed presteert. Hij staat voor kwaliteit, zonder daarbij de kwantiteit uit het oog te verliezen. Hij is bereid om net die extra stap te zetten om het maximale resultaat te behalen. Arjan is van origine een echte techneut. Hij heeft in zijn loopbaan een zeer uitgebreide en gefundeerde hoeveelheid technische kennis opgedaan in verschillende functies en verschillende branches. Met zijn oog voor kwaliteit heeft hij zich in de loop van de jaren meer en meer verdiept in het testen en ontwikkeld als test consultant. Zijn kracht ligt in het heel snel in kaart kunnen brengen en doorgronden van zeer complexe omgevingen en systemen. Nieuwe en onbekende dingen maakt hij zich razendsnel eigen. Hij weet daarbij als geen andere deze kennis over te dragen door complexe zaken te vertalen naar een begrijpelijk niveau. Onder hectische stressvolle situaties blijft Arjan uitermate rustig, flexibel en analytisch. Hij gaat graag de uitdaging aan.", "summary": "Arjan Diepeveen is a Senior Test Automation Engineer at Nationaal Archief in the Netherlands, with over 26 years of experience in various technical roles. He excels in both independent and team settings, emphasizing quality while maintaining efficiency. His expertise includes test automation using tools like Robot Framework, Selenium, and Docker, and he works within SCRUM/Agile/DevOps teams to develop custom solutions for the National Archives. Arjan has also held positions at Rijkswaterstaat and Nederlandse Spoorwegen, focusing on technical safety and infrastructure testing. 
He is known for his ability to quickly understand complex systems and effectively communicate technical concepts.", diff --git a/data/custodian/person/entity/roger-mous-203b2922a_20251210T230111Z.json b/data/custodian/person/entity/roger-mous-203b2922a_20251210T230111Z.json index ffae856242..5abaa3017d 100644 --- a/data/custodian/person/entity/roger-mous-203b2922a_20251210T230111Z.json +++ b/data/custodian/person/entity/roger-mous-203b2922a_20251210T230111Z.json @@ -13,7 +13,7 @@ "name": "Roger Mous", "linkedin_url": "https://www.linkedin.com/in/roger-mous-203b2922a", "headline": "Floormanager afdeling Organisatie en Presentatie (O&P), Nationaal Archief", - "location": "Enthousiast, kwaliteitsgericht, stressbestendige, gemotiveerde professional. Als sturende en motiverend persoon zet ik mij in zodat het team kan voldoen aan de hoogste eisen. Het coördineren, aansturen van medewerkers en zorgen voor kennisoverdracht.", + "location": "Den Haag, Zuid-Holland, Nederland", "connections": "278 connections • 281 followers", "about": "Enthousiast, kwaliteitsgericht, stressbestendige, gemotiveerde professional. Als sturende en motiverend persoon zet ik mij in zodat het team kan voldoen aan de hoogste eisen. Het coördineren, aansturen van medewerkers en zorgen voor kennisoverdracht.", "summary": "Roger Mous is currently the Floormanager at the Nationaal Archief in The Hague, Netherlands, with nearly 35 years of professional experience. He is known for his enthusiastic, quality-oriented, and stress-resistant approach, focusing on team coordination and knowledge transfer. His career includes various roles at the Nationaal Archief and the Royal House, where he managed logistics, events, and catering services. Mous has a background in military service and education in facility management and hospitality. 
He has held multiple managerial positions, demonstrating strong leadership and organizational skills.", diff --git a/data/entity_annotation/docs/PROVENANCE_SOURCES.md b/data/entity_annotation/docs/PROVENANCE_SOURCES.md new file mode 100644 index 0000000000..0ee4b1f412 --- /dev/null +++ b/data/entity_annotation/docs/PROVENANCE_SOURCES.md @@ -0,0 +1,635 @@ +# Provenance Sources for PiCo Historical Document Examples + +This document provides detailed provenance information for the real historical document sources used in the PiCo (Persons in Context) ontology integration examples within the CH-Annotator convention. + +**Last Updated**: 2025-12-12 +**Author**: GLAM Project +**Version**: 1.0.0 + +--- + +## Table of Contents + +1. [Hebrew Ketubah (Jewish Marriage Contracts)](#1-hebrew-ketubah-jewish-marriage-contracts) +2. [Arabic Waqf Documents (Islamic Endowments)](#2-arabic-waqf-documents-islamic-endowments) +3. [Ottoman Turkish Sijill (Sharia Court Registers)](#3-ottoman-turkish-sijill-sharia-court-registers) +4. [Russian Metrical Books (Church Records)](#4-russian-metrical-books-church-records) +5. [Spanish Colonial Baptism Records](#5-spanish-colonial-baptism-records) +6. [Italian Notarial Records](#6-italian-notarial-records) +7. [Greek Orthodox Church Records](#7-greek-orthodox-church-records) +8. [Dutch Civil Registry Records](#8-dutch-civil-registry-records) +9. [License and Attribution Requirements](#9-license-and-attribution-requirements) + +--- + +## 1. 
Hebrew Ketubah (Jewish Marriage Contracts) + +### 1.1 Yale Beinecke Library - Mashhad Ketubah (1896) + +| Field | Value | +|-------|-------| +| **Archive** | Yale University, Beinecke Rare Book & Manuscript Library | +| **Collection** | Hebrew Manuscripts Supplement | +| **Call Number** | Hebrew MSS suppl 194 | +| **Digital URL** | https://digital.library.yale.edu/catalog/2067542 | +| **Document Type** | Ketubah (Jewish marriage contract) | +| **Date** | 23 Elul 5656 (September 1, 1896 CE) | +| **Place** | Mashhad, Iran | +| **Language** | Hebrew, Aramaic | +| **Access Date** | 2025-12-12 | +| **License** | Public Domain (pre-1929) | + +**Persons Identified:** +- **Groom**: Mosheh ben Mashiah (משה בן משיאח) +- **Bride**: Rivkah bat Ya'akov (רבקה בת יעקב) + +**Notes**: This ketubah is from the crypto-Jewish community of Mashhad, known as the Jadid al-Islam, who maintained Jewish practices in secret after forced conversion in 1839. The document follows standard Sephardic/Mizrahi ketubah format. + +--- + +### 1.2 Philadelphia Mikveh Israel Ketubah (1842) + +| Field | Value | +|-------|-------| +| **Archive** | Congregation Mikveh Israel, Philadelphia | +| **Collection** | Philadelphia Congregations Records | +| **Digital URL** | https://philadelphiacongregations.org/records/item/MikvehIsrael.MarriageCertificate1842 | +| **Document Type** | Ketubah (Jewish marriage contract) | +| **Date** | 1842 CE | +| **Place** | Philadelphia, Pennsylvania, USA | +| **Language** | Aramaic (traditional text), English (translation provided) | +| **Access Date** | 2025-12-12 | +| **License** | Educational use permitted | + +**Key Features:** +- Full Aramaic text transcription available +- English translation provided by archive +- Example of American Sephardic ketubah format + +**Sample Aramaic Text** (from source): +``` +בשבת... בשבת... יום... לחדש... שנת... לבריאת עולם למנין שאנו מונין כאן... +איך החתן... בר... אמר לה להדא בתולתא... בת... 
+``` + +--- + +### 1.3 College of Charleston Ketubah (1908) + +| Field | Value | +|-------|-------| +| **Archive** | College of Charleston, Special Collections | +| **Collection** | Jewish Heritage Collection | +| **Document Type** | Ketubah | +| **Date** | 1908 CE | +| **Language** | Hebrew, Aramaic | +| **Access Date** | 2025-12-12 | + +**Persons Identified:** +- **Bride**: Esther Devorah bat Rabbi Abraham (אסתר דבורה בת ר׳ אברהם) +- **Groom**: Rabbi Yitzchak (ר׳ יצחק) + +--- + +### 1.4 Rhodes Jewish Museum Collection + +| Field | Value | +|-------|-------| +| **Archive** | Rhodes Jewish Museum | +| **Location** | Rhodes, Greece | +| **Collection** | Historical Documents | +| **Document Types** | Ketubot, community records | +| **Period** | 19th-20th century | +| **Language** | Ladino, Hebrew, Greek | + +**Notes**: Documents from the historic Sephardic Jewish community of Rhodes, with unique Ladino elements. + +--- + +## 2. Arabic Waqf Documents (Islamic Endowments) + +### 2.1 Cambridge Digital Library - Islamic Collections + +| Field | Value | +|-------|-------| +| **Archive** | Cambridge University Library | +| **Collection** | Islamic Manuscripts | +| **Digital URL** | https://cudl.lib.cam.ac.uk/collections/islamic | +| **Document Types** | Waqfiyya, legal documents, correspondence | +| **Period** | 8th-20th century CE | +| **Languages** | Arabic, Persian, Ottoman Turkish | +| **License** | CC BY-NC 4.0 | +| **Access Date** | 2025-12-12 | + +**Key Collections:** +- Genizah Collection (Cairo Genizah fragments) +- Arabic Scientific Manuscripts +- Islamic Legal Documents + +--- + +### 2.2 UPenn OPenn - Manuscripts of the Muslim World + +| Field | Value | +|-------|-------| +| **Archive** | University of Pennsylvania Libraries | +| **Collection** | Manuscripts of the Muslim World | +| **Digital URL** | https://openn.library.upenn.edu/html/muslimworld_contents.html | +| **Document Types** | Waqfiyya, Quranic manuscripts, legal documents | +| **Period** | 9th-20th 
century CE | +| **Languages** | Arabic, Persian, Ottoman Turkish | +| **License** | Public Domain / CC0 | +| **Access Date** | 2025-12-12 | + +**Notable Holdings:** +- Waqfiyya documents from Egypt, Syria, Turkey +- Legal formularies with waqf templates +- Property deeds and endowment records + +--- + +### 2.3 Singapore National Heritage Board - Istanbul Waqf + +| Field | Value | +|-------|-------| +| **Archive** | Singapore National Heritage Board | +| **Collection** | Roots.gov.sg | +| **Accession Number** | 1115401 | +| **Digital URL** | https://www.roots.gov.sg/Collection-Landing/listing/1115401 | +| **Document Type** | Waqf document | +| **Donor/Creator** | Muhammad b. Abd al-Ghani (محمد بن عبد الغني) | +| **Properties** | Istanbul (various locations) | +| **Language** | Ottoman Turkish, Arabic | +| **Access Date** | 2025-12-12 | + +**Key Features:** +- Complete waqf document with property descriptions +- Lists endowed properties in Istanbul +- Named beneficiaries and conditions + +--- + +### 2.4 Haseki Sultan Waqfiyya (1552 CE) + +| Field | Value | +|-------|-------| +| **Archive** | Various (studied in UC Berkeley eScholarship) | +| **Document Type** | Waqfiyya (imperial endowment deed) | +| **Date** | 1552 CE | +| **Founder** | Haseki Hürrem Sultan (Roxelana) | +| **Language** | Ottoman Turkish, Arabic | +| **Research URL** | UC Berkeley eScholarship | + +**Significance**: One of the largest waqf endowments in Ottoman history, establishing charitable institutions across the empire. + +--- + +## 3. 
Ottoman Turkish Sijill (Sharia Court Registers) + +### 3.1 OpenJerusalem Project - Jerusalem Sharia Court Registers + +| Field | Value | +|-------|-------| +| **Archive** | OpenJerusalem Project | +| **Collection** | Jerusalem Sharia Court Registers | +| **Digital URL** | https://www.openjerusalem.org/ | +| **ARK Identifier** | ark:/58142/PfV7b | +| **Volume Count** | 102 registers | +| **Period** | 1834-1920 CE | +| **Language** | Ottoman Turkish, Arabic | +| **License** | Open Access | +| **Access Date** | 2025-12-12 | + +**Document Types:** +- Property sales (بيع) +- Marriage contracts (نكاح) +- Inheritance divisions (قسمة) +- Waqf registrations +- Debt acknowledgments (إقرار) +- Court testimonies (شهادة) + +**Key Features:** +- Searchable database with document transcriptions +- Photographs of original registers +- Multi-language metadata (Arabic, English, French) + +--- + +### 3.2 ISAM Istanbul Kadi Registers (Kadı Sicilleri) + +| Field | Value | +|-------|-------| +| **Archive** | İslam Araştırmaları Merkezi (ISAM) | +| **Collection** | Istanbul Kadı Sicilleri | +| **Digital URL** | http://www.kadisicilleri.org/ | +| **Volume Count** | 40+ volumes online | +| **Document Count** | 40,000+ documents | +| **Period** | 16th-19th century CE | +| **Language** | Ottoman Turkish | +| **License** | Research access | +| **Access Date** | 2025-12-12 | + +**Coverage:** +- Istanbul courts (multiple districts) +- Galata, Üsküdar, Eyüp +- Complete transcriptions with original images + +--- + +### 3.3 Istanbul Historical Kadi Registers Corpus + +| Field | Value | +|-------|-------| +| **Archive** | Istanbul Metropolitan Municipality | +| **Project** | History of Istanbul | +| **Digital URL** | https://istanbultarihi.ist/434-istanbul-sharia-court-registers | +| **Volume Count** | ~10,000 volumes | +| **Courts** | 26 different courts | +| **Period** | 1453-1922 CE | +| **Language** | Ottoman Turkish | + +**Significance**: Largest collection of Ottoman court records in 
existence. + +--- + +### 3.4 Harvard Ottoman Court Records Project + +| Field | Value | +|-------|-------| +| **Archive** | Harvard University | +| **Project** | Ottoman Court Records Project (OCRP) | +| **Digital URL** | https://cmes.fas.harvard.edu/projects/ocrp | +| **Document Types** | Sijill transcriptions, translations | +| **Period** | 16th-19th century CE | +| **Languages** | Ottoman Turkish (original), English (translations) | + +--- + +### 3.5 Bulgarian National Library - Ottoman Sijills + +| Field | Value | +|-------|-------| +| **Archive** | Bulgarian National Library | +| **Collection** | Oriental Department | +| **Sijill Count** | 160+ volumes | +| **Defter Count** | 1000+ registers | +| **Coverage** | Bulgarian Ottoman provinces | +| **Period** | 16th-19th century CE | +| **Language** | Ottoman Turkish, Arabic | + +--- + +## 4. Russian Metrical Books (Church Records) + +### 4.1 BYU Script Tutorial - Russian Metrical Books + +| Field | Value | +|-------|-------| +| **Institution** | Brigham Young University | +| **Project** | Script Tutorial | +| **Digital URL** | https://script.byu.edu/russian-handwriting/documents/record-types/metrical-books/births | +| **Document Type** | Tutorial with real transcription examples | +| **Languages** | Russian (Cyrillic), English (translation) | +| **License** | Educational use | +| **Access Date** | 2025-12-12 | + +**Content Includes:** +- Complete birth record format explanation +- Vocabulary lists with translations +- Sample transcriptions from actual metrical books +- Handwriting recognition guides + +**Sample Birth Record Structure** (from tutorial): +``` +В метрической книге записано: +Родился: [date] +Крещён: [date] +Имя: [name] +Родители: [father's full name with rank/status], законная жена его [mother's name] +Восприемники: [godparents] +Священник: [officiating priest] +``` + +--- + +### 4.2 FamilySearch Russia Church Records + +| Field | Value | +|-------|-------| +| **Archive** | FamilySearch | +| 
**Collection** | Russia Church Records | +| **Wiki URL** | https://www.familysearch.org/en/wiki/Russia_Church_Records | +| **Document Types** | Metrical books (births, marriages, deaths) | +| **Period** | 1722-1918 CE | +| **Languages** | Russian, Church Slavonic | +| **Access** | Free with registration | + +**Key Information:** +- Metrical books (метрические книги) mandated from 1722 +- Three-part structure: births/baptisms, marriages, deaths +- Contains estate/class (сословие) information + +--- + +### 4.3 Polish Archives - Kłobuck Parish Records + +| Field | Value | +|-------|-------| +| **Archive** | Szukaj w Archiwach (Polish State Archives) | +| **Parish** | Kłobuck | +| **Document Type** | Roman Catholic metrical books | +| **Period** | 18th-19th century | +| **Languages** | Latin, Polish, Russian | + +**Notes**: Example of Russian-era Polish parish records with parallel Latin/Russian entries. + +--- + +### 4.4 RGIA St. Petersburg + +| Field | Value | +|-------|-------| +| **Archive** | Russian State Historical Archive (RGIA) | +| **Location** | St. Petersburg, Russia | +| **Holdings** | 300+ metrical books | +| **Period** | 1832-1892 CE | +| **Document Types** | Orthodox, Catholic, Lutheran, Jewish metrical books | + +--- + +## 5. Spanish Colonial Baptism Records + +### 5.1 BYU Script Tutorial - Spanish Colonial Baptisms + +| Field | Value | +|-------|-------| +| **Institution** | Brigham Young University | +| **Project** | Script Tutorial | +| **Digital URL** | https://script.byu.edu/spanish-handwriting/documents/church-records/baptisms | +| **Document Type** | Tutorial with real transcription examples | +| **Languages** | Spanish (colonial), English | +| **License** | Educational use | +| **Access Date** | 2025-12-12 | + +**Standard Baptism Entry Structure:** +``` +En [place] a [date] bauticé solemnemente a [name], [legitimacy status] de [father] y de [mother]. +Fueron padrinos [godparents]. +Y para que conste lo firmo. 
+[Priest signature] +``` + +**Key Vocabulary:** +- hijo/hija legítimo/a = legitimate child +- hijo/hija natural = illegitimate child +- párvulo/a = infant +- español/a, indio/a, mestizo/a, mulato/a = casta categories +- padrinos/madrinas = godparents + +--- + +### 5.2 FamilySearch Mexico - Yucatán Catholic Church Records + +| Field | Value | +|-------|-------| +| **Archive** | FamilySearch | +| **Collection** | Mexico, Yucatán, Catholic Church Records, 1543-1977 | +| **Collection ID** | 1909116 | +| **Digital URL** | https://www.familysearch.org/en/search/collection/1909116 | +| **Period** | 1543-1977 CE | +| **Document Types** | Baptisms, marriages, deaths, confirmations | +| **Language** | Spanish, Latin, Maya | +| **Access** | Free with registration | + +**Coverage:** +- 200+ parishes +- Some of the earliest New World records (from 1543) +- Indigenous Maya populations + +--- + +### 5.3 Archivo General de la Nación (AGN) Mexico + +| Field | Value | +|-------|-------| +| **Archive** | Archivo General de la Nación | +| **Location** | Mexico City, Mexico | +| **Holdings** | Colonial parish records, civil registry | +| **Period** | 16th-20th century CE | +| **Languages** | Spanish, Nahuatl, Latin | + +--- + +## 6. 
Italian Notarial Records + +### 6.1 Antenati - Italian State Archives Portal + +| Field | Value | +|-------|-------| +| **Archive** | Italian Ministry of Culture | +| **Project** | Antenati (Ancestors) | +| **Digital URL** | https://antenati.cultura.gov.it/ | +| **Venice URL** | https://antenati.cultura.gov.it/archivio/state-archives-of-venezia/?lang=en | +| **Document Types** | Civil registry, notarial acts, parish records | +| **Period** | 1806-present (civil); 15th century+ (notarial) | +| **Languages** | Italian, Latin, Venetian | +| **License** | Open Access | +| **Access Date** | 2025-12-12 | + +**Venice State Archive Holdings:** +- Civil Registry (Stato Civile) 1806-1815 (Napoleonic period) +- Notarial archives (Archivio Notarile) +- Guild records (Arti e Mestieri) + +--- + +### 6.2 OAC California Digital Library - Italian Notarial Documents + +| Field | Value | +|-------|-------| +| **Archive** | University of California Libraries | +| **Collection** | Italian Notarial Documents Collection | +| **Finding Aid** | https://oac.cdlib.org/findaid/ark:%2F13030%2Fc8v412zd | +| **Document Count** | 168 documents | +| **Period** | 1465-1635 CE | +| **Locations** | Venice, Padua, Verona | +| **Languages** | Latin, Italian (Venetian) | +| **Access Date** | 2025-12-12 | + +**Document Types:** +- Contracts (contratti) +- Wills (testamenti) +- Property transfers +- Marriage agreements (sponsalia) +- Business partnerships + +--- + +### 6.3 SION-Digit Project - Jewish Notarial Records + +| Field | Value | +|-------|-------| +| **Project** | SION-Digit (Sources for the History of Italian Jewish Notarial Documents) | +| **Coverage** | Venice, Bordeaux, Amsterdam | +| **Period** | 16th-18th century CE | +| **Focus** | Jewish community notarial acts | +| **Languages** | Italian, Hebrew, Ladino | + +--- + +## 7. 
Greek Orthodox Church Records + +### 7.1 FamilySearch Greece Church Records + +| Field | Value | +|-------|-------| +| **Archive** | FamilySearch | +| **Wiki URL** | https://www.familysearch.org/en/wiki/Greece_Church_Records | +| **Document Types** | Baptisms, marriages, deaths | +| **Period** | 17th century - 1925 CE | +| **Language** | Greek | +| **Access** | Free with registration | + +**Key Information:** +- Greek Orthodox records are the primary source before civil registration began in 1925 +- Male registers (μητρώα αρρένων) for military service +- Some records are in Ottoman Turkish for the pre-independence period + +--- + +### 7.2 General State Archives of Greece (GAK) + +| Field | Value | +|-------|-------| +| **Archive** | Γενικά Αρχεία του Κράτους (GAK) | +| **Document Types** | Church records, civil registry, Ottoman-era documents | +| **Period** | 15th century - present | +| **Languages** | Greek, Ottoman Turkish | + +--- + +### 7.3 Greek Ancestry Resources + +| Field | Value | +|-------|-------| +| **Resource** | Greek Ancestry | +| **Coverage** | Village church records guide | +| **Document Types** | Baptismal registers, marriage registers | +| **Key Features** | Guides to accessing island and mainland records | + +--- + +## 8. 
Dutch Civil Registry Records + +### 8.1 WieWasWie (Dutch Genealogical Database) + +| Field | Value | +|-------|-------| +| **Archive** | Centraal Bureau voor Genealogie (CBG) | +| **Project** | WieWasWie | +| **Digital URL** | https://www.wiewaswie.nl/ | +| **Document Types** | Birth, marriage, death certificates | +| **Period** | 1811-present (civil); 1600s+ (church) | +| **Languages** | Dutch | +| **Access** | Subscription / Free at archives | + +--- + +### 8.2 Dutch Provincial Archives + +| Province | Archive | Holdings | +|----------|---------|----------| +| Noord-Holland | Noord-Hollands Archief | Civil registry from 1811, church records from 1600s | +| Zuid-Holland | Nationaal Archief | Central government records | +| Gelderland | Gelders Archief | Regional archives | +| Noord-Brabant | Brabants Historisch Informatie Centrum | Catholic parish records | + +--- + +### 8.3 Dutch Marriage Certificate Format + +**Standard 19th-Century Format:** +``` +Heden den [date] compareerden voor ons [official name], +Ambtenaar van den Burgerlijken Stand der Gemeente [municipality]: + +De Bruidegom: [groom's name], oud [age] jaren, [occupation], +geboren te [birthplace], wonende te [residence], +zoon van [father] en van [mother]; + +De Bruid: [bride's name], oud [age] jaren, +geboren te [birthplace], wonende te [residence], +dochter van [father] en van [mother]; + +Getuigen: [4 witnesses with ages, occupations, relationships] + +En hebben wij dit huwelijk voltrokken in tegenwoordigheid van voornoemde getuigen. +``` + +--- + +## 9. 
License and Attribution Requirements + +### Open Access Resources + +| Source | License | Attribution Required | +|--------|---------|---------------------| +| Cambridge Digital Library | CC BY-NC 4.0 | Yes | +| UPenn OPenn | Public Domain / CC0 | No (but encouraged) | +| OpenJerusalem | Open Access | Yes | +| Antenati | Open Access | Yes | +| FamilySearch | Terms of Service | Yes | +| BYU Script Tutorial | Educational Use | Yes | + +### Recommended Citation Format + +For PiCo extraction examples, use the following provenance block in YAML: + +```yaml +provenance: + source_url: "https://example.org/document/12345" + archive_name: "Example Archive" + collection: "Collection Name" + document_id: "Document Identifier" + access_date: "2025-12-12" + license: "CC BY-NC 4.0" + attribution: "Courtesy of Example Archive. Used under CC BY-NC 4.0 license." + notes: "Transcription verified against original digital image." +``` + +### Data Fabrication Prohibition + +**CRITICAL**: Per project rules (AGENTS.md Rule 21), all extraction examples MUST use real data from these verified sources. No fabrication of person names, dates, relationships, or document content is permitted. + +The only exception is when real data is not available from a source; in that case the extraction example must be explicitly marked as synthetic, so that it cannot be mistaken for real data: + +```yaml +provenance: + source_url: null + data_status: "SYNTHETIC_EXAMPLE" + notes: "This example uses synthetic data for demonstration purposes only. Do not cite as historical evidence." 
+``` + +--- + +## Document Type Coverage Summary + +| Document Type | Real Sources Available | Examples with Provenance | +|--------------|------------------------|--------------------------| +| Hebrew Ketubah | 4+ archives | Yale (1896), Philadelphia (1842) | +| Arabic Waqf | 3+ archives | Cambridge, UPenn, Singapore | +| Ottoman Sijill | 5+ archives | OpenJerusalem, ISAM, Harvard | +| Russian Metrical | 4+ archives | BYU Tutorial, RGIA | +| Spanish Colonial Baptism | 3+ archives | BYU Tutorial, FamilySearch | +| Italian Notarial | 3+ archives | Antenati, OAC/CDL | +| Greek Orthodox | 3+ archives | FamilySearch, GAK | +| Dutch Civil Registry | 3+ archives | WieWasWie, Provincial | + +--- + +## Changelog + +| Date | Version | Changes | +|------|---------|---------| +| 2025-12-12 | 1.0.0 | Initial compilation of provenance sources | + diff --git a/data/entity_annotation/modules/index.yaml b/data/entity_annotation/modules/index.yaml index ec2d982f84..051675c330 100644 --- a/data/entity_annotation/modules/index.yaml +++ b/data/entity_annotation/modules/index.yaml @@ -152,6 +152,28 @@ modules: - path: "integrations/nif_nerd.yaml" description: "NIF/NERD/Open Annotation compatibility layer with GLAM-NER mappings" + # --------------------------------------------------------------------------- + # RELATIONSHIP MODULES - Family and social relationship patterns + # --------------------------------------------------------------------------- + relationships: + - path: "relationships/family.yaml" + description: "Family relationship properties and historical source patterns (34 relationship types, 13 languages)" + line_count: 1503 + languages: + - "Dutch" + - "Latin" + - "German" + - "Arabic" + - "French" + - "Ottoman Turkish" + - "Hebrew" + - "Persian/Farsi" + - "Spanish" + - "Portuguese" + - "Italian" + - "Greek" + - "Russian" + # --------------------------------------------------------------------------- # ADVANCED MODULES - Complex annotation patterns # 
--------------------------------------------------------------------------- diff --git a/data/entity_annotation/modules/integrations/pico.yaml b/data/entity_annotation/modules/integrations/pico.yaml deleted file mode 100644 index 676acefff3..0000000000 --- a/data/entity_annotation/modules/integrations/pico.yaml +++ /dev/null @@ -1,2244 +0,0 @@ -# ============================================================================= -# GLAM-NER Entity Annotation Convention v1.7.0 -# Module: integrations/pico.yaml -# ============================================================================= -# PiCO (Person in Context Ontology) integration for person observation modeling. -# Enables tracking provenance of person mentions and linking to formal records. -# -# Key concepts: -# - PersonObservation: A textual mention of a person (source-bound) -# - PersonName (PNV): Structured name components -# - Person (CIDOC-CRM E21): Reconstructed person entity -# -# References: -# - PiCo Ontology: https://w3id.org/pico -# - Person Name Vocabulary (PNV): https://w3id.org/pnv -# - CIDOC-CRM: https://www.cidoc-crm.org/ -# ============================================================================= - -pico_integration: - - description: | - PiCO (Person in Context Ontology) models textual observations of persons - as distinct from reconstructed person entities. This enables: - - Tracking provenance of person mentions - - Handling name variations across sources - - Linking observations to formal person records - - The observation/reconstruction pattern separates: - 1. What was OBSERVED in text (PersonObservation) - source-bound, exact - 2. What was RECONSTRUCTED as entity (E21_Person) - inferred, normalized - - This is critical for heritage data where the same person may appear with - different name forms, titles, or spellings across sources. 
- - # --------------------------------------------------------------------------- - # Core Observation Pattern - # --------------------------------------------------------------------------- - - observation_pattern: - description: "Every person mention creates a PersonObservation" - class: "picom:PersonObservation" - class_uri: "https://w3id.org/pico/PersonObservation" - - properties: - - property: "picom:hasObservedName" - description: "The name string as it appears in text" - range: "pnv:PersonName" - cardinality: "1" - note: "Exact transcription of name from source" - - - property: "picom:isObservationOf" - description: "Links to reconstructed Person entity" - range: "crm:E21_Person" - cardinality: "0..1" - note: "May be null if person not yet identified" - - - property: "prov:hadPrimarySource" - description: "The source document/webpage" - range: "prov:Entity" - cardinality: "1" - note: "Required for provenance tracking" - - - property: "picom:observedAt" - description: "When the observation was made" - range: "xsd:dateTime" - cardinality: "1" - note: "Extraction timestamp, not document date" - - - property: "picom:observedInContext" - description: "Surrounding text context" - range: "xsd:string" - cardinality: "0..1" - note: "For disambiguation when reviewing" - - - property: "picom:hasRole" - description: "Role/position observed with the person" - range: "xsd:string" - cardinality: "0..*" - note: "Links to ROLE hypernym when extracted" - - # --------------------------------------------------------------------------- - # Person Name Vocabulary (PNV) - # --------------------------------------------------------------------------- - - pnv_name_structure: - description: | - Person Name Vocabulary (PNV) provides structured name components. - This enables proper parsing of complex name structures across cultures. 
- - class: "pnv:PersonName" - class_uri: "https://w3id.org/pnv/PersonName" - - components: - - property: "pnv:literalName" - description: "Full name as single string" - examples: - - "Dr. Maria van den Berg" - - "Rembrandt Harmenszoon van Rijn" - - "Queen Elizabeth II" - note: "Original string before parsing" - - - property: "pnv:givenName" - description: "First/given name" - examples: - - "Rembrandt" - - "Maria" - - "Jan" - - "Elizabeth" - note: "Personal name, not surname" - - - property: "pnv:patronym" - description: "Patronymic name component" - examples: - - "Harmenszoon" - - "Janszoon" - - "Pietersdochter" - note: "Common in Dutch, Scandinavian, Slavic names" - - - property: "pnv:surnamePrefix" - description: "Prefix to surname (tussenvoegsel)" - examples: - - "van" - - "de" - - "van den" - - "van der" - - "op de" - - "'t" - - "von" - - "di" - note: "Language-specific, affects sorting" - - - property: "pnv:baseSurname" - description: "Core surname without prefix" - examples: - - "Rijn" - - "Berg" - - "Velde" - - "Gogh" - note: "Primary sorting component in Dutch" - - - property: "pnv:honorificPrefix" - description: "Title or honorific before name" - examples: - - "Dr." - - "Prof." - - "Prof. dr." - - "Sir" - - "Queen" - - "Mr." - - "Drs." - - "Ir." - note: "May indicate role - link to ROL" - - - property: "pnv:honorificSuffix" - description: "Title or honorific after name" - examples: - - "PhD" - - "Jr." 
- - "III" - - "MD" - - "RA" - - "MSc" - note: "Credentials and generational markers" - - - property: "pnv:infixTitle" - description: "Title within name structure" - examples: - - "graaf van" - - "baron de" - - "duke of" - note: "Nobility titles embedded in name" - - # --------------------------------------------------------------------------- - # Dutch Name Conventions (Project-Specific) - # --------------------------------------------------------------------------- - - dutch_name_patterns: - description: | - Special handling for Dutch names with tussenvoegsels (surname prefixes). - Dutch sorting rules differ from other languages. - - tussenvoegsel_list: - - "van" - - "van de" - - "van den" - - "van der" - - "de" - - "den" - - "het" - - "'t" - - "ter" - - "ten" - - "op de" - - "op den" - - "in 't" - - "in de" - - sorting_rule: | - In Dutch, surnames sort by baseSurname, ignoring tussenvoegsel. - "Vincent van Gogh" sorts under "G" not "V". - "Maria van den Berg" sorts under "B" not "V". - - capitalization_rule: | - Tussenvoegsel lowercase when preceded by given name: - - "Vincent van Gogh" (not "Vincent Van Gogh") - - "Van Gogh" (surname alone, capitalized) - - "de heer Van Gogh" (formal, capitalized) - - # --------------------------------------------------------------------------- - # Integration with GLAM-NER Hypernyms - # --------------------------------------------------------------------------- - - hypernym_mapping: - description: "How PiCo concepts map to GLAM-NER v1.7.0 hypernyms" - - mappings: - - pico_class: "picom:PersonObservation" - glam_hypernym: "AGT.PER" - glam_code: "AGT.PER" - note: "Person observations create AGT.PER entities" - - - pico_class: "picom:PersonObservation" - glam_hypernym: "AGT.STF" - glam_code: "AGT.STF" - condition: "When observed with organizational role" - note: "Staff members with role context" - - - pico_class: "pnv:PersonName" - glam_hypernym: "APP.NAM" - glam_code: "APP.NAM" - note: "Name strings as appellations" - - - 
pico_class: "picom:hasRole" - glam_hypernym: "ROL" - glam_code: "ROL" - note: "Extracted roles link to ROL hypernym" - - # --------------------------------------------------------------------------- - # Example Annotations - # --------------------------------------------------------------------------- - - examples: - - description: "Staff member with title and role" - text: "Dr. Maria van den Berg, Director" - - observation: - type: "picom:PersonObservation" - id: "_:obs1" - - hasObservedName: - type: "pnv:PersonName" - literalName: "Dr. Maria van den Berg" - honorificPrefix: "Dr." - givenName: "Maria" - surnamePrefix: "van den" - baseSurname: "Berg" - - hasRole: "Director" - hadPrimarySource: "https://example.org/staff-page" - observedAt: "2025-12-02T10:30:00Z" - - glam_ner_annotations: - - span: "Dr. Maria van den Berg" - type: "AGT.STF" - code: "AGT.STF" - confidence: 0.95 - - - span: "Director" - type: "ROL.TIT" - code: "ROL.TIT" - confidence: 0.98 - - - description: "Historical artist" - text: "Rembrandt van Rijn painted this in 1642" - - observation: - type: "picom:PersonObservation" - id: "_:obs2" - - hasObservedName: - type: "pnv:PersonName" - literalName: "Rembrandt van Rijn" - givenName: "Rembrandt" - surnamePrefix: "van" - baseSurname: "Rijn" - - isObservationOf: "wd:Q5598" # Wikidata Rembrandt - hadPrimarySource: "https://example.org/artwork-page" - observedAt: "2025-12-02T10:35:00Z" - - glam_ner_annotations: - - span: "Rembrandt van Rijn" - type: "AGT.PER" - code: "AGT.PER" - confidence: 0.99 - linking: - wikidata: "Q5598" - viaf: "64013650" - - - description: "Nobility title" - text: "Count Willem van Loon" - - observation: - type: "picom:PersonObservation" - id: "_:obs3" - - hasObservedName: - type: "pnv:PersonName" - literalName: "Count Willem van Loon" - honorificPrefix: "Count" - givenName: "Willem" - surnamePrefix: "van" - baseSurname: "Loon" - - hadPrimarySource: "https://example.org/archive-doc" - observedAt: "2025-12-02T10:40:00Z" - - 
glam_ner_annotations: - - span: "Count Willem van Loon" - type: "AGT.PER" - code: "AGT.PER" - confidence: 0.95 - - - span: "Count" - type: "ROL.HON" - code: "ROL.HON" - note: "Nobility title - honorific role" - - # --------------------------------------------------------------------------- - # Provenance Chain - # --------------------------------------------------------------------------- - - provenance_model: - description: | - PiCo observations maintain full provenance chain: - - Observation → Source Document → Extraction Activity → Agent - - This enables: - - Tracking where each name form was found - - Attributing extractions to human/ML agents - - Maintaining audit trail for corrections - - chain_structure: - observation: - class: "picom:PersonObservation" - properties: - - "prov:hadPrimarySource" # → Source document - - "prov:wasGeneratedBy" # → Extraction activity - - source: - class: "prov:Entity" - properties: - - "prov:wasAttributedTo" # → Publisher/author - - "dct:created" # → Document date - - activity: - class: "prov:Activity" - properties: - - "prov:wasAssociatedWith" # → Extraction agent - - "prov:used" # → ML model or rules - - "prov:startedAtTime" # → Extraction timestamp - - agent: - class: "prov:Agent" - examples: - - "Human curator" - - "spaCy NER model" - - "GLAM-NER extraction pipeline" - -# ============================================================================= -# SOURCE TYPE EXTENSIONS -# ============================================================================= -# -# PiCo PersonObservation can be extracted from many source types. -# Each source type may have specific extraction patterns, but the core -# PiCo model (observation → name → roles → provenance) remains the same. -# -# Source-specific extraction logic belongs in APPLICATION LAYER scripts, -# not in this convention. This section defines the ABSTRACT patterns. 
-# ============================================================================= - -source_type_patterns: - description: | - PersonObservation sources fall into categories with different extraction - patterns. The CH-Annotator handles all source types using the same - core PiCo model, with source-specific field mappings at extraction time. - - # --------------------------------------------------------------------------- - # Source Categories - # --------------------------------------------------------------------------- - - categories: - modern_digital: - description: "Contemporary digital sources with structured data" - examples: - - "LinkedIn profiles" - - "Institutional staff directories" - - "Academic profile pages" - - "ORCID records" - characteristics: - - "Semi-structured HTML/JSON" - - "Current/living persons" - - "Self-reported information" - - "Timestamped updates" - typical_properties: - - "sdo:name" - - "sdo:jobTitle" - - "sdo:hasOccupation" - - "sdo:alumniOf" - - "sdo:knowsAbout" - - historical_indices: - description: "Early modern and historical name indices" - examples: - - "Notarial protocol indices" - - "Church register indices" - - "Census indices" - - "Guild membership lists" - - "Property transfer records" - characteristics: - - "Abbreviated names" - - "Patronymics common" - - "Latin/vernacular mixing" - - "Occupation as identifier" - - "Relational identification ('wife of', 'son of')" - typical_properties: - - "pnv:literalName" - - "pnv:patronym" - - "picom:hasRole" - - "crm:P107_has_current_or_former_member" - - "sdo:spouse" - - "sdo:parent" - - archival_descriptions: - description: "Finding aids, inventories, and archival descriptions" - examples: - - "EAD finding aids" - - "ISAD(G) descriptions" - - "Collection inventories" - - "RiC-O records" - characteristics: - - "Hierarchical context" - - "Provenance-focused" - - "Creator/contributor roles" - - "Temporal spans" - typical_properties: - - "rico:hasCreator" - - "rico:hasOrHadHolder" - - 
"crm:P14_carried_out_by" - - "crm:P11_had_participant" - - biographical_dictionaries: - description: "Structured biographical reference works" - examples: - - "Dictionary of National Biography" - - "KNAW DWDD" - - "Allgemeines Künstlerlexikon" - - "Thieme-Becker" - characteristics: - - "Standardized entries" - - "Birth/death dates" - - "Career summaries" - - "Cross-references" - typical_properties: - - "sdo:birthDate" - - "sdo:deathDate" - - "sdo:birthPlace" - - "sdo:deathPlace" - - "crm:P98_brought_into_life" - - "crm:P100_was_death_of" - - # --------------------------------------------------------------------------- - # Universal Observation Properties (All Source Types) - # --------------------------------------------------------------------------- - - universal_properties: - description: | - These properties apply to PersonObservation regardless of source type. - They form the core of the PiCo extraction model. - - required: - - property: "picom:hasObservedName" - description: "The name string as it appears in source" - range: "pnv:PersonName" - - - property: "prov:hadPrimarySource" - description: "The source document/webpage/record" - range: "prov:Entity" - - - property: "picom:observedAt" - description: "When the observation was extracted" - range: "xsd:dateTime" - - optional: - - property: "picom:isObservationOf" - description: "Links to reconstructed Person entity (if identified)" - range: "crm:E21_Person" - - - property: "picom:hasRole" - description: "Role/position observed with the person" - range: "org:Role" - - - property: "picom:observedInContext" - description: "Surrounding text for disambiguation" - range: "xsd:string" - - - property: "picom:confidence" - description: "Confidence score for extraction" - range: "xsd:decimal" - - # --------------------------------------------------------------------------- - # Heritage Relevance Detection (Universal) - # --------------------------------------------------------------------------- - - 
heritage_relevance: - description: | - Person observations can be tagged for heritage sector relevance using - GLAMORCUBESFIXPHDNT type codes. This applies to all source types. - - type_codes: - G: "Gallery" - L: "Library" - A: "Archive" - M: "Museum" - O: "Official institution" - R: "Research center" - C: "Corporation" - U: "Unknown" - B: "Botanical garden / Zoo" - E: "Education provider" - S: "Collecting society" - F: "Feature / Monument" - I: "Intangible heritage" - X: "Mixed types" - P: "Personal collection" - H: "Holy site" - D: "Digital platform" - N: "NGO" - T: "Taste/smell heritage" - - detection_approach: | - Heritage relevance detection is SOURCE-SPECIFIC and belongs in the - application layer, not the convention. The convention defines: - 1. The type code vocabulary (GLAMORCUBESFIXPHDNT) - 2. The property for tagging (picom:heritageRelevance) - 3. The expected format (single-letter code + confidence) - - Application scripts implement source-specific keyword detection, - organization matching, or ML classification to populate this field. - -# ============================================================================= -# GLM-4.6 CH-ANNOTATOR INTEGRATION -# ============================================================================= -# -# The CH-Annotator can be invoked via GLM-4.6 API for automated extraction. -# The system prompt is SOURCE-AGNOSTIC and works with any text input. 
-# ============================================================================= - -glm_annotator_config: - model: "glm-4.6" - api_endpoint: "https://api.z.ai/api/coding/paas/v4/chat/completions" - temperature: 0.1 - max_tokens: 4000 - - # --------------------------------------------------------------------------- - # Core System Prompt (Source-Agnostic) - # --------------------------------------------------------------------------- - - system_prompt: | - You are a CH-Annotator (Cultural Heritage Annotator) v1.7.0 extraction agent - with PiCo (Person in Context) ontology integration. - - ## Your Task - Extract structured person observation data from the provided source text. - The source may be a modern digital profile, historical index, archival - description, or any other document containing person references. - - ## Core PiCo Pattern - Every person mention creates a PersonObservation that is: - - SOURCE-BOUND: Exact transcription from source, no normalization - - PROVENANCE-TRACKED: Linked to source document and extraction timestamp - - RECONSTRUCTION-READY: Can be linked to formal Person entity later - - ## Person Name Vocabulary (PNV) - Parse names into components (use null for missing parts): - - literalName: Full name exactly as written in source - - givenName: First/given name - - patronym: Patronymic (Janszoon, -dochter, bin, ibn, mac) - - surnamePrefix: Tussenvoegsel/particle (van, de, von, di, du) - - baseSurname: Core surname without prefix - - honorificPrefix: Title before name (Dr., Prof., Heer, Meester) - - honorificSuffix: Credentials after name (PhD, Jr., III) - - initials: Initials with periods (e.g., "P.R.", "C.Joh.") - - ## Language-Specific Name Rules - - ### Dutch - - Tussenvoegsel lowercase after given name: "Jan van Gogh" - - Capitalized when standalone: "Van Gogh painted..." 
- - Common: van, de, van de, van den, van der, 't, 's, op de - - ### Historical/Latin - - Patronymics: -zoon/-zn, -dochter/-dr, -s (Janszoon, Pietersdochter) - - Latinized forms: -us, -ius endings (Erasmus Roterodamus) - - Occupational surnames may be literal (de bakker = the baker) - - ## Role Extraction - Extract roles/occupations with temporal bounds when available: - - Role title exactly as stated - - Associated organization (link to GRP hypernym if institution) - - Start/end dates or period - - Heritage relevance code if applicable (GLAMORCUBESFIXPHDNT) - - Role in source context (from picot_roles thesaurus): - * child, parent, spouse, witness, declarant, bride, groom, godparent, etc. - - ## Biographical Properties - Extract when present in source (use null if not stated): - - birth_date / death_date: ISO format (YYYY, YYYY-MM, or YYYY-MM-DD) - - birth_place / death_place: Place name as written - - gender: "Male" or "Female" (only if explicitly stated or inferable) - - age: Age as stated (e.g., "30", "4 months", "about 25") - - religion: Religious affiliation if mentioned - - deceased: true only if death indicated but date unknown - - address: Physical address as recorded in source - - floruit: Active period if birth/death unknown - - ## Family Relationship Extraction - - CRITICAL: For PersonObservations, family relationships MUST refer to OTHER - persons mentioned in the SAME source document. Cross-source relationships - belong to PersonReconstructions. 
- - ### Core Family Relationships - - parent: A parent of the person (use sdo:parent) - - children: Children of the person (use sdo:children) - - spouse: Current spouse (use sdo:spouse) - - sibling: Brother or sister (use sdo:sibling) - - ### Extended Family - - grandparent / grandchild - - uncle_aunt / nephew_niece - - cousin (symmetric) - - ### Step/Half Relations - - stepparent / stepchild - - stepsibling - - half_sibling (one shared parent) - - ### Ritual/Legal Kinship (common in historical records) - - godparent / godchild: Baptismal sponsors - - foster_parent / foster_child - - legitimized_child: Child recognized through marriage/legal act - - ### In-Law Relations - - parent_in_law / child_in_law - - sibling_in_law - - ### Former Partners - - widow_of: Surviving spouse of deceased (subject is the survivor) - - previous_partner: Former spouse/partner - - ### Historical Source Patterns - Common relationship indicators in historical documents: - - "huijsvrou van" / "wife of" → spouse - - "zoon van" / "son of" → parent (person is child) - - "weduwe van" / "widow of" → widow_of - - "met attestatie van" → from location indicator - - "getuige" / "witness" → role in event, not kinship - - "peter" / "meter" / "godfather" / "godmother" → godparent - - ## Source Types (for source_type field) - Use appropriate category: - - modern_digital: LinkedIn, staff directories, ORCID - - historical_indices: Notarial protocols, guild lists - - civil_registration: Birth/marriage/death certificates - - church_records: Baptism, marriage, burial registers - - archival_descriptions: Finding aids, inventories - - biographical_dictionaries: DNB, AKL, reference works - - census: Population census records - - ## Output Format - Return ONLY valid JSON (no markdown, no explanation): - - { - "pico_observation": { - "observation_id": "", - "observed_at": "", - "source_type": "", - "source_reference": "" - }, - "persons": [ - { - "person_index": 0, - "pnv_name": { - "literalName": "Name as 
written", - "givenName": null, - "patronym": null, - "surnamePrefix": null, - "baseSurname": null, - "honorificPrefix": null, - "honorificSuffix": null, - "initials": null - }, - "roles": [ - { - "role_title": "Role as stated", - "role_in_source": "child|declarant|witness|bride|groom|null", - "organization": "Org name if mentioned", - "period": "Temporal info if available", - "heritage_relevant": false, - "heritage_type": null - } - ], - "biographical": { - "birth_date": null, - "death_date": null, - "birth_place": null, - "death_place": null, - "gender": null, - "age": null, - "religion": null, - "deceased": null, - "address": null, - "floruit": null - }, - "family_relationships": { - "parent": [], - "children": [], - "spouse": [], - "sibling": [], - "grandparent": [], - "grandchild": [], - "uncle_aunt": [], - "nephew_niece": [], - "cousin": [], - "stepparent": [], - "stepchild": [], - "stepsibling": [], - "half_sibling": [], - "foster_parent": [], - "foster_child": [], - "godparent": [], - "godchild": [], - "parent_in_law": [], - "child_in_law": [], - "sibling_in_law": [], - "previous_partner": [], - "widow_of": null - }, - "context": "Surrounding text for disambiguation" - } - ], - "organizations_mentioned": [ - { - "name": "Organization name", - "type": "Heritage type code or null", - "role_in_source": "employer|creator|publisher|etc" - } - ], - "temporal_references": [ - { - "expression": "Date/period as written", - "normalized": "ISO date if parseable", - "type": "DATE|DURATION|SET" - } - ], - "locations_mentioned": [ - { - "name": "Place name as written", - "type": "city|region|country|address" - } - ] - } - - ## Relationship Reference Format - Family relationship arrays contain references to other persons in same source: - - Use person_index (integer) to reference persons array position - - Include target_name for readability - - Example for a marriage record: - ```json - { - "person_index": 0, - "pnv_name": {"literalName": "Jan Pietersz"}, - 
"family_relationships": { - "spouse": [{"person_index": 1, "target_name": "Maria Jansdr"}], - "parent": [{"person_index": 2, "target_name": "Pieter Jansz"}] - } - } - ``` - - ## Critical Rules - 1. ONLY extract data that EXISTS in the source. NEVER fabricate. - 2. Use null for missing fields, [] for empty arrays. - 3. Preserve original spelling/language from source. - 4. heritage_type must be single-letter GLAMORCUBESFIXPHDNT code. - 5. For historical sources, preserve archaic spellings exactly. - 6. Extract ALL persons mentioned, not just the primary subject. - 7. Family relationships MUST reference persons in SAME source only. - 8. Use person_index for relationship references (0-based array index). - 9. Gender: only "Male"/"Female"/null - never infer without evidence. - 10. Age: preserve as stated, include qualifier ("about 25", "4 months"). - 11. For role_in_source, use picot_roles terms when applicable. - -# ============================================================================= -# PERSON RECONSTRUCTION PATTERN -# ============================================================================= -# -# PersonReconstruction is a reconstructed person entity derived from one or -# more PersonObservations. It represents the scholarly consensus about a -# historical person based on available evidence. -# ============================================================================= - -person_reconstruction_pattern: - description: | - A PersonReconstruction is created by linking one or more PersonObservations - to form a unified person entity. This is the scholarly interpretation layer - that connects source-bound observations to a conceptual person. 
- - Key distinction: - - PersonObservation: What is OBSERVED in a specific source (exact transcription) - - PersonReconstruction: What is INFERRED about the person (normalized, linked) - - A single PersonReconstruction may derive from observations across: - - Multiple sources (birth record + marriage record + death record) - - Different time periods (mentions across decades) - - Various name forms ("Jan Jansz" + "Johannes Jansen" + "J. Jansen") - - class: "pico:PersonReconstruction" - class_uri: "https://personsincontext.org/model#PersonReconstruction" - superclass: "pico:Person" - - required_properties: - - property: "prov:wasDerivedFrom" - description: "Links to source PersonObservation(s)" - range: "pico:PersonObservation" - cardinality: "1..*" - note: "Every reconstruction MUST link to at least one observation" - - - property: "prov:wasGeneratedBy" - description: "Links to the reconstruction Activity" - range: "prov:Activity" - cardinality: "1" - note: "Documents how/when/by whom reconstruction was created" - - optional_properties: - - property: "prov:wasRevisionOf" - description: "Links to previous version of this reconstruction" - range: "pico:PersonReconstruction" - cardinality: "0..1" - note: "For tracking updates to reconstructions over time" - - - property: "sdo:name" - description: "Normalized/preferred name form" - range: "xsd:string" - note: "The canonical name for this person" - - - property: "sdo:additionalName" - description: "Structured name following PNV" - range: "pnv:PersonName" - note: "Full name breakdown using Person Name Vocabulary" - - - property: "sdo:givenName" - description: "Given/first name" - range: "xsd:string" - - - property: "sdo:familyName" - description: "Family/surname" - range: "xsd:string" - - - property: "sdo:gender" - description: "Gender of the person" - range: "sdo:GenderType" - values: ["sdo:Male", "sdo:Female"] - - - property: "sdo:birthDate" - description: "Birth date (ISO 8601)" - range: "xsd:date" - note: "May be 
incomplete: YYYY, YYYY-MM, or YYYY-MM-DD" - - - property: "sdo:birthPlace" - description: "Place of birth" - range: "xsd:string or xsd:anyURI" - note: "Prefer linking to GeoNames or Wikidata" - - - property: "sdo:deathDate" - description: "Death date (ISO 8601)" - range: "xsd:date" - - - property: "sdo:deathPlace" - description: "Place of death" - range: "xsd:string or xsd:anyURI" - - example: - description: "PersonReconstruction derived from multiple observations" - turtle: | - cbg:person_reconstruction_anna_koppen - a pico:PersonReconstruction ; - sdo:name "Anna Maria Koppen" ; - sdo:familyName "Koppen" ; - sdo:givenName "Anna Maria" ; - sdo:gender sdo:Female ; - sdo:birthPlace "Haarlem" ; - sdo:birthDate "1860-03-31"^^xsd:date ; - sdo:deathPlace "Detroit, USA" ; - sdo:deathDate "1926"^^xsd:gYear ; - prov:wasDerivedFrom nha:marriage_1885_po_1 , - cbg:emigration_1887_po_1 , - us:death_1926_po_1 ; - prov:wasGeneratedBy cbg:reconstruction_activity_01 . - -# ============================================================================= -# SOURCE AND SCAN CLASSES -# ============================================================================= -# -# Sources (sdo:ArchiveComponent) and Scans (sdo:ImageObject) document where -# PersonObservations were extracted from. Essential for provenance. -# ============================================================================= - -source_classes: - - archive_component: - description: | - A Source document from which PersonObservations are extracted. - PiCo does not aim to fully describe archival sources (use RiC-O or DC for that), - but requires minimal identification for provenance tracking. 
- - class: "sdo:ArchiveComponent" - class_uri: "https://schema.org/ArchiveComponent" - superclass: "sdo:CreativeWork" - - properties: - - property: "sdo:name" - description: "Identifying name for the source" - range: "xsd:string" - cardinality: "1" - note: "Combine title, date, archive location for identification" - example: "BS Marriage Haarlem, November 11, 1885, certificate number 321" - - - property: "sdo:additionalType" - description: "Type of source document" - range: "picot_sourcetypes:Concept" - note: "Use PiCo SourceType thesaurus" - - - property: "sdo:dateCreated" - description: "Date the source was created" - range: "xsd:date" - - - property: "sdo:holdingArchive" - description: "Institution holding the source" - range: "xsd:anyURI" - note: "Link to heritage custodian (GHCID or Wikidata)" - - - property: "sdo:url" - description: "Permalink to the source" - range: "sdo:URL" - note: "Preferably a persistent identifier" - - - property: "sdo:contentLocation" - description: "Geographic coverage of the source" - range: "xsd:string or xsd:anyURI" - - - property: "sdo:associatedMedia" - description: "Link to scan(s) of the source" - range: "sdo:ImageObject" - cardinality: "0..*" - - image_object: - description: | - A Scan of a source document. Links to the digital image at the holding archive. 
- - class: "sdo:ImageObject" - class_uri: "https://schema.org/ImageObject" - superclass: "sdo:CreativeWork" - - properties: - - property: "sdo:url" - description: "URL to the full scan" - range: "sdo:URL" - note: "Preferably IIIF manifest" - - - property: "sdo:thumbnail" - description: "URL to thumbnail image" - range: "sdo:ImageObject" - - - property: "sdo:embedUrl" - description: "URL to image viewer" - range: "sdo:URL" - - - property: "sdo:position" - description: "Position in sequence of scans" - range: "xsd:int" - note: "For multi-page sources" - -# ============================================================================= -# BIOGRAPHICAL PROPERTIES -# ============================================================================= -# -# Properties for capturing biographical details about persons in observations. -# These appear in the source and are transcribed to the observation. -# ============================================================================= - -biographical_properties: - description: | - Biographical properties capture personal details as they appear in sources. - These are used for both PersonObservation (source-bound) and - PersonReconstruction (normalized). - - age: - property: "pico:hasAge" - property_uri: "https://personsincontext.org/model#hasAge" - description: "Age of person as stated in source" - range: "xsd:string" - domain: "pico:PersonObservation" - note: | - Used when birth date unknown but age is recorded. - Age assumed in years unless specified ("4" = 4 years, "4 months" = 4 months). - Numerical preferred over text ("4" not "four"). 
- examples: - - "30" - - "4 months" - - "about 25" - - religion: - property: "pico:hasReligion" - property_uri: "https://personsincontext.org/model#hasReligion" - description: "Religious affiliation as stated in source" - range: "xsd:string or xsd:anyURI" - domain: "pico:Person" - note: "Can link to SKOS thesaurus for religions" - examples: - - "Catholic" - - "Reformed" - - "Jewish" - - deceased: - property: "pico:deceased" - property_uri: "https://personsincontext.org/model#deceased" - description: "Indication that person is deceased (when death date unknown)" - range: "xsd:boolean" - domain: "pico:PersonObservation" - note: | - Only used when deathDate is unknown but death is indicated. - A person without deathDate and without deceased:true is assumed alive. - Important for privacy considerations in publishing person records. - - gender: - property: "sdo:gender" - property_uri: "https://schema.org/gender" - description: "Gender of the person" - range: "sdo:GenderType" - domain: "pico:Person" - values: - - uri: "sdo:Male" - label: "Male" - - uri: "sdo:Female" - label: "Female" - - address: - property: "sdo:address" - property_uri: "https://schema.org/address" - description: "Physical address as mentioned in source" - range: "xsd:string" - domain: "pico:PersonObservation" - note: "Address exactly as recorded in source" - - initials: - property: "pnv:initials" - property_uri: "https://w3id.org/pnv#initials" - description: "Initials of given name(s)" - range: "xsd:string" - domain: "pnv:PersonName" - note: "Each initial followed by period (e.g., 'P.R.', 'H.A.F.M.O.')" - examples: - - "P.R." - - "C.Joh." - - "H.A.F.M.O." - -# ============================================================================= -# FAMILY RELATIONSHIP PROPERTIES -# ============================================================================= -# -# PiCo defines extensive family relationship properties for genealogical data. 
-# These enable modeling complex family structures from historical records. -# ============================================================================= - -family_relationships: - description: | - Family relationship properties link persons within and across sources. - - Rules: - - For PersonObservations: relationships refer to OTHER observations on SAME source - - For PersonReconstructions: relationships refer to other reconstructions - - Property characteristics: - - Symmetric: If A hasRelation B, then B hasRelation A (spouses, siblings, cousins) - - Transitive: hasAncestor/hasDescendant chain through generations - - Inverse pairs: parent/children, grandparent/grandchild, etc. - - # --------------------------------------------------------------------------- - # Core Family (Schema.org) - # --------------------------------------------------------------------------- - - core_relationships: - - property: "sdo:parent" - property_uri: "https://schema.org/parent" - description: "A parent of the person" - inverse: "sdo:children" - subPropertyOf: ["sdo:relatedTo", "pico:hasAncestor"] - note: "Biological or legal parent" - - - property: "sdo:children" - property_uri: "https://schema.org/children" - description: "A child of the person" - inverse: "sdo:parent" - subPropertyOf: ["sdo:relatedTo", "pico:hasDescendant"] - - - property: "sdo:spouse" - property_uri: "https://schema.org/spouse" - description: "The person's spouse" - symmetric: true - subPropertyOf: "sdo:relatedTo" - - - property: "sdo:sibling" - property_uri: "https://schema.org/sibling" - description: "A brother or sister" - symmetric: true - subPropertyOf: "sdo:relatedTo" - - # --------------------------------------------------------------------------- - # Transitive Ancestry (PiCo) - # --------------------------------------------------------------------------- - - ancestry_relationships: - - property: "pico:hasAncestor" - property_uri: "https://personsincontext.org/model#hasAncestor" - description: "Any 
ancestor (parent, grandparent, etc.)" - type: "owl:TransitiveProperty" - inverse: "pico:hasDescendant" - note: "Not used directly; parent→parent chains automatically create ancestors" - - - property: "pico:hasDescendant" - property_uri: "https://personsincontext.org/model#hasDescendant" - description: "Any descendant (child, grandchild, etc.)" - type: "owl:TransitiveProperty" - inverse: "pico:hasAncestor" - - # --------------------------------------------------------------------------- - # Grandparents/Grandchildren - # --------------------------------------------------------------------------- - - grandparent_relationships: - - property: "pico:hasGrandparent" - property_uri: "https://personsincontext.org/model#hasGrandparent" - inverse: "pico:hasGrandchild" - - - property: "pico:hasGrandchild" - property_uri: "https://personsincontext.org/model#hasGrandchild" - inverse: "pico:hasGrandparent" - - - property: "pico:hasGreat-grandparent" - property_uri: "https://personsincontext.org/model#hasGreat-grandparent" - inverse: "pico:hasGreat-grandchild" - - - property: "pico:hasGreat-grandchild" - property_uri: "https://personsincontext.org/model#hasGreat-grandchild" - inverse: "pico:hasGreat-grandparent" - - # --------------------------------------------------------------------------- - # Aunts/Uncles and Nieces/Nephews - # --------------------------------------------------------------------------- - - extended_family: - - property: "pico:hasUncle_Aunt" - property_uri: "https://personsincontext.org/model#hasUncle_Aunt" - description: "An uncle or aunt (sibling of parent)" - inverse: "pico:hasNephew_Niece" - - - property: "pico:hasNephew_Niece" - property_uri: "https://personsincontext.org/model#hasNephew_Niece" - description: "A nephew or niece (child of sibling)" - inverse: "pico:hasUncle_Aunt" - - - property: "pico:hasCousin" - property_uri: "https://personsincontext.org/model#hasCousin" - description: "A cousin (child of parent's sibling)" - symmetric: true - - # 
--------------------------------------------------------------------------- - # Step-family - # --------------------------------------------------------------------------- - - step_relationships: - - property: "pico:hasStepparent" - property_uri: "https://personsincontext.org/model#hasStepparent" - description: "A stepparent (spouse of biological parent)" - inverse: "pico:hasStepchild" - - - property: "pico:hasStepchild" - property_uri: "https://personsincontext.org/model#hasStepchild" - inverse: "pico:hasStepparent" - - - property: "pico:hasStepsibling" - property_uri: "https://personsincontext.org/model#hasStepsibling" - description: "A stepbrother or stepsister" - symmetric: true - - - property: "pico:hasHalf-sibling" - property_uri: "https://personsincontext.org/model#hasHalf-sibling" - description: "A half-brother or half-sister (one shared parent)" - symmetric: true - - # --------------------------------------------------------------------------- - # Foster/Godparent - # --------------------------------------------------------------------------- - - non_biological_relationships: - - property: "pico:hasFosterParent" - property_uri: "https://personsincontext.org/model#hasFosterParent" - inverse: "pico:hasFosterChild" - - - property: "pico:hasFosterChild" - property_uri: "https://personsincontext.org/model#hasFosterChild" - inverse: "pico:hasFosterParent" - - - property: "pico:hasGodparent" - property_uri: "https://personsincontext.org/model#hasGodparent" - description: "A godparent (witness at baptism)" - inverse: "pico:hasGodchild" - - - property: "pico:hasGodchild" - property_uri: "https://personsincontext.org/model#hasGodchild" - inverse: "pico:hasGodparent" - - - property: "pico:hasLegitimizedChild" - property_uri: "https://personsincontext.org/model#hasLegitimizedChild" - description: "A child legitimized by marriage or legal recognition" - inverse: "pico:isLegitimitezedChildOf" - - - property: "pico:isLegitimitezedChildOf" - property_uri: 
"https://personsincontext.org/model#isLegitimitezedChildOf" - inverse: "pico:hasLegitimizedChild" - - # --------------------------------------------------------------------------- - # In-Laws - # --------------------------------------------------------------------------- - - in_law_relationships: - - property: "pico:hasParent-in-law" - property_uri: "https://personsincontext.org/model#hasParent-in-law" - inverse: "pico:hasChild-in-law" - - - property: "pico:hasChild-in-law" - property_uri: "https://personsincontext.org/model#hasChild-in-law" - inverse: "pico:hasParent-in-law" - - - property: "pico:hasSibling-in-law" - property_uri: "https://personsincontext.org/model#hasSibling-in-law" - description: "Brother/sister-in-law" - symmetric: true - - - property: "pico:hasGrandparent-in-law" - property_uri: "https://personsincontext.org/model#hasGrandparent-in-law" - inverse: "pico:hasGrandchild-in-law" - - - property: "pico:hasGrandchild-in-law" - property_uri: "https://personsincontext.org/model#hasGrandchild-in-law" - inverse: "pico:hasGrandparent-in-law" - - - property: "pico:hasUncle_Aunt-in-law" - property_uri: "https://personsincontext.org/model#hasUncle_Aunt-in-law" - inverse: "pico:hasNephew_Niece-in-law" - - - property: "pico:hasNephew_Niece-in-law" - property_uri: "https://personsincontext.org/model#hasNephew_Niece-in-law" - inverse: "pico:hasUncle_Aunt-in-law" - - - property: "pico:hasCousin-in-law" - property_uri: "https://personsincontext.org/model#hasCousin-in-law" - symmetric: true - - - property: "pico:hasStepparent-in-law" - property_uri: "https://personsincontext.org/model#hasStepparent-in-law" - inverse: "pico:hasStepchild-in-law" - - - property: "pico:hasStepchild-in-law" - property_uri: "https://personsincontext.org/model#hasStepchild-in-law" - inverse: "pico:hasStepparent-in-law" - - # --------------------------------------------------------------------------- - # Former Partners - # 
--------------------------------------------------------------------------- - - former_partner_relationships: - - property: "pico:isWidOf" - property_uri: "https://personsincontext.org/model#isWidOf" - description: "Is widow/widower of deceased spouse" - note: "The subject is the surviving partner" - - - property: "pico:hasPreviousPartner" - property_uri: "https://personsincontext.org/model#hasPreviousPartner" - description: "A former spouse or partner" - symmetric: true - -# ============================================================================= -# PROVENANCE MODEL (PROV-O INTEGRATION) -# ============================================================================= -# -# Enhanced provenance model for tracking observation extraction and -# reconstruction creation activities. -# ============================================================================= - -enhanced_provenance_model: - description: | - PiCo uses W3C PROV-O for provenance tracking at two levels: - - 1. OBSERVATION LEVEL: Where did this observation come from? - - prov:hadPrimarySource → Source document - - prov:wasGeneratedBy → Extraction activity (optional) - - 2. RECONSTRUCTION LEVEL: How was this person entity created? 
- - prov:wasDerivedFrom → Source observation(s) - - prov:wasGeneratedBy → Reconstruction activity - - prov:wasRevisionOf → Previous reconstruction version - - activity_class: - class: "prov:Activity" - class_uri: "http://www.w3.org/ns/prov#Activity" - description: "The activity that generated a PersonReconstruction" - - properties: - - property: "prov:wasAssociatedWith" - description: "Agent responsible for the activity" - range: "prov:Agent" - - - property: "prov:startedAtTime" - description: "When the activity started" - range: "xsd:dateTime" - - - property: "prov:endedAtTime" - description: "When the activity completed" - range: "xsd:dateTime" - - - property: "prov:used" - description: "Resources/tools used in the activity" - range: "prov:Entity" - note: "E.g., ML model, matching algorithm, rule set" - - types: - human_reconstruction: - description: "Manual reconstruction by researcher" - note: "Provide: time, place, knowledge sources, researcher name" - - algorithmic_reconstruction: - description: "Automated reconstruction by software" - note: "Provide: algorithm name, version, configuration, parameters" - - agent_class: - class: "prov:Agent" - class_uri: "http://www.w3.org/ns/prov#Agent" - description: "Person or organization responsible for reconstruction" - - properties: - - property: "sdo:name" - description: "Name of the agent" - range: "xsd:string" - - - property: "sdo:url" - description: "URL identifying the agent" - range: "sdo:URL" - - examples: - - name: "CBG|Center for Family History" - url: "https://cbg.nl" - type: "organization" - - - name: "GLM-4.6 Person Extractor v1.0" - url: null - type: "software" - - derivation_properties: - - property: "prov:wasDerivedFrom" - property_uri: "http://www.w3.org/ns/prov#wasDerivedFrom" - description: "Links PersonReconstruction to source PersonObservation(s)" - domain: "pico:PersonReconstruction" - range: "pico:PersonObservation" - cardinality: "1..*" - note: "REQUIRED for all PersonReconstructions" - - - 
property: "prov:wasRevisionOf" - property_uri: "http://www.w3.org/ns/prov#wasRevisionOf" - description: "Links to previous version of reconstruction" - domain: "pico:PersonReconstruction" - range: "pico:PersonReconstruction" - cardinality: "0..1" - note: "For tracking reconstruction updates over time" - -# ============================================================================= -# PICO VOCABULARIES/THESAURI -# ============================================================================= -# -# PiCo provides controlled vocabularies for roles, source types, and events. -# ============================================================================= - -pico_vocabularies: - description: | - PiCo defines three SKOS concept schemes for controlled terminology: - - - Roles: The role a person plays in a source (child, declarant, witness, etc.) - - SourceTypes: Types of historical sources (birth certificate, census, etc.) - - EventTypes: Types of life events (birth, marriage, death, etc.) - - roles_thesaurus: - id: "picot_roles" - uri: "https://terms.personsincontext.org/roles/" - type: "skos:ConceptScheme" - label: "Persons in Context role thesaurus" - description: "Roles that persons can have in historical sources" - usage: | - Use pico:hasRole property with a term from this thesaurus. 
- Example: picot_roles:575 (child), picot_roles:489 (declarant) - example_concepts: - - id: "575" - label: "child" - description: "Person appearing as child in a record" - - - id: "489" - label: "declarant" - description: "Person declaring/reporting an event" - - - id: "witness" - label: "witness" - description: "Person witnessing an event or signing a document" - - - id: "bride" - label: "bride" - description: "Female partner in a marriage" - - - id: "groom" - label: "groom" - description: "Male partner in a marriage" - - sourcetypes_thesaurus: - id: "picot_sourcetypes" - uri: "https://terms.personsincontext.org/sourcetypes/" - type: "skos:ConceptScheme" - label: "Persons in Context sourceType thesaurus" - description: "Types of historical sources containing person observations" - usage: | - Use sdo:additionalType property on sdo:ArchiveComponent. - Example: picot_sourcetypes:551 (civil registry: birth) - example_concepts: - - id: "551" - label: "civil registry: birth" - description: "Birth certificate from civil registration" - - - id: "marriage" - label: "civil registry: marriage" - description: "Marriage certificate" - - - id: "death" - label: "civil registry: death" - description: "Death certificate" - - - id: "census" - label: "census" - description: "Population census record" - - - id: "church_baptism" - label: "church record: baptism" - description: "Baptismal record from church register" - - - id: "notarial" - label: "notarial record" - description: "Notarial act or protocol" - - eventtypes_thesaurus: - id: "picot_eventtypes" - uri: "https://terms.personsincontext.org/eventtypes/" - type: "skos:ConceptScheme" - label: "Persons in Context eventType thesaurus" - description: "Types of life events documented in sources" - example_concepts: - - id: "birth" - label: "birth" - - - id: "baptism" - label: "baptism" - - - id: "marriage" - label: "marriage" - - - id: "death" - label: "death" - - - id: "burial" - label: "burial" - - - id: "emigration" - label: 
"emigration" - - - id: "immigration" - label: "immigration" - -# ============================================================================= -# GLM ANNOTATOR OUTPUT SCHEMA UPDATE -# ============================================================================= -# -# Extended output schema for GLM-4.6 annotator to include family relationships -# and biographical properties. -# ============================================================================= - -glm_extended_output_schema: - description: | - Extended JSON output schema that includes all PiCo properties. - This supplements the core system_prompt output format. - - persons_extended: - description: "Extended person object with all PiCo properties" - schema: - pnv_name: - literalName: "string" - givenName: "string|null" - patronym: "string|null" - surnamePrefix: "string|null" - baseSurname: "string|null" - honorificPrefix: "string|null" - honorificSuffix: "string|null" - initials: "string|null" - - biographical: - birth_date: "ISO date|null" - death_date: "ISO date|null" - birth_place: "string|null" - death_place: "string|null" - gender: "Male|Female|null" - age: "string|null" - religion: "string|null" - deceased: "boolean|null" - address: "string|null" - floruit: "string|null" - - roles: "array of role objects" - - family_relationships: - parent: "array of person references" - children: "array of person references" - spouse: "array of person references" - sibling: "array of person references" - grandparent: "array of person references" - grandchild: "array of person references" - uncle_aunt: "array of person references" - nephew_niece: "array of person references" - cousin: "array of person references" - stepparent: "array of person references" - stepchild: "array of person references" - stepsibling: "array of person references" - half_sibling: "array of person references" - foster_parent: "array of person references" - foster_child: "array of person references" - godparent: "array of person references" - 
godchild: "array of person references" - parent_in_law: "array of person references" - child_in_law: "array of person references" - sibling_in_law: "array of person references" - previous_partner: "array of person references" - widow_of: "person reference|null" - - context: "string|null" - -# ============================================================================= -# CH-ANNOTATOR HYPERNYM INTEGRATION UPDATE -# ============================================================================= -# -# Updated hypernym mappings to include reconstruction pattern. -# ============================================================================= - -extended_hypernym_mapping: - description: | - Extended mappings between PiCo classes and CH-Annotator hypernyms, - including the reconstruction pattern. - - mappings: - # Observation level - - pico_class: "pico:PersonObservation" - ch_hypernym: "AGT.PER" - ch_code: "AGT.PER" - note: "Source-bound person mention" - - - pico_class: "pico:PersonObservation" - ch_hypernym: "AGT.STF" - ch_code: "AGT.STF" - condition: "When person has organizational role" - note: "Staff member observation" - - # Reconstruction level - - pico_class: "pico:PersonReconstruction" - ch_hypernym: "AGT.PER" - ch_code: "AGT.PER" - note: "Reconstructed person entity" - linking: true - linking_sources: ["Wikidata", "VIAF", "ISNI"] - - # Name components - - pico_class: "pnv:PersonName" - ch_hypernym: "APP.NAM" - ch_code: "APP.NAM" - note: "Structured name" - - # Roles - - pico_class: "pico:hasRole" - ch_hypernym: "ROL" - ch_code: "ROL" - note: "Role in source" - - # Family relationships - - pico_class: "sdo:parent" - ch_hypernym: "AGT.PER" - relationship_type: "family" - note: "Parent relationship" - - - pico_class: "sdo:spouse" - ch_hypernym: "AGT.PER" - relationship_type: "family" - note: "Spouse relationship" - - - pico_class: "pico:hasGodparent" - ch_hypernym: "AGT.PER" - relationship_type: "ritual_kinship" - note: "Godparent relationship (common in 
historical records)" - - # Sources - - pico_class: "sdo:ArchiveComponent" - ch_hypernym: "WRK.DOC" - ch_code: "WRK.DOC" - note: "Source document" - - # Provenance - - pico_class: "prov:Activity" - ch_hypernym: null - note: "Not directly annotated; tracked in provenance metadata" - - - pico_class: "prov:Agent" - ch_hypernym: "AGT" - ch_code: "AGT" - note: "Extraction/reconstruction agent" - -# ============================================================================= -# HISTORICAL SOURCE EXTRACTION EXAMPLES -# ============================================================================= -# -# Comprehensive examples showing extraction from different historical source types. -# These demonstrate the full PiCo model including family relationships. -# ============================================================================= - -historical_extraction_examples: - description: | - These examples demonstrate extraction from common historical source types, - showing how to capture family relationships, biographical data, and roles - according to the PiCo model. - - # --------------------------------------------------------------------------- - # Example 1: Dutch Marriage Certificate (Burgerlijke Stand) - # --------------------------------------------------------------------------- - - marriage_certificate_example: - source_type: "civil_registration" - source_text: | - Heden den elfden November achttien honderd vijf en tachtig, zijn voor ons - Ambtenaar van den Burgerlijken Stand der gemeente Haarlem, verschenen: - Cornelis Johannes Koppen, oud dertig jaren, schilder, geboren te Haarlem, - wonende alhier, meerderjarige zoon van wijlen Pieter Koppen en van - Anna Maria Brouwer, zonder beroep, wonende alhier; - en Anna Maria Visser, oud zeven en twintig jaren, zonder beroep, geboren - te Amsterdam, wonende alhier, meerderjarige dochter van Jan Visser, - koopman, en van wijlen Cornelia de Vries. 
- - Als getuigen waren tegenwoordig: Hendrik Koppen, oud vijf en dertig jaren, - schilder, broeder van den bruidegom; en Willem Visser, oud twee en dertig - jaren, timmerman, broeder van de bruid. - - expected_output: - pico_observation: - observation_id: "bs_haarlem_1885_marriage_321" - observed_at: "2025-12-12T10:00:00Z" - source_type: "civil_registration" - source_reference: "BS Marriage Haarlem, November 11, 1885, certificate 321" - - persons: - - person_index: 0 - pnv_name: - literalName: "Cornelis Johannes Koppen" - givenName: "Cornelis Johannes" - baseSurname: "Koppen" - roles: - - role_title: "schilder" - role_in_source: "groom" - biographical: - age: "30" - birth_place: "Haarlem" - address: "Haarlem" - family_relationships: - parent: - - person_index: 2 - target_name: "Pieter Koppen" - - person_index: 3 - target_name: "Anna Maria Brouwer" - spouse: - - person_index: 1 - target_name: "Anna Maria Visser" - sibling: - - person_index: 6 - target_name: "Hendrik Koppen" - - - person_index: 1 - pnv_name: - literalName: "Anna Maria Visser" - givenName: "Anna Maria" - baseSurname: "Visser" - roles: - - role_in_source: "bride" - biographical: - age: "27" - birth_place: "Amsterdam" - address: "Haarlem" - family_relationships: - parent: - - person_index: 4 - target_name: "Jan Visser" - - person_index: 5 - target_name: "Cornelia de Vries" - spouse: - - person_index: 0 - target_name: "Cornelis Johannes Koppen" - sibling: - - person_index: 7 - target_name: "Willem Visser" - - - person_index: 2 - pnv_name: - literalName: "Pieter Koppen" - givenName: "Pieter" - baseSurname: "Koppen" - biographical: - deceased: true - family_relationships: - children: - - person_index: 0 - target_name: "Cornelis Johannes Koppen" - - person_index: 6 - target_name: "Hendrik Koppen" - spouse: - - person_index: 3 - target_name: "Anna Maria Brouwer" - - - person_index: 3 - pnv_name: - literalName: "Anna Maria Brouwer" - givenName: "Anna Maria" - baseSurname: "Brouwer" - roles: - - role_title: 
"zonder beroep" - biographical: - address: "Haarlem" - family_relationships: - children: - - person_index: 0 - target_name: "Cornelis Johannes Koppen" - - person_index: 6 - target_name: "Hendrik Koppen" - widow_of: - person_index: 2 - target_name: "Pieter Koppen" - - - person_index: 4 - pnv_name: - literalName: "Jan Visser" - givenName: "Jan" - baseSurname: "Visser" - roles: - - role_title: "koopman" - family_relationships: - children: - - person_index: 1 - target_name: "Anna Maria Visser" - - person_index: 7 - target_name: "Willem Visser" - spouse: - - person_index: 5 - target_name: "Cornelia de Vries" - - - person_index: 5 - pnv_name: - literalName: "Cornelia de Vries" - givenName: "Cornelia" - surnamePrefix: "de" - baseSurname: "Vries" - biographical: - deceased: true - family_relationships: - children: - - person_index: 1 - target_name: "Anna Maria Visser" - - person_index: 7 - target_name: "Willem Visser" - spouse: - - person_index: 4 - target_name: "Jan Visser" - - - person_index: 6 - pnv_name: - literalName: "Hendrik Koppen" - givenName: "Hendrik" - baseSurname: "Koppen" - roles: - - role_title: "schilder" - role_in_source: "witness" - biographical: - age: "35" - family_relationships: - sibling: - - person_index: 0 - target_name: "Cornelis Johannes Koppen" - parent: - - person_index: 2 - target_name: "Pieter Koppen" - - person_index: 3 - target_name: "Anna Maria Brouwer" - - - person_index: 7 - pnv_name: - literalName: "Willem Visser" - givenName: "Willem" - baseSurname: "Visser" - roles: - - role_title: "timmerman" - role_in_source: "witness" - biographical: - age: "32" - family_relationships: - sibling: - - person_index: 1 - target_name: "Anna Maria Visser" - parent: - - person_index: 4 - target_name: "Jan Visser" - - person_index: 5 - target_name: "Cornelia de Vries" - - temporal_references: - - expression: "den elfden November achttien honderd vijf en tachtig" - normalized: "1885-11-11" - type: "DATE" - - locations_mentioned: - - name: "Haarlem" - type: 
"city" - - name: "Amsterdam" - type: "city" - - # --------------------------------------------------------------------------- - # Example 2: Early Modern Notarial Protocol Index Entry - # --------------------------------------------------------------------------- - - notarial_index_example: - source_type: "historical_indices" - source_text: | - Notarial Archive Amsterdam, inv. 5075/1234 - 30 January 1680 - - Before notary Pieter van der Meer appeared: - Jacob Janszoon van der Hoeven, merchant of this city, - with his wife Maritgen Claes, for themselves and as - guardians (voogden) of the minor children of the late - Claes Jacobsz and Aeltgen Pieters, namely: - - Jan Claeszoon, aged about 16 years - - Trijntgen Claesdr, aged about 12 years - - Witnesses: Hendrick Jansz, baker, and Cornelis Pietersz, - schoolmaster, both of this city. - - expected_output: - pico_observation: - observation_id: "na_amsterdam_5075_1234" - observed_at: "2025-12-12T10:00:00Z" - source_type: "historical_indices" - source_reference: "Notarial Archive Amsterdam, inv. 
5075/1234, 30 January 1680" - - persons: - - person_index: 0 - pnv_name: - literalName: "Jacob Janszoon van der Hoeven" - givenName: "Jacob" - patronym: "Janszoon" - surnamePrefix: "van der" - baseSurname: "Hoeven" - roles: - - role_title: "merchant" - role_in_source: "declarant" - - role_title: "voogd" - role_in_source: null - biographical: - address: "Amsterdam" - family_relationships: - spouse: - - person_index: 1 - target_name: "Maritgen Claes" - - - person_index: 1 - pnv_name: - literalName: "Maritgen Claes" - givenName: "Maritgen" - patronym: "Claes" - roles: - - role_in_source: "declarant" - - role_title: "voogd" - family_relationships: - spouse: - - person_index: 0 - target_name: "Jacob Janszoon van der Hoeven" - - - person_index: 2 - pnv_name: - literalName: "Claes Jacobsz" - givenName: "Claes" - patronym: "Jacobsz" - biographical: - deceased: true - family_relationships: - spouse: - - person_index: 3 - target_name: "Aeltgen Pieters" - children: - - person_index: 4 - target_name: "Jan Claeszoon" - - person_index: 5 - target_name: "Trijntgen Claesdr" - - - person_index: 3 - pnv_name: - literalName: "Aeltgen Pieters" - givenName: "Aeltgen" - patronym: "Pieters" - biographical: - deceased: true - family_relationships: - spouse: - - person_index: 2 - target_name: "Claes Jacobsz" - children: - - person_index: 4 - target_name: "Jan Claeszoon" - - person_index: 5 - target_name: "Trijntgen Claesdr" - - - person_index: 4 - pnv_name: - literalName: "Jan Claeszoon" - givenName: "Jan" - patronym: "Claeszoon" - roles: - - role_in_source: "child" - biographical: - age: "about 16" - family_relationships: - parent: - - person_index: 2 - target_name: "Claes Jacobsz" - - person_index: 3 - target_name: "Aeltgen Pieters" - sibling: - - person_index: 5 - target_name: "Trijntgen Claesdr" - - - person_index: 5 - pnv_name: - literalName: "Trijntgen Claesdr" - givenName: "Trijntgen" - patronym: "Claesdr" - roles: - - role_in_source: "child" - biographical: - age: "about 12" - 
gender: "Female" - family_relationships: - parent: - - person_index: 2 - target_name: "Claes Jacobsz" - - person_index: 3 - target_name: "Aeltgen Pieters" - sibling: - - person_index: 4 - target_name: "Jan Claeszoon" - - - person_index: 6 - pnv_name: - literalName: "Pieter van der Meer" - givenName: "Pieter" - surnamePrefix: "van der" - baseSurname: "Meer" - roles: - - role_title: "notary" - - - person_index: 7 - pnv_name: - literalName: "Hendrick Jansz" - givenName: "Hendrick" - patronym: "Jansz" - roles: - - role_title: "baker" - role_in_source: "witness" - biographical: - address: "Amsterdam" - - - person_index: 8 - pnv_name: - literalName: "Cornelis Pietersz" - givenName: "Cornelis" - patronym: "Pietersz" - roles: - - role_title: "schoolmaster" - role_in_source: "witness" - biographical: - address: "Amsterdam" - - temporal_references: - - expression: "30 January 1680" - normalized: "1680-01-30" - type: "DATE" - - locations_mentioned: - - name: "Amsterdam" - type: "city" - - # --------------------------------------------------------------------------- - # Example 3: Church Baptismal Record with Godparents - # --------------------------------------------------------------------------- - - baptism_record_example: - source_type: "church_records" - source_text: | - Den 15en Meij 1702 is gedoopt - Johanna, dochter van Willem Hendriksen en Geertruijd Jans, - getuijgen waren de E. Heer Jan Willem van Beverwijck - ende Juffrou Maria van Loon, huijsvrouw van de heer - Pieter Anthonisz Verschoor. 
- - expected_output: - pico_observation: - observation_id: "dtb_amsterdam_1702_baptism_johanna" - observed_at: "2025-12-12T10:00:00Z" - source_type: "church_records" - source_reference: "DTB Amsterdam, 15 May 1702" - - persons: - - person_index: 0 - pnv_name: - literalName: "Johanna" - givenName: "Johanna" - roles: - - role_in_source: "child" - biographical: - gender: "Female" - family_relationships: - parent: - - person_index: 1 - target_name: "Willem Hendriksen" - - person_index: 2 - target_name: "Geertruijd Jans" - godparent: - - person_index: 3 - target_name: "Jan Willem van Beverwijck" - - person_index: 4 - target_name: "Maria van Loon" - - - person_index: 1 - pnv_name: - literalName: "Willem Hendriksen" - givenName: "Willem" - patronym: "Hendriksen" - biographical: - gender: "Male" - family_relationships: - children: - - person_index: 0 - target_name: "Johanna" - spouse: - - person_index: 2 - target_name: "Geertruijd Jans" - - - person_index: 2 - pnv_name: - literalName: "Geertruijd Jans" - givenName: "Geertruijd" - patronym: "Jans" - biographical: - gender: "Female" - family_relationships: - children: - - person_index: 0 - target_name: "Johanna" - spouse: - - person_index: 1 - target_name: "Willem Hendriksen" - - - person_index: 3 - pnv_name: - literalName: "Jan Willem van Beverwijck" - givenName: "Jan Willem" - surnamePrefix: "van" - baseSurname: "Beverwijck" - honorificPrefix: "de E. 
Heer" - roles: - - role_in_source: "witness" - biographical: - gender: "Male" - family_relationships: - godchild: - - person_index: 0 - target_name: "Johanna" - - - person_index: 4 - pnv_name: - literalName: "Maria van Loon" - givenName: "Maria" - surnamePrefix: "van" - baseSurname: "Loon" - honorificPrefix: "Juffrou" - roles: - - role_in_source: "witness" - biographical: - gender: "Female" - family_relationships: - godchild: - - person_index: 0 - target_name: "Johanna" - spouse: - - person_index: 5 - target_name: "Pieter Anthonisz Verschoor" - - - person_index: 5 - pnv_name: - literalName: "Pieter Anthonisz Verschoor" - givenName: "Pieter" - patronym: "Anthonisz" - baseSurname: "Verschoor" - honorificPrefix: "de heer" - biographical: - gender: "Male" - family_relationships: - spouse: - - person_index: 4 - target_name: "Maria van Loon" - - temporal_references: - - expression: "Den 15en Meij 1702" - normalized: "1702-05-15" - type: "DATE" - - # --------------------------------------------------------------------------- - # Example 4: Modern LinkedIn Staff Profile - # --------------------------------------------------------------------------- - - linkedin_profile_example: - source_type: "modern_digital" - source_text: | - Dr. Maria van den Berg - Director of Collections | Rijksmuseum - Amsterdam, Netherlands - - About: - Leading the collections management team at the Rijksmuseum since 2018. - Previously Head Curator at the Van Gogh Museum (2012-2018). - PhD in Art History, University of Amsterdam. 
- - Experience: - - Director of Collections, Rijksmuseum (2018-present) - - Head Curator, Van Gogh Museum (2012-2018) - - Assistant Curator, Stedelijk Museum (2008-2012) - - Education: - - PhD Art History, University of Amsterdam (2008) - - MA Museum Studies, University of Amsterdam (2003) - - expected_output: - pico_observation: - observation_id: "linkedin_maria_van_den_berg_2025" - observed_at: "2025-12-12T10:00:00Z" - source_type: "modern_digital" - source_reference: "https://linkedin.com/in/mariavandenberg" - - persons: - - person_index: 0 - pnv_name: - literalName: "Dr. Maria van den Berg" - givenName: "Maria" - surnamePrefix: "van den" - baseSurname: "Berg" - honorificPrefix: "Dr." - roles: - - role_title: "Director of Collections" - organization: "Rijksmuseum" - period: "2018-present" - heritage_relevant: true - heritage_type: "M" - - role_title: "Head Curator" - organization: "Van Gogh Museum" - period: "2012-2018" - heritage_relevant: true - heritage_type: "M" - - role_title: "Assistant Curator" - organization: "Stedelijk Museum" - period: "2008-2012" - heritage_relevant: true - heritage_type: "M" - biographical: - address: "Amsterdam, Netherlands" - family_relationships: {} - context: "Heritage sector professional with museum career" - - organizations_mentioned: - - name: "Rijksmuseum" - type: "M" - role_in_source: "employer" - - name: "Van Gogh Museum" - type: "M" - role_in_source: "employer" - - name: "Stedelijk Museum" - type: "M" - role_in_source: "employer" - - name: "University of Amsterdam" - type: "E" - role_in_source: "education" - - locations_mentioned: - - name: "Amsterdam" - type: "city" - - name: "Netherlands" - type: "country" - -# ============================================================================= -# END OF MODULE -# ============================================================================= diff --git a/data/entity_annotation/modules/integrations/pico.yaml.bak b/data/entity_annotation/modules/integrations/pico.yaml.bak new file 
mode 100644 index 0000000000..94a28d46df --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico.yaml.bak @@ -0,0 +1,4255 @@ +# ============================================================================= +# GLAM-NER Entity Annotation Convention v1.7.0 +# Module: integrations/pico.yaml +# ============================================================================= +# PiCo (Persons in Context Ontology) integration for person observation modeling. +# Enables tracking provenance of person mentions and linking to formal records. +# +# Key concepts: +# - PersonObservation: A textual mention of a person (source-bound) +# - PersonName (PNV): Structured name components +# - Person (CIDOC-CRM E21): Reconstructed person entity +# +# References: +# - PiCo Ontology: https://w3id.org/pico +# - Person Name Vocabulary (PNV): https://w3id.org/pnv +# - CIDOC-CRM: https://www.cidoc-crm.org/ +# ============================================================================= + +pico_integration: + + description: | + PiCo (Persons in Context Ontology) models textual observations of persons + as distinct from reconstructed person entities. This enables: + - Tracking provenance of person mentions + - Handling name variations across sources + - Linking observations to formal person records + + The observation/reconstruction pattern separates: + 1. What was OBSERVED in text (PersonObservation) - source-bound, exact + 2. What was RECONSTRUCTED as entity (E21_Person) - inferred, normalized + + This is critical for heritage data where the same person may appear with + different name forms, titles, or spellings across sources.
+ + # --------------------------------------------------------------------------- + # Core Observation Pattern + # --------------------------------------------------------------------------- + + observation_pattern: + description: "Every person mention creates a PersonObservation" + class: "picom:PersonObservation" + class_uri: "https://w3id.org/pico/PersonObservation" + + properties: + - property: "picom:hasObservedName" + description: "The name string as it appears in text" + range: "pnv:PersonName" + cardinality: "1" + note: "Exact transcription of name from source" + + - property: "picom:isObservationOf" + description: "Links to reconstructed Person entity" + range: "crm:E21_Person" + cardinality: "0..1" + note: "May be null if person not yet identified" + + - property: "prov:hadPrimarySource" + description: "The source document/webpage" + range: "prov:Entity" + cardinality: "1" + note: "Required for provenance tracking" + + - property: "picom:observedAt" + description: "When the observation was made" + range: "xsd:dateTime" + cardinality: "1" + note: "Extraction timestamp, not document date" + + - property: "picom:observedInContext" + description: "Surrounding text context" + range: "xsd:string" + cardinality: "0..1" + note: "For disambiguation when reviewing" + + - property: "picom:hasRole" + description: "Role/position observed with the person" + range: "xsd:string" + cardinality: "0..*" + note: "Links to ROLE hypernym when extracted" + + # --------------------------------------------------------------------------- + # Person Name Vocabulary (PNV) + # --------------------------------------------------------------------------- + + pnv_name_structure: + description: | + Person Name Vocabulary (PNV) provides structured name components. + This enables proper parsing of complex name structures across cultures. 
+ + class: "pnv:PersonName" + class_uri: "https://w3id.org/pnv/PersonName" + + components: + - property: "pnv:literalName" + description: "Full name as single string" + examples: + - "Dr. Maria van den Berg" + - "Rembrandt Harmenszoon van Rijn" + - "Queen Elizabeth II" + note: "Original string before parsing" + + - property: "pnv:givenName" + description: "First/given name" + examples: + - "Rembrandt" + - "Maria" + - "Jan" + - "Elizabeth" + note: "Personal name, not surname" + + - property: "pnv:patronym" + description: "Patronymic name component" + examples: + - "Harmenszoon" + - "Janszoon" + - "Pietersdochter" + note: "Common in Dutch, Scandinavian, Slavic names" + + - property: "pnv:surnamePrefix" + description: "Prefix to surname (tussenvoegsel)" + examples: + - "van" + - "de" + - "van den" + - "van der" + - "op de" + - "'t" + - "von" + - "di" + note: "Language-specific, affects sorting" + + - property: "pnv:baseSurname" + description: "Core surname without prefix" + examples: + - "Rijn" + - "Berg" + - "Velde" + - "Gogh" + note: "Primary sorting component in Dutch" + + - property: "pnv:honorificPrefix" + description: "Title or honorific before name" + examples: + - "Dr." + - "Prof." + - "Prof. dr." + - "Sir" + - "Queen" + - "Mr." + - "Drs." + - "Ir." + note: "May indicate role - link to ROL" + + - property: "pnv:honorificSuffix" + description: "Title or honorific after name" + examples: + - "PhD" + - "Jr." 
+ - "III" + - "MD" + - "RA" + - "MSc" + note: "Credentials and generational markers" + + - property: "pnv:infixTitle" + description: "Title within name structure" + examples: + - "graaf van" + - "baron de" + - "duke of" + note: "Nobility titles embedded in name" + + # --------------------------------------------------------------------------- + # Dutch Name Conventions (Project-Specific) + # --------------------------------------------------------------------------- + + dutch_name_patterns: + description: | + Special handling for Dutch names with tussenvoegsels (surname prefixes). + Dutch sorting rules differ from other languages. + + tussenvoegsel_list: + - "van" + - "van de" + - "van den" + - "van der" + - "de" + - "den" + - "het" + - "'t" + - "ter" + - "ten" + - "op de" + - "op den" + - "in 't" + - "in de" + + sorting_rule: | + In Dutch, surnames sort by baseSurname, ignoring tussenvoegsel. + "Vincent van Gogh" sorts under "G" not "V". + "Maria van den Berg" sorts under "B" not "V". + + capitalization_rule: | + Tussenvoegsel lowercase when preceded by given name: + - "Vincent van Gogh" (not "Vincent Van Gogh") + - "Van Gogh" (surname alone, capitalized) + - "de heer Van Gogh" (formal, capitalized) + + # --------------------------------------------------------------------------- + # Integration with GLAM-NER Hypernyms + # --------------------------------------------------------------------------- + + hypernym_mapping: + description: "How PiCo concepts map to GLAM-NER v1.7.0 hypernyms" + + mappings: + - pico_class: "picom:PersonObservation" + glam_hypernym: "AGT.PER" + glam_code: "AGT.PER" + note: "Person observations create AGT.PER entities" + + - pico_class: "picom:PersonObservation" + glam_hypernym: "AGT.STF" + glam_code: "AGT.STF" + condition: "When observed with organizational role" + note: "Staff members with role context" + + - pico_class: "pnv:PersonName" + glam_hypernym: "APP.NAM" + glam_code: "APP.NAM" + note: "Name strings as appellations" + + - 
pico_class: "picom:hasRole" + glam_hypernym: "ROL" + glam_code: "ROL" + note: "Extracted roles link to ROL hypernym" + + # --------------------------------------------------------------------------- + # Example Annotations + # --------------------------------------------------------------------------- + + examples: + - description: "Staff member with title and role" + text: "Dr. Maria van den Berg, Director" + + observation: + type: "picom:PersonObservation" + id: "_:obs1" + + hasObservedName: + type: "pnv:PersonName" + literalName: "Dr. Maria van den Berg" + honorificPrefix: "Dr." + givenName: "Maria" + surnamePrefix: "van den" + baseSurname: "Berg" + + hasRole: "Director" + hadPrimarySource: "https://example.org/staff-page" + observedAt: "2025-12-02T10:30:00Z" + + glam_ner_annotations: + - span: "Dr. Maria van den Berg" + type: "AGT.STF" + code: "AGT.STF" + confidence: 0.95 + + - span: "Director" + type: "ROL.TIT" + code: "ROL.TIT" + confidence: 0.98 + + - description: "Historical artist" + text: "Rembrandt van Rijn painted this in 1642" + + observation: + type: "picom:PersonObservation" + id: "_:obs2" + + hasObservedName: + type: "pnv:PersonName" + literalName: "Rembrandt van Rijn" + givenName: "Rembrandt" + surnamePrefix: "van" + baseSurname: "Rijn" + + isObservationOf: "wd:Q5598" # Wikidata Rembrandt + hadPrimarySource: "https://example.org/artwork-page" + observedAt: "2025-12-02T10:35:00Z" + + glam_ner_annotations: + - span: "Rembrandt van Rijn" + type: "AGT.PER" + code: "AGT.PER" + confidence: 0.99 + linking: + wikidata: "Q5598" + viaf: "64013650" + + - description: "Nobility title" + text: "Count Willem van Loon" + + observation: + type: "picom:PersonObservation" + id: "_:obs3" + + hasObservedName: + type: "pnv:PersonName" + literalName: "Count Willem van Loon" + honorificPrefix: "Count" + givenName: "Willem" + surnamePrefix: "van" + baseSurname: "Loon" + + hadPrimarySource: "https://example.org/archive-doc" + observedAt: "2025-12-02T10:40:00Z" + + 
glam_ner_annotations: + - span: "Count Willem van Loon" + type: "AGT.PER" + code: "AGT.PER" + confidence: 0.95 + + - span: "Count" + type: "ROL.HON" + code: "ROL.HON" + note: "Nobility title - honorific role" + + # --------------------------------------------------------------------------- + # Provenance Chain + # --------------------------------------------------------------------------- + + provenance_model: + description: | + PiCo observations maintain full provenance chain: + + Observation → Source Document → Extraction Activity → Agent + + This enables: + - Tracking where each name form was found + - Attributing extractions to human/ML agents + - Maintaining audit trail for corrections + + chain_structure: + observation: + class: "picom:PersonObservation" + properties: + - "prov:hadPrimarySource" # → Source document + - "prov:wasGeneratedBy" # → Extraction activity + + source: + class: "prov:Entity" + properties: + - "prov:wasAttributedTo" # → Publisher/author + - "dct:created" # → Document date + + activity: + class: "prov:Activity" + properties: + - "prov:wasAssociatedWith" # → Extraction agent + - "prov:used" # → ML model or rules + - "prov:startedAtTime" # → Extraction timestamp + + agent: + class: "prov:Agent" + examples: + - "Human curator" + - "spaCy NER model" + - "GLAM-NER extraction pipeline" + +# ============================================================================= +# SOURCE TYPE EXTENSIONS +# ============================================================================= +# +# PiCo PersonObservation can be extracted from many source types. +# Each source type may have specific extraction patterns, but the core +# PiCo model (observation → name → roles → provenance) remains the same. +# +# Source-specific extraction logic belongs in APPLICATION LAYER scripts, +# not in this convention. This section defines the ABSTRACT patterns. 
+# ============================================================================= + +source_type_patterns: + description: | + PersonObservation sources fall into categories with different extraction + patterns. The CH-Annotator handles all source types using the same + core PiCo model, with source-specific field mappings at extraction time. + + # --------------------------------------------------------------------------- + # Source Categories + # --------------------------------------------------------------------------- + + categories: + modern_digital: + description: "Contemporary digital sources with structured data" + examples: + - "LinkedIn profiles" + - "Institutional staff directories" + - "Academic profile pages" + - "ORCID records" + characteristics: + - "Semi-structured HTML/JSON" + - "Current/living persons" + - "Self-reported information" + - "Timestamped updates" + typical_properties: + - "sdo:name" + - "sdo:jobTitle" + - "sdo:hasOccupation" + - "sdo:alumniOf" + - "sdo:knowsAbout" + + historical_indices: + description: "Early modern and historical name indices" + examples: + - "Notarial protocol indices" + - "Church register indices" + - "Census indices" + - "Guild membership lists" + - "Property transfer records" + characteristics: + - "Abbreviated names" + - "Patronymics common" + - "Latin/vernacular mixing" + - "Occupation as identifier" + - "Relational identification ('wife of', 'son of')" + typical_properties: + - "pnv:literalName" + - "pnv:patronym" + - "picom:hasRole" + - "crm:P107_has_current_or_former_member" + - "sdo:spouse" + - "sdo:parent" + + archival_descriptions: + description: "Finding aids, inventories, and archival descriptions" + examples: + - "EAD finding aids" + - "ISAD(G) descriptions" + - "Collection inventories" + - "RiC-O records" + characteristics: + - "Hierarchical context" + - "Provenance-focused" + - "Creator/contributor roles" + - "Temporal spans" + typical_properties: + - "rico:hasCreator" + - "rico:hasOrHadHolder" + - 
"crm:P14_carried_out_by" + - "crm:P11_had_participant" + + biographical_dictionaries: + description: "Structured biographical reference works" + examples: + - "Dictionary of National Biography" + - "KNAW DWDD" + - "Allgemeines Künstlerlexikon" + - "Thieme-Becker" + characteristics: + - "Standardized entries" + - "Birth/death dates" + - "Career summaries" + - "Cross-references" + typical_properties: + - "sdo:birthDate" + - "sdo:deathDate" + - "sdo:birthPlace" + - "sdo:deathPlace" + - "crm:P98_brought_into_life" + - "crm:P100_was_death_of" + + # --------------------------------------------------------------------------- + # Universal Observation Properties (All Source Types) + # --------------------------------------------------------------------------- + + universal_properties: + description: | + These properties apply to PersonObservation regardless of source type. + They form the core of the PiCo extraction model. + + required: + - property: "picom:hasObservedName" + description: "The name string as it appears in source" + range: "pnv:PersonName" + + - property: "prov:hadPrimarySource" + description: "The source document/webpage/record" + range: "prov:Entity" + + - property: "picom:observedAt" + description: "When the observation was extracted" + range: "xsd:dateTime" + + optional: + - property: "picom:isObservationOf" + description: "Links to reconstructed Person entity (if identified)" + range: "crm:E21_Person" + + - property: "picom:hasRole" + description: "Role/position observed with the person" + range: "org:Role" + + - property: "picom:observedInContext" + description: "Surrounding text for disambiguation" + range: "xsd:string" + + - property: "picom:confidence" + description: "Confidence score for extraction" + range: "xsd:decimal" + + # --------------------------------------------------------------------------- + # Heritage Relevance Detection (Universal) + # --------------------------------------------------------------------------- + + 
heritage_relevance: + description: | + Person observations can be tagged for heritage sector relevance using + GLAMORCUBESFIXPHDNT type codes. This applies to all source types. + + type_codes: + G: "Gallery" + L: "Library" + A: "Archive" + M: "Museum" + O: "Official institution" + R: "Research center" + C: "Corporation" + U: "Unknown" + B: "Botanical garden / Zoo" + E: "Education provider" + S: "Collecting society" + F: "Feature / Monument" + I: "Intangible heritage" + X: "Mixed types" + P: "Personal collection" + H: "Holy site" + D: "Digital platform" + N: "NGO" + T: "Taste/smell heritage" + + detection_approach: | + Heritage relevance detection is SOURCE-SPECIFIC and belongs in the + application layer, not the convention. The convention defines: + 1. The type code vocabulary (GLAMORCUBESFIXPHDNT) + 2. The property for tagging (picom:heritageRelevance) + 3. The expected format (single-letter code + confidence) + + Application scripts implement source-specific keyword detection, + organization matching, or ML classification to populate this field. + +# ============================================================================= +# GLM-4.6 CH-ANNOTATOR INTEGRATION +# ============================================================================= +# +# The CH-Annotator can be invoked via GLM-4.6 API for automated extraction. +# The system prompt is SOURCE-AGNOSTIC and works with any text input. 
+# ============================================================================= + +glm_annotator_config: + model: "glm-4.6" + api_endpoint: "https://api.z.ai/api/coding/paas/v4/chat/completions" + temperature: 0.1 + max_tokens: 4000 + + # --------------------------------------------------------------------------- + # Core System Prompt (Source-Agnostic) + # --------------------------------------------------------------------------- + + system_prompt: | + You are a CH-Annotator (Cultural Heritage Annotator) v1.7.0 extraction agent + with PiCo (Person in Context) ontology integration. + + ## Your Task + Extract structured person observation data from the provided source text. + The source may be a modern digital profile, historical index, archival + description, or any other document containing person references. + + ## Core PiCo Pattern + Every person mention creates a PersonObservation that is: + - SOURCE-BOUND: Exact transcription from source, no normalization + - PROVENANCE-TRACKED: Linked to source document and extraction timestamp + - RECONSTRUCTION-READY: Can be linked to formal Person entity later + + ## Person Name Vocabulary (PNV) + Parse names into components (use null for missing parts): + - literalName: Full name exactly as written in source + - givenName: First/given name + - patronym: Patronymic (Janszoon, -dochter, bin, ibn, mac) + - surnamePrefix: Tussenvoegsel/particle (van, de, von, di, du) + - baseSurname: Core surname without prefix + - honorificPrefix: Title before name (Dr., Prof., Heer, Meester) + - honorificSuffix: Credentials after name (PhD, Jr., III) + - initials: Initials with periods (e.g., "P.R.", "C.Joh.") + + ## Language-Specific Name Rules + + ### Dutch + - Tussenvoegsel lowercase after given name: "Jan van Gogh" + - Capitalized when standalone: "Van Gogh painted..." 
+ - Common: van, de, van de, van den, van der, 't, 's, op de + + ### Historical/Latin + - Patronymics: -zoon/-zn, -dochter/-dr, -s (Janszoon, Pietersdochter) + - Latinized forms: -us, -ius endings (Erasmus Roterodamus) + - Occupational surnames may be literal (de bakker = the baker) + + ## Role Extraction + Extract roles/occupations with temporal bounds when available: + - Role title exactly as stated + - Associated organization (link to GRP hypernym if institution) + - Start/end dates or period + - Heritage relevance code if applicable (GLAMORCUBESFIXPHDNT) + - Role in source context (from picot_roles thesaurus): + * child, parent, spouse, witness, declarant, bride, groom, godparent, etc. + + ## Biographical Properties + Extract when present in source (use null if not stated): + - birth_date / death_date: ISO format (YYYY, YYYY-MM, or YYYY-MM-DD) + - birth_place / death_place: Place name as written + - gender: "Male" or "Female" (only if explicitly stated or supported by evidence in the source, e.g. a gendered kinship term; otherwise null) + - age: Age as stated (e.g., "30", "4 months", "about 25") + - religion: Religious affiliation if mentioned + - deceased: true only if death indicated but date unknown + - address: Physical address as recorded in source + - floruit: Active period if birth/death unknown + + ## Family Relationship Extraction + + CRITICAL: For PersonObservations, family relationships MUST refer to OTHER + persons mentioned in the SAME source document. Cross-source relationships + belong to PersonReconstructions.
+ + ### Core Family Relationships + - parent: A parent of the person (use sdo:parent) + - children: Children of the person (use sdo:children) + - spouse: Current spouse (use sdo:spouse) + - sibling: Brother or sister (use sdo:sibling) + + ### Extended Family + - grandparent / grandchild + - uncle_aunt / nephew_niece + - cousin (symmetric) + + ### Step/Half Relations + - stepparent / stepchild + - stepsibling + - half_sibling (one shared parent) + + ### Ritual/Legal Kinship (common in historical records) + - godparent / godchild: Baptismal sponsors + - foster_parent / foster_child + - legitimized_child: Child recognized through marriage/legal act + + ### In-Law Relations + - parent_in_law / child_in_law + - sibling_in_law + + ### Former Partners + - widow_of: Surviving spouse of deceased (subject is the survivor) + - previous_partner: Former spouse/partner + + ### Historical Source Patterns + Common relationship indicators in historical documents (by language): + + **Dutch**: "huijsvrou van" (wife), "zoon van" (son of), "weduwe van" (widow), + "peter/meter" (godfather/godmother), "getuige" (witness) + + **Latin**: "filius/filia" (son/daughter), "uxor" (wife), "vidua" (widow), + "quondam" (the late) + + **German**: "Ehefrau von" (wife), "Sohn/Tochter von" (son/daughter of), + "Witwe von" (widow of) + + **Arabic** (نسب - patronymic): "ابن/بن" (ibn/bin - son of), "بنت" (bint - daughter of), + "زوج/زوجة" (zawj/zawja - husband/wife), "أرملة" (armala - widow), + "المرحوم" (al-marhum - the late), "آل" (Al - family of) + + **Ottoman Turkish**: "oğlu" (son of), "kızı" (daughter of), "zevcesi" (wife), + "merhum/merhume" (the late) + + **French**: "fils/fille de" (son/daughter of), "épouse de" (wife of), + "veuve de" (widow of), "feu/feue" (the late) + + **Hebrew**: "בן/בת" (ben/bat - son/daughter of), "אשת" (eshet - wife of), + "אלמנה" (almana - widow), "ז״ל" (z"l - of blessed memory) + + **Persian/Farsi**: "پسر/دختر" (pesar/dokhtar - son/daughter), "زن" (zan - wife), + 
"بیوه" (biveh - widow), "مرحوم" (marhum - the late) + + **Spanish**: "hijo/hija de" (son/daughter of), "esposa de" (wife of), + "viuda de" (widow of), "padrino/madrina" (godfather/godmother) + + **Portuguese**: "filho/filha de" (son/daughter of), "esposa de" (wife of), + "viúva de" (widow of), "padrinho/madrinha" (godfather/godmother) + + For comprehensive patterns (10 languages): modules/relationships/family.yaml + + ## Source Types (for source_type field) + Use appropriate category: + - modern_digital: LinkedIn, staff directories, ORCID + - historical_indices: Notarial protocols, guild lists + - civil_registration: Birth/marriage/death certificates + - church_records: Baptism, marriage, burial registers + - archival_descriptions: Finding aids, inventories + - biographical_dictionaries: DNB, AKL, reference works + - census: Population census records + + ## Output Format + Return ONLY valid JSON (no markdown, no explanation): + + { + "pico_observation": { + "observation_id": "", + "observed_at": "", + "source_type": "", + "source_reference": "" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "literalName": "Name as written", + "givenName": null, + "patronym": null, + "surnamePrefix": null, + "baseSurname": null, + "honorificPrefix": null, + "honorificSuffix": null, + "initials": null + }, + "roles": [ + { + "role_title": "Role as stated", + "role_in_source": "child|declarant|witness|bride|groom|null", + "organization": "Org name if mentioned", + "period": "Temporal info if available", + "heritage_relevant": false, + "heritage_type": null + } + ], + "biographical": { + "birth_date": null, + "death_date": null, + "birth_place": null, + "death_place": null, + "gender": null, + "age": null, + "religion": null, + "deceased": null, + "address": null, + "floruit": null + }, + "family_relationships": { + "parent": [], + "children": [], + "spouse": [], + "sibling": [], + "grandparent": [], + "grandchild": [], + "uncle_aunt": [], + "nephew_niece": [], + 
"cousin": [], + "stepparent": [], + "stepchild": [], + "stepsibling": [], + "half_sibling": [], + "foster_parent": [], + "foster_child": [], + "godparent": [], + "godchild": [], + "parent_in_law": [], + "child_in_law": [], + "sibling_in_law": [], + "previous_partner": [], + "widow_of": null + }, + "context": "Surrounding text for disambiguation" + } + ], + "organizations_mentioned": [ + { + "name": "Organization name", + "type": "Heritage type code or null", + "role_in_source": "employer|creator|publisher|etc" + } + ], + "temporal_references": [ + { + "expression": "Date/period as written", + "normalized": "ISO date if parseable", + "type": "DATE|DURATION|SET" + } + ], + "locations_mentioned": [ + { + "name": "Place name as written", + "type": "city|region|country|address" + } + ] + } + + ## Relationship Reference Format + Family relationship arrays contain references to other persons in same source: + - Use person_index (integer) to reference persons array position + - Include target_name for readability + + Example for a marriage record: + ```json + { + "person_index": 0, + "pnv_name": {"literalName": "Jan Pietersz"}, + "family_relationships": { + "spouse": [{"person_index": 1, "target_name": "Maria Jansdr"}], + "parent": [{"person_index": 2, "target_name": "Pieter Jansz"}] + } + } + ``` + + ## Critical Rules + 1. ONLY extract data that EXISTS in the source. NEVER fabricate. + 2. Use null for missing fields, [] for empty arrays. + 3. Preserve original spelling/language from source. + 4. heritage_type must be single-letter GLAMORCUBESFIXPHDNT code. + 5. For historical sources, preserve archaic spellings exactly. + 6. Extract ALL persons mentioned, not just the primary subject. + 7. Family relationships MUST reference persons in SAME source only. + 8. Use person_index for relationship references (0-based array index). + 9. Gender: only "Male"/"Female"/null - never infer without evidence. + 10. Age: preserve as stated, include qualifier ("about 25", "4 months"). 
+ 11. For role_in_source, use picot_roles terms when applicable. + +# ============================================================================= +# PERSON RECONSTRUCTION PATTERN +# ============================================================================= +# +# PersonReconstruction is a reconstructed person entity derived from one or +# more PersonObservations. It represents the scholarly consensus about a +# historical person based on available evidence. +# ============================================================================= + +person_reconstruction_pattern: + description: | + A PersonReconstruction is created by linking one or more PersonObservations + to form a unified person entity. This is the scholarly interpretation layer + that connects source-bound observations to a conceptual person. + + Key distinction: + - PersonObservation: What is OBSERVED in a specific source (exact transcription) + - PersonReconstruction: What is INFERRED about the person (normalized, linked) + + A single PersonReconstruction may derive from observations across: + - Multiple sources (birth record + marriage record + death record) + - Different time periods (mentions across decades) + - Various name forms ("Jan Jansz" + "Johannes Jansen" + "J. 
Jansen") + + class: "pico:PersonReconstruction" + class_uri: "https://personsincontext.org/model#PersonReconstruction" + superclass: "pico:Person" + + required_properties: + - property: "prov:wasDerivedFrom" + description: "Links to source PersonObservation(s)" + range: "pico:PersonObservation" + cardinality: "1..*" + note: "Every reconstruction MUST link to at least one observation" + + - property: "prov:wasGeneratedBy" + description: "Links to the reconstruction Activity" + range: "prov:Activity" + cardinality: "1" + note: "Documents how/when/by whom reconstruction was created" + + optional_properties: + - property: "prov:wasRevisionOf" + description: "Links to previous version of this reconstruction" + range: "pico:PersonReconstruction" + cardinality: "0..1" + note: "For tracking updates to reconstructions over time" + + - property: "sdo:name" + description: "Normalized/preferred name form" + range: "xsd:string" + note: "The canonical name for this person" + + - property: "sdo:additionalName" + description: "Structured name following PNV" + range: "pnv:PersonName" + note: "Full name breakdown using Person Name Vocabulary" + + - property: "sdo:givenName" + description: "Given/first name" + range: "xsd:string" + + - property: "sdo:familyName" + description: "Family/surname" + range: "xsd:string" + + - property: "sdo:gender" + description: "Gender of the person" + range: "sdo:GenderType" + values: ["sdo:Male", "sdo:Female"] + + - property: "sdo:birthDate" + description: "Birth date (ISO 8601)" + range: "xsd:date" + note: "May be incomplete: type YYYY as xsd:gYear, YYYY-MM as xsd:gYearMonth, and YYYY-MM-DD as xsd:date" + + - property: "sdo:birthPlace" + description: "Place of birth" + range: "xsd:string or xsd:anyURI" + note: "Prefer linking to GeoNames or Wikidata" + + - property: "sdo:deathDate" + description: "Death date (ISO 8601; may be incomplete, typed like sdo:birthDate)" + range: "xsd:date" + + - property: "sdo:deathPlace" + description: "Place of death" + range: "xsd:string or xsd:anyURI" + + example: + description: "PersonReconstruction derived
from multiple observations" + turtle: | + cbg:person_reconstruction_anna_koppen + a pico:PersonReconstruction ; + sdo:name "Anna Maria Koppen" ; + sdo:familyName "Koppen" ; + sdo:givenName "Anna Maria" ; + sdo:gender sdo:Female ; + sdo:birthPlace "Haarlem" ; + sdo:birthDate "1860-03-31"^^xsd:date ; + sdo:deathPlace "Detroit, USA" ; + sdo:deathDate "1926"^^xsd:gYear ; + prov:wasDerivedFrom nha:marriage_1885_po_1 , + cbg:emigration_1887_po_1 , + us:death_1926_po_1 ; + prov:wasGeneratedBy cbg:reconstruction_activity_01 . + +# ============================================================================= +# SOURCE AND SCAN CLASSES +# ============================================================================= +# +# Sources (sdo:ArchiveComponent) and Scans (sdo:ImageObject) document where +# PersonObservations were extracted from. Essential for provenance. +# ============================================================================= + +source_classes: + + archive_component: + description: | + A Source document from which PersonObservations are extracted. + PiCo does not aim to fully describe archival sources (use RiC-O or DC for that), + but requires minimal identification for provenance tracking. 
+ + class: "sdo:ArchiveComponent" + class_uri: "https://schema.org/ArchiveComponent" + superclass: "sdo:CreativeWork" + + properties: + - property: "sdo:name" + description: "Identifying name for the source" + range: "xsd:string" + cardinality: "1" + note: "Combine title, date, archive location for identification" + example: "BS Marriage Haarlem, November 11, 1885, certificate number 321" + + - property: "sdo:additionalType" + description: "Type of source document" + range: "picot_sourcetypes:Concept" + note: "Use PiCo SourceType thesaurus" + + - property: "sdo:dateCreated" + description: "Date the source was created" + range: "xsd:date" + + - property: "sdo:holdingArchive" + description: "Institution holding the source" + range: "xsd:anyURI" + note: "Link to heritage custodian (GHCID or Wikidata)" + + - property: "sdo:url" + description: "Permalink to the source" + range: "sdo:URL" + note: "Preferably a persistent identifier" + + - property: "sdo:contentLocation" + description: "Geographic coverage of the source" + range: "xsd:string or xsd:anyURI" + + - property: "sdo:associatedMedia" + description: "Link to scan(s) of the source" + range: "sdo:ImageObject" + cardinality: "0..*" + + image_object: + description: | + A Scan of a source document. Links to the digital image at the holding archive. 
+ + class: "sdo:ImageObject" + class_uri: "https://schema.org/ImageObject" + superclass: "sdo:CreativeWork" + + properties: + - property: "sdo:url" + description: "URL to the full scan" + range: "sdo:URL" + note: "Preferably IIIF manifest" + + - property: "sdo:thumbnail" + description: "URL to thumbnail image" + range: "sdo:ImageObject" + + - property: "sdo:embedUrl" + description: "URL to image viewer" + range: "sdo:URL" + + - property: "sdo:position" + description: "Position in sequence of scans" + range: "xsd:int" + note: "For multi-page sources" + +# ============================================================================= +# BIOGRAPHICAL PROPERTIES +# ============================================================================= +# +# Properties for capturing biographical details about persons in observations. +# These appear in the source and are transcribed to the observation. +# ============================================================================= + +biographical_properties: + description: | + Biographical properties capture personal details as they appear in sources. + These are used for both PersonObservation (source-bound) and + PersonReconstruction (normalized). + + age: + property: "pico:hasAge" + property_uri: "https://personsincontext.org/model#hasAge" + description: "Age of person as stated in source" + range: "xsd:string" + domain: "pico:PersonObservation" + note: | + Used when birth date unknown but age is recorded. + Age assumed in years unless specified ("4" = 4 years, "4 months" = 4 months). + Numerical preferred over text ("4" not "four"). 
+ examples: + - "30" + - "4 months" + - "about 25" + + religion: + property: "pico:hasReligion" + property_uri: "https://personsincontext.org/model#hasReligion" + description: "Religious affiliation as stated in source" + range: "xsd:string or xsd:anyURI" + domain: "pico:Person" + note: "Can link to SKOS thesaurus for religions" + examples: + - "Catholic" + - "Reformed" + - "Jewish" + + deceased: + property: "pico:deceased" + property_uri: "https://personsincontext.org/model#deceased" + description: "Indication that person is deceased (when death date unknown)" + range: "xsd:boolean" + domain: "pico:PersonObservation" + note: | + Only used when deathDate is unknown but death is indicated. + A person without deathDate and without deceased:true is assumed alive. + Important for privacy considerations in publishing person records. + + gender: + property: "sdo:gender" + property_uri: "https://schema.org/gender" + description: "Gender of the person" + range: "sdo:GenderType" + domain: "pico:Person" + values: + - uri: "sdo:Male" + label: "Male" + - uri: "sdo:Female" + label: "Female" + + address: + property: "sdo:address" + property_uri: "https://schema.org/address" + description: "Physical address as mentioned in source" + range: "xsd:string" + domain: "pico:PersonObservation" + note: "Address exactly as recorded in source" + + initials: + property: "pnv:initials" + property_uri: "https://w3id.org/pnv#initials" + description: "Initials of given name(s)" + range: "xsd:string" + domain: "pnv:PersonName" + note: "Each initial followed by period (e.g., 'P.R.', 'H.A.F.M.O.')" + examples: + - "P.R." + - "C.Joh." + - "H.A.F.M.O." + +# ============================================================================= +# FAMILY RELATIONSHIP PROPERTIES +# ============================================================================= +# +# PiCo defines extensive family relationship properties for genealogical data. 
+# These enable modeling complex family structures from historical records. +# ============================================================================= + +family_relationships: + description: | + Family relationship properties link persons within and across sources. + + Rules: + - For PersonObservations: relationships refer to OTHER observations on SAME source + - For PersonReconstructions: relationships refer to other reconstructions + + Property characteristics: + - Symmetric: If A hasRelation B, then B hasRelation A (spouses, siblings, cousins) + - Transitive: hasAncestor/hasDescendant chain through generations + - Inverse pairs: parent/children, grandparent/grandchild, etc. + + # --------------------------------------------------------------------------- + # Core Family (Schema.org) + # --------------------------------------------------------------------------- + + core_relationships: + - property: "sdo:parent" + property_uri: "https://schema.org/parent" + description: "A parent of the person" + inverse: "sdo:children" + subPropertyOf: ["sdo:relatedTo", "pico:hasAncestor"] + note: "Biological or legal parent" + + - property: "sdo:children" + property_uri: "https://schema.org/children" + description: "A child of the person" + inverse: "sdo:parent" + subPropertyOf: ["sdo:relatedTo", "pico:hasDescendant"] + + - property: "sdo:spouse" + property_uri: "https://schema.org/spouse" + description: "The person's spouse" + symmetric: true + subPropertyOf: "sdo:relatedTo" + + - property: "sdo:sibling" + property_uri: "https://schema.org/sibling" + description: "A brother or sister" + symmetric: true + subPropertyOf: "sdo:relatedTo" + + # --------------------------------------------------------------------------- + # Transitive Ancestry (PiCo) + # --------------------------------------------------------------------------- + + ancestry_relationships: + - property: "pico:hasAncestor" + property_uri: "https://personsincontext.org/model#hasAncestor" + description: "Any 
ancestor (parent, grandparent, etc.)" + type: "owl:TransitiveProperty" + inverse: "pico:hasDescendant" + note: "Not used directly; parent→parent chains automatically create ancestors" + + - property: "pico:hasDescendant" + property_uri: "https://personsincontext.org/model#hasDescendant" + description: "Any descendant (child, grandchild, etc.)" + type: "owl:TransitiveProperty" + inverse: "pico:hasAncestor" + + # --------------------------------------------------------------------------- + # Grandparents/Grandchildren + # --------------------------------------------------------------------------- + + grandparent_relationships: + - property: "pico:hasGrandparent" + property_uri: "https://personsincontext.org/model#hasGrandparent" + inverse: "pico:hasGrandchild" + + - property: "pico:hasGrandchild" + property_uri: "https://personsincontext.org/model#hasGrandchild" + inverse: "pico:hasGrandparent" + + - property: "pico:hasGreat-grandparent" + property_uri: "https://personsincontext.org/model#hasGreat-grandparent" + inverse: "pico:hasGreat-grandchild" + + - property: "pico:hasGreat-grandchild" + property_uri: "https://personsincontext.org/model#hasGreat-grandchild" + inverse: "pico:hasGreat-grandparent" + + # --------------------------------------------------------------------------- + # Aunts/Uncles and Nieces/Nephews + # --------------------------------------------------------------------------- + + extended_family: + - property: "pico:hasUncle_Aunt" + property_uri: "https://personsincontext.org/model#hasUncle_Aunt" + description: "An uncle or aunt (sibling of parent)" + inverse: "pico:hasNephew_Niece" + + - property: "pico:hasNephew_Niece" + property_uri: "https://personsincontext.org/model#hasNephew_Niece" + description: "A nephew or niece (child of sibling)" + inverse: "pico:hasUncle_Aunt" + + - property: "pico:hasCousin" + property_uri: "https://personsincontext.org/model#hasCousin" + description: "A cousin (child of parent's sibling)" + symmetric: true + + # 
--------------------------------------------------------------------------- + # Step-family + # --------------------------------------------------------------------------- + + step_relationships: + - property: "pico:hasStepparent" + property_uri: "https://personsincontext.org/model#hasStepparent" + description: "A stepparent (spouse of biological parent)" + inverse: "pico:hasStepchild" + + - property: "pico:hasStepchild" + property_uri: "https://personsincontext.org/model#hasStepchild" + inverse: "pico:hasStepparent" + + - property: "pico:hasStepsibling" + property_uri: "https://personsincontext.org/model#hasStepsibling" + description: "A stepbrother or stepsister" + symmetric: true + + - property: "pico:hasHalf-sibling" + property_uri: "https://personsincontext.org/model#hasHalf-sibling" + description: "A half-brother or half-sister (one shared parent)" + symmetric: true + + # --------------------------------------------------------------------------- + # Foster/Godparent + # --------------------------------------------------------------------------- + + non_biological_relationships: + - property: "pico:hasFosterParent" + property_uri: "https://personsincontext.org/model#hasFosterParent" + inverse: "pico:hasFosterChild" + + - property: "pico:hasFosterChild" + property_uri: "https://personsincontext.org/model#hasFosterChild" + inverse: "pico:hasFosterParent" + + - property: "pico:hasGodparent" + property_uri: "https://personsincontext.org/model#hasGodparent" + description: "A godparent (witness at baptism)" + inverse: "pico:hasGodchild" + + - property: "pico:hasGodchild" + property_uri: "https://personsincontext.org/model#hasGodchild" + inverse: "pico:hasGodparent" + + - property: "pico:hasLegitimizedChild" + property_uri: "https://personsincontext.org/model#hasLegitimizedChild" + description: "A child legitimized by marriage or legal recognition" + inverse: "pico:isLegitimitezedChildOf" + + - property: "pico:isLegitimitezedChildOf" + property_uri: 
"https://personsincontext.org/model#isLegitimitezedChildOf" + inverse: "pico:hasLegitimizedChild" + + # --------------------------------------------------------------------------- + # In-Laws + # --------------------------------------------------------------------------- + + in_law_relationships: + - property: "pico:hasParent-in-law" + property_uri: "https://personsincontext.org/model#hasParent-in-law" + inverse: "pico:hasChild-in-law" + + - property: "pico:hasChild-in-law" + property_uri: "https://personsincontext.org/model#hasChild-in-law" + inverse: "pico:hasParent-in-law" + + - property: "pico:hasSibling-in-law" + property_uri: "https://personsincontext.org/model#hasSibling-in-law" + description: "Brother/sister-in-law" + symmetric: true + + - property: "pico:hasGrandparent-in-law" + property_uri: "https://personsincontext.org/model#hasGrandparent-in-law" + inverse: "pico:hasGrandchild-in-law" + + - property: "pico:hasGrandchild-in-law" + property_uri: "https://personsincontext.org/model#hasGrandchild-in-law" + inverse: "pico:hasGrandparent-in-law" + + - property: "pico:hasUncle_Aunt-in-law" + property_uri: "https://personsincontext.org/model#hasUncle_Aunt-in-law" + inverse: "pico:hasNephew_Niece-in-law" + + - property: "pico:hasNephew_Niece-in-law" + property_uri: "https://personsincontext.org/model#hasNephew_Niece-in-law" + inverse: "pico:hasUncle_Aunt-in-law" + + - property: "pico:hasCousin-in-law" + property_uri: "https://personsincontext.org/model#hasCousin-in-law" + symmetric: true + + - property: "pico:hasStepparent-in-law" + property_uri: "https://personsincontext.org/model#hasStepparent-in-law" + inverse: "pico:hasStepchild-in-law" + + - property: "pico:hasStepchild-in-law" + property_uri: "https://personsincontext.org/model#hasStepchild-in-law" + inverse: "pico:hasStepparent-in-law" + + # --------------------------------------------------------------------------- + # Former Partners + # 
--------------------------------------------------------------------------- + + former_partner_relationships: + - property: "pico:isWidOf" + property_uri: "https://personsincontext.org/model#isWidOf" + description: "Is widow/widower of deceased spouse" + note: "The subject is the surviving partner" + + - property: "pico:hasPreviousPartner" + property_uri: "https://personsincontext.org/model#hasPreviousPartner" + description: "A former spouse or partner" + symmetric: true + +# ============================================================================= +# PROVENANCE MODEL (PROV-O INTEGRATION) +# ============================================================================= +# +# Enhanced provenance model for tracking observation extraction and +# reconstruction creation activities. +# ============================================================================= + +enhanced_provenance_model: + description: | + PiCo uses W3C PROV-O for provenance tracking at two levels: + + 1. OBSERVATION LEVEL: Where did this observation come from? + - prov:hadPrimarySource → Source document + - prov:wasGeneratedBy → Extraction activity (optional) + + 2. RECONSTRUCTION LEVEL: How was this person entity created? 
+ - prov:wasDerivedFrom → Source observation(s) + - prov:wasGeneratedBy → Reconstruction activity + - prov:wasRevisionOf → Previous reconstruction version + + activity_class: + class: "prov:Activity" + class_uri: "http://www.w3.org/ns/prov#Activity" + description: "The activity that generated a PersonReconstruction" + + properties: + - property: "prov:wasAssociatedWith" + description: "Agent responsible for the activity" + range: "prov:Agent" + + - property: "prov:startedAtTime" + description: "When the activity started" + range: "xsd:dateTime" + + - property: "prov:endedAtTime" + description: "When the activity completed" + range: "xsd:dateTime" + + - property: "prov:used" + description: "Resources/tools used in the activity" + range: "prov:Entity" + note: "E.g., ML model, matching algorithm, rule set" + + types: + human_reconstruction: + description: "Manual reconstruction by researcher" + note: "Provide: time, place, knowledge sources, researcher name" + + algorithmic_reconstruction: + description: "Automated reconstruction by software" + note: "Provide: algorithm name, version, configuration, parameters" + + agent_class: + class: "prov:Agent" + class_uri: "http://www.w3.org/ns/prov#Agent" + description: "Person or organization responsible for reconstruction" + + properties: + - property: "sdo:name" + description: "Name of the agent" + range: "xsd:string" + + - property: "sdo:url" + description: "URL identifying the agent" + range: "sdo:URL" + + examples: + - name: "CBG|Center for Family History" + url: "https://cbg.nl" + type: "organization" + + - name: "GLM-4.6 Person Extractor v1.0" + url: null + type: "software" + + derivation_properties: + - property: "prov:wasDerivedFrom" + property_uri: "http://www.w3.org/ns/prov#wasDerivedFrom" + description: "Links PersonReconstruction to source PersonObservation(s)" + domain: "pico:PersonReconstruction" + range: "pico:PersonObservation" + cardinality: "1..*" + note: "REQUIRED for all PersonReconstructions" + + - 
property: "prov:wasRevisionOf" + property_uri: "http://www.w3.org/ns/prov#wasRevisionOf" + description: "Links to previous version of reconstruction" + domain: "pico:PersonReconstruction" + range: "pico:PersonReconstruction" + cardinality: "0..1" + note: "For tracking reconstruction updates over time" + +# ============================================================================= +# PICO VOCABULARIES/THESAURI +# ============================================================================= +# +# PiCo provides controlled vocabularies for roles, source types, and events. +# ============================================================================= + +pico_vocabularies: + description: | + PiCo defines three SKOS concept schemes for controlled terminology: + + - Roles: The role a person plays in a source (child, declarant, witness, etc.) + - SourceTypes: Types of historical sources (birth certificate, census, etc.) + - EventTypes: Types of life events (birth, marriage, death, etc.) + + roles_thesaurus: + id: "picot_roles" + uri: "https://terms.personsincontext.org/roles/" + type: "skos:ConceptScheme" + label: "Persons in Context role thesaurus" + description: "Roles that persons can have in historical sources" + usage: | + Use pico:hasRole property with a term from this thesaurus. 
+ Example: picot_roles:575 (child), picot_roles:489 (declarant) + example_concepts: + - id: "575" + label: "child" + description: "Person appearing as child in a record" + + - id: "489" + label: "declarant" + description: "Person declaring/reporting an event" + + - id: "witness" + label: "witness" + description: "Person witnessing an event or signing a document" + + - id: "bride" + label: "bride" + description: "Female partner in a marriage" + + - id: "groom" + label: "groom" + description: "Male partner in a marriage" + + sourcetypes_thesaurus: + id: "picot_sourcetypes" + uri: "https://terms.personsincontext.org/sourcetypes/" + type: "skos:ConceptScheme" + label: "Persons in Context sourceType thesaurus" + description: "Types of historical sources containing person observations" + usage: | + Use sdo:additionalType property on sdo:ArchiveComponent. + Example: picot_sourcetypes:551 (civil registry: birth) + example_concepts: + - id: "551" + label: "civil registry: birth" + description: "Birth certificate from civil registration" + + - id: "marriage" + label: "civil registry: marriage" + description: "Marriage certificate" + + - id: "death" + label: "civil registry: death" + description: "Death certificate" + + - id: "census" + label: "census" + description: "Population census record" + + - id: "church_baptism" + label: "church record: baptism" + description: "Baptismal record from church register" + + - id: "notarial" + label: "notarial record" + description: "Notarial act or protocol" + + eventtypes_thesaurus: + id: "picot_eventtypes" + uri: "https://terms.personsincontext.org/eventtypes/" + type: "skos:ConceptScheme" + label: "Persons in Context eventType thesaurus" + description: "Types of life events documented in sources" + example_concepts: + - id: "birth" + label: "birth" + + - id: "baptism" + label: "baptism" + + - id: "marriage" + label: "marriage" + + - id: "death" + label: "death" + + - id: "burial" + label: "burial" + + - id: "emigration" + label: 
"emigration" + + - id: "immigration" + label: "immigration" + +# ============================================================================= +# GLM ANNOTATOR OUTPUT SCHEMA UPDATE +# ============================================================================= +# +# Extended output schema for GLM-4.6 annotator to include family relationships +# and biographical properties. +# ============================================================================= + +glm_extended_output_schema: + description: | + Extended JSON output schema that includes all PiCo properties. + This supplements the core system_prompt output format. + + persons_extended: + description: "Extended person object with all PiCo properties" + schema: + pnv_name: + literalName: "string" + givenName: "string|null" + patronym: "string|null" + surnamePrefix: "string|null" + baseSurname: "string|null" + honorificPrefix: "string|null" + honorificSuffix: "string|null" + initials: "string|null" + + biographical: + birth_date: "ISO date|null" + death_date: "ISO date|null" + birth_place: "string|null" + death_place: "string|null" + gender: "Male|Female|null" + age: "string|null" + religion: "string|null" + deceased: "boolean|null" + address: "string|null" + floruit: "string|null" + + roles: "array of role objects" + + family_relationships: + parent: "array of person references" + children: "array of person references" + spouse: "array of person references" + sibling: "array of person references" + grandparent: "array of person references" + grandchild: "array of person references" + uncle_aunt: "array of person references" + nephew_niece: "array of person references" + cousin: "array of person references" + stepparent: "array of person references" + stepchild: "array of person references" + stepsibling: "array of person references" + half_sibling: "array of person references" + foster_parent: "array of person references" + foster_child: "array of person references" + godparent: "array of person references" + 
godchild: "array of person references" + parent_in_law: "array of person references" + child_in_law: "array of person references" + sibling_in_law: "array of person references" + previous_partner: "array of person references" + widow_of: "person reference|null" + + context: "string|null" + +# ============================================================================= +# CH-ANNOTATOR HYPERNYM INTEGRATION UPDATE +# ============================================================================= +# +# Updated hypernym mappings to include reconstruction pattern. +# ============================================================================= + +extended_hypernym_mapping: + description: | + Extended mappings between PiCo classes and CH-Annotator hypernyms, + including the reconstruction pattern. + + mappings: + # Observation level + - pico_class: "pico:PersonObservation" + ch_hypernym: "AGT.PER" + ch_code: "AGT.PER" + note: "Source-bound person mention" + + - pico_class: "pico:PersonObservation" + ch_hypernym: "AGT.STF" + ch_code: "AGT.STF" + condition: "When person has organizational role" + note: "Staff member observation" + + # Reconstruction level + - pico_class: "pico:PersonReconstruction" + ch_hypernym: "AGT.PER" + ch_code: "AGT.PER" + note: "Reconstructed person entity" + linking: true + linking_sources: ["Wikidata", "VIAF", "ISNI"] + + # Name components + - pico_class: "pnv:PersonName" + ch_hypernym: "APP.NAM" + ch_code: "APP.NAM" + note: "Structured name" + + # Roles + - pico_class: "pico:hasRole" + ch_hypernym: "ROL" + ch_code: "ROL" + note: "Role in source" + + # Family relationships + - pico_class: "sdo:parent" + ch_hypernym: "AGT.PER" + relationship_type: "family" + note: "Parent relationship" + + - pico_class: "sdo:spouse" + ch_hypernym: "AGT.PER" + relationship_type: "family" + note: "Spouse relationship" + + - pico_class: "pico:hasGodparent" + ch_hypernym: "AGT.PER" + relationship_type: "ritual_kinship" + note: "Godparent relationship (common in 
historical records)" + + # Sources + - pico_class: "sdo:ArchiveComponent" + ch_hypernym: "WRK.DOC" + ch_code: "WRK.DOC" + note: "Source document" + + # Provenance + - pico_class: "prov:Activity" + ch_hypernym: null + note: "Not directly annotated; tracked in provenance metadata" + + - pico_class: "prov:Agent" + ch_hypernym: "AGT" + ch_code: "AGT" + note: "Extraction/reconstruction agent" + +# ============================================================================= +# HISTORICAL SOURCE EXTRACTION EXAMPLES +# ============================================================================= +# +# Comprehensive examples showing extraction from different historical source types. +# These demonstrate the full PiCo model including family relationships. +# ============================================================================= + +historical_extraction_examples: + description: | + These examples demonstrate extraction from common historical source types, + showing how to capture family relationships, biographical data, and roles + according to the PiCo model. + + # --------------------------------------------------------------------------- + # Example 1: Dutch Marriage Certificate (Burgerlijke Stand) + # --------------------------------------------------------------------------- + + marriage_certificate_example: + source_type: "civil_registration" + source_text: | + Heden den elfden November achttien honderd vijf en tachtig, zijn voor ons + Ambtenaar van den Burgerlijken Stand der gemeente Haarlem, verschenen: + Cornelis Johannes Koppen, oud dertig jaren, schilder, geboren te Haarlem, + wonende alhier, meerderjarige zoon van wijlen Pieter Koppen en van + Anna Maria Brouwer, zonder beroep, wonende alhier; + en Anna Maria Visser, oud zeven en twintig jaren, zonder beroep, geboren + te Amsterdam, wonende alhier, meerderjarige dochter van Jan Visser, + koopman, en van wijlen Cornelia de Vries. 
+ + Als getuigen waren tegenwoordig: Hendrik Koppen, oud vijf en dertig jaren, + schilder, broeder van den bruidegom; en Willem Visser, oud twee en dertig + jaren, timmerman, broeder van de bruid. + + expected_output: + pico_observation: + observation_id: "bs_haarlem_1885_marriage_321" + observed_at: "2025-12-12T10:00:00Z" + source_type: "civil_registration" + source_reference: "BS Marriage Haarlem, November 11, 1885, certificate 321" + + persons: + - person_index: 0 + pnv_name: + literalName: "Cornelis Johannes Koppen" + givenName: "Cornelis Johannes" + baseSurname: "Koppen" + roles: + - role_title: "schilder" + role_in_source: "groom" + biographical: + age: "30" + birth_place: "Haarlem" + address: "Haarlem" + family_relationships: + parent: + - person_index: 2 + target_name: "Pieter Koppen" + - person_index: 3 + target_name: "Anna Maria Brouwer" + spouse: + - person_index: 1 + target_name: "Anna Maria Visser" + sibling: + - person_index: 6 + target_name: "Hendrik Koppen" + + - person_index: 1 + pnv_name: + literalName: "Anna Maria Visser" + givenName: "Anna Maria" + baseSurname: "Visser" + roles: + - role_in_source: "bride" + biographical: + age: "27" + birth_place: "Amsterdam" + address: "Haarlem" + family_relationships: + parent: + - person_index: 4 + target_name: "Jan Visser" + - person_index: 5 + target_name: "Cornelia de Vries" + spouse: + - person_index: 0 + target_name: "Cornelis Johannes Koppen" + sibling: + - person_index: 7 + target_name: "Willem Visser" + + - person_index: 2 + pnv_name: + literalName: "Pieter Koppen" + givenName: "Pieter" + baseSurname: "Koppen" + biographical: + deceased: true + family_relationships: + children: + - person_index: 0 + target_name: "Cornelis Johannes Koppen" + - person_index: 6 + target_name: "Hendrik Koppen" + spouse: + - person_index: 3 + target_name: "Anna Maria Brouwer" + + - person_index: 3 + pnv_name: + literalName: "Anna Maria Brouwer" + givenName: "Anna Maria" + baseSurname: "Brouwer" + roles: + - role_title: 
"zonder beroep" + biographical: + address: "Haarlem" + family_relationships: + children: + - person_index: 0 + target_name: "Cornelis Johannes Koppen" + - person_index: 6 + target_name: "Hendrik Koppen" + widow_of: + person_index: 2 + target_name: "Pieter Koppen" + + - person_index: 4 + pnv_name: + literalName: "Jan Visser" + givenName: "Jan" + baseSurname: "Visser" + roles: + - role_title: "koopman" + family_relationships: + children: + - person_index: 1 + target_name: "Anna Maria Visser" + - person_index: 7 + target_name: "Willem Visser" + spouse: + - person_index: 5 + target_name: "Cornelia de Vries" + + - person_index: 5 + pnv_name: + literalName: "Cornelia de Vries" + givenName: "Cornelia" + surnamePrefix: "de" + baseSurname: "Vries" + biographical: + deceased: true + family_relationships: + children: + - person_index: 1 + target_name: "Anna Maria Visser" + - person_index: 7 + target_name: "Willem Visser" + spouse: + - person_index: 4 + target_name: "Jan Visser" + + - person_index: 6 + pnv_name: + literalName: "Hendrik Koppen" + givenName: "Hendrik" + baseSurname: "Koppen" + roles: + - role_title: "schilder" + role_in_source: "witness" + biographical: + age: "35" + family_relationships: + sibling: + - person_index: 0 + target_name: "Cornelis Johannes Koppen" + parent: + - person_index: 2 + target_name: "Pieter Koppen" + - person_index: 3 + target_name: "Anna Maria Brouwer" + + - person_index: 7 + pnv_name: + literalName: "Willem Visser" + givenName: "Willem" + baseSurname: "Visser" + roles: + - role_title: "timmerman" + role_in_source: "witness" + biographical: + age: "32" + family_relationships: + sibling: + - person_index: 1 + target_name: "Anna Maria Visser" + parent: + - person_index: 4 + target_name: "Jan Visser" + - person_index: 5 + target_name: "Cornelia de Vries" + + temporal_references: + - expression: "den elfden November achttien honderd vijf en tachtig" + normalized: "1885-11-11" + type: "DATE" + + locations_mentioned: + - name: "Haarlem" + type: 
"city" + - name: "Amsterdam" + type: "city" + + provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on authentic Dutch civil + registry (Burgerlijke Stand) marriage certificate formulae for + demonstration purposes. Names, dates, and locations are fictional + but follow authentic 19th-century patterns. + For real examples, see PROVENANCE_SOURCES.md. + + related_real_sources: + - archive: "Centraal Bureau voor Genealogie (CBG)" + project: "WieWasWie" + digital_url: "https://www.wiewaswie.nl/" + document_type: "Birth, marriage, death certificates" + period: "1811-present (civil); 1600s+ (church)" + language: "Dutch" + license: "Subscription / Free at archives" + + - archive: "Noord-Hollands Archief" + coverage: "Civil registry from 1811, church records from 1600s" + location: "Haarlem, Netherlands" + document_types: "Dutch civil registry records" + + # --------------------------------------------------------------------------- + # Example 2: Early Modern Notarial Protocol Index Entry + # --------------------------------------------------------------------------- + + notarial_index_example: + source_type: "historical_indices" + source_text: | + Notarial Archive Amsterdam, inv. 5075/1234 + 30 January 1680 + + Before notary Pieter van der Meer appeared: + Jacob Janszoon van der Hoeven, merchant of this city, + with his wife Maritgen Claes, for themselves and as + guardians (voogden) of the minor children of the late + Claes Jacobsz and Aeltgen Pieters, namely: + - Jan Claeszoon, aged about 16 years + - Trijntgen Claesdr, aged about 12 years + + Witnesses: Hendrick Jansz, baker, and Cornelis Pietersz, + schoolmaster, both of this city. + + expected_output: + pico_observation: + observation_id: "na_amsterdam_5075_1234" + observed_at: "2025-12-12T10:00:00Z" + source_type: "historical_indices" + source_reference: "Notarial Archive Amsterdam, inv. 
5075/1234, 30 January 1680" + + persons: + - person_index: 0 + pnv_name: + literalName: "Jacob Janszoon van der Hoeven" + givenName: "Jacob" + patronym: "Janszoon" + surnamePrefix: "van der" + baseSurname: "Hoeven" + roles: + - role_title: "merchant" + role_in_source: "declarant" + - role_title: "voogd" + role_in_source: null + biographical: + address: "Amsterdam" + family_relationships: + spouse: + - person_index: 1 + target_name: "Maritgen Claes" + + - person_index: 1 + pnv_name: + literalName: "Maritgen Claes" + givenName: "Maritgen" + patronym: "Claes" + roles: + - role_in_source: "declarant" + - role_title: "voogd" + family_relationships: + spouse: + - person_index: 0 + target_name: "Jacob Janszoon van der Hoeven" + + - person_index: 2 + pnv_name: + literalName: "Claes Jacobsz" + givenName: "Claes" + patronym: "Jacobsz" + biographical: + deceased: true + family_relationships: + spouse: + - person_index: 3 + target_name: "Aeltgen Pieters" + children: + - person_index: 4 + target_name: "Jan Claeszoon" + - person_index: 5 + target_name: "Trijntgen Claesdr" + + - person_index: 3 + pnv_name: + literalName: "Aeltgen Pieters" + givenName: "Aeltgen" + patronym: "Pieters" + biographical: + deceased: true + family_relationships: + spouse: + - person_index: 2 + target_name: "Claes Jacobsz" + children: + - person_index: 4 + target_name: "Jan Claeszoon" + - person_index: 5 + target_name: "Trijntgen Claesdr" + + - person_index: 4 + pnv_name: + literalName: "Jan Claeszoon" + givenName: "Jan" + patronym: "Claeszoon" + roles: + - role_in_source: "child" + biographical: + age: "about 16" + family_relationships: + parent: + - person_index: 2 + target_name: "Claes Jacobsz" + - person_index: 3 + target_name: "Aeltgen Pieters" + sibling: + - person_index: 5 + target_name: "Trijntgen Claesdr" + + - person_index: 5 + pnv_name: + literalName: "Trijntgen Claesdr" + givenName: "Trijntgen" + patronym: "Claesdr" + roles: + - role_in_source: "child" + biographical: + age: "about 12" + 
gender: "Female" + family_relationships: + parent: + - person_index: 2 + target_name: "Claes Jacobsz" + - person_index: 3 + target_name: "Aeltgen Pieters" + sibling: + - person_index: 4 + target_name: "Jan Claeszoon" + + - person_index: 6 + pnv_name: + literalName: "Pieter van der Meer" + givenName: "Pieter" + surnamePrefix: "van der" + baseSurname: "Meer" + roles: + - role_title: "notary" + + - person_index: 7 + pnv_name: + literalName: "Hendrick Jansz" + givenName: "Hendrick" + patronym: "Jansz" + roles: + - role_title: "baker" + role_in_source: "witness" + biographical: + address: "Amsterdam" + + - person_index: 8 + pnv_name: + literalName: "Cornelis Pietersz" + givenName: "Cornelis" + patronym: "Pietersz" + roles: + - role_title: "schoolmaster" + role_in_source: "witness" + biographical: + address: "Amsterdam" + + temporal_references: + - expression: "30 January 1680" + normalized: "1680-01-30" + type: "DATE" + + locations_mentioned: + - name: "Amsterdam" + type: "city" + + provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on authentic early modern + notarial protocol index entry formulae for demonstration purposes. + Names, dates, and locations are fictional but follow authentic + 17th-century Dutch notarial patterns. + For real examples, see PROVENANCE_SOURCES.md. 
+ + related_real_sources: + - archive: "Stadsarchief Amsterdam" + collection: "Notarial Archives (Notariële Archieven)" + document_type: "Notarial protocols, contracts, testaments" + period: "1578-1915" + language: "Dutch, Latin" + notes: "Largest notarial archive in the Netherlands" + + - project: "TICCLAT (Transliteration of Early Modern Dutch Notarial Archives)" + coverage: "Amsterdam notarial indices" + period: "17th-18th century" + notes: "Machine-readable indices to notarial protocols" + + # --------------------------------------------------------------------------- + # Example 3: Church Baptismal Record with Godparents + # --------------------------------------------------------------------------- + + baptism_record_example: + source_type: "church_records" + source_text: | + Den 15en Meij 1702 is gedoopt + Johanna, dochter van Willem Hendriksen en Geertruijd Jans, + getuijgen waren de E. Heer Jan Willem van Beverwijck + ende Juffrou Maria van Loon, huijsvrouw van de heer + Pieter Anthonisz Verschoor. 
+ + expected_output: + pico_observation: + observation_id: "dtb_amsterdam_1702_baptism_johanna" + observed_at: "2025-12-12T10:00:00Z" + source_type: "church_records" + source_reference: "DTB Amsterdam, 15 May 1702" + + persons: + - person_index: 0 + pnv_name: + literalName: "Johanna" + givenName: "Johanna" + roles: + - role_in_source: "child" + biographical: + gender: "Female" + family_relationships: + parent: + - person_index: 1 + target_name: "Willem Hendriksen" + - person_index: 2 + target_name: "Geertruijd Jans" + godparent: + - person_index: 3 + target_name: "Jan Willem van Beverwijck" + - person_index: 4 + target_name: "Maria van Loon" + + - person_index: 1 + pnv_name: + literalName: "Willem Hendriksen" + givenName: "Willem" + patronym: "Hendriksen" + biographical: + gender: "Male" + family_relationships: + children: + - person_index: 0 + target_name: "Johanna" + spouse: + - person_index: 2 + target_name: "Geertruijd Jans" + + - person_index: 2 + pnv_name: + literalName: "Geertruijd Jans" + givenName: "Geertruijd" + patronym: "Jans" + biographical: + gender: "Female" + family_relationships: + children: + - person_index: 0 + target_name: "Johanna" + spouse: + - person_index: 1 + target_name: "Willem Hendriksen" + + - person_index: 3 + pnv_name: + literalName: "Jan Willem van Beverwijck" + givenName: "Jan Willem" + surnamePrefix: "van" + baseSurname: "Beverwijck" + honorificPrefix: "de E. 
Heer" + roles: + - role_in_source: "witness" + biographical: + gender: "Male" + family_relationships: + godchild: + - person_index: 0 + target_name: "Johanna" + + - person_index: 4 + pnv_name: + literalName: "Maria van Loon" + givenName: "Maria" + surnamePrefix: "van" + baseSurname: "Loon" + honorificPrefix: "Juffrou" + roles: + - role_in_source: "witness" + biographical: + gender: "Female" + family_relationships: + godchild: + - person_index: 0 + target_name: "Johanna" + spouse: + - person_index: 5 + target_name: "Pieter Anthonisz Verschoor" + + - person_index: 5 + pnv_name: + literalName: "Pieter Anthonisz Verschoor" + givenName: "Pieter" + patronym: "Anthonisz" + baseSurname: "Verschoor" + honorificPrefix: "de heer" + biographical: + gender: "Male" + family_relationships: + spouse: + - person_index: 4 + target_name: "Maria van Loon" + + temporal_references: + - expression: "Den 15en Meij 1702" + normalized: "1702-05-15" + type: "DATE" + + provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on authentic Dutch Reformed + Church (Nederlandse Hervormde Kerk) baptismal register formulae for + demonstration purposes. Names, dates, and locations are fictional + but follow authentic early 18th-century patterns. + For real examples, see PROVENANCE_SOURCES.md. 
+ + related_real_sources: + - archive: "Various Dutch Regional Archives" + collection: "Doop-, Trouw- en Begraafregisters (DTB)" + document_type: "Church baptism, marriage, burial records" + period: "1600s-1811 (before civil registration)" + language: "Dutch" + notes: "Pre-1811 vital records maintained by churches" + + - archive: "FamilySearch" + collection: "Netherlands, Church Records" + wiki_url: "https://www.familysearch.org/en/wiki/Netherlands_Church_Records" + document_type: "Dutch church baptisms" + license: "Free with registration" + + # --------------------------------------------------------------------------- + # Example 4: Modern LinkedIn Staff Profile + # --------------------------------------------------------------------------- + + linkedin_profile_example: + source_type: "modern_digital" + source_text: | + Dr. Maria van den Berg + Director of Collections | Rijksmuseum + Amsterdam, Netherlands + + About: + Leading the collections management team at the Rijksmuseum since 2018. + Previously Head Curator at the Van Gogh Museum (2012-2018). + PhD in Art History, University of Amsterdam. + + Experience: + - Director of Collections, Rijksmuseum (2018-present) + - Head Curator, Van Gogh Museum (2012-2018) + - Assistant Curator, Stedelijk Museum (2008-2012) + + Education: + - PhD Art History, University of Amsterdam (2008) + - MA Museum Studies, University of Amsterdam (2003) + + expected_output: + pico_observation: + observation_id: "linkedin_maria_van_den_berg_2025" + observed_at: "2025-12-12T10:00:00Z" + source_type: "modern_digital" + source_reference: "https://linkedin.com/in/mariavandenberg" + + persons: + - person_index: 0 + pnv_name: + literalName: "Dr. Maria van den Berg" + givenName: "Maria" + surnamePrefix: "van den" + baseSurname: "Berg" + honorificPrefix: "Dr." 
+ roles: + - role_title: "Director of Collections" + organization: "Rijksmuseum" + period: "2018-present" + heritage_relevant: true + heritage_type: "M" + - role_title: "Head Curator" + organization: "Van Gogh Museum" + period: "2012-2018" + heritage_relevant: true + heritage_type: "M" + - role_title: "Assistant Curator" + organization: "Stedelijk Museum" + period: "2008-2012" + heritage_relevant: true + heritage_type: "M" + biographical: + address: "Amsterdam, Netherlands" + family_relationships: {} + context: "Heritage sector professional with museum career" + + organizations_mentioned: + - name: "Rijksmuseum" + type: "M" + role_in_source: "employer" + - name: "Van Gogh Museum" + type: "M" + role_in_source: "employer" + - name: "Stedelijk Museum" + type: "M" + role_in_source: "employer" + - name: "University of Amsterdam" + type: "E" + role_in_source: "education" + + locations_mentioned: + - name: "Amsterdam" + type: "city" + - name: "Netherlands" + type: "country" + + provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on modern LinkedIn profile + formats for demonstration purposes. The person and biographical + details are fictional; the named institutions are real. LinkedIn profiles + represent a modern source type for person-in-context observations, + contrasting with the historical document examples in this module. + + source_context: + platform: "LinkedIn" + data_type: "Modern professional networking profile" + privacy_note: | + When extracting real LinkedIn data, ensure compliance with + LinkedIn Terms of Service, GDPR, and applicable privacy laws. + This synthetic example demonstrates extraction patterns only. 
+ + # --------------------------------------------------------------------------- + # Example 5: Arabic Waqf Document (Endowment Record) + # --------------------------------------------------------------------------- + + arabic_waqf_example: + source_type: "archival_descriptions" + language: "Arabic" + description: | + Example of a waqf (religious endowment) document from an Islamic archive. + Waqf documents record property endowments for religious/charitable purposes + and typically name the founder, beneficiaries, and witnesses. + + source_text: | + بسم الله الرحمن الرحيم + هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة + حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري. وقف جميع داره الكائنة + بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح + الله فعلى فقراء المسلمين. وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف + التركماني، والسيد علي بن حسين الحلبي. وكتب في شهر رجب سنة ألف ومائتين + وخمس وعشرين هجرية. + + [Translation: In the name of God, the Compassionate, the Merciful. + This is what the late al-Hajj Ahmad ibn Muhammad al-'Umari, merchant + in the city of Aleppo, son of the late Muhammad ibn Abdullah al-'Umari, + has endowed, dedicated, and perpetuated. He endowed his entire house + located in the al-Jadida neighborhood for his children and grandchildren, + male and female. If they cease to exist, God forbid, then for the poor + Muslims. Witnessed by: al-Hajj Ibrahim ibn Yusuf al-Turkmani, and + al-Sayyid Ali ibn Husayn al-Halabi. Written in the month of Rajab, + year 1225 Hijri (1810 CE).] 
+ + expected_output: + pico_observation: + observation_id: "waqf_aleppo_1225h_ahmad_umari" + observed_at: "2025-12-12T10:00:00Z" + source_type: "archival_descriptions" + source_reference: "Waqf document, Aleppo, Rajab 1225 AH (1810 CE)" + + persons: + - person_index: 0 + pnv_name: + literalName: "الحاج أحمد بن محمد العمري" + literalName_romanized: "al-Hajj Ahmad ibn Muhammad al-'Umari" + givenName: "أحمد" + givenName_romanized: "Ahmad" + patronym: "محمد" + patronym_romanized: "Muhammad" + baseSurname: "العمري" + baseSurname_romanized: "al-'Umari" + honorificPrefix: "الحاج" + honorificPrefix_romanized: "al-Hajj" + roles: + - role_title: "تاجر" + role_title_romanized: "merchant" + role_in_source: "founder" + biographical: + deceased: true + address: "حلب الشهباء (Aleppo)" + family_relationships: + parent: + - person_index: 1 + target_name: "محمد بن عبد الله العمري" + context: "Waqf founder (واقف)" + + - person_index: 1 + pnv_name: + literalName: "محمد بن عبد الله العمري" + literalName_romanized: "Muhammad ibn Abdullah al-'Umari" + givenName: "محمد" + givenName_romanized: "Muhammad" + patronym: "عبد الله" + patronym_romanized: "Abdullah" + baseSurname: "العمري" + baseSurname_romanized: "al-'Umari" + honorificPrefix: "المرحوم" + honorificPrefix_romanized: "the late" + biographical: + deceased: true + family_relationships: + children: + - person_index: 0 + target_name: "أحمد بن محمد العمري" + context: "Father of the founder" + + - person_index: 2 + pnv_name: + literalName: "الحاج إبراهيم بن يوسف التركماني" + literalName_romanized: "al-Hajj Ibrahim ibn Yusuf al-Turkmani" + givenName: "إبراهيم" + givenName_romanized: "Ibrahim" + patronym: "يوسف" + patronym_romanized: "Yusuf" + baseSurname: "التركماني" + baseSurname_romanized: "al-Turkmani" + honorificPrefix: "الحاج" + honorificPrefix_romanized: "al-Hajj" + roles: + - role_in_source: "witness" + family_relationships: {} + context: "Witness to the endowment" + + - person_index: 3 + pnv_name: + literalName: "السيد علي بن 
حسين الحلبي" + literalName_romanized: "al-Sayyid Ali ibn Husayn al-Halabi" + givenName: "علي" + givenName_romanized: "Ali" + patronym: "حسين" + patronym_romanized: "Husayn" + baseSurname: "الحلبي" + baseSurname_romanized: "al-Halabi" + honorificPrefix: "السيد" + honorificPrefix_romanized: "al-Sayyid" + roles: + - role_in_source: "witness" + family_relationships: {} + context: "Witness to the endowment" + + temporal_references: + - expression: "شهر رجب سنة ألف ومائتين وخمس وعشرين هجرية" + expression_romanized: "month of Rajab, year 1225 Hijri" + normalized: "1810-08" # Approximate Gregorian equivalent + calendar: "Hijri" + type: "DATE" + + locations_mentioned: + - name: "حلب الشهباء" + name_romanized: "Aleppo" + type: "city" + - name: "محلة الجديدة" + name_romanized: "al-Jadida neighborhood" + type: "neighborhood" + + arabic_naming_notes: | + Arabic naming conventions demonstrated: + - ابن/بن (ibn/bin): patronymic "son of" + - الحاج (al-Hajj): honorific for one who completed pilgrimage + - السيد (al-Sayyid): honorific denoting descent from Prophet Muhammad + - المرحوم (al-marhum): "the late" (deceased) + - نسبة (nisba): geographic/tribal surname (العمري - from 'Umar tribe, + التركماني - Turkman origin, الحلبي - from Aleppo) + + provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on standard waqf document formulae + for demonstration purposes. Names, dates, and property details are fictional. + For real examples, see PROVENANCE_SOURCES.md. 
+ + related_real_sources: + - archive: "Cambridge University Library" + collection: "Islamic Manuscripts" + digital_url: "https://cudl.lib.cam.ac.uk/collections/islamic" + document_types: "Waqfiyya, legal documents" + period: "8th-20th century CE" + license: "CC BY-NC 4.0" + + - archive: "University of Pennsylvania Libraries" + collection: "Manuscripts of the Muslim World" + digital_url: "https://openn.library.upenn.edu/html/muslimworld_contents.html" + document_types: "Waqfiyya, Quranic manuscripts, legal documents" + license: "Public Domain / CC0" + + - archive: "Singapore National Heritage Board" + accession_number: "1115401" + digital_url: "https://www.roots.gov.sg/Collection-Landing/listing/1115401" + document_type: "Waqf document" + donor: "Muhammad b. Abd al-Ghani" + properties: "Istanbul (various locations)" + +# ============================================================================= +# EXAMPLE 6: Hebrew Ketubah - Marriage of Mosheh & Rivkah (Mashhad, Iran, 1896) +# ============================================================================= +# +# REAL HISTORICAL DATA from Yale University Beinecke Library +# +# Source type: ketubah (Jewish marriage contract) +# Language: Hebrew/Aramaic +# Date: 23 Elul 5656 AM (September 1, 1896 CE) +# Location: Mashhad, Iran +# Call Number: Hebrew MSS suppl 194 (Broadside) +# +# This is a REAL ketubah with verified provenance from Yale's digital collection. +# The Mashhad Jewish community had a unique history as "crypto-Jews" after +# forced conversion in 1839, making this document culturally significant. +# ============================================================================= + +example_6_hebrew_ketubah: + description: | + A ketubah (כתובה) is a Jewish marriage contract written in Aramaic with Hebrew + elements. This REAL example from Mashhad, Iran demonstrates Persian Jewish + traditions with elaborate decorative elements. 
+ + Historical context: The Jewish community of Mashhad was unique - after forced + conversion to Islam in 1839 (the Allahdad pogrom), many continued practicing + Judaism in secret as "Jadid al-Islam" (new Muslims). By 1896, some families + were more openly practicing Judaism, as evidenced by this elaborate ketubah. + + Key features documented: + - Groom and bride names with patronymics (בן/בת - son/daughter of) + - Persian Jewish artistic traditions (floral patterns, colored rules) + - Hebrew date with month, day, and year from Creation + - Isaiah 61:10 verse as blessing + - Physical dimensions: 53 x 37 cm + + source_text: | + [Note: Full text not transcribed from manuscript. Key readable elements:] + + בס״ד + + שנת חמשת אלפים שש מאות וחמישים ושש לבריאת עולם + עשרים ושלשה לחודש אלול + במשהד + + החתן משה בן משיאח + הכלה רבקה בת יעקב + + [Isaiah 61:10 - visible in decorative header:] + שוש אשיש בה׳ תגל נפשי באלהי כי הלבישני בגדי ישע מעיל צדקה יעטני + כחתן יכהן פאר וככלה תעדה כליה + + source_text_romanized: | + B'siyata d'shmaya (With Heaven's help) + + In the year five thousand six hundred and fifty-six from the Creation of the world, + the twenty-third day of the month of Elul, + in Mashhad. + + The groom: Mosheh son of Mashiah + The bride: Rivkah daughter of Ya'akov + + [Isaiah 61:10 - decorative header blessing:] + "I will greatly rejoice in the LORD, my soul shall be joyful in my God. + For he has clothed me with the garments of salvation, he has covered me + with the robe of righteousness, as a bridegroom decks himself with a garland, + and as a bride adorns herself with her jewels." 
+ + expected_extraction: + description: "Hebrew ketubah extraction from REAL Mashhad, Iran document (1896)" + + pico_observation: + observation_id: "ketubah_mashhad_5656_mosheh_rivkah" + observed_at: "2025-01-13T12:00:00Z" + source_type: "ketubah" + source_reference: "Ketubah, Mashhad, 23 Elul 5656 (September 1, 1896 CE), Yale Beinecke Hebrew MSS suppl 194" + archive: "Yale University, Beinecke Rare Book & Manuscript Library" + + persons: + - person_index: 0 + pnv_name: + literalName: "משה בן משיאח" + literalName_romanized: "Mosheh ben Mashiah" + givenName: "משה" + givenName_romanized: "Mosheh" + patronym: "משיאח" + patronym_romanized: "Mashiah" + roles: + - role_title: "חתן" + role_title_romanized: "chatan" + role_in_source: "groom" + biographical: + sex: "male" + religion: "Jewish" + community: "Mashhad Jewish community (Mashhadis)" + family_relationships: + father: + - person_index: 1 + target_name: "משיאח" + spouse: + - person_index: 2 + target_name: "רבקה בת יעקב" + context: "Groom (חתן) - the bridegroom in the marriage contract" + + - person_index: 1 + pnv_name: + literalName: "משיאח" + literalName_romanized: "Mashiah" + givenName: "משיאח" + givenName_romanized: "Mashiah" + biographical: + sex: "male" + note: "Name meaning 'Messiah' - common Persian Jewish name" + family_relationships: + child: + - person_index: 0 + target_name: "משה" + context: "Father of the groom (implicit from patronymic)" + + - person_index: 2 + pnv_name: + literalName: "רבקה בת יעקב" + literalName_romanized: "Rivkah bat Ya'akov" + givenName: "רבקה" + givenName_romanized: "Rivkah" + givenName_english: "Rebecca" + patronym: "יעקב" + patronym_romanized: "Ya'akov" + roles: + - role_title: "כלה" + role_title_romanized: "kallah" + role_in_source: "bride" + biographical: + sex: "female" + religion: "Jewish" + community: "Mashhad Jewish community (Mashhadis)" + family_relationships: + father: + - person_index: 3 + target_name: "יעקב" + spouse: + - person_index: 0 + target_name: "משה בן משיאח" + 
context: "Bride (כלה) - daughter of Ya'akov" + + - person_index: 3 + pnv_name: + literalName: "יעקב" + literalName_romanized: "Ya'akov" + givenName: "יעקב" + givenName_romanized: "Ya'akov" + givenName_english: "Jacob" + biographical: + sex: "male" + note: "Biblical patriarch name - common in Jewish communities" + family_relationships: + child: + - person_index: 2 + target_name: "רבקה" + context: "Father of the bride (implicit from patronymic)" + + temporal_references: + - expression: "עשרים ושלשה לחודש אלול שנת חמשת אלפים שש מאות וחמישים ושש לבריאת עולם" + expression_romanized: "23rd day of the month of Elul, year 5656 from Creation" + normalized_gregorian: "1896-09-01" + calendar: "Hebrew" + type: "DATE" + components: + day: 23 + month: "אלול (Elul)" + month_number: 6 + year_hebrew: 5656 + year_gregorian: 1896 + era: "לבריאת עולם (from Creation)" + notes: "Elul is the 12th month of the civil year (counted from Tishrei), 6th of the ecclesiastical year (counted from Nisan)" + + locations_mentioned: + - name: "משהד" + name_romanized: "Mashhad" + name_persian: "مشهد" + type: "city" + country: "Iran (then Qajar Persia)" + modern_country: "Iran" + coordinates: "36.2972, 59.6067" + historical_context: | + Mashhad is a major city in northeastern Iran, holy city of Shia Islam + (shrine of Imam Reza). The Jewish community dated to ancient times but + faced forced conversion in 1839. By 1896, some families openly practiced + Judaism while others remained crypto-Jews. 
+ + physical_description: + dimensions: "53 x 37 cm" + material: "ink and paint on paper" + decoration: | + - Red and green rules divide the paper into rectangular sections + - Middle section contains the ketubah text + - Top and sides filled with elaborate arch and floral patterns + - Colors: blue, gold, and silver paint + - Strips of red paper pasted on all four sides as frame + condition: "Some damage to the text containing the Isaiah quote and to the borders" + script: "Hebrew square script" + + hebrew_naming_notes: | + Hebrew/Jewish naming conventions demonstrated in this REAL document: + + 1. PATRONYMIC SYSTEM: + - בן (ben): "son of" - used for males + - בת (bat): "daughter of" - used for females + - Example: משה בן משיאח = "Mosheh son of Mashiah" + + 2. PERSIAN JEWISH NAMES: + - משיאח (Mashiah/Messiah): Common Persian Jewish given name + - רבקה (Rivkah/Rebecca): Biblical matriarch name + - יעקב (Ya'akov/Jacob): Biblical patriarch name + + 3. KETUBAH STRUCTURE: + - Opening: בס״ד (B'siyata d'Shmaya - With Heaven's help) + - Date: Hebrew calendar from Creation (anno mundi) + - Location: City name in Hebrew transliteration + - Parties: Groom (חתן) and Bride (כלה) with patronymics + - Blessing: Often biblical verses (here Isaiah 61:10) + + 4. 
MASHHAD JEWISH CONTEXT: + - Community known as "Mashhadis" or "Jadid al-Islam" + - After 1839 pogrom, many practiced Judaism secretly + - Unique artistic traditions in ketubah decoration + - Persian influences in ornamentation style + + provenance: + data_status: "REAL_HISTORICAL_DATA" + archive: "Yale University, Beinecke Rare Book & Manuscript Library" + collection: "Hebrew Manuscripts Supplement" + call_number: "Hebrew MSS suppl 194 (Broadside)" + catalog_record: "8574921" + object_id: "2067542" + digital_url: "https://digital.library.yale.edu/catalog/2067542" + iiif_manifest: "https://digital.library.yale.edu/manifests/2067542" + pdf_url: "https://digital.library.yale.edu/pdfs/2067542.pdf" + + document_date_hebrew: "23 Elul 5656" + document_date_gregorian: "1896-09-01" + document_place: "Mashhad, Iran" + + contributors: + groom: "Mosheh ben Mashiah" + bride: "Rivkah bat Ya'akov" + + physical_extent: "1 leaf, 53 x 37 cm, color illustrations" + languages: + - "Hebrew" + - "Official Aramaic (700-300 BCE); Imperial Aramaic (700-300 BCE)" + + subjects: + geographic: "Mashhad (Iran) -- Religious life and customs" + topical: + - "Ketubah -- Iran -- Mashhad" + - "Prenuptial agreements (Jewish law)" + + genres: + - "Autographs" + - "Illustrations" + - "Ketubahs" + - "Manuscripts" + - "Marginalia" + + rights: | + The use of this image may be subject to the copyright law of the + United States (Title 17, United States Code) or to site license or + other rights management terms and conditions. The person using the + image is liable for any infringement. + + access_date: "2025-01-13" + + citation: | + "Ketubah : Mashhad, Iran, 1896, September 1," Yale University Library, + Beinecke Rare Book and Manuscript Library, Hebrew MSS suppl 194 (Broadside), + Object ID 2067542. Digital Collections, https://digital.library.yale.edu/catalog/2067542 + (accessed January 13, 2025). 
+ + verification_notes: | + This is a REAL historical document with verified provenance: + - Held at Yale University Beinecke Rare Book & Manuscript Library + - Fully digitized and publicly accessible + - Catalog record #8574921 with complete metadata + - Both principal parties (groom and bride) are named in Yale's catalog + - Physical dimensions and condition documented + - High-resolution images available via IIIF manifest + - Document represents unique Mashhad Jewish community traditions + +# ============================================================================= +# EXAMPLE 7: Spanish Colonial Baptism Record +# ============================================================================= +# Source type: Libro de bautismos (Baptismal register) +# Language: Spanish +# Date: 1742 +# Location: Mexico City, New Spain +# ============================================================================= + +example_7_spanish_colonial_baptism: + description: | + Spanish colonial baptismal records from New Spain (Mexico) include rich + genealogical data with casta (racial/social classification) designations + and compadrazgo (godparent) relationships. These records are invaluable + for tracing both family lineages and social networks. + + Key features: + - Casta designations (español, mestizo, mulato, indio, etc.) + - Legitimacy markers (hijo legítimo vs hijo natural) + - Compadrazgo (godparent relationships) + - Place of origin (vecino de, natural de) + - Ecclesiastical formulae + + source_text: | + En la ciudad de México, a veinte y tres días del mes de febrero de mil + setecientos cuarenta y dos años, yo el Br. Don Antonio de Mendoza, + teniente de cura de esta santa iglesia catedral, bauticé solemnemente, + puse óleo y crisma a Juan José, español, hijo legítimo de Don Pedro + García de la Cruz, español, natural de la villa de Puebla de los Ángeles, + y de Doña María Josefa de los Reyes, española, natural de esta ciudad. 
+ + Fueron sus padrinos Don Francisco Xavier de Castañeda, español, vecino + de esta ciudad, y Doña Ana María de la Encarnación, su legítima esposa, + a quienes advertí el parentesco espiritual y obligaciones que contrajeron. + + Y lo firmé. + Br. Don Antonio de Mendoza + + expected_extraction: + description: "Spanish colonial baptism demonstrating casta system and compadrazgo" + + pico_observation: + observation_id: "bautismo_mexico_1742_juan_jose_garcia" + observed_at: "2025-12-12T12:00:00Z" + source_type: "baptismal_register" + source_reference: "Libro de Bautismos, Catedral de México, 23 Feb 1742" + + persons: + - person_index: 0 + pnv_name: + literalName: "Juan José" + givenName: "Juan José" + roles: + - role_title: "bautizado" + role_in_source: "baptized" + biographical: + casta: "español" + legitimacy: "hijo legítimo" + religion: "Catholic" + family_relationships: + parent: + - person_index: 1 + target_name: "Don Pedro García de la Cruz" + - person_index: 2 + target_name: "Doña María Josefa de los Reyes" + godparent: + - person_index: 3 + target_name: "Don Francisco Xavier de Castañeda" + - person_index: 4 + target_name: "Doña Ana María de la Encarnación" + context: "Infant being baptized" + + - person_index: 1 + pnv_name: + literalName: "Don Pedro García de la Cruz" + givenName: "Pedro" + surnamePrefix: "García de" + baseSurname: "la Cruz" + honorificPrefix: "Don" + biographical: + casta: "español" + origin: "natural de la villa de Puebla de los Ángeles" + family_relationships: + spouse: + - person_index: 2 + target_name: "Doña María Josefa de los Reyes" + children: + - person_index: 0 + target_name: "Juan José" + context: "Father of the baptized child" + + - person_index: 2 + pnv_name: + literalName: "Doña María Josefa de los Reyes" + givenName: "María Josefa" + surnamePrefix: "de" + baseSurname: "los Reyes" + honorificPrefix: "Doña" + biographical: + casta: "española" + origin: "natural de esta ciudad" + family_relationships: + spouse: + - person_index: 1 
+ target_name: "Don Pedro García de la Cruz" + children: + - person_index: 0 + target_name: "Juan José" + context: "Mother of the baptized child" + + - person_index: 3 + pnv_name: + literalName: "Don Francisco Xavier de Castañeda" + givenName: "Francisco Xavier" + surnamePrefix: "de" + baseSurname: "Castañeda" + honorificPrefix: "Don" + roles: + - role_title: "padrino" + role_in_source: "godfather" + biographical: + casta: "español" + residence: "vecino de esta ciudad" + family_relationships: + spouse: + - person_index: 4 + target_name: "Doña Ana María de la Encarnación" + godchildren: + - person_index: 0 + target_name: "Juan José" + compadre: + - person_index: 1 + target_name: "Don Pedro García de la Cruz" + context: "Godfather (padrino)" + + - person_index: 4 + pnv_name: + literalName: "Doña Ana María de la Encarnación" + givenName: "Ana María" + surnamePrefix: "de" + baseSurname: "la Encarnación" + honorificPrefix: "Doña" + roles: + - role_title: "madrina" + role_in_source: "godmother" + biographical: + marital_status: "legítima esposa" + family_relationships: + spouse: + - person_index: 3 + target_name: "Don Francisco Xavier de Castañeda" + godchildren: + - person_index: 0 + target_name: "Juan José" + comadre: + - person_index: 2 + target_name: "Doña María Josefa de los Reyes" + context: "Godmother (madrina)" + + - person_index: 5 + pnv_name: + literalName: "Br. Don Antonio de Mendoza" + givenName: "Antonio" + surnamePrefix: "de" + baseSurname: "Mendoza" + honorificPrefix: "Br. 
Don" + roles: + - role_title: "teniente de cura" + role_in_source: "officiant" + biographical: + ecclesiastical_position: "teniente de cura de esta santa iglesia catedral" + family_relationships: {} + context: "Priest who performed the baptism" + + temporal_references: + - expression: "a veinte y tres días del mes de febrero de mil setecientos cuarenta y dos años" + normalized: "1742-02-23" + calendar: "Gregorian" + type: "DATE" + + locations_mentioned: + - name: "ciudad de México" + type: "city" + administrative_entity: "New Spain" + - name: "santa iglesia catedral" + type: "church" + full_name: "Catedral Metropolitana de la Asunción de la Santísima Virgen María" + - name: "villa de Puebla de los Ángeles" + type: "city" + modern_name: "Puebla" + administrative_entity: "New Spain" + + colonial_naming_notes: | + Spanish colonial naming conventions demonstrated: + - Don/Doña: honorific indicating Spanish (peninsular or criollo) status + - Br. (Bachiller): academic degree, often for clergy + - Casta system: español, mestizo, mulato, indio, etc. + - "natural de": indicates place of birth + - "vecino de": indicates place of residence + - "hijo legítimo": legitimate child (parents married) + - "hijo natural": illegitimate child (parents not married) + - Compadrazgo: godparent relationship creating spiritual kinship + - Padrino/madrina: godfather/godmother + - Compadre/comadre: relationship between godparents and parents + - "parentesco espiritual": spiritual kinship with religious obligations + + provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on standard Spanish colonial + baptismal formulae for demonstration purposes. Names, dates, and + locations are fictional but follow authentic 18th-century patterns. + For real examples, see PROVENANCE_SOURCES.md. 
+ + related_real_sources: + - archive: "Brigham Young University" + collection: "Script Tutorial - Spanish Colonial Baptisms" + digital_url: "https://script.byu.edu/spanish-handwriting/documents/church-records/baptisms" + document_type: "Tutorial with real transcription examples" + license: "Educational use" + + - archive: "FamilySearch" + collection: "Mexico, Yucatán, Catholic Church Records, 1543-1977" + collection_id: "1909116" + digital_url: "https://www.familysearch.org/en/search/collection/1909116" + document_type: "Baptisms, marriages, deaths" + license: "Free with registration" + notes: "Contains some of earliest New World records (from 1543)" + + - archive: "Archivo General de la Nación (AGN)" + location: "Mexico City, Mexico" + collection: "Colonial parish records" + document_type: "Spanish colonial baptismal records" + period: "16th-20th century CE" + languages: "Spanish, Nahuatl, Latin" + +# ============================================================================= +# EXAMPLE 8: Italian Notarial Act (1654 CE, Venice) +# ============================================================================= +# +# Demonstrates extraction from an Italian notarial act showing: +# - Italian naming conventions (patronymic "fu", "quondam") +# - Venetian nobility titles (Nobil Homo, Magnifico) +# - Profession-based surnames (Fabbro, Ferrari) +# - Parish-based location (contrada, sestiere) +# - Compare/comare (godparent equivalents in civil context) +# ============================================================================= + +extraction_examples: + - example_id: "italian_notarial_act" + source_language: "Italian" + source_script: "Latin" + source_period: "1654 CE" + source_type: "notarial_act" + + source_text: | + Adì 15 Marzo 1654, in Venetia. + + Presenti: Il Nobil Homo Messer Giovanni Battista Morosini fu + quondam Magnifico Messer Andrea, della contrada di San Marco, + et sua moglie la Nobil Donna Madonna Caterina Contarini fu + quondam Messer Francesco. 
Testimoni: Messer Pietro fu Paolo + Fabbro, habitante nella contrada di San Polo, et Messer Marco + Antonio Ferrari fu Giovanni, bottegaio in Rialto. Rogato io + Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico + di Venetia. + + expected_output: + pico_observation: + observation_id: "notarial_venice_1654-03-15_morosini" + source_type: "notarial_act" + source_reference: "Notarial act, Venice, March 15, 1654" + + persons: + - person_index: 0 + pnv_name: + literalName: "Il Nobil Homo Messer Giovanni Battista Morosini" + givenName: "Giovanni Battista" + baseSurname: "Morosini" + honorificPrefix: "Il Nobil Homo Messer" + roles: + - role_title: "principal party" + role_in_source: "party to act" + biographical: + social_status: "Venetian nobility" + patronymic: "fu quondam Magnifico Messer Andrea" + father_status: "deceased (quondam)" + family_relationships: + father: + - person_index: 1 + target_name: "Magnifico Messer Andrea Morosini" + spouse: + - person_index: 2 + target_name: "Nobil Donna Madonna Caterina Contarini" + context: "Principal party, Venetian noble" + + - person_index: 1 + pnv_name: + literalName: "Magnifico Messer Andrea Morosini" + givenName: "Andrea" + baseSurname: "Morosini" + honorificPrefix: "Magnifico Messer" + roles: [] + biographical: + social_status: "Venetian nobility" + deceased: true + deceased_marker: "quondam" + family_relationships: + child: + - person_index: 0 + target_name: "Giovanni Battista Morosini" + context: "Father of Giovanni Battista, deceased" + + - person_index: 2 + pnv_name: + literalName: "Nobil Donna Madonna Caterina Contarini" + givenName: "Caterina" + baseSurname: "Contarini" + honorificPrefix: "Nobil Donna Madonna" + roles: + - role_title: "moglie" + role_in_source: "wife" + biographical: + social_status: "Venetian nobility" + patronymic: "fu quondam Messer Francesco" + family_relationships: + father: + - person_index: 3 + target_name: "Messer Francesco Contarini" + spouse: + - person_index: 0 + target_name: 
"Giovanni Battista Morosini" + context: "Wife of Giovanni Battista" + + - person_index: 3 + pnv_name: + literalName: "Messer Francesco Contarini" + givenName: "Francesco" + baseSurname: "Contarini" + honorificPrefix: "Messer" + roles: [] + biographical: + deceased: true + deceased_marker: "quondam" + family_relationships: + child: + - person_index: 2 + target_name: "Caterina Contarini" + context: "Father of Caterina, deceased" + + - person_index: 4 + pnv_name: + literalName: "Messer Pietro fu Paolo Fabbro" + givenName: "Pietro" + baseSurname: "Fabbro" + honorificPrefix: "Messer" + roles: + - role_title: "testimone" + role_in_source: "witness" + biographical: + patronymic: "fu Paolo" + residence: "contrada di San Polo" + family_relationships: + father: + - person_index: 5 + target_name: "Paolo Fabbro" + context: "First witness" + + - person_index: 5 + pnv_name: + literalName: "Paolo Fabbro" + givenName: "Paolo" + baseSurname: "Fabbro" + roles: [] + biographical: + deceased: true + family_relationships: + child: + - person_index: 4 + target_name: "Pietro Fabbro" + context: "Father of witness Pietro, deceased" + + - person_index: 6 + pnv_name: + literalName: "Messer Marco Antonio Ferrari fu Giovanni" + givenName: "Marco Antonio" + baseSurname: "Ferrari" + honorificPrefix: "Messer" + roles: + - role_title: "testimone" + role_in_source: "witness" + biographical: + patronymic: "fu Giovanni" + occupation: "bottegaio" + workplace: "Rialto" + family_relationships: + father: + - person_index: 7 + target_name: "Giovanni Ferrari" + context: "Second witness, shopkeeper" + + - person_index: 7 + pnv_name: + literalName: "Giovanni Ferrari" + givenName: "Giovanni" + baseSurname: "Ferrari" + roles: [] + biographical: + deceased: true + family_relationships: + child: + - person_index: 6 + target_name: "Marco Antonio Ferrari" + context: "Father of witness Marco Antonio, deceased" + + - person_index: 8 + pnv_name: + literalName: "Notaro Antonio Zen fu quondam Messer Giacomo" + 
givenName: "Antonio" + baseSurname: "Zen" + honorificPrefix: "Notaro" + roles: + - role_title: "notaro" + role_in_source: "notary" + biographical: + patronymic: "fu quondam Messer Giacomo" + occupation: "Notaro publico di Venetia" + family_relationships: + father: + - person_index: 9 + target_name: "Messer Giacomo Zen" + context: "Notary who drafted the act" + + - person_index: 9 + pnv_name: + literalName: "Messer Giacomo Zen" + givenName: "Giacomo" + baseSurname: "Zen" + honorificPrefix: "Messer" + roles: [] + biographical: + deceased: true + deceased_marker: "quondam" + family_relationships: + child: + - person_index: 8 + target_name: "Antonio Zen" + context: "Father of notary, deceased" + + temporal_references: + - expression: "Adì 15 Marzo 1654" + normalized: "1654-03-15" + calendar: "Gregorian" + type: "DATE" + + locations_mentioned: + - name: "Venetia" + name_modern: "Venice" + type: "city" + - name: "contrada di San Marco" + type: "parish/district" + parent: "Venice" + - name: "contrada di San Polo" + type: "parish/district" + parent: "Venice" + - name: "Rialto" + type: "district/market" + parent: "Venice" + + italian_naming_notes: | + Italian notarial naming conventions demonstrated: + - "fu" / "quondam": indicates deceased father (Latin survival) + - "Magnifico Messer": high honorific for nobility + - "Nobil Homo" / "Nobil Donna": Venetian noble titles + - "Madonna": honorific for married noble women + - Profession surnames: Fabbro (smith), Ferrari (ironworker) + - "habitante in/nella": residence indicator + - "bottegaio": shopkeeper + - Venetian patronato system reflected in naming + - Contrada: parish neighborhood system of Venice + + provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on authentic 17th-century + Venetian notarial document formulae for demonstration purposes. + Names, dates, and locations are fictional but follow period-accurate + conventions. For real examples, see PROVENANCE_SOURCES.md. 
+ + related_real_sources: + - archive: "Italian Ministry of Culture" + project: "Antenati (Ancestors)" + digital_url: "https://antenati.cultura.gov.it/" + venice_url: "https://antenati.cultura.gov.it/archivio/state-archives-of-venezia/?lang=en" + document_type: "Civil registry, notarial acts, parish records" + period: "15th century+" + license: "Open Access" + + - archive: "University of California Libraries" + collection: "Italian Notarial Documents Collection" + finding_aid: "https://oac.cdlib.org/findaid/ark:%2F13030%2Fc8v412zd" + document_count: "168 documents" + period: "1465-1635 CE" + locations: "Venice, Padua, Verona" + languages: "Latin, Italian (Venetian)" + + - project: "SION-Digit (Sources for the History of Italian Jewish Notarial Documents)" + coverage: "Venice, Bordeaux, Amsterdam" + period: "16th-18th century CE" + focus: "Jewish community notarial acts" + languages: "Italian, Hebrew, Ladino" + +# ============================================================================= +# EXAMPLE 9: Greek Orthodox Parish Register (1875 CE, Thessaloniki) +# ============================================================================= +# +# Demonstrates extraction from a Greek Orthodox baptismal register showing: +# - Greek script with romanization +# - Greek patronymics (του + genitive) +# - Godparent system (νονός/νονά) +# - Orthodox naming conventions +# - Deceased marker (μακαρίτης/μακαρίτισσα) +# ============================================================================= + + - example_id: "greek_baptismal_register" + source_language: "Greek" + source_script: "Greek" + source_period: "1875 CE" + source_type: "baptismal_register" + + source_text: | + Ἐν Θεσσαλονίκῃ, τῇ δεκάτῃ πέμπτῃ Μαρτίου τοῦ ἔτους 1875. + + Ἐβαπτίσθη ὁ Δημήτριος, υἱὸς τοῦ Νικολάου Παπαδοπούλου, + ἐμπόρου, καὶ τῆς νομίμου αὐτοῦ συζύγου Ἑλένης τῆς τοῦ + μακαρίτου Γεωργίου Οἰκονόμου. Νονὸς ὁ Κωνσταντῖνος + Καρατζᾶς τοῦ Ἰωάννου, ἰατρός. Ἱερεύς: ὁ Πρωτοπρεσβύτερος + Ἀθανάσιος Χρυσοστόμου. 
+ + expected_output: + pico_observation: + observation_id: "baptism_thessaloniki_1875-03-15_papadopoulos" + source_type: "baptismal_register" + source_reference: "Greek Orthodox baptismal register, Thessaloniki, March 15, 1875" + + persons: + - person_index: 0 + pnv_name: + literalName: "Δημήτριος" + literalName_romanized: "Dimitrios" + givenName: "Δημήτριος" + givenName_romanized: "Dimitrios" + roles: + - role_title: "βαπτισθείς" + role_in_source: "baptized infant" + biographical: + sex: "male" + religion: "Greek Orthodox" + family_relationships: + father: + - person_index: 1 + target_name: "Νικόλαος Παπαδόπουλος" + mother: + - person_index: 2 + target_name: "Ἑλένη" + godfather: + - person_index: 4 + target_name: "Κωνσταντῖνος Καρατζᾶς" + context: "Baptized infant" + + - person_index: 1 + pnv_name: + literalName: "Νικόλαος Παπαδόπουλος" + literalName_romanized: "Nikolaos Papadopoulos" + givenName: "Νικόλαος" + givenName_romanized: "Nikolaos" + baseSurname: "Παπαδόπουλος" + baseSurname_romanized: "Papadopoulos" + roles: + - role_title: "πατήρ" + role_in_source: "father" + biographical: + occupation: "ἔμπορος (merchant)" + family_relationships: + child: + - person_index: 0 + target_name: "Δημήτριος" + spouse: + - person_index: 2 + target_name: "Ἑλένη" + context: "Father of the baptized, merchant" + + - person_index: 2 + pnv_name: + literalName: "Ἑλένη τῆς τοῦ μακαρίτου Γεωργίου Οἰκονόμου" + literalName_romanized: "Eleni tis tou makaritou Georgiou Oikonomou" + givenName: "Ἑλένη" + givenName_romanized: "Eleni" + roles: + - role_title: "μήτηρ" + role_in_source: "mother" + biographical: + marital_status: "νομίμη σύζυγος (lawful wife)" + patronymic: "τῆς τοῦ μακαρίτου Γεωργίου Οἰκονόμου" + family_relationships: + father: + - person_index: 3 + target_name: "Γεώργιος Οἰκονόμος" + child: + - person_index: 0 + target_name: "Δημήτριος" + spouse: + - person_index: 1 + target_name: "Νικόλαος Παπαδόπουλος" + context: "Mother of the baptized" + + - person_index: 3 + pnv_name: + 
literalName: "μακαρίτης Γεώργιος Οἰκονόμος" + literalName_romanized: "makaritis Georgios Oikonomos" + givenName: "Γεώργιος" + givenName_romanized: "Georgios" + baseSurname: "Οἰκονόμος" + baseSurname_romanized: "Oikonomos" + roles: [] + biographical: + deceased: true + deceased_marker: "μακαρίτης" + family_relationships: + child: + - person_index: 2 + target_name: "Ἑλένη" + context: "Maternal grandfather, deceased" + + - person_index: 4 + pnv_name: + literalName: "Κωνσταντῖνος Καρατζᾶς τοῦ Ἰωάννου" + literalName_romanized: "Konstantinos Karatzas tou Ioannou" + givenName: "Κωνσταντῖνος" + givenName_romanized: "Konstantinos" + baseSurname: "Καρατζᾶς" + baseSurname_romanized: "Karatzas" + roles: + - role_title: "νονός" + role_in_source: "godfather" + biographical: + occupation: "ἰατρός (physician)" + patronymic: "τοῦ Ἰωάννου" + family_relationships: + father: + - person_index: 5 + target_name: "Ἰωάννης Καρατζᾶς" + godchild: + - person_index: 0 + target_name: "Δημήτριος" + context: "Godfather, physician" + + - person_index: 5 + pnv_name: + literalName: "Ἰωάννης Καρατζᾶς" + literalName_romanized: "Ioannis Karatzas" + givenName: "Ἰωάννης" + givenName_romanized: "Ioannis" + baseSurname: "Καρατζᾶς" + baseSurname_romanized: "Karatzas" + roles: [] + biographical: {} + family_relationships: + child: + - person_index: 4 + target_name: "Κωνσταντῖνος Καρατζᾶς" + context: "Father of godfather" + + - person_index: 6 + pnv_name: + literalName: "Πρωτοπρεσβύτερος Ἀθανάσιος Χρυσοστόμου" + literalName_romanized: "Protopresbyteros Athanasios Chrysostomou" + givenName: "Ἀθανάσιος" + givenName_romanized: "Athanasios" + patronymic: "Χρυσοστόμου" + patronymic_romanized: "Chrysostomou" + honorificPrefix: "Πρωτοπρεσβύτερος" + roles: + - role_title: "ἱερεύς" + role_in_source: "priest" + biographical: + ecclesiastical_rank: "Πρωτοπρεσβύτερος (Protopresbyter/Archpriest)" + family_relationships: {} + context: "Officiating priest" + + temporal_references: + - expression: "τῇ δεκάτῃ πέμπτῃ Μαρτίου 
τοῦ ἔτους 1875" + expression_romanized: "ti dekati pempti Martiou tou etous 1875" + normalized: "1875-03-15" + calendar: "Julian" + type: "DATE" + note: "Greek Orthodox used Julian calendar; Gregorian equivalent: March 27, 1875" + + locations_mentioned: + - name: "Θεσσαλονίκη" + name_romanized: "Thessaloniki" + type: "city" + modern_country: "Greece" + historical_context: "Ottoman Empire (Selanik vilayet)" + + greek_naming_notes: | + Greek Orthodox naming conventions demonstrated: + - "τοῦ" + genitive: patronymic marker ("son/daughter of") + - "μακαρίτης/μακαρίτισσα": deceased marker ("the late") + - "νομίμη σύζυγος": lawful wife + - "νονός/νονά": godfather/godmother + - Surnames from occupations: Παπαδόπουλος (priest's son), Οἰκονόμος (steward) + - Ecclesiastical titles: Πρωτοπρεσβύτερος (Archpriest) + - Polytonic Greek orthography common in 19th century + - Julian calendar used by Greek Orthodox Church + + provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on authentic Greek Orthodox + baptismal register formulae for demonstration purposes. Names, dates, + and locations are fictional but follow 19th-century conventions. + For real examples, see PROVENANCE_SOURCES.md. 
+ + related_real_sources: + - archive: "FamilySearch" + wiki_url: "https://www.familysearch.org/en/wiki/Greece_Church_Records" + document_type: "Baptisms, marriages, deaths" + period: "17th century - 1925 CE" + language: "Greek" + license: "Free with registration" + notes: "Greek Orthodox records are primary source before 1925 civil registration" + + - archive: "Γενικά Αρχεία του Κράτους (General State Archives of Greece)" + abbreviation: "GAK" + document_type: "Church records, civil registry, Ottoman-era documents" + period: "15th century - present" + languages: "Greek, Ottoman Turkish" + notes: "National archive with records from all Greek regions" + + - resource: "Greek Ancestry" + coverage: "Village church records guide" + document_type: "Baptismal registers, marriage registers" + notes: "Guides to accessing island and mainland records" + +# ============================================================================= +# EXAMPLE 10: Russian Imperial Metrical Book - Birth of Stefan Nowicki (1894) +# ============================================================================= +# +# REAL HISTORICAL DATA from Archiwum Państwowe w Poznaniu +# +# Demonstrates extraction from a Russian Imperial metrical book showing: +# - Cyrillic script with romanization +# - Polish names recorded in Russian (Congress Poland context) +# - Pre-revolutionary orthography (ъ, ѣ) +# - Julian/Gregorian calendar dual dating +# - Восприемники (godparents/sponsors) +# - Village-level vital records +# +# Source: BYU Script Tutorial - fully transcribed with verification +# ============================================================================= + + - example_id: "russian_metrical_book_osiek_wielki_1894" + source_language: "Russian" + source_script: "Cyrillic (pre-1918 orthography)" + source_period: "1894 CE (Gregorian) / 1893 CE (Julian)" + source_type: "metrical_book" + document_subtype: "birth_record" + + source_text: | + Любины + Состаялосъ въ деревнѣ осѣкъ велькій двадцать седьмаго 
Декабря + /:восьмаго Января:/ тысяча восемьсоть девяносто третяго (четвертаго) года + въ одинадцать часовъ утра Явился Янъ Новицкій /:Jan Nowicki:/ + сорока лѣтъ отъ роду земледѣлецъ изъ Любинъ, въ присутствіи + Францишка Новицкаго сорока лѣтъ, и Михаила Влодарчика + шестидесяти лѣтъ отъ роду, обоихъ земледѣльцевъ изъ Любинъ + и предьявилъ намъ младенца мужскаго пола, объявляя + что онъ родился въ Любинахъ двадцать пятаго Декабря + /:шестаго Января:/ текущаго года, въ четыре часа вечеромъ + отъ законной его жены Маріанны изъ Адамковъ /:Mary- + anny z Adamkow:/ тридцати лѣтъ отъ роду, младенцу + этому при святомъ крещеніи совершенномъ сего + числа дано имя Стефанъ /:Stefan:/ а воспріемниками + его были Войцех Гаудынъ, и Катаржина Гембка. + Актъ сей объявляющему и свидѣтелямъ негра- + мотнымъ прочитанъ нами только подписанъ + Ксндзъ Павелъ Выборскій + + source_text_romanized: | + Lyubiny + Sostoyalos' v derevne Osek Vel'kiy dvadtsat' sed'mago Dekabrya + /:vos'mago Yanvarya:/ tysyacha vosem'sot' devyanosto tret'yago (chetvertago) goda + v odinnadtsat' chasov utra Yavilsya Yan Novitskiy /:Jan Nowicki:/ + soroka let ot rodu zemledelets iz Lyubin, v prisutstvii + Frantsishka Novitskago soroka let, i Mikhaila Vlodarchika + shestidesyati let ot rodu, oboikh zemledeltsev iz Lyubin + i pred'yavil nam mladentsa muzhskago pola, ob'yavlyaya + chto on rodilsya v Lyubinakh dvadtsat' pyatago Dekabrya + /:shestago Yanvarya:/ tekushchago goda, v chetyre chasa vecherom + ot zakonnoy ego zheny Marianny iz Adamkov /:Mary- + anny z Adamkow:/ tridtsati let ot rodu, mladentsu + etomu pri svyatom kreshchenii sovershennom sego + chisla dano imya Stefan /:Stefan:/ a vospriyemnikami + ego byli Voytsekh Gaudyn, i Katarzhina Gembka. 
+ Akt sey ob'yavlyayushchemu i svidetel'yam negra- + motnym prochitan nami tol'ko podpisan + Ksndz Pavel Vyborskiy + + source_text_english: | + Lubin + It happened in the village of Osiek Wielki on the twenty-seventh of December + /:eighth of January:/ in the year one thousand eight hundred ninety-three (four) + at eleven o'clock in the morning. Appeared Jan Nowicki /:Jan Nowicki:/ + forty years of age, farmer from Lubin, in the presence of + Franciszek Nowicki, forty years old, and Michał Włodarczyk + sixty years of age, both farmers from Lubin + and presented to us an infant of the male sex, declaring + that he was born in Lubin on the twenty-fifth of December + /:sixth of January:/ of the current year, at four o'clock in the evening + of his lawful wife Marianna née Adamkow /:Mary- + anna z Adamkow:/ thirty years of age. To this infant, + at the holy baptism performed on this + date, was given the name Stefan /:Stefan:/ and his godparents + were Wojciech Gaudyn and Katarzyna Gembka. + This act, to the declarant and to the illiterate witnesses, + was read by us and only signed. 
+ Priest Paweł Wyborski + + expected_output: + pico_observation: + observation_id: "birth_osiek_wielki_1894_stefan_nowicki" + source_type: "metrical_book" + source_reference: "Akta stanu cywilnego Parafii Rzymskokatolickiej Osiek Wielki, Reference Code 54/792/0/6.1/140, scan 4/76" + archive: "Archiwum Państwowe w Poznaniu Oddział w Koninie" + + persons: + - person_index: 0 + pnv_name: + literalName: "Стефанъ Новицкій" + literalName_romanized: "Stefan Novitskiy" + literalName_polish: "Stefan Nowicki" + givenName: "Стефанъ" + givenName_romanized: "Stefan" + baseSurname: "Новицкій" + baseSurname_romanized: "Novitskiy" + baseSurname_polish: "Nowicki" + roles: + - role_title: "младенецъ" + role_in_source: "infant" + biographical: + sex: "male" + religion: "Roman Catholic" + birth_date_julian: "1893-12-25" + birth_date_gregorian: "1894-01-06" + baptism_date_julian: "1893-12-27" + baptism_date_gregorian: "1894-01-08" + birth_place: "Любины (Lubin)" + birth_time: "4 o'clock in the evening" + family_relationships: + father: + - person_index: 1 + target_name: "Янъ Новицкій" + mother: + - person_index: 2 + target_name: "Маріанна изъ Адамковъ" + godfather: + - person_index: 5 + target_name: "Войцех Гаудынъ" + godmother: + - person_index: 6 + target_name: "Катаржина Гембка" + context: "Newborn infant, subject of the birth registration" + + - person_index: 1 + pnv_name: + literalName: "Янъ Новицкій" + literalName_romanized: "Yan Novitskiy" + literalName_polish: "Jan Nowicki" + givenName: "Янъ" + givenName_romanized: "Yan" + givenName_polish: "Jan" + baseSurname: "Новицкій" + baseSurname_romanized: "Novitskiy" + baseSurname_polish: "Nowicki" + roles: + - role_title: "отецъ" + role_in_source: "father" + - role_title: "объявляющій" + role_in_source: "declarant" + biographical: + sex: "male" + age: 40 + age_expression: "сорока лѣтъ отъ роду" + occupation: "земледѣлецъ (farmer)" + residence: "Любины (Lubin)" + literacy: "illiterate (implied - act read to him)" + family_relationships: 
+ child: + - person_index: 0 + target_name: "Стефанъ Новицкій" + spouse: + - person_index: 2 + target_name: "Маріанна изъ Адамковъ" + possible_relative: + - person_index: 3 + target_name: "Францишекъ Новицкій" + relationship_type: "same surname - possibly brother or cousin" + context: "Father of the infant, farmer from Lubin, appeared to register the birth" + + - person_index: 2 + pnv_name: + literalName: "Маріанна изъ Адамковъ" + literalName_romanized: "Marianna iz Adamkov" + literalName_polish: "Maryanna z Adamkow" + givenName: "Маріанна" + givenName_romanized: "Marianna" + givenName_polish: "Maryanna" + maidenName: "Адамковъ" + maidenName_romanized: "Adamkov" + maidenName_polish: "Adamkow" + roles: + - role_title: "мать" + role_in_source: "mother" + biographical: + sex: "female" + age: 30 + age_expression: "тридцати лѣтъ отъ роду" + marital_status: "законная жена (lawful wife)" + maiden_name_marker: "изъ (née/z)" + family_relationships: + child: + - person_index: 0 + target_name: "Стефанъ Новицкій" + spouse: + - person_index: 1 + target_name: "Янъ Новицкій" + context: "Mother of the infant, lawful wife of Jan Nowicki" + + - person_index: 3 + pnv_name: + literalName: "Францишекъ Новицкій" + literalName_romanized: "Frantsishek Novitskiy" + literalName_polish: "Franciszek Nowicki" + givenName: "Францишекъ" + givenName_romanized: "Frantsishek" + givenName_polish: "Franciszek" + baseSurname: "Новицкій" + baseSurname_romanized: "Novitskiy" + baseSurname_polish: "Nowicki" + roles: + - role_title: "свидѣтель" + role_in_source: "witness" + biographical: + sex: "male" + age: 40 + age_expression: "сорока лѣтъ" + occupation: "земледѣлецъ (farmer)" + residence: "Любины (Lubin)" + literacy: "illiterate (неграмотный)" + family_relationships: + possible_relative: + - person_index: 1 + target_name: "Янъ Новицкій" + relationship_type: "same surname, same age, same village - possibly brother" + context: "First witness, farmer from Lubin, same surname as father" + + - person_index: 
4 + pnv_name: + literalName: "Михаилъ Влодарчикъ" + literalName_romanized: "Mikhail Vlodarchik" + literalName_polish: "Michał Włodarczyk" + givenName: "Михаилъ" + givenName_romanized: "Mikhail" + givenName_polish: "Michał" + baseSurname: "Влодарчикъ" + baseSurname_romanized: "Vlodarchik" + baseSurname_polish: "Włodarczyk" + roles: + - role_title: "свидѣтель" + role_in_source: "witness" + biographical: + sex: "male" + age: 60 + age_expression: "шестидесяти лѣтъ отъ роду" + occupation: "земледѣлецъ (farmer)" + residence: "Любины (Lubin)" + literacy: "illiterate (неграмотный)" + family_relationships: {} + context: "Second witness, farmer from Lubin, age 60" + + - person_index: 5 + pnv_name: + literalName: "Войцех Гаудынъ" + literalName_romanized: "Voytsekh Gaudyn" + literalName_polish: "Wojciech Gaudyn" + givenName: "Войцех" + givenName_romanized: "Voytsekh" + givenName_polish: "Wojciech" + baseSurname: "Гаудынъ" + baseSurname_romanized: "Gaudyn" + baseSurname_polish: "Gaudyn" + roles: + - role_title: "воспріемникъ" + role_in_source: "godfather" + biographical: + sex: "male" + family_relationships: + godchild: + - person_index: 0 + target_name: "Стефанъ Новицкій" + context: "Godfather (baptismal sponsor)" + + - person_index: 6 + pnv_name: + literalName: "Катаржина Гембка" + literalName_romanized: "Katarzhina Gembka" + literalName_polish: "Katarzyna Gembka" + givenName: "Катаржина" + givenName_romanized: "Katarzhina" + givenName_polish: "Katarzyna" + baseSurname: "Гембка" + baseSurname_romanized: "Gembka" + baseSurname_polish: "Gembka" + roles: + - role_title: "воспріемница" + role_in_source: "godmother" + biographical: + sex: "female" + family_relationships: + godchild: + - person_index: 0 + target_name: "Стефанъ Новицкій" + context: "Godmother (baptismal sponsor)" + + - person_index: 7 + pnv_name: + literalName: "Ксндзъ Павелъ Выборскій" + literalName_romanized: "Ksndz Pavel Vyborskiy" + literalName_polish: "Ksiądz Paweł Wyborski" + givenName: "Павелъ" + 
givenName_romanized: "Pavel" + givenName_polish: "Paweł" + baseSurname: "Выборскій" + baseSurname_romanized: "Vyborskiy" + baseSurname_polish: "Wyborski" + honorificPrefix: "Ксндзъ (Priest)" + roles: + - role_title: "ксндзъ" + role_in_source: "priest" + - role_title: "registrar" + role_in_source: "signed the act" + biographical: + sex: "male" + ecclesiastical_status: "Roman Catholic priest" + literacy: "literate (only signer)" + family_relationships: {} + context: "Officiating priest who performed baptism and signed the registration" + + temporal_references: + - expression: "тысяча восемьсоть девяносто третяго (четвертаго) года" + expression_romanized: "tysyacha vosem'sot' devyanosto tret'yago (chetvertago) goda" + normalized_julian: "1893" + normalized_gregorian: "1894" + calendar: "Dual (Julian/Gregorian)" + type: "YEAR" + note: "Document shows both Julian (1893) and Gregorian (1894) years" + - expression: "двадцать седьмаго Декабря /:восьмаго Января:/" + expression_romanized: "dvadtsat' sed'mago Dekabrya /:vos'mago Yanvarya:/" + normalized_julian: "1893-12-27" + normalized_gregorian: "1894-01-08" + calendar: "Dual (Julian/Gregorian)" + type: "DATE" + event: "registration and baptism" + - expression: "двадцать пятаго Декабря /:шестаго Января:/" + expression_romanized: "dvadtsat' pyatago Dekabrya /:shestago Yanvarya:/" + normalized_julian: "1893-12-25" + normalized_gregorian: "1894-01-06" + calendar: "Dual (Julian/Gregorian)" + type: "DATE" + event: "birth" + note: "Born on Christmas Day (Julian calendar)" + - expression: "въ четыре часа вечеромъ" + expression_romanized: "v chetyre chasa vecherom" + normalized: "16:00" + type: "TIME" + event: "birth" + - expression: "въ одинадцать часовъ утра" + expression_romanized: "v odinnadtsat' chasov utra" + normalized: "11:00" + type: "TIME" + event: "registration" + + locations_mentioned: + - name: "Осѣкъ Велькій" + name_romanized: "Osek Vel'kiy" + name_polish: "Osiek Wielki" + type: "village (derevnya)" + modern_location: 
"Greater Poland Voivodeship, Poland" + coordinates: "52.2461, 18.6207" + geonames_url: "https://www.google.com/maps/place/Osiek+Wielki,+Poland" + - name: "Любины" + name_romanized: "Lyubiny" + name_polish: "Lubin" + type: "village" + note: "Village where the family resided and child was born" + - name: "Parafia Rzymskokatolicka Osiek Wielki" + type: "parish" + note: "Roman Catholic Parish of Osiek Wielki - registration authority" + + russian_naming_notes: | + Congress Poland naming conventions demonstrated in this REAL document: + + 1. DUAL SCRIPT NOTATION: + - Polish names recorded in both Russian Cyrillic AND Latin script + - Example: "Янъ Новицкій /:Jan Nowicki:/" + - Slashes and colons mark the Latin/Polish original + + 2. PRE-REVOLUTIONARY ORTHOGRAPHY: + - Hard sign (ъ) after word-final consonants: Янъ, Стефанъ + - Yat (ѣ) instead of е: лѣтъ, деревнѣ, свидѣтелямъ + - -аго/-яго genitive endings (later simplified to -ого/-его) + + 3. POLISH MAIDEN NAME CONVENTION: + - "изъ Адамковъ" = "z Adamkow" = née Adamkow + - "изъ" (from) marks maiden/birth name + + 4. WITNESSES (свидѣтели): + - Two male witnesses required for registration + - Both noted as illiterate (неграмотнымъ) + - Father (declarant) also illiterate - act "read" to them + + 5. CALENDAR SYSTEM: + - Russian Empire used Julian calendar + - Congress Poland (under Russian rule) noted both dates + - 12-day difference in 1893-1894 + - Format: Julian date /:Gregorian date:/ + + 6. GODPARENTS (воспріемники): + - Male: воспріемникъ (godfather) + - Female: воспріемница (godmother) + - Not necessarily from same family as parents + + 7. SOCIAL/OCCUPATIONAL TERMS: + - земледѣлецъ = farmer/agriculturalist + - ксндзъ = ksiądz (Polish title for a priest; cognate with Russian князь, ultimately from Proto-Germanic *kuningaz) + + provenance: + data_status: "REAL_HISTORICAL_DATA" + archive: "Archiwum Państwowe w Poznaniu Oddział w Koninie" + archive_english: "State Archive in Poznań, Konin Branch" + collection: "Akta stanu cywilnego Parafii Rzymskokatolickiej Osiek Wielki (pow. 
kolski)" + collection_english: "Civil Registration Records of the Roman Catholic Parish of Osiek Wielki (Koło district)" + reference_code: "54/792/0/6.1/140" + scan_number: "4 of 76" + document_date_julian: "1893-12-27" + document_date_gregorian: "1894-01-08" + digital_url: "https://szukajwarchiwach.gov.pl" + tutorial_url: "https://script.byu.edu/russian-handwriting/transcription/birth/osiek-wielki-poland/1894" + license: "Public domain (historical document over 100 years old)" + + citation: | + "Akta stanu cywilnego Parafii Rzymskokatolickiej Osiek Wielki (pow. kolski)," + Archiwum Państwowe w Poznaniu Oddział w Koninie, Szukaj w Archiwach + (szukajwarchiwach.gov.pl: accessed 25 January 2023), entry for Stefan Novitsky, + Catholic birth record, 6 January 1894 (Gregorian date), Osiek Wielki, Czołowo, + Koło, Kaliska, Russian Empire, Reference Code 54/792/0/6.1/140, scan no. 4 of 76. + + transcription_source: + institution: "Brigham Young University" + project: "Script Tutorial" + url: "https://script.byu.edu/russian-handwriting/transcription/birth/osiek-wielki-poland/1894" + access_date: "2025-01-13" + notes: "Complete line-by-line transcription with Russian original, romanization, and English translation" + + verification_notes: | + This is a REAL historical document with verified transcription: + - Original held at Polish State Archives (Archiwum Państwowe) + - Transcribed and verified by BYU Script Tutorial paleographers + - All 8 persons are real historical individuals + - Names provided in both Russian Cyrillic and Polish Latin script in original + - Stefan Nowicki born 6 January 1894 (Gregorian) in Lubin village + - Family: farmers (земледѣльцы) in Greater Poland region + - Document context: Congress Poland under Russian Imperial rule + + # --------------------------------------------------------------------------- + # Example 11: Ottoman Turkish Sijill (Court Record) + # --------------------------------------------------------------------------- + # Period: 
1258 AH (1842 CE) + # Source: Şer'iyye Sicili (Sharia Court Register), Demirciköy + # Language: Ottoman Turkish (Arabic script) + # Key features: + # - Honorific titles: Ağa, Efendi, Çelebi, Hatun + # - Patronymics: bin (son of), bint (daughter of) + # - Deceased markers: merhum/merhume (المرحوم/المرحومة) + # - Hijri calendar + # - Mixed Arabic-Turkish vocabulary + # - Court terminology: mahkeme, şahid, mübayi', ba'i + # --------------------------------------------------------------------------- + + - example_id: "ottoman_sijill" + source_language: "Ottoman Turkish" + source_script: "Arabic" + source_period: "1258 AH (1842 CE)" + source_type: "sijill" + document_subtype: "property_sale" + archive_context: "Şer'iyye Sicilleri (Islamic Court Registers)" + + source_text: | + بسم الله الرحمن الرحيم + + مجلس شرع شريفده محمد آغا بن عبد الله مرحوم قصبه دميرجی‌کوی + ساکنلرندن محمد بن احمد افندی و زوجه‌سی فاطمه خاتون بنت علی‌اوغلو + حاضر اولوب محمد آغا طرفندن یکری بش غروش بدل معلوم ایله صاتیلدی + + شهود الحال: حسن افندی بن عمر، ابراهیم چلبی بن مصطفی + + فی اوائل شهر رجب سنة ١٢٥٨ + + source_text_romanized: | + Bismillahirrahmanirrahim + + Meclis-i şer'-i şerifde Mehmed Ağa bin Abdullah merhum kasaba Demirciköy + sakinlerinden Mehmed bin Ahmed Efendi ve zevcesi Fatma Hatun bint Ali-oğlu + hazır olub Mehmed Ağa tarafından yirmi beş guruş bedel-i ma'lum ile satıldı + + Şuhud al-hal: Hasan Efendi bin Ömer, İbrahim Çelebi bin Mustafa + + Fi evail-i şehr-i Receb sene 1258 + + source_text_english: | + In the name of God, the Merciful, the Compassionate + + In the noble Sharia court, Mehmed Ağa son of the late Abdullah, [sold to] + residents of the town of Demirciköy, Mehmed son of Ahmed Efendi and his + wife Fatma Hatun daughter of Ali-oğlu, who were present, for the known + price of twenty-five guruş, [the property] was sold by Mehmed Ağa. 
+ + Witnesses present: Hasan Efendi son of Ömer, İbrahim Çelebi son of Mustafa + + In early Receb of the year 1258 [Hijri] + + expected_output: + pico_observation: + observation_id: "sijill_demircikoy_1258ah_sale" + source_type: "sijill" + source_reference: "Şer'iyye Sicili, Demirciköy, Receb 1258 AH" + + persons: + - person_index: 0 + pnv_name: + literalName: "محمد آغا بن عبد الله" + literalName_romanized: "Mehmed Ağa bin Abdullah" + givenName: "محمد" + givenName_romanized: "Mehmed" + title: "آغا (Ağa)" + patronymic: "بن عبد الله" + patronymic_romanized: "bin Abdullah" + roles: + - role_title: "با‌ئع (ba'i)" + role_in_source: "seller" + biographical: + sex: "male" + status: "deceased" + deceased_marker: "مرحوم (merhum)" + social_rank: "Ağa (military/landowning class)" + family_relationships: + father: + - name: "عبد الله (Abdullah)" + status: "deceased" + context: "Seller (deceased), Ağa = military/landowning" + + - person_index: 1 + pnv_name: + literalName: "محمد بن احمد افندی" + literalName_romanized: "Mehmed bin Ahmed Efendi" + givenName: "محمد" + givenName_romanized: "Mehmed" + title: "افندی (Efendi)" + patronymic: "بن احمد" + patronymic_romanized: "bin Ahmed" + roles: + - role_title: "مشتری (müşteri)" + role_in_source: "buyer" + biographical: + sex: "male" + residence: "Demirciköy" + social_rank: "Efendi (educated class)" + family_relationships: + father: + - name: "احمد (Ahmed)" + spouse: + - person_index: 2 + target_name: "Fatma Hatun" + context: "Buyer, Efendi = literate/administrative" + + - person_index: 2 + pnv_name: + literalName: "فاطمه خاتون بنت علی‌اوغلو" + literalName_romanized: "Fatma Hatun bint Ali-oğlu" + givenName: "فاطمه" + givenName_romanized: "Fatma" + title: "خاتون (Hatun)" + patronymic: "بنت علی‌اوغلو" + patronymic_romanized: "bint Ali-oğlu" + roles: + - role_title: "مشتری (müşteri)" + role_in_source: "buyer" + - role_title: "زوجه (zevce)" + role_in_source: "wife" + biographical: + sex: "female" + marital_status: "married" + social_rank: 
"Hatun (respectable woman)" + family_relationships: + father: + - name: "علی‌اوغلو (Ali-oğlu)" + spouse: + - person_index: 1 + target_name: "Mehmed Efendi" + context: "Wife of buyer, co-purchaser" + + - person_index: 3 + pnv_name: + literalName: "حسن افندی بن عمر" + literalName_romanized: "Hasan Efendi bin Ömer" + givenName: "حسن" + givenName_romanized: "Hasan" + title: "افندی (Efendi)" + patronymic: "بن عمر" + patronymic_romanized: "bin Ömer" + roles: + - role_title: "شاهد (şahid)" + role_in_source: "witness" + biographical: + sex: "male" + social_rank: "Efendi" + family_relationships: + father: + - name: "عمر (Ömer)" + context: "First witness" + + - person_index: 4 + pnv_name: + literalName: "ابراهیم چلبی بن مصطفی" + literalName_romanized: "İbrahim Çelebi bin Mustafa" + givenName: "ابراهیم" + givenName_romanized: "İbrahim" + title: "چلبی (Çelebi)" + patronymic: "بن مصطفی" + patronymic_romanized: "bin Mustafa" + roles: + - role_title: "شاهد (şahid)" + role_in_source: "witness" + biographical: + sex: "male" + social_rank: "Çelebi (gentleman/merchant)" + family_relationships: + father: + - name: "مصطفی (Mustafa)" + context: "Second witness" + + temporal_references: + - expression: "فی اوائل شهر رجب سنة ١٢٥٨" + expression_romanized: "fi evail-i şehr-i Receb sene 1258" + normalized: "1842-07" + calendar: "Hijri" + type: "DATE" + conversion_note: "Receb 1258 AH ≈ July-August 1842 CE" + + locations_mentioned: + - name: "قصبه دميرجی‌کوی" + name_romanized: "kasaba Demirciköy" + type: "town (kasaba)" + - name: "مجلس شرع شريف" + name_romanized: "meclis-i şer'-i şerif" + type: "court" + + ottoman_naming_notes: | + Ottoman Turkish naming conventions: + + HONORIFIC TITLES: + - آغا (Ağa): Military commander, landowner + - افندی (Efendi): Educated person, official + - چلبی (Çelebi): Gentleman, merchant + - خاتون (Hatun): Respectable woman + + PATRONYMIC PATTERNS: + - بن (bin): Son of (Arabic) + - بنت (bint): Daughter of (Arabic) + - اوغلو (-oğlu): Son of (Turkish) + + DECEASED 
MARKERS: + - مرحوم (merhum): The late (man) + - مرحومه (merhume): The late (woman) + + CALENDAR: Hijri lunar (354/355 days) + Receb 1258 AH ≈ July-August 1842 CE + + provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on authentic Ottoman Turkish + sijill (court register) formulae for demonstration purposes. Names, + dates, and locations are fictional but follow authentic 19th-century + patterns. For real examples, see PROVENANCE_SOURCES.md. + + related_real_sources: + - archive: "OpenJerusalem Project" + collection: "Jerusalem Sharia Court Registers" + digital_url: "https://www.openjerusalem.org/" + ark_identifier: "ark:/58142/PfV7b" + volume_count: "102 registers" + period: "1834-1920 CE" + languages: "Ottoman Turkish, Arabic" + license: "Open Access" + document_types: "Property sales, marriage contracts, inheritance, waqf" + + - archive: "İslam Araştırmaları Merkezi (ISAM)" + collection: "Istanbul Kadı Sicilleri" + digital_url: "http://www.kadisicilleri.org/" + volume_count: "40+ volumes online" + document_count: "40,000+ documents" + period: "16th-19th century CE" + language: "Ottoman Turkish" + license: "Research access" + + - archive: "Istanbul Metropolitan Municipality" + project: "History of Istanbul" + digital_url: "https://istanbultarihi.ist/434-istanbul-sharia-court-registers" + volume_count: "~10,000 volumes" + courts: "26 different courts" + period: "1453-1922 CE" + notes: "Largest collection of Ottoman court records in existence" + + - archive: "Harvard University" + project: "Ottoman Court Records Project (OCRP)" + digital_url: "https://cmes.fas.harvard.edu/projects/ocrp" + document_types: "Sijill transcriptions, translations" + period: "16th-19th century CE" + +# ============================================================================= +# END OF MODULE +# ============================================================================= diff --git 
a/data/entity_annotation/modules/integrations/pico/_index.yaml b/data/entity_annotation/modules/integrations/pico/_index.yaml new file mode 100644 index 0000000000..d8485d2978 --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/_index.yaml @@ -0,0 +1,228 @@ +# ============================================================================= +# PiCo Integration Module - Index +# ============================================================================= +# Part of: GLAM-NER Entity Annotation Convention v1.7.0 +# Module: integrations/pico/ +# +# Description: +# PiCo (Persons in Context ontology) integration for person observation modeling. +# Enables tracking provenance of person mentions and linking to formal records. +# +# Key concepts: +# - PersonObservation: A textual mention of a person (source-bound) +# - PersonName (PNV): Structured name components +# - Person (CIDOC-CRM E21): Reconstructed person entity +# +# References: +# - PiCo Ontology: https://w3id.org/pico +# - Person Name Vocabulary (PNV): https://w3id.org/pnv +# - CIDOC-CRM: https://www.cidoc-crm.org/ +# +# Module Structure: +# pico/ +# ├── _index.yaml # This file - module manifest +# ├── schema/ +# │ ├── observation.yaml # Core PiCo observation pattern +# │ ├── pnv_components.yaml # Person Name Vocabulary +# │ ├── relationships.yaml # Family and social relationships +# │ ├── temporal.yaml # Date and calendar systems +# │ └── locations.yaml # Location type definitions +# ├── examples/ +# │ ├── _examples_index.yaml # Examples overview +# │ ├── 01_dutch_marriage.yaml # Example 1: Dutch civil registration +# │ ├── 02_notarial_protocol.yaml +# │ ├── 03_church_baptism.yaml +# │ ├── 04_linkedin_profile.yaml +# │ ├── 05_arabic_waqf.yaml +# │ ├── 06_hebrew_ketubah.yaml # REAL DATA: Yale Mashhad 1896 +# │ ├── 07_spanish_colonial.yaml +# │ ├── 08_italian_notarial.yaml +# │ ├── 09_greek_orthodox.yaml +# │ ├── 10_russian_metrical.yaml # REAL DATA: BYU Osiek 1894 +# │ └── 11_ottoman_sijill.yaml +# 
└── naming_conventions/ +# ├── dutch.yaml # Dutch naming rules +# ├── arabic.yaml # Arabic naming rules +# ├── hebrew.yaml # Hebrew naming rules +# └── ... # Other language conventions +# +# Last Updated: 2025-01-13 +# Version: 1.7.0 +# ============================================================================= + +module: + id: "pico_integration" + name: "PiCo Integration Module" + version: "1.7.0" + parent: "ch_annotator-v1_7_0" + description: | + PiCo (Persons in Context ontology) models textual observations of persons + as distinct from reconstructed person entities. This enables: + - Tracking provenance of person mentions + - Handling name variations across sources + - Linking observations to formal person records + + The observation/reconstruction pattern separates: + 1. What was OBSERVED in text (PersonObservation) - source-bound, exact + 2. What was RECONSTRUCTED as entity (E21_Person) - inferred, normalized + + This is critical for heritage data where the same person may appear with + different name forms, titles, or spellings across sources. 
+ +# ----------------------------------------------------------------------------- +# Module Components +# ----------------------------------------------------------------------------- + +components: + schema: + description: "Core schema definitions for PiCo model" + files: + - path: "schema/observation.yaml" + description: "PersonObservation class and properties" + classes: + - "picom:PersonObservation" + + - path: "schema/pnv_components.yaml" + description: "Person Name Vocabulary (PNV) components" + classes: + - "pnv:PersonName" + + - path: "schema/relationships.yaml" + description: "Family and social relationship types" + properties: + - "sdo:parent" + - "sdo:children" + - "sdo:spouse" + - "sdo:sibling" + - "godparent" + - "witness" + + - path: "schema/temporal.yaml" + description: "Date formats, calendar systems, temporal modeling" + + - path: "schema/locations.yaml" + description: "Location types for biographical data" + + examples: + description: "Complete extraction examples demonstrating PiCo patterns" + index_file: "examples/_examples_index.yaml" + real_data_examples: + - id: "06_hebrew_ketubah" + data_status: "REAL_HISTORICAL_DATA" + source: "Yale University Beinecke Library" + call_number: "Hebrew MSS suppl 194" + + - id: "10_russian_metrical" + data_status: "REAL_HISTORICAL_DATA" + source: "Archiwum Panstwowe w Poznaniu Oddzial w Koninie" + reference: "54/792/0/6.1/140" + + synthetic_examples: + - "01_dutch_marriage" + - "02_notarial_protocol" + - "03_church_baptismal" + - "04_linkedin_profile" + - "05_arabic_waqf" + - "07_spanish_colonial" + - "08_italian_notarial" + - "09_greek_orthodox" + - "11_ottoman_sijill" + + naming_conventions: + description: "Language-specific naming rules and patterns" + files: + - path: "naming_conventions/dutch.yaml" + language: "nl" + covers: ["tussenvoegsels", "patronymics", "sorting rules"] + + - path: "naming_conventions/arabic.yaml" + language: "ar" + covers: ["nasab", "nisba", "kunya", "laqab"] + + - path: 
"naming_conventions/hebrew.yaml" + language: "he" + covers: ["ben/bat patronymics", "ketubah conventions"] + + - path: "naming_conventions/spanish.yaml" + language: "es" + covers: ["double surnames", "colonial titles"] + + - path: "naming_conventions/italian.yaml" + language: "it" + covers: ["notarial conventions", "nobility particles"] + + - path: "naming_conventions/greek.yaml" + language: "el" + covers: ["Orthodox naming", "genitive forms"] + + - path: "naming_conventions/russian.yaml" + language: "ru" + covers: ["patronymics", "metrical book conventions"] + + - path: "naming_conventions/ottoman.yaml" + language: "ota" + covers: ["Ottoman Turkish", "Arabic-Ottoman blend"] + +# ----------------------------------------------------------------------------- +# GLM-4.6 Annotator Configuration +# ----------------------------------------------------------------------------- + +glm_annotator_config: + model: "glm-4.6" + api_endpoint: "https://api.z.ai/api/coding/paas/v4/chat/completions" + temperature: 0.1 + max_tokens: 4000 + system_prompt_file: "schema/observation.yaml" # Contains extraction instructions + +# ----------------------------------------------------------------------------- +# Hypernym Mapping (GLAM-NER v1.7.0) +# ----------------------------------------------------------------------------- + +hypernym_mapping: + description: "How PiCo concepts map to GLAM-NER v1.7.0 hypernyms" + + mappings: + - pico_class: "picom:PersonObservation" + glam_hypernym: "AGT.PER" + note: "Person observations create AGT.PER entities" + + - pico_class: "picom:PersonObservation" + glam_hypernym: "AGT.STF" + condition: "When observed with organizational role" + note: "Staff members with role context" + + - pico_class: "pnv:PersonName" + glam_hypernym: "APP.NAM" + note: "Name strings as appellations" + + - pico_class: "picom:hasRole" + glam_hypernym: "ROL" + note: "Extracted roles link to ROL hypernym" + +# 
----------------------------------------------------------------------------- +# Usage Notes +# ----------------------------------------------------------------------------- + +usage: + loading: | + Since YAML does not have native imports, applications should load + module files individually or use a custom loader. Example: + + ```python + import yaml + from pathlib import Path + + def load_pico_module(base_path: Path) -> dict: + module = {} + module['index'] = yaml.safe_load((base_path / '_index.yaml').read_text(encoding='utf-8')) + module['observation'] = yaml.safe_load((base_path / 'schema/observation.yaml').read_text(encoding='utf-8')) + module['pnv'] = yaml.safe_load((base_path / 'schema/pnv_components.yaml').read_text(encoding='utf-8')) + # ... load other components as needed + return module + ``` + + validation: | + Each YAML file is valid standalone. Validate with: + ```bash + python3 -c "import yaml; yaml.safe_load(open('path/to/file.yaml', encoding='utf-8'))" + ``` diff --git a/data/entity_annotation/modules/integrations/pico/examples/01_dutch_marriage.yaml b/data/entity_annotation/modules/integrations/pico/examples/01_dutch_marriage.yaml new file mode 100644 index 0000000000..1998afb518 --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/examples/01_dutch_marriage.yaml @@ -0,0 +1,285 @@ +# ============================================================================= +# PiCo Example 1: Dutch Marriage Certificate (Burgerlijke Stand) +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/examples/ +# Parent: _examples_index.yaml +# +# DATA STATUS: SYNTHETIC_EXAMPLE +# +# Demonstrates extraction from a Dutch civil registry (Burgerlijke Stand) +# marriage certificate showing: +# - Full family network extraction (8 persons) +# - Dutch naming conventions (tussenvoegsel: "de") +# - Occupation and residence data +# - Witness relationships (siblings of bride/groom) +# - Deceased parent markers ("wijlen") +# +# Language: Dutch 
+# Period: 19th century (1885 CE) +# Source Type: Civil Registration (Burgerlijke Stand) +# +# Last Updated: 2025-12-12 +# ============================================================================= + +example_id: "example_01_dutch_marriage" +example_title: "Dutch Marriage Certificate - Burgerlijke Stand (1885)" +data_status: "SYNTHETIC_EXAMPLE" +source_language: "Dutch" +source_type: "civil_registration" + +description: | + This example demonstrates extraction from a Dutch civil registry (Burgerlijke + Stand) marriage certificate from 1885. The document contains rich genealogical + data including the bride and groom, their parents (living and deceased), and + witnesses who are siblings of the couple. + + Key extraction features: + - 8 persons with full family relationship mapping + - Occupation data (schilder, koopman, timmerman) + - Place of birth and residence + - Deceased parent markers ("wijlen") + - Age at marriage + - Witness-to-party relationships (brothers of bride/groom) + +source_text: | + Heden den elfden November achttien honderd vijf en tachtig, zijn voor ons + Ambtenaar van den Burgerlijken Stand der gemeente Haarlem, verschenen: + Cornelis Johannes Koppen, oud dertig jaren, schilder, geboren te Haarlem, + wonende alhier, meerderjarige zoon van wijlen Pieter Koppen en van + Anna Maria Brouwer, zonder beroep, wonende alhier; + en Anna Maria Visser, oud zeven en twintig jaren, zonder beroep, geboren + te Amsterdam, wonende alhier, meerderjarige dochter van Jan Visser, + koopman, en van wijlen Cornelia de Vries. + + Als getuigen waren tegenwoordig: Hendrik Koppen, oud vijf en dertig jaren, + schilder, broeder van den bruidegom; en Willem Visser, oud twee en dertig + jaren, timmerman, broeder van de bruid. 
+ +expected_extraction: + pico_observation: + observation_id: "bs_haarlem_1885_marriage_321" + observed_at: "2025-12-12T10:00:00Z" + source_type: "civil_registration" + source_reference: "BS Marriage Haarlem, November 11, 1885, certificate 321" + + persons: + - person_index: 0 + pnv_name: + literalName: "Cornelis Johannes Koppen" + givenName: "Cornelis Johannes" + baseSurname: "Koppen" + roles: + - role_title: "schilder" + role_in_source: "groom" + biographical: + age: "30" + birth_place: "Haarlem" + address: "Haarlem" + family_relationships: + parent: + - person_index: 2 + target_name: "Pieter Koppen" + - person_index: 3 + target_name: "Anna Maria Brouwer" + spouse: + - person_index: 1 + target_name: "Anna Maria Visser" + sibling: + - person_index: 6 + target_name: "Hendrik Koppen" + + - person_index: 1 + pnv_name: + literalName: "Anna Maria Visser" + givenName: "Anna Maria" + baseSurname: "Visser" + roles: + - role_in_source: "bride" + biographical: + age: "27" + birth_place: "Amsterdam" + address: "Haarlem" + family_relationships: + parent: + - person_index: 4 + target_name: "Jan Visser" + - person_index: 5 + target_name: "Cornelia de Vries" + spouse: + - person_index: 0 + target_name: "Cornelis Johannes Koppen" + sibling: + - person_index: 7 + target_name: "Willem Visser" + + - person_index: 2 + pnv_name: + literalName: "Pieter Koppen" + givenName: "Pieter" + baseSurname: "Koppen" + biographical: + deceased: true + family_relationships: + children: + - person_index: 0 + target_name: "Cornelis Johannes Koppen" + - person_index: 6 + target_name: "Hendrik Koppen" + spouse: + - person_index: 3 + target_name: "Anna Maria Brouwer" + + - person_index: 3 + pnv_name: + literalName: "Anna Maria Brouwer" + givenName: "Anna Maria" + baseSurname: "Brouwer" + roles: + - role_title: "zonder beroep" + biographical: + address: "Haarlem" + family_relationships: + children: + - person_index: 0 + target_name: "Cornelis Johannes Koppen" + - person_index: 6 + target_name: "Hendrik 
Koppen" + widow_of: + person_index: 2 + target_name: "Pieter Koppen" + + - person_index: 4 + pnv_name: + literalName: "Jan Visser" + givenName: "Jan" + baseSurname: "Visser" + roles: + - role_title: "koopman" + family_relationships: + children: + - person_index: 1 + target_name: "Anna Maria Visser" + - person_index: 7 + target_name: "Willem Visser" + spouse: + - person_index: 5 + target_name: "Cornelia de Vries" + + - person_index: 5 + pnv_name: + literalName: "Cornelia de Vries" + givenName: "Cornelia" + surnamePrefix: "de" + baseSurname: "Vries" + biographical: + deceased: true + family_relationships: + children: + - person_index: 1 + target_name: "Anna Maria Visser" + - person_index: 7 + target_name: "Willem Visser" + spouse: + - person_index: 4 + target_name: "Jan Visser" + + - person_index: 6 + pnv_name: + literalName: "Hendrik Koppen" + givenName: "Hendrik" + baseSurname: "Koppen" + roles: + - role_title: "schilder" + role_in_source: "witness" + biographical: + age: "35" + family_relationships: + sibling: + - person_index: 0 + target_name: "Cornelis Johannes Koppen" + parent: + - person_index: 2 + target_name: "Pieter Koppen" + - person_index: 3 + target_name: "Anna Maria Brouwer" + + - person_index: 7 + pnv_name: + literalName: "Willem Visser" + givenName: "Willem" + baseSurname: "Visser" + roles: + - role_title: "timmerman" + role_in_source: "witness" + biographical: + age: "32" + family_relationships: + sibling: + - person_index: 1 + target_name: "Anna Maria Visser" + parent: + - person_index: 4 + target_name: "Jan Visser" + - person_index: 5 + target_name: "Cornelia de Vries" + + temporal_references: + - expression: "den elfden November achttien honderd vijf en tachtig" + normalized: "1885-11-11" + type: "DATE" + + locations_mentioned: + - name: "Haarlem" + type: "city" + - name: "Amsterdam" + type: "city" + +naming_conventions_notes: | + Dutch civil registration naming conventions demonstrated: + + 1. 
TUSSENVOEGSEL (surname prefix): + - "de Vries" - "de" is the tussenvoegsel + - Lowercase in running text, may be capitalized at start of sentence + - Inherited through family line + + 2. DECEASED MARKER: + - "wijlen" = the late/deceased + - Placed before the full name + + 3. OCCUPATION TERMS: + - "schilder" = painter + - "koopman" = merchant + - "timmerman" = carpenter + - "zonder beroep" = without profession/occupation + + 4. RESIDENCE MARKERS: + - "wonende alhier" = residing here (in the registration municipality) + - "geboren te" = born in + + 5. RELATIONSHIP TERMS: + - "meerderjarige zoon van" = adult son of + - "meerderjarige dochter van" = adult daughter of + - "broeder van" = brother of + +provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on authentic Dutch civil + registry (Burgerlijke Stand) marriage certificate formulae for + demonstration purposes. Names, dates, and locations are fictional + but follow authentic 19th-century patterns. + For real examples, see PROVENANCE_SOURCES.md. 
+ + related_real_sources: + - archive: "Centraal Bureau voor Genealogie (CBG)" + project: "WieWasWie" + digital_url: "https://www.wiewaswie.nl/" + document_type: "Birth, marriage, death certificates" + period: "1811-present (civil); 1600s+ (church)" + language: "Dutch" + license: "Subscription / Free at archives" + + - archive: "Noord-Hollands Archief" + coverage: "Civil registry from 1811, church records from 1600s" + location: "Haarlem, Netherlands" + document_types: "Dutch civil registry records" diff --git a/data/entity_annotation/modules/integrations/pico/examples/02_notarial_protocol.yaml b/data/entity_annotation/modules/integrations/pico/examples/02_notarial_protocol.yaml new file mode 100644 index 0000000000..074c61dd0d --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/examples/02_notarial_protocol.yaml @@ -0,0 +1,263 @@ +# ============================================================================= +# PiCo Example 2: Early Modern Notarial Protocol Index Entry +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/examples/ +# Parent: _examples_index.yaml +# +# DATA STATUS: SYNTHETIC_EXAMPLE +# +# Demonstrates extraction from a 17th-century Dutch notarial protocol showing: +# - Early modern Dutch naming conventions (patronymics: Janszoon, Claesdr) +# - Guardianship (voogd) relationships +# - Orphan identification +# - Notarial act structure +# - Tussenvoegsel patterns (van der) +# +# Language: Early Modern Dutch +# Period: 17th century (1680 CE) +# Source Type: Notarial Archives +# +# Last Updated: 2025-12-12 +# ============================================================================= + +example_id: "example_02_notarial_protocol" +example_title: "Early Modern Notarial Protocol Index Entry (1680)" +data_status: "SYNTHETIC_EXAMPLE" +source_language: "Early Modern Dutch" +source_type: "historical_indices" + +description: | + This example demonstrates 
extraction from an early modern Dutch notarial + protocol index entry from 1680. Notarial protocols are rich sources for + genealogical research, containing contracts, testaments, and guardianship + appointments. + + Key extraction features: + - 9 persons with complex relationships + - Patronymic naming system (Janszoon, Claesdr) + - Guardianship (voogd) relationships + - Orphan children identification + - Deceased parent markers + - Notary and witness identification + - Early modern Dutch occupation terms + +source_text: | + Notarial Archive Amsterdam, inv. 5075/1234 + 30 January 1680 + + Before notary Pieter van der Meer appeared: + Jacob Janszoon van der Hoeven, merchant of this city, + with his wife Maritgen Claes, for themselves and as + guardians (voogden) of the minor children of the late + Claes Jacobsz and Aeltgen Pieters, namely: + - Jan Claeszoon, aged about 16 years + - Trijntgen Claesdr, aged about 12 years + + Witnesses: Hendrick Jansz, baker, and Cornelis Pietersz, + schoolmaster, both of this city. + +expected_extraction: + pico_observation: + observation_id: "na_amsterdam_5075_1234" + observed_at: "2025-12-12T10:00:00Z" + source_type: "historical_indices" + source_reference: "Notarial Archive Amsterdam, inv. 
5075/1234, 30 January 1680" + + persons: + - person_index: 0 + pnv_name: + literalName: "Jacob Janszoon van der Hoeven" + givenName: "Jacob" + patronym: "Janszoon" + surnamePrefix: "van der" + baseSurname: "Hoeven" + roles: + - role_title: "merchant" + role_in_source: "declarant" + - role_title: "voogd" + role_in_source: null + biographical: + address: "Amsterdam" + family_relationships: + spouse: + - person_index: 1 + target_name: "Maritgen Claes" + + - person_index: 1 + pnv_name: + literalName: "Maritgen Claes" + givenName: "Maritgen" + patronym: "Claes" + roles: + - role_in_source: "declarant" + - role_title: "voogd" + family_relationships: + spouse: + - person_index: 0 + target_name: "Jacob Janszoon van der Hoeven" + + - person_index: 2 + pnv_name: + literalName: "Claes Jacobsz" + givenName: "Claes" + patronym: "Jacobsz" + biographical: + deceased: true + family_relationships: + spouse: + - person_index: 3 + target_name: "Aeltgen Pieters" + children: + - person_index: 4 + target_name: "Jan Claeszoon" + - person_index: 5 + target_name: "Trijntgen Claesdr" + + - person_index: 3 + pnv_name: + literalName: "Aeltgen Pieters" + givenName: "Aeltgen" + patronym: "Pieters" + biographical: + deceased: true + family_relationships: + spouse: + - person_index: 2 + target_name: "Claes Jacobsz" + children: + - person_index: 4 + target_name: "Jan Claeszoon" + - person_index: 5 + target_name: "Trijntgen Claesdr" + + - person_index: 4 + pnv_name: + literalName: "Jan Claeszoon" + givenName: "Jan" + patronym: "Claeszoon" + roles: + - role_in_source: "child" + biographical: + age: "about 16" + family_relationships: + parent: + - person_index: 2 + target_name: "Claes Jacobsz" + - person_index: 3 + target_name: "Aeltgen Pieters" + sibling: + - person_index: 5 + target_name: "Trijntgen Claesdr" + + - person_index: 5 + pnv_name: + literalName: "Trijntgen Claesdr" + givenName: "Trijntgen" + patronym: "Claesdr" + roles: + - role_in_source: "child" + biographical: + age: "about 12" + 
gender: "Female" + family_relationships: + parent: + - person_index: 2 + target_name: "Claes Jacobsz" + - person_index: 3 + target_name: "Aeltgen Pieters" + sibling: + - person_index: 4 + target_name: "Jan Claeszoon" + + - person_index: 6 + pnv_name: + literalName: "Pieter van der Meer" + givenName: "Pieter" + surnamePrefix: "van der" + baseSurname: "Meer" + roles: + - role_title: "notary" + + - person_index: 7 + pnv_name: + literalName: "Hendrick Jansz" + givenName: "Hendrick" + patronym: "Jansz" + roles: + - role_title: "baker" + role_in_source: "witness" + biographical: + address: "Amsterdam" + + - person_index: 8 + pnv_name: + literalName: "Cornelis Pietersz" + givenName: "Cornelis" + patronym: "Pietersz" + roles: + - role_title: "schoolmaster" + role_in_source: "witness" + biographical: + address: "Amsterdam" + + temporal_references: + - expression: "30 January 1680" + normalized: "1680-01-30" + type: "DATE" + + locations_mentioned: + - name: "Amsterdam" + type: "city" + +naming_conventions_notes: | + Early modern Dutch naming conventions demonstrated: + + 1. PATRONYMIC SYSTEM: + - Male: -zoon, -szoon, -sz, -z (son of) + Examples: Janszoon, Jacobsz, Jansz, Pietersz + - Female: -dr, -dochter (daughter of) + Examples: Claesdr (= Claesdochter) + - Patronyms derived from father's given name + + 2. TRANSITION TO SURNAMES: + - Some families adopted fixed surnames (van der Hoeven, van der Meer) + - Others still used pure patronymics (Hendrick Jansz) + - Mixed patterns common in this period + + 3. TUSSENVOEGSEL: + - "van der" = from the (+ definite article) + - Often indicates geographic origin + - Hoeven = farmstead/court + - Meer = lake + + 4. GENDERED DIMINUTIVES: + - Female names often end in -gen, -tgen, -tje + - Maritgen, Trijntgen, Aeltgen + - Male names typically unmodified + + 5. 
LEGAL TERMINOLOGY: + - "voogd" (plural: voogden) = guardian + - Used for orphaned minors + - Appointed by family or court + +provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on authentic early modern + notarial protocol index entry formulae for demonstration purposes. + Names, dates, and locations are fictional but follow authentic + 17th-century Dutch notarial patterns. + For real examples, see PROVENANCE_SOURCES.md. + + related_real_sources: + - archive: "Stadsarchief Amsterdam" + collection: "Notarial Archives (Notariële Archieven)" + document_type: "Notarial protocols, contracts, testaments" + period: "1578-1915" + language: "Dutch, Latin" + notes: "Largest notarial archive in the Netherlands" + + - project: "Alle Amsterdamse Akten (crowdsourced indexing of the Amsterdam notarial archives)" + coverage: "Amsterdam notarial indices" + period: "17th-18th century" + notes: "Machine-readable indices to notarial protocols" diff --git a/data/entity_annotation/modules/integrations/pico/examples/03_church_baptism.yaml b/data/entity_annotation/modules/integrations/pico/examples/03_church_baptism.yaml new file mode 100644 index 0000000000..b7a802d312 --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/examples/03_church_baptism.yaml @@ -0,0 +1,202 @@ +# ============================================================================= +# PiCo Example 3: Dutch Church Baptismal Record with Godparents +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/examples/ +# Parent: _examples_index.yaml +# +# DATA STATUS: SYNTHETIC_EXAMPLE +# +# Dutch Reformed Church (Nederlandse Hervormde Kerk) baptismal register entry. +# Demonstrates godparent relationships, Dutch patronymic naming, and +# pre-civil registration church records (DTB - Doop-, Trouw- en Begraafregisters). 
+# +# Language: Dutch (Early Modern) +# Period: 1702 CE +# Source Type: Church baptismal register (DTB) +# +# Last Updated: 2025-12-12 +# ============================================================================= + +example_id: "example_03_church_baptism" +example_title: "Dutch Church Baptismal Record with Godparents (1702)" +data_status: "SYNTHETIC_EXAMPLE" +source_language: "Dutch" +source_type: "church_records" + +description: | + Example of a Dutch Reformed Church (Nederlandse Hervormde Kerk) baptismal + register entry demonstrating: + - Godparent (getuigen) relationships creating spiritual kinship + - Dutch patronymic naming conventions (Hendriksen, Jans, Anthonisz) + - Toponymic surname prefix (surnamePrefix: van) + - Honorific titles (E. Heer, Juffrou) + - Pre-civil registration church records (before 1811) + +source_text: | + Den 15en Meij 1702 is gedoopt + Johanna, dochter van Willem Hendriksen en Geertruijd Jans, + getuijgen waren de E. Heer Jan Willem van Beverwijck + ende Juffrou Maria van Loon, huijsvrouw van de heer + Pieter Anthonisz Verschoor. 
+ +expected_extraction: + pico_observation: + observation_id: "dtb_amsterdam_1702_baptism_johanna" + observed_at: "2025-12-12T10:00:00Z" + source_type: "church_records" + source_reference: "DTB Amsterdam, 15 May 1702" + + persons: + - person_index: 0 + pnv_name: + literalName: "Johanna" + givenName: "Johanna" + roles: + - role_in_source: "child" + biographical: + gender: "Female" + family_relationships: + parent: + - person_index: 1 + target_name: "Willem Hendriksen" + - person_index: 2 + target_name: "Geertruijd Jans" + godparent: + - person_index: 3 + target_name: "Jan Willem van Beverwijck" + - person_index: 4 + target_name: "Maria van Loon" + + - person_index: 1 + pnv_name: + literalName: "Willem Hendriksen" + givenName: "Willem" + patronym: "Hendriksen" + biographical: + gender: "Male" + family_relationships: + children: + - person_index: 0 + target_name: "Johanna" + spouse: + - person_index: 2 + target_name: "Geertruijd Jans" + + - person_index: 2 + pnv_name: + literalName: "Geertruijd Jans" + givenName: "Geertruijd" + patronym: "Jans" + biographical: + gender: "Female" + family_relationships: + children: + - person_index: 0 + target_name: "Johanna" + spouse: + - person_index: 1 + target_name: "Willem Hendriksen" + + - person_index: 3 + pnv_name: + literalName: "Jan Willem van Beverwijck" + givenName: "Jan Willem" + surnamePrefix: "van" + baseSurname: "Beverwijck" + honorificPrefix: "de E. 
Heer" + roles: + - role_in_source: "witness" + biographical: + gender: "Male" + family_relationships: + godchild: + - person_index: 0 + target_name: "Johanna" + + - person_index: 4 + pnv_name: + literalName: "Maria van Loon" + givenName: "Maria" + surnamePrefix: "van" + baseSurname: "Loon" + honorificPrefix: "Juffrou" + roles: + - role_in_source: "witness" + biographical: + gender: "Female" + family_relationships: + godchild: + - person_index: 0 + target_name: "Johanna" + spouse: + - person_index: 5 + target_name: "Pieter Anthonisz Verschoor" + + - person_index: 5 + pnv_name: + literalName: "Pieter Anthonisz Verschoor" + givenName: "Pieter" + patronym: "Anthonisz" + baseSurname: "Verschoor" + honorificPrefix: "de heer" + biographical: + gender: "Male" + family_relationships: + spouse: + - person_index: 4 + target_name: "Maria van Loon" + + temporal_references: + - expression: "Den 15en Meij 1702" + normalized: "1702-05-15" + type: "DATE" + +naming_conventions_notes: | + Dutch naming conventions demonstrated in this example: + + PATRONYMICS: + - Hendriksen: son of Hendrik (-sen = son) + - Jans: daughter/child of Jan (feminine form without -sen common for women) + - Anthonisz: son of Anthonis (-z = zoon = son, abbreviated) + + TOPONYMIC SURNAME PREFIX: + - "van" prefix: means "from/of" a place; it does not by itself indicate nobility + - "van Beverwijck": from the Beverwijck region + - "van Loon": from the Loon region (Limburg) + + HONORIFIC TITLES: + - "de E. 
Heer": de Eerbare Heer (the Honorable Sir) - used for gentlemen + - "Juffrou": Juffrouw (Miss/Madam) - used for unmarried or married respectable women + - "de heer": (the mister) - standard respectful address + + GODPARENT TERMINOLOGY: + - "getuijgen": witnesses (in baptismal context = godparents) + - Godparents created spiritual kinship (geestelijke verwantschap) + + PRE-CIVIL REGISTRATION: + - DTB records (Doop-, Trouw- en Begraafregisters) were church records + - Civil registration (Burgerlijke Stand) started in Netherlands in 1811 + - Before 1811, churches maintained vital records + +provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on authentic Dutch Reformed + Church (Nederlandse Hervormde Kerk) baptismal register formulae for + demonstration purposes. Names, dates, and locations are fictional + but follow authentic early 18th-century patterns. + For real examples, see PROVENANCE_SOURCES.md. + + related_real_sources: + - archive: "Various Dutch Regional Archives" + collection: "Doop-, Trouw- en Begraafregisters (DTB)" + document_type: "Church baptism, marriage, burial records" + period: "1600s-1811 (before civil registration)" + language: "Dutch" + notes: "Pre-1811 vital records maintained by churches" + + - archive: "FamilySearch" + collection: "Netherlands, Church Records" + wiki_url: "https://www.familysearch.org/en/wiki/Netherlands_Church_Records" + document_type: "Dutch church baptisms" + license: "Free with registration" diff --git a/data/entity_annotation/modules/integrations/pico/examples/04_linkedin_profile.yaml b/data/entity_annotation/modules/integrations/pico/examples/04_linkedin_profile.yaml new file mode 100644 index 0000000000..beb66dc6bf --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/examples/04_linkedin_profile.yaml @@ -0,0 +1,146 @@ +# ============================================================================= +# PiCo Example 4: Modern LinkedIn Staff Profile +# 
============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/examples/ +# Parent: _examples_index.yaml +# +# DATA STATUS: SYNTHETIC_EXAMPLE +# +# Demonstrates modern digital source extraction, contrasting with historical +# document examples. Shows heritage sector professional career tracking. +# +# Language: English +# Period: Contemporary (2025) +# Source Type: Modern digital (LinkedIn profile) +# +# Last Updated: 2025-12-12 +# ============================================================================= + +example_id: "example_04_linkedin_profile" +example_title: "Modern LinkedIn Staff Profile - Heritage Professional" +data_status: "SYNTHETIC_EXAMPLE" +source_language: "English" +source_type: "modern_digital" + +description: | + Example of a modern LinkedIn profile for a heritage sector professional. + Demonstrates PiCo extraction patterns for contemporary digital sources, + contrasting with historical document examples. + + Key features: + - Modern professional networking profile format + - Career trajectory across heritage institutions + - Educational background with dates + - Dutch naming conventions in modern context (van den Berg) + - GLAMORCUBESFIXPHDNT heritage type classification + +source_text: | + Dr. Maria van den Berg + Director of Collections | Rijksmuseum + Amsterdam, Netherlands + + About: + Leading the collections management team at the Rijksmuseum since 2018. + Previously Head Curator at the Van Gogh Museum (2012-2018). + PhD in Art History, University of Amsterdam. 
+ + Experience: + - Director of Collections, Rijksmuseum (2018-present) + - Head Curator, Van Gogh Museum (2012-2018) + - Assistant Curator, Stedelijk Museum (2008-2012) + + Education: + - PhD Art History, University of Amsterdam (2008) + - MA Museum Studies, University of Amsterdam (2003) + +expected_extraction: + pico_observation: + observation_id: "linkedin_maria_van_den_berg_2025" + observed_at: "2025-12-12T10:00:00Z" + source_type: "modern_digital" + source_reference: "https://linkedin.com/in/mariavandenberg" + + persons: + - person_index: 0 + pnv_name: + literalName: "Dr. Maria van den Berg" + givenName: "Maria" + surnamePrefix: "van den" + baseSurname: "Berg" + honorificPrefix: "Dr." + roles: + - role_title: "Director of Collections" + organization: "Rijksmuseum" + period: "2018-present" + heritage_relevant: true + heritage_type: "M" + - role_title: "Head Curator" + organization: "Van Gogh Museum" + period: "2012-2018" + heritage_relevant: true + heritage_type: "M" + - role_title: "Assistant Curator" + organization: "Stedelijk Museum" + period: "2008-2012" + heritage_relevant: true + heritage_type: "M" + biographical: + address: "Amsterdam, Netherlands" + family_relationships: {} + context: "Heritage sector professional with museum career" + + organizations_mentioned: + - name: "Rijksmuseum" + type: "M" + role_in_source: "employer" + - name: "Van Gogh Museum" + type: "M" + role_in_source: "employer" + - name: "Stedelijk Museum" + type: "M" + role_in_source: "employer" + - name: "University of Amsterdam" + type: "E" + role_in_source: "education" + + locations_mentioned: + - name: "Amsterdam" + type: "city" + - name: "Netherlands" + type: "country" + +naming_conventions_notes: | + Modern Dutch naming conventions demonstrated: + + SURNAME PREFIX: + - "van den" is a tussenvoegsel (insertion) common in Dutch surnames + - In alphabetical sorting, Dutch convention uses the base surname: "Berg, Maria van den" + - In formal address: "Dr. 
Van den Berg" (tussenvoegsel capitalized when no given name or initials precede it) + - With a given name: "Dr. Maria van den Berg" (lowercase tussenvoegsel) + + ACADEMIC TITLE: + - "Dr." indicates doctorate (PhD) - placed before name + - In the Netherlands, this is an academic degree, not a medical title (physicians use "arts") + + CONTRAST WITH HISTORICAL EXAMPLES: + - LinkedIn profiles are self-authored (emic) descriptions; historical records were typically written by third parties (etic) + - Structured data format vs. narrative historical documents + - Self-reported information vs. third-party recording + - Modern standardized naming vs. evolving historical conventions + +provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on modern LinkedIn profile + formats for demonstration purposes. The person and biographical details + are fictional; the institutions named are real. LinkedIn profiles + represent a modern source type for person-in-context observations, + contrasting with the historical document examples in this module. + + source_context: + platform: "LinkedIn" + data_type: "Modern professional networking profile" + privacy_note: | + When extracting real LinkedIn data, ensure compliance with + LinkedIn Terms of Service, GDPR, and applicable privacy laws. + This synthetic example demonstrates extraction patterns only. 
diff --git a/data/entity_annotation/modules/integrations/pico/examples/05_arabic_waqf.yaml b/data/entity_annotation/modules/integrations/pico/examples/05_arabic_waqf.yaml new file mode 100644 index 0000000000..46ea8203cd --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/examples/05_arabic_waqf.yaml @@ -0,0 +1,215 @@ +# ============================================================================= +# PiCo Example 5: Arabic Waqf Document (Endowment Record) +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/examples/ +# Parent: _examples_index.yaml +# +# DATA STATUS: SYNTHETIC_EXAMPLE +# +# Example of a waqf (religious endowment) document from an Islamic archive. +# Waqf documents record property endowments for religious/charitable purposes +# and typically name the founder, beneficiaries, and witnesses. +# +# Language: Arabic +# Period: 1225 AH (1810 CE) +# Source Type: Archival descriptions (waqfiyya) +# +# Last Updated: 2025-12-12 +# ============================================================================= + +example_id: "example_05_arabic_waqf" +example_title: "Arabic Waqf Document - Aleppo Endowment (1810 CE)" +data_status: "SYNTHETIC_EXAMPLE" +source_language: "Arabic" +source_type: "archival_descriptions" + +description: | + Example of a waqf (وقف) document from an Islamic archive. Waqf documents + record property endowments for religious/charitable purposes and typically + name the founder (واقف), beneficiaries, and witnesses. + + Key features demonstrated: + - Arabic patronymic system (ابن/بن - ibn/bin = son of) + - Honorific titles (الحاج, السيد) + - Nisba (geographic/tribal surnames) + - Deceased markers (المرحوم) + - Hijri calendar dating + - Romanization alongside Arabic script + +source_text: | + بسم الله الرحمن الرحيم + هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة + حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري. 
وقف جميع داره الكائنة + بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح + الله فعلى فقراء المسلمين. وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف + التركماني، والسيد علي بن حسين الحلبي. وكتب في شهر رجب سنة ألف ومائتين + وخمس وعشرين هجرية. + +source_text_english: | + In the name of God, the Compassionate, the Merciful. + This is what the late al-Hajj Ahmad ibn Muhammad al-'Umari, merchant + in the city of Aleppo, son of the late Muhammad ibn Abdullah al-'Umari, + has endowed, dedicated, and perpetuated. He endowed his entire house + located in the al-Jadida neighborhood for his children and grandchildren, + male and female. If they cease to exist, God forbid, then for the poor + Muslims. Witnessed by: al-Hajj Ibrahim ibn Yusuf al-Turkmani, and + al-Sayyid Ali ibn Husayn al-Halabi. Written in the month of Rajab, + year 1225 Hijri (1810 CE). + +expected_extraction: + pico_observation: + observation_id: "waqf_aleppo_1225h_ahmad_umari" + observed_at: "2025-12-12T10:00:00Z" + source_type: "archival_descriptions" + source_reference: "Waqf document, Aleppo, Rajab 1225 AH (1810 CE)" + + persons: + - person_index: 0 + pnv_name: + literalName: "الحاج أحمد بن محمد العمري" + literalName_romanized: "al-Hajj Ahmad ibn Muhammad al-'Umari" + givenName: "أحمد" + givenName_romanized: "Ahmad" + patronym: "محمد" + patronym_romanized: "Muhammad" + baseSurname: "العمري" + baseSurname_romanized: "al-'Umari" + honorificPrefix: "الحاج" + honorificPrefix_romanized: "al-Hajj" + roles: + - role_title: "تاجر" + role_title_romanized: "merchant" + role_in_source: "founder" + biographical: + deceased: true + address: "حلب الشهباء (Aleppo)" + family_relationships: + parent: + - person_index: 1 + target_name: "محمد بن عبد الله العمري" + context: "Waqf founder (واقف)" + + - person_index: 1 + pnv_name: + literalName: "محمد بن عبد الله العمري" + literalName_romanized: "Muhammad ibn Abdullah al-'Umari" + givenName: "محمد" + givenName_romanized: "Muhammad" + patronym: "عبد الله" + 
patronym_romanized: "Abdullah" + baseSurname: "العمري" + baseSurname_romanized: "al-'Umari" + honorificPrefix: "المرحوم" + honorificPrefix_romanized: "the late" + biographical: + deceased: true + family_relationships: + children: + - person_index: 0 + target_name: "أحمد بن محمد العمري" + context: "Father of the founder" + + - person_index: 2 + pnv_name: + literalName: "الحاج إبراهيم بن يوسف التركماني" + literalName_romanized: "al-Hajj Ibrahim ibn Yusuf al-Turkmani" + givenName: "إبراهيم" + givenName_romanized: "Ibrahim" + patronym: "يوسف" + patronym_romanized: "Yusuf" + baseSurname: "التركماني" + baseSurname_romanized: "al-Turkmani" + honorificPrefix: "الحاج" + honorificPrefix_romanized: "al-Hajj" + roles: + - role_in_source: "witness" + family_relationships: {} + context: "Witness to the endowment" + + - person_index: 3 + pnv_name: + literalName: "السيد علي بن حسين الحلبي" + literalName_romanized: "al-Sayyid Ali ibn Husayn al-Halabi" + givenName: "علي" + givenName_romanized: "Ali" + patronym: "حسين" + patronym_romanized: "Husayn" + baseSurname: "الحلبي" + baseSurname_romanized: "al-Halabi" + honorificPrefix: "السيد" + honorificPrefix_romanized: "al-Sayyid" + roles: + - role_in_source: "witness" + family_relationships: {} + context: "Witness to the endowment" + + temporal_references: + - expression: "شهر رجب سنة ألف ومائتين وخمس وعشرين هجرية" + expression_romanized: "month of Rajab, year 1225 Hijri" + normalized: "1810-08" + calendar: "Hijri" + type: "DATE" + + locations_mentioned: + - name: "حلب الشهباء" + name_romanized: "Aleppo" + type: "city" + - name: "محلة الجديدة" + name_romanized: "al-Jadida neighborhood" + type: "neighborhood" + +arabic_naming_notes: | + Arabic naming conventions demonstrated: + + PATRONYMICS: + - ابن/بن (ibn/bin): "son of" - connects given name to father's name + - Full chain: Ahmad ibn Muhammad ibn Abdullah = Ahmad son of Muhammad son of Abdullah + + HONORIFIC TITLES: + - الحاج (al-Hajj): honorific for one who completed the Hajj 
pilgrimage to Mecca + - السيد (al-Sayyid): honorific denoting descent from Prophet Muhammad + - المرحوم (al-marhum): "the late" - marker for deceased person (masculine) + - المرحومة (al-marhuma): "the late" - feminine form + + NISBA (نسبة): + Geographic or tribal surname indicating origin: + - العمري (al-'Umari): descendant of 'Umar or from 'Umar tribe + - التركماني (al-Turkmani): of Turkman origin + - الحلبي (al-Halabi): from Aleppo (حلب = Halab) + + WAQF TERMINOLOGY: + - واقف (waqif): founder/endower + - وقف (waqf): the endowment itself + - شهود (shuhud): witnesses + + HIJRI CALENDAR: + - رجب (Rajab): 7th month of Islamic lunar calendar + - سنة هجرية: Hijri year (from Prophet's migration to Medina, 622 CE) + +provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on standard waqf document formulae + for demonstration purposes. Names, dates, and property details are fictional. + For real examples, see PROVENANCE_SOURCES.md. + + related_real_sources: + - archive: "Cambridge University Library" + collection: "Islamic Manuscripts" + digital_url: "https://cudl.lib.cam.ac.uk/collections/islamic" + document_types: "Waqfiyya, legal documents" + period: "8th-20th century CE" + license: "CC BY-NC 4.0" + + - archive: "University of Pennsylvania Libraries" + collection: "Manuscripts of the Muslim World" + digital_url: "https://openn.library.upenn.edu/html/muslimworld_contents.html" + document_types: "Waqfiyya, Quranic manuscripts, legal documents" + license: "Public Domain / CC0" + + - archive: "Singapore National Heritage Board" + accession_number: "1115401" + digital_url: "https://www.roots.gov.sg/Collection-Landing/listing/1115401" + document_type: "Waqf document" + donor: "Muhammad b. 
Abd al-Ghani" + properties: "Istanbul (various locations)" diff --git a/data/entity_annotation/modules/integrations/pico/examples/06_hebrew_ketubah.yaml b/data/entity_annotation/modules/integrations/pico/examples/06_hebrew_ketubah.yaml new file mode 100644 index 0000000000..915e57fab8 --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/examples/06_hebrew_ketubah.yaml @@ -0,0 +1,325 @@ +# ============================================================================= +# PiCo Example 6: Hebrew Ketubah - Marriage of Mosheh & Rivkah +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/examples/ +# Parent: _examples_index.yaml +# +# DATA STATUS: REAL HISTORICAL DATA +# +# Source: Yale University Beinecke Rare Book & Manuscript Library +# Call Number: Hebrew MSS suppl 194 (Broadside) +# Object ID: 2067542 +# Document Date: 23 Elul 5656 AM (September 1, 1896 CE) +# Location: Mashhad, Iran +# +# This is a REAL ketubah with verified provenance from Yale's digital collection. +# The Mashhad Jewish community had a unique history as "crypto-Jews" after +# forced conversion in 1839, making this document culturally significant. +# +# Last Updated: 2025-12-12 +# ============================================================================= + +example_id: "example_06_hebrew_ketubah" +example_title: "Hebrew Ketubah - Marriage of Mosheh & Rivkah (Mashhad, Iran, 1896)" +data_status: "REAL_HISTORICAL_DATA" +source_language: "Hebrew/Aramaic" +source_script: "Hebrew square script" + +# ----------------------------------------------------------------------------- +# Document Description +# ----------------------------------------------------------------------------- + +description: | + A ketubah is a Jewish marriage contract written in Aramaic with Hebrew + elements. This REAL example from Mashhad, Iran demonstrates Persian Jewish + traditions with elaborate decorative elements. 
+ + Historical context: The Jewish community of Mashhad was unique - after forced + conversion to Islam in 1839 (the Allahdad pogrom), many continued practicing + Judaism in secret as "Jadid al-Islam" (new Muslims). By 1896, some families + were more openly practicing Judaism, as evidenced by this elaborate ketubah. + + Key features documented: + - Groom and bride names with patronymics (ben/bat - son/daughter of) + - Persian Jewish artistic traditions (floral patterns, colored rules) + - Hebrew date with month, day, and year from Creation + - Isaiah 61:10 verse as blessing + - Physical dimensions: 53 x 37 cm + +# ----------------------------------------------------------------------------- +# Source Text +# ----------------------------------------------------------------------------- + +source_text: + note: "Full text not transcribed from manuscript. Key readable elements provided." + + hebrew_text: | + בס״ד + + שנת חמשת אלפים שש מאות וחמישים ושש לבריאת עולם + עשרים ושלשה לחודש אלול + במשהד + + החתן משה בן משיאח + הכלה רבקה בת יעקב + + שוש אשיש בה׳ תגל נפשי באלהי כי הלבישני בגדי ישע מעיל צדקה יעטני + כחתן יכהן פאר וככלה תעדה כליה + + romanized_text: | + B'siyata d'shmaya (With Heaven's help) + + In the year five thousand six hundred and fifty-six from the Creation of the world, + the twenty-third day of the month of Elul, + in Mashhad. + + The groom: Mosheh son of Mashiah + The bride: Rivkah daughter of Ya'akov + + [Isaiah 61:10 - decorative header blessing:] + "I will greatly rejoice in the LORD, my soul shall be joyful in my God. + For he has clothed me with the garments of salvation, he has covered me + with the robe of righteousness, as a bridegroom decks himself with a garland, + and as a bride adorns herself with her jewels." 
+ +# ----------------------------------------------------------------------------- +# Expected Extraction Output +# ----------------------------------------------------------------------------- + +expected_extraction: + pico_observation: + observation_id: "ketubah_mashhad_5656_mosheh_rivkah" + observed_at: "2025-01-13T12:00:00Z" + source_type: "ketubah" + source_reference: "Ketubah, Mashhad, 23 Elul 5656 (September 1, 1896 CE), Yale Beinecke Hebrew MSS suppl 194" + archive: "Yale University, Beinecke Rare Book & Manuscript Library" + + persons: + # Person 0: Groom + - person_index: 0 + pnv_name: + literalName: "משה בן משיאח" + literalName_romanized: "Mosheh ben Mashiah" + givenName: "משה" + givenName_romanized: "Mosheh" + patronym: "משיאח" + patronym_romanized: "Mashiah" + roles: + - role_title: "חתן" + role_title_romanized: "chatan" + role_in_source: "groom" + biographical: + sex: "male" + religion: "Jewish" + community: "Mashhad Jewish community (Mashhadis)" + family_relationships: + father: + - person_index: 1 + target_name: "משיאח" + spouse: + - person_index: 2 + target_name: "רבקה בת יעקב" + context: "Groom (chatan) - the bridegroom in the marriage contract" + + # Person 1: Father of Groom + - person_index: 1 + pnv_name: + literalName: "משיאח" + literalName_romanized: "Mashiah" + givenName: "משיאח" + givenName_romanized: "Mashiah" + biographical: + sex: "male" + note: "Name meaning 'Messiah' - common Persian Jewish name" + family_relationships: + child: + - person_index: 0 + target_name: "משה" + context: "Father of the groom (implicit from patronymic)" + + # Person 2: Bride + - person_index: 2 + pnv_name: + literalName: "רבקה בת יעקב" + literalName_romanized: "Rivkah bat Ya'akov" + givenName: "רבקה" + givenName_romanized: "Rivkah" + givenName_english: "Rebecca" + patronym: "יעקב" + patronym_romanized: "Ya'akov" + roles: + - role_title: "כלה" + role_title_romanized: "kallah" + role_in_source: "bride" + biographical: + sex: "female" + religion: "Jewish" + 
community: "Mashhad Jewish community (Mashhadis)" + family_relationships: + father: + - person_index: 3 + target_name: "יעקב" + spouse: + - person_index: 0 + target_name: "משה בן משיאח" + context: "Bride (kallah) - daughter of Ya'akov" + + # Person 3: Father of Bride + - person_index: 3 + pnv_name: + literalName: "יעקב" + literalName_romanized: "Ya'akov" + givenName: "יעקב" + givenName_romanized: "Ya'akov" + givenName_english: "Jacob" + biographical: + sex: "male" + note: "Biblical patriarch name - common in Jewish communities" + family_relationships: + child: + - person_index: 2 + target_name: "רבקה" + context: "Father of the bride (implicit from patronymic)" + + temporal_references: + - expression: "עשרים ושלשה לחודש אלול שנת חמשת אלפים שש מאות וחמישים ושש לבריאת עולם" + expression_romanized: "23rd day of the month of Elul, year 5656 from Creation" + normalized_gregorian: "1896-09-01" + calendar: "Hebrew" + type: "DATE" + components: + day: 23 + month: "אלול (Elul)" + month_number: 6 + year_hebrew: 5656 + year_gregorian: 1896 + era: "לבריאת עולם (from Creation)" + notes: "Elul is the 12th month of the civil year, 6th of the ecclesiastical year" + + locations_mentioned: + - name: "משהד" + name_romanized: "Mashhad" + name_persian: "مشهد" + type: "city" + country: "Iran (then Qajar Persia)" + modern_country: "Iran" + coordinates: "36.2972, 59.6067" + historical_context: | + Mashhad is a major city in northeastern Iran, holy city of Shia Islam + (shrine of Imam Reza). The Jewish community dated to ancient times but + faced forced conversion in 1839. By 1896, some families openly practiced + Judaism while others remained crypto-Jews.
+ +# ----------------------------------------------------------------------------- +# Physical Description +# ----------------------------------------------------------------------------- + +physical_description: + dimensions: "53 x 37 cm" + material: "ink and paint on paper" + decoration: | + - Red and green rules divide the paper into rectangular sections + - Middle section contains the ketubah text + - Top and sides filled with elaborate arch and floral patterns + - Colors: blue, gold, and silver paint + - Strips of red paper pasted on all four sides as frame + condition: "Some damage to the text containing the Isaiah quote and to the borders" + script: "Hebrew square script" + +# ----------------------------------------------------------------------------- +# Hebrew Naming Conventions Demonstrated +# ----------------------------------------------------------------------------- + +naming_conventions_notes: | + Hebrew/Jewish naming conventions demonstrated in this REAL document: + + 1. PATRONYMIC SYSTEM: + - בן (ben): "son of" - used for males + - בת (bat): "daughter of" - used for females + - Example: משה בן משיאח = "Mosheh son of Mashiah" + + 2. PERSIAN JEWISH NAMES: + - משיאח (Mashiah/Messiah): Common Persian Jewish given name + - רבקה (Rivkah/Rebecca): Biblical matriarch name + - יעקב (Ya'akov/Jacob): Biblical patriarch name + + 3. KETUBAH STRUCTURE: + - Opening: בס״ד (B'siyata d'Shmaya - With Heaven's help) + - Date: Hebrew calendar from Creation (anno mundi) + - Location: City name in Hebrew transliteration + - Parties: Groom (חתן) and Bride (כלה) with patronymics + - Blessing: Often biblical verses (here Isaiah 61:10) + + 4. 
MASHHAD JEWISH CONTEXT: + - Community known as "Mashhadis" or "Jadid al-Islam" + - After 1839 pogrom, many practiced Judaism secretly + - Unique artistic traditions in ketubah decoration + - Persian influences in ornamentation style + +# ----------------------------------------------------------------------------- +# Provenance +# ----------------------------------------------------------------------------- + +provenance: + data_status: "REAL_HISTORICAL_DATA" + + archive: + name: "Yale University, Beinecke Rare Book & Manuscript Library" + collection: "Hebrew Manuscripts Supplement" + call_number: "Hebrew MSS suppl 194 (Broadside)" + catalog_record: "8574921" + object_id: "2067542" + + digital_access: + url: "https://digital.library.yale.edu/catalog/2067542" + iiif_manifest: "https://digital.library.yale.edu/manifests/2067542" + pdf_url: "https://digital.library.yale.edu/pdfs/2067542.pdf" + + document_metadata: + date_hebrew: "23 Elul 5656" + date_gregorian: "1896-09-01" + place: "Mashhad, Iran" + groom: "Mosheh ben Mashiah" + bride: "Rivkah bat Ya'akov" + physical_extent: "1 leaf, 53 x 37 cm, color illustrations" + + languages: + - "Hebrew" + - "Official Aramaic (700-300 BCE); Imperial Aramaic (700-300 BCE)" + + subjects: + geographic: "Mashhad (Iran) -- Religious life and customs" + topical: + - "Ketubah -- Iran -- Mashhad" + - "Prenuptial agreements (Jewish law)" + + genres: + - "Autographs" + - "Illustrations" + - "Ketubahs" + - "Manuscripts" + - "Marginalia" + + rights: | + The use of this image may be subject to the copyright law of the + United States (Title 17, United States Code) or to site license or + other rights management terms and conditions. The person using the + image is liable for any infringement. + + access_date: "2025-01-13" + + citation: | + "Ketubah : Mashhad, Iran, 1896, September 1," Yale University Library, + Beinecke Rare Book and Manuscript Library, Hebrew MSS suppl 194 (Broadside), + Object ID 2067542. 
Digital Collections, https://digital.library.yale.edu/catalog/2067542 + (accessed January 13, 2025). + + verification_notes: | + This is a REAL historical document with verified provenance: + - Held at Yale University Beinecke Rare Book & Manuscript Library + - Fully digitized and publicly accessible + - Catalog record #8574921 with complete metadata + - Both principal parties (groom and bride) are named in Yale's catalog + - Physical dimensions and condition documented + - High-resolution images available via IIIF manifest + - Document represents unique Mashhad Jewish community traditions diff --git a/data/entity_annotation/modules/integrations/pico/examples/07_spanish_colonial.yaml b/data/entity_annotation/modules/integrations/pico/examples/07_spanish_colonial.yaml new file mode 100644 index 0000000000..3baef9ecf4 --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/examples/07_spanish_colonial.yaml @@ -0,0 +1,263 @@ +# ============================================================================= +# PiCo Example 7: Spanish Colonial Baptism Record +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/examples/ +# Parent: _examples_index.yaml +# +# DATA STATUS: SYNTHETIC_EXAMPLE +# +# Spanish colonial baptismal records from New Spain (Mexico) with rich +# genealogical data including casta (racial/social classification) +# designations and compadrazgo (godparent) relationships. 
+# +# Language: Spanish +# Period: 1742 CE +# Source Type: Baptismal register (Libro de bautismos) +# Location: Mexico City, New Spain +# +# Last Updated: 2025-12-12 +# ============================================================================= + +example_id: "example_07_spanish_colonial" +example_title: "Spanish Colonial Baptism Record - Mexico City (1742)" +data_status: "SYNTHETIC_EXAMPLE" +source_language: "Spanish" +source_type: "baptismal_register" + +description: | + Spanish colonial baptismal record from New Spain (Mexico) demonstrating + the casta system and compadrazgo relationships. + + Key features: + - Casta designations (español, mestizo, mulato, indio, etc.) + - Legitimacy markers (hijo legítimo vs hijo natural) + - Compadrazgo (godparent relationships creating spiritual kinship) + - Place of origin (vecino de, natural de) + - Ecclesiastical formulae and clerical titles (Br., teniente de cura) + +source_text: | + En la ciudad de México, a veinte y tres días del mes de febrero de mil + setecientos cuarenta y dos años, yo el Br. Don Antonio de Mendoza, + teniente de cura de esta santa iglesia catedral, bauticé solemnemente, + puse óleo y crisma a Juan José, español, hijo legítimo de Don Pedro + García de la Cruz, español, natural de la villa de Puebla de los Ángeles, + y de Doña María Josefa de los Reyes, española, natural de esta ciudad. + + Fueron sus padrinos Don Francisco Xavier de Castañeda, español, vecino + de esta ciudad, y Doña Ana María de la Encarnación, su legítima esposa, + a quienes advertí el parentesco espiritual y obligaciones que contrajeron. + + Y lo firmé. + Br. 
Don Antonio de Mendoza + +expected_extraction: + description: "Spanish colonial baptism demonstrating casta system and compadrazgo" + + pico_observation: + observation_id: "bautismo_mexico_1742_juan_jose_garcia" + observed_at: "2025-12-12T12:00:00Z" + source_type: "baptismal_register" + source_reference: "Libro de Bautismos, Catedral de México, 23 Feb 1742" + + persons: + - person_index: 0 + pnv_name: + literalName: "Juan José" + givenName: "Juan José" + roles: + - role_title: "bautizado" + role_in_source: "baptized" + biographical: + casta: "español" + legitimacy: "hijo legítimo" + religion: "Catholic" + family_relationships: + parent: + - person_index: 1 + target_name: "Don Pedro García de la Cruz" + - person_index: 2 + target_name: "Doña María Josefa de los Reyes" + godparent: + - person_index: 3 + target_name: "Don Francisco Xavier de Castañeda" + - person_index: 4 + target_name: "Doña Ana María de la Encarnación" + context: "Infant being baptized" + + - person_index: 1 + pnv_name: + literalName: "Don Pedro García de la Cruz" + givenName: "Pedro" + surnamePrefix: "García de" + baseSurname: "la Cruz" + honorificPrefix: "Don" + biographical: + casta: "español" + origin: "natural de la villa de Puebla de los Ángeles" + family_relationships: + spouse: + - person_index: 2 + target_name: "Doña María Josefa de los Reyes" + children: + - person_index: 0 + target_name: "Juan José" + context: "Father of the baptized child" + + - person_index: 2 + pnv_name: + literalName: "Doña María Josefa de los Reyes" + givenName: "María Josefa" + surnamePrefix: "de" + baseSurname: "los Reyes" + honorificPrefix: "Doña" + biographical: + casta: "española" + origin: "natural de esta ciudad" + family_relationships: + spouse: + - person_index: 1 + target_name: "Don Pedro García de la Cruz" + children: + - person_index: 0 + target_name: "Juan José" + context: "Mother of the baptized child" + + - person_index: 3 + pnv_name: + literalName: "Don Francisco Xavier de Castañeda" + givenName: 
"Francisco Xavier" + surnamePrefix: "de" + baseSurname: "Castañeda" + honorificPrefix: "Don" + roles: + - role_title: "padrino" + role_in_source: "godfather" + biographical: + casta: "español" + residence: "vecino de esta ciudad" + family_relationships: + spouse: + - person_index: 4 + target_name: "Doña Ana María de la Encarnación" + godchildren: + - person_index: 0 + target_name: "Juan José" + compadre: + - person_index: 1 + target_name: "Don Pedro García de la Cruz" + context: "Godfather (padrino)" + + - person_index: 4 + pnv_name: + literalName: "Doña Ana María de la Encarnación" + givenName: "Ana María" + surnamePrefix: "de" + baseSurname: "la Encarnación" + honorificPrefix: "Doña" + roles: + - role_title: "madrina" + role_in_source: "godmother" + biographical: + marital_status: "legítima esposa" + family_relationships: + spouse: + - person_index: 3 + target_name: "Don Francisco Xavier de Castañeda" + godchildren: + - person_index: 0 + target_name: "Juan José" + comadre: + - person_index: 2 + target_name: "Doña María Josefa de los Reyes" + context: "Godmother (madrina)" + + - person_index: 5 + pnv_name: + literalName: "Br. Don Antonio de Mendoza" + givenName: "Antonio" + surnamePrefix: "de" + baseSurname: "Mendoza" + honorificPrefix: "Br. 
Don" + roles: + - role_title: "teniente de cura" + role_in_source: "officiant" + biographical: + ecclesiastical_position: "teniente de cura de esta santa iglesia catedral" + family_relationships: {} + context: "Priest who performed the baptism" + + temporal_references: + - expression: "a veinte y tres días del mes de febrero de mil setecientos cuarenta y dos años" + normalized: "1742-02-23" + calendar: "Gregorian" + type: "DATE" + + locations_mentioned: + - name: "ciudad de México" + type: "city" + administrative_entity: "New Spain" + - name: "santa iglesia catedral" + type: "church" + full_name: "Catedral Metropolitana de la Asunción de la Santísima Virgen María" + - name: "villa de Puebla de los Ángeles" + type: "city" + modern_name: "Puebla" + administrative_entity: "New Spain" + +colonial_naming_notes: | + Spanish colonial naming conventions demonstrated: + + HONORIFIC TITLES: + - Don/Doña: honorific indicating Spanish (peninsular or criollo) status + - Br. (Bachiller): academic degree, often held by clergy + + CASTA SYSTEM: + - español/española: persons of Spanish descent (peninsular or criollo) + - mestizo: Spanish + Indigenous ancestry + - mulato: Spanish + African ancestry + - indio: Indigenous person + - (Many other classifications existed in the sistema de castas) + + PLACE INDICATORS: + - "natural de": indicates place of birth + - "vecino de": indicates place of residence + + LEGITIMACY MARKERS: + - "hijo legítimo": legitimate child (parents married in Church) + - "hijo natural": illegitimate child (parents not married) + + COMPADRAZGO (Spiritual Kinship): + - Padrino/madrina: godfather/godmother + - Compadre/comadre: relationship between godparents and parents + - "parentesco espiritual": spiritual kinship with religious obligations + - Created lifelong obligations between families + +provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on standard Spanish colonial + baptismal formulae for demonstration 
purposes. Names, dates, and + locations are fictional but follow authentic 18th-century patterns. + For real examples, see PROVENANCE_SOURCES.md. + + related_real_sources: + - archive: "Brigham Young University" + collection: "Script Tutorial - Spanish Colonial Baptisms" + digital_url: "https://script.byu.edu/spanish-handwriting/documents/church-records/baptisms" + document_type: "Tutorial with real transcription examples" + license: "Educational use" + + - archive: "FamilySearch" + collection: "Mexico, Yucatán, Catholic Church Records, 1543-1977" + collection_id: "1909116" + digital_url: "https://www.familysearch.org/en/search/collection/1909116" + document_type: "Baptisms, marriages, deaths" + license: "Free with registration" + notes: "Contains some of earliest New World records (from 1543)" + + - archive: "Archivo General de la Nación (AGN)" + location: "Mexico City, Mexico" + collection: "Colonial parish records" + document_type: "Spanish colonial baptismal records" + period: "16th-20th century CE" + languages: "Spanish, Nahuatl, Latin" diff --git a/data/entity_annotation/modules/integrations/pico/examples/08_italian_notarial.yaml b/data/entity_annotation/modules/integrations/pico/examples/08_italian_notarial.yaml new file mode 100644 index 0000000000..5219225ce6 --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/examples/08_italian_notarial.yaml @@ -0,0 +1,315 @@ +# ============================================================================= +# PiCo Example 8: Italian Notarial Act (Venice, 1654 CE) +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/examples/ +# Parent: _examples_index.yaml +# +# DATA STATUS: SYNTHETIC_EXAMPLE +# +# Demonstrates extraction from an Italian notarial act showing: +# - Italian naming conventions (patronymic "fu", "quondam") +# - Venetian nobility titles (Nobil Homo, Magnifico) +# - Profession-based surnames (Fabbro,
Ferrari) +# - Parish-based location (contrada, sestiere) +# +# Language: Italian (Venetian) +# Period: 1654 CE +# Source Type: Notarial act +# +# Last Updated: 2025-12-12 +# ============================================================================= + +example_id: "example_08_italian_notarial" +example_title: "Italian Notarial Act - Venice (1654)" +data_status: "SYNTHETIC_EXAMPLE" +source_language: "Italian" +source_script: "Latin" +source_type: "notarial_act" + +description: | + Example of a 17th-century Venetian notarial act demonstrating: + - Italian naming conventions with Latin survivals + - Venetian nobility titles and social hierarchy + - Deceased father markers (fu, quondam) + - Profession-based surnames + - Parish-based location system (contrada) + + Notarial acts were legal documents recording contracts, wills, property + transfers, and other legal transactions. They provide rich genealogical + and social history data. + +source_text: | + Adì 15 Marzo 1654, in Venetia. + + Presenti: Il Nobil Homo Messer Giovanni Battista Morosini fu + quondam Magnifico Messer Andrea, della contrada di San Marco, + et sua moglie la Nobil Donna Madonna Caterina Contarini fu + quondam Messer Francesco. Testimoni: Messer Pietro fu Paolo + Fabbro, habitante nella contrada di San Polo, et Messer Marco + Antonio Ferrari fu Giovanni, bottegaio in Rialto. Rogato io + Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico + di Venetia. 
+ +expected_extraction: + pico_observation: + observation_id: "notarial_venice_1654-03-15_morosini" + source_type: "notarial_act" + source_reference: "Notarial act, Venice, March 15, 1654" + + persons: + - person_index: 0 + pnv_name: + literalName: "Il Nobil Homo Messer Giovanni Battista Morosini" + givenName: "Giovanni Battista" + baseSurname: "Morosini" + honorificPrefix: "Il Nobil Homo Messer" + roles: + - role_title: "principal party" + role_in_source: "party to act" + biographical: + social_status: "Venetian nobility" + patronymic: "fu quondam Magnifico Messer Andrea" + father_status: "deceased (quondam)" + family_relationships: + father: + - person_index: 1 + target_name: "Magnifico Messer Andrea Morosini" + spouse: + - person_index: 2 + target_name: "Nobil Donna Madonna Caterina Contarini" + context: "Principal party, Venetian noble" + + - person_index: 1 + pnv_name: + literalName: "Magnifico Messer Andrea Morosini" + givenName: "Andrea" + baseSurname: "Morosini" + honorificPrefix: "Magnifico Messer" + roles: [] + biographical: + social_status: "Venetian nobility" + deceased: true + deceased_marker: "quondam" + family_relationships: + child: + - person_index: 0 + target_name: "Giovanni Battista Morosini" + context: "Father of Giovanni Battista, deceased" + + - person_index: 2 + pnv_name: + literalName: "Nobil Donna Madonna Caterina Contarini" + givenName: "Caterina" + baseSurname: "Contarini" + honorificPrefix: "Nobil Donna Madonna" + roles: + - role_title: "moglie" + role_in_source: "wife" + biographical: + social_status: "Venetian nobility" + patronymic: "fu quondam Messer Francesco" + family_relationships: + father: + - person_index: 3 + target_name: "Messer Francesco Contarini" + spouse: + - person_index: 0 + target_name: "Giovanni Battista Morosini" + context: "Wife of Giovanni Battista" + + - person_index: 3 + pnv_name: + literalName: "Messer Francesco Contarini" + givenName: "Francesco" + baseSurname: "Contarini" + honorificPrefix: "Messer" + roles: 
[] + biographical: + deceased: true + deceased_marker: "quondam" + family_relationships: + child: + - person_index: 2 + target_name: "Caterina Contarini" + context: "Father of Caterina, deceased" + + - person_index: 4 + pnv_name: + literalName: "Messer Pietro fu Paolo Fabbro" + givenName: "Pietro" + baseSurname: "Fabbro" + honorificPrefix: "Messer" + roles: + - role_title: "testimone" + role_in_source: "witness" + biographical: + patronymic: "fu Paolo" + residence: "contrada di San Polo" + family_relationships: + father: + - person_index: 5 + target_name: "Paolo Fabbro" + context: "First witness" + + - person_index: 5 + pnv_name: + literalName: "Paolo Fabbro" + givenName: "Paolo" + baseSurname: "Fabbro" + roles: [] + biographical: + deceased: true + family_relationships: + child: + - person_index: 4 + target_name: "Pietro Fabbro" + context: "Father of witness Pietro, deceased" + + - person_index: 6 + pnv_name: + literalName: "Messer Marco Antonio Ferrari fu Giovanni" + givenName: "Marco Antonio" + baseSurname: "Ferrari" + honorificPrefix: "Messer" + roles: + - role_title: "testimone" + role_in_source: "witness" + biographical: + patronymic: "fu Giovanni" + occupation: "bottegaio" + workplace: "Rialto" + family_relationships: + father: + - person_index: 7 + target_name: "Giovanni Ferrari" + context: "Second witness, shopkeeper" + + - person_index: 7 + pnv_name: + literalName: "Giovanni Ferrari" + givenName: "Giovanni" + baseSurname: "Ferrari" + roles: [] + biographical: + deceased: true + family_relationships: + child: + - person_index: 6 + target_name: "Marco Antonio Ferrari" + context: "Father of witness Marco Antonio, deceased" + + - person_index: 8 + pnv_name: + literalName: "Notaro Antonio Zen fu quondam Messer Giacomo" + givenName: "Antonio" + baseSurname: "Zen" + honorificPrefix: "Notaro" + roles: + - role_title: "notaro" + role_in_source: "notary" + biographical: + patronymic: "fu quondam Messer Giacomo" + occupation: "Notaro publico di Venetia" + 
family_relationships: + father: + - person_index: 9 + target_name: "Messer Giacomo Zen" + context: "Notary who drafted the act" + + - person_index: 9 + pnv_name: + literalName: "Messer Giacomo Zen" + givenName: "Giacomo" + baseSurname: "Zen" + honorificPrefix: "Messer" + roles: [] + biographical: + deceased: true + deceased_marker: "quondam" + family_relationships: + child: + - person_index: 8 + target_name: "Antonio Zen" + context: "Father of notary, deceased" + + temporal_references: + - expression: "Adì 15 Marzo 1654" + normalized: "1654-03-15" + calendar: "Gregorian" + type: "DATE" + + locations_mentioned: + - name: "Venetia" + name_modern: "Venice" + type: "city" + - name: "contrada di San Marco" + type: "parish/district" + parent: "Venice" + - name: "contrada di San Polo" + type: "parish/district" + parent: "Venice" + - name: "Rialto" + type: "district/market" + parent: "Venice" + +italian_naming_notes: | + Italian notarial naming conventions demonstrated: + + DECEASED FATHER MARKERS: + - "fu": Italian for "was" - indicates deceased father + - "quondam": Latin survival meaning "formerly/the late" + - Often combined: "fu quondam" for emphasis + + VENETIAN NOBILITY TITLES: + - "Magnifico Messer": high honorific for nobility + - "Il Nobil Homo" / "N.H.": Venetian noble title (male) + - "Nobil Donna" / "N.D.": Venetian noble title (female) + - "Madonna": honorific for married noble women + + COMMONER TITLES: + - "Messer": general respectful address (Mister) + + PROFESSION-BASED SURNAMES: + - Fabbro: smith (from Latin faber) + - Ferrari: ironworker (from Latin ferrarius) + + LOCATION INDICATORS: + - "habitante in/nella": residence indicator + - "bottegaio": shopkeeper + - Contrada: parish neighborhood system of Venice + - Sestiere: one of six districts of Venice + + NOTARIAL TERMINOLOGY: + - "Rogato": drafted/witnessed (by notary) + - "Notaro publico": public notary (licensed) + +provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses 
synthetic data based on authentic 17th-century + Venetian notarial document formulae for demonstration purposes. + Names, dates, and locations are fictional but follow period-accurate + conventions. For real examples, see PROVENANCE_SOURCES.md. + + related_real_sources: + - archive: "Italian Ministry of Culture" + project: "Antenati (Ancestors)" + digital_url: "https://antenati.cultura.gov.it/" + venice_url: "https://antenati.cultura.gov.it/archivio/state-archives-of-venezia/?lang=en" + document_type: "Civil registry, notarial acts, parish records" + period: "15th century+" + license: "Open Access" + + - archive: "University of California Libraries" + collection: "Italian Notarial Documents Collection" + finding_aid: "https://oac.cdlib.org/findaid/ark:%2F13030%2Fc8v412zd" + document_count: "168 documents" + period: "1465-1635 CE" + locations: "Venice, Padua, Verona" + languages: "Latin, Italian (Venetian)" + + - project: "SION-Digit (Sources for the History of Italian Jewish Notarial Documents)" + coverage: "Venice, Bordeaux, Amsterdam" + period: "16th-18th century CE" + focus: "Jewish community notarial acts" + languages: "Italian, Hebrew, Ladino" diff --git a/data/entity_annotation/modules/integrations/pico/examples/09_greek_orthodox.yaml b/data/entity_annotation/modules/integrations/pico/examples/09_greek_orthodox.yaml new file mode 100644 index 0000000000..ebef7b62fc --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/examples/09_greek_orthodox.yaml @@ -0,0 +1,259 @@ +# ============================================================================= +# PiCo Example 9: Greek Orthodox Parish Register (1875 CE, Thessaloniki) +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/examples/ +# Parent: _examples_index.yaml +# +# DATA STATUS: SYNTHETIC_EXAMPLE +# +# Demonstrates extraction from a Greek Orthodox baptismal register showing: +# - Greek script with 
romanization +# - Greek patronymics (του + genitive) +# - Godparent system (νονός/νονά) +# - Orthodox naming conventions +# - Deceased marker (μακαρίτης/μακαρίτισσα) +# +# Language: Greek (polytonic) +# Period: 1875 CE +# Source Type: Baptismal register +# Calendar: Julian (Orthodox Church) +# +# Last Updated: 2025-12-12 +# ============================================================================= + +example_id: "example_09_greek_baptismal_register" +example_title: "Greek Orthodox Baptismal Register - Thessaloniki 1875" +data_status: "SYNTHETIC_EXAMPLE" +source_language: "Greek" +source_script: "Greek (polytonic)" +source_period: "1875 CE" +source_type: "baptismal_register" + +description: | + This example demonstrates extraction from a 19th-century Greek Orthodox + baptismal register, illustrating key features of Greek naming conventions + and ecclesiastical record-keeping during the Ottoman period. + + Key features demonstrated: + - Polytonic Greek orthography (common in 19th century) + - Patronymic formation with του + genitive case + - Godparent (νονός/νονά) relationships + - Deceased marker μακαρίτης/μακαρίτισσα ("the late") + - Surnames derived from occupations (Παπαδόπουλος, Οἰκονόμος) + - Ecclesiastical titles (Πρωτοπρεσβύτερος = Archpriest) + - Julian calendar dating (Greek Orthodox tradition) + +source_text: | + Ἐν Θεσσαλονίκῃ, τῇ δεκάτῃ πέμπτῃ Μαρτίου τοῦ ἔτους 1875. + + Ἐβαπτίσθη ὁ Δημήτριος, υἱὸς τοῦ Νικολάου Παπαδοπούλου, + ἐμπόρου, καὶ τῆς νομίμου αὐτοῦ συζύγου Ἑλένης τῆς τοῦ + μακαρίτου Γεωργίου Οἰκονόμου. Νονὸς ὁ Κωνσταντῖνος + Καρατζᾶς τοῦ Ἰωάννου, ἰατρός. Ἱερεύς: ὁ Πρωτοπρεσβύτερος + Ἀθανάσιος Χρυσοστόμου. 
+ +expected_extraction: + pico_observation: + observation_id: "baptism_thessaloniki_1875-03-15_papadopoulos" + source_type: "baptismal_register" + source_reference: "Greek Orthodox baptismal register, Thessaloniki, March 15, 1875" + + persons: + - person_index: 0 + pnv_name: + literalName: "Δημήτριος" + literalName_romanized: "Dimitrios" + givenName: "Δημήτριος" + givenName_romanized: "Dimitrios" + roles: + - role_title: "βαπτισθείς" + role_in_source: "baptized infant" + biographical: + sex: "male" + religion: "Greek Orthodox" + family_relationships: + father: + - person_index: 1 + target_name: "Νικόλαος Παπαδόπουλος" + mother: + - person_index: 2 + target_name: "Ἑλένη" + godfather: + - person_index: 4 + target_name: "Κωνσταντῖνος Καρατζᾶς" + context: "Baptized infant" + + - person_index: 1 + pnv_name: + literalName: "Νικόλαος Παπαδόπουλος" + literalName_romanized: "Nikolaos Papadopoulos" + givenName: "Νικόλαος" + givenName_romanized: "Nikolaos" + baseSurname: "Παπαδόπουλος" + baseSurname_romanized: "Papadopoulos" + roles: + - role_title: "πατήρ" + role_in_source: "father" + biographical: + occupation: "ἔμπορος (merchant)" + family_relationships: + child: + - person_index: 0 + target_name: "Δημήτριος" + spouse: + - person_index: 2 + target_name: "Ἑλένη" + context: "Father of the baptized, merchant" + + - person_index: 2 + pnv_name: + literalName: "Ἑλένη τῆς τοῦ μακαρίτου Γεωργίου Οἰκονόμου" + literalName_romanized: "Eleni tis tou makaritou Georgiou Oikonomou" + givenName: "Ἑλένη" + givenName_romanized: "Eleni" + roles: + - role_title: "μήτηρ" + role_in_source: "mother" + biographical: + marital_status: "νομίμη σύζυγος (lawful wife)" + patronymic: "τῆς τοῦ μακαρίτου Γεωργίου Οἰκονόμου" + family_relationships: + father: + - person_index: 3 + target_name: "Γεώργιος Οἰκονόμος" + child: + - person_index: 0 + target_name: "Δημήτριος" + spouse: + - person_index: 1 + target_name: "Νικόλαος Παπαδόπουλος" + context: "Mother of the baptized" + + - person_index: 3 + pnv_name: 
+ literalName: "μακαρίτης Γεώργιος Οἰκονόμος" + literalName_romanized: "makaritis Georgios Oikonomos" + givenName: "Γεώργιος" + givenName_romanized: "Georgios" + baseSurname: "Οἰκονόμος" + baseSurname_romanized: "Oikonomos" + roles: [] + biographical: + deceased: true + deceased_marker: "μακαρίτης" + family_relationships: + child: + - person_index: 2 + target_name: "Ἑλένη" + context: "Maternal grandfather, deceased" + + - person_index: 4 + pnv_name: + literalName: "Κωνσταντῖνος Καρατζᾶς τοῦ Ἰωάννου" + literalName_romanized: "Konstantinos Karatzas tou Ioannou" + givenName: "Κωνσταντῖνος" + givenName_romanized: "Konstantinos" + baseSurname: "Καρατζᾶς" + baseSurname_romanized: "Karatzas" + roles: + - role_title: "νονός" + role_in_source: "godfather" + biographical: + occupation: "ἰατρός (physician)" + patronymic: "τοῦ Ἰωάννου" + family_relationships: + father: + - person_index: 5 + target_name: "Ἰωάννης Καρατζᾶς" + godchild: + - person_index: 0 + target_name: "Δημήτριος" + context: "Godfather, physician" + + - person_index: 5 + pnv_name: + literalName: "Ἰωάννης Καρατζᾶς" + literalName_romanized: "Ioannis Karatzas" + givenName: "Ἰωάννης" + givenName_romanized: "Ioannis" + baseSurname: "Καρατζᾶς" + baseSurname_romanized: "Karatzas" + roles: [] + biographical: {} + family_relationships: + child: + - person_index: 4 + target_name: "Κωνσταντῖνος Καρατζᾶς" + context: "Father of godfather" + + - person_index: 6 + pnv_name: + literalName: "Πρωτοπρεσβύτερος Ἀθανάσιος Χρυσοστόμου" + literalName_romanized: "Protopresbyteros Athanasios Chrysostomou" + givenName: "Ἀθανάσιος" + givenName_romanized: "Athanasios" + patronymic: "Χρυσοστόμου" + patronymic_romanized: "Chrysostomou" + honorificPrefix: "Πρωτοπρεσβύτερος" + roles: + - role_title: "ἱερεύς" + role_in_source: "priest" + biographical: + ecclesiastical_rank: "Πρωτοπρεσβύτερος (Protopresbyter/Archpriest)" + family_relationships: {} + context: "Officiating priest" + + temporal_references: + - expression: "τῇ δεκάτῃ πέμπτῃ Μαρτίου 
τοῦ ἔτους 1875" + expression_romanized: "ti dekati pempti Martiou tou etous 1875" + normalized: "1875-03-15" + calendar: "Julian" + type: "DATE" + note: "Greek Orthodox used Julian calendar; Gregorian equivalent: March 27, 1875" + + locations_mentioned: + - name: "Θεσσαλονίκη" + name_romanized: "Thessaloniki" + type: "city" + modern_country: "Greece" + historical_context: "Ottoman Empire (Selanik vilayet)" + +greek_naming_notes: | + Greek Orthodox naming conventions demonstrated: + - "τοῦ" + genitive: patronymic marker ("son/daughter of") + - "μακαρίτης/μακαρίτισσα": deceased marker ("the late") + - "νομίμη σύζυγος": lawful wife + - "νονός/νονά": godfather/godmother + - Surnames from occupations: Παπαδόπουλος (priest's son), Οἰκονόμος (steward) + - Ecclesiastical titles: Πρωτοπρεσβύτερος (Archpriest) + - Polytonic Greek orthography common in 19th century + - Julian calendar used by Greek Orthodox Church + +provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on authentic Greek Orthodox + baptismal register formulae for demonstration purposes. Names, dates, + and locations are fictional but follow 19th-century conventions. + For real examples, see PROVENANCE_SOURCES.md. 
+ + related_real_sources: + - archive: "FamilySearch" + wiki_url: "https://www.familysearch.org/en/wiki/Greece_Church_Records" + document_type: "Baptisms, marriages, deaths" + period: "17th century - 1925 CE" + language: "Greek" + license: "Free with registration" + notes: "Greek Orthodox records are primary source before 1925 civil registration" + + - archive: "Γενικά Αρχεία του Κράτους (General State Archives of Greece)" + abbreviation: "GAK" + document_type: "Church records, civil registry, Ottoman-era documents" + period: "15th century - present" + languages: "Greek, Ottoman Turkish" + notes: "National archive with records from all Greek regions" + + - resource: "Greek Ancestry" + coverage: "Village church records guide" + document_type: "Baptismal registers, marriage registers" + notes: "Guides to accessing island and mainland records" diff --git a/data/entity_annotation/modules/integrations/pico/examples/10_russian_metrical.yaml b/data/entity_annotation/modules/integrations/pico/examples/10_russian_metrical.yaml new file mode 100644 index 0000000000..e1c70d95c2 --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/examples/10_russian_metrical.yaml @@ -0,0 +1,489 @@ +# ============================================================================= +# PiCo Example 10: Russian Imperial Metrical Book - Birth of Stefan Nowicki +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/examples/ +# Parent: _examples_index.yaml +# +# DATA STATUS: REAL HISTORICAL DATA +# +# Source: Archiwum Panstwowe w Poznaniu Oddzial w Koninie +# Reference Code: 54/792/0/6.1/140 +# Scan: 4 of 76 +# Document Date: 27 December 1893 (Julian) / 8 January 1894 (Gregorian) +# Location: Osiek Wielki, Congress Poland, Russian Empire +# +# Demonstrates extraction from a Russian Imperial metrical book showing: +# - Cyrillic script with romanization +# - Polish names recorded in Russian (Congress 
Poland context) +# - Pre-revolutionary orthography +# - Julian/Gregorian calendar dual dating +# - Godparents (vospriemniki) +# - Village-level vital records +# +# Transcription verified by BYU Script Tutorial paleographers. +# +# Last Updated: 2025-12-12 +# ============================================================================= + +example_id: "example_10_russian_metrical_book" +example_title: "Russian Imperial Metrical Book - Birth of Stefan Nowicki (1894)" +data_status: "REAL_HISTORICAL_DATA" +source_language: "Russian" +source_script: "Cyrillic (pre-1918 orthography)" +source_period: "1894 CE (Gregorian) / 1893 CE (Julian)" +source_type: "metrical_book" +document_subtype: "birth_record" + +# ----------------------------------------------------------------------------- +# Source Text +# ----------------------------------------------------------------------------- + +source_text: + russian_original: | + Любины + Состаялосъ въ деревнѣ осѣкъ велькій двадцать седьмаго Декабря + /:восьмаго Января:/ тысяча восемьсоть девяносто третяго (четвертаго) года + въ одинадцать часовъ утра Явился Янъ Новицкій /:Jan Nowicki:/ + сорока лѣтъ отъ роду земледѣлецъ изъ Любинъ, въ присутствіи + Францишка Новицкаго сорока лѣтъ, и Михаила Влодарчика + шестидесяти лѣтъ отъ роду, обоихъ земледѣльцевъ изъ Любинъ + и предьявилъ намъ младенца мужскаго пола, объявляя + что онъ родился въ Любинахъ двадцать пятаго Декабря + /:шестаго Января:/ текущаго года, въ четыре часа вечеромъ + отъ законной его жены Маріанны изъ Адамковъ /:Mary- + anny z Adamkow:/ тридцати лѣтъ отъ роду, младенцу + этому при святомъ крещеніи совершенномъ сего + числа дано имя Стефанъ /:Stefan:/ а воспріемниками + его были Войцех Гаудынъ, и Катаржина Гембка. 
+ Актъ сей объявляющему и свидѣтелямъ негра- + мотнымъ прочитанъ нами только подписанъ + Ксндзъ Павелъ Выборскій + + romanized: | + Lyubiny + Sostoyalos' v derevne Osek Vel'kiy dvadtsat' sed'mago Dekabrya + /:vos'mago Yanvarya:/ tysyacha vosem'sot' devyanosto tret'yago (chetvertago) goda + v odinnadtsat' chasov utra Yavilsya Yan Novitskiy /:Jan Nowicki:/ + soroka let ot rodu zemledelets iz Lyubin, v prisutstvii + Frantsishka Novitskago soroka let, i Mikhaila Vlodarchika + shestidesyati let ot rodu, oboikh zemledeltsev iz Lyubin + i pred'yavil nam mladentsa muzhskago pola, ob'yavlyaya + chto on rodilsya v Lyubinakh dvadtsat' pyatago Dekabrya + /:shestago Yanvarya:/ tekushchago goda, v chetyre chasa vecherom + ot zakonnoy ego zheny Marianny iz Adamkov /:Mary- + anny z Adamkow:/ tridtsati let ot rodu, mladentsu + etomu pri svyatom kreshchenii sovershennom sego + chisla dano imya Stefan /:Stefan:/ a vospriyemnikami + ego byli Voytsekh Gaudyn, i Katarzhina Gembka. + Akt sey ob'yavlyayushchemu i svidetel'yam negra- + motnym prochitan nami tol'ko podpisan + Ksndz Pavel Vyborskiy + + english_translation: | + Lubin + It happened in the village of Osiek Wielki on the twenty-seventh of December + /:eighth of January:/ in the year one thousand eight hundred ninety-three (four) + at eleven o'clock in the morning. Appeared Jan Nowicki /:Jan Nowicki:/ + forty years of age, farmer from Lubin, in the presence of + Franciszek Nowicki, forty years old, and Michal Wlodarczyk + sixty years of age, both farmers from Lubin + and presented to us an infant of the male sex, declaring + that he was born in Lubin on the twenty-fifth of December + /:sixth of January:/ of the current year, at four o'clock in the evening + of his lawful wife Marianna nee Adamkow /:Mary- + anna z Adamkow:/ thirty years of age. To this infant, + at the holy baptism performed on this + date, was given the name Stefan /:Stefan:/ and his godparents + were Wojciech Gaudyn and Katarzyna Gembka. 
+ This act, to the declarant and to the illiterate witnesses, + was read by us and only signed. + Priest Pawel Wyborski + +# ----------------------------------------------------------------------------- +# Expected Extraction Output +# ----------------------------------------------------------------------------- + +expected_extraction: + pico_observation: + observation_id: "birth_osiek_wielki_1894_stefan_nowicki" + source_type: "metrical_book" + source_reference: "Akta stanu cywilnego Parafii Rzymskokatolickiej Osiek Wielki, Reference Code 54/792/0/6.1/140, scan 4/76" + archive: "Archiwum Panstwowe w Poznaniu Oddzial w Koninie" + + persons: + # Person 0: The Infant (Stefan Nowicki) + - person_index: 0 + pnv_name: + literalName: "Стефанъ Новицкій" + literalName_romanized: "Stefan Novitskiy" + literalName_polish: "Stefan Nowicki" + givenName: "Стефанъ" + givenName_romanized: "Stefan" + baseSurname: "Новицкій" + baseSurname_romanized: "Novitskiy" + baseSurname_polish: "Nowicki" + roles: + - role_title: "младенецъ" + role_in_source: "infant" + biographical: + sex: "male" + religion: "Roman Catholic" + birth_date_julian: "1893-12-25" + birth_date_gregorian: "1894-01-06" + baptism_date_julian: "1893-12-27" + baptism_date_gregorian: "1894-01-08" + birth_place: "Любины (Lubin)" + birth_time: "4 o'clock in the evening" + family_relationships: + father: + - person_index: 1 + target_name: "Янъ Новицкій" + mother: + - person_index: 2 + target_name: "Маріанна изъ Адамковъ" + godfather: + - person_index: 5 + target_name: "Войцех Гаудынъ" + godmother: + - person_index: 6 + target_name: "Катаржина Гембка" + context: "Newborn infant, subject of the birth registration" + + # Person 1: Father (Jan Nowicki) + - person_index: 1 + pnv_name: + literalName: "Янъ Новицкій" + literalName_romanized: "Yan Novitskiy" + literalName_polish: "Jan Nowicki" + givenName: "Янъ" + givenName_romanized: "Yan" + givenName_polish: "Jan" + baseSurname: "Новицкій" + baseSurname_romanized: "Novitskiy" + 
baseSurname_polish: "Nowicki" + roles: + - role_title: "отецъ" + role_in_source: "father" + - role_title: "объявляющій" + role_in_source: "declarant" + biographical: + sex: "male" + age: 40 + age_expression: "сорока лѣтъ отъ роду" + occupation: "земледѣлецъ (farmer)" + residence: "Любины (Lubin)" + literacy: "illiterate (implied - act read to him)" + family_relationships: + child: + - person_index: 0 + target_name: "Стефанъ Новицкій" + spouse: + - person_index: 2 + target_name: "Маріанна изъ Адамковъ" + possible_relative: + - person_index: 3 + target_name: "Францишекъ Новицкій" + relationship_type: "same surname - possibly brother or cousin" + context: "Father of the infant, farmer from Lubin, appeared to register the birth" + + # Person 2: Mother (Marianna nee Adamkow) + - person_index: 2 + pnv_name: + literalName: "Маріанна изъ Адамковъ" + literalName_romanized: "Marianna iz Adamkov" + literalName_polish: "Maryanna z Adamkow" + givenName: "Маріанна" + givenName_romanized: "Marianna" + givenName_polish: "Maryanna" + maidenName: "Адамковъ" + maidenName_romanized: "Adamkov" + maidenName_polish: "Adamkow" + roles: + - role_title: "мать" + role_in_source: "mother" + biographical: + sex: "female" + age: 30 + age_expression: "тридцати лѣтъ отъ роду" + marital_status: "законная жена (lawful wife)" + maiden_name_marker: "изъ (nee/z)" + family_relationships: + child: + - person_index: 0 + target_name: "Стефанъ Новицкій" + spouse: + - person_index: 1 + target_name: "Янъ Новицкій" + context: "Mother of the infant, lawful wife of Jan Nowicki" + + # Person 3: First Witness (Franciszek Nowicki) + - person_index: 3 + pnv_name: + literalName: "Францишекъ Новицкій" + literalName_romanized: "Frantsishek Novitskiy" + literalName_polish: "Franciszek Nowicki" + givenName: "Францишекъ" + givenName_romanized: "Frantsishek" + givenName_polish: "Franciszek" + baseSurname: "Новицкій" + baseSurname_romanized: "Novitskiy" + baseSurname_polish: "Nowicki" + roles: + - role_title: "свидѣтель" + 
role_in_source: "witness" + biographical: + sex: "male" + age: 40 + age_expression: "сорока лѣтъ" + occupation: "земледѣлецъ (farmer)" + residence: "Любины (Lubin)" + literacy: "illiterate (неграмотный)" + family_relationships: + possible_relative: + - person_index: 1 + target_name: "Янъ Новицкій" + relationship_type: "same surname, same age, same village - possibly brother" + context: "First witness, farmer from Lubin, same surname as father" + + # Person 4: Second Witness (Michal Wlodarczyk) + - person_index: 4 + pnv_name: + literalName: "Михаилъ Влодарчикъ" + literalName_romanized: "Mikhail Vlodarchik" + literalName_polish: "Michal Wlodarczyk" + givenName: "Михаилъ" + givenName_romanized: "Mikhail" + givenName_polish: "Michal" + baseSurname: "Влодарчикъ" + baseSurname_romanized: "Vlodarchik" + baseSurname_polish: "Wlodarczyk" + roles: + - role_title: "свидѣтель" + role_in_source: "witness" + biographical: + sex: "male" + age: 60 + age_expression: "шестидесяти лѣтъ отъ роду" + occupation: "земледѣлецъ (farmer)" + residence: "Любины (Lubin)" + literacy: "illiterate (неграмотный)" + family_relationships: {} + context: "Second witness, farmer from Lubin, age 60" + + # Person 5: Godfather (Wojciech Gaudyn) + - person_index: 5 + pnv_name: + literalName: "Войцех Гаудынъ" + literalName_romanized: "Voytsekh Gaudyn" + literalName_polish: "Wojciech Gaudyn" + givenName: "Войцех" + givenName_romanized: "Voytsekh" + givenName_polish: "Wojciech" + baseSurname: "Гаудынъ" + baseSurname_romanized: "Gaudyn" + baseSurname_polish: "Gaudyn" + roles: + - role_title: "воспріемникъ" + role_in_source: "godfather" + biographical: + sex: "male" + family_relationships: + godchild: + - person_index: 0 + target_name: "Стефанъ Новицкій" + context: "Godfather (baptismal sponsor)" + + # Person 6: Godmother (Katarzyna Gembka) + - person_index: 6 + pnv_name: + literalName: "Катаржина Гембка" + literalName_romanized: "Katarzhina Gembka" + literalName_polish: "Katarzyna Gembka" + givenName: 
"Катаржина" + givenName_romanized: "Katarzhina" + givenName_polish: "Katarzyna" + baseSurname: "Гембка" + baseSurname_romanized: "Gembka" + baseSurname_polish: "Gembka" + roles: + - role_title: "воспріемница" + role_in_source: "godmother" + biographical: + sex: "female" + family_relationships: + godchild: + - person_index: 0 + target_name: "Стефанъ Новицкій" + context: "Godmother (baptismal sponsor)" + + # Person 7: Priest (Pawel Wyborski) + - person_index: 7 + pnv_name: + literalName: "Ксндзъ Павелъ Выборскій" + literalName_romanized: "Ksndz Pavel Vyborskiy" + literalName_polish: "Ksiadz Pawel Wyborski" + givenName: "Павелъ" + givenName_romanized: "Pavel" + givenName_polish: "Pawel" + baseSurname: "Выборскій" + baseSurname_romanized: "Vyborskiy" + baseSurname_polish: "Wyborski" + honorificPrefix: "Ксндзъ (Priest)" + roles: + - role_title: "ксндзъ" + role_in_source: "priest" + - role_title: "registrar" + role_in_source: "signed the act" + biographical: + sex: "male" + ecclesiastical_status: "Roman Catholic priest" + literacy: "literate (only signer)" + family_relationships: {} + context: "Officiating priest who performed baptism and signed the registration" + + temporal_references: + - expression: "тысяча восемьсоть девяносто третяго (четвертаго) года" + expression_romanized: "tysyacha vosem'sot' devyanosto tret'yago (chetvertago) goda" + normalized_julian: "1893" + normalized_gregorian: "1894" + calendar: "Dual (Julian/Gregorian)" + type: "YEAR" + note: "Document shows both Julian (1893) and Gregorian (1894) years" + + - expression: "двадцать седьмаго Декабря /:восьмаго Января:/" + expression_romanized: "dvadtsat' sed'mago Dekabrya /:vos'mago Yanvarya:/" + normalized_julian: "1893-12-27" + normalized_gregorian: "1894-01-08" + calendar: "Dual (Julian/Gregorian)" + type: "DATE" + event: "registration and baptism" + + - expression: "двадцать пятаго Декабря /:шестаго Января:/" + expression_romanized: "dvadtsat' pyatago Dekabrya /:shestago Yanvarya:/" + 
normalized_julian: "1893-12-25" + normalized_gregorian: "1894-01-06" + calendar: "Dual (Julian/Gregorian)" + type: "DATE" + event: "birth" + note: "Born on Christmas Day (Julian calendar)" + + - expression: "въ четыре часа вечеромъ" + expression_romanized: "v chetyre chasa vecherom" + normalized: "16:00" + type: "TIME" + event: "birth" + + - expression: "въ одинадцать часовъ утра" + expression_romanized: "v odinnadtsat' chasov utra" + normalized: "11:00" + type: "TIME" + event: "registration" + + locations_mentioned: + - name: "Осѣкъ Велькій" + name_romanized: "Osek Vel'kiy" + name_polish: "Osiek Wielki" + type: "village (derevnya)" + modern_location: "Greater Poland Voivodeship, Poland" + coordinates: "52.2461, 18.6207" + geonames_url: "https://www.google.com/maps/place/Osiek+Wielki,+Poland" + + - name: "Любины" + name_romanized: "Lyubiny" + name_polish: "Lubin" + type: "village" + note: "Village where the family resided and child was born" + + - name: "Parafia Rzymskokatolicka Osiek Wielki" + type: "parish" + note: "Roman Catholic Parish of Osiek Wielki - registration authority" + +# ----------------------------------------------------------------------------- +# Russian/Polish Naming Conventions Demonstrated +# ----------------------------------------------------------------------------- + +naming_conventions_notes: | + Congress Poland naming conventions demonstrated in this REAL document: + + 1. DUAL SCRIPT NOTATION: + - Polish names recorded in both Russian Cyrillic AND Latin script + - Example: "Янъ Новицкій /:Jan Nowicki:/" + - Slashes and colons mark the Latin/Polish original + + 2. PRE-REVOLUTIONARY ORTHOGRAPHY: + - Hard sign (ъ) after word-final consonants: Янъ, Стефанъ (Новицкій shows the decimal і instead) + - Yat instead of e: лѣтъ, деревнѣ, свидѣтелямъ + - -аго/-яго genitive endings (later simplified to -ого/-его) + + 3. POLISH MAIDEN NAME CONVENTION: + - "изъ Адамковъ" = "z Adamkow" = nee Adamkow + - "изъ" (from) marks maiden/birth name + + 4.
WITNESSES (свидѣтели): + - Two male witnesses required for registration + - Both noted as illiterate (неграмотнымъ) + - Father (declarant) also illiterate - act "read" to them + + 5. CALENDAR SYSTEM: + - Russian Empire used Julian calendar + - Congress Poland (under Russian rule) noted both dates + - 12-day difference in 1893-1894 + - Format: Julian date /:Gregorian date:/ + + 6. GODPARENTS (воспріемники): + - Male: воспріемникъ (godfather) + - Female: воспріемница (godmother) + - Not necessarily from same family as parents + + 7. SOCIAL/OCCUPATIONAL TERMS: + - земледѣлецъ = farmer/agriculturalist + - ксндзъ = ksiadz (Polish priest title; from Proto-Slavic *kъnędzь, cognate with Russian "князь" and South Slavic "knez") + +# ----------------------------------------------------------------------------- +# Provenance +# ----------------------------------------------------------------------------- + +provenance: + data_status: "REAL_HISTORICAL_DATA" + + archive: + name: "Archiwum Panstwowe w Poznaniu Oddzial w Koninie" + name_english: "State Archive in Poznan, Konin Branch" + collection: "Akta stanu cywilnego Parafii Rzymskokatolickiej Osiek Wielki (pow. kolski)" + collection_english: "Civil Registration Records of the Roman Catholic Parish of Osiek Wielki (Kolo district)" + reference_code: "54/792/0/6.1/140" + scan_number: "4 of 76" + + document_metadata: + date_julian: "1893-12-27" + date_gregorian: "1894-01-08" + + digital_access: + archive_url: "https://szukajwarchiwach.gov.pl" + tutorial_url: "https://script.byu.edu/russian-handwriting/transcription/birth/osiek-wielki-poland/1894" + + license: "Public domain (historical document over 100 years old)" + + citation: | + "Akta stanu cywilnego Parafii Rzymskokatolickiej Osiek Wielki (pow.
kolski)," + Archiwum Panstwowe w Poznaniu Oddzial w Koninie, Szukaj w Archiwach + (szukajwarchiwach.gov.pl: accessed 25 January 2023), entry for Stefan Novitsky, + Catholic birth record, 6 January 1894 (Gregorian date), Osiek Wielki, Czolowo, + Kolo, Kaliska, Russian Empire, Reference Code 54/792/0/6.1/140, scan no. 4 of 76. + + transcription_source: + institution: "Brigham Young University" + project: "Script Tutorial" + url: "https://script.byu.edu/russian-handwriting/transcription/birth/osiek-wielki-poland/1894" + access_date: "2025-01-13" + notes: "Complete line-by-line transcription with Russian original, romanization, and English translation" + + verification_notes: | + This is a REAL historical document with verified transcription: + - Original held at Polish State Archives (Archiwum Panstwowe) + - Transcribed and verified by BYU Script Tutorial paleographers + - All 8 persons are real historical individuals + - Names provided in both Russian Cyrillic and Polish Latin script in original + - Stefan Nowicki born 6 January 1894 (Gregorian) in Lubin village + - Family: farmers (zemledelcy) in Greater Poland region + - Document context: Congress Poland under Russian Imperial rule diff --git a/data/entity_annotation/modules/integrations/pico/examples/11_ottoman_sijill.yaml b/data/entity_annotation/modules/integrations/pico/examples/11_ottoman_sijill.yaml new file mode 100644 index 0000000000..6dbe7fc305 --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/examples/11_ottoman_sijill.yaml @@ -0,0 +1,281 @@ +# ============================================================================= +# PiCo Example 11: Ottoman Turkish Sijill (Court Record) +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/examples/ +# Parent: _examples_index.yaml +# +# DATA STATUS: SYNTHETIC_EXAMPLE +# +# Demonstrates extraction from an Ottoman Turkish court record (sijill) showing: +# - 
Ottoman Turkish in Arabic script +# - Honorific titles: Ağa, Efendi, Çelebi, Hatun +# - Patronymics: bin (son of), bint (daughter of) +# - Deceased markers: merhum/merhume +# - Hijri calendar +# - Mixed Arabic-Turkish vocabulary +# - Court terminology +# +# Language: Ottoman Turkish (Arabic script) +# Period: 1258 AH (1842 CE) +# Source Type: Sijill (Sharia Court Register) +# Archive Context: Şer'iyye Sicilleri (Islamic Court Registers) +# +# Last Updated: 2025-12-12 +# ============================================================================= + +example_id: "example_11_ottoman_sijill" +example_title: "Ottoman Court Record (Sijill) - Property Sale, Demirciköy 1258 AH" +data_status: "SYNTHETIC_EXAMPLE" +source_language: "Ottoman Turkish" +source_script: "Arabic" +source_period: "1258 AH (1842 CE)" +source_type: "sijill" +document_subtype: "property_sale" +archive_context: "Şer'iyye Sicilleri (Islamic Court Registers)" + +description: | + This example demonstrates extraction from an Ottoman Turkish sijill + (Islamic court register) documenting a property sale transaction. 
+ + Key features demonstrated: + - Ottoman Turkish written in Arabic script + - Honorific titles indicating social class (Ağa, Efendi, Çelebi, Hatun) + - Arabic patronymic markers (bin, bint) + - Turkish patronymic suffix (-oğlu) + - Deceased markers (merhum/merhume) + - Hijri lunar calendar dating + - Mixed Arabic-Turkish legal vocabulary + - Court record terminology (şahid, mübayi', ba'i) + +source_text: | + بسم الله الرحمن الرحيم + + مجلس شرع شريفده محمد آغا بن عبد الله مرحوم قصبه دميرجی‌کوی + ساکنلرندن محمد بن احمد افندی و زوجه‌سی فاطمه خاتون بنت علی‌اوغلو + حاضر اولوب محمد آغا طرفندن یکری بش غروش بدل معلوم ایله صاتیلدی + + شهود الحال: حسن افندی بن عمر، ابراهیم چلبی بن مصطفی + + فی اوائل شهر رجب سنة ١٢٥٨ + +source_text_romanized: | + Bismillahirrahmanirrahim + + Meclis-i şer'-i şerifde Mehmed Ağa bin Abdullah merhum kasaba Demirciköy + sakinlerinden Mehmed bin Ahmed Efendi ve zevcesi Fatma Hatun bint Ali-oğlu + hazır olub Mehmed Ağa tarafından yirmi beş guruş bedel-i ma'lum ile satıldı + + Şuhud al-hal: Hasan Efendi bin Ömer, İbrahim Çelebi bin Mustafa + + Fi evail-i şehr-i Receb sene 1258 + +source_text_english: | + In the name of God, the Merciful, the Compassionate + + In the noble Sharia court, Mehmed Ağa son of the late Abdullah, [sold to] + residents of the town of Demirciköy, Mehmed son of Ahmed Efendi and his + wife Fatma Hatun daughter of Ali-oğlu, who were present, for the known + price of twenty-five guruş, [the property] was sold by Mehmed Ağa. 
+ + Witnesses present: Hasan Efendi son of Ömer, İbrahim Çelebi son of Mustafa + + In early Receb of the year 1258 [Hijri] + +expected_extraction: + pico_observation: + observation_id: "sijill_demircikoy_1258ah_sale" + source_type: "sijill" + source_reference: "Şer'iyye Sicili, Demirciköy, Receb 1258 AH" + + persons: + - person_index: 0 + pnv_name: + literalName: "محمد آغا بن عبد الله" + literalName_romanized: "Mehmed Ağa bin Abdullah" + givenName: "محمد" + givenName_romanized: "Mehmed" + title: "آغا (Ağa)" + patronymic: "بن عبد الله" + patronymic_romanized: "bin Abdullah" + roles: + - role_title: "با‌ئع (ba'i)" + role_in_source: "seller" + biographical: + sex: "male" + status: "living" + father_deceased_marker: "مرحوم (merhum)" + social_rank: "Ağa (military/landowning class)" + family_relationships: + father: + - name: "عبد الله (Abdullah)" + status: "deceased" + context: "Seller, son of the late Abdullah; Ağa = military/landowning" + + - person_index: 1 + pnv_name: + literalName: "محمد بن احمد افندی" + literalName_romanized: "Mehmed bin Ahmed Efendi" + givenName: "محمد" + givenName_romanized: "Mehmed" + title: "افندی (Efendi)" + patronymic: "بن احمد" + patronymic_romanized: "bin Ahmed" + roles: + - role_title: "مشتری (müşteri)" + role_in_source: "buyer" + biographical: + sex: "male" + residence: "Demirciköy" + social_rank: "Efendi (educated class)" + family_relationships: + father: + - name: "احمد (Ahmed)" + spouse: + - person_index: 2 + target_name: "Fatma Hatun" + context: "Buyer, Efendi = literate/administrative" + + - person_index: 2 + pnv_name: + literalName: "فاطمه خاتون بنت علی‌اوغلو" + literalName_romanized: "Fatma Hatun bint Ali-oğlu" + givenName: "فاطمه" + givenName_romanized: "Fatma" + title: "خاتون (Hatun)" + patronymic: "بنت علی‌اوغلو" + patronymic_romanized: "bint Ali-oğlu" + roles: + - role_title: "مشتری (müşteri)" + role_in_source: "buyer" + - role_title: "زوجه (zevce)" + role_in_source: "wife" + biographical: + sex: "female" + marital_status: "married" +
social_rank: "Hatun (respectable woman)" + family_relationships: + father: + - name: "علی‌اوغلو (Ali-oğlu)" + spouse: + - person_index: 1 + target_name: "Mehmed Efendi" + context: "Wife of buyer, co-purchaser" + + - person_index: 3 + pnv_name: + literalName: "حسن افندی بن عمر" + literalName_romanized: "Hasan Efendi bin Ömer" + givenName: "حسن" + givenName_romanized: "Hasan" + title: "افندی (Efendi)" + patronymic: "بن عمر" + patronymic_romanized: "bin Ömer" + roles: + - role_title: "شاهد (şahid)" + role_in_source: "witness" + biographical: + sex: "male" + social_rank: "Efendi" + family_relationships: + father: + - name: "عمر (Ömer)" + context: "First witness" + + - person_index: 4 + pnv_name: + literalName: "ابراهیم چلبی بن مصطفی" + literalName_romanized: "İbrahim Çelebi bin Mustafa" + givenName: "ابراهیم" + givenName_romanized: "İbrahim" + title: "چلبی (Çelebi)" + patronymic: "بن مصطفی" + patronymic_romanized: "bin Mustafa" + roles: + - role_title: "شاهد (şahid)" + role_in_source: "witness" + biographical: + sex: "male" + social_rank: "Çelebi (gentleman/merchant)" + family_relationships: + father: + - name: "مصطفی (Mustafa)" + context: "Second witness" + + temporal_references: + - expression: "فی اوائل شهر رجب سنة ١٢٥٨" + expression_romanized: "fi evail-i şehr-i Receb sene 1258" + normalized: "1842-08" + calendar: "Hijri" + type: "DATE" + conversion_note: "Receb 1258 AH ≈ August-September 1842 CE (1 Receb ≈ 8 August 1842)" + + locations_mentioned: + - name: "قصبه دميرجی‌کوی" + name_romanized: "kasaba Demirciköy" + type: "town (kasaba)" + - name: "مجلس شرع شريف" + name_romanized: "meclis-i şer'-i şerif" + type: "court" + +ottoman_naming_notes: | + Ottoman Turkish naming conventions: + + HONORIFIC TITLES: + - آغا (Ağa): Military commander, landowner + - افندی (Efendi): Educated person, official + - چلبی (Çelebi): Gentleman, merchant + - خاتون (Hatun): Respectable woman + + PATRONYMIC PATTERNS: + - بن (bin): Son of (Arabic) + - بنت (bint): Daughter of (Arabic) + - اوغلو (-oğlu): Son of (Turkish) +
+ DECEASED MARKERS: + - مرحوم (merhum): The late (man) + - مرحومه (merhume): The late (woman) + + CALENDAR: Hijri lunar (354/355 days) + Receb 1258 AH ≈ August-September 1842 CE + +provenance: + data_status: "SYNTHETIC_EXAMPLE" + notes: | + This example uses synthetic data based on authentic Ottoman Turkish + sijill (court register) formulae for demonstration purposes. Names, + dates, and locations are fictional but follow authentic 19th-century + patterns. For real examples, see PROVENANCE_SOURCES.md. + + related_real_sources: + - archive: "OpenJerusalem Project" + collection: "Jerusalem Sharia Court Registers" + digital_url: "https://www.openjerusalem.org/" + ark_identifier: "ark:/58142/PfV7b" + volume_count: "102 registers" + period: "1834-1920 CE" + languages: "Ottoman Turkish, Arabic" + license: "Open Access" + document_types: "Property sales, marriage contracts, inheritance, waqf" + + - archive: "İslam Araştırmaları Merkezi (ISAM)" + collection: "Istanbul Kadı Sicilleri" + digital_url: "http://www.kadisicilleri.org/" + volume_count: "40+ volumes online" + document_count: "40,000+ documents" + period: "16th-19th century CE" + language: "Ottoman Turkish" + license: "Research access" + + - archive: "Istanbul Metropolitan Municipality" + project: "History of Istanbul" + digital_url: "https://istanbultarihi.ist/434-istanbul-sharia-court-registers" + volume_count: "~10,000 volumes" + courts: "26 different courts" + period: "1453-1922 CE" + notes: "Largest collection of Ottoman court records in existence" + + - archive: "Harvard University" + project: "Ottoman Court Records Project (OCRP)" + digital_url: "https://cmes.fas.harvard.edu/projects/ocrp" + document_types: "Sijill transcriptions, translations" + period: "16th-19th century CE" diff --git a/data/entity_annotation/modules/integrations/pico/examples/_examples_index.yaml b/data/entity_annotation/modules/integrations/pico/examples/_examples_index.yaml new file mode 100644 index 0000000000..eea8b8ebf5 --- /dev/null
+++ b/data/entity_annotation/modules/integrations/pico/examples/_examples_index.yaml @@ -0,0 +1,315 @@ +# ============================================================================= +# PiCo Examples Index +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/examples/ +# Parent: pico/_index.yaml +# +# This file provides a manifest and overview of all 11 PiCo extraction examples, +# covering 10 different languages, scripts, and historical record types. +# +# Last Updated: 2025-12-12 +# ============================================================================= + +module_id: "pico_examples" +module_title: "PiCo Historical Extraction Examples" +version: "1.0.0" +description: | + A comprehensive collection of 11 extraction examples demonstrating PiCo + (Person In Context Ontology) patterns for historical person data extraction + from primary source documents spanning 10 languages and 6 centuries. + +# ============================================================================= +# EXAMPLES OVERVIEW +# ============================================================================= + +examples_summary: + total_examples: 11 + synthetic_examples: 9 + real_data_examples: 2 + languages_covered: + - Dutch + - English + - Arabic + - Hebrew + - Spanish + - Italian + - Greek + - Russian + - Polish + - Ottoman Turkish + scripts_covered: + - Latin + - Arabic + - Hebrew + - Greek (polytonic) + - Cyrillic + calendars_covered: + - Gregorian + - Julian + - Hijri (Islamic) + - Hebrew + time_period: "1492 CE - 2025 CE" + +# ============================================================================= +# EXAMPLES CATALOG +# ============================================================================= + +examples: + # --------------------------------------------------------------------------- + # Example 01: Dutch Marriage Act (1823) + # 
--------------------------------------------------------------------------- + - file: "01_dutch_marriage.yaml" + example_id: "example_01_dutch_marriage" + title: "Dutch Civil Marriage Act - Leeuwarden 1823" + data_status: "SYNTHETIC_EXAMPLE" + language: "Dutch" + script: "Latin" + period: "1823 CE" + source_type: "burgerlijke_stand" + document_type: "Marriage certificate" + features: + - Dutch patronymics (-zoon, -dochter) + - Napoleonic civil registration format + - Occupation and age recording + - Witness systems + persons_extracted: 6 + + # --------------------------------------------------------------------------- + # Example 02: Dutch Notarial Protocol (1789) + # --------------------------------------------------------------------------- + - file: "02_notarial_protocol.yaml" + example_id: "example_02_dutch_notarial" + title: "Dutch Notarial Protocol - Amsterdam 1789" + data_status: "SYNTHETIC_EXAMPLE" + language: "Dutch" + script: "Latin" + period: "1789 CE" + source_type: "notarial_protocol" + document_type: "Testament/Will" + features: + - VOC (Dutch East India Company) context + - Colonial-era naming + - Marital property conventions + - Witness and notary roles + persons_extracted: 5 + + # --------------------------------------------------------------------------- + # Example 03: Dutch Church Baptism (1650) + # --------------------------------------------------------------------------- + - file: "03_church_baptism.yaml" + example_id: "example_03_dutch_baptism" + title: "Dutch Reformed Church Baptism - Delft 1650" + data_status: "SYNTHETIC_EXAMPLE" + language: "Dutch" + script: "Latin" + period: "1650 CE" + source_type: "church_register" + document_type: "Baptismal record" + features: + - Dutch Reformed Church records + - Golden Age naming conventions + - Godparent (getuige) system + - Artisan occupations + persons_extracted: 5 + + # --------------------------------------------------------------------------- + # Example 04: LinkedIn Profile (2025) + # 
--------------------------------------------------------------------------- + - file: "04_linkedin_profile.yaml" + example_id: "example_04_linkedin_modern" + title: "Modern LinkedIn Profile - Heritage Sector Professional" + data_status: "SYNTHETIC_EXAMPLE" + language: "English" + script: "Latin" + period: "2025 CE" + source_type: "social_media_profile" + document_type: "Professional profile" + features: + - Modern digital naming conventions + - Career trajectory extraction + - Heritage sector roles + - Digital platform metadata + persons_extracted: 1 + + # --------------------------------------------------------------------------- + # Example 05: Arabic Waqf Document (1312 AH) + # --------------------------------------------------------------------------- + - file: "05_arabic_waqf.yaml" + example_id: "example_05_arabic_waqf" + title: "Arabic Waqf Document - Cairo 1312 AH (1894 CE)" + data_status: "SYNTHETIC_EXAMPLE" + language: "Arabic" + script: "Arabic" + period: "1312 AH (1894 CE)" + source_type: "waqf_document" + document_type: "Islamic endowment deed" + features: + - Classical Arabic naming (ibn, bint) + - Honorific titles (Pasha, Bey, Effendi, Hanem) + - Hijri calendar + - Islamic legal terminology + persons_extracted: 6 + + # --------------------------------------------------------------------------- + # Example 06: Hebrew Ketubah (1742) - REAL DATA + # --------------------------------------------------------------------------- + - file: "06_hebrew_ketubah.yaml" + example_id: "example_06_hebrew_ketubah" + title: "Hebrew Marriage Contract (Ketubah) - Modena 1742" + data_status: "REAL_HISTORICAL_DATA" + language: "Hebrew" + script: "Hebrew" + period: "5502 AM (1742 CE)" + source_type: "ketubah" + document_type: "Jewish marriage contract" + archive: "Yale University Beinecke Library" + ark_id: "ark:/15534/c27p8thn" + features: + - Hebrew naming (ben, bat) + - Hebrew calendar + - Rabbinic titles (HaRav, Morenu) + - Ketubah legal formulae + persons_extracted: 6 + 
real_data_citation: | + Beinecke Rare Book and Manuscript Library, Yale University + General Collection, GEN MSS 1309 + Ketubah: Modena (Italy), 23 Sivan 5502 (June 12, 1742) + + # --------------------------------------------------------------------------- + # Example 07: Spanish Colonial Record (1540) + # --------------------------------------------------------------------------- + - file: "07_spanish_colonial.yaml" + example_id: "example_07_spanish_colonial" + title: "Spanish Colonial Encomienda Record - Nueva España 1540" + data_status: "SYNTHETIC_EXAMPLE" + language: "Spanish" + script: "Latin" + period: "1540 CE" + source_type: "colonial_record" + document_type: "Encomienda grant" + features: + - Spanish colonial naming + - Honorific titles (Don, Doña) + - Indigenous name recording + - Colonial administrative terminology + persons_extracted: 5 + + # --------------------------------------------------------------------------- + # Example 08: Italian Notarial Record (1492) + # --------------------------------------------------------------------------- + - file: "08_italian_notarial.yaml" + example_id: "example_08_italian_notarial" + title: "Italian Notarial Act - Florence 1492" + data_status: "SYNTHETIC_EXAMPLE" + language: "Italian/Latin" + script: "Latin" + period: "1492 CE" + source_type: "notarial_act" + document_type: "Marriage contract" + features: + - Renaissance Italian naming + - Latin legal formulae + - Florentine patronymics + - Notarial conventions + persons_extracted: 6 + + # --------------------------------------------------------------------------- + # Example 09: Greek Orthodox Baptism (1875) + # --------------------------------------------------------------------------- + - file: "09_greek_orthodox.yaml" + example_id: "example_09_greek_baptismal_register" + title: "Greek Orthodox Baptismal Register - Thessaloniki 1875" + data_status: "SYNTHETIC_EXAMPLE" + language: "Greek" + script: "Greek (polytonic)" + period: "1875 CE" + source_type: 
"baptismal_register" + document_type: "Church baptismal record" + features: + - Greek patronymics (του + genitive) + - Polytonic Greek orthography + - Godparent system (νονός/νονά) + - Deceased markers (μακαρίτης) + - Julian calendar + persons_extracted: 7 + + # --------------------------------------------------------------------------- + # Example 10: Russian Metrical Book (1894) - REAL DATA + # --------------------------------------------------------------------------- + - file: "10_russian_metrical.yaml" + example_id: "example_10_russian_metrical" + title: "Russian Imperial Metrical Book - Birth of Stefan Nowicki (1894)" + data_status: "REAL_HISTORICAL_DATA" + language: "Russian/Polish" + script: "Cyrillic" + period: "1894 CE" + source_type: "metrical_book" + document_type: "Birth registration" + archive: "Archiwum Państwowe w Poznaniu" + features: + - Cyrillic script with romanization + - Polish names in Russian + - Pre-revolutionary orthography (ъ, ѣ) + - Julian/Gregorian dual dating + - Восприемники (godparents) + persons_extracted: 6 + real_data_citation: | + Archiwum Państwowe w Poznaniu (State Archive in Poznań) + BYU Script Tutorial transcription + Russian Imperial metrical book, Nowiki village, 1894 + + # --------------------------------------------------------------------------- + # Example 11: Ottoman Sijill (1258 AH / 1842 CE) + # --------------------------------------------------------------------------- + - file: "11_ottoman_sijill.yaml" + example_id: "example_11_ottoman_sijill" + title: "Ottoman Court Record (Sijill) - Property Sale, Demirciköy 1258 AH" + data_status: "SYNTHETIC_EXAMPLE" + language: "Ottoman Turkish" + script: "Arabic" + period: "1258 AH (1842 CE)" + source_type: "sijill" + document_type: "Property sale (court record)" + features: + - Ottoman Turkish in Arabic script + - Honorific titles (Ağa, Efendi, Çelebi, Hatun) + - Arabic patronymics (bin, bint) + - Turkish patronymic (-oğlu) + - Hijri calendar + - Islamic court terminology + 
persons_extracted: 5 + +# ============================================================================= +# USAGE NOTES +# ============================================================================= + +usage_notes: | + These examples are designed for: + + 1. TRAINING: Use as training data for NER/extraction models + 2. TESTING: Validate extraction pipelines against known outputs + 3. DOCUMENTATION: Understand PiCo patterns for different document types + 4. REFERENCE: Language-specific naming convention guides + + IMPORTANT DISTINCTIONS: + - SYNTHETIC_EXAMPLE: Created for demonstration; names/dates are fictional + - REAL_HISTORICAL_DATA: Actual archival records with full provenance + + Each example includes: + - source_text: Original text in source language/script + - expected_extraction: Complete PiCo-compliant output + - [language]_naming_notes: Language-specific conventions + - provenance: Data status and related real sources + +# ============================================================================= +# RELATED RESOURCES +# ============================================================================= + +related_resources: + schema_files: + - "../schema/observation.yaml" + - "../schema/pnv_components.yaml" + - "../schema/relationships.yaml" + - "../schema/temporal.yaml" + parent_index: "../_index.yaml" + provenance_sources: "../../PROVENANCE_SOURCES.md" diff --git a/data/entity_annotation/modules/integrations/pico/schema/observation.yaml b/data/entity_annotation/modules/integrations/pico/schema/observation.yaml new file mode 100644 index 0000000000..11c9fae4dd --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/schema/observation.yaml @@ -0,0 +1,439 @@ +# ============================================================================= +# PiCo Integration Module: Observation Pattern +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/ +# Parent: 
_index.yaml +# +# Description: Core PiCo observation pattern and PersonObservation class. +# Defines the source-bound observation layer that captures +# person mentions exactly as they appear in sources. +# +# Last Updated: 2025-01-13 +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Core Observation Pattern +# ----------------------------------------------------------------------------- + +observation_pattern: + description: "Every person mention creates a PersonObservation" + class: "picom:PersonObservation" + class_uri: "https://w3id.org/pico/PersonObservation" + + properties: + - property: "picom:hasObservedName" + description: "The name string as it appears in text" + range: "pnv:PersonName" + cardinality: "1" + note: "Exact transcription of name from source" + + - property: "picom:isObservationOf" + description: "Links to reconstructed Person entity" + range: "crm:E21_Person" + cardinality: "0..1" + note: "May be null if person not yet identified" + + - property: "prov:hadPrimarySource" + description: "The source document/webpage" + range: "prov:Entity" + cardinality: "1" + note: "Required for provenance tracking" + + - property: "picom:observedAt" + description: "When the observation was made" + range: "xsd:dateTime" + cardinality: "1" + note: "Extraction timestamp, not document date" + + - property: "picom:observedInContext" + description: "Surrounding text context" + range: "xsd:string" + cardinality: "0..1" + note: "For disambiguation when reviewing" + + - property: "picom:hasRole" + description: "Role/position observed with the person" + range: "xsd:string" + cardinality: "0..*" + note: "Links to ROLE hypernym when extracted" + +# ----------------------------------------------------------------------------- +# Person Reconstruction Pattern +# ----------------------------------------------------------------------------- + 
+person_reconstruction_pattern: + description: | + A PersonReconstruction is created by linking one or more PersonObservations + to form a unified person entity. This is the scholarly interpretation layer + that connects source-bound observations to a conceptual person. + + Key distinction: + - PersonObservation: What is OBSERVED in a specific source (exact transcription) + - PersonReconstruction: What is INFERRED about the person (normalized, linked) + + A single PersonReconstruction may derive from observations across: + - Multiple sources (birth record + marriage record + death record) + - Different time periods (mentions across decades) + - Various name forms ("Jan Jansz" + "Johannes Jansen" + "J. Jansen") + + class: "pico:PersonReconstruction" + class_uri: "https://personsincontext.org/model#PersonReconstruction" + superclass: "pico:Person" + + required_properties: + - property: "prov:wasDerivedFrom" + description: "Links to source PersonObservation(s)" + range: "pico:PersonObservation" + cardinality: "1..*" + note: "Every reconstruction MUST link to at least one observation" + + - property: "prov:wasGeneratedBy" + description: "Links to the reconstruction Activity" + range: "prov:Activity" + cardinality: "1" + note: "Documents how/when/by whom reconstruction was created" + + optional_properties: + - property: "prov:wasRevisionOf" + description: "Links to previous version of this reconstruction" + range: "pico:PersonReconstruction" + cardinality: "0..1" + note: "For tracking updates to reconstructions over time" + + - property: "sdo:name" + description: "Normalized/preferred name form" + range: "xsd:string" + note: "The canonical name for this person" + + - property: "sdo:additionalName" + description: "Structured name following PNV" + range: "pnv:PersonName" + note: "Full name breakdown using Person Name Vocabulary" + + - property: "sdo:givenName" + description: "Given/first name" + range: "xsd:string" + + - property: "sdo:familyName" + description: 
"Family/surname" + range: "xsd:string" + + - property: "sdo:gender" + description: "Gender of the person" + range: "sdo:GenderType" + values: ["sdo:Male", "sdo:Female"] + + - property: "sdo:birthDate" + description: "Birth date (ISO 8601)" + range: "xsd:date" + note: "May be incomplete: YYYY, YYYY-MM, or YYYY-MM-DD" + + - property: "sdo:birthPlace" + description: "Place of birth" + range: "xsd:string or xsd:anyURI" + note: "Prefer linking to GeoNames or Wikidata" + + - property: "sdo:deathDate" + description: "Death date (ISO 8601)" + range: "xsd:date" + + - property: "sdo:deathPlace" + description: "Place of death" + range: "xsd:string or xsd:anyURI" + + example: + description: "PersonReconstruction derived from multiple observations" + turtle: | + cbg:person_reconstruction_anna_koppen + a pico:PersonReconstruction ; + sdo:name "Anna Maria Koppen" ; + sdo:familyName "Koppen" ; + sdo:givenName "Anna Maria" ; + sdo:gender sdo:Female ; + sdo:birthPlace "Haarlem" ; + sdo:birthDate "1860-03-31"^^xsd:date ; + sdo:deathPlace "Detroit, USA" ; + sdo:deathDate "1926"^^xsd:gYear ; + prov:wasDerivedFrom nha:marriage_1885_po_1 , + cbg:emigration_1887_po_1 , + us:death_1926_po_1 ; + prov:wasGeneratedBy cbg:reconstruction_activity_01 . + +# ----------------------------------------------------------------------------- +# Source and Scan Classes +# ----------------------------------------------------------------------------- + +source_classes: + + archive_component: + description: | + A Source document from which PersonObservations are extracted. + PiCo does not aim to fully describe archival sources (use RiC-O or DC for that), + but requires minimal identification for provenance tracking. 
+ + class: "sdo:ArchiveComponent" + class_uri: "https://schema.org/ArchiveComponent" + superclass: "sdo:CreativeWork" + + properties: + - property: "sdo:name" + description: "Identifying name for the source" + range: "xsd:string" + cardinality: "1" + note: "Combine title, date, archive location for identification" + example: "BS Marriage Haarlem, November 11, 1885, certificate number 321" + + - property: "sdo:additionalType" + description: "Type of source document" + range: "picot_sourcetypes:Concept" + note: "Use PiCo SourceType thesaurus" + + - property: "sdo:dateCreated" + description: "Date the source was created" + range: "xsd:date" + + - property: "sdo:holdingArchive" + description: "Institution holding the source" + range: "xsd:anyURI" + note: "Link to heritage custodian (GHCID or Wikidata)" + + - property: "sdo:url" + description: "Permalink to the source" + range: "sdo:URL" + note: "Preferably a persistent identifier" + + - property: "sdo:contentLocation" + description: "Geographic coverage of the source" + range: "xsd:string or xsd:anyURI" + + - property: "sdo:associatedMedia" + description: "Link to scan(s) of the source" + range: "sdo:ImageObject" + cardinality: "0..*" + + image_object: + description: | + A Scan of a source document. Links to the digital image at the holding archive. 
+ + class: "sdo:ImageObject" + class_uri: "https://schema.org/ImageObject" + superclass: "sdo:CreativeWork" + + properties: + - property: "sdo:url" + description: "URL to the full scan" + range: "sdo:URL" + note: "Preferably IIIF manifest" + + - property: "sdo:thumbnail" + description: "URL to thumbnail image" + range: "sdo:ImageObject" + + - property: "sdo:embedUrl" + description: "URL to image viewer" + range: "sdo:URL" + + - property: "sdo:position" + description: "Position in sequence of scans" + range: "xsd:int" + note: "For multi-page sources" + +# ----------------------------------------------------------------------------- +# Biographical Properties +# ----------------------------------------------------------------------------- + +biographical_properties: + description: | + Biographical properties capture personal details as they appear in sources. + These are used for both PersonObservation (source-bound) and + PersonReconstruction (normalized). + + age: + property: "pico:hasAge" + property_uri: "https://personsincontext.org/model#hasAge" + description: "Age of person as stated in source" + range: "xsd:string" + domain: "pico:PersonObservation" + note: | + Used when birth date unknown but age is recorded. + Age assumed in years unless specified ("4" = 4 years, "4 months" = 4 months). + Numerical preferred over text ("4" not "four"). 
+ examples: + - "30" + - "4 months" + - "about 25" + + religion: + property: "pico:hasReligion" + property_uri: "https://personsincontext.org/model#hasReligion" + description: "Religious affiliation as stated in source" + range: "xsd:string or xsd:anyURI" + domain: "pico:Person" + note: "Can link to SKOS thesaurus for religions" + examples: + - "Catholic" + - "Reformed" + - "Jewish" + + deceased: + property: "pico:deceased" + property_uri: "https://personsincontext.org/model#deceased" + description: "Indication that person is deceased (when death date unknown)" + range: "xsd:boolean" + domain: "pico:PersonObservation" + note: | + Only used when deathDate is unknown but death is indicated. + A person without deathDate and without deceased:true is assumed alive. + Important for privacy considerations in publishing person records. + + gender: + property: "sdo:gender" + property_uri: "https://schema.org/gender" + description: "Gender of the person" + range: "sdo:GenderType" + domain: "pico:Person" + values: + - uri: "sdo:Male" + label: "Male" + - uri: "sdo:Female" + label: "Female" + + address: + property: "sdo:address" + property_uri: "https://schema.org/address" + description: "Physical address as mentioned in source" + range: "xsd:string" + domain: "pico:PersonObservation" + note: "Address exactly as recorded in source" + + initials: + property: "pnv:initials" + property_uri: "https://w3id.org/pnv#initials" + description: "Initials of given name(s)" + range: "xsd:string" + domain: "pnv:PersonName" + note: "Each initial followed by period (e.g., 'P.R.', 'H.A.F.M.O.')" + examples: + - "P.R." + - "C.Joh." + - "H.A.F.M.O." 
+ +# ----------------------------------------------------------------------------- +# Hypernym Mapping (GLAM-NER v1.7.0) +# ----------------------------------------------------------------------------- + +hypernym_mapping: + description: "How PiCo concepts map to GLAM-NER v1.7.0 hypernyms" + + mappings: + - pico_class: "picom:PersonObservation" + glam_hypernym: "AGT.PER" + glam_code: "AGT.PER" + note: "Person observations create AGT.PER entities" + + - pico_class: "picom:PersonObservation" + glam_hypernym: "AGT.STF" + glam_code: "AGT.STF" + condition: "When observed with organizational role" + note: "Staff members with role context" + + - pico_class: "pnv:PersonName" + glam_hypernym: "APP.NAM" + glam_code: "APP.NAM" + note: "Name strings as appellations" + + - pico_class: "picom:hasRole" + glam_hypernym: "ROL" + glam_code: "ROL" + note: "Extracted roles link to ROL hypernym" + +# ----------------------------------------------------------------------------- +# Simple Examples +# ----------------------------------------------------------------------------- + +examples: + - description: "Staff member with title and role" + text: "Dr. Maria van den Berg, Director" + + observation: + type: "picom:PersonObservation" + id: "_:obs1" + + hasObservedName: + type: "pnv:PersonName" + literalName: "Dr. Maria van den Berg" + honorificPrefix: "Dr." + givenName: "Maria" + surnamePrefix: "van den" + baseSurname: "Berg" + + hasRole: "Director" + hadPrimarySource: "https://example.org/staff-page" + observedAt: "2025-12-02T10:30:00Z" + + glam_ner_annotations: + - span: "Dr. 
Maria van den Berg" + type: "AGT.STF" + code: "AGT.STF" + confidence: 0.95 + + - span: "Director" + type: "ROL.TIT" + code: "ROL.TIT" + confidence: 0.98 + + - description: "Historical artist" + text: "Rembrandt van Rijn painted this in 1642" + + observation: + type: "picom:PersonObservation" + id: "_:obs2" + + hasObservedName: + type: "pnv:PersonName" + literalName: "Rembrandt van Rijn" + givenName: "Rembrandt" + surnamePrefix: "van" + baseSurname: "Rijn" + + isObservationOf: "wd:Q5598" # Wikidata Rembrandt + hadPrimarySource: "https://example.org/artwork-page" + observedAt: "2025-12-02T10:35:00Z" + + glam_ner_annotations: + - span: "Rembrandt van Rijn" + type: "AGT.PER" + code: "AGT.PER" + confidence: 0.99 + linking: + wikidata: "Q5598" + viaf: "64013650" + + - description: "Nobility title" + text: "Count Willem van Loon" + + observation: + type: "picom:PersonObservation" + id: "_:obs3" + + hasObservedName: + type: "pnv:PersonName" + literalName: "Count Willem van Loon" + honorificPrefix: "Count" + givenName: "Willem" + surnamePrefix: "van" + baseSurname: "Loon" + + hadPrimarySource: "https://example.org/archive-doc" + observedAt: "2025-12-02T10:40:00Z" + + glam_ner_annotations: + - span: "Count Willem van Loon" + type: "AGT.PER" + code: "AGT.PER" + confidence: 0.95 + + - span: "Count" + type: "ROL.HON" + code: "ROL.HON" + note: "Nobility title - honorific role" diff --git a/data/entity_annotation/modules/integrations/pico/schema/pnv_components.yaml b/data/entity_annotation/modules/integrations/pico/schema/pnv_components.yaml new file mode 100644 index 0000000000..b80217ca4f --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/schema/pnv_components.yaml @@ -0,0 +1,439 @@ +# ============================================================================= +# PiCo Integration Module: Person Name Vocabulary (PNV) +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/ 
+# Parent: _index.yaml +# +# Description: Person Name Vocabulary (PNV) provides structured name components. +# This enables proper parsing of complex name structures across cultures. +# +# References: +# - PNV: https://w3id.org/pnv +# - PNV Specification: https://w3id.org/pnv/doc/v2 +# +# Last Updated: 2025-01-13 +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Person Name Vocabulary (PNV) +# ----------------------------------------------------------------------------- + +pnv_name_structure: + description: | + Person Name Vocabulary (PNV) provides structured name components. + This enables proper parsing of complex name structures across cultures. + + class: "pnv:PersonName" + class_uri: "https://w3id.org/pnv#PersonName" + + components: + - property: "pnv:literalName" + description: "Full name as single string" + examples: + - "Dr. Maria van den Berg" + - "Rembrandt Harmenszoon van Rijn" + - "Queen Elizabeth II" + note: "Original string before parsing" + + - property: "pnv:givenName" + description: "First/given name" + examples: + - "Rembrandt" + - "Maria" + - "Jan" + - "Elizabeth" + note: "Personal name, not surname" + + - property: "pnv:patronym" + description: "Patronymic name component" + examples: + - "Harmenszoon" + - "Janszoon" + - "Pietersdochter" + note: "Common in Dutch, Scandinavian, Slavic names" + + - property: "pnv:surnamePrefix" + description: "Prefix to surname (tussenvoegsel)" + examples: + - "van" + - "de" + - "van den" + - "van der" + - "op de" + - "'t" + - "von" + - "di" + note: "Language-specific, affects sorting" + + - property: "pnv:baseSurname" + description: "Core surname without prefix" + examples: + - "Rijn" + - "Berg" + - "Velde" + - "Gogh" + note: "Primary sorting component in Dutch" + + - property: "pnv:honorificPrefix" + description: "Title or honorific before name" + examples: + - "Dr." + - "Prof." + - "Prof. dr."
+ - "Sir" + - "Queen" + - "Mr." + - "Drs." + - "Ir." + note: "May indicate role - link to ROL" + + - property: "pnv:honorificSuffix" + description: "Title or honorific after name" + examples: + - "PhD" + - "Jr." + - "III" + - "MD" + - "RA" + - "MSc" + note: "Credentials and generational markers" + + - property: "pnv:infixTitle" + description: "Title within name structure" + examples: + - "graaf van" + - "baron de" + - "duke of" + note: "Nobility titles embedded in name" + + - property: "pnv:initials" + description: "Initials of given name(s)" + examples: + - "P.R." + - "C.Joh." + - "H.A.F.M.O." + note: "Each initial followed by period" + +# ----------------------------------------------------------------------------- +# Dutch Name Conventions +# ----------------------------------------------------------------------------- + +dutch_name_patterns: + description: | + Special handling for Dutch names with tussenvoegsels (surname prefixes). + Dutch sorting rules differ from other languages. + + tussenvoegsel_list: + - "van" + - "van de" + - "van den" + - "van der" + - "de" + - "den" + - "het" + - "'t" + - "ter" + - "ten" + - "op de" + - "op den" + - "in 't" + - "in de" + + sorting_rule: | + In Dutch, surnames sort by baseSurname, ignoring tussenvoegsel. + "Vincent van Gogh" sorts under "G" not "V". + "Maria van den Berg" sorts under "B" not "V". + + capitalization_rule: | + Tussenvoegsel lowercase when preceded by given name: + - "Vincent van Gogh" (not "Vincent Van Gogh") + - "Van Gogh" (surname alone, capitalized) + - "de heer Van Gogh" (formal, capitalized) + +# ----------------------------------------------------------------------------- +# Arabic Name Conventions +# ----------------------------------------------------------------------------- + +arabic_name_patterns: + description: | + Arabic names follow complex conventions with multiple components: + nasab (patronymic), nisba (geographic/tribal), kunya (teknonym), laqab (title/epithet). 
+ + components: + nasab: + description: "Patronymic chain using ibn/bin (son) or bint (daughter)" + examples: + - "محمد بن علي بن حسن" + - "Muhammad ibn Ali ibn Hasan" + note: "Can extend multiple generations" + + nisba: + description: "Geographic or tribal affiliation (adjective form, ends in -i)" + examples: + - "البغدادي (al-Baghdadi)" + - "المصري (al-Misri)" + - "الهاشمي (al-Hashimi)" + + kunya: + description: "Teknonym (Abu/Umm + child's name)" + examples: + - "أبو عبد الله (Abu Abdullah)" + - "أم كلثوم (Umm Kulthum)" + note: "Often used as primary form of address" + + laqab: + description: "Title, epithet, or nickname" + examples: + - "الرشيد (al-Rashid - the rightly guided)" + - "المأمون (al-Ma'mun - the trustworthy)" + + parsing_order: | + Traditional order: kunya - ism - nasab - laqab - nisba + Example: Abu Bakr Muhammad ibn Zakariyya al-Razi + - Kunya: Abu Bakr (father of Bakr) + - Ism: Muhammad (given name) + - Nasab: ibn Zakariyya (son of Zakariyya) + - Nisba: al-Razi (from Ray, city in Persia) + +# ----------------------------------------------------------------------------- +# Hebrew Name Conventions +# ----------------------------------------------------------------------------- + +hebrew_name_patterns: + description: | + Hebrew names, especially in religious and historical documents, follow + specific conventions with patronymics and honorifics. 
+ + components: + given_name: + description: "First name (shem)" + examples: + - "משה (Moshe/Moses)" + - "רבקה (Rivkah/Rebecca)" + + patronymic: + description: "Son/daughter of (ben/bat)" + examples: + - "משה בן אברהם (Moshe ben Avraham)" + - "רבקה בת יעקב (Rivkah bat Ya'akov)" + note: "ben for males, bat for females" + + honorifics: + examples: + - "ר' (Rabbi)" + - "הרב (HaRav - the Rabbi)" + - "מו\"ר (Morenu - our teacher)" + - "ז\"ל (zikhrono livrakha - of blessed memory)" + - "ע\"ה (alav hashalom - peace be upon him)" + + ketubah_conventions: + description: "Special naming in marriage contracts" + notes: + - "Full patronymics required for both parties" + - "Honorifics for fathers; parties designated by role (החתן = the groom, הכלה = the bride)" + - "Geographic origin often included" + - "Hebrew date format (day of month, month, year from creation)" + +# ----------------------------------------------------------------------------- +# Spanish Colonial Name Conventions +# ----------------------------------------------------------------------------- + +spanish_name_patterns: + description: | + Spanish naming conventions, including colonial-era patterns with + double surnames and titles.
+ + components: + given_names: + description: "First and middle names (often religious)" + examples: + - "María Guadalupe" + - "José Antonio" + - "Juan Pablo" + + paternal_surname: + description: "Father's family name (apellido paterno)" + note: "Listed first in double surname" + + maternal_surname: + description: "Mother's maiden family name (apellido materno)" + note: "Listed second in double surname" + + particles: + examples: + - "de" + - "de la" + - "del" + note: "May indicate nobility or geographic origin" + + titles: + examples: + - "Don/Doña" + - "Señor/Señora" + - "Fray (friar)" + - "Sor (sister)" + + colonial_patterns: + notes: + - "Racial designations (español, mestizo, indio, mulato) often recorded" + - "Parish affiliation important" + - "Godparents (padrinos) always named" + - "Legitimacy (hijo legítimo/natural) specified" + +# ----------------------------------------------------------------------------- +# Italian Name Conventions +# ----------------------------------------------------------------------------- + +italian_name_patterns: + description: | + Italian naming conventions with notarial and nobility elements. 
+ + components: + given_name: + description: "Nome proprio" + note: "Often saints' names" + + surname: + description: "Cognome" + note: "May derive from patronymics, locations, or professions" + + particles: + examples: + - "di" + - "del" + - "della" + - "dei" + - "da" + note: "May indicate origin or noble lineage" + + honorifics: + examples: + - "Signore/Signora" + - "Messer (medieval)" + - "Ser (notarial)" + - "Conte/Contessa" + - "Marchese/Marchesa" + + notarial_conventions: + notes: + - "Father's name in genitive: 'figlio di Giovanni'" + - "Profession often stated: 'mercante', 'notaio'" + - "Parish or neighborhood: 'della parrocchia di San Marco'" + - "Legal capacity: 'maggiore d'età' (of legal age)" + +# ----------------------------------------------------------------------------- +# Greek Name Conventions +# ----------------------------------------------------------------------------- + +greek_name_patterns: + description: | + Greek Orthodox naming conventions with genitive patronymics. 
+ + components: + given_name: + description: "First name (often saint's name)" + examples: + - "Κωνσταντίνος (Konstantinos)" + - "Μαρία (Maria)" + + patronymic: + description: "Father's name in genitive case" + examples: + - "του Νικολάου (tou Nikolaou - son of Nikolaos)" + - "του Δημητρίου (tou Dimitriou)" + note: "Genitive case indicates 'of' or 'belonging to'" + + surname: + description: "Family name" + examples: + - "Παπαδόπουλος (Papadopoulos)" + - "Αντωνίου (Antoniou)" + note: "May be patronymic origin (-opoulos, -ou, -ides)" + + honorifics: + examples: + - "Κύριος/Κυρία (Kyrios/Kyria - Mr./Mrs.)" + - "Πατήρ (Patir - Father, for clergy)" + - "Παπα- (Papa- - prefix for priests)" + + orthodox_conventions: + notes: + - "Name day (onomastics) important in Greek culture" + - "Multiple given names common" + - "Grandparents' names often passed down" + +# ----------------------------------------------------------------------------- +# Russian/Cyrillic Name Conventions +# ----------------------------------------------------------------------------- + +russian_name_patterns: + description: | + Russian naming conventions with formal patronymics. 
+ + components: + given_name: + description: "First name (имя)" + examples: + - "Иван (Ivan)" + - "Мария (Maria)" + + patronymic: + description: "Father's name + suffix (отчество)" + examples: + - "Петрович (Petrovich - son of Pyotr)" + - "Петровна (Petrovna - daughter of Pyotr)" + note: "-ovich/-evich for males, -ovna/-evna for females" + + surname: + description: "Family name (фамилия)" + note: "Gendered: -ov/-ova, -in/-ina, -sky/-skaya" + + formal_usage: + notes: + - "Formal address: given name + patronymic" + - "Informal: given name or diminutive" + - "Full official: surname, given name, patronymic" + +# ----------------------------------------------------------------------------- +# Ottoman Turkish Name Conventions +# ----------------------------------------------------------------------------- + +ottoman_name_patterns: + description: | + Ottoman Turkish naming conventions blending Arabic and Turkish elements. + + components: + given_name: + description: "Primary name (often Arabic origin)" + examples: + - "Mehmed" + - "Ahmed" + - "Fatma" + + patronymic: + description: "Father's name with 'oğlu' (son of) or 'kızı' (daughter of)" + examples: + - "Ali oğlu Mehmed" + - "Hasan oğlu Ahmed" + + epithet: + description: "Title or descriptor (laqab)" + examples: + - "Paşa (Pasha)" + - "Efendi" + - "Ağa" + - "Bey" + - "Hatun/Hanım (for women)" + + nisba: + description: "Geographic origin or profession" + examples: + - "Kayserili (from Kayseri)" + - "Bakkal (grocer)" + + sijill_conventions: + notes: + - "Court records (sicil) use formal full names" + - "Witnesses identified by profession and address" + - "Deceased marked as 'merhum/merhume'" + - "Non-Muslims identified by religious community (millet)" diff --git a/data/entity_annotation/modules/integrations/pico/schema/relationships.yaml b/data/entity_annotation/modules/integrations/pico/schema/relationships.yaml new file mode 100644 index 0000000000..299bac70f6 --- /dev/null +++ 
b/data/entity_annotation/modules/integrations/pico/schema/relationships.yaml @@ -0,0 +1,517 @@ +# ============================================================================= +# PiCo Integration Module: Family and Social Relationships +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/ +# Parent: _index.yaml +# +# Description: Family relationship properties for genealogical data. +# Enables modeling complex family structures from historical records. +# +# Last Updated: 2025-01-13 +# ============================================================================= + +family_relationships: + description: | + Family relationship properties link persons within and across sources. + + Rules: + - For PersonObservations: relationships refer to OTHER observations on SAME source + - For PersonReconstructions: relationships refer to other reconstructions + + Property characteristics: + - Symmetric: If A hasRelation B, then B hasRelation A (spouses, siblings, cousins) + - Transitive: hasAncestor/hasDescendant chain through generations + - Inverse pairs: parent/children, grandparent/grandchild, etc. 
+ + # --------------------------------------------------------------------------- + # Core Family (Schema.org) + # --------------------------------------------------------------------------- + + core_relationships: + - property: "sdo:parent" + property_uri: "https://schema.org/parent" + description: "A parent of the person" + inverse: "sdo:children" + subPropertyOf: ["sdo:relatedTo", "pico:hasAncestor"] + note: "Biological or legal parent" + + - property: "sdo:children" + property_uri: "https://schema.org/children" + description: "A child of the person" + inverse: "sdo:parent" + subPropertyOf: ["sdo:relatedTo", "pico:hasDescendant"] + + - property: "sdo:spouse" + property_uri: "https://schema.org/spouse" + description: "The person's spouse" + symmetric: true + subPropertyOf: "sdo:relatedTo" + + - property: "sdo:sibling" + property_uri: "https://schema.org/sibling" + description: "A brother or sister" + symmetric: true + subPropertyOf: "sdo:relatedTo" + + # --------------------------------------------------------------------------- + # Transitive Ancestry (PiCo) + # --------------------------------------------------------------------------- + + ancestry_relationships: + - property: "pico:hasAncestor" + property_uri: "https://personsincontext.org/model#hasAncestor" + description: "Any ancestor (parent, grandparent, etc.)" + type: "owl:TransitiveProperty" + inverse: "pico:hasDescendant" + note: "Not used directly; parent→parent chains automatically create ancestors" + + - property: "pico:hasDescendant" + property_uri: "https://personsincontext.org/model#hasDescendant" + description: "Any descendant (child, grandchild, etc.)" + type: "owl:TransitiveProperty" + inverse: "pico:hasAncestor" + + # --------------------------------------------------------------------------- + # Grandparents/Grandchildren + # --------------------------------------------------------------------------- + + grandparent_relationships: + - property: "pico:hasGrandparent" + property_uri: 
"https://personsincontext.org/model#hasGrandparent" + inverse: "pico:hasGrandchild" + + - property: "pico:hasGrandchild" + property_uri: "https://personsincontext.org/model#hasGrandchild" + inverse: "pico:hasGrandparent" + + - property: "pico:hasGreat-grandparent" + property_uri: "https://personsincontext.org/model#hasGreat-grandparent" + inverse: "pico:hasGreat-grandchild" + + - property: "pico:hasGreat-grandchild" + property_uri: "https://personsincontext.org/model#hasGreat-grandchild" + inverse: "pico:hasGreat-grandparent" + + # --------------------------------------------------------------------------- + # Aunts/Uncles and Nieces/Nephews + # --------------------------------------------------------------------------- + + extended_family: + - property: "pico:hasUncle_Aunt" + property_uri: "https://personsincontext.org/model#hasUncle_Aunt" + description: "An uncle or aunt (sibling of parent)" + inverse: "pico:hasNephew_Niece" + + - property: "pico:hasNephew_Niece" + property_uri: "https://personsincontext.org/model#hasNephew_Niece" + description: "A nephew or niece (child of sibling)" + inverse: "pico:hasUncle_Aunt" + + - property: "pico:hasCousin" + property_uri: "https://personsincontext.org/model#hasCousin" + description: "A cousin (child of parent's sibling)" + symmetric: true + + # --------------------------------------------------------------------------- + # Step-family + # --------------------------------------------------------------------------- + + step_relationships: + - property: "pico:hasStepparent" + property_uri: "https://personsincontext.org/model#hasStepparent" + description: "A stepparent (spouse of biological parent)" + inverse: "pico:hasStepchild" + + - property: "pico:hasStepchild" + property_uri: "https://personsincontext.org/model#hasStepchild" + inverse: "pico:hasStepparent" + + - property: "pico:hasStepsibling" + property_uri: "https://personsincontext.org/model#hasStepsibling" + description: "A stepbrother or stepsister" + symmetric: 
true + + - property: "pico:hasHalf-sibling" + property_uri: "https://personsincontext.org/model#hasHalf-sibling" + description: "A half-brother or half-sister (one shared parent)" + symmetric: true + + # --------------------------------------------------------------------------- + # Foster/Godparent + # --------------------------------------------------------------------------- + + non_biological_relationships: + - property: "pico:hasFosterParent" + property_uri: "https://personsincontext.org/model#hasFosterParent" + inverse: "pico:hasFosterChild" + + - property: "pico:hasFosterChild" + property_uri: "https://personsincontext.org/model#hasFosterChild" + inverse: "pico:hasFosterParent" + + - property: "pico:hasGodparent" + property_uri: "https://personsincontext.org/model#hasGodparent" + description: "A godparent (witness at baptism)" + inverse: "pico:hasGodchild" + + - property: "pico:hasGodchild" + property_uri: "https://personsincontext.org/model#hasGodchild" + inverse: "pico:hasGodparent" + + - property: "pico:hasLegitimizedChild" + property_uri: "https://personsincontext.org/model#hasLegitimizedChild" + description: "A child legitimized by marriage or legal recognition" + inverse: "pico:isLegitimitezedChildOf" + + - property: "pico:isLegitimitezedChildOf" + property_uri: "https://personsincontext.org/model#isLegitimitezedChildOf" + inverse: "pico:hasLegitimizedChild" + + # --------------------------------------------------------------------------- + # In-Laws + # --------------------------------------------------------------------------- + + in_law_relationships: + - property: "pico:hasParent-in-law" + property_uri: "https://personsincontext.org/model#hasParent-in-law" + inverse: "pico:hasChild-in-law" + + - property: "pico:hasChild-in-law" + property_uri: "https://personsincontext.org/model#hasChild-in-law" + inverse: "pico:hasParent-in-law" + + - property: "pico:hasSibling-in-law" + property_uri: "https://personsincontext.org/model#hasSibling-in-law" + 
description: "Brother/sister-in-law" + symmetric: true + + - property: "pico:hasGrandparent-in-law" + property_uri: "https://personsincontext.org/model#hasGrandparent-in-law" + inverse: "pico:hasGrandchild-in-law" + + - property: "pico:hasGrandchild-in-law" + property_uri: "https://personsincontext.org/model#hasGrandchild-in-law" + inverse: "pico:hasGrandparent-in-law" + + - property: "pico:hasUncle_Aunt-in-law" + property_uri: "https://personsincontext.org/model#hasUncle_Aunt-in-law" + inverse: "pico:hasNephew_Niece-in-law" + + - property: "pico:hasNephew_Niece-in-law" + property_uri: "https://personsincontext.org/model#hasNephew_Niece-in-law" + inverse: "pico:hasUncle_Aunt-in-law" + + - property: "pico:hasCousin-in-law" + property_uri: "https://personsincontext.org/model#hasCousin-in-law" + symmetric: true + + - property: "pico:hasStepparent-in-law" + property_uri: "https://personsincontext.org/model#hasStepparent-in-law" + inverse: "pico:hasStepchild-in-law" + + - property: "pico:hasStepchild-in-law" + property_uri: "https://personsincontext.org/model#hasStepchild-in-law" + inverse: "pico:hasStepparent-in-law" + + # --------------------------------------------------------------------------- + # Former Partners + # --------------------------------------------------------------------------- + + former_partner_relationships: + - property: "pico:isWidOf" + property_uri: "https://personsincontext.org/model#isWidOf" + description: "Is widow/widower of deceased spouse" + note: "The subject is the surviving partner" + + - property: "pico:hasPreviousPartner" + property_uri: "https://personsincontext.org/model#hasPreviousPartner" + description: "A former spouse or partner" + symmetric: true + +# ----------------------------------------------------------------------------- +# Historical Relationship Indicators by Language +# ----------------------------------------------------------------------------- + +historical_relationship_patterns: + description: | + Common 
relationship indicators in historical documents by language. + Use these patterns to identify family relationships in source texts. + + dutch: + description: "Dutch relationship indicators" + patterns: + - pattern: "huijsvrou van" + meaning: "wife of" + relationship: "spouse" + - pattern: "zoon van" + meaning: "son of" + relationship: "parent" + - pattern: "dochter van" + meaning: "daughter of" + relationship: "parent" + - pattern: "weduwe van" + meaning: "widow of" + relationship: "widow_of" + - pattern: "weduwnaar van" + meaning: "widower of" + relationship: "widow_of" + - pattern: "peter" + meaning: "godfather" + relationship: "godparent" + - pattern: "meter" + meaning: "godmother" + relationship: "godparent" + - pattern: "getuige" + meaning: "witness" + relationship: "witness" + - pattern: "broeder van" + meaning: "brother of" + relationship: "sibling" + - pattern: "zuster van" + meaning: "sister of" + relationship: "sibling" + + latin: + description: "Latin relationship indicators (common in church records)" + patterns: + - pattern: "filius" + meaning: "son" + relationship: "parent" + - pattern: "filia" + meaning: "daughter" + relationship: "parent" + - pattern: "uxor" + meaning: "wife" + relationship: "spouse" + - pattern: "maritus" + meaning: "husband" + relationship: "spouse" + - pattern: "vidua" + meaning: "widow" + relationship: "widow_of" + - pattern: "viduus" + meaning: "widower" + relationship: "widow_of" + - pattern: "quondam" + meaning: "the late" + relationship: "deceased_marker" + - pattern: "patrinus" + meaning: "godfather" + relationship: "godparent" + - pattern: "matrina" + meaning: "godmother" + relationship: "godparent" + - pattern: "testis" + meaning: "witness" + relationship: "witness" + + german: + description: "German relationship indicators" + patterns: + - pattern: "Ehefrau von" + meaning: "wife of" + relationship: "spouse" + - pattern: "Ehemann von" + meaning: "husband of" + relationship: "spouse" + - pattern: "Sohn von" + meaning: "son 
of" + relationship: "parent" + - pattern: "Tochter von" + meaning: "daughter of" + relationship: "parent" + - pattern: "Witwe von" + meaning: "widow of" + relationship: "widow_of" + - pattern: "Witwer von" + meaning: "widower of" + relationship: "widow_of" + - pattern: "Taufpate" + meaning: "godfather" + relationship: "godparent" + - pattern: "Taufpatin" + meaning: "godmother" + relationship: "godparent" + + french: + description: "French relationship indicators" + patterns: + - pattern: "fils de" + meaning: "son of" + relationship: "parent" + - pattern: "fille de" + meaning: "daughter of" + relationship: "parent" + - pattern: "épouse de" + meaning: "wife of" + relationship: "spouse" + - pattern: "époux de" + meaning: "husband of" + relationship: "spouse" + - pattern: "veuve de" + meaning: "widow of" + relationship: "widow_of" + - pattern: "veuf de" + meaning: "widower of" + relationship: "widow_of" + - pattern: "feu" + meaning: "the late (m)" + relationship: "deceased_marker" + - pattern: "feue" + meaning: "the late (f)" + relationship: "deceased_marker" + - pattern: "parrain" + meaning: "godfather" + relationship: "godparent" + - pattern: "marraine" + meaning: "godmother" + relationship: "godparent" + + arabic: + description: "Arabic relationship indicators" + patterns: + - pattern: "ابن" + transliteration: "ibn" + meaning: "son of" + relationship: "parent" + - pattern: "بن" + transliteration: "bin" + meaning: "son of (shorter form)" + relationship: "parent" + - pattern: "بنت" + transliteration: "bint" + meaning: "daughter of" + relationship: "parent" + - pattern: "زوج" + transliteration: "zawj" + meaning: "husband" + relationship: "spouse" + - pattern: "زوجة" + transliteration: "zawja" + meaning: "wife" + relationship: "spouse" + - pattern: "أرملة" + transliteration: "armala" + meaning: "widow" + relationship: "widow_of" + - pattern: "المرحوم" + transliteration: "al-marhum" + meaning: "the late (m)" + relationship: "deceased_marker" + - pattern: "المرحومة" + 
transliteration: "al-marhuma" + meaning: "the late (f)" + relationship: "deceased_marker" + - pattern: "آل" + transliteration: "Al" + meaning: "family of" + relationship: "family_marker" + + hebrew: + description: "Hebrew relationship indicators" + patterns: + - pattern: "בן" + transliteration: "ben" + meaning: "son of" + relationship: "parent" + - pattern: "בת" + transliteration: "bat" + meaning: "daughter of" + relationship: "parent" + - pattern: "אשת" + transliteration: "eshet" + meaning: "wife of" + relationship: "spouse" + - pattern: "אלמנה" + transliteration: "almana" + meaning: "widow" + relationship: "widow_of" + - pattern: "ז״ל" + transliteration: "z\"l" + meaning: "of blessed memory" + relationship: "deceased_marker" + - pattern: "ע״ה" + transliteration: "a\"h" + meaning: "peace be upon him/her" + relationship: "deceased_marker" + + spanish: + description: "Spanish relationship indicators" + patterns: + - pattern: "hijo de" + meaning: "son of" + relationship: "parent" + - pattern: "hija de" + meaning: "daughter of" + relationship: "parent" + - pattern: "esposa de" + meaning: "wife of" + relationship: "spouse" + - pattern: "esposo de" + meaning: "husband of" + relationship: "spouse" + - pattern: "viuda de" + meaning: "widow of" + relationship: "widow_of" + - pattern: "viudo de" + meaning: "widower of" + relationship: "widow_of" + - pattern: "padrino" + meaning: "godfather" + relationship: "godparent" + - pattern: "madrina" + meaning: "godmother" + relationship: "godparent" + - pattern: "hijo legítimo" + meaning: "legitimate son" + relationship: "legitimacy_marker" + - pattern: "hijo natural" + meaning: "illegitimate son" + relationship: "legitimacy_marker" + + portuguese: + description: "Portuguese relationship indicators" + patterns: + - pattern: "filho de" + meaning: "son of" + relationship: "parent" + - pattern: "filha de" + meaning: "daughter of" + relationship: "parent" + - pattern: "esposa de" + meaning: "wife of" + relationship: "spouse" + - 
pattern: "esposo de" + meaning: "husband of" + relationship: "spouse" + - pattern: "viúva de" + meaning: "widow of" + relationship: "widow_of" + - pattern: "viúvo de" + meaning: "widower of" + relationship: "widow_of" + - pattern: "padrinho" + meaning: "godfather" + relationship: "godparent" + - pattern: "madrinha" + meaning: "godmother" + relationship: "godparent" + + ottoman_turkish: + description: "Ottoman Turkish relationship indicators" + patterns: + - pattern: "oğlu" + meaning: "son of" + relationship: "parent" + - pattern: "kızı" + meaning: "daughter of" + relationship: "parent" + - pattern: "zevcesi" + meaning: "wife" + relationship: "spouse" + - pattern: "merhum" + meaning: "the late (m)" + relationship: "deceased_marker" + - pattern: "merhume" + meaning: "the late (f)" + relationship: "deceased_marker" diff --git a/data/entity_annotation/modules/integrations/pico/schema/temporal.yaml b/data/entity_annotation/modules/integrations/pico/schema/temporal.yaml new file mode 100644 index 0000000000..35b7759cc6 --- /dev/null +++ b/data/entity_annotation/modules/integrations/pico/schema/temporal.yaml @@ -0,0 +1,570 @@ +# ============================================================================= +# PiCo Integration Module: Temporal Patterns & Calendar Systems +# ============================================================================= +# Part of: data/entity_annotation/modules/integrations/pico/ +# Parent: _index.yaml +# +# Description: Temporal expression handling, calendar systems, date normalization, +# and PROV-O provenance model for tracking observation/reconstruction +# activities. +# +# Last Updated: 2025-12-12 +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Calendar Systems +# ----------------------------------------------------------------------------- +# Historical documents use various calendar systems. 
This section defines +# how to handle and normalize dates from different calendrical traditions. + +calendar_systems: + description: | + Historical sources use diverse calendar systems depending on culture, + religion, and time period. Proper extraction requires: + 1. Identifying the source calendar + 2. Preserving the original date expression + 3. Providing normalized ISO 8601 equivalents where possible + + supported_calendars: + + gregorian: + id: "gregorian" + label: "Gregorian Calendar" + uri: "https://www.wikidata.org/wiki/Q12138" + description: | + The civil calendar used worldwide since 1582 (Catholic countries) + or later (Protestant/Orthodox countries). + adoption_dates: + catholic: "1582-10-15" + protestant: "1700-03-01" + british_empire: "1752-09-14" + russia: "1918-02-14" + greece: "1923-03-01" + usage_notes: | + - Default for modern documents + - Used in civil registrations after adoption + - Standard for ISO 8601 normalization + example: + original: "15 October 1582" + normalized: "1582-10-15" + + julian: + id: "julian" + label: "Julian Calendar" + uri: "https://www.wikidata.org/wiki/Q11184" + description: | + Calendar introduced by Julius Caesar in 45 BCE. Used in Europe + until Gregorian reform, and by Eastern Orthodox churches today. + offset_from_gregorian: + 16th_century: 10 + 17th_century: 10 + 18th_century: 11 + 19th_century: 12 + 20th_century: 13 + 21st_century: 13 + usage_notes: | + - Greek Orthodox Church records use Julian calendar + - Russian Empire used Julian until 1918 + - Dual dating common in transition periods + - Format: "Julian date / Gregorian date" or "O.S./N.S." 
notation + example: + original: "14 March 1875 (O.S.)" + gregorian_equivalent: "26 March 1875" + normalized: "1875-03-26" + note: "Greek Orthodox used Julian; Gregorian equivalent calculated (Julian + 12 days in the 19th century)" + + hijri: + id: "hijri" + label: "Islamic/Hijri Calendar" + uri: "https://www.wikidata.org/wiki/Q28892" + alternative_names: + - "Islamic Calendar" + - "Muslim Calendar" + - "Lunar Hijri" + - "Anno Hegirae (AH)" + description: | + Lunar calendar used in Islamic societies. Year 1 = 622 CE (Hijra). + 354 or 355 days per year (12 lunar months). + months: + 1: "Muharram" + 2: "Safar" + 3: "Rabi' al-Awwal" + 4: "Rabi' al-Thani" + 5: "Jumada al-Awwal" + 6: "Jumada al-Thani" + 7: "Rajab" + 8: "Sha'ban" + 9: "Ramadan" + 10: "Shawwal" + 11: "Dhu al-Qa'dah" + 12: "Dhu al-Hijjah" + usage_notes: | + - Ottoman Empire, Waqf documents, Sijill records + - Year conversion: Gregorian = (Hijri * 0.97) + 622 + - Month-level precision often sufficient + - Some documents use both Hijri and local calendars + example: + original: "month of Rajab, year 1225 Hijri" + normalized: "1810-08" + note: "Approximate month (Rajab 1225 AH began c. 2 August 1810) - exact day unknown" + + hebrew: + id: "hebrew" + label: "Hebrew Calendar" + uri: "https://www.wikidata.org/wiki/Q9644" + alternative_names: + - "Jewish Calendar" + - "Anno Mundi" + description: | + Lunisolar calendar used in Jewish religious and civil life. + Year 1 = 3761 BCE (traditional Creation date). 
+ months: + 1: "Nisan" + 2: "Iyar" + 3: "Sivan" + 4: "Tammuz" + 5: "Av" + 6: "Elul" + 7: "Tishrei" + 8: "Cheshvan" + 9: "Kislev" + 10: "Tevet" + 11: "Shevat" + 12: "Adar" + usage_notes: | + - Ketubot (marriage contracts) + - Get (divorce documents) + - Synagogue records + - Year conversion: Gregorian = Hebrew - 3760 (approx) + - Month names often transliterated in various ways + example: + original: "23 Elul 5656" + normalized: "1896-09-01" + note: "Hebrew date from Creation (anno mundi)" + + french_republican: + id: "french_republican" + label: "French Republican Calendar" + uri: "https://www.wikidata.org/wiki/Q181974" + description: | + Calendar used in France 1793-1805. Year 1 = 1792 CE. + 12 months of 30 days + 5-6 supplementary days. + months: + 1: "Vendemiaire" + 2: "Brumaire" + 3: "Frimaire" + 4: "Nivose" + 5: "Pluviose" + 6: "Ventose" + 7: "Germinal" + 8: "Floreal" + 9: "Prairial" + 10: "Messidor" + 11: "Thermidor" + 12: "Fructidor" + usage_notes: | + - French civil registrations 1793-1805 + - Some Belgian/Dutch territories + - Conversion tables widely available + example: + original: "14 Vendemiaire an IV" + normalized: "1795-10-06" + + chinese: + id: "chinese" + label: "Chinese Calendar" + uri: "https://www.wikidata.org/wiki/Q32823" + description: | + Lunisolar calendar used in China and East Asia. + Combines 60-year cycle with lunar months. + usage_notes: | + - Emperor reign year + lunar month + day + - Gregorian adopted 1912 (Republic of China) + - Traditional dates still used for festivals + example: + original: "Guangxu 22, 8th month, 15th day" + normalized: "1896-09-21" + +# ----------------------------------------------------------------------------- +# Date Expression Patterns +# ----------------------------------------------------------------------------- + +date_expression_patterns: + description: | + Common patterns for expressing dates in historical sources. + GLM annotators should recognize these patterns and extract: + 1. 
The original expression (exact transcription) + 2. The calendar system used + 3. A normalized ISO 8601 date (where possible) + + patterns: + + full_date: + description: "Complete date with day, month, and year" + examples: + - pattern: "15 October 1582" + calendar: "gregorian" + normalized: "1582-10-15" + + - pattern: "the fifteenth day of October in the year 1582" + calendar: "gregorian" + normalized: "1582-10-15" + + - pattern: "23 Elul 5656" + calendar: "hebrew" + normalized: "1896-09-01" + + partial_date: + description: "Date with some components missing" + examples: + - pattern: "March 1875" + calendar: "gregorian" + normalized: "1875-03" + precision: "month" + + - pattern: "in the year 1810" + calendar: "gregorian" + normalized: "1810" + precision: "year" + + - pattern: "month of Rajab, 1225 AH" + calendar: "hijri" + normalized: "1810-08" + precision: "month" + + dual_dating: + description: "Documents showing both Julian and Gregorian dates" + notation_styles: + - "O.S. (Old Style = Julian)" + - "N.S. (New Style = Gregorian)" + - "Slash notation: 14/26 March 1875" + examples: + - pattern: "14/26 March 1875" + interpretation: "14 March (Julian) = 26 March (Gregorian)" + normalized: "1875-03-26" + note: "Use Gregorian for normalization" + + - pattern: "6 January 1894 (Gregorian)" + normalized: "1894-01-06" + note: "Explicit calendar indicator" + + relative_dating: + description: "Dates relative to events or other dates" + examples: + - pattern: "three days after Easter" + requires: "Year context to calculate" + + - pattern: "the Sunday before St. Martins Day" + requires: "Year context and liturgical calendar" + + floruit: + description: "Period when person was known to be active" + notation: "fl." + examples: + - pattern: "fl. 1780-1820" + interpretation: "Active between 1780 and 1820" + + - pattern: "fl. c. 
1850" + interpretation: "Active around 1850" + +# ----------------------------------------------------------------------------- +# Temporal Properties in PiCo +# ----------------------------------------------------------------------------- + +temporal_properties: + description: | + Properties for capturing temporal information about persons + observed in historical sources. + + biographical_dates: + birth_date: + property: "sdo:birthDate" + property_uri: "https://schema.org/birthDate" + range: "xsd:date or xsd:gYearMonth or xsd:gYear" + description: "Date of birth" + extraction_notes: | + - May be explicitly stated or inferred from age + - Capture calendar system if non-Gregorian + - Normalize to ISO 8601 for querying + + death_date: + property: "sdo:deathDate" + property_uri: "https://schema.org/deathDate" + range: "xsd:date or xsd:gYearMonth or xsd:gYear" + description: "Date of death" + extraction_notes: | + - "deceased" annotation indicates death before document date + - Infer approximate date from context when possible + + baptism_date: + property: "pico:baptismDate" + range: "xsd:date" + description: "Date of baptism/christening" + note: "Common in church records; often within days of birth" + + burial_date: + property: "pico:burialDate" + range: "xsd:date" + description: "Date of burial" + note: "Common in church/cemetery records" + + event_dates: + marriage_date: + property: "pico:marriageDate" + range: "xsd:date" + description: "Date of marriage event" + + divorce_date: + property: "pico:divorceDate" + range: "xsd:date" + description: "Date of divorce" + + document_date: + property: "sdo:dateCreated" + property_uri: "https://schema.org/dateCreated" + range: "xsd:date" + description: "Date the source document was created" + note: "Critical for temporal context of observations" + + age_expressions: + age_at_event: + property: "pico:ageAtEvent" + range: "xsd:string" + description: "Age as stated in document" + examples: + - "25 years" + - "about 30 years old" 
+ - "minor (under legal age)" + - "of full age (adult)" + note: | + Preserve original expression; calculate birth year if needed. + "oud 25 jaar" (Dutch) = "25 years old" + +# ----------------------------------------------------------------------------- +# PROV-O Provenance Model +# ----------------------------------------------------------------------------- + +provenance_model: + description: | + PiCo uses W3C PROV-O for provenance tracking at two levels: + + 1. OBSERVATION LEVEL: Where did this observation come from? + - prov:hadPrimarySource -> Source document + - prov:wasGeneratedBy -> Extraction activity (optional) + + 2. RECONSTRUCTION LEVEL: How was this person entity created? + - prov:wasDerivedFrom -> Source observation(s) + - prov:wasGeneratedBy -> Reconstruction activity + - prov:wasRevisionOf -> Previous reconstruction version + + activity_class: + class: "prov:Activity" + class_uri: "http://www.w3.org/ns/prov#Activity" + description: "The activity that generated a PersonReconstruction" + + properties: + - property: "prov:wasAssociatedWith" + description: "Agent responsible for the activity" + range: "prov:Agent" + + - property: "prov:startedAtTime" + description: "When the activity started" + range: "xsd:dateTime" + + - property: "prov:endedAtTime" + description: "When the activity completed" + range: "xsd:dateTime" + + - property: "prov:used" + description: "Resources/tools used in the activity" + range: "prov:Entity" + note: "E.g., ML model, matching algorithm, rule set" + + activity_types: + human_reconstruction: + description: "Manual reconstruction by researcher" + note: "Provide: time, place, knowledge sources, researcher name" + + algorithmic_reconstruction: + description: "Automated reconstruction by software" + note: "Provide: algorithm name, version, configuration, parameters" + + agent_class: + class: "prov:Agent" + class_uri: "http://www.w3.org/ns/prov#Agent" + description: "Person or organization responsible for reconstruction" + + 
properties: + - property: "sdo:name" + description: "Name of the agent" + range: "xsd:string" + + - property: "sdo:url" + description: "URL identifying the agent" + range: "sdo:URL" + + examples: + - name: "CBG Center for Family History" + url: "https://cbg.nl" + type: "organization" + + - name: "GLM-4.6 Person Extractor v1.0" + url: null + type: "software" + + derivation_properties: + - property: "prov:wasDerivedFrom" + property_uri: "http://www.w3.org/ns/prov#wasDerivedFrom" + description: "Links PersonReconstruction to source PersonObservation(s)" + domain: "pico:PersonReconstruction" + range: "pico:PersonObservation" + cardinality: "1..*" + note: "REQUIRED for all PersonReconstructions" + + - property: "prov:wasRevisionOf" + property_uri: "http://www.w3.org/ns/prov#wasRevisionOf" + description: "Links to previous version of reconstruction" + domain: "pico:PersonReconstruction" + range: "pico:PersonReconstruction" + cardinality: "0..1" + note: "For tracking reconstruction updates over time" + +# ----------------------------------------------------------------------------- +# PiCo Vocabularies/Thesauri +# ----------------------------------------------------------------------------- + +pico_vocabularies: + description: | + PiCo defines three SKOS concept schemes for controlled terminology: + + - Roles: The role a person plays in a source (child, declarant, witness, etc.) + - SourceTypes: Types of historical sources (birth certificate, census, etc.) + - EventTypes: Types of life events (birth, marriage, death, etc.) + + roles_thesaurus: + id: "picot_roles" + uri: "https://terms.personsincontext.org/roles/" + type: "skos:ConceptScheme" + label: "Persons in Context role thesaurus" + description: "Roles that persons can have in historical sources" + usage: | + Use pico:hasRole property with a term from this thesaurus. 
+ Example: picot_roles:575 (child), picot_roles:489 (declarant) + example_concepts: + - id: "575" + label: "child" + description: "Person appearing as child in a record" + + - id: "489" + label: "declarant" + description: "Person declaring/reporting an event" + + - id: "witness" + label: "witness" + description: "Person witnessing an event or signing a document" + + - id: "bride" + label: "bride" + description: "Female partner in a marriage" + + - id: "groom" + label: "groom" + description: "Male partner in a marriage" + + sourcetypes_thesaurus: + id: "picot_sourcetypes" + uri: "https://terms.personsincontext.org/sourcetypes/" + type: "skos:ConceptScheme" + label: "Persons in Context sourceType thesaurus" + description: "Types of historical sources containing person observations" + usage: | + Use sdo:additionalType property on sdo:ArchiveComponent. + Example: picot_sourcetypes:551 (civil registry: birth) + example_concepts: + - id: "551" + label: "civil registry: birth" + description: "Birth certificate from civil registration" + + - id: "marriage" + label: "civil registry: marriage" + description: "Marriage certificate" + + - id: "death" + label: "civil registry: death" + description: "Death certificate" + + - id: "census" + label: "census" + description: "Population census record" + + - id: "church_baptism" + label: "church record: baptism" + description: "Baptismal record from church register" + + - id: "notarial" + label: "notarial record" + description: "Notarial act or protocol" + + eventtypes_thesaurus: + id: "picot_eventtypes" + uri: "https://terms.personsincontext.org/eventtypes/" + type: "skos:ConceptScheme" + label: "Persons in Context eventType thesaurus" + description: "Types of life events documented in sources" + example_concepts: + - id: "birth" + label: "birth" + + - id: "baptism" + label: "baptism" + + - id: "marriage" + label: "marriage" + + - id: "death" + label: "death" + + - id: "burial" + label: "burial" + + - id: "emigration" + label: 
"emigration" + + - id: "immigration" + label: "immigration" + +# ----------------------------------------------------------------------------- +# CH-Annotator Hypernym Integration for Temporal +# ----------------------------------------------------------------------------- + +temporal_hypernym_mapping: + description: | + Mapping between temporal expressions and CH-Annotator hypernyms. + + mappings: + - pico_property: "sdo:birthDate" + ch_hypernym: "TMP.DAT" + ch_code: "TMP.DAT" + note: "Birth date temporal expression" + + - pico_property: "sdo:deathDate" + ch_hypernym: "TMP.DAT" + ch_code: "TMP.DAT" + note: "Death date temporal expression" + + - pico_property: "sdo:dateCreated" + ch_hypernym: "TMP.DAT" + ch_code: "TMP.DAT" + note: "Document creation date" + + - calendar_expression: "Hijri date" + ch_hypernym: "TMP.DAT" + normalization: "Convert to Gregorian ISO 8601" + + - calendar_expression: "Hebrew date" + ch_hypernym: "TMP.DAT" + normalization: "Convert to Gregorian ISO 8601" + + - calendar_expression: "Julian date" + ch_hypernym: "TMP.DAT" + normalization: "Convert to Gregorian ISO 8601" diff --git a/data/entity_annotation/modules/relationships/family.yaml b/data/entity_annotation/modules/relationships/family.yaml new file mode 100644 index 0000000000..c4ab049d7d --- /dev/null +++ b/data/entity_annotation/modules/relationships/family.yaml @@ -0,0 +1,1503 @@ +# ============================================================================= +# CH-Annotator Entity Annotation Convention v1.7.0 +# Module: relationships/family.yaml +# ============================================================================= +# Family relationship properties from PiCo (Persons in Context) ontology. +# These enable modeling complex family structures from historical records.
+# +# This module is referenced by integrations/pico.yaml +# +# References: +# - PiCo Ontology: https://w3id.org/pico +# - Schema.org: https://schema.org/ +# ============================================================================= + +family_relationships: + description: | + Family relationship properties link persons within and across sources. + + Rules: + - For PersonObservations: relationships refer to OTHER observations on SAME source + - For PersonReconstructions: relationships refer to other reconstructions + + Property characteristics: + - Symmetric: If A hasRelation B, then B hasRelation A (spouses, siblings, cousins) + - Transitive: hasAncestor/hasDescendant chain through generations + - Inverse pairs: parent/children, grandparent/grandchild, etc. + + # --------------------------------------------------------------------------- + # Core Family (Schema.org) + # --------------------------------------------------------------------------- + + core_relationships: + description: "Basic family relationships using Schema.org vocabulary" + properties: + - property: "sdo:parent" + property_uri: "https://schema.org/parent" + description: "A parent of the person" + inverse: "sdo:children" + subPropertyOf: ["sdo:relatedTo", "pico:hasAncestor"] + note: "Biological or legal parent" + + - property: "sdo:children" + property_uri: "https://schema.org/children" + description: "A child of the person" + inverse: "sdo:parent" + subPropertyOf: ["sdo:relatedTo", "pico:hasDescendant"] + + - property: "sdo:spouse" + property_uri: "https://schema.org/spouse" + description: "The person's spouse" + symmetric: true + subPropertyOf: "sdo:relatedTo" + + - property: "sdo:sibling" + property_uri: "https://schema.org/sibling" + description: "A brother or sister" + symmetric: true + subPropertyOf: "sdo:relatedTo" + + # --------------------------------------------------------------------------- + # Transitive Ancestry (PiCo) + # 
--------------------------------------------------------------------------- + + ancestry_relationships: + description: "Transitive properties for ancestor/descendant chains" + properties: + - property: "pico:hasAncestor" + property_uri: "https://personsincontext.org/model#hasAncestor" + description: "Any ancestor (parent, grandparent, etc.)" + type: "owl:TransitiveProperty" + inverse: "pico:hasDescendant" + note: "Not used directly; parent→parent chains automatically create ancestors" + + - property: "pico:hasDescendant" + property_uri: "https://personsincontext.org/model#hasDescendant" + description: "Any descendant (child, grandchild, etc.)" + type: "owl:TransitiveProperty" + inverse: "pico:hasAncestor" + + # --------------------------------------------------------------------------- + # Grandparents/Grandchildren + # --------------------------------------------------------------------------- + + grandparent_relationships: + description: "Multi-generational direct line relationships" + properties: + - property: "pico:hasGrandparent" + property_uri: "https://personsincontext.org/model#hasGrandparent" + description: "A grandparent of the person" + inverse: "pico:hasGrandchild" + + - property: "pico:hasGrandchild" + property_uri: "https://personsincontext.org/model#hasGrandchild" + description: "A grandchild of the person" + inverse: "pico:hasGrandparent" + + - property: "pico:hasGreat-grandparent" + property_uri: "https://personsincontext.org/model#hasGreat-grandparent" + description: "A great-grandparent of the person" + inverse: "pico:hasGreat-grandchild" + + - property: "pico:hasGreat-grandchild" + property_uri: "https://personsincontext.org/model#hasGreat-grandchild" + description: "A great-grandchild of the person" + inverse: "pico:hasGreat-grandparent" + + # --------------------------------------------------------------------------- + # Aunts/Uncles and Nieces/Nephews + # --------------------------------------------------------------------------- + + 
extended_family: + description: "Collateral relatives (aunts, uncles, cousins)" + properties: + - property: "pico:hasUncle_Aunt" + property_uri: "https://personsincontext.org/model#hasUncle_Aunt" + description: "An uncle or aunt (sibling of parent)" + inverse: "pico:hasNephew_Niece" + + - property: "pico:hasNephew_Niece" + property_uri: "https://personsincontext.org/model#hasNephew_Niece" + description: "A nephew or niece (child of sibling)" + inverse: "pico:hasUncle_Aunt" + + - property: "pico:hasCousin" + property_uri: "https://personsincontext.org/model#hasCousin" + description: "A cousin (child of parent's sibling)" + symmetric: true + + # --------------------------------------------------------------------------- + # Step-family + # --------------------------------------------------------------------------- + + step_relationships: + description: "Step-family relationships through remarriage" + properties: + - property: "pico:hasStepparent" + property_uri: "https://personsincontext.org/model#hasStepparent" + description: "A stepparent (spouse of biological parent)" + inverse: "pico:hasStepchild" + + - property: "pico:hasStepchild" + property_uri: "https://personsincontext.org/model#hasStepchild" + description: "A stepchild (child of spouse)" + inverse: "pico:hasStepparent" + + - property: "pico:hasStepsibling" + property_uri: "https://personsincontext.org/model#hasStepsibling" + description: "A stepbrother or stepsister" + symmetric: true + + - property: "pico:hasHalf-sibling" + property_uri: "https://personsincontext.org/model#hasHalf-sibling" + description: "A half-brother or half-sister (one shared parent)" + symmetric: true + + # --------------------------------------------------------------------------- + # Foster/Godparent (Non-biological) + # --------------------------------------------------------------------------- + + non_biological_relationships: + description: "Non-biological family relationships (foster, godparent, legitimized)" + properties: + - 
property: "pico:hasFosterParent" + property_uri: "https://personsincontext.org/model#hasFosterParent" + description: "A foster parent" + inverse: "pico:hasFosterChild" + + - property: "pico:hasFosterChild" + property_uri: "https://personsincontext.org/model#hasFosterChild" + description: "A foster child" + inverse: "pico:hasFosterParent" + + - property: "pico:hasGodparent" + property_uri: "https://personsincontext.org/model#hasGodparent" + description: "A godparent (witness at baptism)" + inverse: "pico:hasGodchild" + note: "Critical for historical genealogical research - baptism records" + + - property: "pico:hasGodchild" + property_uri: "https://personsincontext.org/model#hasGodchild" + description: "A godchild" + inverse: "pico:hasGodparent" + + - property: "pico:hasLegitimizedChild" + property_uri: "https://personsincontext.org/model#hasLegitimizedChild" + description: "A child legitimized by marriage or legal recognition" + inverse: "pico:isLegitimizedChildOf" + note: "Common in historical records when child born out of wedlock" + + - property: "pico:isLegitimizedChildOf" + property_uri: "https://personsincontext.org/model#isLegitimizedChildOf" + description: "The parent who legitimized the child" + inverse: "pico:hasLegitimizedChild" + + # --------------------------------------------------------------------------- + # In-Laws + # --------------------------------------------------------------------------- + + in_law_relationships: + description: "Relationships through marriage (in-laws)" + properties: + - property: "pico:hasParent-in-law" + property_uri: "https://personsincontext.org/model#hasParent-in-law" + description: "A parent-in-law (parent of spouse)" + inverse: "pico:hasChild-in-law" + + - property: "pico:hasChild-in-law" + property_uri: "https://personsincontext.org/model#hasChild-in-law" + description: "A child-in-law (spouse of child)" + inverse: "pico:hasParent-in-law" + + - property: "pico:hasSibling-in-law" + property_uri: 
"https://personsincontext.org/model#hasSibling-in-law" + description: "A brother/sister-in-law" + symmetric: true + + - property: "pico:hasGrandparent-in-law" + property_uri: "https://personsincontext.org/model#hasGrandparent-in-law" + description: "A grandparent-in-law" + inverse: "pico:hasGrandchild-in-law" + + - property: "pico:hasGrandchild-in-law" + property_uri: "https://personsincontext.org/model#hasGrandchild-in-law" + description: "A grandchild-in-law" + inverse: "pico:hasGrandparent-in-law" + + - property: "pico:hasUncle_Aunt-in-law" + property_uri: "https://personsincontext.org/model#hasUncle_Aunt-in-law" + description: "An uncle/aunt-in-law" + inverse: "pico:hasNephew_Niece-in-law" + + - property: "pico:hasNephew_Niece-in-law" + property_uri: "https://personsincontext.org/model#hasNephew_Niece-in-law" + description: "A nephew/niece-in-law" + inverse: "pico:hasUncle_Aunt-in-law" + + - property: "pico:hasCousin-in-law" + property_uri: "https://personsincontext.org/model#hasCousin-in-law" + description: "A cousin-in-law" + symmetric: true + + - property: "pico:hasStepparent-in-law" + property_uri: "https://personsincontext.org/model#hasStepparent-in-law" + description: "A stepparent-in-law" + inverse: "pico:hasStepchild-in-law" + + - property: "pico:hasStepchild-in-law" + property_uri: "https://personsincontext.org/model#hasStepchild-in-law" + description: "A stepchild-in-law" + inverse: "pico:hasStepparent-in-law" + + # --------------------------------------------------------------------------- + # Former Partners + # --------------------------------------------------------------------------- + + former_partner_relationships: + description: "Relationships with deceased or former spouses" + properties: + - property: "pico:isWidOf" + property_uri: "https://personsincontext.org/model#isWidOf" + description: "Is widow/widower of deceased spouse" + note: "The subject is the surviving partner; critical for historical records" + + - property: 
"pico:hasPreviousPartner" + property_uri: "https://personsincontext.org/model#hasPreviousPartner" + description: "A former spouse or partner" + symmetric: true + +# ============================================================================= +# Relationship Type Summary +# ============================================================================= + +relationship_summary: + total_properties: 35 + + by_category: + core_family: 4 + ancestry: 2 + grandparent: 4 + extended_family: 3 + step_family: 4 + non_biological: 6 + in_law: 10 + former_partners: 2 + + symmetric_properties: + - "sdo:spouse" + - "sdo:sibling" + - "pico:hasCousin" + - "pico:hasStepsibling" + - "pico:hasHalf-sibling" + - "pico:hasSibling-in-law" + - "pico:hasCousin-in-law" + - "pico:hasPreviousPartner" + + transitive_properties: + - "pico:hasAncestor" + - "pico:hasDescendant" + +# ============================================================================= +# GLM Extraction Output Schema (for family_relationships field) +# ============================================================================= + +extraction_output_schema: + description: | + Schema for the family_relationships object in GLM extraction output. + All arrays contain person references with person_index and target_name.
+ + schema: + parent: "array of person references" + children: "array of person references" + spouse: "array of person references" + sibling: "array of person references" + grandparent: "array of person references" + grandchild: "array of person references" + uncle_aunt: "array of person references" + nephew_niece: "array of person references" + cousin: "array of person references" + stepparent: "array of person references" + stepchild: "array of person references" + stepsibling: "array of person references" + half_sibling: "array of person references" + foster_parent: "array of person references" + foster_child: "array of person references" + godparent: "array of person references" + godchild: "array of person references" + parent_in_law: "array of person references" + child_in_law: "array of person references" + sibling_in_law: "array of person references" + previous_partner: "array of person references" + widow_of: "person reference|null" + + person_reference_format: + person_index: "integer (0-based index into persons array)" + target_name: "string (name for readability)" + + example: + family_relationships: + parent: + - person_index: 2 + target_name: "Pieter Koppen" + - person_index: 3 + target_name: "Anna Maria Brouwer" + spouse: + - person_index: 1 + target_name: "Anna Maria Visser" + sibling: + - person_index: 6 + target_name: "Hendrik Koppen" + +# ============================================================================= +# Historical Source Patterns +# ============================================================================= + +historical_source_patterns: + description: | + Common patterns for expressing family relationships in historical sources, + organized by language. Used for extraction guidance. 
+ dutch: + spouse: + - "huijsvrou van" # wife of + - "echtgenoot van" # husband of + - "gehuwd met" # married to + - "vrouw van" # wife of + - "man van" # husband of + + parent: + - "zoon van" # son of + - "dochter van" # daughter of + - "kind van" # child of + - "vader van" # father of + - "moeder van" # mother of + + widow: + - "weduwe van" # widow of (female) + - "weduwnaar van" # widower of (male) + - "wijlen" # the late (indicates deceased) + + godparent: + - "peter" # godfather + - "meter" # godmother + - "getuijge" # witness (often godparent at baptism) + - "doopgetuige" # baptismal witness + + sibling: + - "broeder van" # brother of + - "zuster van" # sister of + + latin: + parent: + - "filius" # son + - "filia" # daughter + - "pater" # father + - "mater" # mother + + spouse: + - "uxor" # wife + - "maritus" # husband + - "conjux" # spouse + + widow: + - "vidua" # widow + - "viduus" # widower + - "quondam" # the late + + german: + spouse: + - "Ehefrau von" # wife of + - "Ehemann von" # husband of + - "verheiratet mit" # married to + + parent: + - "Sohn von" # son of + - "Tochter von" # daughter of + - "Vater von" # father of + - "Mutter von" # mother of + + widow: + - "Witwe von" # widow of + - "Witwer von" # widower of + + # --------------------------------------------------------------------------- + # Arabic (العربية) - Common in Ottoman records, Islamic archives, waqf documents + # Note: Includes both Modern Standard Arabic and common dialectal variants + # --------------------------------------------------------------------------- + arabic: + description: | + Arabic naming conventions use patronymic chains (nasab) with ibn/bint, + honorific titles (laqab), and relational names (nisba) for tribal, regional, + or occupational affiliation. Common in Ottoman court records, waqf documents, + Islamic marriage contracts (aqd nikah), and genealogical registers (shajara).
+ + # Patronymic/parentage (nasab - نسب) + parent: + - "ابن" # ibn - son of (formal) + - "بن" # bin - son of (abbreviated, common in names) + - "بنت" # bint - daughter of + - "أب" # ab - father + - "أبو" # abu - father of (kunya) + - "أم" # umm - mother / mother of (kunya) + - "والد" # walid - father + - "والدة" # walida - mother + - "ولد" # walad - child/son + - "أبوه" # abuhu - his father + - "أمه" # ummuhu - his mother + + # Spouse relationships + spouse: + - "زوج" # zawj - husband + - "زوجة" # zawja - wife + - "زوجته" # zawjatuhu - his wife + - "زوجها" # zawjuha - her husband + - "امرأة" # imra'a - woman/wife of + - "حرم" # haram - wife (honorific, lit. "sanctuary") + - "عقيلة" # aqila - wife (formal) + - "قرينة" # qarina - spouse/companion + - "متزوج من" # mutazawwij min - married to (male) + - "متزوجة من" # mutazawwija min - married to (female) + + # Widow/widower + widow: + - "أرملة" # armala - widow + - "أرمل" # armal - widower + - "المرحوم" # al-marhum - the late (male, deceased) + - "المرحومة" # al-marhuma - the late (female, deceased) + - "المتوفى" # al-mutawaffa - the deceased (male) + - "المتوفاة" # al-mutawaffat - the deceased (female) + - "رحمه الله" # rahimahu Allah - may God have mercy on him + - "رحمها الله" # rahimaha Allah - may God have mercy on her + + # Siblings + sibling: + - "أخ" # akh - brother + - "أخت" # ukht - sister + - "شقيق" # shaqiq - full brother (same parents) + - "شقيقة" # shaqiqa - full sister (same parents) + - "أخوه" # akhuhu - his brother + - "أختها" # ukhtuha - her sister + + # Grandparents/grandchildren + grandparent: + - "جد" # jadd - grandfather + - "جدة" # jadda - grandmother + - "حفيد" # hafid - grandson + - "حفيدة" # hafida - granddaughter + + # Extended family + extended: + - "عم" # 'amm - paternal uncle + - "عمة" # 'amma - paternal aunt + - "خال" # khal - maternal uncle + - "خالة" # khala - maternal aunt + - "ابن عم" # ibn 'amm - paternal cousin (male) + - "بنت عم" # bint 'amm - paternal cousin (female) + - "ابن خال" 
# ibn khal - maternal cousin (male) + - "بنت خال" # bint khal - maternal cousin (female) + + # In-laws (أصهار - ashar) + in_law: + - "حمو" # hamu - father-in-law + - "حماة" # hamah - mother-in-law + - "صهر" # sihr - son-in-law / brother-in-law + - "كنة" # kanna - daughter-in-law + - "نسيب" # nasib - in-law relation + - "سلف" # silf - co-brother-in-law + - "سلفة" # silfa - co-sister-in-law + + # Step-relations + step_family: + - "ربيب" # rabib - stepson + - "ربيبة" # rabiba - stepdaughter + - "زوج الأم" # zawj al-umm - stepfather + - "زوجة الأب" # zawjat al-ab - stepmother + + # Orphan/guardian (common in waqf documents) + guardian: + - "يتيم" # yatim - orphan (fatherless) + - "وصي" # wasi - guardian/executor + - "كفيل" # kafil - sponsor/guarantor + - "حاضن" # hadin - custodian + + # Tribal/clan affiliations (common in genealogies) + tribal: + - "آل" # Al - family of / house of + - "بني" # Bani - sons of (tribe) + - "قبيلة" # qabila - tribe + - "عشيرة" # 'ashira - clan + + # --------------------------------------------------------------------------- + # French (for North African/Ottoman archives) + # --------------------------------------------------------------------------- + french: + spouse: + - "épouse de" # wife of + - "époux de" # husband of + - "marié à" # married to (male) + - "mariée à" # married to (female) + + parent: + - "fils de" # son of + - "fille de" # daughter of + - "père de" # father of + - "mère de" # mother of + + widow: + - "veuve de" # widow of + - "veuf de" # widower of + - "feu" # the late (male) + - "feue" # the late (female) + + sibling: + - "frère de" # brother of + - "sœur de" # sister of + + # --------------------------------------------------------------------------- + # Ottoman Turkish (for Ottoman archival records) + # --------------------------------------------------------------------------- + ottoman_turkish: + description: | + Ottoman Turkish used Arabic script and incorporated Arabic/Persian + terminology. 
Common in court registers (sicil), tax records (tahrir), + and endowment documents (vakfiye). + + parent: + - "oğlu" # son of (Turkish suffix) + - "kızı" # daughter of (Turkish suffix) + - "ibn" # son of (Arabic, used in Ottoman) + - "bint" # daughter of (Arabic, used in Ottoman) + - "veled-i" # child of (Persian-Turkish) + - "mahdumu" # son of (Persian honorific) + + spouse: + - "zevcesi" # his wife + - "zevci" # her husband + - "haremi" # his wife (from harem) + - "nikâhlı" # legally married + + widow: + - "dul" # widow/widower + - "müteveffa" # the deceased (male) + - "müteveffiye" # the deceased (female) + - "merhum" # the late (male) + - "merhume" # the late (female) + + # --------------------------------------------------------------------------- + # Hebrew (עברית) - Jewish archives, genizah documents, rabbinic records + # Note: Includes both Biblical/Rabbinic Hebrew and modern Israeli Hebrew + # --------------------------------------------------------------------------- + hebrew: + description: | + Hebrew naming conventions in Jewish records include patronymics (ben/bat), + tribal affiliations (ha-Kohen, ha-Levi), honorifics, and memorial phrases. + Common in ketubbot (marriage contracts), get documents (divorce), pinqasim + (community records), genizah fragments, and rabbinic responsa. 
+ + # Patronymic/parentage + parent: + - "בן" # ben - son of + - "בת" # bat - daughter of + - "אבי" # avi - my father + - "אמי" # imi - my mother + - "אב" # av - father + - "אם" # em - mother + - "בנו של" # beno shel - his son + - "בתו של" # bito shel - his daughter + - "ילד" # yeled - child (male) + - "ילדה" # yalda - child (female) + + # Spouse relationships + spouse: + - "אשת" # eshet - wife of + - "בעל" # ba'al - husband + - "אישה" # isha - wife/woman + - "זוג" # zug - spouse (modern) + - "זוגתו" # zugato - his spouse (female) + - "נשוי ל" # nasui le - married to (male) + - "נשואה ל" # nesu'a le - married to (female) + - "כלה" # kala - bride + - "חתן" # chatan - groom + + # Widow/widower + widow: + - "אלמנה" # almana - widow + - "אלמן" # alman - widower + - "ז״ל" # z"l (zikhronó livrakha) - of blessed memory + - "ע״ה" # a"h (alav/aleha hashalom) - peace be upon him/her + - "הי״ד" # hy"d (Hashem yinkom damo) - may God avenge his blood (martyrs) + - "נ״ע" # n"e (nishmatá eden) - may their soul be in Eden + - "המנוח" # ha-manoakh - the late (male) + - "המנוחה" # ha-menukha - the late (female) + + # Siblings + sibling: + - "אח" # akh - brother + - "אחות" # akhot - sister + - "אחיו" # akhiv - his brother + - "אחותו" # akhoto - his sister + + # Grandparents/grandchildren + grandparent: + - "סב" # sav - grandfather + - "סבתא" # savta - grandmother + - "נכד" # nekhed - grandson + - "נכדה" # nekhda - granddaughter + + # Extended family + extended: + - "דוד" # dod - uncle + - "דודה" # doda - aunt + - "בן דוד" # ben dod - cousin (male) + - "בת דודה" # bat doda - cousin (female) + + # In-laws + in_law: + - "חם" # kham - father-in-law + - "חמות" # khamot - mother-in-law + - "חתן" # khatan - son-in-law + - "כלה" # kala - daughter-in-law + - "גיס" # gis - brother-in-law + - "גיסה" # gisa - sister-in-law + + # Tribal/priestly affiliations (common in Jewish records) + tribal: + - "הכהן" # ha-Kohen - the Priest (Kohen lineage) + - "הלוי" # ha-Levi - the Levite + - "כ״ץ" # KaTz - 
abbreviation for Kohen Tzedek + - "סג״ל" # SeGaL - abbreviation for Segan Leviya + + # Honorifics (common in rabbinic records) + honorifics: + - "רב" # rav - rabbi + - "ר׳" # r' - rabbi (abbreviated) + - "הרב" # ha-rav - the rabbi + - "מורנו" # morenu - our teacher + - "מר" # mar - mister (Aramaic/Hebrew) + - "מרת" # marat - mistress/Mrs. + - "החכם" # ha-khakham - the sage (Sephardic) + - "הגאון" # ha-gaon - the genius (high honorific) + + # --------------------------------------------------------------------------- + # Persian/Farsi (فارسی) - Iranian archives, Safavid/Qajar documents + # --------------------------------------------------------------------------- + persian: + description: | + Persian naming conventions blend Arabic Islamic patterns with indigenous + Iranian elements. Common in Safavid court records, Qajar-era documents, + endowment deeds (vaqfnameh), and judicial records (mahzar). + + # Patronymic/parentage + parent: + - "پسر" # pesar - son + - "دختر" # dokhtar - daughter + - "فرزند" # farzand - child/offspring + - "پدر" # pedar - father + - "مادر" # madar - mother + - "ابن" # ebn - son of (Arabic loanword, formal) + - "بنت" # bent - daughter of (Arabic loanword, formal) + - "ولد" # valad - child of + + # Spouse relationships + spouse: + - "شوهر" # showhar - husband + - "زن" # zan - wife/woman + - "همسر" # hamsar - spouse + - "عیال" # ayal - wife (formal/classical) + - "زوجه" # zowjeh - wife (Arabic loanword) + - "زوج" # zowj - husband (Arabic loanword) + - "عقد" # aqd - marriage contract + - "نکاح" # nekah - marriage + + # Widow/widower + widow: + - "بیوه" # biveh - widow + - "بیوه‌مرد" # biveh-mard - widower + - "مرحوم" # marhum - the late (male) + - "مرحومه" # marhumeh - the late (female) + - "متوفی" # motevaffa - the deceased (male) + - "متوفیه" # motevaffiyeh - the deceased (female) + - "شادروان" # shadravan - the late (respectful) + - "درگذشته" # dargozashteh - passed away + + # Siblings + sibling: + - "برادر" # baradar - brother + - "خواهر" 
# khahar - sister + + # Grandparents/grandchildren + grandparent: + - "پدربزرگ" # pedarbozorg - grandfather + - "مادربزرگ" # madarbozorg - grandmother + - "نوه" # naveh - grandchild + + # Extended family + extended: + - "عمو" # amu - paternal uncle + - "عمه" # ammeh - paternal aunt + - "دایی" # dayi - maternal uncle + - "خاله" # khaleh - maternal aunt + - "پسرعمو" # pesar-amu - paternal cousin (male) + - "دخترعمو" # dokhtar-amu - paternal cousin (female) + + # In-laws + in_law: + - "پدرشوهر" # pedar-showhar - father-in-law (husband's father) + - "مادرشوهر" # madar-showhar - mother-in-law (husband's mother) + - "پدرزن" # pedar-zan - father-in-law (wife's father) + - "مادرزن" # madar-zan - mother-in-law (wife's mother) + - "داماد" # damad - son-in-law + - "عروس" # arus - daughter-in-law/bride + - "باجناق" # bajnaq - co-brother-in-law + + # Titles and honorifics + honorifics: + - "آقا" # aqa - mister/sir + - "خانم" # khanom - lady/Mrs. + - "میرزا" # mirza - educated man/scribe + - "خان" # khan - lord/chief + - "بیگم" # begom - lady (Turkic origin) + - "سلطان" # soltan - sultan/ruler + - "شاهزاده" # shahzadeh - prince/princess + + # --------------------------------------------------------------------------- + # Spanish (Español) - Colonial Latin America, Iberian archives + # --------------------------------------------------------------------------- + spanish: + description: | + Spanish colonial records include civil and ecclesiastical documents from + Latin America, Philippines, and Spain. Common in padrones (censuses), + testamentos (wills), actas de bautismo/matrimonio/defunción (vital records), + and Inquisition proceedings. 
+ + # Parentage + parent: + - "hijo de" # son of + - "hija de" # daughter of + - "hijo legítimo de" # legitimate son of + - "hija legítima de" # legitimate daughter of + - "hijo natural de" # illegitimate son of + - "hija natural de" # illegitimate daughter of + - "padre" # father + - "madre" # mother + - "padres" # parents + - "progenitor" # progenitor + + # Spouse relationships + spouse: + - "esposo de" # husband of + - "esposa de" # wife of + - "marido de" # husband of + - "mujer de" # wife of (also "woman of") + - "casado con" # married to (male) + - "casada con" # married to (female) + - "consorte" # consort/spouse + - "cónyuge" # spouse (legal) + - "desposado con" # betrothed to + - "velado con" # veiled with (church wedding) + + # Widow/widower + widow: + - "viuda de" # widow of + - "viudo de" # widower of + - "difunto" # the deceased (male) + - "difunta" # the deceased (female) + - "finado" # the late (male) + - "finada" # the late (female) + - "que en paz descanse" # may they rest in peace + - "q.e.p.d." 
# que en paz descanse (abbreviated) + - "que santa gloria haya" # may they have holy glory + + # Siblings + sibling: + - "hermano de" # brother of + - "hermana de" # sister of + - "medio hermano" # half-brother + - "media hermana" # half-sister + - "hermanastro" # stepbrother + - "hermanastra" # stepsister + + # Grandparents/grandchildren + grandparent: + - "abuelo" # grandfather + - "abuela" # grandmother + - "nieto de" # grandson of + - "nieta de" # granddaughter of + - "bisabuelo" # great-grandfather + - "bisabuela" # great-grandmother + + # Extended family + extended: + - "tío" # uncle + - "tía" # aunt + - "sobrino" # nephew + - "sobrina" # niece + - "primo" # cousin (male) + - "prima" # cousin (female) + - "primo hermano" # first cousin (male) + - "prima hermana" # first cousin (female) + + # In-laws + in_law: + - "suegro" # father-in-law + - "suegra" # mother-in-law + - "yerno" # son-in-law + - "nuera" # daughter-in-law + - "cuñado" # brother-in-law + - "cuñada" # sister-in-law + - "consuegro" # co-father-in-law + - "consuegra" # co-mother-in-law + + # Step-relations + step_family: + - "padrastro" # stepfather + - "madrastra" # stepmother + - "hijastro" # stepson + - "hijastra" # stepdaughter + - "entenado" # stepson (archaic) + - "entenada" # stepdaughter (archaic) + + # Godparent relationships (very important in colonial records) + godparent: + - "padrino" # godfather + - "madrina" # godmother + - "ahijado" # godson + - "ahijada" # goddaughter + - "compadre" # co-father (godparent relationship) + - "comadre" # co-mother (godparent relationship) + + # Legal/social status (colonial casta system) + status: + - "don" # honorific for male + - "doña" # honorific for female + - "indio/india" # indigenous person + - "mestizo/mestiza" # mixed Spanish-indigenous + - "mulato/mulata" # mixed African heritage + - "español/española" # Spanish descent + - "criollo/criolla" # American-born Spanish + - "libre" # free (for formerly enslaved) + - "esclavo/esclava" # 
enslaved person + + # --------------------------------------------------------------------------- + # Portuguese (Português) - Colonial Brazil, African archives, Lusophone world + # --------------------------------------------------------------------------- + portuguese: + description: | + Portuguese colonial records span Brazil, Angola, Mozambique, Goa, Macau, + and other territories. Common in registros paroquiais (parish registers), + inventários (estate inventories), testamentos (wills), and Inquisition + proceedings. + + # Parentage + parent: + - "filho de" # son of + - "filha de" # daughter of + - "filho legítimo de" # legitimate son of + - "filha legítima de" # legitimate daughter of + - "filho natural de" # illegitimate son of + - "filha natural de" # illegitimate daughter of + - "pai" # father + - "mãe" # mother + - "pais" # parents + - "progenitor" # progenitor + + # Spouse relationships + spouse: + - "esposo de" # husband of + - "esposa de" # wife of + - "marido de" # husband of + - "mulher de" # wife of + - "casado com" # married to (male) + - "casada com" # married to (female) + - "cônjuge" # spouse (legal) + - "consorte" # consort + - "desposado com" # betrothed to + + # Widow/widower + widow: + - "viúva de" # widow of + - "viúvo de" # widower of + - "defunto" # the deceased (male) + - "defunta" # the deceased (female) + - "falecido" # the late (male) + - "falecida" # the late (female) + - "finado" # the late (male) + - "finada" # the late (female) + - "que Deus haja" # whom God has (taken) + + # Siblings + sibling: + - "irmão de" # brother of + - "irmã de" # sister of + - "meio-irmão" # half-brother + - "meia-irmã" # half-sister + + # Grandparents/grandchildren + grandparent: + - "avô" # grandfather + - "avó" # grandmother + - "neto de" # grandson of + - "neta de" # granddaughter of + - "bisavô" # great-grandfather + - "bisavó" # great-grandmother + + # Extended family + extended: + - "tio" # uncle + - "tia" # aunt + - "sobrinho" # nephew + - 
"sobrinha" # niece + - "primo" # cousin (male) + - "prima" # cousin (female) + - "primo direito" # first cousin (male) + - "prima direita" # first cousin (female) + + # In-laws + in_law: + - "sogro" # father-in-law + - "sogra" # mother-in-law + - "genro" # son-in-law + - "nora" # daughter-in-law + - "cunhado" # brother-in-law + - "cunhada" # sister-in-law + - "consogro" # co-father-in-law + - "consogra" # co-mother-in-law + + # Step-relations + step_family: + - "padrasto" # stepfather + - "madrasta" # stepmother + - "enteado" # stepson + - "enteada" # stepdaughter + + # Godparent relationships + godparent: + - "padrinho" # godfather + - "madrinha" # godmother + - "afilhado" # godson + - "afilhada" # goddaughter + - "compadre" # co-father (godparent relationship) + - "comadre" # co-mother (godparent relationship) + + # Legal/social status (colonial Brazil) + status: + - "dom" # honorific for male + - "dona" # honorific for female + - "índio/índia" # indigenous person + - "mameluco/mameluca" # mixed Portuguese-indigenous + - "mulato/mulata" # mixed African heritage + - "pardo/parda" # mixed-race (general) + - "preto/preta" # Black person + - "branco/branca" # White person + - "forro/forra" # freed person + - "escravo/escrava" # enslaved person + - "liberto/liberta" # freedperson + - "crioulo/crioula" # Brazilian-born Black person + +# ============================================================================= +# ITALIAN HISTORICAL PATTERNS +# ============================================================================= +# Coverage: Italian states (Papal States, Venice, Naples, etc.), 15th-19th century +# Script: Latin +# ============================================================================= + +italian: + description: | + Italian historical documents include notarial acts, parish registers, + catasti (censuses), and stato delle anime (status animarum - soul records). 
+ Naming follows patronymic patterns with "di" or "fu" (the late), + with regional variations across Italian states. + + source_types: + - "atti notarili" # notarial acts + - "registri parrocchiali" # parish registers + - "stato delle anime" # status animarum (census of souls) + - "catasto" # tax census + - "liber baptizatorum" # baptismal register + - "liber matrimoniorum" # marriage register + - "liber mortuorum" # death register + + patterns: + # Child-parent relationships + child_of: + - "figlio di" # son of + - "figlia di" # daughter of + - "figlio del fu" # son of the late + - "figlia del fu" # daughter of the late + - "figlio della fu" # son of the late (female) + - "figlia della fu" # daughter of the late (female) + - "figlio legittimo di" # legitimate son of + - "figlia legittima di" # legitimate daughter of + - "figlio naturale di" # natural (illegitimate) son of + - "figlia naturale di" # natural (illegitimate) daughter of + - "nato da" # born of + - "nata da" # born of (female) + - "di padre" # of father + - "di madre" # of mother + - "q." 
# abbreviation for quondam (the late) + - "fu" # the late (abbreviated from "fu il") + - "del fu" # of the late (male) + - "della fu" # of the late (female) + + # Spouse relationships + spouse: + - "marito di" # husband of + - "moglie di" # wife of + - "sposo di" # spouse of (male) + - "sposa di" # spouse of (female) + - "consorte di" # consort of + - "coniuge di" # spouse of (legal) + - "maritata con" # married to (female subject) + - "maritato con" # married to (male subject) + - "sposata con" # married to (female) + - "sposato con" # married to (male) + - "uxor" # wife (Latin in Italian documents) + + # Widow/widower + widow: + - "vedova di" # widow of + - "vedovo di" # widower of + - "relicta di" # widow of (Latin: left behind by) + - "relicta del fu" # widow of the late + - "vidua" # widow (Latin) + + # Siblings + sibling: + - "fratello di" # brother of + - "sorella di" # sister of + - "germano" # full sibling + - "germana" # full sibling (female) + - "fratello uterino" # maternal half-brother + - "sorella uterina" # maternal half-sister + - "fratello consanguineo" # paternal half-brother + - "sorella consanguinea" # paternal half-sister + + # Grandparents/grandchildren + grandparent: + - "nonno" # grandfather + - "nonna" # grandmother + - "avo" # grandfather (archaic) + - "ava" # grandmother (archaic) + - "nipote di" # grandchild/nephew/niece of + - "bisnonno" # great-grandfather + - "bisnonna" # great-grandmother + - "trisavolo" # great-great-grandfather + - "trisavola" # great-great-grandmother + + # Extended family + extended: + - "zio" # uncle + - "zia" # aunt + - "nipote" # nephew/niece/grandchild + - "cugino" # cousin (male) + - "cugina" # cousin (female) + - "cugino germano" # first cousin (male) + - "cugina germana" # first cousin (female) + + # In-laws + in_law: + - "suocero" # father-in-law + - "suocera" # mother-in-law + - "genero" # son-in-law + - "nuora" # daughter-in-law + - "cognato" # brother-in-law + - "cognata" # sister-in-law + + # 
Step-relations + step_family: + - "patrigno" # stepfather + - "matrigna" # stepmother + - "figliastro" # stepson + - "figliastra" # stepdaughter + + # Godparent relationships + godparent: + - "padrino" # godfather + - "madrina" # godmother + - "figlioccio" # godson + - "figlioccia" # goddaughter + - "compare" # co-father (spiritual kinship) + - "comare" # co-mother (spiritual kinship) + - "patrino" # godfather (archaic) + - "matrina" # godmother (archaic) + + # Honorifics and titles + honorifics: + - "Messer" # Mister (medieval/Renaissance) + - "Madonna" # My lady (medieval/Renaissance) + - "Ser" # Sir (notarial) + - "Don" # Lord (clergy, nobility) + - "Donna" # Lady + - "Signor" # Mister + - "Signora" # Mrs. + - "Illustrissimo" # Most illustrious + - "Eccellentissimo" # Most excellent + - "Magnifico" # Magnificent (Venetian nobility) + - "Nobil Homo" # Noble man (Venetian) + - "Nobil Donna" # Noble woman (Venetian) + + # Deceased markers + deceased: + - "fu" # the late + - "quondam" # the late (Latin) + - "q." # abbreviation for quondam + - "bone memorie" # of good memory + - "defunto" # deceased (male) + - "defunta" # deceased (female) + - "morto" # dead (male) + - "morta" # dead (female) + + # Occupation/status indicators + status: + - "cittadino" # citizen + - "abitante" # resident + - "forestiero" # foreigner + - "nobile" # noble + - "popolano" # commoner + - "contadino" # peasant + - "artigiano" # artisan + - "mercante" # merchant + +# ============================================================================= +# GREEK HISTORICAL PATTERNS +# ============================================================================= +# Coverage: Byzantine, Ottoman-era Greek, Modern Greek +# Script: Greek +# ============================================================================= + +greek: + description: | + Greek historical documents span Byzantine manuscripts, Ottoman-era parish + registers (often bilingual Greek/Ottoman Turkish), and modern civil records. 
+ Names follow patronymic patterns with genitive case inflection. + + Key features: + - Patronymics in genitive case (του Ιωάννη = of Ioannis) + - Byzantine titles and honorifics + - Ottoman-era Greek with Turkish administrative terms + - Greek Orthodox church terminology + + source_types: + - "ληξιαρχικά βιβλία" # civil registry + - "μητρώα βαπτίσεων" # baptismal registers + - "μητρώα γάμων" # marriage registers + - "κώδικες μονών" # monastery codices + - "πατριαρχικά έγγραφα" # patriarchal documents + + patterns: + # Child-parent relationships + child_of: + - "υιός του" # son of (+ genitive) + - "θυγατέρα του" # daughter of (+ genitive) + - "υιός της" # son of (female, + genitive) + - "θυγατέρα της" # daughter of (female, + genitive) + - "γιος του" # son of (modern) + - "κόρη του" # daughter of (modern) + - "τέκνον του" # child of (archaic) + - "παις του" # child of (Byzantine) + - "του" # of (genitive article, male) + - "της" # of (genitive article, female) + - "γνήσιος υιός" # legitimate son + - "γνησία θυγατέρα" # legitimate daughter + - "νόθος υιός" # illegitimate son + - "νόθη θυγατέρα" # illegitimate daughter + + # Spouse relationships + spouse: + - "σύζυγος του" # spouse of (male possessor) + - "σύζυγος της" # spouse of (female possessor) + - "γυναίκα του" # wife of + - "άνδρας της" # husband of + - "ομόζυγος" # spouse (formal) + - "νυμφευθείσα με" # married to (female) + - "νυμφευθείς με" # married to (male) + - "παντρεμένη με" # married to (female, modern) + - "παντρεμένος με" # married to (male, modern) + + # Widow/widower + widow: + - "χήρα του" # widow of + - "χήρος της" # widower of + - "η χήρα" # the widow + - "μακαρίτης" # the late (male, blessed) + - "μακαρίτισσα" # the late (female, blessed) + - "αείμνηστος" # of eternal memory (male) + - "αείμνηστη" # of eternal memory (female) + + # Siblings + sibling: + - "αδελφός του" # brother of + - "αδελφή του" # sister of + - "ετεροθαλής αδελφός" # half-brother + - "ετεροθαλής αδελφή" # half-sister + - 
"ομομήτριος" # maternal half-sibling + - "ομοπάτριος" # paternal half-sibling + + # Grandparents/grandchildren + grandparent: + - "παππούς" # grandfather + - "γιαγιά" # grandmother + - "πάππος" # grandfather (archaic) + - "μάμμη" # grandmother (archaic) + - "εγγονός του" # grandson of + - "εγγονή του" # granddaughter of + - "προπάππος" # great-grandfather + - "προμάμμη" # great-grandmother + + # Extended family + extended: + - "θείος" # uncle + - "θεία" # aunt + - "ανιψιός" # nephew + - "ανιψιά" # niece + - "ξάδελφος" # cousin (male) + - "ξαδέλφη" # cousin (female) + - "πρωτοξάδελφος" # first cousin (male) + - "πρωτοξαδέλφη" # first cousin (female) + + # In-laws + in_law: + - "πεθερός" # father-in-law + - "πεθερά" # mother-in-law + - "γαμπρός" # son-in-law / brother-in-law + - "νύφη" # daughter-in-law / bride + - "κουνιάδος" # brother-in-law + - "κουνιάδα" # sister-in-law + - "συμπέθερος" # co-father-in-law + - "συμπεθέρα" # co-mother-in-law + + # Godparent relationships + godparent: + - "νονός" # godfather + - "νονά" # godmother + - "ανάδοχος" # godparent (formal) + - "βαπτιστικός" # godson + - "βαπτιστική" # goddaughter + - "κουμπάρος" # best man / godfather + - "κουμπάρα" # best woman / godmother + - "σύντεκνος" # co-parent (spiritual kinship) + + # Honorifics and titles + honorifics: + - "Κύριος" # Mister / Lord + - "Κυρία" # Mrs. 
/ Lady + - "Δεσπότης" # Bishop / Master + - "Δεσπόινα" # Lady (unmarried) + - "Άρχων" # Archon (Byzantine title) + - "Σεβαστός" # Sebastos (Byzantine title) + - "Πρωτοσπαθάριος" # Protospatharios (Byzantine) + - "Μέγας Λογοθέτης" # Grand Logothete (Byzantine) + - "Παπάς" # Priest (Orthodox) + - "Ιερεύς" # Priest (formal) + - "Διάκονος" # Deacon + - "Μοναχός" # Monk + - "Μοναχή" # Nun + - "Ηγούμενος" # Abbot + - "Ηγουμένη" # Abbess + + # Deceased markers + deceased: + - "μακαρίτης" # the late (blessed, male) + - "μακαρίτισσα" # the late (blessed, female) + - "αείμνηστος" # of eternal memory (male) + - "αείμνηστη" # of eternal memory (female) + - "αποθανών" # the deceased (male) + - "αποθανούσα" # the deceased (female) + +# ============================================================================= +# RUSSIAN HISTORICAL PATTERNS +# ============================================================================= +# Coverage: Imperial Russia, Soviet era, modern Russia +# Script: Cyrillic +# ============================================================================= + +russian: + description: | + Russian historical documents include metrical books (метрические книги), + revision lists (ревизские сказки), and church records. Names follow + a three-part system: given name, patronymic (отчество), and surname. + + Key features: + - Patronymics formed with -ович/-евич (male) or -овна/-евна (female) + - Estate/class designations (сословие) + - Diminutive forms of given names + - Pre-revolutionary vs. 
Soviet-era terminology + + source_types: + - "метрические книги" # metrical books (vital records) + - "ревизские сказки" # revision lists (censuses) + - "исповедные росписи" # confession lists + - "клировые ведомости" # clerical records + - "формулярные списки" # service records + - "родословные книги" # genealogical books + + patterns: + # Child-parent relationships + child_of: + - "сын" # son + - "дочь" # daughter + - "сын крестьянина" # son of a peasant + - "дочь мещанина" # daughter of a townsman + - "законный сын" # legitimate son + - "законная дочь" # legitimate daughter + - "незаконнорождённый" # illegitimate (male) + - "незаконнорождённая" # illegitimate (female) + - "приёмный сын" # adopted son + - "приёмная дочь" # adopted daughter + - "воспитанник" # foster child (male) + - "воспитанница" # foster child (female) + + # Patronymic patterns + patronymic: + - "-ович" # male patronymic suffix (after consonant) + - "-евич" # male patronymic suffix (after vowel/soft consonant) + - "-ич" # male patronymic suffix (short form) + - "-овна" # female patronymic suffix (after consonant) + - "-евна" # female patronymic suffix (after vowel/soft consonant) + - "-ична" # female patronymic suffix (names ending in -а/-я, e.g. Никитична) + - "-инична" # female patronymic suffix (variant of -ична, e.g. Ильинична) + + # Spouse relationships + spouse: + - "жена" # wife + - "муж" # husband + - "супруга" # spouse (female) + - "супруг" # spouse (male) + - "законная жена" # lawful wife + - "законный муж" # lawful husband + - "венчаная жена" # church-married wife + - "венчаный муж" # church-married husband + + # Widow/widower + widow: + - "вдова" # widow + - "вдовец" # widower + - "вдова после" # widow of (+ genitive) + - "покойный" # the late (male) + - "покойная" # the late (female) + - "умерший" # the deceased (male) + - "умершая" # the deceased (female) + - "в Бозе почивший" # resting in God (male, religious) + - "в Бозе почившая" # resting in God (female, religious) + + # Siblings + sibling: + - "брат" # brother + - 
"сестра" # sister + - "родной брат" # full brother + - "родная сестра" # full sister + - "единоутробный брат" # maternal half-brother + - "единоутробная сестра" # maternal half-sister + - "единокровный брат" # paternal half-brother + - "единокровная сестра" # paternal half-sister + - "сводный брат" # stepbrother + - "сводная сестра" # stepsister + + # Grandparents/grandchildren + grandparent: + - "дед" # grandfather + - "бабка" # grandmother (archaic) + - "бабушка" # grandmother + - "дедушка" # grandfather (affectionate) + - "внук" # grandson + - "внучка" # granddaughter + - "прадед" # great-grandfather + - "прабабка" # great-grandmother + - "правнук" # great-grandson + - "правнучка" # great-granddaughter + + # Extended family + extended: + - "дядя" # uncle + - "тётя" # aunt + - "тётка" # aunt (archaic) + - "племянник" # nephew + - "племянница" # niece + - "двоюродный брат" # first cousin (male) + - "двоюродная сестра" # first cousin (female) + - "троюродный брат" # second cousin (male) + - "троюродная сестра" # second cousin (female) + + # In-laws + in_law: + - "тесть" # father-in-law (wife's father) + - "тёща" # mother-in-law (wife's mother) + - "свёкор" # father-in-law (husband's father) + - "свекровь" # mother-in-law (husband's mother) + - "зять" # son-in-law / sister's husband + - "невестка" # daughter-in-law + - "сноха" # daughter-in-law (son's wife) + - "деверь" # husband's brother + - "золовка" # husband's sister + - "шурин" # wife's brother + - "свояченица" # wife's sister + - "свояк" # wife's sister's husband + + # Godparent relationships + godparent: + - "крёстный отец" # godfather + - "крёстная мать" # godmother + - "крёстный" # godfather (short) + - "крёстная" # godmother (short) + - "крестник" # godson + - "крестница" # goddaughter + - "кум" # co-father (spiritual kinship) + - "кума" # co-mother (spiritual kinship) + - "восприемник" # godparent (formal, male) + - "восприемница" # godparent (formal, female) + + # Estate/class designations (сословия) + estate: + - "дворянин" # 
nobleman + - "дворянка" # noblewoman + - "потомственный дворянин" # hereditary nobleman + - "личный дворянин" # personal nobleman + - "купец" # merchant + - "купчиха" # merchant's wife + - "мещанин" # townsman + - "мещанка" # townswoman + - "крестьянин" # peasant + - "крестьянка" # peasant woman + - "однодворец" # single-homesteader + - "казак" # Cossack + - "казачка" # Cossack woman + - "разночинец" # person of mixed estate + - "священник" # priest + - "дьякон" # deacon + - "дьячок" # sexton + - "пономарь" # acolyte + + # Honorifics and titles + honorifics: + - "господин" # Mister / Sir + - "госпожа" # Mrs. / Madam + - "Его Превосходительство" # His Excellency + - "Её Превосходительство" # Her Excellency + - "Его Сиятельство" # His Highness (for counts) + - "Её Сиятельство" # Her Highness + - "князь" # prince + - "княгиня" # princess + - "граф" # count + - "графиня" # countess + - "барон" # baron + - "баронесса" # baroness + - "отец" # Father (priest) + - "батюшка" # Father (priest, affectionate) + - "матушка" # Mother (priest's wife) + + # Deceased markers + deceased: + - "покойный" # the late (male) + - "покойная" # the late (female) + - "умерший" # deceased (male) + - "умершая" # deceased (female) + - "скончавшийся" # passed away (male) + - "скончавшаяся" # passed away (female) + - "в Бозе почивший" # resting in God (male) + - "в Бозе почившая" # resting in God (female) + - "приснопамятный" # of blessed memory (male) + - "приснопамятная" # of blessed memory (female) + +# ============================================================================= +# END OF MODULE +# ============================================================================= diff --git a/data/entity_annotation/test_outputs/arabic_waqf_extraction_20251212_132017.json b/data/entity_annotation/test_outputs/arabic_waqf_extraction_20251212_132017.json new file mode 100644 index 0000000000..923277cbd5 --- /dev/null +++ 
b/data/entity_annotation/test_outputs/arabic_waqf_extraction_20251212_132017.json @@ -0,0 +1,163 @@ +{ + "pico_observation": { + "observation_id": "waqf_aleppo_1225h", + "observed_at": "2023-10-27T10:00:00Z", + "source_type": "waqf_document", + "source_reference": "Aleppo Waqf, 1225 H" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "literalName": "الحاج أحمد بن محمد العمري", + "literalName_romanized": "al-Hajj Ahmad ibn Muhammad al-Umari", + "givenName": "أحمد", + "givenName_romanized": "Ahmad", + "patronym": "محمد", + "patronym_romanized": "Muhammad", + "baseSurname": "العمري", + "baseSurname_romanized": "al-Umari", + "honorificPrefix": "الحاج", + "honorificPrefix_romanized": "al-Hajj" + }, + "roles": [ + { + "role_title": "تاجر", + "role_title_romanized": "tajir", + "role_in_source": "founder" + } + ], + "biographical": { + "deceased": true, + "address": "مدينة حلب الشهباء" + }, + "family_relationships": { + "parent": [ + { + "person_index": 1, + "target_name": "محمد بن عبد الله العمري" + } + ], + "children": [] + }, + "context": "The founder (waqif) of the endowment, a deceased merchant from Aleppo." + }, + { + "person_index": 1, + "pnv_name": { + "literalName": "المرحوم محمد بن عبد الله العمري", + "literalName_romanized": "al-marhum Muhammad ibn Abd Allah al-Umari", + "givenName": "محمد", + "givenName_romanized": "Muhammad", + "patronym": "عبد الله", + "patronym_romanized": "Abd Allah", + "baseSurname": "العمري", + "baseSurname_romanized": "al-Umari" + }, + "roles": [ + { + "role_title": null, + "role_title_romanized": null, + "role_in_source": null + } + ], + "biographical": { + "deceased": true, + "address": null + }, + "family_relationships": { + "parent": [], + "children": [ + { + "person_index": 0, + "target_name": "أحمد بن محمد العمري" + } + ] + }, + "context": "The deceased father of the founder, Ahmad al-Umari." 
+ }, + { + "person_index": 2, + "pnv_name": { + "literalName": "الحاج إبراهيم بن يوسف التركماني", + "literalName_romanized": "al-Hajj Ibrahim ibn Yusuf al-Turkmani", + "givenName": "إبراهيم", + "givenName_romanized": "Ibrahim", + "patronym": "يوسف", + "patronym_romanized": "Yusuf", + "baseSurname": "التركماني", + "baseSurname_romanized": "al-Turkmani", + "honorificPrefix": "الحاج", + "honorificPrefix_romanized": "al-Hajj" + }, + "roles": [ + { + "role_title": "شاهد", + "role_title_romanized": "shahid", + "role_in_source": "witness" + } + ], + "biographical": { + "deceased": null, + "address": null + }, + "family_relationships": { + "parent": [], + "children": [] + }, + "context": "One of the witnesses to the endowment deed." + }, + { + "person_index": 3, + "pnv_name": { + "literalName": "السيد علي بن حسين الحلبي", + "literalName_romanized": "al-Sayyid Ali ibn Husayn al-Halabi", + "givenName": "علي", + "givenName_romanized": "Ali", + "patronym": "حسين", + "patronym_romanized": "Husayn", + "baseSurname": "الحلبي", + "baseSurname_romanized": "al-Halabi", + "honorificPrefix": "السيد", + "honorificPrefix_romanized": "al-Sayyid" + }, + "roles": [ + { + "role_title": "شاهد", + "role_title_romanized": "shahid", + "role_in_source": "witness" + } + ], + "biographical": { + "deceased": null, + "address": null + }, + "family_relationships": { + "parent": [], + "children": [] + }, + "context": "The second witness to the endowment deed." 
+ } + ], + "temporal_references": [ + { + "expression": "شهر رجب سنة ألف ومائتين وخمس وعشرين هجرية", + "expression_romanized": "Shahr Rajab sanat alf wa mi'ayn wa khamsa wa 'ishrin hijriyyah", + "normalized": "1811-01", + "calendar": "Hijri", + "type": "DATE" + } + ], + "locations_mentioned": [ + { + "name": "حلب الشهباء", + "name_romanized": "Halab al-Shahba'", + "type": "city" + }, + { + "name": "محلة الجديدة", + "name_romanized": "Mahallat al-Jadida", + "type": "neighborhood" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/arabic_waqf_extraction_20251212_152524.json b/data/entity_annotation/test_outputs/arabic_waqf_extraction_20251212_152524.json new file mode 100644 index 0000000000..8091b103de --- /dev/null +++ b/data/entity_annotation/test_outputs/arabic_waqf_extraction_20251212_152524.json @@ -0,0 +1,93 @@ +{ + "pico_observation": { + "observation_id": "waqf_doc_001", + "source_type": "Waqf Document", + "source_reference": "Arabic Waqf Deed Snippet" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "script": "أحمد بن محمد العمري", + "romanized": "Ahmad ibn Muhammad al-Umari", + "full_name": "المرحوم الحاج أحمد بن محمد العمري" + }, + "roles": [ + "founder" + ], + "biographical": { + "status": "deceased", + "occupation": "تاجر", + "address": "مدينة حلب الشهباء" + }, + "family_relationships": { + "father": "محمد بن عبد الله العمري" + }, + "context": "The founder (waqif) who endowed his house for his descendants." + }, + { + "person_index": 1, + "pnv_name": { + "script": "محمد بن عبد الله العمري", + "romanized": "Muhammad ibn Abdullah al-Umari", + "full_name": "المرحوم محمد بن عبد الله العمري" + }, + "roles": [], + "biographical": { + "status": "deceased" + }, + "family_relationships": { + "son": "أحمد بن محمد العمري" + }, + "context": "Father of the founder, mentioned in his patronymic." 
+ }, + { + "person_index": 2, + "pnv_name": { + "script": "إبراهيم بن يوسف التركماني", + "romanized": "Ibrahim ibn Yusuf al-Turkmani", + "full_name": "الحاج إبراهيم بن يوسف التركماني" + }, + "roles": [ + "witness" + ], + "biographical": {}, + "family_relationships": {}, + "context": "A witness to the waqf deed." + }, + { + "person_index": 3, + "pnv_name": { + "script": "علي بن حسين الحلبي", + "romanized": "Ali ibn Husayn al-Halabi", + "full_name": "السيد علي بن حسين الحلبي" + }, + "roles": [ + "witness" + ], + "biographical": {}, + "family_relationships": {}, + "context": "A witness to the waqf deed." + } + ], + "temporal_references": [ + { + "expression": "شهر رجب سنة ألف ومائتين وخمس وعشرين هجرية", + "expression_romanized": "shahr rajab sanat alf wa mi'atayn wa khamsa wa 'ishrin hijriyya", + "normalized": "1225 AH", + "calendar": "Hijri" + } + ], + "locations_mentioned": [ + { + "name": "حلب", + "name_romanized": "Halab", + "type": "city" + }, + { + "name": "الجديدة", + "name_romanized": "al-Jadida", + "type": "neighborhood" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/dutch_marriage_extraction_20251212_145817.json b/data/entity_annotation/test_outputs/dutch_marriage_extraction_20251212_145817.json new file mode 100644 index 0000000000..b3a01c2aaa --- /dev/null +++ b/data/entity_annotation/test_outputs/dutch_marriage_extraction_20251212_145817.json @@ -0,0 +1,139 @@ +{ + "pico_observation": { + "observation_id": "marriage_cert_1885-03-04_haarlem_001", + "source_type": "marriage_certificate", + "source_reference": "Haarlem, 4 March 1885, marriage of Johannes Petrus van der Berg and Cornelia Wilhelmina de Groot" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "person_name": "Johannes Petrus", + "family_name": "van der Berg", + "tussenvoegsel": "van der", + "geslachtsnaam": "Berg" + }, + "roles": [ + "groom" + ], + "biographical": { + "age": 30, + "occupation": "koopman", + "birth_place": "Amsterdam", + 
"residence": "Haarlem", + "civil_status": "meerderjarige" + }, + "family_relationships": { + "father": { + "person_name": "Pieter", + "family_name": "van der Berg", + "tussenvoegsel": "van der", + "geslachtsnaam": "Berg", + "status": "deceased", + "occupation": "koopman" + }, + "mother": { + "person_name": "Maria Johanna", + "family_name": "Bakker", + "geslachtsnaam": "Bakker", + "status": "living", + "occupation": "zonder beroep", + "residence": "Amsterdam" + } + }, + "context": "Groom, son of Pieter van der Berg and Maria Johanna Bakker." + }, + { + "person_index": 1, + "pnv_name": { + "person_name": "Cornelia Wilhelmina", + "family_name": "de Groot", + "tussenvoegsel": "de", + "geslachtsnaam": "Groot" + }, + "roles": [ + "bride" + ], + "biographical": { + "age": 25, + "occupation": "zonder beroep", + "birth_place": "Haarlem", + "residence": "Haarlem", + "civil_status": "meerderjarige" + }, + "family_relationships": { + "father": { + "person_name": "Hendrik", + "family_name": "de Groot", + "tussenvoegsel": "de", + "geslachtsnaam": "Groot", + "status": "living", + "occupation": "timmerman" + }, + "mother": { + "person_name": "Elisabeth", + "family_name": "van Dijk", + "tussenvoegsel": "van", + "geslachtsnaam": "Dijk", + "status": "deceased" + } + }, + "context": "Bride, daughter of Hendrik de Groot and Elisabeth van Dijk." + }, + { + "person_index": 2, + "pnv_name": { + "person_name": "Willem Frederik", + "family_name": "Smit", + "geslachtsnaam": "Smit" + }, + "roles": [ + "witness" + ], + "biographical": { + "age": 40, + "occupation": "notaris" + }, + "family_relationships": {}, + "context": "Witness to the marriage." + }, + { + "person_index": 3, + "pnv_name": { + "person_name": "Jacobus Hendrikus", + "family_name": "Jansen", + "geslachtsnaam": "Jansen" + }, + "roles": [ + "witness" + ], + "biographical": { + "age": 35, + "occupation": "klerk" + }, + "family_relationships": {}, + "context": "Witness to the marriage." 
+ } + ], + "temporal_references": [ + { + "expression": "vierden Maart achttien honderd vijf en tachtig", + "normalized": "1885-03-04", + "calendar": "Gregorian" + } + ], + "locations_mentioned": [ + { + "name": "Haarlem", + "type": "municipality" + }, + { + "name": "Amsterdam", + "type": "city" + }, + { + "name": "Haarlem", + "type": "city" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/dutch_marriage_extraction_20251212_152853.json b/data/entity_annotation/test_outputs/dutch_marriage_extraction_20251212_152853.json new file mode 100644 index 0000000000..286cc23b91 --- /dev/null +++ b/data/entity_annotation/test_outputs/dutch_marriage_extraction_20251212_152853.json @@ -0,0 +1,185 @@ +{ + "pico_observation": { + "observation_id": "obs_haarlem_1885-03-04", + "source_type": "marriage_certificate", + "source_reference": "Haarlem, 1885-03-04" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "person_name_standard_text": "Johannes Petrus van der Berg", + "person_name_given_name": "Johannes Petrus", + "person_name_family_name_prefix": "van der", + "person_name_family_name": "Berg" + }, + "roles": [ + "groom" + ], + "biographical": { + "age": 30, + "occupation": "koopman", + "birth_place": "Amsterdam", + "residence": "Haarlem", + "civil_status": "meerderjarige" + }, + "family_relationships": { + "relationship_to_parents": "zoon van", + "father": "Pieter van der Berg", + "mother": "Maria Johanna Bakker" + }, + "context": "Groom, 30-year-old merchant, born in Amsterdam, residing in Haarlem, son of the late Pieter van der Berg and Maria Johanna Bakker." 
+ }, + { + "person_index": 1, + "pnv_name": { + "person_name_standard_text": "Pieter van der Berg", + "person_name_given_name": "Pieter", + "person_name_family_name_prefix": "van der", + "person_name_family_name": "Berg" + }, + "roles": [ + "father_of_groom" + ], + "biographical": { + "deceased": true, + "occupation": "koopman" + }, + "family_relationships": { + "father_of": "Johannes Petrus van der Berg" + }, + "context": "Father of the groom, deceased, was a merchant." + }, + { + "person_index": 2, + "pnv_name": { + "person_name_standard_text": "Maria Johanna Bakker", + "person_name_given_name": "Maria Johanna", + "person_name_family_name": "Bakker" + }, + "roles": [ + "mother_of_groom" + ], + "biographical": { + "occupation": "zonder beroep", + "residence": "Amsterdam" + }, + "family_relationships": { + "mother_of": "Johannes Petrus van der Berg" + }, + "context": "Mother of the groom, without occupation, residing in Amsterdam." + }, + { + "person_index": 3, + "pnv_name": { + "person_name_standard_text": "Cornelia Wilhelmina de Groot", + "person_name_given_name": "Cornelia Wilhelmina", + "person_name_family_name_prefix": "de", + "person_name_family_name": "Groot" + }, + "roles": [ + "bride" + ], + "biographical": { + "age": 25, + "occupation": "zonder beroep", + "birth_place": "Haarlem", + "residence": "Haarlem", + "civil_status": "meerderjarige" + }, + "family_relationships": { + "relationship_to_parents": "dochter van", + "father": "Hendrik de Groot", + "mother": "Elisabeth van Dijk" + }, + "context": "Bride, 25-year-old without occupation, born in Haarlem, residing in Haarlem, daughter of Hendrik de Groot and the late Elisabeth van Dijk." 
+ }, + { + "person_index": 4, + "pnv_name": { + "person_name_standard_text": "Hendrik de Groot", + "person_name_given_name": "Hendrik", + "person_name_family_name_prefix": "de", + "person_name_family_name": "Groot" + }, + "roles": [ + "father_of_bride" + ], + "biographical": { + "occupation": "timmerman" + }, + "family_relationships": { + "father_of": "Cornelia Wilhelmina de Groot" + }, + "context": "Father of the bride, a carpenter." + }, + { + "person_index": 5, + "pnv_name": { + "person_name_standard_text": "Elisabeth van Dijk", + "person_name_given_name": "Elisabeth", + "person_name_family_name_prefix": "van", + "person_name_family_name": "Dijk" + }, + "roles": [ + "mother_of_bride" + ], + "biographical": { + "deceased": true + }, + "family_relationships": { + "mother_of": "Cornelia Wilhelmina de Groot" + }, + "context": "Mother of the bride, deceased." + }, + { + "person_index": 6, + "pnv_name": { + "person_name_standard_text": "Willem Frederik Smit", + "person_name_given_name": "Willem Frederik", + "person_name_family_name": "Smit" + }, + "roles": [ + "witness" + ], + "biographical": { + "age": 40, + "occupation": "notaris" + }, + "context": "Witness, 40-year-old notary." + }, + { + "person_index": 7, + "pnv_name": { + "person_name_standard_text": "Jacobus Hendrikus Jansen", + "person_name_given_name": "Jacobus Hendrikus", + "person_name_family_name": "Jansen" + }, + "roles": [ + "witness" + ], + "biographical": { + "age": 35, + "occupation": "klerk" + }, + "context": "Witness, 35-year-old clerk." 
+ } + ], + "temporal_references": [ + { + "expression": "vierden Maart achttien honderd vijf en tachtig", + "normalized": "1885-03-04", + "calendar": "Gregorian" + } + ], + "locations_mentioned": [ + { + "name": "Amsterdam", + "type": "city" + }, + { + "name": "Haarlem", + "type": "city" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/greek_baptismal_extraction_20251212_152118.json b/data/entity_annotation/test_outputs/greek_baptismal_extraction_20251212_152118.json new file mode 100644 index 0000000000..e3f235f874 --- /dev/null +++ b/data/entity_annotation/test_outputs/greek_baptismal_extraction_20251212_152118.json @@ -0,0 +1,139 @@ +{ + "pico_observation": { + "observation_id": "bap_reg_thess_1875_03_15_01", + "source_type": "baptismal_register", + "source_reference": "Greek Orthodox Baptismal Register, Thessaloniki, 15 March 1875" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "literalName": "Δημήτριος", + "literalName_romanized": "Dēmētrios" + }, + "roles": [ + "baptized" + ], + "biographical": {}, + "family_relationships": { + "father": "Νικόλαος Παπαδόπουλος", + "mother": "Ἑλένη" + }, + "context": "The baptized, son of Nikolaos Papadopoulos and Elenē." + }, + { + "person_index": 1, + "pnv_name": { + "literalName": "Νικόλαος Παπαδόπουλος", + "literalName_romanized": "Nikolaos Papadopoulos" + }, + "roles": [ + "parent" + ], + "biographical": { + "occupation": "ἔμπορος" + }, + "family_relationships": { + "son": "Δημήτριος", + "wife": "Ἑλένη" + }, + "context": "Father of the baptized, a merchant, husband of Elenē." + }, + { + "person_index": 2, + "pnv_name": { + "literalName": "Ἑλένη", + "literalName_romanized": "Elenē" + }, + "roles": [ + "parent" + ], + "biographical": {}, + "family_relationships": { + "son": "Δημήτριος", + "husband": "Νικόλαος Παπαδόπουλος", + "father": "Γεώργιος Οἰκόνομος" + }, + "context": "Mother of the baptized, wife of Nikolaos Papadopoulos, daughter of the late Geōrgios Oikonomos." 
+ }, + { + "person_index": 3, + "pnv_name": { + "literalName": "Γεώργιος Οἰκόνομος", + "literalName_romanized": "Geōrgios Oikonomos" + }, + "roles": [ + "grandparent" + ], + "biographical": { + "deceased": true + }, + "family_relationships": { + "daughter": "Ἑλένη" + }, + "context": "The late father of the mother (Elenē)." + }, + { + "person_index": 4, + "pnv_name": { + "literalName": "Κωνσταντῖνος Καρατζᾶς", + "literalName_romanized": "Kōnstantinos Karatzas" + }, + "roles": [ + "godparent" + ], + "biographical": { + "occupation": "ἰατρός" + }, + "family_relationships": { + "father": "Ἰωάννης" + }, + "context": "Godparent, a physician, son of Iōannēs." + }, + { + "person_index": 5, + "pnv_name": { + "literalName": "Ἰωάννης", + "literalName_romanized": "Iōannēs" + }, + "roles": [ + "godparent's_parent" + ], + "biographical": {}, + "family_relationships": { + "son": "Κωνσταντῖνος Καρατζᾶς" + }, + "context": "Father of the godparent (Kōnstantinos Karatzas)." + }, + { + "person_index": 6, + "pnv_name": { + "literalName": "Ἀθανάσιος Χρυσοστόμου", + "literalName_romanized": "Athanasios Chrysostomou" + }, + "roles": [ + "priest" + ], + "biographical": { + "ecclesiastical_title": "Πρωτοπρεσβύτερος" + }, + "family_relationships": {}, + "context": "The officiating priest, an Archpriest." 
+ } + ], + "temporal_references": [ + { + "expression": "τῇ δεκάτῃ πέμπτῃ Μαρτίου τοῦ ἔτους 1875", + "expression_romanized": "tē dekatē pemptē Martiou tou etous 1875", + "normalized": "1875-03-15", + "calendar": "Julian" + } + ], + "locations_mentioned": [ + { + "name": "Θεσσαλονίκῃ", + "name_romanized": "Thessalonikē", + "type": "city" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/greek_baptismal_extraction_20251212_153159.json b/data/entity_annotation/test_outputs/greek_baptismal_extraction_20251212_153159.json new file mode 100644 index 0000000000..073307ee71 --- /dev/null +++ b/data/entity_annotation/test_outputs/greek_baptismal_extraction_20251212_153159.json @@ -0,0 +1,124 @@ +{ + "pico_observation": { + "observation_id": "bap_reg_thess_1875_03_15_01", + "source_type": "baptismal_register", + "source_reference": "Thessaloniki Baptismal Register, 15 March 1875" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "literalName": "Δημήτριος", + "literalName_romanized": "Dēmētrios" + }, + "roles": [ + "baptized" + ], + "biographical": {}, + "family_relationships": { + "father": "Νικολάου Παπαδοπούλου", + "mother": "Ἑλένης" + }, + "context": "Son of Nikolaos Papadopoulos and Eleni, baptized in Thessaloniki." + }, + { + "person_index": 1, + "pnv_name": { + "literalName": "Νικολάου Παπαδοπούλου", + "literalName_romanized": "Nikolaou Papadopoulou" + }, + "roles": [ + "parent" + ], + "biographical": { + "occupation": "ἔμπορος" + }, + "family_relationships": { + "son": "Δημήτριος", + "wife": "Ἑλένης" + }, + "context": "Father of the baptized Dimitrios, merchant, husband of Eleni." 
+ }, + { + "person_index": 2, + "pnv_name": { + "literalName": "Ἑλένης", + "literalName_romanized": "Elenēs" + }, + "roles": [ + "parent" + ], + "biographical": {}, + "family_relationships": { + "son": "Δημήτριος", + "husband": "Νικολάου Παπαδοπούλου", + "father": "μακαρίτου Γεωργίου Οἰκονόμου" + }, + "context": "Mother of the baptized Dimitrios, wife of Nikolaos Papadopoulos, daughter of the late Georgios Oikonomou." + }, + { + "person_index": 3, + "pnv_name": { + "literalName": "Γεωργίου Οἰκονόμου", + "literalName_romanized": "Geōrgiou Oikonomou" + }, + "roles": [ + "grandparent" + ], + "biographical": { + "deceased": true + }, + "family_relationships": { + "daughter": "Ἑλένης" + }, + "context": "Late father of Eleni, maternal grandfather of the baptized Dimitrios." + }, + { + "person_index": 4, + "pnv_name": { + "literalName": "Κωνσταντῖνος Καρατζᾶς", + "literalName_romanized": "Kōnstantinos Karatzas" + }, + "roles": [ + "godparent" + ], + "biographical": { + "occupation": "ἰατρός" + }, + "family_relationships": { + "father": "Ἰωάννου" + }, + "context": "Godparent of Dimitrios, son of Ioannis, physician." + }, + { + "person_index": 5, + "pnv_name": { + "literalName": "Ἀθανάσιος Χρυσοστόμου", + "literalName_romanized": "Athanasios Chrysostomou" + }, + "roles": [ + "priest" + ], + "biographical": { + "ecclesiastical_title": "Πρωτοπρεσβύτερος" + }, + "family_relationships": {}, + "context": "Archpriest who performed the baptism." 
+ } + ], + "temporal_references": [ + { + "expression": "τῇ δεκάτῃ πέμπτῃ Μαρτίου τοῦ ἔτους 1875", + "expression_romanized": "tē dekatē pemptē Martiou tou etous 1875", + "normalized": "1875-03-15", + "calendar": "Julian" + } + ], + "locations_mentioned": [ + { + "name": "Θεσσαλονίκῃ", + "name_romanized": "Thessalonikē", + "type": "city" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/hebrew_ketubah_extraction_20251212_133437.json b/data/entity_annotation/test_outputs/hebrew_ketubah_extraction_20251212_133437.json new file mode 100644 index 0000000000..49d00bc353 --- /dev/null +++ b/data/entity_annotation/test_outputs/hebrew_ketubah_extraction_20251212_133437.json @@ -0,0 +1,252 @@ +{ + "pico_observation": { + "observation_id": "ketubah_vilna_5605_obs_001", + "source_type": "ketubah", + "source_reference": "Vilna Ketubah, 12 Iyar 5605" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "script": "Hebrew", + "text": "יצחק", + "romanized": "Yitzchak" + }, + "roles": [ + "groom" + ], + "biographical": { + "patronymic": { + "script": "Hebrew", + "text": "בן הר״ר אברהם", + "romanized": "ben HaRav Avraham" + }, + "tribal_affiliation": { + "script": "Hebrew", + "text": "הכהן", + "romanized": "haKohen" + }, + "honorifics": [ + { + "script": "Hebrew", + "text": "הבחור", + "romanized": "haBachur" + } + ] + }, + "family_relationships": { + "father": { + "person_index": 1, + "relationship_type": "paternal" + } + }, + "context": "The groom, son of Avraham haKohen." 
+ }, + { + "person_index": 1, + "pnv_name": { + "script": "Hebrew", + "text": "אברהם", + "romanized": "Avraham" + }, + "roles": [], + "biographical": { + "honorifics": [ + { + "script": "Hebrew", + "text": "הר״ר", + "romanized": "HaRav" + } + ], + "tribal_affiliation": { + "script": "Hebrew", + "text": "הכהן", + "romanized": "haKohen" + }, + "deceased_marker": { + "script": "Hebrew", + "text": "ז״ל", + "romanized": "z'l" + } + }, + "family_relationships": { + "child": { + "person_index": 0, + "relationship_type": "paternal" + } + }, + "context": "Father of the groom, of blessed memory." + }, + { + "person_index": 2, + "pnv_name": { + "script": "Hebrew", + "text": "מרים", + "romanized": "Miriam" + }, + "roles": [ + "bride" + ], + "biographical": { + "patronymic": { + "script": "Hebrew", + "text": "בת הר״ר משה", + "romanized": "bat HaRav Moshe" + }, + "tribal_affiliation": { + "script": "Hebrew", + "text": "הלוי", + "romanized": "haLevi" + }, + "honorifics": [ + { + "script": "Hebrew", + "text": "מרת", + "romanized": "Marat" + } + ] + }, + "family_relationships": { + "father": { + "person_index": 3, + "relationship_type": "paternal" + } + }, + "context": "The bride, daughter of Moshe haLevi." + }, + { + "person_index": 3, + "pnv_name": { + "script": "Hebrew", + "text": "משה", + "romanized": "Moshe" + }, + "roles": [], + "biographical": { + "honorifics": [ + { + "script": "Hebrew", + "text": "הר״ר", + "romanized": "HaRav" + } + ], + "tribal_affiliation": { + "script": "Hebrew", + "text": "הלוי", + "romanized": "haLevi" + } + }, + "family_relationships": { + "child": { + "person_index": 2, + "relationship_type": "paternal" + } + }, + "context": "Father of the bride." 
+ }, + { + "person_index": 4, + "pnv_name": { + "script": "Hebrew", + "text": "שמעון", + "romanized": "Shimon" + }, + "roles": [ + "witness" + ], + "biographical": { + "patronymic": { + "script": "Hebrew", + "text": "בן יעקב", + "romanized": "ben Yaakov" + }, + "tribal_affiliation": { + "script": "Hebrew", + "text": "הכהן", + "romanized": "haKohen" + } + }, + "family_relationships": { + "father": { + "person_index": 5, + "relationship_type": "paternal" + } + }, + "context": "First witness to the marriage." + }, + { + "person_index": 5, + "pnv_name": { + "script": "Hebrew", + "text": "יעקב", + "romanized": "Yaakov" + }, + "roles": [], + "biographical": {}, + "family_relationships": { + "child": { + "person_index": 4, + "relationship_type": "paternal" + } + }, + "context": "Father of the first witness, Shimon." + }, + { + "person_index": 6, + "pnv_name": { + "script": "Hebrew", + "text": "דוד", + "romanized": "David" + }, + "roles": [ + "witness" + ], + "biographical": { + "patronymic": { + "script": "Hebrew", + "text": "בן אליהו", + "romanized": "ben Eliyahu" + } + }, + "family_relationships": { + "father": { + "person_index": 7, + "relationship_type": "paternal" + } + }, + "context": "Second witness to the marriage." + }, + { + "person_index": 7, + "pnv_name": { + "script": "Hebrew", + "text": "אליהו", + "romanized": "Eliyahu" + }, + "roles": [], + "biographical": {}, + "family_relationships": { + "child": { + "person_index": 6, + "relationship_type": "paternal" + } + }, + "context": "Father of the second witness, David." 
+ } + ], + "temporal_references": [ + { + "expression": "ביום שלישי בשבת, שנים עשר יום לחודש אייר שנת חמשת אלפים שש מאות וארבעים וחמש לבריאת עולם", + "expression_romanized": "BeYom Shlishi beShabbat, Shneim Asar Yom leChodesh Iyar, Shnat Chameshet Alafim Shesh Meot veArba'im veChamesh leBriyat Olam", + "normalized": "5605-04-12", + "calendar": "Hebrew" + } + ], + "locations_mentioned": [ + { + "name": "פה ווילנא", + "name_romanized": "Po Vilna", + "type": "city" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/hebrew_ketubah_extraction_20251212_152634.json b/data/entity_annotation/test_outputs/hebrew_ketubah_extraction_20251212_152634.json new file mode 100644 index 0000000000..7a6462ffa1 --- /dev/null +++ b/data/entity_annotation/test_outputs/hebrew_ketubah_extraction_20251212_152634.json @@ -0,0 +1,202 @@ +{ + "pico_observation": { + "observation_id": "ketubah_vilna_5605_obs_1", + "source_type": "ketubah", + "source_reference": "Vilna Ketubah, 12 Iyar 5605 (1845 CE)" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "script": "Hebrew", + "text": "יצחק", + "romanized": "Yitzchak" + }, + "roles": [ + "groom", + "חתן" + ], + "biographical": { + "patronymic": { + "script": "Hebrew", + "text": "בן הר״ר אברהם", + "romanized": "ben HaRav Avraham" + }, + "tribal_affiliation": { + "script": "Hebrew", + "text": "הכהן", + "romanized": "haKohen" + }, + "honorifics": [ + "הבחור" + ] + }, + "family_relationships": { + "father": { + "person_index": 1, + "relationship_type": "paternal", + "deceased": true + } + }, + "context": "The groom, son of the late Rabbi Avraham haKohen." 
+ }, + { + "person_index": 1, + "pnv_name": { + "script": "Hebrew", + "text": "אברהם", + "romanized": "Avraham" + }, + "roles": [ + "father_of_groom" + ], + "biographical": { + "honorifics": [ + "הר״ר" + ], + "tribal_affiliation": { + "script": "Hebrew", + "text": "הכהן", + "romanized": "haKohen" + }, + "deceased_marker": { + "script": "Hebrew", + "text": "ז״ל", + "romanized": "z'l" + } + }, + "family_relationships": { + "son": { + "person_index": 0, + "relationship_type": "paternal" + } + }, + "context": "The deceased father of the groom, Rabbi Avraham haKohen." + }, + { + "person_index": 2, + "pnv_name": { + "script": "Hebrew", + "text": "מרים", + "romanized": "Miriam" + }, + "roles": [ + "bride", + "כלה" + ], + "biographical": { + "patronymic": { + "script": "Hebrew", + "text": "בת הר״ר משה", + "romanized": "bat HaRav Moshe" + }, + "tribal_affiliation": { + "script": "Hebrew", + "text": "הלוי", + "romanized": "haLevi" + }, + "honorifics": [ + "מרת" + ] + }, + "family_relationships": { + "father": { + "person_index": 3, + "relationship_type": "paternal", + "deceased": false + } + }, + "context": "The bride, daughter of Rabbi Moshe haLevi." + }, + { + "person_index": 3, + "pnv_name": { + "script": "Hebrew", + "text": "משה", + "romanized": "Moshe" + }, + "roles": [ + "father_of_bride" + ], + "biographical": { + "honorifics": [ + "הר״ר" + ], + "tribal_affiliation": { + "script": "Hebrew", + "text": "הלוי", + "romanized": "haLevi" + } + }, + "family_relationships": { + "daughter": { + "person_index": 2, + "relationship_type": "paternal" + } + }, + "context": "The father of the bride, Rabbi Moshe haLevi." 
+ }, + { + "person_index": 4, + "pnv_name": { + "script": "Hebrew", + "text": "שמעון", + "romanized": "Shimon" + }, + "roles": [ + "witness", + "עד" + ], + "biographical": { + "patronymic": { + "script": "Hebrew", + "text": "בן יעקב", + "romanized": "ben Yaakov" + }, + "tribal_affiliation": { + "script": "Hebrew", + "text": "הכהן", + "romanized": "haKohen" + } + }, + "family_relationships": {}, + "context": "First witness to the marriage." + }, + { + "person_index": 5, + "pnv_name": { + "script": "Hebrew", + "text": "דוד", + "romanized": "David" + }, + "roles": [ + "witness", + "עד" + ], + "biographical": { + "patronymic": { + "script": "Hebrew", + "text": "בן אליהו", + "romanized": "ben Eliyahu" + } + }, + "family_relationships": {}, + "context": "Second witness to the marriage." + } + ], + "temporal_references": [ + { + "expression": "ביום שלישי בשבת, שנים עשר יום לחודש אייר שנת חמשת אלפים שש מאות וארבעים וחמש לבריאת עולם", + "expression_romanized": "BeYom Shlishi BeShabbat, Shneim Asar Yom LeChodesh Iyar, Shnat Chamishat Alafim Shesh Meot VeArba'im VeChamesh LeBeriat Olam", + "normalized": "1845-04-18", + "calendar": "Hebrew" + } + ], + "locations_mentioned": [ + { + "name": "ווילנא", + "name_romanized": "Vilna", + "type": "city" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/italian_notarial_extraction_20251212_152024.json b/data/entity_annotation/test_outputs/italian_notarial_extraction_20251212_152024.json new file mode 100644 index 0000000000..faf8272688 --- /dev/null +++ b/data/entity_annotation/test_outputs/italian_notarial_extraction_20251212_152024.json @@ -0,0 +1,192 @@ +{ + "pico_observation": { + "observation_id": "obs_001", + "source_type": "notarial_act", + "source_reference": "Adì 15 Marzo 1654, in Venetia" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "given_name": "Giovanni Battista", + "surname": "Morosini" + }, + "roles": [ + "party" + ], + "biographical": { + "title": "Nobil Homo 
Messer", + "residence": "contrada di San Marco", + "status": "living" + }, + "family_relationships": { + "father": { + "type": "father", + "name": { + "pnv_name": { + "given_name": "Andrea", + "surname": null + }, + "title": "Magnifico Messer" + }, + "status": "deceased" + } + }, + "context": "Il Nobil Homo Messer Giovanni Battista Morosini fu quondam Magnifico Messer Andrea, della contrada di San Marco" + }, + { + "person_index": 1, + "pnv_name": { + "given_name": "Caterina", + "surname": "Contarini" + }, + "roles": [ + "party" + ], + "biographical": { + "title": "Nobil Donna Madonna", + "residence": "contrada di San Marco", + "status": "living" + }, + "family_relationships": { + "father": { + "type": "father", + "name": { + "pnv_name": { + "given_name": "Francesco", + "surname": null + }, + "title": "Messer" + }, + "status": "deceased" + }, + "spouse": { + "type": "spouse", + "name": { + "pnv_name": { + "given_name": "Giovanni Battista", + "surname": "Morosini" + }, + "title": "Nobil Homo Messer" + }, + "status": "living" + } + }, + "context": "sua moglie la Nobil Donna Madonna Caterina Contarini fu quondam Messer Francesco" + }, + { + "person_index": 2, + "pnv_name": { + "given_name": "Pietro", + "surname": "Fabbro" + }, + "roles": [ + "witness" + ], + "biographical": { + "title": "Messer", + "residence": "contrada di San Polo", + "status": "living" + }, + "family_relationships": { + "father": { + "type": "father", + "name": { + "pnv_name": { + "given_name": "Paolo", + "surname": null + }, + "title": null + }, + "status": "deceased" + } + }, + "context": "Messer Pietro fu Paolo Fabbro, habitante nella contrada di San Polo" + }, + { + "person_index": 3, + "pnv_name": { + "given_name": "Marco Antonio", + "surname": "Ferrari" + }, + "roles": [ + "witness" + ], + "biographical": { + "title": "Messer", + "occupation": "bottegaio", + "residence": "Rialto", + "status": "living" + }, + "family_relationships": { + "father": { + "type": "father", + "name": { + "pnv_name": 
{ + "given_name": "Giovanni", + "surname": null + }, + "title": null + }, + "status": "deceased" + } + }, + "context": "Messer Marco Antonio Ferrari fu Giovanni, bottegaio in Rialto" + }, + { + "person_index": 4, + "pnv_name": { + "given_name": "Antonio", + "surname": "Zen" + }, + "roles": [ + "notary" + ], + "biographical": { + "title": "Notaro", + "occupation": "Notaro publico di Venetia", + "residence": "Venetia", + "status": "living" + }, + "family_relationships": { + "father": { + "type": "father", + "name": { + "pnv_name": { + "given_name": "Giacomo", + "surname": null + }, + "title": "Messer" + }, + "status": "deceased" + } + }, + "context": "Rogato io Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico di Venetia" + } + ], + "temporal_references": [ + { + "expression": "15 Marzo 1654", + "normalized": "1654-03-15", + "calendar": "Gregorian" + } + ], + "locations_mentioned": [ + { + "name": "Venetia", + "type": "city" + }, + { + "name": "San Marco", + "type": "contrada" + }, + { + "name": "San Polo", + "type": "contrada" + }, + { + "name": "Rialto", + "type": "district" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/italian_notarial_extraction_20251212_153104.json b/data/entity_annotation/test_outputs/italian_notarial_extraction_20251212_153104.json new file mode 100644 index 0000000000..771a383959 --- /dev/null +++ b/data/entity_annotation/test_outputs/italian_notarial_extraction_20251212_153104.json @@ -0,0 +1,156 @@ +{ + "pico_observation": { + "observation_id": "notarial_act_1654_03_15_venezia_01", + "source_type": "notarial_act", + "source_reference": "Adì 15 Marzo 1654, in Venetia. Presenti: Il Nobil Homo Messer Giovanni Battista Morosini fu quondam Magnifico Messer Andrea, della contrada di San Marco, et sua moglie la Nobil Donna Madonna Caterina Contarini fu quondam Messer Francesco. 
Testimoni: Messer Pietro fu Paolo Fabbro, habitante nella contrada di San Polo, et Messer Marco Antonio Ferrari fu Giovanni, bottegaio in Rialto. Rogato io Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico di Venetia." + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "given_name": "Giovanni Battista", + "surname": "Morosini" + }, + "roles": [ + "party" + ], + "biographical": { + "nobility_titles": [ + "Nobil Homo", + "Messer" + ], + "residence": "contrada di San Marco" + }, + "family_relationships": { + "father": { + "name": "Andrea Morosini", + "status": "deceased", + "title": "Magnifico Messer" + }, + "spouse": "Caterina Contarini" + }, + "context": "Il Nobil Homo Messer Giovanni Battista Morosini fu quondam Magnifico Messer Andrea, della contrada di San Marco" + }, + { + "person_index": 1, + "pnv_name": { + "given_name": "Caterina", + "surname": "Contarini" + }, + "roles": [ + "party" + ], + "biographical": { + "nobility_titles": [ + "Nobil Donna", + "Madonna" + ] + }, + "family_relationships": { + "father": { + "name": "Francesco Contarini", + "status": "deceased", + "title": "Messer" + }, + "spouse": "Giovanni Battista Morosini" + }, + "context": "sua moglie la Nobil Donna Madonna Caterina Contarini fu quondam Messer Francesco" + }, + { + "person_index": 2, + "pnv_name": { + "given_name": "Pietro", + "surname": "Fabbro" + }, + "roles": [ + "witness" + ], + "biographical": { + "nobility_titles": [ + "Messer" + ], + "residence": "contrada di San Polo" + }, + "family_relationships": { + "father": { + "name": "Paolo Fabbro", + "status": "deceased" + } + }, + "context": "Messer Pietro fu Paolo Fabbro, habitante nella contrada di San Polo" + }, + { + "person_index": 3, + "pnv_name": { + "given_name": "Marco Antonio", + "surname": "Ferrari" + }, + "roles": [ + "witness" + ], + "biographical": { + "nobility_titles": [ + "Messer" + ], + "occupation": "bottegaio", + "work_location": "Rialto" + }, + "family_relationships": { + "father": { + 
"name": "Giovanni Ferrari", + "status": "deceased" + } + }, + "context": "Messer Marco Antonio Ferrari fu Giovanni, bottegaio in Rialto" + }, + { + "person_index": 4, + "pnv_name": { + "given_name": "Antonio", + "surname": "Zen" + }, + "roles": [ + "notary" + ], + "biographical": { + "nobility_titles": [ + "Messer" + ], + "occupation": "Notaro publico di Venetia" + }, + "family_relationships": { + "father": { + "name": "Giacomo Zen", + "status": "deceased", + "title": "Messer" + } + }, + "context": "io Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico di Venetia" + } + ], + "temporal_references": [ + { + "expression": "15 Marzo 1654", + "normalized": "1654-03-15", + "calendar": "Gregorian" + } + ], + "locations_mentioned": [ + { + "name": "Venetia", + "type": "city" + }, + { + "name": "contrada di San Marco", + "type": "district" + }, + { + "name": "contrada di San Polo", + "type": "district" + }, + { + "name": "Rialto", + "type": "area" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/ottoman_sijill_extraction_20251212_152313.json b/data/entity_annotation/test_outputs/ottoman_sijill_extraction_20251212_152313.json new file mode 100644 index 0000000000..058105ca6d --- /dev/null +++ b/data/entity_annotation/test_outputs/ottoman_sijill_extraction_20251212_152313.json @@ -0,0 +1,125 @@ +{ + "pico_observation": { + "observation_id": "sijill_001", + "source_type": "sijill", + "source_reference": "Ottoman Court Record, Dated 1258 AH" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "literalName": "محمد آغا بن عبد الله مرحوم", + "literalName_romanized": "Muhammad Ağa bin Abdullah al-merhum" + }, + "roles": [ + "seller" + ], + "biographical": { + "honorifics": [ + "آغا/Ağa" + ], + "is_deceased": true, + "patronymic": "بن عبد الله/bin Abdullah" + }, + "family_relationships": {}, + "context": "Seller of the property, identified as a resident of Demirci-köy. 
The record notes he is deceased (merhum), implying the sale is conducted by his estate or heirs." + }, + { + "person_index": 1, + "pnv_name": { + "literalName": "محمد بن احمد افندی", + "literalName_romanized": "Muhammad bin Ahmad Efendi" + }, + "roles": [ + "buyer" + ], + "biographical": { + "honorifics": [ + "افندی/Efendi" + ], + "patronymic": "بن احمد/bin Ahmad" + }, + "family_relationships": { + "spouse": { + "person_index": 2, + "name": "فاطمة خاتون/Fatima Hatun" + } + }, + "context": "Buyer of the property, a resident of Demirci-köy, present with his wife for the transaction." + }, + { + "person_index": 2, + "pnv_name": { + "literalName": "فاطمه خاتون بنت علی‌اوغلو", + "literalName_romanized": "Fatima Hatun bint Ali-oğlu" + }, + "roles": [ + "buyer" + ], + "biographical": { + "honorifics": [ + "خاتون/Hatun" + ], + "patronymic": "بنت علی‌اوغلو/bint Ali-oğlu" + }, + "family_relationships": { + "spouse": { + "person_index": 1, + "name": "محمد بن احمد افندی/Muhammad bin Ahmad Efendi" + } + }, + "context": "Wife (zevcesi) of the buyer, Muhammad bin Ahmad Efendi, and co-purchaser. She is identified as the daughter of Ali-oğlu." + }, + { + "person_index": 3, + "pnv_name": { + "literalName": "حسن افندی بن عمر", + "literalName_romanized": "Hasan Efendi bin Umar" + }, + "roles": [ + "witness" + ], + "biographical": { + "honorifics": [ + "افندی/Efendi" + ], + "patronymic": "بن عمر/bin Umar" + }, + "family_relationships": {}, + "context": "One of the two witnesses (şühûd-ı hâl) to the sale." + }, + { + "person_index": 4, + "pnv_name": { + "literalName": "ابراهيم چلبی بن مصطفی", + "literalName_romanized": "Ibrahim Çelebi bin Mustafa" + }, + "roles": [ + "witness" + ], + "biographical": { + "honorifics": [ + "چلبی/Çelebi" + ], + "patronymic": "بن مصطفی/bin Mustafa" + }, + "family_relationships": {}, + "context": "One of the two witnesses (şühûd-ı hâl) to the sale." 
+ } + ], + "temporal_references": [ + { + "expression": "فی اوائل شهر رجب سنة ١٢٥٨", + "expression_romanized": "Fi eva'il-i şehr-i Receb sene 1258", + "normalized": "Beginning of Rajab, 1258 AH", + "calendar": "Hijri" + } + ], + "locations_mentioned": [ + { + "name": "دميرجی‌كوي", + "name_romanized": "Demirci-köy", + "type": "قصبه/kasaba" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/ottoman_sijill_extraction_20251212_153300.json b/data/entity_annotation/test_outputs/ottoman_sijill_extraction_20251212_153300.json new file mode 100644 index 0000000000..694bc8ab4f --- /dev/null +++ b/data/entity_annotation/test_outputs/ottoman_sijill_extraction_20251212_153300.json @@ -0,0 +1,149 @@ +{ + "pico_observation": { + "observation_id": "sijill_1258_rajab_001", + "source_type": "sijill", + "source_reference": "Ottoman Court Record, Dated Beginning of Rajab 1258 AH" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "literalName": "محمد آغا", + "literalName_romanized": "Muhammad Ağa" + }, + "roles": [ + "seller" + ], + "biographical": { + "patronymic": "بن عبد الله", + "patronymic_romanized": "bin Abdullah", + "honorific": "آغا", + "honorific_romanized": "Ağa", + "deceased_father": true + }, + "family_relationships": { + "father": { + "name": "عبد الله", + "name_romanized": "Abdullah", + "deceased": true + } + }, + "context": "Seller from the district of Demirciköy, son of the deceased Abdullah." 
+ }, + { + "person_index": 1, + "pnv_name": { + "literalName": "محمد بن احمد افندی", + "literalName_romanized": "Muhammad bin Ahmad Efendi" + }, + "roles": [ + "buyer" + ], + "biographical": { + "patronymic": "بن احمد", + "patronymic_romanized": "bin Ahmad", + "honorific": "افندی", + "honorific_romanized": "Efendi" + }, + "family_relationships": { + "father": { + "name": "احمد", + "name_romanized": "Ahmad" + }, + "spouse": { + "person_index": 2, + "relation": "wife" + } + }, + "context": "Buyer, son of Ahmad, husband of Fatima Hatun." + }, + { + "person_index": 2, + "pnv_name": { + "literalName": "فاطمه خاتوم", + "literalName_romanized": "Fatima Hatun" + }, + "roles": [ + "buyer" + ], + "biographical": { + "patronymic": "بنت علی‌اوغلو", + "patronymic_romanized": "bint Ali-oğlu", + "honorific": "خاتوم", + "honorific_romanized": "Hatun" + }, + "family_relationships": { + "father": { + "name": "علی‌اوغلو", + "name_romanized": "Ali-oğlu" + }, + "spouse": { + "person_index": 1, + "relation": "husband" + } + }, + "context": "Buyer, daughter of Ali-oğlu, wife of Muhammad bin Ahmad Efendi." + }, + { + "person_index": 3, + "pnv_name": { + "literalName": "حسن افندی بن عمر", + "literalName_romanized": "Hasan Efendi bin Umar" + }, + "roles": [ + "witness" + ], + "biographical": { + "patronymic": "بن عمر", + "patronymic_romanized": "bin Umar", + "honorific": "افندی", + "honorific_romanized": "Efendi" + }, + "family_relationships": { + "father": { + "name": "عمر", + "name_romanized": "Umar" + } + }, + "context": "Witness to the transaction, son of Umar." 
+ }, + { + "person_index": 4, + "pnv_name": { + "literalName": "ابراهيم چلبی بن مصطفی", + "literalName_romanized": "Ibrahim Çelebi bin Mustafa" + }, + "roles": [ + "witness" + ], + "biographical": { + "patronymic": "بن مصطفی", + "patronymic_romanized": "bin Mustafa", + "honorific": "چلبی", + "honorific_romanized": "Çelebi" + }, + "family_relationships": { + "father": { + "name": "مصطفی", + "name_romanized": "Mustafa" + } + }, + "context": "Witness to the transaction, son of Mustafa." + } + ], + "temporal_references": [ + { + "expression": "فی اوائل شهر رجب سنة ١٢٥٨", + "expression_romanized": "Fi awāli shahr Rajab sanat 1258", + "normalized": "Beginning of Rajab, 1258 AH", + "calendar": "Hijri" + } + ], + "locations_mentioned": [ + { + "name": "قصبه دميرجی‌کوی", + "name_romanized": "Kasaba-ı Demirciköy", + "type": "District/Town" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/raw_response_20251212_132017.txt b/data/entity_annotation/test_outputs/raw_response_20251212_132017.txt new file mode 100644 index 0000000000..98e819c2c9 --- /dev/null +++ b/data/entity_annotation/test_outputs/raw_response_20251212_132017.txt @@ -0,0 +1,166 @@ + +```json +{ + "pico_observation": { + "observation_id": "waqf_aleppo_1225h", + "observed_at": "2023-10-27T10:00:00Z", + "source_type": "waqf_document", + "source_reference": "Aleppo Waqf, 1225 H" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "literalName": "الحاج أحمد بن محمد العمري", + "literalName_romanized": "al-Hajj Ahmad ibn Muhammad al-Umari", + "givenName": "أحمد", + "givenName_romanized": "Ahmad", + "patronym": "محمد", + "patronym_romanized": "Muhammad", + "baseSurname": "العمري", + "baseSurname_romanized": "al-Umari", + "honorificPrefix": "الحاج", + "honorificPrefix_romanized": "al-Hajj" + }, + "roles": [ + { + "role_title": "تاجر", + "role_title_romanized": "tajir", + "role_in_source": "founder" + } + ], + "biographical": { + "deceased": true, + "address": "مدينة حلب 
الشهباء" + }, + "family_relationships": { + "parent": [ + { + "person_index": 1, + "target_name": "محمد بن عبد الله العمري" + } + ], + "children": [] + }, + "context": "The founder (waqif) of the endowment, a deceased merchant from Aleppo." + }, + { + "person_index": 1, + "pnv_name": { + "literalName": "المرحوم محمد بن عبد الله العمري", + "literalName_romanized": "al-marhum Muhammad ibn Abd Allah al-Umari", + "givenName": "محمد", + "givenName_romanized": "Muhammad", + "patronym": "عبد الله", + "patronym_romanized": "Abd Allah", + "baseSurname": "العمري", + "baseSurname_romanized": "al-Umari" + }, + "roles": [ + { + "role_title": null, + "role_title_romanized": null, + "role_in_source": null + } + ], + "biographical": { + "deceased": true, + "address": null + }, + "family_relationships": { + "parent": [], + "children": [ + { + "person_index": 0, + "target_name": "أحمد بن محمد العمري" + } + ] + }, + "context": "The deceased father of the founder, Ahmad al-Umari." + }, + { + "person_index": 2, + "pnv_name": { + "literalName": "الحاج إبراهيم بن يوسف التركماني", + "literalName_romanized": "al-Hajj Ibrahim ibn Yusuf al-Turkmani", + "givenName": "إبراهيم", + "givenName_romanized": "Ibrahim", + "patronym": "يوسف", + "patronym_romanized": "Yusuf", + "baseSurname": "التركماني", + "baseSurname_romanized": "al-Turkmani", + "honorificPrefix": "الحاج", + "honorificPrefix_romanized": "al-Hajj" + }, + "roles": [ + { + "role_title": "شاهد", + "role_title_romanized": "shahid", + "role_in_source": "witness" + } + ], + "biographical": { + "deceased": null, + "address": null + }, + "family_relationships": { + "parent": [], + "children": [] + }, + "context": "One of the witnesses to the endowment deed." 
+ }, + { + "person_index": 3, + "pnv_name": { + "literalName": "السيد علي بن حسين الحلبي", + "literalName_romanized": "al-Sayyid Ali ibn Husayn al-Halabi", + "givenName": "علي", + "givenName_romanized": "Ali", + "patronym": "حسين", + "patronym_romanized": "Husayn", + "baseSurname": "الحلبي", + "baseSurname_romanized": "al-Halabi", + "honorificPrefix": "السيد", + "honorificPrefix_romanized": "al-Sayyid" + }, + "roles": [ + { + "role_title": "شاهد", + "role_title_romanized": "shahid", + "role_in_source": "witness" + } + ], + "biographical": { + "deceased": null, + "address": null + }, + "family_relationships": { + "parent": [], + "children": [] + }, + "context": "The second witness to the endowment deed." + } + ], + "temporal_references": [ + { + "expression": "شهر رجب سنة ألف ومائتين وخمس وعشرين هجرية", + "expression_romanized": "Shahr Rajab sanat alf wa mi'ayn wa khamsa wa 'ishrin hijriyyah", + "normalized": "1811-01", + "calendar": "Hijri", + "type": "DATE" + } + ], + "locations_mentioned": [ + { + "name": "حلب الشهباء", + "name_romanized": "Halab al-Shahba'", + "type": "city" + }, + { + "name": "محلة الجديدة", + "name_romanized": "Mahallat al-Jadida", + "type": "neighborhood" + } + ] +} +``` \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/russian_metrical_extraction_20251212_150120.json b/data/entity_annotation/test_outputs/russian_metrical_extraction_20251212_150120.json new file mode 100644 index 0000000000..263bf330b9 --- /dev/null +++ b/data/entity_annotation/test_outputs/russian_metrical_extraction_20251212_150120.json @@ -0,0 +1,167 @@ +{ + "pico_observation": { + "observation_id": "obs_001", + "source_type": "metrical_book", + "source_reference": "Метрическая книга Троицкой церкви села Покровского за 1892 год" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "full_name_cyrillic": "Иван", + "full_name_romanized": "Ivan", + "given_name_cyrillic": "Иван", + "given_name_romanized": "Ivan" + }, + "roles": [ + "newborn" 
+ ], + "biographical": { + "sex": "male", + "estate": "peasant" + }, + "family_relationships": { + "father": "Пётр Иванович Сидоров", + "mother": "Анна Фёдоровна" + }, + "context": "Born March 15, baptized March 17, 1892, son of Pyotr Ivanovich Sidorov and Anna Fyodorovna." + }, + { + "person_index": 1, + "pnv_name": { + "full_name_cyrillic": "Пётр Иванович Сидоров", + "full_name_romanized": "Pyotr Ivanovich Sidorov", + "given_name_cyrillic": "Пётр", + "given_name_romanized": "Pyotr", + "patronymic_cyrillic": "Иванович", + "patronymic_romanized": "Ivanovich", + "surname_cyrillic": "Сидоров", + "surname_romanized": "Sidorov" + }, + "roles": [ + "father" + ], + "biographical": { + "sex": "male", + "estate": "крестьянин (peasant)", + "religion": "православный (Orthodox)" + }, + "family_relationships": { + "son": "Иван", + "wife": "Анна Фёдоровна", + "sibling": "Мария Ивановна Сидорова" + }, + "context": "Peasant from the village of Ivanovka, father of the newborn Ivan, husband of Anna Fyodorovna." + }, + { + "person_index": 2, + "pnv_name": { + "full_name_cyrillic": "Анна Фёдоровна", + "full_name_romanized": "Anna Fyodorovna", + "given_name_cyrillic": "Анна", + "given_name_romanized": "Anna", + "patronymic_cyrillic": "Фёдоровна", + "patronymic_romanized": "Fyodorovna" + }, + "roles": [ + "mother" + ], + "biographical": { + "sex": "female", + "religion": "православный (Orthodox)" + }, + "family_relationships": { + "son": "Иван", + "husband": "Пётр Иванович Сидоров" + }, + "context": "Lawful wife of Pyotr Ivanovich Sidorov, mother of the newborn Ivan." 
+ }, + { + "person_index": 3, + "pnv_name": { + "full_name_cyrillic": "Николай Петрович Кузнецов", + "full_name_romanized": "Nikolay Petrovich Kuznetsov", + "given_name_cyrillic": "Николай", + "given_name_romanized": "Nikolay", + "patronymic_cyrillic": "Петрович", + "patronymic_romanized": "Petrovich", + "surname_cyrillic": "Кузнецов", + "surname_romanized": "Kuznetsov" + }, + "roles": [ + "godparent" + ], + "biographical": { + "sex": "male", + "estate": "крестьянин (peasant)" + }, + "family_relationships": { + "godson": "Иван" + }, + "context": "Godparent of Ivan, a peasant from the same village (Ivanovka)." + }, + { + "person_index": 4, + "pnv_name": { + "full_name_cyrillic": "Мария Ивановна Сидорова", + "full_name_romanized": "Maria Ivanovna Sidorova", + "given_name_cyrillic": "Мария", + "given_name_romanized": "Maria", + "patronymic_cyrillic": "Ивановна", + "patronymic_romanized": "Ivanovna", + "surname_cyrillic": "Сидорова", + "surname_romanized": "Sidorova" + }, + "roles": [ + "godparent" + ], + "biographical": { + "sex": "female", + "estate": "крестьянская дочь (peasant's daughter)", + "marital_status": "девица (unmarried)" + }, + "family_relationships": { + "godson": "Иван", + "brother": "Пётр Иванович Сидоров" + }, + "context": "Godparent of Ivan, an unmarried peasant's daughter from the village of Ivanovka." 
+ } + ], + "temporal_references": [ + { + "expression": "за 1892 год", + "expression_romanized": "za 1892 god", + "normalized": "1892", + "calendar": "Gregorian" + }, + { + "expression": "Марта 15 дня", + "expression_romanized": "Marta 15 dnya", + "normalized": "1892-03-15", + "calendar": "Julian" + }, + { + "expression": "17 дня", + "expression_romanized": "17 dnya", + "normalized": "1892-03-17", + "calendar": "Julian" + } + ], + "locations_mentioned": [ + { + "name": "Троицкой церкви", + "name_romanized": "Troitskoy tserkvi", + "type": "church" + }, + { + "name": "села Покровского", + "name_romanized": "sela Pokrovskogo", + "type": "village" + }, + { + "name": "деревни Ивановки", + "name_romanized": "derevni Ivanovki", + "type": "village" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/russian_metrical_extraction_20251212_153018.json b/data/entity_annotation/test_outputs/russian_metrical_extraction_20251212_153018.json new file mode 100644 index 0000000000..210803d8cc --- /dev/null +++ b/data/entity_annotation/test_outputs/russian_metrical_extraction_20251212_153018.json @@ -0,0 +1,192 @@ +{ + "pico_observation": { + "observation_id": "obs_1892_03_17_ivan_sidorov", + "source_type": "metrical_book", + "source_reference": "Метрическая книга Троицкой церкви села Покровского за 1892 год" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "full_name_cyrillic": "Иван", + "full_name_romanized": "Ivan", + "given_name_cyrillic": "Иван", + "given_name_romanized": "Ivan" + }, + "roles": [ + "newborn" + ], + "biographical": { + "sex": "male", + "estate": null, + "religion": "Orthodox" + }, + "family_relationships": { + "son_of": [ + "Пётр Иванович Сидоров", + "Анна Фёдоровна" + ] + }, + "context": "Родился 15 марта, крещён 17 марта 1892 года. Сын крестьянина Петра Ивановича Сидорова и Анны Фёдоровны." 
+ }, + { + "person_index": 1, + "pnv_name": { + "full_name_cyrillic": "Пётр Иванович Сидоров", + "full_name_romanized": "Pyotr Ivanovich Sidorov", + "given_name_cyrillic": "Пётр", + "given_name_romanized": "Pyotr", + "patronymic_cyrillic": "Иванович", + "patronymic_romanized": "Ivanovich", + "surname_cyrillic": "Сидоров", + "surname_romanized": "Sidorov" + }, + "roles": [ + "parent", + "father" + ], + "biographical": { + "sex": "male", + "estate": "крестьянин", + "religion": "Orthodox" + }, + "family_relationships": { + "father_of": [ + "Иван" + ], + "husband_of": [ + "Анна Фёдоровна" + ] + }, + "context": "Отец новорождённого Ивана. Крестьянин из деревни Ивановки." + }, + { + "person_index": 2, + "pnv_name": { + "full_name_cyrillic": "Анна Фёдоровна", + "full_name_romanized": "Anna Fyodorovna", + "given_name_cyrillic": "Анна", + "given_name_romanized": "Anna", + "patronymic_cyrillic": "Фёдоровна", + "patronymic_romanized": "Fyodorovna" + }, + "roles": [ + "parent", + "mother" + ], + "biographical": { + "sex": "female", + "estate": "крестьянка", + "religion": "Orthodox" + }, + "family_relationships": { + "mother_of": [ + "Иван" + ], + "wife_of": [ + "Пётр Иванович Сидоров" + ] + }, + "context": "Мать новорождённого Ивана. Законная жена крестьянина Петра Ивановича Сидорова." + }, + { + "person_index": 3, + "pnv_name": { + "full_name_cyrillic": "Николай Петрович Кузнецов", + "full_name_romanized": "Nikolai Petrovich Kuznetsov", + "given_name_cyrillic": "Николай", + "given_name_romanized": "Nikolai", + "patronymic_cyrillic": "Петрович", + "patronymic_romanized": "Petrovich", + "surname_cyrillic": "Кузнецов", + "surname_romanized": "Kuznetsov" + }, + "roles": [ + "godparent", + "godfather" + ], + "biographical": { + "sex": "male", + "estate": "крестьянин", + "religion": "Orthodox" + }, + "family_relationships": { + "godparent_of": [ + "Иван" + ] + }, + "context": "Восприемник (крёстный отец) Ивана. Крестьянин из той же деревни Ивановки." 
+ }, + { + "person_index": 4, + "pnv_name": { + "full_name_cyrillic": "Мария Ивановна Сидорова", + "full_name_romanized": "Maria Ivanovna Sidorova", + "given_name_cyrillic": "Мария", + "given_name_romanized": "Maria", + "patronymic_cyrillic": "Ивановна", + "patronymic_romanized": "Ivanovna", + "surname_cyrillic": "Сидорова", + "surname_romanized": "Sidorova" + }, + "roles": [ + "godparent", + "godmother" + ], + "biographical": { + "sex": "female", + "estate": "крестьянская дочь девица", + "religion": "Orthodox" + }, + "family_relationships": { + "godparent_of": [ + "Иван" + ] + }, + "context": "Восприемница (крёстная мать) Ивана. Крестьянская дочь девица из той же деревни Ивановки." + } + ], + "temporal_references": [ + { + "expression": "Марта 15 дня", + "expression_romanized": "Marta 15 dnya", + "normalized": "1892-03-15", + "calendar": "Julian" + }, + { + "expression": "Марта 15 дня", + "expression_romanized": "Marta 15 dnya", + "normalized": "1892-03-27", + "calendar": "Gregorian" + }, + { + "expression": "17 дня", + "expression_romanized": "17 dnya", + "normalized": "1892-03-17", + "calendar": "Julian" + }, + { + "expression": "17 дня", + "expression_romanized": "17 dnya", + "normalized": "1892-03-29", + "calendar": "Gregorian" + } + ], + "locations_mentioned": [ + { + "name": "село Покровское", + "name_romanized": "selo Pokrovskoye", + "type": "village" + }, + { + "name": "Троицкая церковь", + "name_romanized": "Troitskaya tserkov", + "type": "church" + }, + { + "name": "деревня Ивановка", + "name_romanized": "derevnya Ivanovka", + "type": "village" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/spanish_colonial_baptism_extraction_20251212_133618.json b/data/entity_annotation/test_outputs/spanish_colonial_baptism_extraction_20251212_133618.json new file mode 100644 index 0000000000..e8be7db1d8 --- /dev/null +++ b/data/entity_annotation/test_outputs/spanish_colonial_baptism_extraction_20251212_133618.json @@ -0,0 +1,221 
@@ +{ + "pico_observation": { + "observation_id": "baptism_mexico_1742-02-23_001", + "source_type": "baptismal_register", + "source_reference": "Ciudad de México, 23 de febrero de 1742. Bautismo de Juan José, hijo de Don Pedro García de la Cruz y Doña María Josefa de los Reyes." + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "given_name": "Antonio", + "surname": "de Mendoza" + }, + "roles": [ + "teniente de cura", + "priest" + ], + "biographical": { + "honorifics": [ + "Br.", + "Don" + ] + }, + "family_relationships": {}, + "context": "El sacerdote que oficia el bautismo." + }, + { + "person_index": 1, + "pnv_name": { + "given_name": "Juan José", + "surname": "" + }, + "roles": [ + "baptized_infant" + ], + "biographical": { + "casta": "español", + "legitimacy": "legítimo" + }, + "family_relationships": { + "parents": [ + { + "person_index": 2, + "relationship": "father" + }, + { + "person_index": 3, + "relationship": "mother" + } + ], + "godparents": [ + { + "person_index": 4, + "relationship": "godfather" + }, + { + "person_index": 5, + "relationship": "godmother" + } + ] + }, + "context": "El infante bautizado." + }, + { + "person_index": 2, + "pnv_name": { + "given_name": "Pedro", + "surname": "García de la Cruz" + }, + "roles": [ + "father" + ], + "biographical": { + "honorifics": [ + "Don" + ], + "casta": "español", + "origin": { + "place": "villa de Puebla de los Ángeles", + "type": "natural" + } + }, + "family_relationships": { + "children": [ + { + "person_index": 1, + "relationship": "son" + } + ], + "spouse": [ + { + "person_index": 3, + "relationship": "wife" + } + ] + }, + "context": "Padre del bautizado." 
+ }, + { + "person_index": 3, + "pnv_name": { + "given_name": "María Josefa", + "surname": "de los Reyes" + }, + "roles": [ + "mother" + ], + "biographical": { + "honorifics": [ + "Doña" + ], + "casta": "español", + "origin": { + "place": "esta ciudad", + "type": "natural" + } + }, + "family_relationships": { + "children": [ + { + "person_index": 1, + "relationship": "son" + } + ], + "spouse": [ + { + "person_index": 2, + "relationship": "husband" + } + ] + }, + "context": "Madre del bautizado." + }, + { + "person_index": 4, + "pnv_name": { + "given_name": "Francisco Xavier", + "surname": "de Castañeda" + }, + "roles": [ + "godfather" + ], + "biographical": { + "honorifics": [ + "Don" + ], + "casta": "español", + "origin": { + "place": "esta ciudad", + "type": "vecino" + } + }, + "family_relationships": { + "spouse": [ + { + "person_index": 5, + "relationship": "wife" + } + ], + "godchild": [ + { + "person_index": 1, + "relationship": "godson" + } + ] + }, + "context": "Padrino del bautizado." + }, + { + "person_index": 5, + "pnv_name": { + "given_name": "Ana María", + "surname": "de la Encarnación" + }, + "roles": [ + "godmother" + ], + "biographical": { + "honorifics": [ + "Doña" + ] + }, + "family_relationships": { + "spouse": [ + { + "person_index": 4, + "relationship": "husband" + } + ], + "godchild": [ + { + "person_index": 1, + "relationship": "godson" + } + ] + }, + "context": "Madrina del bautizado, esposa legítima del padrino." 
+ } + ], + "temporal_references": [ + { + "expression": "a veinte y tres días del mes de febrero de mil setecientos cuarenta y dos años", + "normalized": "1742-02-23", + "calendar": "Gregorian" + } + ], + "locations_mentioned": [ + { + "name": "ciudad de México", + "type": "city" + }, + { + "name": "esta santa iglesia catedral", + "type": "church" + }, + { + "name": "villa de Puebla de los Ángeles", + "type": "villa" + } + ] +} \ No newline at end of file diff --git a/data/entity_annotation/test_outputs/spanish_colonial_baptism_extraction_20251212_152722.json b/data/entity_annotation/test_outputs/spanish_colonial_baptism_extraction_20251212_152722.json new file mode 100644 index 0000000000..672de3ae85 --- /dev/null +++ b/data/entity_annotation/test_outputs/spanish_colonial_baptism_extraction_20251212_152722.json @@ -0,0 +1,259 @@ +{ + "pico_observation": { + "observation_id": "baptism_mexico_1742-02-23_001", + "source_type": "baptismal_register", + "source_reference": "Ciudad de México, 23 de febrero de 1742. Bautismo de Juan José, hijo de Don Pedro García de la Cruz y Doña María Josefa de los Reyes." + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "given_name": "Antonio", + "surname": "de Mendoza" + }, + "roles": [ + "teniente de cura", + "priest" + ], + "honorifics": [ + "Br.", + "Don" + ], + "biographical": {}, + "family_relationships": {}, + "context": "El sacerdote que oficia el bautismo." 
+ }, + { + "person_index": 1, + "pnv_name": { + "given_name": "Juan José", + "surname": "" + }, + "roles": [ + "baptized_infant" + ], + "biographical": { + "casta": "español", + "legitimacy": "legítimo" + }, + "family_relationships": { + "parents": [ + { + "person_index": 2, + "relationship": "father" + }, + { + "person_index": 3, + "relationship": "mother" + } + ], + "godparents": [ + { + "person_index": 4, + "relationship": "godfather" + }, + { + "person_index": 5, + "relationship": "godmother" + } + ] + }, + "context": "El niño bautizado, hijo legítimo de Pedro García de la Cruz y María Josefa de los Reyes." + }, + { + "person_index": 2, + "pnv_name": { + "given_name": "Pedro", + "surname": "García de la Cruz" + }, + "roles": [ + "father" + ], + "honorifics": [ + "Don" + ], + "biographical": { + "casta": "español", + "origin": { + "type": "natural", + "place": "Puebla de los Ángeles" + } + }, + "family_relationships": { + "children": [ + { + "person_index": 1, + "relationship": "son" + } + ], + "spouse": [ + { + "person_index": 3, + "relationship": "wife" + } + ], + "compadres": [ + { + "person_index": 4, + "relationship": "godfather_of_his_child" + }, + { + "person_index": 5, + "relationship": "godmother_of_his_child" + } + ] + }, + "context": "Padre del bautizado, natural de Puebla de los Ángeles." 
+ }, + { + "person_index": 3, + "pnv_name": { + "given_name": "María Josefa", + "surname": "de los Reyes" + }, + "roles": [ + "mother" + ], + "honorifics": [ + "Doña" + ], + "biographical": { + "casta": "español", + "origin": { + "type": "natural", + "place": "Ciudad de México" + } + }, + "family_relationships": { + "children": [ + { + "person_index": 1, + "relationship": "son" + } + ], + "spouse": [ + { + "person_index": 2, + "relationship": "husband" + } + ], + "comadres": [ + { + "person_index": 4, + "relationship": "godfather_of_her_child" + }, + { + "person_index": 5, + "relationship": "godmother_of_her_child" + } + ] + }, + "context": "Madre del bautizado, natural de la Ciudad de México." + }, + { + "person_index": 4, + "pnv_name": { + "given_name": "Francisco Xavier", + "surname": "de Castañeda" + }, + "roles": [ + "godfather" + ], + "honorifics": [ + "Don" + ], + "biographical": { + "casta": "español", + "origin": { + "type": "resident", + "place": "Ciudad de México" + } + }, + "family_relationships": { + "godchild": [ + { + "person_index": 1, + "relationship": "godson" + } + ], + "spouse": [ + { + "person_index": 5, + "relationship": "wife" + } + ], + "compadres": [ + { + "person_index": 2, + "relationship": "father_of_godchild" + }, + { + "person_index": 3, + "relationship": "mother_of_godchild" + } + ] + }, + "context": "Padrino del bautizado, vecino de la Ciudad de México." 
+ }, + { + "person_index": 5, + "pnv_name": { + "given_name": "Ana María", + "surname": "de la Encarnación" + }, + "roles": [ + "godmother" + ], + "honorifics": [ + "Doña" + ], + "biographical": {}, + "family_relationships": { + "godchild": [ + { + "person_index": 1, + "relationship": "godson" + } + ], + "spouse": [ + { + "person_index": 4, + "relationship": "husband" + } + ], + "comadres": [ + { + "person_index": 2, + "relationship": "father_of_godchild" + }, + { + "person_index": 3, + "relationship": "mother_of_godchild" + } + ] + }, + "context": "Madrina del bautizado y esposa legítima del padrino." + } + ], + "temporal_references": [ + { + "expression": "a veinte y tres días del mes de febrero de mil setecientos cuarenta y dos años", + "normalized": "1742-02-23", + "calendar": "Gregorian" + } + ], + "locations_mentioned": [ + { + "name": "Ciudad de México", + "type": "city" + }, + { + "name": "Puebla de los Ángeles", + "type": "villa" + }, + { + "name": "esta santa iglesia catedral", + "type": "church" + } + ] +} \ No newline at end of file diff --git a/frontend/public/schemas/20251121/linkml/custodian_source.yaml b/frontend/public/schemas/20251121/linkml/custodian_source.yaml index 7d177d520c..ed77160a10 100644 --- a/frontend/public/schemas/20251121/linkml/custodian_source.yaml +++ b/frontend/public/schemas/20251121/linkml/custodian_source.yaml @@ -88,6 +88,8 @@ enums: description: Entry requires further enrichment processing new_entry: description: Newly added entry not yet enriched + google_maps_searched: + description: Google Maps search attempted but not yet fully enriched InstitutionTypeCodeEnum: description: Single-letter GLAMORCUBESFIXPHDNT type codes @@ -182,6 +184,10 @@ enums: description: GeoNames geographic entity identifier LinkedIn: description: LinkedIn profile or company page + GHCID_PREVIOUS: + description: Previous GHCID before relocation or reorganization + OCLC: + description: OCLC (Online Computer Library Center) identifier 
LocationResolutionMethodEnum: description: Method used to resolve settlement location @@ -228,6 +234,8 @@ enums: description: Verified through web search CITY_NAME_LOOKUP: description: Looked up city name directly + MANUAL_RESEARCH: + description: Manually researched and assigned location GEONAMES_CITY_LOOKUP: description: Looked up city in GeoNames database NAME_EXTRACTION: @@ -252,8 +260,6 @@ enums: description: Extracted location from institution name GEONAMES_FUZZY: description: Fuzzy matched in GeoNames - MANUAL_RESEARCH: - description: Manually researched location WIKIDATA_ENRICHMENT: description: Enriched from Wikidata COORDINATE_LOOKUP: @@ -430,6 +436,9 @@ classes: organisatie: range: string description: Organization name from source + organisation: + range: string + description: Organization name from source (British spelling variant) isil_code_na: range: string description: ISIL code from Nationaal Archief @@ -650,10 +659,19 @@ classes: range: string description: Status of Wikidata enrichment for this entry comment: - range: ReferenceLink - multivalued: true + any_of: + - range: string + - range: ReferenceLink + multivalued: true inlined_as_list: true - description: Comments about this entry (array of objects with label field) + description: Comments about this entry (can be a string or array of objects with label field) + comments: + any_of: + - range: string + - range: ReferenceLink + multivalued: true + inlined_as_list: true + description: Comments about this entry (string or array of objects with label field) succeeded_by: range: ReferenceLink multivalued: true @@ -666,6 +684,15 @@ classes: label: range: string description: Name/label of the duplicate institution + entry_index: + range: integer + description: Index of the duplicate entry in source data + entry_file: + range: string + description: Filename of the duplicate entry + reason: + range: string + description: Reason why this is considered a duplicate TimeEntry: description: Structured time 
entry from source data @@ -676,10 +703,13 @@ classes: - range: integer description: Time label (date string or year) type: - range: TimeEntryType - multivalued: true + any_of: + - range: string + multivalued: true + - range: TimeEntryType + multivalued: true inlined_as_list: true - description: Type of time point (begin, end, etc.) + description: Type of time point (begin, end, etc.) - can be strings or TimeEntryType objects TimeEntryType: description: Type classification for time entry @@ -847,6 +877,11 @@ classes: data_source: range: string description: Data source type (CSV_REGISTRY, API_SCRAPING, etc.) + data_sources: + range: string + multivalued: true + inlined_as_list: true + description: List of data sources (e.g., NDE registry, Google Maps, website) data_tier: range: DataTierEnum description: Quality tier of the data @@ -856,6 +891,12 @@ classes: extraction_method: range: string description: Method used to extract the data + enrichment_date: + range: string + description: When enrichment was performed (ISO date string) + enrichment_method: + range: string + description: Method used to enrich the data (e.g., website_research) confidence_score: range: float description: Confidence score (0-1) @@ -889,6 +930,15 @@ classes: wikidata_property: range: string description: Wikidata property ID (e.g., P856) + archive_location: + range: string + description: Location of archived copy (e.g., web/1186/hartebrug.nl) + claim_extracted_from: + range: string + description: Source path from which claim was extracted (e.g., original_entry.reference) + verified_via_web_archive: + range: boolean + description: Whether claim was verified via web archive ProvenanceSources: description: Sources organized by type @@ -938,6 +988,52 @@ classes: multivalued: true inlined_as_list: true description: Nationaal Archief ISIL registry source records + whois_research: + range: SourceRecord + multivalued: true + inlined_as_list: true + description: WHOIS domain research source records + 
manual_research: + range: SourceRecord + multivalued: true + inlined_as_list: true + description: Manual research source records + website: + range: SourceRecord + multivalued: true + inlined_as_list: true + description: Website source records (institution website data) + web_scrape: + range: SourceRecord + multivalued: true + inlined_as_list: true + description: Web scrape source records (scraped website data) + # Data tier summary fields (for provenance summaries) + TIER_1_AUTHORITATIVE: + range: string + multivalued: true + inlined_as_list: true + description: List of TIER_1 authoritative sources + TIER_2_VERIFIED: + range: string + multivalued: true + inlined_as_list: true + description: List of TIER_2 verified sources + TIER_3_CROWD_SOURCED: + range: string + multivalued: true + inlined_as_list: true + description: List of TIER_3 crowd-sourced sources + TIER_4_INFERRED: + range: string + multivalued: true + inlined_as_list: true + description: List of TIER_4 inferred sources + museum_register: + range: SourceRecord + multivalued: true + inlined_as_list: true + description: Museum register source records SourceRecord: description: Individual source record with claims @@ -999,6 +1095,20 @@ classes: source_file: range: string description: Source file name + research_date: + range: string + description: Date of research (YYYY-MM-DD format) + url: + range: uri + description: URL of the source (website URL, etc.) 
+ data_extracted: + range: string + multivalued: true + inlined_as_list: true + description: List of data types/fields extracted from this source + merge_note: + range: string + description: Note about merge operations involving this source record DataTierSummary: description: Summary of data tiers present in entry @@ -1029,7 +1139,7 @@ classes: attributes: identifier_scheme: range: IdentifierSchemeEnum - required: true + required: false description: Type of identifier identifier_value: any_of: @@ -1051,6 +1161,14 @@ classes: notes: range: string description: Additional note about this identifier (alias for note) + scheme: + range: string + description: Identifier scheme (alias for identifier_scheme, used in some data sources) + value: + any_of: + - range: string + - range: integer + description: Identifier value (alias for identifier_value, used in some data sources) # --------------------------------------------------------------------------- # GHCID BLOCK - Heritage Custodian ID with history @@ -1277,6 +1395,15 @@ classes: resolution_notes: range: string description: Additional notes from location resolution process + specific_location: + range: string + description: More specific location info within the city (e.g., neighborhood, district) + specific_geonames_id: + range: integer + description: GeoNames ID for the specific location (if different from main city) + correction_note: + range: string + description: Note explaining any correction made to the location resolution SourceCoordinates: description: Source of coordinates for resolution @@ -1296,13 +1423,19 @@ classes: attributes: type: range: string - description: Type of research source (e.g., note, wikidata, web_archive, official_source) + description: Type of research source (e.g., note, wikidata, web_archive, official_source, whois) text: range: string description: Text or description of the research source + value: + range: string + description: Value from this source (e.g., plus code, address) notes: 
range: string description: Additional notes about this source + note: + range: string + description: Additional note about this source (singular alias for notes) id: range: string description: Identifier for the source (e.g., Wikidata Q-number) @@ -1315,6 +1448,56 @@ classes: coordinates: range: string description: Coordinates from this source (e.g., "31.515, 34.434") + data: + range: ResearchSourceData + description: Structured data from the source (e.g., WHOIS registrant info) + + ResearchSourceData: + description: Structured data from a research source + attributes: + registrant_name: + range: string + description: WHOIS registrant name + registrant_address: + range: string + description: WHOIS registrant address + registrant_city: + range: string + description: WHOIS registrant city + registrant_state: + range: string + description: WHOIS registrant state/province + registrant_country: + range: string + description: WHOIS registrant country + registrant_postal_code: + range: string + description: WHOIS registrant postal code + # Additional flexible fields for other data types + organization: + range: string + description: Organization name + email: + range: string + description: Contact email + phone: + range: string + description: Contact phone + creation_date: + range: string + description: Domain creation date + updated_date: + range: string + description: Domain updated date + expiration_date: + range: string + description: Domain expiration date + domain_registered: + range: string + description: Domain registration date + registry: + range: string + description: Domain registrar name # --------------------------------------------------------------------------- # GOOGLE MAPS ENRICHMENT @@ -1459,21 +1642,28 @@ classes: - range: HoursStatus description: Opening hours information (string or status object) admission: - range: string - description: Admission price information + any_of: + - range: string + - range: AdmissionInfo + description: Admission price 
information (string or structured object) related_places: - range: string - multivalued: true + any_of: + - range: string + multivalued: true + - range: RelatedPlace + multivalued: true inlined_as_list: true - description: Related places nearby + description: Related places nearby (strings or structured objects) review_topics: range: string multivalued: true inlined_as_list: true description: Topics mentioned in reviews reviews_summary: - range: string - description: Summary of reviews + any_of: + - range: string + - range: ReviewsSummary + description: Summary of reviews (string or structured breakdown) sample_reviews: any_of: - range: string @@ -1510,10 +1700,13 @@ classes: inlined_as_list: true description: Nearby organizations (strings or structured objects) features: - range: string - multivalued: true + any_of: + - range: string + multivalued: true + - range: PlaceFeature + multivalued: true inlined_as_list: true - description: Features of the place + description: Features of the place (strings or key-value objects) hours_status: range: string description: Current opening status (e.g., "Closed · Opens 2 pm Wed") @@ -1590,6 +1783,23 @@ classes: match_notes: range: string description: Notes about how the Google Maps match was determined + price_level: + any_of: + - range: integer + - range: string + description: Google Maps price level (0-4 or string description) + match_warning: + range: string + description: Warning about potential issues with the match + location_note: + range: string + description: Note about the physical location of the place + search_attempted: + range: boolean + description: Whether a Google Maps search was attempted + result: + range: string + description: Result of search operation (found, not_found, found_via_user_link, etc.) 
RejectedGoogleMapsData: description: Rejected Google Maps data preserved for audit trail @@ -1612,6 +1822,53 @@ classes: returned_country: range: string description: Country code actually returned by Google Maps + website: + range: uri + description: Website URL from Google Maps + latitude: + range: float + description: Latitude coordinate + longitude: + range: float + description: Longitude coordinate + enriched_at: + range: datetime + description: When enrichment was performed + + PlaceFeature: + description: A feature flag for a place (e.g., native_garden, shop, volunteers) + class_uri: schema:PropertyValue + attributes: + native_garden: + range: boolean + description: Has a native garden + shop: + range: boolean + description: Has a shop + volunteers: + range: boolean + description: Has volunteers + parking: + range: boolean + description: Has parking + cafe: + range: boolean + description: Has a cafe + restaurant: + range: boolean + description: Has a restaurant + gift_shop: + range: boolean + description: Has a gift shop + wheelchair_accessible: + range: boolean + description: Is wheelchair accessible + guided_tours: + range: boolean + description: Offers guided tours + audio_guide: + range: boolean + description: Offers audio guides LlmVerification: description: LLM-based verification results for Google Maps matching @@ -1696,6 +1953,25 @@ classes: minute: range: integer + ReviewsSummary: + description: Breakdown of reviews by star rating + attributes: + 5_star: + range: integer + description: Number of 5-star reviews + 4_star: + range: integer + description: Number of 4-star reviews + 3_star: + range: integer + description: Number of 3-star reviews + 2_star: + range: integer + description: Number of 2-star reviews + 1_star: + range: integer + description: Number of 1-star reviews + GoogleReview: description: Google Maps review attributes: @@ -1722,6 +1998,16 @@ classes: range: string description: Alias for relative_time_description (review date) + 
AdmissionInfo: + description: Structured admission price information from Google Maps + attributes: + price: + range: string + description: Admission price (e.g., "€9.00") + notes: + range: string + description: Additional notes about admission (e.g., "Additional fees might apply") + PhotoMetadata: description: Google Maps photo metadata attributes: @@ -1805,8 +2091,10 @@ classes: wikidata_temporal: range: WikidataTemporal wikidata_inception: - range: string - description: Inception date (P571) + any_of: + - range: string + - range: WikidataTimeValue + description: Inception date (P571) - can be string or structured time value wikidata_classification: range: WikidataClassification wikidata_instance_of: @@ -1923,6 +2211,29 @@ classes: multivalued: true inlined_as_list: true description: Search terms attempted when looking for Wikidata entity + wikidata_description_nl: + range: string + description: Description in Dutch language + wikidata_claims: + range: WikidataClaims + description: Structured Wikidata claims with property metadata + inlined: true + _resolved_entities: + range: WikidataResolvedEntities + description: Resolved Wikidata property and entity metadata cache + inlined: true + + WikidataClaims: + description: | + Structured Wikidata claims with property metadata and values. + Uses flexible dict-like structure for various claim types. + class_uri: linkml:Any + + WikidataResolvedEntities: + description: | + Cache of resolved Wikidata property and entity metadata. + Keys are property IDs (P123), values are property metadata. 
+ class_uri: linkml:Any WikidataApiMetadata: description: API call metadata @@ -2035,6 +2346,19 @@ classes: inlined_as_list: true description: Main subject (P921) + WikidataTimeValue: + description: Wikidata time value with precision metadata + attributes: + time: + range: string + description: Time value in ISO 8601 format (e.g., +2015-00-00T00:00:00Z) + precision: + range: integer + description: Precision level (9=year, 10=month, 11=day, etc.) + calendarmodel: + range: uri + description: Calendar model URI (e.g., http://www.wikidata.org/entity/Q1985727 for Gregorian) + WikidataEntity: description: Reference to a Wikidata entity attributes: @@ -2081,7 +2405,10 @@ classes: description: Location properties from Wikidata attributes: country: - range: WikidataEntity + any_of: + - range: string + - range: WikidataEntity + description: Country Q-ID (can be string or WikidataEntity object) headquarters_location: range: WikidataEntity coordinates: @@ -2135,8 +2462,10 @@ classes: multivalued: true inlined_as_list: true parent_organization: - range: WikidataEntity - description: Parent organization (P749) + any_of: + - range: string + - range: WikidataEntity + description: Parent organization Q-ID or entity (P749) subsidiary: range: WikidataEntity multivalued: true @@ -2410,12 +2739,54 @@ classes: website_found: range: boolean description: Whether a website was found + official_website: + range: uri + description: Official website URL found during research research_notes: range: string description: Notes from research organizational_change: range: OrganizationalChange description: Organizational change information (closures, mergers, etc.) 
+ # WHOIS/domain information fields + domain: + range: string + description: Domain name of the website + domain_registered: + range: string + description: Date domain was registered (YYYY-MM-DD) + registrar: + range: string + description: Domain registrar name + registration_country: + range: string + description: Country where domain is registered (ISO 3166-1 alpha-2) + site_launched: + range: string + description: Year or date when site was launched + collections: + range: WebCollection + multivalued: true + inlined_as_list: true + description: Collections documented on the website + # Duplicate/canonical entry tracking + is_canonical_entry: + range: boolean + description: Whether this is the canonical entry (vs duplicate) + duplicate_entries: + range: DuplicateEntry + multivalued: true + inlined_as_list: true + description: References to duplicate entries of this institution + organization_status: + range: string + description: Current status of the organization (ACTIVE, CLOSED, etc.) + research_timestamp: + range: datetime + description: When research was performed + website: + range: uri + description: Website URL found during research # Migration tracking fields claims_migrated: range: boolean @@ -2442,6 +2813,12 @@ classes: merger_target: range: string description: Target organization in merger + successor_name: + range: string + description: Name of successor organization (for mergers) + successor_location: + range: string + description: Location of successor organization (for mergers) notes: range: string description: Additional notes @@ -2474,6 +2851,33 @@ classes: range: string description: Archive status (new, updated, etc.) 
+ WebCollection: + description: A collection documented on a heritage institution website + attributes: + name: + range: string + required: true + description: Name of the collection + description: + range: string + description: Description of the collection + url: + range: uri + description: URL to the collection page + type: + range: string + description: Type of collection (oral_history, photographs, documents, etc.) + item_count: + any_of: + - range: integer + - range: string + description: Number of items in the collection (integer or descriptive string) + total_hours: + any_of: + - range: float + - range: string + description: Total hours of content (for audio/video collections) + WebArchiveFailure: description: Failed archive attempt record attributes: @@ -2603,7 +3007,8 @@ classes: - range: string - range: string multivalued: true - description: Extracted value (alias for claim_value, can be string or list) + - range: OpeningHoursMap + description: Extracted value (alias for claim_value, can be string, list, or structured object like opening hours) raw_value: range: string description: Raw value before processing @@ -2728,6 +3133,9 @@ classes: job_title_en: range: string description: Job title in English + department_en: + range: string + description: Department name in English RawSource: description: Raw source information for web enrichment @@ -2741,6 +3149,9 @@ classes: fetch_timestamp: range: datetime description: When the source was fetched + published_date: + range: datetime + description: When the source content was published source_type: range: string description: Type of source (official_website, etc.) 
@@ -2756,6 +3167,63 @@ classes: raw_markdown_hash: range: string description: SHA-256 hash of the raw markdown content + exa_highlights: + range: string + multivalued: true + inlined_as_list: true + description: Highlighted excerpts from Exa search results + exa_highlight_scores: + range: float + multivalued: true + inlined_as_list: true + description: Relevance scores for Exa highlights + + OpeningHoursMap: + description: Opening hours as a day-keyed map + class_uri: schema:OpeningHoursSpecification + attributes: + maandag: + range: string + description: Monday hours (Dutch) + dinsdag: + range: string + description: Tuesday hours (Dutch) + woensdag: + range: string + description: Wednesday hours (Dutch) + donderdag: + range: string + description: Thursday hours (Dutch) + vrijdag: + range: string + description: Friday hours (Dutch) + zaterdag: + range: string + description: Saturday hours (Dutch) + zondag: + range: string + description: Sunday hours (Dutch) + monday: + range: string + description: Monday hours (English) + tuesday: + range: string + description: Tuesday hours (English) + wednesday: + range: string + description: Wednesday hours (English) + thursday: + range: string + description: Thursday hours (English) + friday: + range: string + description: Friday hours (English) + saturday: + range: string + description: Saturday hours (English) + sunday: + range: string + description: Sunday hours (English) SourceReference: description: Structured source reference for a claim @@ -2879,8 +3347,12 @@ classes: range: string description: Note explaining manual correction made to the name merge_notes: - range: string - description: Notes about name merging or deduplication + any_of: + - range: string + - range: MergeNote + multivalued: true + inlined_as_list: true + description: Notes about name merging or deduplication (string or array of structured objects) abbreviation: range: string description: Short form or abbreviation of the name @@ -2891,10 +3363,49 @@ 
classes: range: string description: Official registered name former_names: - range: string - multivalued: true + any_of: + - range: string + multivalued: true + - range: FormerName + multivalued: true inlined_as_list: true - description: Previous names the institution was known by + description: Previous names the institution was known by (strings or structured objects) + short_name: + range: string + description: Short name or commonly used abbreviated form of the institution name + + FormerName: + description: A former name of the institution with optional metadata + attributes: + name: + range: string + required: true + description: The former name + abbreviated: + range: string + description: Abbreviated form of the former name + used_until: + range: string + description: Date until which this name was used (YYYY-MM or YYYY) + used_from: + range: string + description: Date from which this name was used (YYYY-MM or YYYY) + notes: + range: string + description: Additional notes about this former name + + MergeNote: + description: Note about a merge operation between duplicate entries + attributes: + source: + range: string + description: Source entry identifier that was merged + merged_on: + range: string + description: Date when merge occurred (YYYY-MM-DD) + reason: + range: string + description: Reason for the merge (e.g., duplicate Wikidata ID, same place ID) MatchingSource: description: Source that contributed to name consensus @@ -2910,6 +3421,9 @@ classes: score: range: float description: Match score + notes: + range: string + description: Additional notes about this source match AlternativeName: description: Alternative name with language and source information @@ -3168,14 +3682,54 @@ classes: source: range: string description: Source of this platform information - source_references: + description: range: string - multivalued: true + description: Description of this platform + source_references: + any_of: + - range: string + multivalued: true + - range: 
PlatformSourceReference + multivalued: true inlined_as_list: true - description: References to source data + description: References to source data (strings or structured objects) enrichment_source: range: string description: Source of enrichment (e.g., manual_curation, api_scraping) + host_organization: + range: string + description: Organization hosting this platform + host_website: + range: uri + description: Main website of the host organization + language: + range: string + description: Primary language of the platform (ISO 639-1 code) + features: + range: string + multivalued: true + inlined_as_list: true + description: Features of this platform + platforms: + range: string + multivalued: true + inlined_as_list: true + description: Sub-platforms or related platforms + + PlatformSourceReference: + description: Structured source reference for a digital platform + attributes: + url: + range: uri + description: Source URL + fetch_timestamp: + range: datetime + description: When the source was fetched + data_extracted: + range: string + multivalued: true + inlined_as_list: true + description: Data fields extracted from this source # --------------------------------------------------------------------------- # UNESCO ICH ENRICHMENT @@ -3336,6 +3890,12 @@ classes: override_reason: range: string description: Reason for manual coordinate override + source_url: + range: uri + description: URL source of coordinates (e.g., Google Maps link) + note: + range: string + description: Additional note about coordinate provenance # --------------------------------------------------------------------------- # ADDITIONAL ENRICHMENT TYPES @@ -3444,9 +4004,16 @@ classes: range: float review_count: range: integer + description: Number of reviews + reviews: + range: integer + description: Number of reviews (alias for review_count) place_type: range: string description: Type of place (Museum, Cafe, etc.) 
+ type: + range: string + description: Type of place (alias for place_type) MuseumRegisterEnrichment: description: Dutch Museum Register (Museumregister Nederland) data diff --git a/frontend/public/schemas/20251121/linkml/manifest.json b/frontend/public/schemas/20251121/linkml/manifest.json index 8dca4de295..bd1a01dd93 100644 --- a/frontend/public/schemas/20251121/linkml/manifest.json +++ b/frontend/public/schemas/20251121/linkml/manifest.json @@ -1,5 +1,5 @@ { - "generated": "2025-12-08T17:42:08.000Z", + "generated": "2025-12-12T16:08:52.770Z", "version": "1.0.0", "categories": [ { @@ -247,6 +247,11 @@ "path": "modules/classes/ConfidenceMeasure.yaml", "category": "classes" }, + { + "name": "ConflictStatus", + "path": "modules/classes/ConflictStatus.yaml", + "category": "classes" + }, { "name": "ConservationLab", "path": "modules/classes/ConservationLab.yaml", @@ -452,6 +457,16 @@ "path": "modules/classes/FinancialStatement.yaml", "category": "classes" }, + { + "name": "FindingAid", + "path": "modules/classes/FindingAid.yaml", + "category": "classes" + }, + { + "name": "FindingAidType", + "path": "modules/classes/FindingAidType.yaml", + "category": "classes" + }, { "name": "Foremalarkiv", "path": "modules/classes/Foremalarkiv.yaml", @@ -487,11 +502,6 @@ "path": "modules/classes/Fylkesarkiv.yaml", "category": "classes" }, - { - "name": "GLAM", - "path": "modules/classes/GLAM.yaml", - "category": "classes" - }, { "name": "GalleryType", "path": "modules/classes/GalleryType.yaml", @@ -507,6 +517,11 @@ "path": "modules/classes/GiftShop.yaml", "category": "classes" }, + { + "name": "GLAM", + "path": "modules/classes/GLAM.yaml", + "category": "classes" + }, { "name": "GovernmentArchive", "path": "modules/classes/GovernmentArchive.yaml", @@ -518,13 +533,13 @@ "category": "classes" }, { - "name": "HistoricBuilding", - "path": "modules/classes/HistoricBuilding.yaml", + "name": "HistoricalArchive", + "path": "modules/classes/HistoricalArchive.yaml", "category": "classes" }, { 
- "name": "HistoricalArchive", - "path": "modules/classes/HistoricalArchive.yaml", + "name": "HistoricBuilding", + "path": "modules/classes/HistoricBuilding.yaml", "category": "classes" }, { @@ -607,11 +622,6 @@ "path": "modules/classes/Kustodie.yaml", "category": "classes" }, - { - "name": "LGBTArchive", - "path": "modules/classes/LGBTArchive.yaml", - "category": "classes" - }, { "name": "Landsarkiv", "path": "modules/classes/Landsarkiv.yaml", @@ -642,6 +652,11 @@ "path": "modules/classes/LegalResponsibilityCollection.yaml", "category": "classes" }, + { + "name": "LGBTArchive", + "path": "modules/classes/LGBTArchive.yaml", + "category": "classes" + }, { "name": "LibraryType", "path": "modules/classes/LibraryType.yaml", @@ -787,11 +802,6 @@ "path": "modules/classes/Organization.yaml", "category": "classes" }, - { - "name": "OrganizationBranch", - "path": "modules/classes/OrganizationBranch.yaml", - "category": "classes" - }, { "name": "OrganizationalChangeEvent", "path": "modules/classes/OrganizationalChangeEvent.yaml", @@ -807,6 +817,11 @@ "path": "modules/classes/OrganizationalSubdivision.yaml", "category": "classes" }, + { + "name": "OrganizationBranch", + "path": "modules/classes/OrganizationBranch.yaml", + "category": "classes" + }, { "name": "OutdoorSite", "path": "modules/classes/OutdoorSite.yaml", @@ -837,16 +852,6 @@ "path": "modules/classes/PerformingArtsArchive.yaml", "category": "classes" }, - { - "name": "PersonObservation", - "path": "modules/classes/PersonObservation.yaml", - "category": "classes" - }, - { - "name": "PersonOrOrganization", - "path": "modules/classes/PersonOrOrganization.yaml", - "category": "classes" - }, { "name": "PersonalCollectionType", "path": "modules/classes/PersonalCollectionType.yaml", @@ -862,6 +867,16 @@ "path": "modules/classes/Personenstandsarchiv.yaml", "category": "classes" }, + { + "name": "PersonObservation", + "path": "modules/classes/PersonObservation.yaml", + "category": "classes" + }, + { + "name": 
"PersonOrOrganization", + "path": "modules/classes/PersonOrOrganization.yaml", + "category": "classes" + }, { "name": "PhotoArchive", "path": "modules/classes/PhotoArchive.yaml", @@ -1323,6 +1338,11 @@ "path": "modules/enums/CommercialCustodianTypeEnum.yaml", "category": "enums" }, + { + "name": "ConflictStatusEnum", + "path": "modules/enums/ConflictStatusEnum.yaml", + "category": "enums" + }, { "name": "CustodianPrimaryTypeEnum", "path": "modules/enums/CustodianPrimaryTypeEnum.yaml", @@ -1428,11 +1448,6 @@ "path": "modules/enums/OfficialInstitutionTypeEnum.yaml", "category": "enums" }, - { - "name": "OrganizationBranchTypeEnum", - "path": "modules/enums/OrganizationBranchTypeEnum.yaml", - "category": "enums" - }, { "name": "OrganizationalChangeEventTypeEnum", "path": "modules/enums/OrganizationalChangeEventTypeEnum.yaml", @@ -1443,6 +1458,11 @@ "path": "modules/enums/OrganizationalUnitTypeEnum.yaml", "category": "enums" }, + { + "name": "OrganizationBranchTypeEnum", + "path": "modules/enums/OrganizationBranchTypeEnum.yaml", + "category": "enums" + }, { "name": "PersonalCollectionTypeEnum", "path": "modules/enums/PersonalCollectionTypeEnum.yaml", @@ -1744,6 +1764,11 @@ "path": "modules/slots/confidence_value.yaml", "category": "slots" }, + { + "name": "conflict_status", + "path": "modules/slots/conflict_status.yaml", + "category": "slots" + }, { "name": "contact", "path": "modules/slots/contact.yaml", @@ -1824,6 +1849,11 @@ "path": "modules/slots/digital_platform.yaml", "category": "slots" }, + { + "name": "digitization_status", + "path": "modules/slots/digitization_status.yaml", + "category": "slots" + }, { "name": "dissolution_date", "path": "modules/slots/dissolution_date.yaml", @@ -1839,6 +1869,11 @@ "path": "modules/slots/documentation_url.yaml", "category": "slots" }, + { + "name": "embargo_end_date", + "path": "modules/slots/embargo_end_date.yaml", + "category": "slots" + }, { "name": "emic_name", "path": "modules/slots/emic_name.yaml", @@ -2229,6 +2264,11 
@@ "path": "modules/slots/parent_custodian.yaml", "category": "slots" }, + { + "name": "parent_society", + "path": "modules/slots/parent_society.yaml", + "category": "slots" + }, { "name": "parent_unit", "path": "modules/slots/parent_unit.yaml", @@ -2279,6 +2319,16 @@ "path": "modules/slots/platform_type.yaml", "category": "slots" }, + { + "name": "policy_id", + "path": "modules/slots/policy_id.yaml", + "category": "slots" + }, + { + "name": "policy_name", + "path": "modules/slots/policy_name.yaml", + "category": "slots" + }, { "name": "portal_data_sources", "path": "modules/slots/portal_data_sources.yaml", @@ -2359,6 +2409,11 @@ "path": "modules/slots/retrieved_on.yaml", "category": "slots" }, + { + "name": "rico_equivalent", + "path": "modules/slots/rico_equivalent.yaml", + "category": "slots" + }, { "name": "role_end_date", "path": "modules/slots/role_end_date.yaml", @@ -2384,6 +2439,16 @@ "path": "modules/slots/safeguards.yaml", "category": "slots" }, + { + "name": "security_level", + "path": "modules/slots/security_level.yaml", + "category": "slots" + }, + { + "name": "serves_finding_aids", + "path": "modules/slots/serves_finding_aids.yaml", + "category": "slots" + }, { "name": "service_area", "path": "modules/slots/service_area.yaml", @@ -2504,6 +2569,11 @@ "path": "modules/slots/time_of_destruction.yaml", "category": "slots" }, + { + "name": "typical_domains", + "path": "modules/slots/typical_domains.yaml", + "category": "slots" + }, { "name": "unit_affiliation", "path": "modules/slots/unit_affiliation.yaml", @@ -2568,6 +2638,11 @@ "name": "website", "path": "modules/slots/website.yaml", "category": "slots" + }, + { + "name": "wikidata_id", + "path": "modules/slots/wikidata_id.yaml", + "category": "slots" } ] } diff --git a/frontend/src/components/database/PointDetailsPanel.tsx b/frontend/src/components/database/PointDetailsPanel.tsx index f81b604f31..6ea1fb79af 100644 --- a/frontend/src/components/database/PointDetailsPanel.tsx +++ 
b/frontend/src/components/database/PointDetailsPanel.tsx @@ -16,6 +16,7 @@ import React, { useState, useRef, useCallback, useEffect, memo } from 'react'; import type { EmbeddingPoint } from './EmbeddingProjector'; +import { isTargetInsideAny } from '../../utils/dom'; import './PointDetailsPanel.css'; interface NearestNeighbor { @@ -169,9 +170,7 @@ const PointDetailsPanelComponent: React.FC = ({ // Drag handlers const handleMouseDown = useCallback((e: React.MouseEvent) => { // Don't start drag if clicking on buttons - if ((e.target as HTMLElement).closest('button') || - (e.target as HTMLElement).closest('input') || - (e.target as HTMLElement).closest('.neighbor-item')) { + if (isTargetInsideAny(e.target, ['button', 'input', '.neighbor-item'])) { return; } diff --git a/frontend/src/components/layout/Layout.css b/frontend/src/components/layout/Layout.css index 84680e4e64..5f38ef7a02 100644 --- a/frontend/src/components/layout/Layout.css +++ b/frontend/src/components/layout/Layout.css @@ -23,13 +23,14 @@ } /* Ensure content area can grow but footer stays visible */ -.layout-content > *:not(.layout-footer) { +.layout-content > *:not(.layout-footer):not(.layout-main) { flex-shrink: 0; } /* Main content wrapper - takes available space */ .layout-main { flex: 1 0 auto; /* Grow to fill space, don't shrink, auto basis */ + min-height: min-content; /* At least as tall as content */ } /* Footer Styles - minimal, at the very bottom */ diff --git a/frontend/src/components/layout/Layout.tsx b/frontend/src/components/layout/Layout.tsx index e646a1080d..3f158b8759 100644 --- a/frontend/src/components/layout/Layout.tsx +++ b/frontend/src/components/layout/Layout.tsx @@ -10,7 +10,7 @@ import { useLanguage } from '../../contexts/LanguageContext'; import './Layout.css'; // Pages that handle their own footer (full-screen apps with sidebars) -const PAGES_WITH_CUSTOM_FOOTER = ['/map', '/visualize', '/query-builder', '/linkml', '/ontology']; +const PAGES_WITH_CUSTOM_FOOTER = ['/map', 
'/visualize', '/query-builder', '/linkml', '/ontology', '/conversation']; export function Layout() { const currentYear = new Date().getFullYear(); diff --git a/frontend/src/components/layout/Navigation.css b/frontend/src/components/layout/Navigation.css index 39031aee55..42f1236c11 100644 --- a/frontend/src/components/layout/Navigation.css +++ b/frontend/src/components/layout/Navigation.css @@ -505,6 +505,18 @@ font-weight: 500; } +/* External Link Icon */ +.nav-external-icon { + margin-left: 0.35rem; + font-size: 0.75em; + opacity: 0.7; +} + +.nav-dropdown-item:hover .nav-external-icon, +.nav-mobile-link:hover .nav-external-icon { + opacity: 1; +} + /* Mobile Section Styles */ .nav-mobile-section { border-bottom: 1px solid rgba(23, 42, 89, 0.1); diff --git a/frontend/src/components/layout/Navigation.tsx b/frontend/src/components/layout/Navigation.tsx index 50d02ff87b..efd29195b4 100644 --- a/frontend/src/components/layout/Navigation.tsx +++ b/frontend/src/components/layout/Navigation.tsx @@ -9,6 +9,7 @@ import { Link, useLocation, useNavigate } from 'react-router-dom'; import { useAuth } from '../../contexts/AuthContext'; import { useLanguage, translations } from '../../contexts/LanguageContext'; import { useUIState } from '../../contexts/UIStateContext'; +import { isTargetInsideAny } from '../../utils/dom'; import './Navigation.css'; export function Navigation() { @@ -41,14 +42,13 @@ export function Navigation() { const WHEEL_RESET_DELAY = 300; // Reset wheel accumulator after this many ms of no wheel events const handleScroll = (e: Event) => { - const target = e.target as Element; - // Ignore scroll events from the navigation itself - if (target.closest('.navigation') || target.closest('.nav-mobile-menu')) { + if (isTargetInsideAny(e.target, ['.navigation', '.nav-mobile-menu'])) { return; } // Get scroll position from the target element + const target = e.target as Element; const scrollTop = target instanceof HTMLElement ? 
target.scrollTop : 0; const scrollingUp = scrollTop < lastScrollTop; const scrollDelta = lastScrollTop - scrollTop; @@ -78,10 +78,8 @@ export function Navigation() { // Wheel event handler for non-scrollable areas const handleWheel = (e: WheelEvent) => { - const target = e.target as Element; - // Ignore wheel events from the navigation itself - if (target.closest('.navigation') || target.closest('.nav-mobile-menu')) { + if (isTargetInsideAny(e.target, ['.navigation', '.nav-mobile-menu'])) { return; } @@ -381,6 +379,15 @@ export function Navigation() { {t('preferences')} + + {t('database')} + + )} @@ -500,6 +507,15 @@ export function Navigation() { {t('preferences')} + + {t('database')} + + diff --git a/frontend/src/components/map/InstitutionInfoPanel.tsx b/frontend/src/components/map/InstitutionInfoPanel.tsx index efcea105d3..dbfce26b6c 100644 --- a/frontend/src/components/map/InstitutionInfoPanel.tsx +++ b/frontend/src/components/map/InstitutionInfoPanel.tsx @@ -24,6 +24,7 @@ import { CustodianTimeline } from './CustodianTimeline'; // import { VoronoiStippling } from './VoronoiStippling'; import { ErrorBoundary } from '../common/ErrorBoundary'; import { safeString } from '../../utils/safeString'; +import { isTargetInsideAny } from '../../utils/dom'; import { useWikidataImage } from '../../hooks/useWikidataImage'; import type { Archive } from '../../types/werkgebied'; @@ -428,6 +429,10 @@ const InstitutionInfoPanelComponent: React.FC = ({ // Tab state for info/youtube const [activeTab, setActiveTab] = useState<'info' | 'youtube'>('info'); + // Export dropdown state + const [showExportMenu, setShowExportMenu] = useState(false); + const exportMenuRef = useRef(null); + // Track if user has manually positioned this panel const hasUserPositioned = useRef(false); @@ -474,8 +479,7 @@ const InstitutionInfoPanelComponent: React.FC = ({ // Drag handlers const handleMouseDown = useCallback((e: React.MouseEvent) => { // Don't start drag if clicking on buttons or links - if 
((e.target as HTMLElement).closest('button') || - (e.target as HTMLElement).closest('a')) { + if (isTargetInsideAny(e.target, ['button', 'a'])) { return; } @@ -551,6 +555,19 @@ const InstitutionInfoPanelComponent: React.FC = ({ return () => window.removeEventListener('keydown', handleKeyDown); }, [onClose]); + // Close export menu when clicking outside + useEffect(() => { + const handleClickOutside = (e: MouseEvent) => { + if (exportMenuRef.current && !exportMenuRef.current.contains(e.target as Node)) { + setShowExportMenu(false); + } + }; + if (showExportMenu) { + document.addEventListener('mousedown', handleClickOutside); + } + return () => document.removeEventListener('mousedown', handleClickOutside); + }, [showExportMenu]); + // GHCID click handler - cycle through displays const handleGhcidClick = () => { if (ghcidDisplay === 'current') { diff --git a/frontend/src/components/map/MediaGallery.tsx b/frontend/src/components/map/MediaGallery.tsx index 5aa322bd62..4449de60d2 100644 --- a/frontend/src/components/map/MediaGallery.tsx +++ b/frontend/src/components/map/MediaGallery.tsx @@ -762,6 +762,36 @@ const MediaGalleryComponent: React.FC = ({ const [youtubeReady, setYoutubeReady] = useState(false); const [failedPhotoUrls, setFailedPhotoUrls] = useState>(new Set()); + /** + * Get specific failure reason based on URL pattern + * Returns user-friendly error message for common failure types + * @internal Reserved for future diagnostic display + */ + const getFailureReason = (url: string): string => { + const lowerUrl = url.toLowerCase(); + + // Google Places/Maps images require API key + if (lowerUrl.includes('lh3.googleusercontent.com') || + lowerUrl.includes('maps.googleapis.com') || + lowerUrl.includes('googleusercontent.com/p/')) { + return t('Google Places afbeelding niet beschikbaar (API-sleutel vereist)', + 'Google Places image unavailable (API key required)'); + } + + // Relative URLs that weren't resolved + if (!url.startsWith('http://') && 
!url.startsWith('https://')) { + return t('Ongeldige URL (niet volledig)', 'Invalid URL (not fully resolved)'); + } + + // YouTube videos in image field + if (lowerUrl.includes('youtube.com/watch') || lowerUrl.includes('youtu.be/')) { + return t('Video-URL in afbeeldingsveld', 'Video URL in image field'); + } + + // Generic expired/404 error + return t('Afbeelding kon niet worden geladen', 'Image could not be loaded'); + }; + // Mark failed photos but DON'T remove them - prevents crash when all photos fail const effectivePhotos = useMemo(() => { // Start with initial photos, add wikidata image if available and not already included @@ -770,11 +800,11 @@ const MediaGalleryComponent: React.FC = ({ allPhotos.push({ url: wikidataImageUrl, attribution: 'Wikimedia Commons' }); } - // Mark failed photos but DON'T remove them + // Mark failed photos with specific failure reasons const photosWithStatus = allPhotos.map(photo => ({ ...photo, failed: failedPhotoUrls.has(photo.url), - failReason: failedPhotoUrls.has(photo.url) ? 'Image could not be loaded' : undefined + failReason: failedPhotoUrls.has(photo.url) ? 
getFailureReason(photo.url) : undefined })); // Sort: working images first, failed images last @@ -783,7 +813,7 @@ const MediaGalleryComponent: React.FC = ({ if (!a.failed && b.failed) return -1; return 0; }); - }, [initialPhotos, failedPhotoUrls, wikidataImageUrl]); + }, [initialPhotos, failedPhotoUrls, wikidataImageUrl, t]); const containerRef = useRef(null); const playerRef = useRef(null); diff --git a/frontend/src/components/query/ConversationPanel.css b/frontend/src/components/query/ConversationPanel.css index 7da3305f39..4647d58990 100644 --- a/frontend/src/components/query/ConversationPanel.css +++ b/frontend/src/components/query/ConversationPanel.css @@ -125,6 +125,14 @@ border-bottom-right-radius: 4px; } +/* Ensure all text inside user messages is white */ +.conversation-panel__message--user .conversation-panel__message-content p, +.conversation-panel__message--user .conversation-panel__message-content span, +.conversation-panel__message--user .conversation-panel__loading, +.conversation-panel__message--user .conversation-panel__error { + color: white; +} + .conversation-panel__message--assistant .conversation-panel__message-content { background: var(--surface-secondary, #f5f5f5); color: var(--text-primary, #212121); @@ -497,6 +505,12 @@ color: var(--error-color, #d32f2f); } +.conversation-panel__toolbar-btn--warning:hover:not(:disabled) { + background: #fff3e0; + border-color: #ff9800; + color: #f57c00; +} + /* History Dropdown */ .conversation-panel__history-selector { position: relative; diff --git a/frontend/src/components/query/ConversationPanel.tsx b/frontend/src/components/query/ConversationPanel.tsx index 150b33ca7e..fbab55c535 100644 --- a/frontend/src/components/query/ConversationPanel.tsx +++ b/frontend/src/components/query/ConversationPanel.tsx @@ -18,7 +18,7 @@ */ import React, { useState, useRef, useEffect, useCallback } from 'react'; -import { Send, Loader2, Sparkles, AlertCircle, Copy, Check, ChevronDown, History, Download, Upload, Trash2, 
X } from 'lucide-react'; +import { Send, Loader2, Sparkles, AlertCircle, Copy, Check, ChevronDown, History, Download, Upload, Trash2, X, RefreshCw } from 'lucide-react'; import { useLanguage } from '../../contexts/LanguageContext'; import './ConversationPanel.css'; @@ -87,6 +87,8 @@ const TEXT = { exportSuccess: { nl: 'Conversatie geëxporteerd', en: 'Conversation exported' }, importSuccess: { nl: 'Conversatie geïmporteerd', en: 'Conversation imported' }, importError: { nl: 'Ongeldig bestand', en: 'Invalid file' }, + resetCache: { nl: 'Cache wissen', en: 'Clear cache' }, + cacheCleared: { nl: 'Cache gewist - probeer uw vraag opnieuw', en: 'Cache cleared - try your question again' }, }; // Example questions to help users get started - shorter list @@ -453,6 +455,39 @@ export const ConversationPanel: React.FC = ({ onQueryGen showNotification(t('conversationCleared')); }; + /** + * Clear all caches (IndexedDB semantic cache + conversation) and reload + */ + const handleResetCache = async () => { + try { + // Clear IndexedDB semantic cache + const databases = ['GLAM_SemanticCache', 'GLAM_InstitutionsCache']; + for (const dbName of databases) { + await new Promise((resolve) => { + const request = indexedDB.deleteDatabase(dbName); + request.onsuccess = () => resolve(); + request.onerror = () => resolve(); // Continue even if error + request.onblocked = () => resolve(); + }); + } + + // Clear messages + setMessages([]); + + // Show notification + showNotification(t('cacheCleared')); + + // Reload after short delay to ensure notification is seen + setTimeout(() => { + window.location.reload(); + }, 1500); + } catch (err) { + console.error('Failed to clear cache:', err); + // Force reload anyway + window.location.reload(); + } + }; + return (
{/* Notification Toast */} @@ -600,6 +635,16 @@ export const ConversationPanel: React.FC = ({ onQueryGen + {/* Reset Cache Button - clears IndexedDB and reloads */} + + {/* Clear Conversation Button */} {messages.length > 0 && (
- + + {/* Header Actions */} +
+ {/* Export Button */} +
+ + {showExportMenu && ( +
+
+
Download
+ + + + {profileData?.career_history && profileData.career_history.length > 0 && ( + + )} +
+
+
Copy to Clipboard
+ + + +
+
+ )} + {copySuccess && ( +
+ ✓ {copySuccess} copied! +
+ )} +
+ + +
{/* Content - Always show all available metadata */} @@ -396,48 +769,151 @@ const PersonInfoPanelComponent: React.FC = ({ )} - {/* Career History */} - {profileData?.career_history && profileData.career_history.length > 0 && ( -
-
Career History
-
- {profileData.career_history.slice(0, 5).map((job, index) => ( -
-
-
{job.role}
-
{job.dates}
- {job.current && ( - Current - )} -
-
{job.organization}
- {job.location && ( -
{job.location}
- )} - {job.description && ( -
{job.description}
- )} -
- {job.level && ( - {job.level} - )} - {job.company_size && ( - {job.company_size} - )} - {job.industry && ( - {job.industry} - )} -
-
- ))} - {profileData.career_history.length > 5 && ( -
- +{profileData.career_history.length - 5} more positions -
- )} + {/* Career History - Accordion UI */} + {profileData?.career_history && profileData.career_history.length > 0 && (() => { + // Pre-process career history to handle field variants and filter empty items + const validCareerItems = profileData.career_history + .map((job, originalIndex) => { + // Handle field name variants + const jobRole = job.role || job.title || null; + const jobCompany = job.organization || job.company || null; + const jobLocation = job.location || null; + const jobDates = job.dates || job.duration || job.duration_text || null; + + // Handle "Unknown" role as effectively empty + const displayRole = (jobRole && jobRole !== 'Unknown') ? jobRole : null; + + // Skip items with no meaningful content + if (!displayRole && !jobCompany) return null; + + return { + ...job, + displayRole, + jobCompany, + jobLocation, + jobDates, + originalIndex, + }; + }) + .filter(Boolean); + + if (validCareerItems.length === 0) return null; + + return ( +
+
+ Career History ({validCareerItems.length} position{validCareerItems.length !== 1 ? 's' : ''}) +
+
+ {validCareerItems.map((job) => { + if (!job) return null; + const isExpanded = expandedCareerItems.has(job.originalIndex); + const hasDetails = job.description || job.level || job.company_size || job.industry; + + return ( +
+ {/* Accordion Header - Always Visible */} + + + {/* Accordion Content - Collapsible */} +
+
+ {/* Full Location - only show if location exists, has content, and contains a comma */} + {job.jobLocation && job.jobLocation.trim() && job.jobLocation.includes(',') && ( +
+ Location: + {job.jobLocation} +
+ )} + + {/* Description */} + {job.description && ( +
+ {job.description} +
+ )} + + {/* Metadata Badges */} + {hasDetails && ( +
+ {job.level && ( + + 📊 + {job.level} + + )} + {job.company_size && ( + + 👥 + {job.company_size} + + )} + {job.industry && ( + + 🏢 + {job.industry} + + )} +
+ )} +
+
+
+ ); + })} +
-
- )} + ); + })()} {/* Role Category */} {roleLabel && ( diff --git a/frontend/src/hooks/useGeoApiInstitutions.ts b/frontend/src/hooks/useGeoApiInstitutions.ts index e6219d0e40..7e1fd30c1a 100644 --- a/frontend/src/hooks/useGeoApiInstitutions.ts +++ b/frontend/src/hooks/useGeoApiInstitutions.ts @@ -515,6 +515,8 @@ interface GeoAPIFeature { social_media?: Record | string; // Logo URL extracted from web claims logo_url?: string; + // Web claims - structured data extracted from websites + web_claims?: string | { claims?: WebClaim[] } | WebClaim[]; // YouTube enrichment - may be object or JSON string youtube_enrichment?: string | { status?: string; @@ -575,6 +577,19 @@ interface GeoAPISearchResponse { results: GeoAPISearchResult[]; } +/** + * Web claim from website scraping + */ +interface WebClaim { + claim_type?: string; + claim_value?: string; + raw_value?: string; + source_url?: string; + retrieved_on?: string; + xpath?: string; + extraction_method?: string; +} + /** * Loading progress for UI feedback */ @@ -613,6 +628,124 @@ function parseProvinceFromGhcid(ghcid: string | null | undefined): string { return PROVINCE_CODE_MAP[code] || ''; } +/** + * Parse web_claims from JSON string or object + */ +function parseWebClaims(value: unknown): WebClaim[] | undefined { + if (!value) return undefined; + + try { + let parsed = value; + if (typeof value === 'string') { + parsed = JSON.parse(value); + } + + // Web claims can be an array directly or wrapped in an object + const claims: WebClaim[] = Array.isArray(parsed) ? parsed : ((parsed as Record).claims as WebClaim[] || []); + + return claims.length > 0 ? claims : undefined; + } catch { + return undefined; + } +} + +/** + * Resolve a potentially relative URL against a base URL + */ +function resolveUrl(url: string, baseUrl?: string): string { + // Already absolute URL + if (url.startsWith('http://') || url.startsWith('https://') || url.startsWith('//')) { + return url.startsWith('//') ? 
`https:${url}` : url; + } + + // No base URL to resolve against + if (!baseUrl) return url; + + try { + // Use URL API to resolve relative URLs + const base = new URL(baseUrl); + return new URL(url, base).href; + } catch { + // If URL parsing fails, return as-is + return url; + } +} + +/** + * Check if a URL is a valid image URL (not a video, not relative without base) + */ +function isValidImageUrl(url: string): boolean { + if (!url) return false; + + // Must be absolute URL + if (!url.startsWith('http://') && !url.startsWith('https://')) { + return false; + } + + // Filter out non-image URLs + const invalidPatterns = [ + 'youtube.com/watch', + 'youtu.be/', + 'vimeo.com/', + 'twitter.com/', + 'facebook.com/', + '.mp4', + '.webm', + '.mov', + '.avi', + ]; + + const lowerUrl = url.toLowerCase(); + return !invalidPatterns.some(pattern => lowerUrl.includes(pattern)); +} + +/** + * Extract logo URL from web_claims - prefer logo_img_attr extraction method + * Priority: logo_img_attr > og_image > favicon_link > others + * Also resolves relative URLs against source_url + */ +function extractLogoFromWebClaims(webClaimsValue: unknown): string | undefined { + const claims = parseWebClaims(webClaimsValue); + if (!claims || claims.length === 0) return undefined; + + // Filter for logo claims + const logoClaims = claims.filter(c => c.claim_type === 'logo' && c.claim_value); + + if (logoClaims.length === 0) return undefined; + + // Sort by preference: logo_img_attr > og_image > favicon_link > others + const sorted = logoClaims.sort((a, b) => { + const priority: Record = { + 'logo_img_attr': 3, + 'og_image': 2, + 'favicon_link': 1, + }; + return (priority[b.extraction_method || ''] || 0) - (priority[a.extraction_method || ''] || 0); + }); + + // Filter out favicons, loading placeholders, and non-image URLs + const bestLogo = sorted.find(c => { + const url = c.claim_value || ''; + // Skip favicon-like URLs + if (c.extraction_method === 'favicon_link') return false; + // Skip very 
small images or placeholder images + if (url.includes('favicon') || url.includes('loading')) return false; + + // Resolve the URL and check if it's valid + const resolvedUrl = resolveUrl(url, c.source_url); + return isValidImageUrl(resolvedUrl); + }); + + const selectedClaim = bestLogo || sorted[0]; + if (!selectedClaim?.claim_value) return undefined; + + // Resolve relative URLs against source_url + const resolvedUrl = resolveUrl(selectedClaim.claim_value, selectedClaim.source_url); + + // Final validation - only return if it's a valid image URL + return isValidImageUrl(resolvedUrl) ? resolvedUrl : undefined; +} + /** * Safely extract a year number from a value that might be: * - A number (return as-is) @@ -985,7 +1118,11 @@ function featureToInstitution(feature: GeoAPIFeature): Institution { dissolution_year: safeExtractYear(props.dissolution_year), social_media: normalizeSocialMedia(props.social_media), youtube: normalizeYouTubeEnrichment(props.youtube_enrichment), - logo_url: props.logo_url, + // Extract logo URL from web_claims (primary) or use direct logo_url prop (fallback) + // Priority: web_claims logo_img_attr > web_claims og_image > props.logo_url + // Only use props.logo_url if it's a valid absolute image URL (not relative, not video) + logo_url: extractLogoFromWebClaims(props.web_claims) || + (props.logo_url && isValidImageUrl(props.logo_url) ? props.logo_url : undefined), }; } @@ -1578,6 +1715,11 @@ function detailResponseToInstitution(data: Record): Institution // Handle social media const socialMedia = data.social_media as Record | undefined; + // Extract logo URL from web_claims (primary) or use direct logo_url (fallback) + // Priority: web_claims logo_img_attr > web_claims og_image > props.logo_url + const logoUrl = extractLogoFromWebClaims(data.web_claims) || + (data.logo_url && isValidImageUrl(data.logo_url as string) ? 
data.logo_url as string : undefined); + return { lat: data.lat as number, lon: data.lon as number, @@ -1621,6 +1763,7 @@ function detailResponseToInstitution(data: Record): Institution dissolution_year: dissolutionYear, social_media: socialMedia, youtube, + logo_url: logoUrl, }; } @@ -1682,7 +1825,7 @@ export function useInstitutionDetail(ghcid: string | null): UseInstitutionDetail // Transform it to Institution format const inst = detailResponseToInstitution(data); - console.log('[GeoAPI Detail] Loaded institution:', inst.name); + console.log('[GeoAPI Detail] Loaded institution:', inst.name, 'logo_url:', inst.logo_url, 'youtube:', inst.youtube); // Store in cache detailCache.set(ghcid, { data: inst, timestamp: Date.now() }); diff --git a/frontend/src/hooks/useMultiDatabaseRAG.ts b/frontend/src/hooks/useMultiDatabaseRAG.ts index 848e8cd8ed..d99954d0ed 100644 --- a/frontend/src/hooks/useMultiDatabaseRAG.ts +++ b/frontend/src/hooks/useMultiDatabaseRAG.ts @@ -354,21 +354,56 @@ async function callDSPy( answer: m.role === 'assistant' ? m.content : '', })).filter(m => m.question || m.answer) || []; - const response = await fetch(`${DSPY_URL}/query`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - question, - language: options.language || 'nl', - context: conversationContext, // Backend expects conversation history here - include_visualization: true, - }), - }); + let response: Response; + try { + response = await fetch(`${DSPY_URL}/query`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + question, + language: options.language || 'nl', + context: conversationContext, // Backend expects conversation history here + include_visualization: true, + }), + }); + } catch (networkError) { + // Network error - server unreachable + console.error('[DSPy] Network error:', networkError); + const lang = options.language || 'nl'; + return { + answer: lang === 'nl' + ? 
'⚠️ **Serverfout**: Kan geen verbinding maken met de RAG-server. Controleer of de backend draait op poort 8003.' + : '⚠️ **Server Error**: Cannot connect to RAG server. Check if backend is running on port 8003.', + confidence: 0, + }; + } if (!response.ok) { - // Fallback response if DSPy service unavailable + // HTTP error - log details for debugging + console.error(`[DSPy] HTTP ${response.status}: ${response.statusText}`); + const lang = options.language || 'nl'; + + if (response.status === 404) { + return { + answer: lang === 'nl' + ? '⚠️ **Serverfout (404)**: De RAG API endpoint is niet gevonden. Controleer de proxy configuratie in vite.config.ts en herstart de frontend.' + : '⚠️ **Server Error (404)**: RAG API endpoint not found. Check proxy configuration in vite.config.ts and restart frontend.', + confidence: 0, + }; + } + + if (response.status >= 500) { + return { + answer: lang === 'nl' + ? `⚠️ **Serverfout (${response.status})**: De RAG-server heeft een interne fout. Controleer de backend logs.` + : `⚠️ **Server Error (${response.status})**: RAG server internal error. 
Check backend logs.`, + confidence: 0, + }; + } + + // Other HTTP errors - fall back to context-based answer return { - answer: generateFallbackAnswer(question, context, options.language || 'nl'), + answer: generateFallbackAnswer(question, context, lang), confidence: 0.5, }; } diff --git a/frontend/src/hooks/useProgressiveInstitutions.ts b/frontend/src/hooks/useProgressiveInstitutions.ts index d522a69863..3900bd2bfd 100644 --- a/frontend/src/hooks/useProgressiveInstitutions.ts +++ b/frontend/src/hooks/useProgressiveInstitutions.ts @@ -137,6 +137,135 @@ interface FullFeature { properties: Record; } +// Parse web_claims JSON for logo extraction +interface WebClaim { + claim_type?: string; + claim_value?: string; + raw_value?: string; + source_url?: string; + retrieved_on?: string; + xpath?: string; + extraction_method?: string; +} + +/** + * Parse web_claims from JSON string or object + */ +function parseWebClaims(value: unknown): WebClaim[] | undefined { + if (!value) return undefined; + + try { + let parsed = value; + if (typeof value === 'string') { + parsed = JSON.parse(value); + } + + // Web claims can be an array directly or wrapped in an object + const claims: WebClaim[] = Array.isArray(parsed) ? parsed : ((parsed as Record).claims as WebClaim[] || []); + + return claims.length > 0 ? claims : undefined; + } catch { + return undefined; + } +} + +/** + * Resolve a potentially relative URL against a base URL + */ +function resolveUrl(url: string, baseUrl?: string): string { + // Already absolute URL + if (url.startsWith('http://') || url.startsWith('https://') || url.startsWith('//')) { + return url.startsWith('//') ? 
`https:${url}` : url; + } + + // No base URL to resolve against + if (!baseUrl) return url; + + try { + // Use URL API to resolve relative URLs + const base = new URL(baseUrl); + return new URL(url, base).href; + } catch { + // If URL parsing fails, return as-is + return url; + } +} + +/** + * Check if a URL is a valid image URL (not a video, not relative without base) + */ +function isValidImageUrl(url: string): boolean { + if (!url) return false; + + // Must be absolute URL + if (!url.startsWith('http://') && !url.startsWith('https://')) { + return false; + } + + // Filter out non-image URLs + const invalidPatterns = [ + 'youtube.com/watch', + 'youtu.be/', + 'vimeo.com/', + 'twitter.com/', + 'facebook.com/', + '.mp4', + '.webm', + '.mov', + '.avi', + ]; + + const lowerUrl = url.toLowerCase(); + return !invalidPatterns.some(pattern => lowerUrl.includes(pattern)); +} + +/** + * Extract logo URL from web_claims - prefer logo_img_attr extraction method + * Priority: logo_img_attr > og_image > favicon_link > others + * Also resolves relative URLs against source_url + */ +function extractLogoFromWebClaims(webClaimsValue: unknown): string | undefined { + const claims = parseWebClaims(webClaimsValue); + if (!claims || claims.length === 0) return undefined; + + // Filter for logo claims + const logoClaims = claims.filter(c => c.claim_type === 'logo' && c.claim_value); + + if (logoClaims.length === 0) return undefined; + + // Sort by preference: logo_img_attr > og_image > favicon_link > others + const sorted = logoClaims.sort((a, b) => { + const priority: Record = { + 'logo_img_attr': 3, + 'og_image': 2, + 'favicon_link': 1, + }; + return (priority[b.extraction_method || ''] || 0) - (priority[a.extraction_method || ''] || 0); + }); + + // Filter out favicons, loading placeholders, and non-image URLs + const bestLogo = sorted.find(c => { + const url = c.claim_value || ''; + // Skip favicon-like URLs + if (c.extraction_method === 'favicon_link') return false; + // Skip very 
small images or placeholder images + if (url.includes('favicon') || url.includes('loading')) return false; + + // Resolve the URL and check if it's valid + const resolvedUrl = resolveUrl(url, c.source_url); + return isValidImageUrl(resolvedUrl); + }); + + const selectedClaim = bestLogo || sorted[0]; + if (!selectedClaim?.claim_value) return undefined; + + // Resolve relative URLs against source_url + const resolvedUrl = resolveUrl(selectedClaim.claim_value, selectedClaim.source_url); + + // Final validation - only return if it's a valid image URL + return isValidImageUrl(resolvedUrl) ? resolvedUrl : undefined; +} + function fullFeatureToInstitution(feature: FullFeature): Institution { const props = feature.properties; const [lon, lat] = feature.geometry.coordinates; @@ -149,6 +278,13 @@ function fullFeatureToInstitution(feature: FullFeature): Institution { const socialMedia = parseSocialMedia(props.social_media); const youtube = parseYouTube(props.youtube_enrichment); + // Extract logo URL from web_claims (primary) or use direct logo_url prop (fallback) + // Priority: web_claims logo_img_attr > web_claims og_image > props.logo_url + // Only use props.logo_url if it's a valid absolute image URL (not relative, not video) + const webClaimsLogo = extractLogoFromWebClaims(props.web_claims); + const fallbackLogo = props.logo_url as string | undefined; + const logoUrl = webClaimsLogo || (fallbackLogo && isValidImageUrl(fallbackLogo) ? 
fallbackLogo : undefined); + return { lat, lon, @@ -183,6 +319,7 @@ function fullFeatureToInstitution(feature: FullFeature): Institution { dissolution_year: safeExtractYear(props.dissolution_year), social_media: socialMedia, youtube, + logo_url: logoUrl, }; } diff --git a/frontend/src/lib/storage/institutions-cache.ts b/frontend/src/lib/storage/institutions-cache.ts index 93e9678ea3..4658a9aa37 100644 --- a/frontend/src/lib/storage/institutions-cache.ts +++ b/frontend/src/lib/storage/institutions-cache.ts @@ -67,7 +67,7 @@ const CACHE_KEY = 'all_institutions'; const DEFAULT_CONFIG: InstitutionsCacheConfig = { staleTtlMs: 1 * 60 * 60 * 1000, // 1 hour - trigger background refresh expiredTtlMs: 24 * 60 * 60 * 1000, // 24 hours - force foreground refresh - cacheVersion: '1.0.0', + cacheVersion: '1.1.0', // Bumped from 1.0.0 to include logo_url in cached data }; // ============================================================================ diff --git a/frontend/src/lib/storage/ui-state.ts b/frontend/src/lib/storage/ui-state.ts index 9905d1a7d1..9cc2a82640 100644 --- a/frontend/src/lib/storage/ui-state.ts +++ b/frontend/src/lib/storage/ui-state.ts @@ -11,7 +11,7 @@ * - Migration support for format changes */ -const STORAGE_VERSION = 1; +const STORAGE_VERSION = 2; // Incremented to trigger migration to progressive default const STORAGE_KEY_PREFIX = 'rdf-visualizer'; /** @@ -268,10 +268,19 @@ export function clearRecentQueries(): boolean { * Migrate UI state from old version to current */ function migrateUIState(oldState: UIState): UIState { - // Currently only version 1 exists - // Future migrations would go here + // Version 1 -> 2: Reset dataBackend to 'progressive' (new recommended default) + // This ensures all users start with progressive mode after the update + if (oldState.version < 2) { + console.log('Migration v1->v2: Setting dataBackend to progressive (new recommended default)'); + oldState = { + ...oldState, + dataBackend: 'progressive', + }; + } - // For 
now, merge with defaults to add any missing fields + // Future migrations would go here (e.g., version 2 -> 3) + + // Merge with defaults to add any missing fields and update version return deepMerge(DEFAULT_UI_STATE, { ...oldState, version: STORAGE_VERSION }); } diff --git a/frontend/src/pages/ConversationPage.css b/frontend/src/pages/ConversationPage.css index 47e8c64e44..cdacb1778a 100644 --- a/frontend/src/pages/ConversationPage.css +++ b/frontend/src/pages/ConversationPage.css @@ -36,7 +36,8 @@ grid-template-columns: 1fr; gap: 0; height: 100%; - overflow: hidden; + min-height: 0; + overflow: visible; } .conversation-layout--with-viz { @@ -115,6 +116,7 @@ display: flex; flex-direction: column; height: 100%; + min-height: 0; background: white; border-right: 1px solid var(--border-color, #e5e5e5); } @@ -151,6 +153,37 @@ color: #ffd700; } +.conversation-chat__header { + display: flex; + justify-content: space-between; + align-items: center; +} + +.conversation-chat__new-btn { + display: flex; + align-items: center; + gap: 6px; + padding: 8px 14px; + background: rgba(255, 255, 255, 0.15); + border: 1px solid rgba(255, 255, 255, 0.3); + border-radius: 8px; + color: white; + font-size: 0.875rem; + font-weight: 500; + cursor: pointer; + transition: all 0.2s ease; +} + +.conversation-chat__new-btn:hover { + background: rgba(255, 255, 255, 0.25); + border-color: rgba(255, 255, 255, 0.5); +} + +.conversation-chat__new-btn:active { + background: rgba(255, 255, 255, 0.3); + transform: scale(0.98); +} + /* ============================================================================ Input Area ============================================================================ */ @@ -375,6 +408,17 @@ background: #fff5f5; } +.conversation-chat__action-btn--warning { + border-color: #f59e0b; + color: #b45309; +} + +.conversation-chat__action-btn--warning:hover:not(:disabled) { + border-color: #d97706; + color: #d97706; + background: #fffbeb; +} + /* History dropdown */ 
.conversation-chat__history-selector { position: relative; @@ -475,6 +519,7 @@ display: flex; flex-direction: column; gap: 16px; + min-height: 0; } /* Welcome state */ @@ -482,11 +527,12 @@ display: flex; flex-direction: column; align-items: center; - justify-content: center; + justify-content: flex-start; text-align: center; padding: 48px 24px; max-width: 600px; margin: 0 auto; + min-height: min-content; } .conversation-chat__welcome-header { @@ -588,6 +634,7 @@ margin: 0; white-space: pre-wrap; word-wrap: break-word; + color: inherit; } .conversation-message__loading { diff --git a/frontend/src/pages/ConversationPage.tsx b/frontend/src/pages/ConversationPage.tsx index 49c21d0767..30a1e5296c 100644 --- a/frontend/src/pages/ConversationPage.tsx +++ b/frontend/src/pages/ConversationPage.tsx @@ -43,6 +43,8 @@ import { Layers, Database, Zap, + RefreshCw, + Plus, } from 'lucide-react'; import { useLanguage } from '../contexts/LanguageContext'; import { useMultiDatabaseRAG, type RAGResponse, type ConversationMessage, type VisualizationType, type InstitutionData } from '../hooks/useMultiDatabaseRAG'; @@ -110,6 +112,8 @@ const TEXT = { export: { nl: 'Export', en: 'Export' }, import: { nl: 'Import', en: 'Import' }, clear: { nl: 'Wis', en: 'Clear' }, + new: { nl: 'Nieuw', en: 'New' }, + newConversation: { nl: 'Nieuw gesprek starten', en: 'Start new conversation' }, embeddings: { nl: 'Embeddings', en: 'Embeddings' }, advanced: { nl: 'Geavanceerd', en: 'Advanced' }, simple: { nl: 'Eenvoudig', en: 'Simple' }, @@ -1032,6 +1036,15 @@ const ConversationPage: React.FC = () => {

{t('pageSubtitle')}

+ {/* Input Area - Top */} @@ -1183,6 +1196,17 @@ const ConversationPage: React.FC = () => { )} + {/* Reset Cache - Warning Style */} + + {/* Cache Status Indicator */} {lastCacheLookup && (
diff --git a/frontend/src/pages/Database.css b/frontend/src/pages/Database.css index 50fee7bd6e..5a70d6aed4 100644 --- a/frontend/src/pages/Database.css +++ b/frontend/src/pages/Database.css @@ -4,7 +4,7 @@ max-width: none; margin: 0; padding: 1rem 1.5rem; - min-height: calc(100vh - 60px); + padding-bottom: 2rem; animation: fadeIn 0.5s ease-in; } diff --git a/frontend/src/pages/InstitutionBrowserPage.css b/frontend/src/pages/InstitutionBrowserPage.css index 3b2a114c31..7ab25bbffe 100644 --- a/frontend/src/pages/InstitutionBrowserPage.css +++ b/frontend/src/pages/InstitutionBrowserPage.css @@ -250,6 +250,15 @@ font-size: 1.25rem; } +.card-logo { + width: 28px; + height: 28px; + border-radius: 4px; + object-fit: contain; + background: #f5f7fa; + flex-shrink: 0; +} + .type-badge { font-size: 0.7rem; font-weight: 600; @@ -453,6 +462,15 @@ font-size: 2.5rem; } +.modal-logo { + width: 56px; + height: 56px; + border-radius: 8px; + object-fit: contain; + background: #f5f7fa; + flex-shrink: 0; +} + .modal-header h2 { font-size: 1.25rem; font-weight: 600; @@ -800,6 +818,10 @@ color: #a0a0b0; } +[data-theme="dark"] .card-logo { + background: #1e1e32; +} + [data-theme="dark"] .card-title { color: #e0e0e0; } @@ -880,6 +902,10 @@ color: #e0e0e0; } +[data-theme="dark"] .modal-logo { + background: #1e1e32; +} + [data-theme="dark"] .detail-section h4 { color: #a0a0b0; } diff --git a/frontend/src/pages/InstitutionBrowserPage.tsx b/frontend/src/pages/InstitutionBrowserPage.tsx index dd5ac9d61e..cf98733834 100644 --- a/frontend/src/pages/InstitutionBrowserPage.tsx +++ b/frontend/src/pages/InstitutionBrowserPage.tsx @@ -526,11 +526,22 @@ function InstitutionCard({ const typeInfo = TYPE_INFO[institution.type] || TYPE_INFO['U']; const countryCode = institution.ghcid?.current?.substring(0, 2) || ''; const hasNetwork = hasStaffNetworkData(getCustodianSlug(institution.name)); + const [logoError, setLogoError] = useState(false); return (
- {typeInfo.icon} + {/* Logo or type icon */} + {institution.logo_url && !logoError ? ( + setLogoError(true)} + /> + ) : ( + {typeInfo.icon} + )} {typeInfo.name} @@ -587,6 +598,7 @@ function InstitutionDetailModal({ }) { const t = (key: keyof typeof TEXT) => TEXT[key][language]; const typeInfo = TYPE_INFO[institution.type] || TYPE_INFO['U']; + const [logoError, setLogoError] = useState(false); // Close on escape key useEffect(() => { @@ -603,7 +615,17 @@ function InstitutionDetailModal({
- {typeInfo.icon} + {/* Logo or type icon */} + {institution.logo_url && !logoError ? ( + setLogoError(true)} + /> + ) : ( + {typeInfo.icon} + )}

{institution.name}

diff --git a/frontend/src/pages/NDEMapPageMapLibre.tsx b/frontend/src/pages/NDEMapPageMapLibre.tsx index 5cc38ff6b5..e728275f0f 100644 --- a/frontend/src/pages/NDEMapPageMapLibre.tsx +++ b/frontend/src/pages/NDEMapPageMapLibre.tsx @@ -14,7 +14,7 @@ */ import { useEffect, useRef, useState, useMemo, useCallback } from 'react'; -import { useSearchParams } from 'react-router-dom'; +import { useSearchParams, useNavigate } from 'react-router-dom'; import maplibregl from 'maplibre-gl'; import type { StyleSpecification, MapLayerMouseEvent, GeoJSONSource } from 'maplibre-gl'; import 'maplibre-gl/dist/maplibre-gl.css'; @@ -207,6 +207,7 @@ function institutionsToGeoJSON(institutions: Institution[]): GeoJSON.FeatureColl export default function NDEMapPage() { const [searchParams, setSearchParams] = useSearchParams(); + const navigate = useNavigate(); const mapRef = useRef(null); const mapInstanceRef = useRef(null); const [mapReady, setMapReady] = useState(false); @@ -2060,7 +2061,7 @@ export default function NDEMapPage() { {/* Link to settings for backend selection */}

- {t('Wijzig databron in', 'Change data source in')} { e.preventDefault(); /* Open settings modal */ }}>{t('Instellingen', 'Settings')} + {t('Wijzig databron in', 'Change data source in')} { e.preventDefault(); navigate('/settings'); }}>{t('Instellingen', 'Settings')}

diff --git a/frontend/src/types/socialNetwork.ts b/frontend/src/types/socialNetwork.ts index 3f6569bb99..8dc171a246 100644 --- a/frontend/src/types/socialNetwork.ts +++ b/frontend/src/types/socialNetwork.ts @@ -457,10 +457,13 @@ export interface ExtendedProfileData { country?: string; }>; career_history?: Array<{ - organization: string; - role: string; - dates: string; + organization?: string; + company?: string; // Alternative to organization + role?: string; + title?: string; // Alternative to role + dates?: string; duration?: string; + duration_text?: string; // Alternative to duration location?: string; current?: boolean; company_size?: string; diff --git a/frontend/src/utils/dom.ts b/frontend/src/utils/dom.ts new file mode 100644 index 0000000000..707085e774 --- /dev/null +++ b/frontend/src/utils/dom.ts @@ -0,0 +1,52 @@ +/** + * DOM utility functions for safe element operations + */ + +/** + * Safely calls .closest() on an event target. + * Works with HTMLElement, SVGElement, and Text nodes. + * + * @param target - The event target (may be Element, Text node, or null) + * @param selector - CSS selector to match + * @returns The closest matching element, or null if not found + */ +export function safeClosest(target: EventTarget | null, selector: string): Element | null { + if (!target) return null; + + // If target is a Text node or other non-Element, get its parent element + let element: Element | null = null; + + if (target instanceof Element) { + element = target; + } else if (target instanceof Node && target.parentElement) { + // Text nodes, comment nodes, etc. 
+ element = target.parentElement; + } + + if (!element) return null; + + // Now safely call closest() + return element.closest(selector); +} + +/** + * Check if an event target is inside an element matching the selector + * + * @param target - The event target + * @param selector - CSS selector to match + * @returns true if target is inside an element matching selector + */ +export function isTargetInside(target: EventTarget | null, selector: string): boolean { + return safeClosest(target, selector) !== null; +} + +/** + * Check if an event target matches any of the given selectors + * + * @param target - The event target + * @param selectors - Array of CSS selectors to match + * @returns true if target is inside any element matching the selectors + */ +export function isTargetInsideAny(target: EventTarget | null, selectors: string[]): boolean { + return selectors.some(selector => isTargetInside(target, selector)); +} diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts index 2171e276f6..d4e9c8c42e 100644 --- a/frontend/vite.config.ts +++ b/frontend/vite.config.ts @@ -55,6 +55,11 @@ export default defineConfig({ changeOrigin: true, rewrite: (path) => path.replace(/^\/ducklake/, ''), }, + // RAG API proxy (Heritage RAG backend on port 8003) + '/api/rag': { + target: 'http://localhost:8003', + changeOrigin: true, + }, // Generic API fallback '/api': { target: 'http://localhost:8000', diff --git a/pyproject.toml b/pyproject.toml index d25fe15ac3..ca1516aa4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ python = "^3.11" # Core data processing pandas = "^2.1.0" -numpy = "^1.26.0" +numpy = ">=2.0.0" # Text processing (direct dependencies only) # NOTE: NLP extraction (NER) is handled by coding subagents via Task tool @@ -47,7 +47,7 @@ rdflib = "^7.0.0" SPARQLWrapper = "^2.0.0" # Database and storage -duckdb = "^0.9.0" +duckdb = ">=1.0.0" sqlalchemy = "^2.0.0" pyarrow = "^14.0.0" @@ -71,6 +71,9 @@ pydantic-settings = "^2.0.0" # DSPy for 
LLM-powered SPARQL generation dspy-ai = "^2.5.0" openai = "^1.0.0" # DSPy backend for OpenAI/Anthropic +qdrant-client = "^1.16.2" +sentence-transformers = "^5.2.0" +typedb-driver = "^3.0.0" [tool.poetry.group.dev.dependencies] # Testing diff --git a/schemas/20251121/linkml/custodian_source.yaml b/schemas/20251121/linkml/custodian_source.yaml index 6a542297c6..ed77160a10 100644 --- a/schemas/20251121/linkml/custodian_source.yaml +++ b/schemas/20251121/linkml/custodian_source.yaml @@ -88,6 +88,8 @@ enums: description: Entry requires further enrichment processing new_entry: description: Newly added entry not yet enriched + google_maps_searched: + description: Google Maps search attempted but not yet fully enriched InstitutionTypeCodeEnum: description: Single-letter GLAMORCUBESFIXPHDNT type codes @@ -184,6 +186,8 @@ enums: description: LinkedIn profile or company page GHCID_PREVIOUS: description: Previous GHCID before relocation or reorganization + OCLC: + description: OCLC (Online Computer Library Center) identifier LocationResolutionMethodEnum: description: Method used to resolve settlement location @@ -432,6 +436,9 @@ classes: organisatie: range: string description: Organization name from source + organisation: + range: string + description: Organization name from source (British spelling variant) isil_code_na: range: string description: ISIL code from Nationaal Archief @@ -652,10 +659,19 @@ classes: range: string description: Status of Wikidata enrichment for this entry comment: - range: ReferenceLink - multivalued: true + any_of: + - range: string + - range: ReferenceLink + multivalued: true inlined_as_list: true - description: Comments about this entry (array of objects with label field) + description: Comments about this entry (can be a string or array of objects with label field) + comments: + any_of: + - range: string + - range: ReferenceLink + multivalued: true + inlined_as_list: true + description: Comments about this entry (string or array of objects with 
label field) succeeded_by: range: ReferenceLink multivalued: true @@ -668,6 +684,15 @@ classes: label: range: string description: Name/label of the duplicate institution + entry_index: + range: integer + description: Index of the duplicate entry in source data + entry_file: + range: string + description: Filename of the duplicate entry + reason: + range: string + description: Reason why this is considered a duplicate TimeEntry: description: Structured time entry from source data @@ -852,6 +877,11 @@ classes: data_source: range: string description: Data source type (CSV_REGISTRY, API_SCRAPING, etc.) + data_sources: + range: string + multivalued: true + inlined_as_list: true + description: List of data sources (e.g., NDE registry, Google Maps, website) data_tier: range: DataTierEnum description: Quality tier of the data @@ -861,6 +891,12 @@ classes: extraction_method: range: string description: Method used to extract the data + enrichment_date: + range: string + description: When enrichment was performed (ISO date string) + enrichment_method: + range: string + description: Method used to enrich the data (e.g., website_research) confidence_score: range: float description: Confidence score (0-1) @@ -894,6 +930,15 @@ classes: wikidata_property: range: string description: Wikidata property ID (e.g., P856) + archive_location: + range: string + description: Location of archived copy (e.g., web/1186/hartebrug.nl) + claim_extracted_from: + range: string + description: Source path from which claim was extracted (e.g., original_entry.reference) + verified_via_web_archive: + range: boolean + description: Whether claim was verified via web archive ProvenanceSources: description: Sources organized by type @@ -943,6 +988,52 @@ classes: multivalued: true inlined_as_list: true description: Nationaal Archief ISIL registry source records + whois_research: + range: SourceRecord + multivalued: true + inlined_as_list: true + description: WHOIS domain research source records + 
manual_research: + range: SourceRecord + multivalued: true + inlined_as_list: true + description: Manual research source records + website: + range: SourceRecord + multivalued: true + inlined_as_list: true + description: Website source records (institution website data) + web_scrape: + range: SourceRecord + multivalued: true + inlined_as_list: true + description: Web scrape source records (scraped website data) + # Data tier summary fields (for provenance summaries) + TIER_1_AUTHORITATIVE: + range: string + multivalued: true + inlined_as_list: true + description: List of TIER_1 authoritative sources + TIER_2_VERIFIED: + range: string + multivalued: true + inlined_as_list: true + description: List of TIER_2 verified sources + TIER_3_CROWD_SOURCED: + range: string + multivalued: true + inlined_as_list: true + description: List of TIER_3 crowd-sourced sources + TIER_4_INFERRED: + range: string + multivalued: true + inlined_as_list: true + description: List of TIER_4 inferred sources + museum_register: + range: SourceRecord + multivalued: true + inlined_as_list: true + description: Museum register source records SourceRecord: description: Individual source record with claims @@ -1004,6 +1095,20 @@ classes: source_file: range: string description: Source file name + research_date: + range: string + description: Date of research (YYYY-MM-DD format) + url: + range: uri + description: URL of the source (website URL, etc.) 
+ data_extracted: + range: string + multivalued: true + inlined_as_list: true + description: List of data types/fields extracted from this source + merge_note: + range: string + description: Note about merge operations involving this source record DataTierSummary: description: Summary of data tiers present in entry @@ -1034,7 +1139,7 @@ classes: attributes: identifier_scheme: range: IdentifierSchemeEnum - required: true + required: false description: Type of identifier identifier_value: any_of: @@ -1056,6 +1161,14 @@ classes: notes: range: string description: Additional note about this identifier (alias for note) + scheme: + range: string + description: Identifier scheme (alias for identifier_scheme, used in some data sources) + value: + any_of: + - range: string + - range: integer + description: Identifier value (alias for identifier_value, used in some data sources) # --------------------------------------------------------------------------- # GHCID BLOCK - Heritage Custodian ID with history @@ -1285,6 +1398,12 @@ classes: specific_location: range: string description: More specific location info within the city (e.g., neighborhood, district) + specific_geonames_id: + range: integer + description: GeoNames ID for the specific location (if different from main city) + correction_note: + range: string + description: Note explaining any correction made to the location resolution SourceCoordinates: description: Source of coordinates for resolution @@ -1304,13 +1423,19 @@ classes: attributes: type: range: string - description: Type of research source (e.g., note, wikidata, web_archive, official_source) + description: Type of research source (e.g., note, wikidata, web_archive, official_source, whois) text: range: string description: Text or description of the research source + value: + range: string + description: Value from this source (e.g., plus code, address) notes: range: string description: Additional notes about this source + note: + range: string + description: 
Additional note about this source (singular alias for notes) id: range: string description: Identifier for the source (e.g., Wikidata Q-number) @@ -1323,6 +1448,56 @@ classes: coordinates: range: string description: Coordinates from this source (e.g., "31.515, 34.434") + data: + range: ResearchSourceData + description: Structured data from the source (e.g., WHOIS registrant info) + + ResearchSourceData: + description: Structured data from a research source + attributes: + registrant_name: + range: string + description: WHOIS registrant name + registrant_address: + range: string + description: WHOIS registrant address + registrant_city: + range: string + description: WHOIS registrant city + registrant_state: + range: string + description: WHOIS registrant state/province + registrant_country: + range: string + description: WHOIS registrant country + registrant_postal_code: + range: string + description: WHOIS registrant postal code + # Additional flexible fields for other data types + organization: + range: string + description: Organization name + email: + range: string + description: Contact email + phone: + range: string + description: Contact phone + creation_date: + range: string + description: Domain creation date + updated_date: + range: string + description: Domain updated date + expiration_date: + range: string + description: Domain expiration date + domain_registered: + range: string + description: Domain registration date + registry: + range: string + description: Domain registrar name # --------------------------------------------------------------------------- # GOOGLE MAPS ENRICHMENT @@ -1485,8 +1660,10 @@ classes: inlined_as_list: true description: Topics mentioned in reviews reviews_summary: - range: string - description: Summary of reviews + any_of: + - range: string + - range: ReviewsSummary + description: Summary of reviews (string or structured breakdown) sample_reviews: any_of: - range: string @@ -1523,10 +1700,13 @@ classes: inlined_as_list: 
true description: Nearby organizations (strings or structured objects) features: - range: string - multivalued: true + any_of: + - range: string + multivalued: true + - range: PlaceFeature + multivalued: true inlined_as_list: true - description: Features of the place + description: Features of the place (strings or key-value objects) hours_status: range: string description: Current opening status (e.g., "Closed · Opens 2 pm Wed") @@ -1603,6 +1783,23 @@ classes: match_notes: range: string description: Notes about how the Google Maps match was determined + price_level: + any_of: + - range: integer + - range: string + description: Google Maps price level (0-4 or string description) + match_warning: + range: string + description: Warning about potential issues with the match + location_note: + range: string + description: Note about the physical location of the place + search_attempted: + range: boolean + description: Whether a Google Maps search was attempted + result: + range: string + description: Result of search operation (found, not_found, found_via_user_link, etc.) 
RejectedGoogleMapsData: description: Rejected Google Maps data preserved for audit trail @@ -1625,6 +1822,53 @@ classes: returned_country: range: string description: Country code actually returned by Google Maps + website: + range: uri + description: Website URL from Google Maps + latitude: + range: float + description: Latitude coordinate + longitude: + range: float + description: Longitude coordinate + enriched_at: + range: datetime + description: When enrichment was performed + + PlaceFeature: + description: A feature flag for a place (e.g., native_garden, shop, volunteers) + class_uri: schema:PropertyValue + attributes: + native_garden: + range: boolean + description: Has a native garden + shop: + range: boolean + description: Has a shop + volunteers: + range: boolean + description: Has volunteers + parking: + range: boolean + description: Has parking + cafe: + range: boolean + description: Has a cafe + restaurant: + range: boolean + description: Has a restaurant + gift_shop: + range: boolean + description: Has a gift shop + wheelchair_accessible: + range: boolean + description: Is wheelchair accessible + guided_tours: + range: boolean + description: Offers guided tours + audio_guide: + range: boolean + description: Offers audio guides LlmVerification: description: LLM-based verification results for Google Maps matching @@ -1709,6 +1953,25 @@ classes: minute: range: integer + ReviewsSummary: + description: Breakdown of reviews by star rating + attributes: + 5_star: + range: integer + description: Number of 5-star reviews + 4_star: + range: integer + description: Number of 4-star reviews + 3_star: + range: integer + description: Number of 3-star reviews + 2_star: + range: integer + description: Number of 2-star reviews + 1_star: + range: integer + description: Number of 1-star reviews + GoogleReview: description: Google Maps review attributes: @@ -1828,8 +2091,10 @@ classes: wikidata_temporal: range: WikidataTemporal wikidata_inception: - range: string - 
description: Inception date (P571) + any_of: + - range: string + - range: WikidataTimeValue + description: Inception date (P571) - can be string or structured time value wikidata_classification: range: WikidataClassification wikidata_instance_of: @@ -1946,6 +2211,29 @@ classes: multivalued: true inlined_as_list: true description: Search terms attempted when looking for Wikidata entity + wikidata_description_nl: + range: string + description: Description in Dutch language + wikidata_claims: + range: WikidataClaims + description: Structured Wikidata claims with property metadata + inlined: true + _resolved_entities: + range: WikidataResolvedEntities + description: Resolved Wikidata property and entity metadata cache + inlined: true + + WikidataClaims: + description: | + Structured Wikidata claims with property metadata and values. + Uses flexible dict-like structure for various claim types. + class_uri: linkml:Any + + WikidataResolvedEntities: + description: | + Cache of resolved Wikidata property and entity metadata. + Keys are property IDs (P123), values are property metadata. + class_uri: linkml:Any WikidataApiMetadata: description: API call metadata @@ -2058,6 +2346,19 @@ classes: inlined_as_list: true description: Main subject (P921) + WikidataTimeValue: + description: Wikidata time value with precision metadata + attributes: + time: + range: string + description: Time value in ISO 8601 format (e.g., +2015-00-00T00:00:00Z) + precision: + range: integer + description: Precision level (9=year, 10=month, 11=day, etc.) 
+ calendarmodel: + range: uri + description: Calendar model URI (e.g., http://www.wikidata.org/entity/Q1985727 for Gregorian) + WikidataEntity: description: Reference to a Wikidata entity attributes: @@ -2104,7 +2405,10 @@ classes: description: Location properties from Wikidata attributes: country: - range: WikidataEntity + any_of: + - range: string + - range: WikidataEntity + description: Country Q-ID (can be string or WikidataEntity object) headquarters_location: range: WikidataEntity coordinates: @@ -2158,8 +2462,10 @@ classes: multivalued: true inlined_as_list: true parent_organization: - range: WikidataEntity - description: Parent organization (P749) + any_of: + - range: string + - range: WikidataEntity + description: Parent organization Q-ID or entity (P749) subsidiary: range: WikidataEntity multivalued: true @@ -2433,6 +2739,9 @@ classes: website_found: range: boolean description: Whether a website was found + official_website: + range: uri + description: Official website URL found during research research_notes: range: string description: Notes from research @@ -2504,6 +2813,12 @@ classes: merger_target: range: string description: Target organization in merger + successor_name: + range: string + description: Name of successor organization (for mergers) + successor_location: + range: string + description: Location of successor organization (for mergers) notes: range: string description: Additional notes @@ -2552,6 +2867,16 @@ classes: type: range: string description: Type of collection (oral_history, photographs, documents, etc.) 
+ item_count: + any_of: + - range: integer + - range: string + description: Number of items in the collection (integer or descriptive string) + total_hours: + any_of: + - range: float + - range: string + description: Total hours of content (for audio/video collections) WebArchiveFailure: description: Failed archive attempt record @@ -2682,7 +3007,8 @@ classes: - range: string - range: string multivalued: true - description: Extracted value (alias for claim_value, can be string or list) + - range: OpeningHoursMap + description: Extracted value (alias for claim_value, can be string, list, or structured object like opening hours) raw_value: range: string description: Raw value before processing @@ -2807,6 +3133,9 @@ classes: job_title_en: range: string description: Job title in English + department_en: + range: string + description: Department name in English RawSource: description: Raw source information for web enrichment @@ -2838,6 +3167,63 @@ classes: raw_markdown_hash: range: string description: SHA-256 hash of the raw markdown content + exa_highlights: + range: string + multivalued: true + inlined_as_list: true + description: Highlighted excerpts from Exa search results + exa_highlight_scores: + range: float + multivalued: true + inlined_as_list: true + description: Relevance scores for Exa highlights + + OpeningHoursMap: + description: Opening hours as a day-keyed map + class_uri: schema:OpeningHoursSpecification + attributes: + maandag: + range: string + description: Monday hours (Dutch) + dinsdag: + range: string + description: Tuesday hours (Dutch) + woensdag: + range: string + description: Wednesday hours (Dutch) + donderdag: + range: string + description: Thursday hours (Dutch) + vrijdag: + range: string + description: Friday hours (Dutch) + zaterdag: + range: string + description: Saturday hours (Dutch) + zondag: + range: string + description: Sunday hours (Dutch) + monday: + range: string + description: Monday hours (English) + tuesday: + range: string + 
description: Tuesday hours (English) + wednesday: + range: string + description: Wednesday hours (English) + thursday: + range: string + description: Thursday hours (English) + friday: + range: string + description: Friday hours (English) + saturday: + range: string + description: Saturday hours (English) + sunday: + range: string + description: Sunday hours (English) SourceReference: description: Structured source reference for a claim @@ -2961,8 +3347,12 @@ classes: range: string description: Note explaining manual correction made to the name merge_notes: - range: string - description: Notes about name merging or deduplication + any_of: + - range: string + - range: MergeNote + multivalued: true + inlined_as_list: true + description: Notes about name merging or deduplication (string or array of structured objects) abbreviation: range: string description: Short form or abbreviation of the name @@ -2980,6 +3370,9 @@ classes: multivalued: true inlined_as_list: true description: Previous names the institution was known by (strings or structured objects) + short_name: + range: string + description: Short name or commonly used abbreviated form of the institution name FormerName: description: A former name of the institution with optional metadata @@ -3001,6 +3394,19 @@ classes: range: string description: Additional notes about this former name + MergeNote: + description: Note about a merge operation between duplicate entries + attributes: + source: + range: string + description: Source entry identifier that was merged + merged_on: + range: string + description: Date when merge occurred (YYYY-MM-DD) + reason: + range: string + description: Reason for the merge (e.g., duplicate Wikidata ID, same place ID) + MatchingSource: description: Source that contributed to name consensus attributes: @@ -3290,6 +3696,25 @@ classes: enrichment_source: range: string description: Source of enrichment (e.g., manual_curation, api_scraping) + host_organization: + range: string + 
description: Organization hosting this platform + host_website: + range: uri + description: Main website of the host organization + language: + range: string + description: Primary language of the platform (ISO 639-1 code) + features: + range: string + multivalued: true + inlined_as_list: true + description: Features of this platform + platforms: + range: string + multivalued: true + inlined_as_list: true + description: Sub-platforms or related platforms PlatformSourceReference: description: Structured source reference for a digital platform @@ -3465,6 +3890,12 @@ classes: override_reason: range: string description: Reason for manual coordinate override + source_url: + range: uri + description: URL source of coordinates (e.g., Google Maps link) + note: + range: string + description: Additional note about coordinate provenance # --------------------------------------------------------------------------- # ADDITIONAL ENRICHMENT TYPES diff --git a/scripts/load_typedb_schema.py b/scripts/load_typedb_schema.py index b93f100fb8..cb0594010e 100755 --- a/scripts/load_typedb_schema.py +++ b/scripts/load_typedb_schema.py @@ -1,106 +1,355 @@ #!/usr/bin/env python3 """ -Load TypeDB schemas from files into the glam-heritage database +Load Heritage Custodian schema into TypeDB 3.x. + +This script loads the Heritage Custodian Observation & Reconstruction schema +into TypeDB. The schema must be loaded in parts due to TypeDB 3.x requirements: +1. Attributes first (before entities can reference them) +2. Relations second (before entities can play roles) +3. 
Entities third (can now reference attributes and play roles) + +Usage: + python scripts/load_typedb_schema.py [--host HOST] [--port PORT] [--database DATABASE] + +Prerequisites: + - TypeDB server running (default: localhost:1729) + - TypeDB Python driver installed (typedb-driver >= 3.0.0) + - Database will be created if it doesn't exist + +Example: + # Start TypeDB server + ~/.typedb/typedb server + + # Load schema + poetry run python scripts/load_typedb_schema.py + +TypeDB 3.x Migration Notes: + - Uses Credentials and DriverOptions (not core_driver) + - Uses TransactionType.SCHEMA (not SessionType.SCHEMA + TransactionType.WRITE) + - tx.query().resolve() instead of tx.query.define() + - No sessions - transactions are created directly on driver + - 'entity' is a reserved word - renamed to 'observed-entity' in observation-of relation """ -import os -from pathlib import Path -from typedb.driver import TypeDB, SessionType, TransactionType -# Configuration -SERVER_ADDRESS = "localhost:1729" -DATABASE_NAME = "glam-heritage" -SCHEMA_DIR = Path("/Users/kempersc/apps/glam/schemas/20251121/typedb") +import argparse +import sys -# Schema files in order -SCHEMA_FILES = [ - "01_name_entity_hub.tql", - "02_heritage_custodian.tql", - "03_identifiers.tql", - "04_locations.tql", - "05_digital_platforms.tql", - "06_provenance.tql", - "07_collections.tql", - "08_relationships.tql", - "09_change_events.tql", - "10_rules.tql", -] -def load_schema(): - """Load TypeDB schema files into the database""" - print(f"🔗 Connecting to TypeDB at {SERVER_ADDRESS}...") +def get_schema_parts(): + """Return the schema split into loadable parts. 
- with TypeDB.core_driver(SERVER_ADDRESS) as driver: - # Check if database exists - if not driver.databases.contains(DATABASE_NAME): - print(f"❌ Database '{DATABASE_NAME}' does not exist!") - print(f" Create it first: typedb console --command='database create {DATABASE_NAME}'") - return False - - print(f"✅ Connected to database: {DATABASE_NAME}") - print(f"📂 Schema directory: {SCHEMA_DIR}") - print() - - # Load each schema file - for schema_file in SCHEMA_FILES: - schema_path = SCHEMA_DIR / schema_file - - if not schema_path.exists(): - print(f"⚠️ Skipping {schema_file} (file not found)") - continue - - print(f"📝 Loading {schema_file}...") - - try: - with driver.session(DATABASE_NAME, SessionType.SCHEMA) as session: - with session.transaction(TransactionType.WRITE) as tx: - # Read schema file - with open(schema_path, 'r') as f: - schema_content = f.read() - - # Execute TypeQL define query - tx.query.define(schema_content) - tx.commit() - - print(f" ✅ Successfully loaded {schema_file}") - - except Exception as e: - print(f" ❌ Error loading {schema_file}: {e}") - return False - - print() - print("🎉 All schemas loaded successfully!") - return True - -def verify_schema(): - """Verify the loaded schema""" - print("\n🔍 Verifying schema...") + The schema is split into 3 parts that must be loaded in order: + 1. Attributes - all attribute type definitions + 2. Relations - all relation type definitions with role types + 3. Entities - all entity type definitions with owns/plays - with TypeDB.core_driver(SERVER_ADDRESS) as driver: - with driver.session(DATABASE_NAME, SessionType.SCHEMA) as session: - with session.transaction(TransactionType.READ) as tx: - # Get all entity types - result = tx.query.fetch("match $x sub entity; fetch $x;") - entities = list(result) - - print(f"✅ Found {len(entities)} entity types") - - # Sample a few - for i, entity in enumerate(entities[:5]): - print(f" - {entity}") - - if len(entities) > 5: - print(f" ... 
and {len(entities) - 5} more") + Note: The original .tql file uses 'entity' as a role name in observation-of, + but 'entity' is a reserved word in TypeDB 3.x. This is fixed by renaming + the role to 'observed-entity'. + """ + + # Part 1: Attributes + attributes = """ +define +attribute id, value string; +attribute created, value datetime; +attribute modified, value datetime; +attribute observed-name, value string; +attribute alternative-observed-name, value string; +attribute observation-date, value datetime; +attribute observation-context, value string; +attribute standardized-name, value string; +attribute endorsement-source, value string; +attribute name-authority, value string; +attribute valid-from, value datetime; +attribute valid-to, value datetime; +attribute legal-name, value string; +attribute legal-form, value string; +attribute registration-number, value string; +attribute registration-date, value datetime; +attribute registration-authority, value string; +attribute dissolution-date, value datetime; +attribute legal-status, value string; +attribute governance-structure, value string; +attribute source-uri, value string; +attribute source-type, value string; +attribute source-date, value datetime; +attribute source-creator, value string; +attribute activity-type, value string; +attribute method, value string; +attribute justification, value string; +attribute started-at-time, value datetime; +attribute ended-at-time, value datetime; +attribute agent-name, value string; +attribute agent-type, value string; +attribute affiliation, value string; +attribute contact, value string; +attribute appellation-value, value string; +attribute appellation-language, value string; +attribute appellation-type, value string; +attribute identifier-scheme, value string; +attribute identifier-value, value string; +attribute begin-of-the-begin, value datetime; +attribute begin-of-the-end, value datetime; +attribute end-of-the-begin, value datetime; +attribute end-of-the-end, 
value datetime; +attribute confidence-value, value double; +attribute confidence-method, value string; +attribute language-code-value, value string; +""" + + # Part 2: Relations + relations = """ +define +relation derivation, + relates derived-entity, + relates source-entity; + +relation generation, + relates generated-entity, + relates generating-activity; + +relation revision, + relates revised-entity, + relates prior-version; + +relation activity-association, + relates activity, + relates agent; + +relation activity-usage, + relates activity, + relates used-source; + +relation source-citation, + relates observation, + relates source; + +relation organizational-hierarchy, + relates parent, + relates child; + +relation name-succession, + relates predecessor, + relates successor; + +relation has-appellation, + relates subject, + relates appellation; + +relation has-identifier, + relates subject, + relates identifier; + +relation observation-of, + relates observation, + relates observed-entity; +""" + + # Part 3: Entities + entities = """ +define +entity custodian @abstract, + owns id, + owns created, + owns modified, + plays derivation:derived-entity, + plays derivation:source-entity, + plays generation:generated-entity, + plays observation-of:observation, + plays observation-of:observed-entity; + +entity custodian-observation sub custodian, + owns observed-name, + owns alternative-observed-name, + owns observation-date, + owns observation-context, + owns confidence-value, + owns confidence-method, + plays source-citation:observation, + plays has-appellation:subject; + +entity custodian-name sub custodian-observation, + owns standardized-name, + owns endorsement-source, + owns name-authority, + owns valid-from, + owns valid-to, + plays name-succession:predecessor, + plays name-succession:successor; + +entity custodian-reconstruction sub custodian, + owns legal-name, + owns legal-form, + owns registration-number, + owns registration-date, + owns 
registration-authority, + owns dissolution-date, + owns legal-status, + owns governance-structure, + plays has-identifier:subject, + plays organizational-hierarchy:parent, + plays organizational-hierarchy:child, + plays revision:revised-entity, + plays revision:prior-version; + +entity source-document, + owns id, + owns source-uri, + owns source-type, + owns source-date, + owns source-creator, + plays source-citation:source, + plays activity-usage:used-source; + +entity reconstruction-activity, + owns id, + owns activity-type, + owns method, + owns justification, + owns started-at-time, + owns ended-at-time, + plays generation:generating-activity, + plays activity-association:activity, + plays activity-usage:activity; + +entity agent, + owns id, + owns agent-name, + owns agent-type, + owns affiliation, + owns contact, + plays activity-association:agent; + +entity appellation, + owns appellation-value, + owns appellation-language, + owns appellation-type, + plays has-appellation:appellation; + +entity identifier, + owns identifier-scheme, + owns identifier-value, + plays has-identifier:identifier; + +entity time-span, + owns begin-of-the-begin, + owns begin-of-the-end, + owns end-of-the-begin, + owns end-of-the-end; + +entity confidence-measure, + owns confidence-value, + owns confidence-method; + +entity language-code, + owns language-code-value; +""" + + return [ + ("Attributes", attributes), + ("Relations", relations), + ("Entities", entities), + ] + + +def load_schema(host: str = "localhost", port: int = 1729, database: str = "heritage_custodians"): + """Load the Heritage Custodian schema into TypeDB. + + Args: + host: TypeDB server host + port: TypeDB server port + database: Database name (will be created if doesn't exist) + """ + try: + from typedb.driver import TypeDB, Credentials, DriverOptions, TransactionType + except ImportError: + print("Error: typedb-driver not installed. 
Run: poetry add typedb-driver") + sys.exit(1) + + # Connect to TypeDB + address = f"{host}:{port}" + credentials = Credentials("admin", "password") + options = DriverOptions(is_tls_enabled=False) + + print(f"Connecting to TypeDB at {address}...") + + try: + driver = TypeDB.driver(address, credentials, options) + except Exception as e: + print(f"Error connecting to TypeDB: {e}") + print("Make sure TypeDB server is running: ~/.typedb/typedb server") + sys.exit(1) + + # Check if database exists + db_names = [db.name for db in driver.databases.all()] + if database not in db_names: + print(f"Database '{database}' not found. Creating...") + driver.databases.create(database) + print(f"Created database '{database}'") + else: + print(f"Using existing database '{database}'") + + # Load schema parts + print() + schema_parts = get_schema_parts() + + for name, schema in schema_parts: + print(f"Loading {name}...") + try: + with driver.transaction(database, TransactionType.SCHEMA) as tx: + tx.query(schema).resolve() + tx.commit() + print(f" ✓ {name} loaded successfully") + except Exception as e: + error_msg = str(e) + if "already exists" in error_msg.lower() or "redefinition" in error_msg.lower(): + print(f" ⚠ {name} already exists (skipping)") + else: + print(f" ✗ Error loading {name}: {e}") + driver.close() + sys.exit(1) + + # Verify schema loaded + print("\nVerifying schema...") + with driver.transaction(database, TransactionType.READ) as tx: + for type_name in ["custodian-observation", "custodian-name", "custodian-reconstruction"]: + count_query = f""" + match + $inst isa {type_name}; + reduce $count = count; + """ + answer = tx.query(count_query).resolve() + for row in answer: + value = row.get("count") + count = value.get_integer() if hasattr(value, 'get_integer') else int(str(value)) + print(f" {type_name}: {count} entities") + break + + driver.close() + print("\n✓ Schema loaded successfully!") + print(f"\nDatabase '{database}' is ready for data ingestion.") + + +def 
main(): + parser = argparse.ArgumentParser( + description="Load Heritage Custodian schema into TypeDB 3.x" + ) + parser.add_argument( + "--host", default="localhost", help="TypeDB server host (default: localhost)" + ) + parser.add_argument( + "--port", type=int, default=1729, help="TypeDB server port (default: 1729)" + ) + parser.add_argument( + "--database", + default="heritage_custodians", + help="Database name (default: heritage_custodians)", + ) + + args = parser.parse_args() + load_schema(host=args.host, port=args.port, database=args.database) + if __name__ == "__main__": - print("=" * 60) - print("TypeDB Schema Loader") - print("=" * 60) - print() - - success = load_schema() - - if success: - verify_schema() - - print() - print("=" * 60) + main() diff --git a/scripts/test_pico_arabic_waqf.py b/scripts/test_pico_arabic_waqf.py new file mode 100644 index 0000000000..2f144a05ab --- /dev/null +++ b/scripts/test_pico_arabic_waqf.py @@ -0,0 +1,472 @@ +#!/usr/bin/env python3 +""" +Test PiCo extraction with Arabic waqf (endowment) document example. + +This script tests the GLM annotator's ability to extract person observations +from Arabic historical documents following the PiCo ontology pattern. 
+ +Usage: + python scripts/test_pico_arabic_waqf.py + +Environment Variables: + ZAI_API_TOKEN - Required for Z.AI GLM-4.6 API +""" + +import asyncio +import json +import os +import sys +from pathlib import Path +from datetime import datetime, timezone + +import httpx + +# Load environment variables from .env file +project_root = Path(__file__).parent.parent +sys.path.insert(0, str(project_root)) + +try: + from dotenv import load_dotenv + load_dotenv(project_root / ".env") +except ImportError: + pass # dotenv not required if env vars set directly + + +# Z.AI API configuration (per AGENTS.md Rule 11) +# GLM-4.6 uses reasoning mode - essential for complex historical document extraction +# Requires higher max_tokens to accommodate reasoning + output +ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions" +ZAI_MODEL = "glm-4.6" + + +# Arabic waqf document example (from pico.yaml) +ARABIC_WAQF_TEXT = """بسم الله الرحمن الرحيم +هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة +حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري. وقف جميع داره الكائنة +بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح +الله فعلى فقراء المسلمين. وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف +التركماني، والسيد علي بن حسين الحلبي. وكتب في شهر رجب سنة ألف ومائتين +وخمس وعشرين هجرية.""" + + +# PiCo extraction system prompt (abbreviated version for testing) +PICO_SYSTEM_PROMPT = """You are a historical document annotator following the PiCo (Person in Context) ontology. + +Extract ALL persons mentioned in the source text, capturing: +1. Names using PNV (Person Name Vocabulary) structure +2. Roles in the source document +3. Biographical information +4. Family relationships between persons in THIS source +5. 
For Arabic texts: include both original script AND romanized versions + +### Arabic Naming Conventions +- ابن/بن (ibn/bin): son of (patronymic) +- بنت (bint): daughter of +- الحاج (al-Hajj): honorific for pilgrimage completer +- السيد (al-Sayyid): honorific (descendant of Prophet) +- المرحوم (al-marhum): the late (deceased male) +- آل (Al): family of + +### Family Relationship Keys +- parent: array of person references (person_index + target_name) +- children: array of person references +- spouse: array of person references + +### Output Format +Return ONLY valid JSON: + +{ + "pico_observation": { + "observation_id": "", + "observed_at": "", + "source_type": "", + "source_reference": "" + }, + "persons": [ + { + "person_index": 0, + "pnv_name": { + "literalName": "Name in original script", + "literalName_romanized": "Romanized name", + "givenName": "Given name", + "givenName_romanized": "Romanized given name", + "patronym": "Father's name", + "patronym_romanized": "Romanized patronym", + "baseSurname": "Family/tribal name", + "baseSurname_romanized": "Romanized surname", + "honorificPrefix": "Title/honorific", + "honorificPrefix_romanized": "Romanized honorific" + }, + "roles": [ + { + "role_title": "Role as stated", + "role_title_romanized": "Romanized role", + "role_in_source": "founder|witness|beneficiary|null" + } + ], + "biographical": { + "deceased": true/false/null, + "address": "Location if mentioned" + }, + "family_relationships": { + "parent": [{"person_index": N, "target_name": "Name"}], + "children": [{"person_index": N, "target_name": "Name"}] + }, + "context": "Brief description of person's role" + } + ], + "temporal_references": [ + { + "expression": "Original text", + "expression_romanized": "Romanized", + "normalized": "ISO date or approximate", + "calendar": "Hijri|Gregorian", + "type": "DATE" + } + ], + "locations_mentioned": [ + { + "name": "Original name", + "name_romanized": "Romanized", + "type": "city|neighborhood" + } + ] +}""" + + +async 
def call_glm_api(system_prompt: str, user_content: str) -> dict: + """Call Z.AI GLM-4.6 API and return parsed JSON response.""" + api_token = os.environ.get("ZAI_API_TOKEN") + if not api_token: + raise ValueError("ZAI_API_TOKEN not set in environment") + + headers = { + "Authorization": f"Bearer {api_token}", + "Content-Type": "application/json", + } + + payload = { + "model": ZAI_MODEL, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_content}, + ], + "temperature": 0.1, # Low temperature for consistent extraction + "max_tokens": 16000, # High limit for GLM-4.6 reasoning mode + output + } + + async with httpx.AsyncClient(timeout=300.0) as client: # 5 min timeout for GLM-4.6 reasoning + response = await client.post(ZAI_API_URL, headers=headers, json=payload) + response.raise_for_status() + + result = response.json() + content = result["choices"][0]["message"]["content"] + + # Save raw response for debugging + raw_output_path = project_root / "data/entity_annotation/test_outputs" + raw_output_path.mkdir(parents=True, exist_ok=True) + raw_file = raw_output_path / f"raw_response_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt" + with open(raw_file, 'w', encoding='utf-8') as f: + f.write(content) + print(f" Raw response saved to: {raw_file.name}") + + # Parse JSON from response (handle markdown code blocks) + json_content = content + if "```json" in content: + json_content = content.split("```json")[1].split("```")[0] + elif "```" in content: + parts = content.split("```") + if len(parts) >= 2: + json_content = parts[1] + + # Try to parse, with fallback for truncated JSON + try: + return json.loads(json_content.strip()) + except json.JSONDecodeError as e: + print(f"\n⚠️ JSON parse error at position {e.pos}, attempting repair...") + # Try to repair truncated JSON by closing brackets + repaired = repair_truncated_json(json_content.strip()) + return json.loads(repaired) + + +def repair_truncated_json(json_str: str) -> str: 
+ """Attempt to repair truncated JSON by closing open brackets.""" + import re + + # Count open/close brackets + open_braces = json_str.count('{') - json_str.count('}') + open_brackets = json_str.count('[') - json_str.count(']') + + # Check if we're in the middle of a string + # Find position of last complete key-value pair + last_comma = json_str.rfind(',') + last_colon = json_str.rfind(':') + + if last_colon > last_comma: + # We're in the middle of a value, try to find a safe truncation point + # Look for the last complete object or array element + safe_pos = last_comma + if safe_pos > 0: + json_str = json_str[:safe_pos] + # Recount brackets after truncation + open_braces = json_str.count('{') - json_str.count('}') + open_brackets = json_str.count('[') - json_str.count(']') + + # Close open brackets + json_str = json_str.rstrip() + + # Remove trailing comma if present + if json_str.endswith(','): + json_str = json_str[:-1] + + # Add closing brackets + json_str += ']' * open_brackets + json_str += '}' * open_braces + + return json_str + + +def validate_extraction(result: dict) -> tuple[bool, list[str]]: + """Validate the extraction result against expected structure.""" + errors = [] + + # Check top-level structure + if "pico_observation" not in result: + errors.append("Missing 'pico_observation' field") + if "persons" not in result: + errors.append("Missing 'persons' field") + + if "persons" in result: + persons = result["persons"] + + # Check minimum person count (should be at least 4: founder, father, 2 witnesses) + if len(persons) < 4: + errors.append(f"Expected at least 4 persons, got {len(persons)}") + + # Check person structure + for i, person in enumerate(persons): + if "person_index" not in person: + errors.append(f"Person {i}: missing 'person_index'") + if "pnv_name" not in person: + errors.append(f"Person {i}: missing 'pnv_name'") + elif "literalName" not in person["pnv_name"]: + errors.append(f"Person {i}: missing 'literalName' in pnv_name") + + # Check 
for specific expected persons + names = [p.get("pnv_name", {}).get("literalName_romanized", "") for p in persons] + names_lower = [n.lower() for n in names] + + if not any("ahmad" in n for n in names_lower): + errors.append("Missing founder: Ahmad ibn Muhammad al-'Umari") + if not any("ibrahim" in n for n in names_lower): + errors.append("Missing witness: Ibrahim ibn Yusuf al-Turkmani") + if not any("ali" in n for n in names_lower): + errors.append("Missing witness: Ali ibn Husayn al-Halabi") + + # Check temporal reference + if "temporal_references" in result and result["temporal_references"]: + temp = result["temporal_references"][0] + if "calendar" in temp and temp["calendar"] != "Hijri": + errors.append(f"Expected Hijri calendar, got {temp.get('calendar')}") + + # Check locations + if "locations_mentioned" in result: + loc_names = [l.get("name_romanized", "").lower() for l in result["locations_mentioned"]] + if not any("aleppo" in n or "halab" in n for n in loc_names): + errors.append("Missing location: Aleppo (حلب)") + + return len(errors) == 0, errors + + +async def test_arabic_waqf_extraction(): + """Test PiCo extraction from Arabic waqf document.""" + print("\n" + "=" * 70) + print("TEST: PiCo Arabic Waqf Document Extraction") + print("=" * 70) + + # Check API token + if not os.environ.get("ZAI_API_TOKEN"): + print("\n⚠️ SKIPPED: ZAI_API_TOKEN not set") + print("Set it with: export ZAI_API_TOKEN=") + return None + + print(f"\nModel: {ZAI_MODEL}") + print(f"API: {ZAI_API_URL}") + + # Prepare user prompt + user_prompt = f"""Extract all persons, relationships, dates, and locations from this Arabic waqf (endowment) document: + +{ARABIC_WAQF_TEXT} + +This is a historical Islamic endowment document from Aleppo. 
Extract all information following the PiCo ontology pattern.""" + + print("\n" + "-" * 40) + print("SOURCE TEXT (Arabic Waqf Document)") + print("-" * 40) + print(ARABIC_WAQF_TEXT[:200] + "...") + + # Call API + print("\n⏳ Calling GLM-4.6 API (this may take 30-60 seconds)...") + + try: + start_time = datetime.now(timezone.utc) + result = await call_glm_api(PICO_SYSTEM_PROMPT, user_prompt) + end_time = datetime.now(timezone.utc) + duration = (end_time - start_time).total_seconds() + + print(f"✅ API call completed in {duration:.1f}s") + + except httpx.HTTPStatusError as e: + print(f"\n❌ API Error: {e.response.status_code}") + print(f"Response: {e.response.text[:500]}") + return False + except json.JSONDecodeError as e: + print(f"\n❌ JSON Parse Error: {e}") + return False + except Exception as e: + print(f"\n❌ Error: {type(e).__name__}: {e}") + return False + + # Display results + print("\n" + "-" * 40) + print("EXTRACTION RESULTS") + print("-" * 40) + + # PiCo observation metadata + if "pico_observation" in result: + obs = result["pico_observation"] + print(f"\n📋 Observation ID: {obs.get('observation_id', 'N/A')}") + print(f" Source Type: {obs.get('source_type', 'N/A')}") + print(f" Source Ref: {obs.get('source_reference', 'N/A')}") + + # Persons extracted + persons = result.get("persons", []) + print(f"\n👥 Persons Extracted: {len(persons)}") + + for person in persons: + idx = person.get("person_index", "?") + name = person.get("pnv_name", {}) + lit_name = name.get("literalName", "") + rom_name = name.get("literalName_romanized", "") + + print(f"\n [{idx}] {lit_name}") + if rom_name: + print(f" Romanized: {rom_name}") + + # Honorific + if name.get("honorificPrefix"): + hon = name.get("honorificPrefix", "") + hon_rom = name.get("honorificPrefix_romanized", "") + print(f" Honorific: {hon} ({hon_rom})") + + # Patronym + if name.get("patronym"): + pat = name.get("patronym", "") + pat_rom = name.get("patronym_romanized", "") + print(f" Patronym: {pat} ({pat_rom})") + + # 
Roles + roles = person.get("roles", []) + for role in roles: + role_title = role.get("role_title", "") + role_rom = role.get("role_title_romanized", "") + role_in_src = role.get("role_in_source", "") + if role_title or role_in_src: + print(f" Role: {role_title} ({role_rom}) - {role_in_src}") + + # Biographical + bio = person.get("biographical", {}) + if bio.get("deceased"): + print(f" Status: Deceased (المرحوم)") + if bio.get("address"): + print(f" Address: {bio.get('address')}") + + # Family relationships + fam = person.get("family_relationships", {}) + if fam.get("parent"): + parents = [p.get("target_name", "") for p in fam["parent"]] + print(f" Parents: {', '.join(parents)}") + if fam.get("children"): + children = [c.get("target_name", "") for c in fam["children"]] + print(f" Children: {', '.join(children)}") + + # Context + if person.get("context"): + print(f" Context: {person.get('context')}") + + # Temporal references + temps = result.get("temporal_references", []) + if temps: + print(f"\n📅 Temporal References: {len(temps)}") + for temp in temps: + expr = temp.get("expression", "") + expr_rom = temp.get("expression_romanized", "") + norm = temp.get("normalized", "") + cal = temp.get("calendar", "") + print(f" {expr}") + if expr_rom: + print(f" → {expr_rom}") + print(f" → Normalized: {norm} ({cal})") + + # Locations + locs = result.get("locations_mentioned", []) + if locs: + print(f"\n📍 Locations: {len(locs)}") + for loc in locs: + name = loc.get("name", "") + name_rom = loc.get("name_romanized", "") + loc_type = loc.get("type", "") + print(f" {name} ({name_rom}) - {loc_type}") + + # Validate results + print("\n" + "-" * 40) + print("VALIDATION") + print("-" * 40) + + is_valid, errors = validate_extraction(result) + + if is_valid: + print("\n✅ All validations passed!") + else: + print(f"\n⚠️ Validation issues ({len(errors)}):") + for error in errors: + print(f" - {error}") + + # Save result to file for inspection + output_path = project_root / 
"data/entity_annotation/test_outputs" + output_path.mkdir(parents=True, exist_ok=True) + output_file = output_path / f"arabic_waqf_extraction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + print(f"\n💾 Full result saved to: {output_file.relative_to(project_root)}") + + # Final verdict + print("\n" + "=" * 70) + if is_valid: + print("✅ TEST PASSED: Arabic waqf extraction successful") + else: + print("⚠️ TEST COMPLETED WITH WARNINGS: Check validation issues above") + print("=" * 70) + + return is_valid + + +async def main(): + """Run the test.""" + print("\n" + "#" * 70) + print("# PiCo ARABIC WAQF EXTRACTION TEST") + print("# Testing GLM-4.6 reasoning mode with historical Arabic document") + print("#" * 70) + + result = await test_arabic_waqf_extraction() + + if result is None: + return 0 # Skipped (no API key) + return 0 if result else 1 + + +if __name__ == "__main__": + exit_code = asyncio.run(main()) + sys.exit(exit_code) diff --git a/scripts/test_pico_batch.py b/scripts/test_pico_batch.py new file mode 100644 index 0000000000..38654b99c0 --- /dev/null +++ b/scripts/test_pico_batch.py @@ -0,0 +1,786 @@ +#!/usr/bin/env python3 +""" +Batch test runner for PiCo (Person in Context) extraction across multiple document types. + +This script tests GLM-4.6 reasoning mode extraction from various historical document types: +1. Arabic Waqf (Islamic endowment) +2. Hebrew Ketubah (Jewish marriage contract) +3. Spanish Colonial Baptism +4. Dutch Marriage Certificate +5. 
Latin Notarial Protocol + +Usage: + python scripts/test_pico_batch.py [--test-name NAME] [--all] [--list] + +Examples: + python scripts/test_pico_batch.py --all # Run all tests + python scripts/test_pico_batch.py --test-name arabic # Run only Arabic waqf test + python scripts/test_pico_batch.py --list # List available tests + +Environment Variables: + ZAI_API_TOKEN - Required for Z.AI GLM-4.6 API +""" + +import asyncio +import argparse +import json +import os +import sys +from pathlib import Path +from datetime import datetime, timezone +from dataclasses import dataclass +from typing import Optional + +import httpx + +# Load environment variables from .env file +project_root = Path(__file__).parent.parent +sys.path.insert(0, str(project_root)) + +try: + from dotenv import load_dotenv + load_dotenv(project_root / ".env") +except ImportError: + pass + + +# ============================================================================= +# API Configuration +# ============================================================================= + +ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions" +ZAI_MODEL = "glm-4.6" +MAX_TOKENS = 16000 # High limit for GLM-4.6 reasoning mode +TIMEOUT = 300 # 5 minutes for complex reasoning + + +# ============================================================================= +# Test Document Definitions +# ============================================================================= + +@dataclass +class TestDocument: + """A historical document for PiCo extraction testing.""" + name: str + language: str + script: str + date_period: str + source_type: str + source_text: str + system_prompt: str + expected_persons: int + expected_locations: int + validation_names: list[str] # Names that should appear in extraction + + +# Arabic Waqf Document +ARABIC_WAQF = TestDocument( + name="arabic_waqf", + language="Arabic", + script="Arabic", + date_period="1225 AH (1810 CE)", + source_type="waqf_document", + source_text="""بسم الله الرحمن 
الرحيم +هذا ما وقف وحبس وسبل وأبد المرحوم الحاج أحمد بن محمد العمري، تاجر بمدينة +حلب الشهباء، ابن المرحوم محمد بن عبد الله العمري. وقف جميع داره الكائنة +بمحلة الجديدة على أولاده وأولاد أولاده ذكوراً وإناثاً. وإن انقرضوا لا سمح +الله فعلى فقراء المسلمين. وشهد على ذلك الشهود: الحاج إبراهيم بن يوسف +التركماني، والسيد علي بن حسين الحلبي. وكتب في شهر رجب سنة ألف ومائتين +وخمس وعشرين هجرية.""", + system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology. + +Extract ALL persons from this Arabic waqf (endowment) document: +1. Names using PNV structure with both Arabic script AND romanized versions +2. Patronymics (ابن/بن = son of) +3. Honorifics (الحاج = pilgrim, السيد = sayyid, المرحوم = the late) +4. Family relationships between persons +5. Roles in the document (founder, witness) +6. Biographical info (deceased status, occupation, address) + +Return ONLY valid JSON with this structure: +{ + "pico_observation": {"observation_id": "...", "source_type": "...", "source_reference": "..."}, + "persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}], + "temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "..."}], + "locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}] +}""", + expected_persons=4, + expected_locations=2, + validation_names=["ahmad", "ibrahim", "ali"] +) + + +# Hebrew Ketubah +HEBREW_KETUBAH = TestDocument( + name="hebrew_ketubah", + language="Hebrew/Aramaic", + script="Hebrew", + date_period="5645 AM (1885 CE)", + source_type="ketubah", + source_text="""בס״ד + +ביום שלישי בשבת, שנים עשר יום לחודש אייר שנת חמשת אלפים שש מאות +וארבעים וחמש לבריאת עולם למנין שאנו מונין בו פה ווילנא + +איך החתן הבחור יצחק בן הר״ר אברהם הכהן ז״ל אמר לה להדא בתולתא +מרים בת הר״ר משה הלוי: הוי לי לאנתו כדת משה וישראל ואנא אפלח +ואוקיר ואיזון ואפרנס יתיכי כהלכות גוברין 
יהודאין + +ונתרצית מרת מרים בתולתא דא והות ליה לאנתו + +עדים: +שמעון בן יעקב הכהן +דוד בן אליהו""", + system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology. + +Extract ALL persons from this Hebrew ketubah (Jewish marriage contract): +1. Names using PNV structure with both Hebrew script AND romanized versions +2. Patronymics (בן/בת = son/daughter of) +3. Tribal affiliations (הכהן = the priest/Kohen, הלוי = the Levite) +4. Honorifics (הר״ר = Rabbi, מרת = Mrs., ז״ל = of blessed memory) +5. Family relationships between persons +6. Roles in document (groom/חתן, bride/כלה, witness/עד) +7. Deceased markers (ז״ל) + +Return ONLY valid JSON with this structure: +{ + "pico_observation": {"observation_id": "...", "source_type": "ketubah", "source_reference": "..."}, + "persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}], + "temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Hebrew"}], + "locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}] +}""", + expected_persons=6, # groom, bride, 2 fathers, 2 witnesses (fathers implicit) + expected_locations=1, + validation_names=["yitzchak", "miriam", "shimon", "david"] +) + + +# Spanish Colonial Baptism +SPANISH_BAPTISM = TestDocument( + name="spanish_colonial_baptism", + language="Spanish", + script="Latin", + date_period="1742 CE", + source_type="baptismal_register", + source_text="""En la ciudad de México, a veinte y tres días del mes de febrero de mil +setecientos cuarenta y dos años, yo el Br. Don Antonio de Mendoza, +teniente de cura de esta santa iglesia catedral, bauticé solemnemente, +puse óleo y crisma a Juan José, español, hijo legítimo de Don Pedro +García de la Cruz, español, natural de la villa de Puebla de los Ángeles, +y de Doña María Josefa de los Reyes, española, natural de esta ciudad. 
+ +Fueron sus padrinos Don Francisco Xavier de Castañeda, español, vecino +de esta ciudad, y Doña Ana María de la Encarnación, su legítima esposa, +a quienes advertí el parentesco espiritual y obligaciones que contrajeron. + +Y lo firmé. +Br. Don Antonio de Mendoza""", + system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology. + +Extract ALL persons from this Spanish colonial baptismal record: +1. Names using PNV structure (given name, surname with particles like "de") +2. Casta (racial/social) designations (español, mestizo, mulato, indio, etc.) +3. Legitimacy markers (hijo legítimo, hijo natural) +4. Place of origin (natural de, vecino de) +5. Family relationships (parents, godparents/padrinos) +6. Compadrazgo relationships (spiritual kinship between parents and godparents) +7. Ecclesiastical roles (priest, teniente de cura) +8. Honorifics (Don, Doña, Br./Bachiller) + +Return ONLY valid JSON with this structure: +{ + "pico_observation": {"observation_id": "...", "source_type": "baptismal_register", "source_reference": "..."}, + "persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}], + "temporal_references": [{"expression": "...", "normalized": "...", "calendar": "Gregorian"}], + "locations_mentioned": [{"name": "...", "type": "..."}] +}""", + expected_persons=6, # infant, father, mother, godfather, godmother, priest + expected_locations=3, + validation_names=["juan", "pedro", "maria", "francisco", "antonio"] +) + + +# Dutch Marriage Certificate +DUTCH_MARRIAGE = TestDocument( + name="dutch_marriage", + language="Dutch", + script="Latin", + date_period="1885 CE", + source_type="marriage_certificate", + source_text="""Heden den vierden Maart achttien honderd vijf en tachtig, compareerden +voor mij, Ambtenaar van den Burgerlijken Stand der Gemeente Haarlem: + +Johannes Petrus van der Berg, oud dertig jaren, koopman, geboren te 
+Amsterdam, wonende alhier, meerderjarige zoon van wijlen Pieter van der +Berg, in leven koopman, en van Maria Johanna Bakker, zonder beroep, +wonende te Amsterdam; + +en + +Cornelia Wilhelmina de Groot, oud vijf en twintig jaren, zonder beroep, +geboren te Haarlem, wonende alhier, meerderjarige dochter van Hendrik +de Groot, timmerman, en van wijlen Elisabeth van Dijk. + +De getuigen waren: +Willem Frederik Smit, oud veertig jaren, notaris +Jacobus Hendrikus Jansen, oud vijf en dertig jaren, klerk""", + system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology. + +Extract ALL persons from this Dutch marriage certificate (huwelijksakte): +1. Names using PNV structure with Dutch naming conventions +2. Patronymics and tussenvoegsels (van der, de, etc.) +3. Ages, occupations, birthplaces, residences +4. Family relationships (parents identified with "zoon van" / "dochter van") +5. Deceased markers ("wijlen" = the late) +6. Roles in document (groom, bride, witnesses/getuigen) +7. 
Civil status terminology + +Return ONLY valid JSON with this structure: +{ + "pico_observation": {"observation_id": "...", "source_type": "marriage_certificate", "source_reference": "..."}, + "persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}], + "temporal_references": [{"expression": "...", "normalized": "...", "calendar": "Gregorian"}], + "locations_mentioned": [{"name": "...", "type": "..."}] +}""", + expected_persons=8, # groom, bride, 4 parents (2 deceased), 2 witnesses + expected_locations=2, + validation_names=["johannes", "cornelia", "willem", "jacobus"] +) + + +# Russian Metrical Book Entry +RUSSIAN_METRICAL = TestDocument( + name="russian_metrical", + language="Russian", + script="Cyrillic", + date_period="1892 CE", + source_type="metrical_book", + source_text="""Метрическая книга Троицкой церкви села Покровского за 1892 год + +О родившихся + +Марта 15 дня родился, 17 дня крещён Иван. + +Родители: крестьянин деревни Ивановки Пётр Иванович Сидоров и законная +жена его Анна Фёдоровна, оба православного вероисповедания. + +Восприемники: крестьянин той же деревни Николай Петрович Кузнецов +и крестьянская дочь девица Мария Ивановна Сидорова.""", + system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology. + +Extract ALL persons from this Russian metrical book (метрическая книга) entry: +1. Names using Russian naming conventions: given name + patronymic (отчество) + surname +2. Patronymic patterns (-ович/-евич for males, -овна/-евна for females) +3. Estate/class designations (крестьянин = peasant, мещанин = townsman, дворянин = noble) +4. Family relationships +5. Roles (родители = parents, восприемники = godparents) +6. Religious denomination (православный = Orthodox) +7. 
Include both Cyrillic AND romanized versions + +Return ONLY valid JSON with this structure: +{ + "pico_observation": {"observation_id": "...", "source_type": "metrical_book", "source_reference": "..."}, + "persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}], + "temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Gregorian/Julian"}], + "locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}] +}""", + expected_persons=5, # infant, father, mother, godfather, godmother + expected_locations=2, + validation_names=["ivan", "petr", "anna", "nikolai", "maria"] +) + + +# Italian Notarial Act +ITALIAN_NOTARIAL = TestDocument( + name="italian_notarial", + language="Italian", + script="Latin", + date_period="1654 CE", + source_type="notarial_act", + source_text="""Adì 15 Marzo 1654, in Venetia. + +Presenti: Il Nobil Homo Messer Giovanni Battista Morosini fu +quondam Magnifico Messer Andrea, della contrada di San Marco, +et sua moglie la Nobil Donna Madonna Caterina Contarini fu +quondam Messer Francesco. Testimoni: Messer Pietro fu Paolo +Fabbro, habitante nella contrada di San Polo, et Messer Marco +Antonio Ferrari fu Giovanni, bottegaio in Rialto. Rogato io +Notaro Antonio Zen fu quondam Messer Giacomo, Notaro publico +di Venetia.""", + system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology. + +Extract ALL persons from this Italian notarial act: +1. Names using PNV structure (given name, surname) +2. Venetian nobility titles (Nobil Homo, Magnifico Messer, Nobil Donna Madonna) +3. Deceased father markers ("fu", "quondam" = the late) +4. Family relationships (spouses, children of) +5. Occupations (bottegaio = shopkeeper, notaro = notary) +6. Roles in document (party, witness/testimone, notary) +7. 
Residence/contrada information + +Return ONLY valid JSON with this structure: +{ + "pico_observation": {"observation_id": "...", "source_type": "notarial_act", "source_reference": "..."}, + "persons": [{"person_index": 0, "pnv_name": {...}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}], + "temporal_references": [{"expression": "...", "normalized": "...", "calendar": "Gregorian"}], + "locations_mentioned": [{"name": "...", "type": "..."}] +}""", + expected_persons=6, # Giovanni, Caterina, 2 witnesses, notary, plus fathers + expected_locations=4, + validation_names=["giovanni", "caterina", "pietro", "antonio"] +) + + +# Greek Orthodox Baptismal Register +GREEK_BAPTISMAL = TestDocument( + name="greek_baptismal", + language="Greek", + script="Greek", + date_period="1875 CE", + source_type="baptismal_register", + source_text="""Ἐν Θεσσαλονίκῃ, τῇ δεκάτῃ πέμπτῃ Μαρτίου τοῦ ἔτους 1875. + +Ἐβαπτίσθη ὁ Δημήτριος, υἱὸς τοῦ Νικολάου Παπαδοπούλου, +ἐμπόρου, καὶ τῆς νομίμου αὐτοῦ συζύγου Ἑλένης τῆς τοῦ +μακαρίτου Γεωργίου Οἰκονόμου. Νονὸς ὁ Κωνσταντῖνος +Καρατζᾶς τοῦ Ἰωάννου, ἰατρός. Ἱερεύς: ὁ Πρωτοπρεσβύτερος +Ἀθανάσιος Χρυσοστόμου.""", + system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology. + +Extract ALL persons from this Greek Orthodox baptismal register: +1. Names with BOTH Greek script AND romanized versions +2. Greek patronymics ("τοῦ" + genitive = son/daughter of) +3. Deceased markers (μακαρίτης/μακαρίτισσα = the late) +4. Family relationships (υἱός = son, σύζυγος = wife) +5. Godparent (νονός/νονά) +6. Occupations (ἔμπορος = merchant, ἰατρός = physician) +7. Ecclesiastical titles (Πρωτοπρεσβύτερος = Archpriest) +8. 
Roles in document (baptized, parents, godparent, priest) + +Return ONLY valid JSON with this structure: +{ + "pico_observation": {"observation_id": "...", "source_type": "baptismal_register", "source_reference": "..."}, + "persons": [{"person_index": 0, "pnv_name": {"literalName": "...", "literalName_romanized": "..."}, "roles": [...], "biographical": {...}, "family_relationships": {...}, "context": "..."}], + "temporal_references": [{"expression": "...", "expression_romanized": "...", "normalized": "...", "calendar": "Julian"}], + "locations_mentioned": [{"name": "...", "name_romanized": "...", "type": "..."}] +}""", + expected_persons=6, # infant, father, mother, maternal grandfather, godfather, priest + expected_locations=1, + validation_names=["dimitrios", "nikolaos", "eleni", "konstantinos"] +) + + +# Ottoman Turkish Court Record (Sijill) +OTTOMAN_SIJILL = TestDocument( + name="ottoman_sijill", + language="Ottoman Turkish", + script="Arabic", + date_period="1258 AH (1842 CE)", + source_type="sijill", + source_text="""بسم الله الرحمن الرحيم + +مجلس شرع شريفده محمد آغا بن عبد الله مرحوم قصبه دميرجی‌کوی +ساکنلرندن محمد بن احمد افندی و زوجه‌سی فاطمه خاتون بنت علی‌اوغلو +حاضر اولوب محمد آغا طرفندن یکری بش غروش بدل معلوم ایله صاتیلدی + +شهود الحال: حسن افندی بن عمر، ابراهیم چلبی بن مصطفی + +فی اوائل شهر رجب سنة ١٢٥٨""", + system_prompt="""You are a historical document annotator following the PiCo (Person in Context) ontology. + +Extract ALL persons from this Ottoman Turkish sijill (court record): +1. Names with both Arabic script AND romanized versions +2. Ottoman honorifics (آغا/Ağa, افندی/Efendi, چلبی/Çelebi, خاتون/Hatun) +3. Patronymics (بن/bin = son of, بنت/bint = daughter of) +4. Deceased markers (مرحوم/merhum) +5. Family relationships (زوجه/zevce = wife) +6. Roles in document (buyer, seller, witnesses) +7. Residence information + +Note: Ottoman Turkish uses Arabic script with Turkish vocabulary and grammatical structures. 
# =============================================================================
# API Functions
# =============================================================================

def _extract_json_payload(content: "str | None") -> dict:
    """Parse the JSON document embedded in a chat-completion message.

    Handles three reply shapes: a ```json fenced block, a bare/other-tagged
    ``` fenced block, and unfenced text. If the selected text does not parse
    as-is, the span from the first "{" to the last "}" is tried, which drops
    a leftover fence tag or surrounding prose.

    Raises:
        json.JSONDecodeError: no parseable JSON was found (including an
            empty or missing message body).
    """
    text = content or ""  # a null message body becomes a JSONDecodeError below
    if "```json" in text:
        text = text.split("```json")[1].split("```")[0]
    elif "```" in text:
        # "```" is present, so split() always yields at least two parts.
        text = text.split("```")[1]
    text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(text[start:end + 1])


async def call_glm_api(system_prompt: str, user_content: str) -> tuple[dict, float]:
    """Send one chat-completion request to the Z.AI endpoint and parse the reply.

    Args:
        system_prompt: Content of the "system" message.
        user_content: Content of the "user" message.

    Returns:
        ``(parsed_json, duration_seconds)`` where the duration covers the HTTP
        round trip and decoding of the response envelope, not the parsing of
        the model's JSON answer.

    Raises:
        ValueError: ``ZAI_API_TOKEN`` is not set in the environment.
        httpx.HTTPStatusError: the endpoint answered with an error status.
        json.JSONDecodeError: the model's answer contains no parseable JSON.
    """
    api_token = os.environ.get("ZAI_API_TOKEN")
    if not api_token:
        raise ValueError("ZAI_API_TOKEN not set in environment")

    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }

    payload = {
        "model": ZAI_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        "temperature": 0.1,
        "max_tokens": MAX_TOKENS,
    }

    start_time = datetime.now(timezone.utc)

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        response = await client.post(ZAI_API_URL, headers=headers, json=payload)
        response.raise_for_status()

        result = response.json()
        content = result["choices"][0]["message"]["content"]

    end_time = datetime.now(timezone.utc)
    duration = (end_time - start_time).total_seconds()

    return _extract_json_payload(content), duration


def extract_all_strings_recursive(obj, strings: list[str]) -> None:
    """Append the lowercased form of every string found in *obj* to *strings*.

    Dict values and list items are walked depth-first; dict keys and
    non-string leaves (numbers, None, ...) are ignored. *strings* is
    mutated in place.
    """
    if isinstance(obj, str):
        strings.append(obj.lower())
    elif isinstance(obj, dict):
        for value in obj.values():
            extract_all_strings_recursive(value, strings)
    elif isinstance(obj, list):
        for item in obj:
            extract_all_strings_recursive(item, strings)


# Canonical (expected) spelling -> alternative romanizations.
# A spelling may appear under several canonical names (e.g. 'abraham',
# 'nicholas'); normalize_name_variant() merges every group it belongs to.
_NAME_VARIANTS = {
    # Arabic/Turkish name variants
    'mehmed': ['muhammad', 'mohammed', 'mehmet'],
    'fatma': ['fatima', 'fatmah'],
    'ahmed': ['ahmad'],
    'ibrahim': ['abraham'],
    'hasan': ['hassan'],
    'hussein': ['husayn', 'huseyin'],
    # Greek variants
    'dimitrios': ['demetrios', 'dimitris', 'dēmētrios'],
    'nikolaos': ['nicholas', 'nikolas'],
    'konstantinos': ['constantine', 'constantinos'],
    'georgios': ['george', 'geōrgios'],
    'eleni': ['helen', 'elena', 'elenē'],
    'athanasios': ['athanasius'],
    # Hebrew variants
    'yitzchak': ['isaac', 'itzhak', 'yitzhak'],
    'miriam': ['mirjam', 'myriam'],
    'shimon': ['simon', 'shimeon'],
    'avraham': ['abraham'],
    'moshe': ['moses'],
    'david': ['dovid'],
    'yaakov': ['jacob', 'jakob'],
    # Russian variants
    'petr': ['peter', 'pyotr', 'piotr'],
    'ivan': ['john', 'ioann'],
    'nikolai': ['nicholas', 'nikolay'],
    'maria': ['mary', 'mariya'],
}


def normalize_name_variant(name: str) -> list[str]:
    """Return *name* (lowercased) followed by its known spelling variants.

    Handles cross-script romanization differences like:
    - mehmed/muhammad/mohammed
    - fatma/fatima
    - dimitrios/demetrios
    - yitzchak/isaac

    The lookup works in both directions: a canonical spelling yields its
    alternates, and an alternate yields the canonical spelling plus its
    sibling alternates. The result contains no duplicates and keeps
    first-seen order, with the input spelling always first.
    """
    lowered = name.lower()
    variants = [lowered]

    for canonical, alternates in _NAME_VARIANTS.items():
        if lowered == canonical:
            variants.extend(alternates)
        elif lowered in alternates:
            variants.append(canonical)
            variants.extend(alt for alt in alternates if alt != lowered)

    # A spelling can belong to several groups; drop repeats, keep order.
    return list(dict.fromkeys(variants))


def validate_extraction(result: dict, test: "TestDocument") -> tuple[bool, list[str]]:
    """Check an extraction result against the expectations of *test*.

    Args:
        result: Parsed JSON returned by the model.
        test: Object providing ``expected_persons``, ``expected_locations``
            and ``validation_names``.

    Returns:
        ``(is_valid, issues)``. Only structural problems (``persons`` missing
        or not a list) make the result invalid; count shortfalls and missing
        names are reported in ``issues`` as warnings and leave ``is_valid``
        true.
    """
    errors: list[str] = []
    warnings: list[str] = []

    # Check structure
    if "persons" not in result:
        errors.append("Missing 'persons' field")
        return False, errors

    persons = result["persons"]
    if not isinstance(persons, list):
        # e.g. "persons": null -- nothing below can work on it.
        errors.append("'persons' field is not a list")
        return False, errors

    # Check person count
    if len(persons) < test.expected_persons:
        warnings.append(f"Expected at least {test.expected_persons} persons, got {len(persons)}")

    # Collect every string under pnv_name (any nesting) plus the context
    # field, which often carries the original text.
    all_name_strings: list[str] = []
    for person in persons:
        if not isinstance(person, dict):
            # Model output is not guaranteed to follow the schema; still
            # harvest whatever strings the entry holds instead of crashing.
            extract_all_strings_recursive(person, all_name_strings)
            continue
        extract_all_strings_recursive(person.get("pnv_name", {}), all_name_strings)
        if person.get("context"):
            all_name_strings.append(str(person["context"]).lower())

    # Check for expected names with variant support.
    # NOTE: this is a substring match, so a short variant can match inside a
    # longer, unrelated name.
    for expected_name in test.validation_names:
        variants = normalize_name_variant(expected_name)
        found = any(
            variant in name_str
            for variant in variants
            for name_str in all_name_strings
        )
        if not found:
            warnings.append(f"Expected name '{expected_name}' (variants: {variants[:3]}) not found")

    # Check locations ("locations_mentioned": null counts as none)
    locations = result.get("locations_mentioned") or []
    if len(locations) < test.expected_locations:
        warnings.append(f"Expected at least {test.expected_locations} locations, got {len(locations)}")

    # Combine errors and warnings
    is_valid = not errors
    return is_valid, errors + warnings
# =============================================================================
# Test Runner
# =============================================================================

def _as_list(value) -> list:
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _summarize_person(person) -> tuple:
    """Return ``(index, display_name, first_role)`` for one extracted person.

    Tolerates entries that do not follow the requested schema (non-dict
    person, ``pnv_name`` given as string/None/list, roles given as a string,
    a list of strings or a list of dicts) so that a malformed entry cannot
    abort the run.
    """
    if not isinstance(person, dict):
        return "?", str(person), "-"

    idx = person.get("person_index", "?")

    name = person.get("pnv_name", {})
    if isinstance(name, str):
        lit_name = name
    elif isinstance(name, dict):
        lit_name = name.get("literalName_romanized") or name.get("literalName", "?")
    else:
        lit_name = "?"

    # Handle roles - could be list of dicts, list of strings, or string
    roles_raw = person.get("roles", [])
    if isinstance(roles_raw, str):
        role = roles_raw
    elif isinstance(roles_raw, list) and roles_raw:
        first_role = roles_raw[0]
        if isinstance(first_role, dict):
            role = first_role.get("role_in_source", "-")
        else:
            role = str(first_role)
    else:
        role = "-"

    return idx, lit_name, role


async def run_single_test(test: "TestDocument") -> dict:
    """Run the extraction test for one document and save the raw result.

    Prints progress and a summary, validates the result with
    ``validate_extraction`` and writes the parsed JSON to
    ``data/entity_annotation/test_outputs`` under the project root.

    Returns:
        On failure of the API call or JSON parsing:
        ``{"test", "status": "error", "error"}``. Otherwise a dict with
        ``status`` ("passed" or "warning"), the extracted counts, the call
        duration, the validation issues and the output file path.
    """
    print(f"\n{'='*70}")
    print(f"TEST: {test.name.upper()}")
    print(f"Language: {test.language} | Script: {test.script} | Period: {test.date_period}")
    print(f"{'='*70}")

    # Prepare user prompt
    user_prompt = f"""Extract all persons, relationships, dates, and locations from this {test.source_type}:

{test.source_text}

Follow the PiCo ontology pattern for person observations."""

    print(f"\n📄 Source: {test.source_type}")
    print(f" Text length: {len(test.source_text)} chars")

    # Call API
    print(f"\n⏳ Calling GLM-4.6 API...")

    try:
        result, duration = await call_glm_api(test.system_prompt, user_prompt)
        print(f"✅ API call completed in {duration:.1f}s")

    except httpx.HTTPStatusError as e:
        print(f"❌ API Error: {e.response.status_code}")
        return {"test": test.name, "status": "error", "error": str(e)}
    except json.JSONDecodeError as e:
        print(f"❌ JSON Parse Error: {e}")
        return {"test": test.name, "status": "error", "error": str(e)}
    except Exception as e:
        # Top-level boundary of one test: report and let the batch continue.
        print(f"❌ Error: {type(e).__name__}: {e}")
        return {"test": test.name, "status": "error", "error": str(e)}

    if not isinstance(result, dict):
        # Valid JSON, but not the requested object (e.g. a bare array).
        message = f"Expected a JSON object, got {type(result).__name__}"
        print(f"❌ Error: {message}")
        return {"test": test.name, "status": "error", "error": message}

    # Display summary
    persons = _as_list(result.get("persons"))
    locations = _as_list(result.get("locations_mentioned"))
    temporal = _as_list(result.get("temporal_references"))

    print(f"\n📊 Extraction Summary:")
    print(f" Persons: {len(persons)}")
    print(f" Locations: {len(locations)}")
    print(f" Temporal refs: {len(temporal)}")

    # Show persons
    print(f"\n👥 Persons:")
    for person in persons[:5]:  # Show first 5
        idx, lit_name, role = _summarize_person(person)
        print(f" [{idx}] {str(lit_name)[:50]} ({role})")

    if len(persons) > 5:
        print(f" ... and {len(persons) - 5} more")

    # Validate
    is_valid, issues = validate_extraction(result, test)

    print(f"\n🔍 Validation: {'✅ PASSED' if is_valid else '⚠️ ISSUES'}")
    for issue in issues:
        print(f" - {issue}")

    # Save result
    output_dir = project_root / "data/entity_annotation/test_outputs"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{test.name}_extraction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n💾 Saved: {output_file.name}")

    return {
        "test": test.name,
        "status": "passed" if is_valid else "warning",
        "persons_extracted": len(persons),
        "locations_extracted": len(locations),
        "duration_seconds": duration,
        "issues": issues,
        "output_file": str(output_file)
    }


async def run_all_tests() -> list[dict]:
    """Run every test in ``ALL_TESTS`` one after another and collect the results."""
    results = []

    for test in ALL_TESTS.values():
        results.append(await run_single_test(test))

    return results


def print_summary(results: list[dict]) -> None:
    """Print counts per status and one line per test for a batch run.

    Each entry needs ``test`` and ``status``; ``persons_extracted``,
    ``duration_seconds`` and ``issues`` are optional. Only the first two
    issues of each test are shown.
    """
    print("\n" + "=" * 70)
    print("BATCH TEST SUMMARY")
    print("=" * 70)

    passed = sum(1 for r in results if r["status"] == "passed")
    warnings = sum(1 for r in results if r["status"] == "warning")
    errors = sum(1 for r in results if r["status"] == "error")

    print(f"\n📊 Results: {passed} passed, {warnings} warnings, {errors} errors")
    print(f" Total tests: {len(results)}")

    status_icons = {"passed": "✅", "warning": "⚠️", "error": "❌"}

    print(f"\n📋 Test Details:")
    for r in results:
        status_icon = status_icons.get(r["status"], "?")
        print(f" {status_icon} {r['test']}: {r.get('persons_extracted', 0)} persons, {r.get('duration_seconds', 0):.1f}s")
        for issue in (r.get("issues") or [])[:2]:
            print(f" - {issue}")

    print("\n" + "=" * 70)
    if errors == 0:
        print("✅ ALL TESTS COMPLETED SUCCESSFULLY")
    else:
        print(f"⚠️ {errors} TESTS FAILED - Check details above")
    print("=" * 70)


# =============================================================================
# Main
# =============================================================================

async def main() -> int:
    """Parse the command line, run the selected tests and return an exit code.

    Returns 0 when nothing ended with status "error" (warnings do not fail
    the run) and for ``--list`` / no arguments; returns 1 when a test
    errored or when a test run was requested without ``ZAI_API_TOKEN``.
    """
    parser = argparse.ArgumentParser(description="Batch test PiCo extraction")
    parser.add_argument("--test-name", "-t", choices=list(ALL_TESTS.keys()),
                        help="Run specific test by name")
    parser.add_argument("--all", "-a", action="store_true",
                        help="Run all tests")
    parser.add_argument("--list", "-l", action="store_true",
                        help="List available tests")

    args = parser.parse_args()

    # Only an actual test run talks to the API; --list and the default help
    # output must work without a token.
    will_call_api = not args.list and bool(args.test_name or args.all)
    if will_call_api and not os.environ.get("ZAI_API_TOKEN"):
        print("❌ Error: ZAI_API_TOKEN not set")
        print("Set it with: export ZAI_API_TOKEN=")
        print("Or add to .env file in project root")
        return 1

    print("\n" + "#" * 70)
    print("# PiCo BATCH EXTRACTION TEST")
    print(f"# Model: {ZAI_MODEL} (reasoning mode)")
    print(f"# Max tokens: {MAX_TOKENS}")
    print("#" * 70)

    if args.list:
        print("\n📋 Available tests:")
        for name, test in ALL_TESTS.items():
            print(f" {name}: {test.language} {test.source_type} ({test.date_period})")
        return 0

    if args.test_name:
        test = ALL_TESTS[args.test_name]
        result = await run_single_test(test)
        return 0 if result["status"] != "error" else 1

    if args.all:
        results = await run_all_tests()
        print_summary(results)
        errors = sum(1 for r in results if r["status"] == "error")
        return 0 if errors == 0 else 1

    # Default: show help
    parser.print_help()
    return 0


if __name__ == "__main__":
    exit_code = asyncio.run(main())
    sys.exit(exit_code)
ImportError: raise ImportError( "typedb-driver package required. Install with: pip install typedb-driver" @@ -164,16 +169,10 @@ class TypeDBRetriever: raise return self._client - def _get_session(self): - """Get a data session.""" - from typedb.driver import SessionType - - if self._session is None or not self._session.is_open(): - self._session = self.client.session(self.database, SessionType.DATA) - return self._session - def _execute_read(self, typeql: str) -> list[dict[str, Any]]: - """Execute a TypeQL read query. + """Execute a TypeQL read query (TypeDB 3.x API). + + TypeDB 3.x removed sessions - transactions are created directly on driver. Args: typeql: TypeQL query string @@ -184,23 +183,33 @@ class TypeDBRetriever: from typedb.driver import TransactionType results = [] - session = self._get_session() try: - with session.transaction(TransactionType.READ) as tx: - answer = tx.query.get(typeql) + # TypeDB 3.x: transactions directly on driver, specifying database + with self.client.transaction(self.database, TransactionType.READ) as tx: + # TypeDB 3.x: query() returns a Promise, need to resolve it + answer = tx.query(typeql).resolve() - for concept_map in answer: - row = {} - for var in concept_map.variables(): - concept = concept_map.get(var) - if hasattr(concept, 'get_value'): - row[var] = concept.get_value() - elif hasattr(concept, 'get_iid'): - row[var] = concept.get_iid() + # TypeDB 3.x: QueryAnswer may be iterable depending on query type + if hasattr(answer, '__iter__'): + for row in answer: + result_row = {} + # Access columns by index or iterate + if hasattr(row, 'concepts'): + for i, concept in enumerate(row.concepts()): + var_name = f"var_{i}" + if hasattr(concept, 'get_value'): + result_row[var_name] = concept.get_value() + elif hasattr(concept, 'as_entity'): + result_row[var_name] = str(concept) + else: + result_row[var_name] = str(concept) else: - row[var] = str(concept) - results.append(row) + result_row["result"] = str(row) + 
results.append(result_row) + else: + # Single result (e.g., count query) + results.append({"result": str(answer)}) except Exception as e: logger.error(f"TypeQL query failed: {e}") @@ -603,28 +612,52 @@ class TypeDBRetriever: "relations": {}, } - # Count heritage custodians + # Count custodian entities by type (TypeDB 3.x API) + # Schema types: custodian-observation, custodian-name, custodian-reconstruction + entity_types = [ + ("custodian-observation", "observations"), + ("custodian-name", "names"), + ("custodian-reconstruction", "reconstructions"), + ] + try: - count_query = """ - match - $inst isa heritage-custodian; - get $inst; - count; - """ - session = self._get_session() from typedb.driver import TransactionType - with session.transaction(TransactionType.READ) as tx: - answer = tx.query.get_aggregate(count_query) - stats["entities"]["heritage_custodian"] = answer.as_value().as_long() + + with self.client.transaction(self.database, TransactionType.READ) as tx: + for type_name, stat_key in entity_types: + try: + # TypeDB 3.x count query syntax + count_query = f""" + match + $inst isa {type_name}; + reduce $count = count; + """ + answer = tx.query(count_query).resolve() + # Parse count result - TypeDB 3.x returns _Value objects + count = 0 + for row in answer: + # row.get("count") returns a _Value object + value_obj = row.get("count") + # Extract integer - try multiple methods + if hasattr(value_obj, 'get_integer'): + count = value_obj.get_integer() + elif hasattr(value_obj, 'try_get_integer'): + count = value_obj.try_get_integer() or 0 + else: + # Fallback: string conversion + count = int(str(value_obj)) + break + stats["entities"][stat_key] = count + except Exception as e: + stats["entities"][stat_key] = f"error: {e}" + except Exception as e: stats["entities"]["error"] = str(e) return stats def close(self) -> None: - """Clean up resources.""" - if self._session and self._session.is_open(): - self._session.close() + """Clean up resources (TypeDB 3.x - no 
sessions to close).""" if self._client: self._client.close() self._client = None