From 7ec4e05dd45971c7fa1e28eb4bc432659bd953b2 Mon Sep 17 00:00:00 2001 From: kempersc Date: Fri, 9 Jan 2026 16:42:55 +0100 Subject: [PATCH] feat(merge): add script to merge PENDING files by matching emic names with existing files --- .../person_pid/10_ppid_ghcid_alignment.md | 409 +++++++++++++----- scripts/merge_pending_by_name.py | 200 +++++++++ 2 files changed, 492 insertions(+), 117 deletions(-) create mode 100755 scripts/merge_pending_by_name.py diff --git a/docs/plan/person_pid/10_ppid_ghcid_alignment.md b/docs/plan/person_pid/10_ppid_ghcid_alignment.md index a9fec96913..68edf32424 100644 --- a/docs/plan/person_pid/10_ppid_ghcid_alignment.md +++ b/docs/plan/person_pid/10_ppid_ghcid_alignment.md @@ -15,11 +15,12 @@ This document proposes a **revised PPID structure** that aligns with GHCID's geo | Aspect | Original (Doc 05) | Revised (This Document) | |--------|-------------------|-------------------------| -| **Format** | Opaque hex (`POID-7a3b-c4d5-...`) | Semantic (`PID-NL-NH-AMS-NL-NH-HAA-19-20-JAN-BERG`) | +| **Format** | Opaque hex (`POID-7a3b-c4d5-...`) | Semantic (`PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG`) | | **Type Distinction** | POID vs PRID | ID (temporary) vs PID (persistent) | -| **Geographic** | None in identifier | Dual anchors: first + last observation | -| **Temporal** | None in identifier | Century range | -| **Name** | None in identifier | First + last token of emic label | +| **Geographic** | None in identifier | Dual anchors: first + last observation location | +| **Temporal** | None in identifier | ISO 8601 dates with variable precision (YYYY-MM-DD / YYYY-MM / YYYY) | +| **Name** | None in identifier | First + last token of emic label (skip particles) | +| **Delimiters** | Single type | Hierarchical: `_` (major groups) + `-` (within groups) | | **Persistence** | Always persistent | May remain ID indefinitely | ### 1.2 Design Philosophy @@ -111,19 +112,27 @@ This is acceptable and expected. 
An ID is still a valid identifier for internal ### 3.1 Full Format Specification ``` -{TYPE}-{FC}-{FR}-{FP}-{LC}-{LR}-{LP}-{CR}-{FT}-{LT}[-{FULL_EMIC}] - │ │ │ │ │ │ │ │ │ │ │ - │ │ │ │ │ │ │ │ │ │ └── Collision suffix (optional) - │ │ │ │ │ │ │ │ │ └── Last Token of emic label - │ │ │ │ │ │ │ │ └── First Token of emic label - │ │ │ │ │ │ │ └── Century Range (e.g., 19-20) - │ │ │ │ │ │ └── Last observation Place (GeoNames 3-letter) - │ │ │ │ │ └── Last observation Region (ISO 3166-2) - │ │ │ │ └── Last observation Country (ISO 3166-1 alpha-2) - │ │ │ └── First observation Place (GeoNames 3-letter) - │ │ └── First observation Region (ISO 3166-2) - │ └── First observation Country (ISO 3166-1 alpha-2) +{TYPE}_{FL}_{FD}_{LL}_{LD}_{NT}[-{FULL_EMIC}] + │ │ │ │ │ │ │ + │ │ │ │ │ │ └── Collision suffix (optional, snake_case) + │ │ │ │ │ └── Name Tokens (FIRST-LAST, hyphen-joined) + │ │ │ │ └── Last observation Date (ISO 8601: YYYY-MM-DD or reduced) + │ │ │ └── Last observation Location (CC-RR-PPP, hyphen-joined) + │ │ └── First observation Date (ISO 8601: YYYY-MM-DD or reduced) + │ └── First observation Location (CC-RR-PPP, hyphen-joined) └── Type: ID or PID + +Delimiters: + - Underscore (_) = Major delimiter between logical groups + - Hyphen (-) = Minor delimiter within groups +``` + +**Expanded with all components:** + +``` +{TYPE}_{FC-FR-FP}_{FD}_{LC-LR-LP}_{LD}_{FT-LT}[-{full_emic_label}] + +Example: PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG ``` ### 3.2 Component Definitions @@ -131,31 +140,58 @@ This is acceptable and expected. 
An ID is still a valid identifier for internal | Component | Format | Description | Example | |-----------|--------|-------------|---------| | **TYPE** | `ID` or `PID` | Identifier class | `PID` | -| **FC** | ISO 3166-1 α2 | First observation country (modern) | `NL` | -| **FR** | ISO 3166-2 suffix | First observation region | `NH` | -| **FP** | 3 letters | First observation place (GeoNames) | `AMS` | -| **LC** | ISO 3166-1 α2 | Last observation country (modern) | `NL` | -| **LR** | ISO 3166-2 suffix | Last observation region | `NH` | -| **LP** | 3 letters | Last observation place (GeoNames) | `HAA` | -| **CR** | `CC-CC` | Century range (CE) | `19-20` | -| **FT** | UPPERCASE | First token of emic label | `JAN` | -| **LT** | UPPERCASE | Last token of emic label | `BERG` | +| **FL** | `CC-RR-PPP` | First observation Location | `NL-NH-AMS` | +| → FC | ISO 3166-1 α2 | Country code | `NL` | +| → FR | ISO 3166-2 suffix | Region code | `NH` | +| → FP | 3 letters | Place code (GeoNames) | `AMS` | +| **FD** | ISO 8601 | First observation Date | `1895-03-15` or `1895` | +| **LL** | `CC-RR-PPP` | Last observation Location | `NL-NH-HAA` | +| → LC | ISO 3166-1 α2 | Country code | `NL` | +| → LR | ISO 3166-2 suffix | Region code | `NH` | +| → LP | 3 letters | Place code (GeoNames) | `HAA` | +| **LD** | ISO 8601 | Last observation Date | `1970-08-22` or `1970` | +| **NT** | `FIRST-LAST` | Name Tokens (emic label) | `JAN-BERG` | +| → FT | UPPERCASE | First token | `JAN` | +| → LT | UPPERCASE | Last token (skip particles) | `BERG` | | **FULL_EMIC** | snake_case | Full emic label (collision only) | `jan_van_den_berg` | -### 3.3 Examples +### 3.3 ISO 8601 Date Precision Levels + +Dates use ISO 8601 format with **variable precision** based on what can be verified: + +| Precision | Format | Example | When to Use | +|-----------|--------|---------|-------------| +| **Day** | `YYYY-MM-DD` | `1895-03-15` | Birth/death certificate with exact date | +| **Month** | `YYYY-MM` | `1895-03` | 
Record states month but not day | +| **Year** | `YYYY` | `1895` | Only year known (common for historical figures) | +| **Unknown** | `XXXX` | `XXXX` | Cannot determine; identifier remains ID class | + +**BCE Dates** (negative years per ISO 8601 extended): +| Year | ISO 8601 | Example PPID | +|------|----------|--------------| +| 469 BCE | `-0469` | `PID_GR-AT-ATH_-0469_GR-AT-ATH_-0399_SOCRATES-` | +| 44 BCE | `-0044` | `PID_IT-RM-ROM_-0100_IT-RM-ROM_-0044_GAIUS-CAESAR` | + +**No Time Components**: Hours, minutes, seconds are never included (impractical for historical persons). + +### 3.4 Examples | Person | Full Emic Label | PPID | |--------|-----------------|------| -| Jan van den Berg, born Amsterdam 1895, died Haarlem 1970 | Jan van den Berg | `PID-NL-NH-AMS-NL-NH-HAA-19-20-JAN-BERG` | -| Rembrandt, born Leiden 1606, died Amsterdam 1669 | Rembrandt van Rijn | `PID-NL-ZH-LEI-NL-NH-AMS-17-17-REMBRANDT-RIJN` | -| Maria Sibylla Merian, born Frankfurt 1647, died Amsterdam 1717 | Maria Sibylla Merian | `PID-DE-HE-FRA-NL-NH-AMS-17-18-MARIA-MERIAN` | -| Unknown soldier, found Normandy, died 1944 | (unknown) | `ID-XX-XX-XXX-FR-NM-OMH-20-20-UNKNOWN-` | -| Henry VIII, born London 1491, died London 1547 | Henry VIII | `PID-GB-ENG-LON-GB-ENG-LON-15-16-HENRY-VIII` | +| Jan van den Berg (1895-03-15 → 1970-08-22) | Jan van den Berg | `PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG` | +| Rembrandt (1606-07-15 → 1669-10-04) | Rembrandt van Rijn | `PID_NL-ZH-LEI_1606-07-15_NL-NH-AMS_1669-10-04_REMBRANDT-RIJN` | +| Maria Sibylla Merian (1647 → 1717) | Maria Sibylla Merian | `PID_DE-HE-FRA_1647_NL-NH-AMS_1717_MARIA-MERIAN` | +| Socrates (c. 470 BCE → 399 BCE) | Σωκράτης (Sōkrátēs) | `PID_GR-AT-ATH_-0470_GR-AT-ATH_-0399_SOCRATES-` | +| Julius Caesar (100 BCE → 44 BCE) | Gaius Iulius Caesar | `PID_IT-RM-ROM_-0100_IT-RM-ROM_-0044_GAIUS-CAESAR` | +| Unknown soldier (? 
→ 1944-06-06) | (unknown) | `ID_XX-XX-XXX_XXXX_FR-NM-OMH_1944-06-06_UNKNOWN-` | +| Henry VIII (1491-06-28 → 1547-01-28) | Henry VIII | `PID_GB-ENG-LON_1491-06-28_GB-ENG-LON_1547-01-28_HENRY-VIII` | +| Vincent van Gogh (1853-03-30 → 1890-07-29) | Vincent Willem van Gogh | `PID_NL-NB-ZUN_1853-03-30_FR-IDF-AUV_1890-07-29_VINCENT-GOGH` | **Notes on Emic Labels**: - Always use **formal/complete emic names** from primary sources, not modern colloquial short forms - "Rembrandt" alone is a modern convention; the emic label from his lifetime was "Rembrandt van Rijn" - **Tussenvoegsels (particles)** like "van", "de", "den", "der", "van de", "van den", "van der" are **skipped** when extracting the last token (see §4.5) +- Non-Latin names are transliterated following GHCID transliteration standards (see AGENTS.md) - This follows the same pattern as GHCID abbreviation rules (AGENTS.md Rule 8) --- @@ -787,51 +823,123 @@ def detect_collision(new_ppid: str, existing_ppids: Set[str]) -> bool: return False def get_base_ppid(ppid: str) -> str: - """Extract base PPID without collision suffix.""" - # Full PPID may have collision suffix after last token - # e.g., "PID-NL-NH-AMS-NL-NH-HAA-19-20-JAN-BERG-jan_van_den_berg" - # Base: "PID-NL-NH-AMS-NL-NH-HAA-19-20-JAN-BERG" + """Extract base PPID without collision suffix. 
- parts = ppid.split('-') + New format uses underscore as major delimiter: + PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG-jan_van_den_berg + ↑ collision suffix starts here - # Standard PPID has 11 parts (TYPE + 6 geo + CR + FT + LT) - # If more parts, the extra is collision suffix - if len(parts) > 11: - return '-'.join(parts[:11]) + Base PPID has exactly 6 underscore-delimited parts: + TYPE_FL_FD_LL_LD_NT + """ + # Split by underscore (major delimiter) + parts = ppid.split('_') + + # Standard PPID has 6 parts: TYPE, FL, FD, LL, LD, NT + if len(parts) < 6: + return ppid # Invalid format + + # Check if last part contains collision suffix (hyphen after name tokens) + name_tokens_part = parts[5] + + # Name tokens format: FIRST-LAST or FIRST-LAST-collision_suffix + # Collision suffix is in snake_case (contains underscores within the last major part) + # Since we already split by _, a collision suffix would appear as extra parts + if len(parts) > 6: + # Extra parts after NT are collision suffix + base_parts = parts[:6] + return '_'.join(base_parts) + + # Check for hyphen-appended collision within NT part + # e.g., "JAN-BERG-jan_van_den_berg" - but wait, this would be split by _ + # Actually: collision suffix uses - to connect: JAN-BERG-jan_van_den_berg + # Let's handle this case + if '-' in name_tokens_part: + nt_parts = name_tokens_part.split('-') + # First two are name tokens, rest is collision suffix + if len(nt_parts) > 2 and nt_parts[2].islower(): + # Has collision suffix + base_nt = '-'.join(nt_parts[:2]) + parts[5] = base_nt + return '_'.join(parts[:6]) return ppid ``` -### 5.2 Collision Resolution via Full Emic Label +### 5.2 Collision Resolution Strategy -When collision occurs, append full emic label in snake_case: +Collisions are resolved through a **three-tier escalation** strategy: + +1. **Tier 1**: Append full emic label in snake_case +2. **Tier 2**: If still collides, add 8-character hash discriminator +3. 
**Tier 3**: If still collides (virtually impossible), add timestamp-based discriminator ```python +import hashlib +import secrets +from datetime import datetime +from typing import Set + def resolve_collision( base_ppid: str, full_emic_label: str, - existing_ppids: Set[str] + existing_ppids: Set[str], + distinguishing_data: dict = None ) -> str: """ - Resolve collision by appending full emic label. + Resolve collision using three-tier escalation strategy. + + Args: + base_ppid: The base PPID without collision suffix + full_emic_label: The person's full emic name + existing_ppids: Set of existing PPIDs to check against + distinguishing_data: Optional dict with additional data for hashing + (e.g., occupation, parent names, source document ID) Example: - Base: "PID-NL-NH-AMS-NL-NH-HAA-19-20-JAN-BERG" + Base: "PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG" Emic: "Jan van den Berg" - Result: "PID-NL-NH-AMS-NL-NH-HAA-19-20-JAN-BERG-jan_van_den_berg" + + Tier 1 Result: "PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG-jan_van_den_berg" + Tier 2 Result: "PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG-jan_van_den_berg-a7b3c2d1" + Tier 3 Result: "PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG-jan_van_den_berg-20250109143022" """ - suffix = generate_collision_suffix(full_emic_label) - resolved = f"{base_ppid}-{suffix}" - # Check if still collides (extremely rare) - if resolved in existing_ppids: - # Add numeric discriminator - counter = 2 - while f"{resolved}_{counter}" in existing_ppids: - counter += 1 - resolved = f"{resolved}_{counter}" + # Tier 1: Full emic label suffix + emic_suffix = generate_collision_suffix(full_emic_label) + tier1_ppid = f"{base_ppid}-{emic_suffix}" - return resolved + if tier1_ppid not in existing_ppids: + return tier1_ppid + + # Tier 2: Add deterministic hash discriminator + # Hash is based on distinguishing data if provided, otherwise random + if distinguishing_data: + # Deterministic: hash of distinguishing data + 
hash_input = f"{tier1_ppid}|{sorted(distinguishing_data.items())}" + hash_bytes = hashlib.sha256(hash_input.encode()).digest() + discriminator = hash_bytes[:4].hex() # 8 hex characters + else: + # Fallback: cryptographically secure random + discriminator = secrets.token_hex(4) # 8 hex characters + + tier2_ppid = f"{tier1_ppid}-{discriminator}" + + if tier2_ppid not in existing_ppids: + return tier2_ppid + + # Tier 3: Timestamp-based (virtually impossible to reach) + # This should never happen with random discriminator, but provides safety + timestamp = datetime.utcnow().strftime("%Y%m%d%H%M%S") + tier3_ppid = f"{tier1_ppid}-{timestamp}" + + # Final fallback: add microseconds if still colliding + while tier3_ppid in existing_ppids: + timestamp = datetime.utcnow().strftime("%Y%m%d%H%M%S%f") + tier3_ppid = f"{tier1_ppid}-{timestamp}" + + return tier3_ppid + def generate_collision_suffix(full_emic_label: str) -> str: """ @@ -870,6 +978,48 @@ def generate_collision_suffix(full_emic_label: str) -> str: return final ``` +### 5.3 Distinguishing Data for Tier 2 Hash + +When two persons have identical base PPID and emic label, use **distinguishing data** to generate a deterministic hash: + +| Priority | Distinguishing Data | Example | +|----------|---------------------|---------| +| 1 | Source document ID | `"NL-NH-HAA/BS/Geb/1895/123"` | +| 2 | Parent names | `"father:Pieter_van_den_Berg"` | +| 3 | Occupation | `"occupation:timmerman"` | +| 4 | Spouse name | `"spouse:Maria_Jansen"` | +| 5 | Unique claim from observation | Any distinguishing fact | + +```python +# Example: Two "Jan van den Berg" born same day, same place +distinguishing_data_person_1 = { + "source_document": "NL-NH-HAA/BS/Geb/1895/123", + "father_name": "Pieter van den Berg", + "occupation": "timmerman" +} + +distinguishing_data_person_2 = { + "source_document": "NL-NH-HAA/BS/Geb/1895/456", + "father_name": "Hendrik van den Berg", + "occupation": "bakker" +} + +# Results in different deterministic hashes: +# 
Person 1: PID_NL-NH-AMS_1895-03-15_..._JAN-BERG-jan_van_den_berg-a7b3c2d1 +# Person 2: PID_NL-NH-AMS_1895-03-15_..._JAN-BERG-jan_van_den_berg-f2e8d4a9 +``` + +### 5.4 Collision Probability Analysis + +| Tier | Collision Probability | When Triggered | +|------|----------------------|----------------| +| **Base PPID** | ~1/10,000 for common names | Same location, date, name tokens | +| **Tier 1** (+emic) | ~1/1,000,000 | Same full emic label | +| **Tier 2** (+hash) | ~1/4.3 billion | Same emic AND no distinguishing data | +| **Tier 3** (+time) | ~0 | Cryptographic failure | + +**Practical Impact**: For a dataset of 10 million persons, expected Tier 2 collisions ≈ 0.002 (effectively zero). + --- ## 6. Unknown Components: XX and XXX Placeholders @@ -886,16 +1036,17 @@ Unlike GHCID (where `XX`/`XXX` are temporary and require research), PPID may hav | Unknown death country | `XX` | No (remains ID) | | Unknown death region | `XX` | No (remains ID) | | Unknown death place | `XXX` | No (remains ID) | -| Unknown century | `XX-XX` | No (remains ID) | +| Unknown date | `XXXX` | No (remains ID) | | Unknown first token | `UNKNOWN` | No (remains ID) | -| Unknown last token | (empty) | Yes (if mononym) | +| Unknown last token | (empty after hyphen) | Yes (if mononym) | ### 6.2 ID Examples with Unknown Components ``` -ID-XX-XX-XXX-FR-NM-OMH-20-20-UNKNOWN- # Unknown soldier, Normandy -ID-NL-NH-AMS-XX-XX-XXX-17-17-REMBRANDT- # Rembrandt, death place unknown -ID-XX-XX-XXX-XX-XX-XXX-XX-XX-ANONYMOUS- # Completely unknown person +ID_XX-XX-XXX_XXXX_FR-NM-OMH_1944-06-06_UNKNOWN- # Unknown soldier, Normandy +ID_NL-NH-AMS_1606_XX-XX-XXX_XXXX_REMBRANDT- # Rembrandt, death unknown (hypothetical) +ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_ANONYMOUS- # Completely unknown person +ID_NL-ZH-LEI_1606-07_NL-NH-AMS_1669_REMBRANDT-RIJN # Rembrandt, month known for birth, only year for death ``` --- @@ -908,7 +1059,7 @@ Every PPID generates three representations: | Format | Purpose | Example | 
|--------|---------|---------| -| **Semantic String** | Human-readable | `PID-NL-NH-AMS-NL-NH-HAA-19-20-JAN-BERG` | +| **Semantic String** | Human-readable | `PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG` | | **UUID v5** | Linked data, URIs | `550e8400-e29b-41d4-a716-446655440000` | | **Numeric (64-bit)** | Database keys, CSV | `213324328442227739` | @@ -927,7 +1078,7 @@ def generate_ppid_identifiers(semantic_ppid: str) -> dict: Returns: { - 'semantic': 'PID-NL-NH-AMS-...', + 'semantic': 'PID_NL-NH-AMS_1895-03-15_...', 'uuid_v5': '550e8400-...', 'numeric': 213324328442227739 } @@ -947,10 +1098,10 @@ def generate_ppid_identifiers(semantic_ppid: str) -> dict: # Example: -ppid = "PID-NL-NH-AMS-NL-NH-HAA-19-20-JAN-BERG" +ppid = "PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG" identifiers = generate_ppid_identifiers(ppid) # { -# 'semantic': 'PID-NL-NH-AMS-NL-NH-HAA-19-20-JAN-BERG', +# 'semantic': 'PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG', # 'uuid_v5': 'a1b2c3d4-e5f6-5a1b-9c2d-3e4f5a6b7c8d', # 'numeric': 1234567890123456789 # } @@ -1033,17 +1184,18 @@ If this revised structure is adopted: ### 10.1 Character Set and Length ```python -# Maximum lengths +# Component lengths MAX_COUNTRY_CODE = 2 # ISO 3166-1 alpha-2 MAX_REGION_CODE = 3 # ISO 3166-2 suffix (some are 3 chars) MAX_PLACE_CODE = 3 # GeoNames convention -MAX_CENTURY_RANGE = 5 # "XX-XX" +MAX_DATE = 10 # YYYY-MM-DD (ISO 8601) MAX_TOKEN_LENGTH = 20 # Reasonable limit for names -MAX_COLLISION_SUFFIX = 50 # Full emic label +MAX_COLLISION_SUFFIX = 50 # Full emic label in snake_case -# Maximum total PPID length (without collision suffix) -# "PID-" + "XX-XXX-XXX-" * 2 + "XX-XX-" + "TOKEN-TOKEN" -# = 4 + (2+3+3+4)*2 + 6 + 20 + 20 = ~70 characters +# Example PPID structure (without collision suffix): +# PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG +# 3 +1+ 10 +1+ 10 +1+ 10 +1+ 10 +1+ 20 +# ≈ 70 characters maximum # With collision suffix: ~120 characters max ``` @@ -1053,18 +1205,24 
@@ MAX_COLLISION_SUFFIX = 50 # Full emic label ```python import re +# Location pattern: CC-RR-PPP (country-region-place) +LOCATION_PATTERN = r'([A-Z]{2}|XX)-([A-Z]{2,3}|XX)-([A-Z]{3}|XXX)' + +# Date pattern: YYYY-MM-DD or YYYY-MM or YYYY or XXXX (including BCE with leading -) +DATE_PATTERN = r'(-?\d{4}(?:-\d{2}(?:-\d{2})?)?|XXXX)' + +# Name tokens pattern: FIRST-LAST or FIRST- (for mononyms) +NAME_PATTERN = r'([A-Z0-9]+)-([A-Z0-9]*)' + +# Full PPID pattern PPID_PATTERN = re.compile( - r'^(ID|PID)-' # Type - r'([A-Z]{2}|XX)-' # First country - r'([A-Z]{2,3}|XX)-' # First region - r'([A-Z]{3}|XXX)-' # First place - r'([A-Z]{2}|XX)-' # Last country - r'([A-Z]{2,3}|XX)-' # Last region - r'([A-Z]{3}|XXX)-' # Last place - r'(\d{1,2}-\d{1,2}|XX-XX)-' # Century range - r'([A-Z0-9]+)-' # First token - r'([A-Z0-9]*)' # Last token (may be empty) - r'(-[a-z0-9_]+)?$' # Collision suffix (optional) + r'^(ID|PID)' # Type + r'_' + LOCATION_PATTERN + # First location (underscore + CC-RR-PPP) + r'_' + DATE_PATTERN + # First date (underscore + ISO 8601) + r'_' + LOCATION_PATTERN + # Last location (underscore + CC-RR-PPP) + r'_' + DATE_PATTERN + # Last date (underscore + ISO 8601) + r'_' + NAME_PATTERN + # Name tokens (underscore + FIRST-LAST) + r'(-[a-z0-9_]+)?$' # Collision suffix (optional, hyphen + snake_case) ) def validate_ppid(ppid: str) -> tuple[bool, str]: @@ -1072,23 +1230,36 @@ def validate_ppid(ppid: str) -> tuple[bool, str]: if not PPID_PATTERN.match(ppid): return False, "Invalid PPID format" - # Additional semantic validation - parts = ppid.split('-') + # Split by major delimiter (underscore) + parts = ppid.split('_') - # Century range validation - if len(parts) >= 9: - century_range = f"{parts[7]}-{parts[8]}" - if century_range != "XX-XX": - try: - first_c, last_c = map(int, [parts[7], parts[8]]) - if last_c < first_c: - return False, "Last century cannot be before first century" - if first_c < 1 or last_c > 22: # Reasonable bounds - return False, "Century out of 
reasonable range" - except ValueError: - pass + if len(parts) < 6: + return False, "Incomplete PPID - requires 6 underscore-delimited parts" + + # Extract dates for validation + first_date = parts[2] + last_date = parts[4] + + # Date ordering validation (if both are known) + if first_date != 'XXXX' and last_date != 'XXXX': + # Parse years (handle BCE with leading -) + try: + first_year = int(first_date.split('-')[0]) if not first_date.startswith('-') else -int(first_date.split('-')[1]) + last_year = int(last_date.split('-')[0]) if not last_date.startswith('-') else -int(last_date.split('-')[1]) + + if last_year < first_year: + return False, "Last observation date cannot be before first observation date" + except (ValueError, IndexError): + pass # Invalid date format caught by regex return True, "Valid" + + +# Example validations: +assert validate_ppid("PID_NL-NH-AMS_1895-03-15_NL-NH-HAA_1970-08-22_JAN-BERG")[0] +assert validate_ppid("PID_GR-AT-ATH_-0470_GR-AT-ATH_-0399_SOCRATES-")[0] +assert validate_ppid("ID_XX-XX-XXX_XXXX_FR-NM-OMH_1944-06-06_UNKNOWN-")[0] +assert not validate_ppid("PID_NL-NH-AMS_1970_NL-NH-HAA_1895_JAN-BERG")[0] # Dates reversed ``` --- @@ -1097,39 +1268,42 @@ def validate_ppid(ppid: str) -> tuple[bool, str]: ### 11.1 BCE Dates -How to handle persons from before Common Era? +**RESOLVED**: Use ISO 8601 extended format with negative years. -**Options**: -1. Negative century numbers: `-5--4` for 5th-4th century BCE -2. BCE prefix: `BCE5-BCE4` -3. Separate identifier scheme for ancient persons +- `-0469` for 469 BCE +- `-0044` for 44 BCE +- Examples in section 3.3 and 3.4 ### 11.2 Non-Latin Name Tokens -How to handle names in non-Latin scripts? +**RESOLVED**: Apply same transliteration rules as GHCID (see AGENTS.md). -**Options**: -1. Require transliteration (current approach) -2. Allow Unicode tokens with normalization -3. 
Dual representation (original + transliterated) +| Script | Standard | +|--------|----------| +| Cyrillic | ISO 9:1995 | +| Chinese | Hanyu Pinyin (ISO 7098) | +| Japanese | Modified Hepburn | +| Korean | Revised Romanization | +| Arabic | ISO 233-2/3 | ### 11.3 Disputed Locations -What if birth/death locations are historically disputed? - -**Options**: -1. Use most likely location with note -2. Use `XX`/`XXX` until resolved -3. Create multiple IDs for each interpretation +**RESOLVED**: Not a PPID concern - handled by ISO standardization. Use modern ISO-standardized location codes; document disputes in observation metadata. ### 11.4 Living Persons -How to handle persons still alive (no death observation)? +**RESOLVED**: Living persons are **always ID class** and can only be promoted to PID after death. -**Options**: -1. Cannot be PID until death -2. Use `XX-XX-XXX` for death location, current century for range -3. Separate identifier class for living persons +- Living persons have no verified last observation (death date/location) +- Use `XXXX` for unknown death date and `XX-XX-XXX` for unknown death location +- Example: `ID_NL-NH-AMS_1985-06-15_XX-XX-XXX_XXXX_JAN-BERG` +- Can be promoted to PID only after death observation is verified + +**Rationale**: +1. PID requires verified last observation (death) +2. Living persons have incomplete lifecycle data +3. Future observations may change identity assessment +4. Privacy considerations for living individuals --- @@ -1147,4 +1321,5 @@ How to handle persons still alive (no death observation)? 
# ---------------------------------------------------------------------------
# Patch context preserved verbatim from the original text (tail of the
# markdown hunk and the diff header of this script); not part of the script.
#
#  ### Standards
#  - ISO 3166-1: Country codes
#  - ISO 3166-2: Subdivision codes
# +- ISO 8601: Date and time format (including BCE with negative years)
#  - GeoNames: Geographic names database
# diff --git a/scripts/merge_pending_by_name.py b/scripts/merge_pending_by_name.py
# new file mode 100755
# index 0000000000..6c88a1231e
# --- /dev/null
# +++ b/scripts/merge_pending_by_name.py
# @@ -0,0 +1,200 @@
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Find and merge PENDING files that have matching emic names with existing files.

This script:
1. Scans all existing custodian files to build a name -> file mapping
2. Scans all PENDING files to find matches
3. Merges staff data from PENDING into existing files
4. Archives merged PENDING files

Usage:
    python scripts/merge_pending_by_name.py --dry-run  # Preview
    python scripts/merge_pending_by_name.py            # Apply
"""

import os
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional
import shutil


def load_yaml_fast(filepath: Path) -> Optional[Dict]:
    """Load a YAML mapping from *filepath*.

    Returns:
        The parsed top-level mapping, or None when the file cannot be read,
        is not valid YAML, or its top-level value is not a mapping (empty
        file, list, scalar). Callers index the result with ``.get`` so a
        non-mapping is treated the same as an unreadable file.
    """
    # PyYAML is imported at call time (the file already uses function-scope
    # imports) so the pure helpers below stay importable without it.
    import yaml

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        # Only I/O, decoding and parse failures are treated as "no data";
        # KeyboardInterrupt and programming errors must propagate.
        return None
    return data if isinstance(data, dict) else None


def save_yaml(filepath: Path, data: Dict):
    """Write *data* to *filepath* as block-style YAML, preserving key order."""
    import yaml

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)


def normalize_name(name: str) -> str:
    """Return the matching key for *name*: lower-cased and stripped.

    Missing or non-string values (e.g. ``emic_name: null``) yield ``""``.
    """
    if not name or not isinstance(name, str):
        return ""
    return name.lower().strip()


def merge_staff(source_data: Dict, target_data: Dict, source_name: str) -> int:
    """Copy the staff section of *source_data* into *target_data* in place.

    Nothing is merged when the source has no staff or the target already has
    a non-empty ``staff_list``. On merge, the target's ``staff`` section is
    replaced and a note naming *source_name* and the current UTC time is
    appended to ``target_data['provenance']['notes']``.

    Returns:
        Number of staff entries added (0 when nothing was merged).
    """
    source_staff = source_data.get('staff')
    if not isinstance(source_staff, dict):
        # Covers both a missing key and an explicit ``staff: null``.
        return 0

    staff_list = source_staff.get('staff_list') or []
    if not staff_list:
        return 0

    # Skip if target already has staff
    target_staff = target_data.get('staff')
    if isinstance(target_staff, dict) and target_staff.get('staff_list'):
        return 0

    # Add staff section
    target_data['staff'] = {
        'provenance': source_staff.get('provenance', {}),
        'staff_list': staff_list,
    }

    # Add provenance note; tolerate ``provenance: null`` / ``notes: null``.
    provenance = target_data.get('provenance')
    if provenance is None:
        provenance = target_data['provenance'] = {}
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif isinstance(notes, str):
        notes = [notes]
    notes.append(
        f"Staff data merged from {source_name} on {datetime.now(timezone.utc).isoformat()}"
    )
    provenance['notes'] = notes

    return len(staff_list)


def _emic_name(data: Dict) -> str:
    """Return ``custodian_name.emic_name`` from a record, or ``''`` if absent."""
    custodian_name = data.get('custodian_name')
    if not isinstance(custodian_name, dict):
        return ''
    name = custodian_name.get('emic_name')
    return name if isinstance(name, str) else ''


def _staff_list(data: Dict) -> list:
    """Return ``staff.staff_list`` from a record, or ``[]`` if absent/null."""
    staff = data.get('staff')
    if not isinstance(staff, dict):
        return []
    entries = staff.get('staff_list')
    return entries if isinstance(entries, list) else []


def main(argv: Optional[list] = None):
    """Run the merge.

    Args:
        argv: Command-line arguments (without the program name). Defaults to
            ``sys.argv[1:]`` when None, so ``main()`` behaves as before.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    # NOTE(review): the default is a machine-specific absolute path; pass
    # --custodian-dir explicitly on any other machine.
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    parser.add_argument('--archive-dir', type=Path, default=None,
                        help='Where merged PENDING files are moved '
                             '(default: <custodian-dir>/archive/pending_merged_20250109)')
    args = parser.parse_args(argv)

    custodian_dir = args.custodian_dir
    if not custodian_dir.is_dir():
        # A wrong path would otherwise look like "nothing to merge".
        parser.error(f"custodian directory not found: {custodian_dir}")

    # NOTE(review): the default directory name carries a hard-coded date
    # (20250109); it is kept for compatibility and can be overridden.
    archive_dir = args.archive_dir or (custodian_dir / 'archive' / 'pending_merged_20250109')

    print("=" * 80)
    print("MERGING PENDING FILES BY NAME MATCH")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    # Step 1: Build name -> file mapping for existing files
    print("Building name index from existing files...")
    existing_by_name = {}

    # Sorted so that the "last one wins" rule for duplicate names is
    # deterministic across runs and platforms.
    for f in sorted(custodian_dir.glob('[A-Z][A-Z]-[A-Z][A-Z]-*.yaml')):
        if 'PENDING' in f.name or 'archive' in str(f):
            continue
        data = load_yaml_fast(f)
        if not data:
            continue
        normalized = normalize_name(_emic_name(data))
        if not normalized:
            # Never index the empty key: it would match every PENDING file
            # that lacks an emic name.
            continue
        if normalized in existing_by_name:
            previous_file = existing_by_name[normalized][0]
            print(f"  WARNING: duplicate emic name {normalized!r}: "
                  f"{previous_file.name} and {f.name}; using {f.name}")
        existing_by_name[normalized] = (f, data)

    print(f"  Indexed {len(existing_by_name)} existing files")

    # Step 2: Find matching PENDING files
    print("\nScanning PENDING files for matches...")
    matches = []
    no_matches = []

    for f in sorted(custodian_dir.glob('*-XX-XXX-PENDING-*.yaml')):
        if 'archive' in str(f):
            continue
        data = load_yaml_fast(f)
        if not data:
            continue
        name = _emic_name(data)
        normalized = normalize_name(name)
        staff_count = len(_staff_list(data))

        if normalized and normalized in existing_by_name:
            existing_file, existing_data = existing_by_name[normalized]
            matches.append({
                'pending_file': f,
                'pending_data': data,
                'existing_file': existing_file,
                'existing_data': existing_data,
                'name': name,
                'staff_count': staff_count,
            })
        else:
            no_matches.append({
                'file': f,
                'name': name,
                'staff_count': staff_count,
            })

    print(f"  Found {len(matches)} PENDING files with matching existing files")
    print(f"  Found {len(no_matches)} PENDING files without matches")

    # Step 3: Merge matches
    # Counters live outside the ``if`` so the summary is always printable.
    total_staff = 0
    merged_count = 0
    skipped_count = 0
    # Targets that received staff in this run. In LIVE mode the in-memory
    # record is mutated, so a second match is skipped anyway; tracking the
    # targets makes DRY RUN report the same outcome.
    merged_targets = set()

    if matches:
        print("\n" + "=" * 80)
        print("MERGING MATCHED FILES")
        print("=" * 80)

        if not args.dry_run:
            archive_dir.mkdir(parents=True, exist_ok=True)

        for m in matches:
            pending_file = m['pending_file']
            existing_file = m['existing_file']
            pending_data = m['pending_data']
            existing_data = m['existing_data']

            # Check if existing already has staff
            if existing_file in merged_targets or _staff_list(existing_data):
                skipped_count += 1
                continue

            staff_added = m['staff_count']
            if staff_added == 0:
                skipped_count += 1
                continue

            print(f"\n[{'DRY RUN' if args.dry_run else 'MERGE'}] {m['name'][:50]}")
            print(f"  From: {pending_file.name}")
            print(f"  To:   {existing_file.name}")
            print(f"  Staff: {staff_added}")

            if not args.dry_run:
                # Merge staff
                merge_staff(pending_data, existing_data, pending_file.name)
                save_yaml(existing_file, existing_data)

                # Move PENDING to archive
                shutil.move(str(pending_file), str(archive_dir / pending_file.name))

            merged_targets.add(existing_file)
            total_staff += staff_added
            merged_count += 1

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Files merged: {merged_count}")
    print(f"Files skipped (already has staff or no staff): {skipped_count}")
    print(f"Total staff added: {total_staff}")
    print(f"Unmatched PENDING files remaining: {len(no_matches)}")

    if not args.dry_run and merged_count:
        print(f"\nArchived to: {archive_dir}")


if __name__ == '__main__':
    main()