From 5ab9dd8ea2e7ada2d87dff11d5a262884e077f17 Mon Sep 17 00:00:00 2001 From: kempersc Date: Fri, 9 Jan 2026 14:51:57 +0100 Subject: [PATCH] docs(person_pid): add implementation guidelines and governance docs Add final two chapters of the Person PID (PPID) design document: - 08_implementation_guidelines.md: Database architecture, API design, data ingestion pipeline, GHCID integration, security, performance, technology stack, deployment, and monitoring specifications - 09_governance_and_sustainability.md: Data governance policies, quality assurance, sustainability planning, community engagement, legal considerations, and long-term maintenance strategies --- .../08_implementation_guidelines.md | 2447 +++++++++++++++++ .../09_governance_and_sustainability.md | 1009 +++++++ 2 files changed, 3456 insertions(+) create mode 100644 docs/plan/person_pid/08_implementation_guidelines.md create mode 100644 docs/plan/person_pid/09_governance_and_sustainability.md diff --git a/docs/plan/person_pid/08_implementation_guidelines.md b/docs/plan/person_pid/08_implementation_guidelines.md new file mode 100644 index 0000000000..d2d681716a --- /dev/null +++ b/docs/plan/person_pid/08_implementation_guidelines.md @@ -0,0 +1,2447 @@ +# Implementation Guidelines + +**Version**: 0.1.0 +**Last Updated**: 2025-01-09 +**Related**: [Identifier Structure](./05_identifier_structure_design.md) | [Claims and Provenance](./07_claims_and_provenance.md) + +--- + +## 1. Overview + +This document provides comprehensive technical specifications for implementing the PPID system: + +- Database architecture (PostgreSQL + RDF triple store) +- API design (REST + GraphQL) +- Data ingestion pipeline +- GHCID integration patterns +- Security and access control +- Performance requirements +- Technology stack recommendations +- Deployment architecture +- Monitoring and observability + +--- + +## 2. 
Architecture Overview + +### 2.1 System Components + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PPID System │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Ingestion │────▶│ Processing │────▶│ Storage │ │ +│ │ Layer │ │ Layer │ │ Layer │ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Web Scrapers │ │ Entity Res. │ │ PostgreSQL │ │ +│ │ API Clients │ │ NER/NLP │ │ (Relational) │ │ +│ │ File Import │ │ Validation │ ├──────────────┤ │ +│ └──────────────┘ └──────────────┘ │ Apache Jena │ │ +│ │ (RDF/SPARQL) │ │ +│ ├──────────────┤ │ +│ │ Redis │ │ +│ │ (Cache) │ │ +│ └──────────────┘ │ +│ │ │ +│ ┌─────────────────────┴─────────────────────┐ │ +│ ▼ ▼ │ +│ ┌──────────────┐ ┌──────────┐│ +│ │ REST API │ │ GraphQL ││ +│ │ (FastAPI) │ │ (Ariadne││ +│ └──────────────┘ └──────────┘│ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### 2.2 Technology Stack + +| Layer | Technology | Purpose | +|-------|------------|---------| +| **API Framework** | FastAPI (Python 3.11+) | REST API, async support | +| **GraphQL** | Ariadne | GraphQL endpoint | +| **Relational DB** | PostgreSQL 16 | Primary data store | +| **Triple Store** | Apache Jena Fuseki | RDF/SPARQL queries | +| **Cache** | Redis 7 | Session, rate limiting, caching | +| **Queue** | Apache Kafka | Async processing pipeline | +| **Search** | Elasticsearch 8 | Full-text search | +| **Object Storage** | MinIO / S3 | HTML archives, files | +| **Container** | Docker + Kubernetes | Deployment | +| **Monitoring** | Prometheus + Grafana | Metrics and alerting | + +--- + +## 3. 
Database Schema + +### 3.1 PostgreSQL Schema + +```sql +-- Enable required extensions +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; +CREATE EXTENSION IF NOT EXISTS "pg_trgm"; -- For fuzzy matching + +-- Enum types +CREATE TYPE ppid_type AS ENUM ('POID', 'PRID'); +CREATE TYPE claim_status AS ENUM ('active', 'superseded', 'retracted'); +CREATE TYPE source_type AS ENUM ( + 'official_registry', + 'institutional_website', + 'professional_network', + 'social_media', + 'news_article', + 'academic_publication', + 'user_submitted', + 'inferred' +); + +-- Person Observations (POID) +CREATE TABLE person_observations ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + poid VARCHAR(24) UNIQUE NOT NULL, -- POID-xxxx-xxxx-xxxx-xxxx + + -- Source metadata + source_url TEXT NOT NULL, + source_type source_type NOT NULL, + retrieved_at TIMESTAMPTZ NOT NULL, + content_hash VARCHAR(64) NOT NULL, -- SHA-256 + html_archive_path TEXT, + + -- Extracted name components (PNV-compatible) + literal_name TEXT, + given_name TEXT, + surname TEXT, + surname_prefix TEXT, -- van, de, etc. + patronymic TEXT, + generation_suffix TEXT, -- Jr., III, etc. 
+ + -- Metadata + extraction_agent VARCHAR(100), + extraction_confidence DECIMAL(3,2), + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW(), + + -- Indexes + CONSTRAINT valid_poid CHECK (poid ~ '^POID-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{3}[0-9a-fxX]$') +); + +-- Person Reconstructions (PRID) +CREATE TABLE person_reconstructions ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + prid VARCHAR(24) UNIQUE NOT NULL, -- PRID-xxxx-xxxx-xxxx-xxxx + + -- Canonical name (resolved from observations) + canonical_name TEXT NOT NULL, + given_name TEXT, + surname TEXT, + surname_prefix TEXT, + + -- Curation metadata + curator_id UUID REFERENCES users(id), + curation_method VARCHAR(50), -- 'manual', 'algorithmic', 'hybrid' + confidence_score DECIMAL(3,2), + + -- Versioning + version INTEGER DEFAULT 1, + previous_version_id UUID REFERENCES person_reconstructions(id), + + -- Timestamps + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW(), + + CONSTRAINT valid_prid CHECK (prid ~ '^PRID-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{3}[0-9a-fxX]$') +); + +-- Link table: PRID derives from POIDs +CREATE TABLE reconstruction_observations ( + prid_id UUID REFERENCES person_reconstructions(id) ON DELETE CASCADE, + poid_id UUID REFERENCES person_observations(id) ON DELETE CASCADE, + linked_at TIMESTAMPTZ DEFAULT NOW(), + linked_by UUID REFERENCES users(id), + link_confidence DECIMAL(3,2), + PRIMARY KEY (prid_id, poid_id) +); + +-- Claims (assertions with provenance) +CREATE TABLE claims ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + claim_id VARCHAR(50) UNIQUE NOT NULL, + + -- Subject (what this claim is about) + poid_id UUID REFERENCES person_observations(id), + + -- Claim content + claim_type VARCHAR(50) NOT NULL, -- 'job_title', 'employer', 'email', etc. 
+ claim_value TEXT NOT NULL, + + -- Provenance (MANDATORY per Rule 6) + source_url TEXT NOT NULL, + retrieved_on TIMESTAMPTZ NOT NULL, + xpath TEXT NOT NULL, + html_file TEXT NOT NULL, + xpath_match_score DECIMAL(3,2) NOT NULL, + content_hash VARCHAR(64), + + -- Quality + confidence DECIMAL(3,2), + extraction_agent VARCHAR(100), + status claim_status DEFAULT 'active', + + -- Relationships + supersedes_id UUID REFERENCES claims(id), + + -- Timestamps + created_at TIMESTAMPTZ DEFAULT NOW(), + verified_at TIMESTAMPTZ, + + -- Index for claim lookups + CONSTRAINT valid_xpath_score CHECK (xpath_match_score >= 0 AND xpath_match_score <= 1) +); + +-- Claim relationships (supports, conflicts) +CREATE TABLE claim_relationships ( + claim_a_id UUID REFERENCES claims(id) ON DELETE CASCADE, + claim_b_id UUID REFERENCES claims(id) ON DELETE CASCADE, + relationship_type VARCHAR(20) NOT NULL, -- 'supports', 'conflicts_with' + notes TEXT, + created_at TIMESTAMPTZ DEFAULT NOW(), + PRIMARY KEY (claim_a_id, claim_b_id, relationship_type) +); + +-- External identifiers (ORCID, ISNI, VIAF, Wikidata) +CREATE TABLE external_identifiers ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + prid_id UUID REFERENCES person_reconstructions(id) ON DELETE CASCADE, + identifier_scheme VARCHAR(20) NOT NULL, -- 'orcid', 'isni', 'viaf', 'wikidata' + identifier_value VARCHAR(100) NOT NULL, + verified BOOLEAN DEFAULT FALSE, + verified_at TIMESTAMPTZ, + source_url TEXT, + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (prid_id, identifier_scheme, identifier_value) +); + +-- GHCID links (person to heritage institution) +CREATE TABLE ghcid_affiliations ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + prid_id UUID REFERENCES person_reconstructions(id) ON DELETE CASCADE, + ghcid VARCHAR(50) NOT NULL, -- e.g., NL-NH-HAA-A-NHA + + -- Affiliation details + role_title TEXT, + department TEXT, + affiliation_start DATE, + affiliation_end DATE, + is_current BOOLEAN DEFAULT TRUE, + + -- Provenance + 
source_poid_id UUID REFERENCES person_observations(id), + confidence DECIMAL(3,2), + + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); + +-- Indexes for performance +CREATE INDEX idx_observations_source_url ON person_observations(source_url); +CREATE INDEX idx_observations_content_hash ON person_observations(content_hash); +CREATE INDEX idx_observations_name_trgm ON person_observations + USING gin (literal_name gin_trgm_ops); + +CREATE INDEX idx_reconstructions_name_trgm ON person_reconstructions + USING gin (canonical_name gin_trgm_ops); + +CREATE INDEX idx_claims_type ON claims(claim_type); +CREATE INDEX idx_claims_poid ON claims(poid_id); +CREATE INDEX idx_claims_status ON claims(status); + +CREATE INDEX idx_ghcid_affiliations_ghcid ON ghcid_affiliations(ghcid); +CREATE INDEX idx_ghcid_affiliations_current ON ghcid_affiliations(prid_id) + WHERE is_current = TRUE; + +-- Full-text search +CREATE INDEX idx_observations_fts ON person_observations + USING gin (to_tsvector('english', literal_name)); +CREATE INDEX idx_reconstructions_fts ON person_reconstructions + USING gin (to_tsvector('english', canonical_name)); +``` + +### 3.2 RDF Triple Store Schema + +```turtle +@prefix ppid: . +@prefix ppidv: . +@prefix ppidt: . +@prefix picom: . +@prefix pnv: . +@prefix prov: . +@prefix schema: . +@prefix xsd: . + +# Example Person Observation +ppid:POID-7a3b-c4d5-e6f7-890X a ppidt:PersonObservation, picom:PersonObservation ; + ppidv:poid "POID-7a3b-c4d5-e6f7-890X" ; + + # Name (PNV structured) + pnv:hasName [ + a pnv:PersonName ; + pnv:literalName "Jan van den Berg" ; + pnv:givenName "Jan" ; + pnv:surnamePrefix "van den" ; + pnv:baseSurname "Berg" + ] ; + + # Provenance + prov:wasDerivedFrom ; + prov:wasGeneratedBy ppid:extraction-activity-001 ; + prov:generatedAtTime "2025-01-09T14:30:00Z"^^xsd:dateTime ; + + # Claims + ppidv:hasClaim ppid:claim-001, ppid:claim-002, ppid:claim-003 . 
+ +# Example Person Reconstruction +ppid:PRID-1234-5678-90ab-cde5 a ppidt:PersonReconstruction, picom:PersonReconstruction ; + ppidv:prid "PRID-1234-5678-90ab-cde5" ; + + # Canonical name + schema:name "Jan van den Berg" ; + pnv:hasName [ + a pnv:PersonName ; + pnv:literalName "Jan van den Berg" ; + pnv:givenName "Jan" ; + pnv:surnamePrefix "van den" ; + pnv:baseSurname "Berg" + ] ; + + # Derived from observations + prov:wasDerivedFrom ppid:POID-7a3b-c4d5-e6f7-890X, + ppid:POID-8c4d-e5f6-g7h8-901Y ; + + # GHCID affiliation + ppidv:affiliatedWith ; + + # External identifiers + ppidv:orcid "0000-0002-1234-5678" ; + owl:sameAs . +``` + +--- + +## 4. API Design + +### 4.1 REST API Endpoints + +```yaml +openapi: 3.1.0 +info: + title: PPID API + version: 1.0.0 + description: Person Persistent Identifier API + +servers: + - url: https://api.ppid.org/v1 + +paths: + # Person Observations + /observations: + post: + summary: Create new person observation + operationId: createObservation + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/CreateObservationRequest' + responses: + '201': + description: Observation created + content: + application/json: + schema: + $ref: '#/components/schemas/PersonObservation' + + get: + summary: Search observations + operationId: searchObservations + parameters: + - name: name + in: query + schema: + type: string + - name: source_url + in: query + schema: + type: string + - name: limit + in: query + schema: + type: integer + default: 20 + - name: offset + in: query + schema: + type: integer + default: 0 + responses: + '200': + description: List of observations + content: + application/json: + schema: + $ref: '#/components/schemas/ObservationList' + + /observations/{poid}: + get: + summary: Get observation by POID + operationId: getObservation + parameters: + - name: poid + in: path + required: true + schema: + type: string + pattern: 
'^POID-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{3}[0-9a-fxX]$' + responses: + '200': + description: Person observation + content: + application/json: + schema: + $ref: '#/components/schemas/PersonObservation' + text/turtle: + schema: + type: string + application/ld+json: + schema: + type: object + + /observations/{poid}/claims: + get: + summary: Get claims for observation + operationId: getObservationClaims + parameters: + - name: poid + in: path + required: true + schema: + type: string + responses: + '200': + description: List of claims + content: + application/json: + schema: + $ref: '#/components/schemas/ClaimList' + + # Person Reconstructions + /reconstructions: + post: + summary: Create person reconstruction from observations + operationId: createReconstruction + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/CreateReconstructionRequest' + responses: + '201': + description: Reconstruction created + content: + application/json: + schema: + $ref: '#/components/schemas/PersonReconstruction' + + /reconstructions/{prid}: + get: + summary: Get reconstruction by PRID + operationId: getReconstruction + parameters: + - name: prid + in: path + required: true + schema: + type: string + responses: + '200': + description: Person reconstruction + content: + application/json: + schema: + $ref: '#/components/schemas/PersonReconstruction' + + /reconstructions/{prid}/observations: + get: + summary: Get observations linked to reconstruction + operationId: getReconstructionObservations + responses: + '200': + description: Linked observations + + /reconstructions/{prid}/history: + get: + summary: Get version history + operationId: getReconstructionHistory + responses: + '200': + description: Version history + + # Validation + /validate/{ppid}: + get: + summary: Validate PPID format and checksum + operationId: validatePpid + responses: + '200': + description: Validation result + content: + application/json: + schema: + 
type: object + properties: + valid: + type: boolean + type: + type: string + enum: [POID, PRID] + exists: + type: boolean + + # Search + /search: + get: + summary: Full-text search across all records + operationId: search + parameters: + - name: q + in: query + required: true + schema: + type: string + - name: type + in: query + schema: + type: string + enum: [observation, reconstruction, all] + default: all + responses: + '200': + description: Search results + + # Entity Resolution + /resolve: + post: + summary: Find matching records for input data + operationId: resolveEntity + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/EntityResolutionRequest' + responses: + '200': + description: Resolution candidates + content: + application/json: + schema: + $ref: '#/components/schemas/ResolutionCandidates' + +components: + schemas: + CreateObservationRequest: + type: object + required: + - source_url + - retrieved_at + - claims + properties: + source_url: + type: string + format: uri + source_type: + type: string + enum: [official_registry, institutional_website, professional_network, social_media] + retrieved_at: + type: string + format: date-time + content_hash: + type: string + html_archive_path: + type: string + extraction_agent: + type: string + claims: + type: array + items: + $ref: '#/components/schemas/ClaimInput' + + ClaimInput: + type: object + required: + - claim_type + - claim_value + - xpath + - xpath_match_score + properties: + claim_type: + type: string + claim_value: + type: string + xpath: + type: string + xpath_match_score: + type: number + minimum: 0 + maximum: 1 + confidence: + type: number + + PersonObservation: + type: object + properties: + poid: + type: string + source_url: + type: string + literal_name: + type: string + claims: + type: array + items: + $ref: '#/components/schemas/Claim' + created_at: + type: string + format: date-time + + CreateReconstructionRequest: + type: object + required: + - 
observation_ids + properties: + observation_ids: + type: array + items: + type: string + minItems: 1 + canonical_name: + type: string + external_identifiers: + type: object + + PersonReconstruction: + type: object + properties: + prid: + type: string + canonical_name: + type: string + observations: + type: array + items: + type: string + external_identifiers: + type: object + ghcid_affiliations: + type: array + items: + $ref: '#/components/schemas/GhcidAffiliation' + + GhcidAffiliation: + type: object + properties: + ghcid: + type: string + role_title: + type: string + is_current: + type: boolean + + securitySchemes: + bearerAuth: + type: http + scheme: bearer + bearerFormat: JWT + apiKey: + type: apiKey + in: header + name: X-API-Key + +security: + - bearerAuth: [] + - apiKey: [] +``` + +### 4.2 GraphQL Schema + +```graphql +type Query { + # Observations + observation(poid: ID!): PersonObservation + observations( + name: String + sourceUrl: String + limit: Int = 20 + offset: Int = 0 + ): ObservationConnection! + + # Reconstructions + reconstruction(prid: ID!): PersonReconstruction + reconstructions( + name: String + ghcid: String + limit: Int = 20 + offset: Int = 0 + ): ReconstructionConnection! + + # Search + search(query: String!, type: SearchType = ALL): SearchResults! + + # Validation + validatePpid(ppid: String!): ValidationResult! + + # Entity Resolution + resolveEntity(input: EntityResolutionInput!): [ResolutionCandidate!]! +} + +type Mutation { + # Create observation + createObservation(input: CreateObservationInput!): PersonObservation! + + # Create reconstruction + createReconstruction(input: CreateReconstructionInput!): PersonReconstruction! + + # Link observation to reconstruction + linkObservation(prid: ID!, poid: ID!): PersonReconstruction! + + # Update reconstruction + updateReconstruction(prid: ID!, input: UpdateReconstructionInput!): PersonReconstruction! + + # Add external identifier + addExternalIdentifier( + prid: ID! 
+ scheme: IdentifierScheme! + value: String! + ): PersonReconstruction! + + # Add GHCID affiliation + addGhcidAffiliation( + prid: ID! + ghcid: String! + roleTitle: String + isCurrent: Boolean = true + ): PersonReconstruction! +} + +type PersonObservation { + poid: ID! + sourceUrl: String! + sourceType: SourceType! + retrievedAt: DateTime! + contentHash: String + htmlArchivePath: String + + # Name components + literalName: String + givenName: String + surname: String + surnamePrefix: String + + # Related data + claims: [Claim!]! + linkedReconstructions: [PersonReconstruction!]! + + # Metadata + extractionAgent: String + extractionConfidence: Float + createdAt: DateTime! +} + +type PersonReconstruction { + prid: ID! + canonicalName: String! + givenName: String + surname: String + surnamePrefix: String + + # Linked observations + observations: [PersonObservation!]! + + # External identifiers + orcid: String + isni: String + viaf: String + wikidata: String + externalIdentifiers: [ExternalIdentifier!]! + + # GHCID affiliations + ghcidAffiliations: [GhcidAffiliation!]! + currentAffiliations: [GhcidAffiliation!]! + + # Versioning + version: Int! + previousVersion: PersonReconstruction + history: [PersonReconstruction!]! + + # Curation + curator: User + curationMethod: CurationMethod + confidenceScore: Float + + createdAt: DateTime! + updatedAt: DateTime! +} + +type Claim { + id: ID! + claimType: ClaimType! + claimValue: String! + + # Provenance (MANDATORY) + sourceUrl: String! + retrievedOn: DateTime! + xpath: String! + htmlFile: String! + xpathMatchScore: Float! + + # Quality + confidence: Float + extractionAgent: String + status: ClaimStatus! + + # Relationships + supports: [Claim!]! + conflictsWith: [Claim!]! + supersedes: Claim + + createdAt: DateTime! +} + +type GhcidAffiliation { + ghcid: String! + institution: HeritageCustodian # Resolved from GHCID + roleTitle: String + department: String + startDate: Date + endDate: Date + isCurrent: Boolean! 
+ confidence: Float +} + +type HeritageCustodian { + ghcid: String! + name: String! + institutionType: String! + city: String + country: String +} + +type ExternalIdentifier { + scheme: IdentifierScheme! + value: String! + verified: Boolean! + verifiedAt: DateTime +} + +enum SourceType { + OFFICIAL_REGISTRY + INSTITUTIONAL_WEBSITE + PROFESSIONAL_NETWORK + SOCIAL_MEDIA + NEWS_ARTICLE + ACADEMIC_PUBLICATION + USER_SUBMITTED + INFERRED +} + +enum ClaimType { + FULL_NAME + GIVEN_NAME + FAMILY_NAME + JOB_TITLE + EMPLOYER + EMPLOYER_GHCID + EMAIL + LINKEDIN_URL + ORCID + BIRTH_DATE + EDUCATION +} + +enum ClaimStatus { + ACTIVE + SUPERSEDED + RETRACTED +} + +enum IdentifierScheme { + ORCID + ISNI + VIAF + WIKIDATA + LOC_NAF +} + +enum CurationMethod { + MANUAL + ALGORITHMIC + HYBRID +} + +enum SearchType { + OBSERVATION + RECONSTRUCTION + ALL +} + +input CreateObservationInput { + sourceUrl: String! + sourceType: SourceType! + retrievedAt: DateTime! + contentHash: String + htmlArchivePath: String + extractionAgent: String + claims: [ClaimInput!]! +} + +input ClaimInput { + claimType: ClaimType! + claimValue: String! + xpath: String! + xpathMatchScore: Float! + confidence: Float +} + +input CreateReconstructionInput { + observationIds: [ID!]! + canonicalName: String + externalIdentifiers: ExternalIdentifiersInput +} + +input EntityResolutionInput { + name: String! + employer: String + jobTitle: String + email: String + linkedinUrl: String +} + +type ResolutionCandidate { + reconstruction: PersonReconstruction + observation: PersonObservation + matchScore: Float! + matchFactors: [MatchFactor!]! +} + +type MatchFactor { + field: String! + score: Float! + method: String! 
+} +``` + +### 4.3 FastAPI Implementation + +```python +from fastapi import FastAPI, HTTPException, Depends, Query +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, Field +from typing import Optional +from datetime import datetime +import uuid + +app = FastAPI( + title="PPID API", + description="Person Persistent Identifier API", + version="1.0.0" +) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +# --- Pydantic Models --- + +class ClaimInput(BaseModel): + claim_type: str + claim_value: str + xpath: str + xpath_match_score: float = Field(ge=0, le=1) + confidence: Optional[float] = Field(None, ge=0, le=1) + + +class CreateObservationRequest(BaseModel): + source_url: str + source_type: str = "institutional_website" + retrieved_at: datetime + content_hash: Optional[str] = None + html_archive_path: Optional[str] = None + extraction_agent: Optional[str] = None + claims: list[ClaimInput] + + +class PersonObservationResponse(BaseModel): + poid: str + source_url: str + source_type: str + retrieved_at: datetime + literal_name: Optional[str] = None + claims: list[dict] + created_at: datetime + + +class CreateReconstructionRequest(BaseModel): + observation_ids: list[str] + canonical_name: Optional[str] = None + external_identifiers: Optional[dict] = None + + +class ValidationResult(BaseModel): + valid: bool + ppid_type: Optional[str] = None + exists: Optional[bool] = None + error: Optional[str] = None + + +# --- Dependencies --- + +async def get_db(): + """Database connection dependency.""" + # In production, use connection pool + pass + + +async def get_current_user(api_key: str = Depends(oauth2_scheme)): + """Authenticate user from API key or JWT.""" + pass + + +# --- Endpoints --- + +@app.post("/api/v1/observations", response_model=PersonObservationResponse) +async def create_observation( + request: CreateObservationRequest, + db = 
Depends(get_db) +): + """ + Create a new Person Observation from extracted data. + + The POID is generated deterministically from source metadata. + """ + from ppid.identifiers import generate_poid + + # Generate deterministic POID + poid = generate_poid( + source_url=request.source_url, + retrieval_timestamp=request.retrieved_at.isoformat(), + content_hash=request.content_hash or "" + ) + + # Check for existing observation with same POID + existing = await db.get_observation(poid) + if existing: + return existing + + # Extract name from claims + literal_name = None + for claim in request.claims: + if claim.claim_type == "full_name": + literal_name = claim.claim_value + break + + # Create observation record + observation = await db.create_observation( + poid=poid, + source_url=request.source_url, + source_type=request.source_type, + retrieved_at=request.retrieved_at, + content_hash=request.content_hash, + html_archive_path=request.html_archive_path, + literal_name=literal_name, + extraction_agent=request.extraction_agent, + claims=[c.dict() for c in request.claims] + ) + + return observation + + +@app.get("/api/v1/observations/{poid}", response_model=PersonObservationResponse) +async def get_observation(poid: str, db = Depends(get_db)): + """Get Person Observation by POID.""" + from ppid.identifiers import validate_ppid + + is_valid, error = validate_ppid(poid) + if not is_valid: + raise HTTPException(status_code=400, detail=f"Invalid POID: {error}") + + observation = await db.get_observation(poid) + if not observation: + raise HTTPException(status_code=404, detail="Observation not found") + + return observation + + +@app.get("/api/v1/validate/{ppid}", response_model=ValidationResult) +async def validate_ppid_endpoint(ppid: str, db = Depends(get_db)): + """Validate PPID format, checksum, and existence.""" + from ppid.identifiers import validate_ppid_full + + is_valid, error = validate_ppid_full(ppid) + + if not is_valid: + return ValidationResult(valid=False, 
error=error) + + ppid_type = "POID" if ppid.startswith("POID") else "PRID" + + # Check existence + if ppid_type == "POID": + exists = await db.observation_exists(ppid) + else: + exists = await db.reconstruction_exists(ppid) + + return ValidationResult( + valid=True, + ppid_type=ppid_type, + exists=exists + ) + + +@app.post("/api/v1/reconstructions") +async def create_reconstruction( + request: CreateReconstructionRequest, + db = Depends(get_db), + user = Depends(get_current_user) +): + """Create Person Reconstruction from linked observations.""" + from ppid.identifiers import generate_prid + + # Validate all POIDs exist + for poid in request.observation_ids: + if not await db.observation_exists(poid): + raise HTTPException( + status_code=400, + detail=f"Observation not found: {poid}" + ) + + # Generate deterministic PRID + prid = generate_prid( + observation_ids=request.observation_ids, + curator_id=str(user.id), + timestamp=datetime.utcnow().isoformat() + ) + + # Determine canonical name + if request.canonical_name: + canonical_name = request.canonical_name + else: + # Use name from highest-confidence observation + observations = await db.get_observations(request.observation_ids) + canonical_name = max( + observations, + key=lambda o: o.extraction_confidence or 0 + ).literal_name + + # Create reconstruction + reconstruction = await db.create_reconstruction( + prid=prid, + canonical_name=canonical_name, + observation_ids=request.observation_ids, + curator_id=user.id, + external_identifiers=request.external_identifiers + ) + + return reconstruction + + +@app.get("/api/v1/search") +async def search( + q: str = Query(..., min_length=2), + type: str = Query("all", regex="^(observation|reconstruction|all)$"), + limit: int = Query(20, ge=1, le=100), + offset: int = Query(0, ge=0), + db = Depends(get_db) +): + """Full-text search across observations and reconstructions.""" + results = await db.search( + query=q, + search_type=type, + limit=limit, + offset=offset + ) + 
return results + + +@app.post("/api/v1/resolve") +async def resolve_entity( + name: str, + employer: Optional[str] = None, + job_title: Optional[str] = None, + email: Optional[str] = None, + db = Depends(get_db) +): + """ + Entity resolution: find matching records for input data. + + Returns ranked candidates with match scores. + """ + from ppid.entity_resolution import find_candidates + + candidates = await find_candidates( + db=db, + name=name, + employer=employer, + job_title=job_title, + email=email + ) + + return { + "candidates": candidates, + "query": { + "name": name, + "employer": employer, + "job_title": job_title, + "email": email + } + } +``` + +--- + +## 5. Data Ingestion Pipeline + +### 5.1 Pipeline Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ DATA INGESTION PIPELINE │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +│ │ Source │───▶│ Extract │───▶│ Transform│───▶│ Load │ │ +│ │ Fetch │ │ (NER) │ │ (Validate) │ (Store) │ │ +│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ +│ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ │ +│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +│ │ Archive │ │ Claims │ │ POID │ │ Postgres│ │ +│ │ HTML │ │ XPath │ │ Generate│ │ + RDF │ │ +│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### 5.2 Pipeline Implementation + +```python +from dataclasses import dataclass +from datetime import datetime +from typing import AsyncIterator +import hashlib +import asyncio +from kafka import KafkaProducer, KafkaConsumer + +@dataclass +class SourceDocument: + url: str + html_content: str + retrieved_at: datetime + content_hash: str + archive_path: str + + +@dataclass +class ExtractedClaim: + claim_type: str + claim_value: str + xpath: str + xpath_match_score: float + confidence: float + + +@dataclass +class PersonObservationData: + source: 
SourceDocument + claims: list[ExtractedClaim] + poid: str + + +class IngestionPipeline: + """ + Main data ingestion pipeline for PPID. + + Stages: + 1. Fetch: Retrieve web pages, archive HTML + 2. Extract: NER/NLP to identify person data, generate claims with XPath + 3. Transform: Validate, generate POID, structure data + 4. Load: Store in PostgreSQL and RDF triple store + """ + + def __init__( + self, + db_pool, + rdf_store, + kafka_producer: KafkaProducer, + archive_storage, + llm_extractor + ): + self.db = db_pool + self.rdf = rdf_store + self.kafka = kafka_producer + self.archive = archive_storage + self.extractor = llm_extractor + + async def process_url(self, url: str) -> list[PersonObservationData]: + """ + Full pipeline for a single URL. + + Returns list of PersonObservations extracted from the page. + """ + # Stage 1: Fetch and archive + source = await self._fetch_and_archive(url) + + # Stage 2: Extract claims with XPath + observations = await self._extract_observations(source) + + # Stage 3: Transform and validate + validated = await self._transform_and_validate(observations) + + # Stage 4: Load to databases + await self._load_observations(validated) + + return validated + + async def _fetch_and_archive(self, url: str) -> SourceDocument: + """Fetch URL and archive HTML.""" + from playwright.async_api import async_playwright + + async with async_playwright() as p: + browser = await p.chromium.launch() + page = await browser.new_page() + + await page.goto(url, wait_until='networkidle') + html_content = await page.content() + + await browser.close() + + # Calculate content hash + content_hash = hashlib.sha256(html_content.encode()).hexdigest() + + # Archive HTML + retrieved_at = datetime.utcnow() + archive_path = await self.archive.store( + url=url, + content=html_content, + timestamp=retrieved_at + ) + + return SourceDocument( + url=url, + html_content=html_content, + retrieved_at=retrieved_at, + content_hash=content_hash, + archive_path=archive_path + ) + 
+ async def _extract_observations( + self, + source: SourceDocument + ) -> list[tuple[list[ExtractedClaim], str]]: + """ + Extract person observations with XPath provenance. + + Uses LLM for extraction, then validates XPath. + """ + from lxml import html + + # Parse HTML + tree = html.fromstring(source.html_content) + + # Use LLM to extract person data with XPath + extraction_result = await self.extractor.extract_persons( + html_content=source.html_content, + source_url=source.url + ) + + observations = [] + + for person in extraction_result.persons: + validated_claims = [] + + for claim in person.claims: + # Verify XPath points to expected value + try: + elements = tree.xpath(claim.xpath) + if elements: + actual_value = elements[0].text_content().strip() + + # Calculate match score + if actual_value == claim.claim_value: + match_score = 1.0 + else: + from difflib import SequenceMatcher + match_score = SequenceMatcher( + None, actual_value, claim.claim_value + ).ratio() + + if match_score >= 0.8: # Accept if 80%+ match + validated_claims.append(ExtractedClaim( + claim_type=claim.claim_type, + claim_value=claim.claim_value, + xpath=claim.xpath, + xpath_match_score=match_score, + confidence=claim.confidence * match_score + )) + except Exception as e: + # Skip claims with invalid XPath + continue + + if validated_claims: + # Get literal name from claims + literal_name = next( + (c.claim_value for c in validated_claims if c.claim_type == 'full_name'), + None + ) + observations.append((validated_claims, literal_name)) + + return observations + + async def _transform_and_validate( + self, + observations: list[tuple[list[ExtractedClaim], str]], + source: SourceDocument + ) -> list[PersonObservationData]: + """Transform extracted data and generate POIDs.""" + from ppid.identifiers import generate_poid + + results = [] + + for claims, literal_name in observations: + # Generate deterministic POID + claims_hash = hashlib.sha256( + str(sorted([c.claim_value for c in 
claims])).encode() + ).hexdigest() + + poid = generate_poid( + source_url=source.url, + retrieval_timestamp=source.retrieved_at.isoformat(), + content_hash=f"{source.content_hash}:{claims_hash}" + ) + + results.append(PersonObservationData( + source=source, + claims=claims, + poid=poid + )) + + return results + + async def _load_observations( + self, + observations: list[PersonObservationData] + ) -> None: + """Load observations to PostgreSQL and RDF store.""" + for obs in observations: + # Check if already exists (idempotent) + existing = await self.db.get_observation(obs.poid) + if existing: + continue + + # Insert to PostgreSQL + await self.db.create_observation( + poid=obs.poid, + source_url=obs.source.url, + source_type='institutional_website', + retrieved_at=obs.source.retrieved_at, + content_hash=obs.source.content_hash, + html_archive_path=obs.source.archive_path, + literal_name=next( + (c.claim_value for c in obs.claims if c.claim_type == 'full_name'), + None + ), + claims=[{ + 'claim_type': c.claim_type, + 'claim_value': c.claim_value, + 'xpath': c.xpath, + 'xpath_match_score': c.xpath_match_score, + 'confidence': c.confidence + } for c in obs.claims] + ) + + # Insert to RDF triple store + await self._insert_rdf(obs) + + # Publish to Kafka for downstream processing + self.kafka.send('ppid.observations.created', { + 'poid': obs.poid, + 'source_url': obs.source.url, + 'timestamp': obs.source.retrieved_at.isoformat() + }) + + async def _insert_rdf(self, obs: PersonObservationData) -> None: + """Insert observation as RDF triples.""" + from rdflib import Graph, Namespace, Literal, URIRef + from rdflib.namespace import RDF, XSD + + PPID = Namespace("https://ppid.org/") + PPIDV = Namespace("https://ppid.org/vocab#") + PROV = Namespace("http://www.w3.org/ns/prov#") + + g = Graph() + + obs_uri = PPID[obs.poid] + + g.add((obs_uri, RDF.type, PPIDV.PersonObservation)) + g.add((obs_uri, PPIDV.poid, Literal(obs.poid))) + g.add((obs_uri, PROV.wasDerivedFrom, 
URIRef(obs.source.url))) + g.add((obs_uri, PROV.generatedAtTime, Literal( + obs.source.retrieved_at.isoformat(), + datatype=XSD.dateTime + ))) + + # Add claims + for i, claim in enumerate(obs.claims): + claim_uri = PPID[f"{obs.poid}/claim/{i}"] + g.add((obs_uri, PPIDV.hasClaim, claim_uri)) + g.add((claim_uri, PPIDV.claimType, Literal(claim.claim_type))) + g.add((claim_uri, PPIDV.claimValue, Literal(claim.claim_value))) + g.add((claim_uri, PPIDV.xpath, Literal(claim.xpath))) + g.add((claim_uri, PPIDV.xpathMatchScore, Literal( + claim.xpath_match_score, datatype=XSD.decimal + ))) + + # Insert to triple store + await self.rdf.insert(g) +``` + +--- + +## 6. GHCID Integration + +### 6.1 Linking Persons to Institutions + +```python +from dataclasses import dataclass +from typing import Optional +from datetime import date + +@dataclass +class GhcidAffiliation: + """ + Link between a person (PRID) and a heritage institution (GHCID). + """ + ghcid: str # e.g., "NL-NH-HAA-A-NHA" + role_title: Optional[str] = None + department: Optional[str] = None + start_date: Optional[date] = None + end_date: Optional[date] = None + is_current: bool = True + source_poid: Optional[str] = None + confidence: float = 0.9 + + +async def link_person_to_institution( + db, + prid: str, + ghcid: str, + role_title: str = None, + source_poid: str = None +) -> GhcidAffiliation: + """ + Create link between person and heritage institution. 
+
+    Args:
+        prid: Person Reconstruction ID
+        ghcid: Global Heritage Custodian ID
+        role_title: Job title at institution
+        source_poid: Observation where affiliation was extracted
+
+    Returns:
+        Created affiliation record
+    """
+    # Validate PRID exists
+    reconstruction = await db.get_reconstruction(prid)
+    if not reconstruction:
+        raise ValueError(f"Reconstruction not found: {prid}")
+
+    # Validate GHCID format
+    if not validate_ghcid_format(ghcid):
+        raise ValueError(f"Invalid GHCID format: {ghcid}")
+
+    # Create affiliation
+    affiliation = await db.create_ghcid_affiliation(
+        prid_id=reconstruction.id,
+        ghcid=ghcid,
+        role_title=role_title,
+        source_poid_id=source_poid,
+        is_current=True
+    )
+
+    return affiliation
+
+
+def validate_ghcid_format(ghcid: str) -> bool:
+    """Validate GHCID format."""
+    import re
+    # Pattern: CC-RR-SSS-T-ABBREV
+    pattern = r'^[A-Z]{2}-[A-Z]{2}-[A-Z]{3}-[A-Z]-[A-Z0-9]+$'
+    return bool(re.match(pattern, ghcid))
+```
+
+### 6.2 RDF Integration
+
+```turtle
+@prefix ppid: <https://ppid.org/> .
+@prefix ppidv: <https://ppid.org/vocab#> .
+@prefix ghcid: <https://ghcid.org/> .
+@prefix org: <http://www.w3.org/ns/org#> .
+@prefix schema: <https://schema.org/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+# Person with GHCID affiliation
+ppid:PRID-1234-5678-90ab-cde5
+    a ppidv:PersonReconstruction ;
+    schema:name "Jan van den Berg" ;
+
+    # Current employment
+    org:memberOf [
+        a org:Membership ;
+        org:organization ghcid:NL-NH-HAA-A-NHA ;
+        org:role [
+            a org:Role ;
+            schema:name "Senior Archivist"
+        ] ;
+        ppidv:isCurrent true ;
+        schema:startDate "2015"^^xsd:gYear
+    ] ;
+
+    # Direct link for simple queries
+    ppidv:affiliatedWith ghcid:NL-NH-HAA-A-NHA .
+
+# The heritage institution (from GHCID system)
+ghcid:NL-NH-HAA-A-NHA
+    a ppidv:HeritageCustodian ;
+    schema:name "Noord-Hollands Archief" ;
+    ppidv:ghcid "NL-NH-HAA-A-NHA" ;
+    schema:address [
+        schema:addressLocality "Haarlem" ;
+        schema:addressCountry "NL"
+    ] .
+```
+
+### 6.3 SPARQL Queries
+
+```sparql
+# Find all persons affiliated with a specific institution
+PREFIX ppid: <https://ppid.org/>
+PREFIX ppidv: <https://ppid.org/vocab#>
+PREFIX ghcid: <https://ghcid.org/>
+PREFIX schema: <https://schema.org/>
+PREFIX org: <http://www.w3.org/ns/org#>
+
+SELECT ?prid ?name ?role ?isCurrent
+WHERE {
+    ?prid a ppidv:PersonReconstruction ;
+          schema:name ?name ;
+          org:memberOf ?membership .
+
+    ?membership org:organization ghcid:NL-NH-HAA-A-NHA ;
+                ppidv:isCurrent ?isCurrent .
+
+    OPTIONAL {
+        ?membership org:role ?roleNode .
+        ?roleNode schema:name ?role .
+    }
+}
+ORDER BY DESC(?isCurrent) ?name
+
+
+# Find all institutions a person has worked at
+SELECT ?ghcid ?institutionName ?role ?startDate ?endDate ?isCurrent
+WHERE {
+    ppid:PRID-1234-5678-90ab-cde5 org:memberOf ?membership .
+
+    ?membership org:organization ?institution ;
+                ppidv:isCurrent ?isCurrent .
+
+    ?institution ppidv:ghcid ?ghcid ;
+                 schema:name ?institutionName .
+
+    OPTIONAL { ?membership schema:startDate ?startDate }
+    OPTIONAL { ?membership schema:endDate ?endDate }
+    OPTIONAL {
+        ?membership org:role ?roleNode .
+        ?roleNode schema:name ?role .
+    }
+}
+ORDER BY DESC(?isCurrent) DESC(?startDate)
+
+
+# Find archivists across all Dutch archives
+SELECT ?prid ?name ?institution ?institutionName
+WHERE {
+    ?prid a ppidv:PersonReconstruction ;
+          schema:name ?name ;
+          org:memberOf ?membership .
+
+    ?membership org:organization ?institution ;
+                ppidv:isCurrent true ;
+                org:role ?roleNode .
+
+    ?roleNode schema:name ?role .
+    FILTER(CONTAINS(LCASE(?role), "archivist"))
+
+    ?institution ppidv:ghcid ?ghcid ;
+                 schema:name ?institutionName .
+    FILTER(STRSTARTS(?ghcid, "NL-"))
+}
+ORDER BY ?institutionName ?name
+```
+
+---
+
+## 7. 
Security and Access Control + +### 7.1 Authentication + +```python +from fastapi import Depends, HTTPException, status +from fastapi.security import OAuth2PasswordBearer, APIKeyHeader +from jose import JWTError, jwt +from passlib.context import CryptContext +from datetime import datetime, timedelta +from typing import Optional + +# Configuration +SECRET_KEY = "your-secret-key" # Use env variable +ALGORITHM = "HS256" +ACCESS_TOKEN_EXPIRE_MINUTES = 30 + +pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") +oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token", auto_error=False) +api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False) + + +class User: + def __init__(self, id: str, email: str, roles: list[str]): + self.id = id + self.email = email + self.roles = roles + + +def create_access_token(data: dict, expires_delta: Optional[timedelta] = None): + """Create JWT access token.""" + to_encode = data.copy() + expire = datetime.utcnow() + (expires_delta or timedelta(minutes=15)) + to_encode.update({"exp": expire}) + return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM) + + +async def get_current_user( + token: Optional[str] = Depends(oauth2_scheme), + api_key: Optional[str] = Depends(api_key_header), + db = Depends(get_db) +) -> User: + """ + Authenticate user via JWT token or API key. 
+ """ + credentials_exception = HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Could not validate credentials", + headers={"WWW-Authenticate": "Bearer"}, + ) + + # Try JWT token first + if token: + try: + payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM]) + user_id: str = payload.get("sub") + if user_id is None: + raise credentials_exception + + user = await db.get_user(user_id) + if user is None: + raise credentials_exception + + return user + except JWTError: + pass + + # Try API key + if api_key: + user = await db.get_user_by_api_key(api_key) + if user: + return user + + raise credentials_exception + + +def require_role(required_roles: list[str]): + """Dependency to require specific roles.""" + async def role_checker(user: User = Depends(get_current_user)): + if not any(role in user.roles for role in required_roles): + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Insufficient permissions" + ) + return user + return role_checker +``` + +### 7.2 Authorization Roles + +| Role | Permissions | +|------|-------------| +| `reader` | Read observations, reconstructions, claims | +| `contributor` | Create observations, add claims | +| `curator` | Create reconstructions, link observations, resolve conflicts | +| `admin` | Manage users, API keys, system configuration | +| `api_client` | Programmatic access via API key | + +### 7.3 Rate Limiting + +```python +from fastapi import Request +import redis +from datetime import datetime + +class RateLimiter: + """ + Token bucket rate limiter using Redis. + """ + + def __init__(self, redis_client: redis.Redis): + self.redis = redis_client + + async def is_allowed( + self, + key: str, + max_requests: int = 100, + window_seconds: int = 60 + ) -> tuple[bool, dict]: + """ + Check if request is allowed under rate limit. 
+ + Returns: + Tuple of (is_allowed, rate_limit_info) + """ + now = datetime.utcnow().timestamp() + window_start = now - window_seconds + + pipe = self.redis.pipeline() + + # Remove old requests + pipe.zremrangebyscore(key, 0, window_start) + + # Count requests in window + pipe.zcard(key) + + # Add current request + pipe.zadd(key, {str(now): now}) + + # Set expiry + pipe.expire(key, window_seconds) + + results = pipe.execute() + request_count = results[1] + + is_allowed = request_count < max_requests + + return is_allowed, { + "limit": max_requests, + "remaining": max(0, max_requests - request_count - 1), + "reset": int(now + window_seconds) + } + + +# Rate limit tiers +RATE_LIMITS = { + "anonymous": {"requests": 60, "window": 60}, + "reader": {"requests": 100, "window": 60}, + "contributor": {"requests": 500, "window": 60}, + "curator": {"requests": 1000, "window": 60}, + "api_client": {"requests": 5000, "window": 60}, +} +``` + +--- + +## 8. Performance Requirements + +### 8.1 SLOs (Service Level Objectives) + +| Metric | Target | Measurement | +|--------|--------|-------------| +| **Availability** | 99.9% | Monthly uptime | +| **API Latency (p50)** | < 50ms | Response time | +| **API Latency (p99)** | < 500ms | Response time | +| **Search Latency** | < 200ms | Full-text search | +| **SPARQL Query** | < 1s | Simple queries | +| **Throughput** | 1000 req/s | Sustained load | + +### 8.2 Scaling Strategy + +```yaml +# Kubernetes HPA (Horizontal Pod Autoscaler) +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: ppid-api +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: ppid-api + minReplicas: 3 + maxReplicas: 20 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 + - type: Pods + pods: + metric: + name: http_requests_per_second + target: + type: AverageValue + 
averageValue: 100 +``` + +### 8.3 Caching Strategy + +```python +import redis +from functools import wraps +import json +import hashlib + +class CacheManager: + """ + Multi-tier caching for PPID. + + Tiers: + 1. L1: In-memory (per-instance) + 2. L2: Redis (shared) + """ + + def __init__(self, redis_client: redis.Redis): + self.redis = redis_client + self.local_cache = {} + + def cache_observation(self, ttl: int = 3600): + """Cache observation lookups.""" + def decorator(func): + @wraps(func) + async def wrapper(poid: str, *args, **kwargs): + cache_key = f"observation:{poid}" + + # Check L1 cache + if cache_key in self.local_cache: + return self.local_cache[cache_key] + + # Check L2 cache + cached = self.redis.get(cache_key) + if cached: + data = json.loads(cached) + self.local_cache[cache_key] = data + return data + + # Fetch from database + result = await func(poid, *args, **kwargs) + + if result: + # Store in both caches + self.redis.setex(cache_key, ttl, json.dumps(result)) + self.local_cache[cache_key] = result + + return result + return wrapper + return decorator + + def cache_search(self, ttl: int = 300): + """Cache search results (shorter TTL).""" + def decorator(func): + @wraps(func) + async def wrapper(query: str, *args, **kwargs): + # Create deterministic cache key from query params + key_data = {"query": query, "args": args, "kwargs": kwargs} + cache_key = f"search:{hashlib.md5(json.dumps(key_data, sort_keys=True).encode()).hexdigest()}" + + cached = self.redis.get(cache_key) + if cached: + return json.loads(cached) + + result = await func(query, *args, **kwargs) + + self.redis.setex(cache_key, ttl, json.dumps(result)) + + return result + return wrapper + return decorator + + def invalidate_observation(self, poid: str): + """Invalidate cache when observation is updated.""" + cache_key = f"observation:{poid}" + self.redis.delete(cache_key) + self.local_cache.pop(cache_key, None) +``` + +--- + +## 9. 
Deployment Architecture + +### 9.1 Kubernetes Deployment + +```yaml +# API Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ppid-api + labels: + app: ppid + component: api +spec: + replicas: 3 + selector: + matchLabels: + app: ppid + component: api + template: + metadata: + labels: + app: ppid + component: api + spec: + containers: + - name: api + image: ppid/api:latest + ports: + - containerPort: 8000 + env: + - name: DATABASE_URL + valueFrom: + secretKeyRef: + name: ppid-secrets + key: database-url + - name: REDIS_URL + valueFrom: + secretKeyRef: + name: ppid-secrets + key: redis-url + - name: JWT_SECRET + valueFrom: + secretKeyRef: + name: ppid-secrets + key: jwt-secret + resources: + requests: + cpu: "250m" + memory: "512Mi" + limits: + cpu: "1000m" + memory: "2Gi" + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /ready + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 5 + +--- +# Service +apiVersion: v1 +kind: Service +metadata: + name: ppid-api +spec: + selector: + app: ppid + component: api + ports: + - port: 80 + targetPort: 8000 + type: ClusterIP + +--- +# Ingress +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: ppid-ingress + annotations: + kubernetes.io/ingress.class: nginx + cert-manager.io/cluster-issuer: letsencrypt-prod +spec: + tls: + - hosts: + - api.ppid.org + secretName: ppid-tls + rules: + - host: api.ppid.org + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: ppid-api + port: + number: 80 +``` + +### 9.2 Docker Compose (Development) + +```yaml +version: '3.8' + +services: + api: + build: . 
+ ports: + - "8000:8000" + environment: + - DATABASE_URL=postgresql://ppid:ppid@postgres:5432/ppid + - REDIS_URL=redis://redis:6379 + - FUSEKI_URL=http://fuseki:3030 + depends_on: + - postgres + - redis + - fuseki + volumes: + - ./src:/app/src + - ./archives:/app/archives + + postgres: + image: postgres:16 + environment: + POSTGRES_USER: ppid + POSTGRES_PASSWORD: ppid + POSTGRES_DB: ppid + volumes: + - postgres_data:/var/lib/postgresql/data + ports: + - "5432:5432" + + redis: + image: redis:7-alpine + ports: + - "6379:6379" + + fuseki: + image: stain/jena-fuseki + environment: + ADMIN_PASSWORD: admin + FUSEKI_DATASET_1: ppid + volumes: + - fuseki_data:/fuseki + ports: + - "3030:3030" + + elasticsearch: + image: elasticsearch:8.11.0 + environment: + - discovery.type=single-node + - xpack.security.enabled=false + volumes: + - es_data:/usr/share/elasticsearch/data + ports: + - "9200:9200" + + kafka: + image: confluentinc/cp-kafka:7.5.0 + environment: + KAFKA_BROKER_ID: 1 + KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + depends_on: + - zookeeper + ports: + - "9092:9092" + + zookeeper: + image: confluentinc/cp-zookeeper:7.5.0 + environment: + ZOOKEEPER_CLIENT_PORT: 2181 + +volumes: + postgres_data: + fuseki_data: + es_data: +``` + +--- + +## 10. 
Monitoring and Observability + +### 10.1 Prometheus Metrics + +```python +from prometheus_client import Counter, Histogram, Gauge +import time + +# Metrics +REQUEST_COUNT = Counter( + 'ppid_requests_total', + 'Total HTTP requests', + ['method', 'endpoint', 'status'] +) + +REQUEST_LATENCY = Histogram( + 'ppid_request_latency_seconds', + 'Request latency in seconds', + ['method', 'endpoint'], + buckets=[.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10] +) + +OBSERVATIONS_CREATED = Counter( + 'ppid_observations_created_total', + 'Total observations created' +) + +RECONSTRUCTIONS_CREATED = Counter( + 'ppid_reconstructions_created_total', + 'Total reconstructions created' +) + +ENTITY_RESOLUTION_LATENCY = Histogram( + 'ppid_entity_resolution_seconds', + 'Entity resolution latency', + buckets=[.1, .25, .5, 1, 2.5, 5, 10, 30] +) + +CACHE_HITS = Counter( + 'ppid_cache_hits_total', + 'Cache hits', + ['cache_type'] +) + +CACHE_MISSES = Counter( + 'ppid_cache_misses_total', + 'Cache misses', + ['cache_type'] +) + +DB_CONNECTIONS = Gauge( + 'ppid_db_connections', + 'Active database connections' +) + + +# Middleware for request metrics +@app.middleware("http") +async def metrics_middleware(request: Request, call_next): + start_time = time.time() + + response = await call_next(request) + + latency = time.time() - start_time + + REQUEST_COUNT.labels( + method=request.method, + endpoint=request.url.path, + status=response.status_code + ).inc() + + REQUEST_LATENCY.labels( + method=request.method, + endpoint=request.url.path + ).observe(latency) + + return response +``` + +### 10.2 Logging + +```python +import logging +import json +from datetime import datetime + +class JSONFormatter(logging.Formatter): + """Structured JSON logging for observability.""" + + def format(self, record): + log_record = { + "timestamp": datetime.utcnow().isoformat(), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + } + + # Add extra fields + if hasattr(record, 
'poid'): + log_record['poid'] = record.poid + if hasattr(record, 'prid'): + log_record['prid'] = record.prid + if hasattr(record, 'request_id'): + log_record['request_id'] = record.request_id + if hasattr(record, 'user_id'): + log_record['user_id'] = record.user_id + if hasattr(record, 'duration_ms'): + log_record['duration_ms'] = record.duration_ms + + if record.exc_info: + log_record['exception'] = self.formatException(record.exc_info) + + return json.dumps(log_record) + + +# Configure logging +def setup_logging(): + handler = logging.StreamHandler() + handler.setFormatter(JSONFormatter()) + + root_logger = logging.getLogger() + root_logger.addHandler(handler) + root_logger.setLevel(logging.INFO) + + # Reduce noise from libraries + logging.getLogger("uvicorn.access").setLevel(logging.WARNING) + logging.getLogger("httpx").setLevel(logging.WARNING) +``` + +### 10.3 Grafana Dashboard (JSON) + +```json +{ + "dashboard": { + "title": "PPID System Overview", + "panels": [ + { + "title": "Request Rate", + "type": "graph", + "targets": [ + { + "expr": "sum(rate(ppid_requests_total[5m])) by (endpoint)" + } + ] + }, + { + "title": "Request Latency (p99)", + "type": "graph", + "targets": [ + { + "expr": "histogram_quantile(0.99, rate(ppid_request_latency_seconds_bucket[5m]))" + } + ] + }, + { + "title": "Observations Created", + "type": "stat", + "targets": [ + { + "expr": "sum(ppid_observations_created_total)" + } + ] + }, + { + "title": "Cache Hit Rate", + "type": "gauge", + "targets": [ + { + "expr": "sum(rate(ppid_cache_hits_total[5m])) / (sum(rate(ppid_cache_hits_total[5m])) + sum(rate(ppid_cache_misses_total[5m])))" + } + ] + }, + { + "title": "Error Rate", + "type": "graph", + "targets": [ + { + "expr": "sum(rate(ppid_requests_total{status=~\"5..\"}[5m])) / sum(rate(ppid_requests_total[5m]))" + } + ] + } + ] + } +} +``` + +--- + +## 11. 
Implementation Checklist + +### 11.1 Phase 1: Core Infrastructure + +- [ ] Set up PostgreSQL database with schema +- [ ] Set up Apache Jena Fuseki for RDF +- [ ] Set up Redis for caching +- [ ] Implement POID/PRID generation +- [ ] Implement checksum validation +- [ ] Create basic REST API endpoints + +### 11.2 Phase 2: Data Ingestion + +- [ ] Build web scraping infrastructure +- [ ] Implement HTML archival +- [ ] Integrate LLM for extraction +- [ ] Implement XPath validation +- [ ] Set up Kafka for async processing +- [ ] Create ingestion pipeline + +### 11.3 Phase 3: Entity Resolution + +- [ ] Implement blocking strategies +- [ ] Implement similarity metrics +- [ ] Build clustering algorithm +- [ ] Create human-in-loop review UI +- [ ] Integrate with reconstruction creation + +### 11.4 Phase 4: GHCID Integration + +- [ ] Implement affiliation linking +- [ ] Add SPARQL queries for institution lookups +- [ ] Create bidirectional navigation +- [ ] Sync with GHCID registry updates + +### 11.5 Phase 5: Production Readiness + +- [ ] Implement authentication/authorization +- [ ] Set up rate limiting +- [ ] Configure monitoring and alerting +- [ ] Create backup and recovery procedures +- [ ] Performance testing and optimization +- [ ] Security audit + +--- + +## 12. 
References + +### Standards +- OAuth 2.0: https://oauth.net/2/ +- OpenAPI 3.1: https://spec.openapis.org/oas/latest.html +- GraphQL: https://graphql.org/learn/ +- SPARQL 1.1: https://www.w3.org/TR/sparql11-query/ + +### Related PPID Documents +- [Identifier Structure Design](./05_identifier_structure_design.md) +- [Entity Resolution Patterns](./06_entity_resolution_patterns.md) +- [Claims and Provenance](./07_claims_and_provenance.md) + +### Technologies +- FastAPI: https://fastapi.tiangolo.com/ +- Apache Jena: https://jena.apache.org/ +- PostgreSQL: https://www.postgresql.org/ +- Kubernetes: https://kubernetes.io/ diff --git a/docs/plan/person_pid/09_governance_and_sustainability.md b/docs/plan/person_pid/09_governance_and_sustainability.md new file mode 100644 index 0000000000..287e8cbd96 --- /dev/null +++ b/docs/plan/person_pid/09_governance_and_sustainability.md @@ -0,0 +1,1009 @@ +# Governance and Sustainability + +**Version**: 0.1.0 +**Last Updated**: 2025-01-09 +**Related**: [Executive Summary](./01_executive_summary.md) | [Implementation Guidelines](./08_implementation_guidelines.md) + +--- + +## 1. Overview + +This document defines the governance framework and sustainability model for PPID: + +- Organizational structure +- Identifier assignment policies +- Data stewardship and quality assurance +- Community governance +- Funding models +- Deprecation and tombstoning +- International coordination +- Dispute resolution + +Long-term sustainability requires clear governance, community trust, and diversified funding. + +--- + +## 2. 
Governance Principles + +### 2.1 Core Principles + +| Principle | Description | +|-----------|-------------| +| **Transparency** | All policies, decisions, and algorithms are public | +| **Neutrality** | PPID serves the heritage community without commercial bias | +| **Persistence** | Identifiers are permanent; "Cool URIs don't change" | +| **Interoperability** | Open standards, no vendor lock-in | +| **Privacy** | Minimal data collection, GDPR compliance | +| **Inclusivity** | Global representation in governance bodies | + +### 2.2 Governance Comparison + +| Aspect | ORCID | ISNI | VIAF | PPID (Proposed) | +|--------|-------|------|------|-----------------| +| **Legal Status** | Non-profit | Non-profit | Consortium | Non-profit foundation | +| **Membership** | Institutional | Agency-based | Library consortium | Heritage institutions | +| **Funding** | Membership fees | Registration fees | In-kind (libraries) | Hybrid (grants + membership) | +| **Decision Making** | Board + community | ISNI-IA board | OCLC-led | Steering committee + working groups | + +--- + +## 3. 
Organizational Structure + +### 3.1 Governance Bodies + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ PPID GOVERNANCE STRUCTURE │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌───────────────────────────────────────────────────────────┐ │ +│ │ STEERING COMMITTEE │ │ +│ │ Strategic direction, policy approval, partnerships │ │ +│ │ Members: 9-15 (heritage institutions, researchers) │ │ +│ │ Meets: Quarterly │ │ +│ └───────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌───────────────────┼───────────────────┐ │ +│ ▼ ▼ ▼ │ +│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ +│ │ TECHNICAL │ │ POLICY │ │ COMMUNITY │ │ +│ │ COMMITTEE │ │ COMMITTEE │ │ COUNCIL │ │ +│ └───────────────┘ └───────────────┘ └───────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ +│ │ Working Groups│ │ Working Groups│ │ Regional │ │ +│ │ - API Design │ │ - Privacy │ │ Chapters │ │ +│ │ - ER Algorithms │ - Assignment │ │ - Europe │ │ +│ │ - Interop │ │ - Deprecation │ │ - Americas │ │ +│ └───────────────┘ └───────────────┘ │ - Asia-Pacific│ │ +│ └───────────────┘ │ +│ │ +│ ┌───────────────────────────────────────────────────────────┐ │ +│ │ OPERATIONAL TEAM │ │ +│ │ Day-to-day operations, infrastructure, support │ │ +│ │ Staff: Executive Director + 5-10 FTE │ │ +│ └───────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### 3.2 Steering Committee + +**Composition** (13 seats): +- 3 seats: Archives (national, regional, specialized) +- 3 seats: Libraries (national, academic, public) +- 3 seats: Museums (art, history, science) +- 2 seats: Research/academia +- 2 seats: Technology/infrastructure providers + +**Responsibilities**: +- Approve strategic direction +- Approve annual budget +- Approve major policy changes +- Approve partnerships and MOUs +- Appoint Executive Director + 
+**Terms**: 3 years, staggered, maximum 2 consecutive terms + +**Voting**: Supermajority (2/3) for policy changes; simple majority for operations + +### 3.3 Technical Committee + +**Composition**: 7-9 members with technical expertise + +**Responsibilities**: +- Maintain technical specifications +- Review and approve API changes +- Oversee entity resolution algorithms +- Ensure interoperability with ORCID, ISNI, VIAF +- Conduct security reviews + +**Meetings**: Monthly + ad-hoc for urgent issues + +### 3.4 Policy Committee + +**Composition**: 7-9 members (legal, ethics, domain experts) + +**Responsibilities**: +- Develop identifier assignment policies +- Manage privacy and data protection +- Handle dispute resolution +- Define deprecation procedures +- Ensure compliance with regulations + +### 3.5 Community Council + +**Composition**: Open to all registered PPID users + +**Responsibilities**: +- Provide feedback on policies and features +- Elect community representatives +- Participate in annual summit +- Propose new features and improvements + +--- + +## 4. Identifier Assignment Policies + +### 4.1 Who Can Create Identifiers? + +| Identifier | Creator | Approval Required | Automation | +|------------|---------|-------------------|------------| +| **POID** | Any registered user | No | Fully automated | +| **PRID** | Curators only | For disputed cases | Semi-automated | + +### 4.2 POID Creation Policy + +**Eligibility**: Any user with verified account can create POIDs. + +**Requirements**: +1. Valid source URL (must be accessible or archived) +2. Minimum one claim with XPath provenance +3. HTML archive stored in PPID infrastructure +4. 
Content hash for verification + +**Rate Limits**: +- Free tier: 100 POIDs/day +- Institutional: 10,000 POIDs/day +- API partners: Negotiated limits + +**Prohibited Sources**: +- Sites with robots.txt exclusion (unless permission obtained) +- Paywalled content (without license) +- Social media (privacy concerns) +- Fake or fabricated pages + +### 4.3 PRID Creation Policy + +**Eligibility**: Curators with verified heritage institution affiliation. + +**Requirements**: +1. Link at least one POID +2. Provide canonical name +3. Document curation decision (manual/algorithmic/hybrid) +4. Accept responsibility for accuracy + +**Curator Certification**: +```python +CURATOR_REQUIREMENTS = { + "heritage_affiliation": True, # Must work at GHCID institution + "training_completed": True, # Online certification course + "probation_period": 30, # Days before full privileges + "initial_review": True, # First 10 PRIDs reviewed by senior curator +} +``` + +**Conflict of Interest**: +- Curators should not create PRIDs for themselves +- Curators should disclose relationships with subjects +- Institutional bias should be documented + +### 4.4 External Identifier Linking + +| External ID | Verification Required | Auto-Link Allowed | +|-------------|----------------------|-------------------| +| ORCID | Yes (via API) | Yes, if verified | +| ISNI | Yes (via lookup) | Yes, if match > 95% | +| VIAF | Yes (via API) | Yes, if verified | +| Wikidata | Manual review | No | +| LinkedIn | URL match only | No (privacy) | + +--- + +## 5. 
Data Stewardship + +### 5.1 Data Quality Framework + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ DATA QUALITY LIFECYCLE │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +│ │ Ingest │───▶│ Validate│───▶│ Curate │───▶│ Publish │ │ +│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ +│ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ │ +│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +│ │ Format │ │ Syntax │ │ Entity │ │ Version │ │ +│ │ Check │ │ + Schema│ │ Resolut.│ │ Control │ │ +│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ +│ │ +│ ┌───────────────────────────────────────────────────────────┐ │ +│ │ QUALITY METRICS │ │ +│ │ Completeness | Accuracy | Consistency | Timeliness │ │ +│ └───────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### 5.2 Quality Metrics + +| Metric | Definition | Target | Measurement | +|--------|------------|--------|-------------| +| **Completeness** | Required fields populated | > 95% | Automated check | +| **Accuracy** | Claims match source | > 90% | Sampling audit | +| **Consistency** | No conflicting claims | > 85% | Automated + review | +| **Timeliness** | Data freshness | < 1 year | Re-verification cycle | +| **Provenance** | XPath verifiable | 100% | Automated check | + +### 5.3 Data Retention Policy + +| Data Type | Retention Period | Rationale | +|-----------|-----------------|-----------| +| POIDs | Permanent | Persistent identifiers | +| PRIDs | Permanent | Persistent identifiers | +| Claims | Permanent (versioned) | Audit trail | +| HTML Archives | 10 years | Storage costs | +| API Logs | 2 years | Compliance | +| User Data | Account lifetime + 1 year | GDPR | + +### 5.4 Quality Assurance Process + +```python +from dataclasses import dataclass +from enum import Enum +from typing import Optional +from datetime import datetime + +class 
QualityLevel(Enum): + GOLD = "gold" # Verified by multiple sources + SILVER = "silver" # Single authoritative source + BRONZE = "bronze" # Automated extraction, unverified + FLAGGED = "flagged" # Quality issues detected + +@dataclass +class QualityAssessment: + observation_poid: str + quality_level: QualityLevel + completeness_score: float + accuracy_score: float + xpath_verification: bool + issues: list[str] + assessed_at: datetime + assessed_by: str # Curator ID or "automated" + +async def assess_observation_quality(poid: str, db) -> QualityAssessment: + """ + Assess quality of a Person Observation. + """ + observation = await db.get_observation(poid) + claims = await db.get_claims(poid) + + issues = [] + + # Check completeness + required_fields = ['full_name'] + optional_fields = ['job_title', 'employer', 'email'] + + found_required = sum(1 for c in claims if c.claim_type in required_fields) + found_optional = sum(1 for c in claims if c.claim_type in optional_fields) + + completeness = (found_required / len(required_fields)) * 0.6 + \ + (found_optional / len(optional_fields)) * 0.4 + + if completeness < 0.6: + issues.append("Low completeness: missing required fields") + + # Check XPath verification + xpath_verified = all(c.xpath_match_score >= 0.9 for c in claims) + if not xpath_verified: + issues.append("Some claims have low XPath match scores") + + # Check source quality + source_quality_map = { + 'official_registry': 1.0, + 'institutional_website': 0.9, + 'professional_network': 0.7, + 'social_media': 0.5, + } + source_quality = source_quality_map.get(observation.source_type, 0.5) + + # Calculate accuracy (based on source + XPath) + accuracy = source_quality * (0.7 if xpath_verified else 0.4) + + # Determine quality level + if accuracy >= 0.85 and completeness >= 0.9 and xpath_verified: + quality_level = QualityLevel.GOLD + elif accuracy >= 0.7 and completeness >= 0.7: + quality_level = QualityLevel.SILVER + elif issues: + quality_level = QualityLevel.FLAGGED 
+ else: + quality_level = QualityLevel.BRONZE + + return QualityAssessment( + observation_poid=poid, + quality_level=quality_level, + completeness_score=completeness, + accuracy_score=accuracy, + xpath_verification=xpath_verified, + issues=issues, + assessed_at=datetime.utcnow(), + assessed_by="automated" + ) +``` + +--- + +## 6. Community Governance + +### 6.1 Membership Tiers + +| Tier | Annual Fee | Benefits | +|------|------------|----------| +| **Individual** | Free | Read access, create POIDs (limited) | +| **Contributor** | Free | Create POIDs (unlimited), suggest edits | +| **Curator** | Free | Create PRIDs, resolve conflicts | +| **Institutional Member** | EUR 500-5,000 | API access, priority support, governance vote | +| **Sustaining Member** | EUR 10,000+ | Board nomination, strategic input | + +### 6.2 Curator Certification + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ CURATOR CERTIFICATION PATH │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ Step 1: Application │ +│ ├─ Submit heritage institution affiliation (GHCID) │ +│ ├─ Provide professional references │ +│ └─ Accept code of conduct │ +│ │ +│ Step 2: Training (Online, ~4 hours) │ +│ ├─ Module 1: PPID fundamentals │ +│ ├─ Module 2: Entity resolution principles │ +│ ├─ Module 3: Claims and provenance │ +│ ├─ Module 4: Cultural naming conventions │ +│ └─ Module 5: Ethics and privacy │ +│ │ +│ Step 3: Practical Assessment │ +│ ├─ Create 5 POIDs from assigned sources │ +│ ├─ Create 3 PRIDs with entity resolution │ +│ └─ Resolve 2 simulated conflicts │ +│ │ +│ Step 4: Probation (30 days) │ +│ ├─ First 10 PRIDs reviewed by mentor │ +│ └─ Feedback and correction cycle │ +│ │ +│ Step 5: Full Certification │ +│ ├─ Certificate issued │ +│ ├─ Full PRID creation privileges │ +│ └─ Annual recertification required │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### 6.3 Code of Conduct + +**Core Commitments**: + +1. 
**Accuracy**: Only create claims that are verifiable +2. **Neutrality**: No personal bias in curation decisions +3. **Transparency**: Document all curation decisions +4. **Privacy**: Respect data subject rights +5. **Collaboration**: Work constructively with community +6. **Integrity**: No fabrication or manipulation of data + +**Violations and Consequences**: + +| Severity | Examples | Consequence | +|----------|----------|-------------| +| Minor | Incomplete documentation, slow response | Warning | +| Moderate | Pattern of low-quality contributions | Suspension (30 days) | +| Serious | Fabricated claims, privacy violations | Revocation + ban | +| Critical | Malicious data corruption, harassment | Permanent ban + legal action | + +### 6.4 Decision-Making Process + +```python +from enum import Enum +from dataclasses import dataclass +from datetime import datetime, timedelta + +class DecisionType(Enum): + OPERATIONAL = "operational" # Staff decides + TECHNICAL = "technical" # Technical Committee votes + POLICY = "policy" # Policy Committee + Steering + STRATEGIC = "strategic" # Steering Committee only + +class VotingMethod(Enum): + SIMPLE_MAJORITY = "simple" # > 50% + SUPERMAJORITY = "super" # > 66.7% + CONSENSUS = "consensus" # No objections + LAZY_CONSENSUS = "lazy" # No objections in N days + +@dataclass +class Proposal: + id: str + title: str + description: str + decision_type: DecisionType + voting_method: VotingMethod + proposed_by: str + proposed_at: datetime + discussion_period: timedelta + voting_period: timedelta + status: str # draft, discussion, voting, approved, rejected + +DECISION_MATRIX = { + DecisionType.OPERATIONAL: { + "authority": "Executive Director", + "voting": None, + "timeline": "Immediate" + }, + DecisionType.TECHNICAL: { + "authority": "Technical Committee", + "voting": VotingMethod.SIMPLE_MAJORITY, + "timeline": "2 weeks discussion + 1 week voting" + }, + DecisionType.POLICY: { + "authority": "Policy Committee + Steering", + "voting": 
VotingMethod.SUPERMAJORITY, + "timeline": "4 weeks discussion + 2 weeks voting" + }, + DecisionType.STRATEGIC: { + "authority": "Steering Committee", + "voting": VotingMethod.SUPERMAJORITY, + "timeline": "8 weeks discussion + 2 weeks voting" + }, +} +``` + +--- + +## 7. Funding Models + +### 7.1 Revenue Streams + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ PPID FUNDING MODEL │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ DIVERSIFIED FUNDING (Target: 5-year sustainability) │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ +│ 40% ┌─────────────────────────────────────────────────────┐ │ +│ │ GRANTS & FOUNDATIONS │ │ +│ │ - Andrew W. Mellon Foundation │ │ +│ │ - Horizon Europe (heritage digitization) │ │ +│ │ - IMLS (US heritage institutions) │ │ +│ │ - Dutch Digital Heritage Network │ │ +│ └─────────────────────────────────────────────────────┘ │ +│ │ +│ 35% ┌─────────────────────────────────────────────────────┐ │ +│ │ INSTITUTIONAL MEMBERSHIP │ │ +│ │ - Tiered fees based on size │ │ +│ │ - Consortium discounts │ │ +│ │ - In-kind contributions (staff time, hosting) │ │ +│ └─────────────────────────────────────────────────────┘ │ +│ │ +│ 15% ┌─────────────────────────────────────────────────────┐ │ +│ │ API & SERVICES │ │ +│ │ - Commercial API access (high volume) │ │ +│ │ - Data enrichment services │ │ +│ │ - Custom integration support │ │ +│ └─────────────────────────────────────────────────────┘ │ +│ │ +│ 10% ┌─────────────────────────────────────────────────────┐ │ +│ │ TRAINING & EVENTS │ │ +│ │ - Curator certification fees │ │ +│ │ - Annual conference │ │ +│ │ - Workshops and webinars │ │ +│ └─────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### 7.2 Institutional Membership Fees + +| Institution Size | Staff | Annual Fee 
(EUR) | API Calls/Month | +|------------------|-------|------------------|-----------------| +| Small | < 10 FTE | 500 | 50,000 | +| Medium | 10-50 FTE | 1,500 | 200,000 | +| Large | 50-200 FTE | 3,000 | 500,000 | +| Very Large | > 200 FTE | 5,000 | 1,000,000 | +| Consortium | Multiple | Negotiated | Pooled | + +**Discounts**: +- Multi-year commitment: 10% discount +- Consortium (5+ institutions): 25% discount +- Developing nations: 50-75% discount + +### 7.3 Commercial API Tiers + +| Tier | Monthly Fee (EUR) | API Calls | SLA | Support | +|------|-------------------|-----------|-----|---------| +| Starter | 0 | 1,000 | None | Community | +| Professional | 99 | 50,000 | 99% | Email | +| Business | 499 | 500,000 | 99.5% | Priority | +| Enterprise | Custom | Unlimited | 99.9% | Dedicated | + +### 7.4 Financial Sustainability Model + +```python +from dataclasses import dataclass +from decimal import Decimal + +@dataclass +class AnnualBudget: + # Revenue + grants: Decimal + membership_fees: Decimal + api_revenue: Decimal + training_events: Decimal + + # Expenses + staff_salaries: Decimal # ~60% of budget + infrastructure: Decimal # ~20% of budget + operations: Decimal # ~10% of budget + reserves: Decimal # ~10% of budget + + @property + def total_revenue(self) -> Decimal: + return (self.grants + self.membership_fees + + self.api_revenue + self.training_events) + + @property + def total_expenses(self) -> Decimal: + return (self.staff_salaries + self.infrastructure + + self.operations + self.reserves) + + @property + def is_sustainable(self) -> bool: + return self.total_revenue >= self.total_expenses + +# Year 3 target budget (EUR) +YEAR_3_BUDGET = AnnualBudget( + # Revenue + grants=Decimal("400000"), + membership_fees=Decimal("350000"), + api_revenue=Decimal("150000"), + training_events=Decimal("100000"), + + # Expenses + staff_salaries=Decimal("600000"), # 6 FTE average + infrastructure=Decimal("200000"), + operations=Decimal("100000"), + reserves=Decimal("100000"), +) 
+
+assert YEAR_3_BUDGET.is_sustainable
+```
+
+---
+
+## 8. Deprecation and Tombstoning
+
+### 8.1 Identifier Lifecycle
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│              IDENTIFIER LIFECYCLE STATES                        │
+├─────────────────────────────────────────────────────────────────┤
+│                                                                 │
+│  ┌─────────┐    ┌─────────┐    ┌─────────┐    ┌─────────┐       │
+│  │  DRAFT  │───▶│ ACTIVE  │───▶│DEPRECATED│───▶│TOMBSTONE│      │
+│  └─────────┘    └─────────┘    └─────────┘    └─────────┘       │
+│       │              │              │              │            │
+│       │              │              │              │            │
+│       ▼              ▼              ▼              ▼            │
+│     Not            Fully          Still         Permanent      │
+│     published      operational    resolvable    redirect       │
+│                                  (with warning) to replacement │
+│                                                                 │
+│  ── NEVER DELETED ───────────────────────────────────────────── │
+│  Identifiers are PERMANENT. Tombstones persist forever.         │
+│                                                                 │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+### 8.2 Deprecation Reasons
+
+| Reason | Description | Action |
+|--------|-------------|--------|
+| **Duplicate** | Same person has two PRIDs | Merge → tombstone loser |
+| **Error** | Data fabrication detected | Tombstone with explanation |
+| **GDPR Request** | Data subject requests removal | Tombstone, redact PII |
+| **Merge** | PRIDs combined due to new evidence | Tombstone merged ID |
+| **Split** | PRID contained multiple persons | Create new PRIDs, tombstone original |
+
+### 8.3 Tombstone Format
+
+```turtle
+@prefix ppid: <https://ppid.org/> .
+@prefix ppidv: <https://ppid.org/vocab/> .
+@prefix schema: <https://schema.org/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+# Tombstoned identifier
+ppid:PRID-1234-5678-90ab-cde5
+    a ppidv:TombstonedReconstruction ;
+    ppidv:tombstoneReason "duplicate" ;
+    ppidv:tombstoneDate "2025-06-15"^^xsd:date ;
+    ppidv:tombstoneBy ppid:curator-001 ;
+    ppidv:replacedBy ppid:PRID-9876-5432-10fe-dcb0 ;
+    ppidv:tombstoneNote "Merged with PRID-9876... after evidence showed same person" ;
+    schema:name "[TOMBSTONED]" . 
# Original name redacted + +# HTTP response for tombstoned URI returns 410 Gone with redirect +# Content-Type: application/json +# { +# "status": "tombstoned", +# "reason": "duplicate", +# "replaced_by": "https://ppid.org/PRID-9876-5432-10fe-dcb0", +# "tombstoned_on": "2025-06-15", +# "note": "Merged with PRID-9876... after evidence showed same person" +# } +``` + +### 8.4 GDPR Right to Erasure + +```python +from datetime import datetime +from typing import Optional + +async def handle_gdpr_erasure_request( + prid: str, + requestor_email: str, + identity_verification: dict, + db +) -> dict: + """ + Handle GDPR Article 17 (Right to Erasure) request. + + PPID balances erasure rights with archival/research exemptions. + """ + # 1. Verify identity + if not verify_identity(identity_verification): + return {"status": "rejected", "reason": "Identity verification failed"} + + # 2. Check exemptions (GDPR Art. 17(3)) + exemptions = check_exemptions(prid, db) + + if exemptions.get("archival_public_interest"): + # Heritage/archival exemption may apply + return { + "status": "partial", + "reason": "Archival exemption applies", + "action": "Data minimized but identifier retained for historical record" + } + + # 3. Tombstone the PRID (don't delete) + await db.tombstone_reconstruction( + prid=prid, + reason="gdpr_erasure", + tombstoned_by="gdpr_system", + redact_pii=True # Remove name, claims, but keep tombstone + ) + + # 4. Redact linked observations + observations = await db.get_observations_for_prid(prid) + for poid in observations: + await db.redact_observation( + poid=poid, + redact_fields=["literal_name", "claims"], + retain_provenance=True # Keep source URL for audit + ) + + # 5. 
Log for compliance + await db.log_gdpr_action( + action="erasure", + prid=prid, + requestor=requestor_email, + completed_at=datetime.utcnow() + ) + + return { + "status": "completed", + "prid": prid, + "action": "Tombstoned with PII redacted", + "completion_date": datetime.utcnow().isoformat() + } +``` + +--- + +## 9. International Coordination + +### 9.1 Alignment with Existing Systems + +| System | Coordination Level | Integration | +|--------|-------------------|-------------| +| **ORCID** | Strategic partnership | owl:sameAs linking, API federation | +| **ISNI** | Technical integration | Registration agent status (goal) | +| **VIAF** | Data exchange | Cluster matching, link sharing | +| **Wikidata** | Community alignment | Bidirectional linking | +| **GND** | Regional partnership | German heritage focus | +| **SNAC** | Domain partnership | Archival persons | + +### 9.2 Regional Chapters + +| Region | Focus | Lead Institution (Proposed) | +|--------|-------|----------------------------| +| **Europe** | EU heritage policy, GDPR | Europeana Foundation | +| **North America** | SNAC integration, DPLA | Library of Congress | +| **Asia-Pacific** | CJK naming conventions | National Diet Library (Japan) | +| **Latin America** | Spanish/Portuguese names | Biblioteca Nacional (Brazil) | +| **Africa** | Oral heritage, naming diversity | African Library & Info Assoc. | + +### 9.3 Standards Bodies Engagement + +| Body | Engagement | PPID Contribution | +|------|------------|-------------------| +| **ISO TC 46/SC 9** | Observer → Participant | Identifier standards input | +| **W3C** | Community Group | RDF/linked data best practices | +| **IETF** | Monitor | URI standards compliance | +| **Dublin Core** | Contributor | Metadata alignment | +| **CIDOC-CRM** | Liaison | Cultural heritage modeling | + +--- + +## 10. 
Dispute Resolution + +### 10.1 Types of Disputes + +| Dispute Type | Description | Resolution Path | +|--------------|-------------|-----------------| +| **Merge Conflict** | Curators disagree on whether POIDs refer to same person | Curator panel review | +| **Data Accuracy** | Claim disputed by subject or third party | Evidence review | +| **Attribution** | Who should be credited for curation | Activity log review | +| **Privacy** | Subject objects to data inclusion | GDPR process | +| **Ownership** | Multiple claims to manage a PRID | Institution hierarchy | + +### 10.2 Resolution Process + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ DISPUTE RESOLUTION PROCESS │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ Stage 1: Informal Resolution (0-7 days) │ +│ ├─ Parties communicate directly │ +│ ├─ Use discussion threads on disputed record │ +│ └─ Most disputes resolve here │ +│ │ +│ Stage 2: Mediation (7-21 days) │ +│ ├─ Neutral mediator assigned │ +│ ├─ Evidence review │ +│ └─ Mediated agreement │ +│ │ +│ Stage 3: Panel Review (21-42 days) │ +│ ├─ 3-person panel (1 Technical, 1 Policy, 1 Community) │ +│ ├─ Written submissions │ +│ └─ Binding decision │ +│ │ +│ Stage 4: Appeal (if applicable) │ +│ ├─ Steering Committee review │ +│ ├─ New evidence only │ +│ └─ Final decision │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### 10.3 Dispute Resolution Implementation + +```python +from enum import Enum +from dataclasses import dataclass +from datetime import datetime, timedelta +from typing import Optional + +class DisputeStatus(Enum): + OPEN = "open" + INFORMAL = "informal" + MEDIATION = "mediation" + PANEL = "panel" + APPEAL = "appeal" + RESOLVED = "resolved" + CLOSED = "closed" + +class DisputeType(Enum): + MERGE_CONFLICT = "merge_conflict" + DATA_ACCURACY = "data_accuracy" + ATTRIBUTION = "attribution" + PRIVACY = "privacy" + OWNERSHIP = "ownership" + +@dataclass +class 
Dispute: + id: str + dispute_type: DisputeType + status: DisputeStatus + subject_prid: str + complainant_id: str + respondent_id: Optional[str] + description: str + evidence: list[str] + created_at: datetime + updated_at: datetime + deadline: datetime + resolution: Optional[str] = None + mediator_id: Optional[str] = None + panel_members: Optional[list[str]] = None + +async def escalate_dispute(dispute_id: str, db) -> Dispute: + """ + Escalate dispute to next stage if deadline passed without resolution. + """ + dispute = await db.get_dispute(dispute_id) + + if dispute.status == DisputeStatus.INFORMAL: + # Escalate to mediation + mediator = await assign_mediator(dispute.dispute_type) + dispute.status = DisputeStatus.MEDIATION + dispute.mediator_id = mediator.id + dispute.deadline = datetime.utcnow() + timedelta(days=14) + + elif dispute.status == DisputeStatus.MEDIATION: + # Escalate to panel + panel = await assemble_panel() + dispute.status = DisputeStatus.PANEL + dispute.panel_members = [p.id for p in panel] + dispute.deadline = datetime.utcnow() + timedelta(days=21) + + elif dispute.status == DisputeStatus.PANEL: + # Force panel decision + await notify_panel_deadline(dispute) + + dispute.updated_at = datetime.utcnow() + await db.update_dispute(dispute) + + return dispute +``` + +--- + +## 11. 
Risk Management
+
+### 11.1 Risk Matrix
+
+| Risk | Likelihood | Impact | Mitigation |
+|------|------------|--------|------------|
+| **Funding shortfall** | Medium | High | Diversified funding, reserves |
+| **Key person dependency** | Medium | Medium | Documentation, succession planning |
+| **Data breach** | Low | High | Security audits, encryption |
+| **Scope creep** | Medium | Medium | Clear charter, governance |
+| **Low adoption** | Medium | High | Partnerships, value demonstration |
+| **Technical debt** | Medium | Medium | Regular refactoring, documentation |
+| **Competitor emergence** | Low | Medium | Differentiation, partnerships |
+
+### 11.2 Contingency Plans
+
+**Funding Failure Contingency**:
+```
+IF annual_funding < 60% of target:
+    1. Activate 6-month reserve fund
+    2. Reduce staff to essential operations (3 FTE)
+    3. Suspend new feature development
+    4. Seek emergency grants
+    5. Negotiate hosting cost reduction
+    6. IF funding is not restored within 12 months: Initiate graceful shutdown procedure
+```
+
+**Graceful Shutdown Procedure** (last resort):
+1. Announce 12-month wind-down timeline
+2. Export all data to open formats (RDF dumps)
+3. Transfer infrastructure to accepting institution
+4. Archive code on GitHub
+5. Ensure identifier resolution continues (static hosting)
+6. Document lessons learned
+
+---
+
+## 12. 
Implementation Roadmap + +### 12.1 Phase 1: Foundation (Year 1) + +| Quarter | Milestone | +|---------|-----------| +| Q1 | Legal entity established (non-profit foundation) | +| Q1 | Steering Committee formed (interim) | +| Q2 | Seed funding secured (EUR 500K+) | +| Q2 | Executive Director hired | +| Q3 | Technical infrastructure launched (beta) | +| Q3 | First 10 institutional members | +| Q4 | Curator certification program launched | +| Q4 | 10,000 POIDs / 1,000 PRIDs | + +### 12.2 Phase 2: Growth (Year 2) + +| Quarter | Milestone | +|---------|-----------| +| Q1 | ORCID partnership formalized | +| Q1 | 50 institutional members | +| Q2 | API v1.0 stable release | +| Q2 | First regional chapter (Europe) | +| Q3 | 100,000 POIDs / 10,000 PRIDs | +| Q3 | ISNI registration agent application | +| Q4 | Annual conference (first) | +| Q4 | Financial sustainability achieved (70%) | + +### 12.3 Phase 3: Maturity (Year 3+) + +| Quarter | Milestone | +|---------|-----------| +| Q1 | 200+ institutional members | +| Q2 | ISO standardization process initiated | +| Q3 | 1M POIDs / 100K PRIDs | +| Q4 | Full financial sustainability (100%) | +| Ongoing | International expansion | +| Ongoing | Feature enhancements based on community input | + +--- + +## 13. 
Success Metrics

### 13.1 Adoption Metrics

| Metric | Year 1 Target | Year 3 Target |
|--------|---------------|---------------|
| POIDs created | 10,000 | 1,000,000 |
| PRIDs created | 1,000 | 100,000 |
| Institutional members | 10 | 200 |
| Certified curators | 50 | 500 |
| Countries represented | 10 | 50 |
| GHCID institutions linked | 100 | 5,000 |

### 13.2 Quality Metrics

| Metric | Target |
|--------|--------|
| Claim accuracy (audited) | > 90% |
| XPath verification rate | 100% |
| Duplicate detection rate | > 95% |
| Dispute resolution time | < 30 days |
| API uptime | > 99.9% |

### 13.3 Sustainability Metrics

| Metric | Year 1 | Year 3 |
|--------|--------|--------|
| Revenue diversity (sources) | 2 | 4 |
| Membership revenue % | 20% | 35% |
| Operating reserve (months) | 3 | 12 |
| Grant dependency | 80% | 40% |

---

## 14. References

### Governance Models
- ORCID Governance: https://orcid.org/about/governance
- ISNI International Agency: https://isni.org/page/governance/
- W3C Process Document: https://www.w3.org/Consortium/Process/

### Legal and Compliance
- GDPR: https://gdpr.eu/
- Dutch Erfgoedwet (Heritage Act): https://wetten.overheid.nl/
- Non-profit foundation (stichting): https://www.kvk.nl/

### Sustainability
- NDSA Levels of Preservation: https://ndsa.org/publications/levels-of-digital-preservation/
- COAR Sustainability Principles: https://www.coar-repositories.org/

### Related PPID Documents
- [Executive Summary](./01_executive_summary.md)
- [Implementation Guidelines](./08_implementation_guidelines.md)