810 lines
31 KiB
Python
810 lines
31 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Oxigraph Person Sync Module - Sync person JSON files to Oxigraph triplestore.
|
|
|
|
This module syncs all person data (entity profiles and staff lists) to Oxigraph
|
|
as RDF triples for SPARQL querying and graph-based retrieval.
|
|
|
|
Usage:
|
|
python -m scripts.sync.oxigraph_person_sync [--dry-run] [--limit N] [--endpoint URL]
|
|
|
|
Data Sources:
|
|
- data/custodian/person/entity/*.json - Individual person profile files
|
|
- data/custodian/person/affiliated/parsed/*_staff_*.json - Staff list files
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
import httpx
|
|
import yaml
|
|
from rdflib import Graph, Namespace, Literal, URIRef, BNode
|
|
from rdflib.namespace import RDF, RDFS, XSD, SKOS, DCTERMS, FOAF, OWL
|
|
|
|
# Add project root to path so `scripts.*` imports resolve when this module
# is executed directly (python -m scripts.sync.oxigraph_person_sync).
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from scripts.sync import BaseSyncer, SyncResult, SyncStatus, DEFAULT_CUSTODIAN_DIR

# Configuration
# Oxigraph SPARQL endpoint; override with the OXIGRAPH_URL environment variable.
OXIGRAPH_URL = os.getenv("OXIGRAPH_URL", "http://localhost:7878")
# Number of JSON files converted per upload batch (see OxigraphPersonSyncer.sync).
BATCH_SIZE = 500

# Data directories
# Individual person profile files (one JSON per person).
PERSON_ENTITY_DIR = PROJECT_ROOT / "data" / "custodian" / "person" / "entity"
# Parsed staff-list files (one JSON per custodian, many persons each).
PERSON_AFFILIATED_DIR = PROJECT_ROOT / "data" / "custodian" / "person" / "affiliated" / "parsed"

# Namespaces (reuse from oxigraph_sync.py + person-specific)
HC = Namespace("https://w3id.org/heritage/custodian/")   # project custodian vocabulary
HP = Namespace("https://w3id.org/heritage/person/")      # person URIs minted by this module
GHCID = Namespace("https://w3id.org/heritage/ghcid/")    # custodian identifier URIs
NDE = Namespace("https://nde.nl/ontology/hc/class/")
ORG = Namespace("http://www.w3.org/ns/org#")
RICO = Namespace("https://www.ica.org/standards/RiC/ontology#")
SCHEMA = Namespace("http://schema.org/")
CIDOC = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
PROV = Namespace("http://www.w3.org/ns/prov#")
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")
GEO = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
PICO = Namespace("https://personsincontext.org/model#")
PNV = Namespace("https://w3id.org/pnv#")
|
|
|
def sanitize_for_uri(value: str) -> str:
    """Return *value* lower-cased and safe for use as a URI local name.

    Characters that are unsafe in a URI path segment (quotes, brackets,
    punctuation, whitespace) are collapsed into single underscores.  An
    empty input, or one that strips down to nothing, yields "unknown".
    """
    if not value:
        return "unknown"
    cleaned = re.sub(r'[`\'\"<>{}|\\^~\[\]@#$%&*()+=,;!? ]', '_', value)
    cleaned = re.sub(r'_+', '_', cleaned).strip('_')
    return cleaned.lower() if cleaned else "unknown"
|
|
|
|
|
|
def extract_country_code(location: str) -> Optional[str]:
    """Extract a two-letter country code from a location string.

    Handles strings like 'Amsterdam, North Holland, Netherlands (NL)':
    an explicit '(XX)' code wins; otherwise known country names are
    matched against the text.

    Fix: country names are now matched on word boundaries instead of raw
    substring containment.  The old code returned 'GB' for 'Kyiv, Ukraine'
    (because 'uk' is a substring of 'ukraine') and 'US' for 'Jerusalem'
    (contains 'usa').

    Args:
        location: free-form location string; may be empty or None-ish.

    Returns:
        ISO 3166-1 alpha-2 code, or None when nothing can be extracted.
    """
    if not location:
        return None

    # Try to find an explicit country code in parentheses first.
    match = re.search(r'\(([A-Z]{2})\)', location)
    if match:
        return match.group(1)

    # Common country name -> code mappings (checked in insertion order).
    country_map = {
        'netherlands': 'NL', 'nederland': 'NL',
        'united states': 'US', 'usa': 'US',
        'united kingdom': 'GB', 'uk': 'GB', 'england': 'GB',
        'germany': 'DE', 'deutschland': 'DE',
        'france': 'FR',
        'belgium': 'BE', 'belgië': 'BE',
        'spain': 'ES', 'españa': 'ES',
        'italy': 'IT', 'italia': 'IT',
        'australia': 'AU',
        'canada': 'CA',
        'japan': 'JP',
        'china': 'CN',
    }

    lower_loc = location.lower()
    for country, code in country_map.items():
        # \b anchors prevent false hits such as 'uk' inside 'ukraine'.
        if re.search(r'\b' + re.escape(country) + r'\b', lower_loc):
            return code

    return None
|
|
|
|
|
|
class PersonRDFConverter:
    """Convert person JSON data to RDF triples.

    Accumulates triples into an in-memory rdflib ``Graph``.  Callers are
    expected to call ``reset()`` between upload batches.  Two input shapes
    are handled: individual entity profile files
    (``add_person_from_entity``) and custodian staff-list files
    (``add_persons_from_staff_list``).
    """

    def __init__(self, custodian_lookup: dict[str, str] | None = None):
        """Create an empty graph with all namespaces bound.

        Args:
            custodian_lookup: mapping of lower-cased custodian names/slugs
                to GHCID filename stems (see ``build_custodian_lookup``).
                When omitted, custodian URIs are never resolved and only
                the literal custodian name gets recorded.
        """
        self.graph = Graph()
        self._bind_namespaces()
        self.custodian_lookup = custodian_lookup or {}
        # Running count of persons added since the last reset().
        self._persons_added = 0

    def _bind_namespaces(self) -> None:
        """Bind all ontology namespaces to the graph (prefix -> namespace)."""
        self.graph.bind("hc", HC)
        self.graph.bind("hp", HP)
        self.graph.bind("ghcid", GHCID)
        self.graph.bind("nde", NDE)
        self.graph.bind("org", ORG)
        self.graph.bind("rico", RICO)
        self.graph.bind("schema", SCHEMA)
        self.graph.bind("cidoc", CIDOC)
        self.graph.bind("prov", PROV)
        self.graph.bind("wd", WD)
        self.graph.bind("wdt", WDT)
        self.graph.bind("geo", GEO)
        self.graph.bind("dcterms", DCTERMS)
        self.graph.bind("foaf", FOAF)
        self.graph.bind("skos", SKOS)
        self.graph.bind("owl", OWL)
        self.graph.bind("pico", PICO)
        self.graph.bind("pnv", PNV)

    def reset(self) -> None:
        """Discard all accumulated triples and start a fresh batch."""
        self.graph = Graph()
        self._bind_namespaces()
        self._persons_added = 0

    def _get_person_uri(self, data: dict, filepath: Path) -> URIRef:
        """Generate a URI for a person based on LinkedIn slug or staff_id.

        Preference order: LinkedIn profile slug from the URL, then
        ``staff_id`` from the extraction metadata, then the JSON filename
        stem as a last resort.  Always returns a URI in the HP namespace.
        """
        extraction_meta = data.get('extraction_metadata', {})
        profile_data = data.get('profile_data', {})

        # Try LinkedIn URL to get slug
        linkedin_url = (
            extraction_meta.get('linkedin_url') or
            profile_data.get('linkedin_url') or
            ''
        )

        if linkedin_url:
            # Extract slug from URL like https://www.linkedin.com/in/alex-brandsen
            match = re.search(r'linkedin\.com/in/([^/?]+)', linkedin_url)
            if match:
                slug = sanitize_for_uri(match.group(1))
                return HP[slug]

        # Fallback to staff_id
        staff_id = extraction_meta.get('staff_id', '')
        if staff_id:
            return HP[sanitize_for_uri(staff_id)]

        # Last resort: use filename
        return HP[sanitize_for_uri(filepath.stem)]

    def _resolve_custodian_uri(self, custodian_name: str) -> URIRef | None:
        """Resolve custodian name to GHCID URI.

        Tries exact lower-cased match, then the space->hyphen slug form,
        then bidirectional substring containment.

        NOTE(review): the partial-containment fallback can mis-resolve
        similarly named custodians (e.g. a name that is a substring of an
        unrelated key) — confirm this is acceptable for the data set.
        """
        if not custodian_name:
            return None

        # Try exact match first
        name_lower = custodian_name.lower()
        if name_lower in self.custodian_lookup:
            ghcid = self.custodian_lookup[name_lower]
            return GHCID[sanitize_for_uri(ghcid)]

        # Try slug form
        slug = name_lower.replace(' ', '-')
        if slug in self.custodian_lookup:
            ghcid = self.custodian_lookup[slug]
            return GHCID[sanitize_for_uri(ghcid)]

        # Try partial match
        for key, ghcid in self.custodian_lookup.items():
            if name_lower in key or key in name_lower:
                return GHCID[sanitize_for_uri(ghcid)]

        return None

    def add_person_from_entity(self, data: dict, filepath: Path) -> URIRef | None:
        """Convert person entity JSON to RDF triples.

        Args:
            data: parsed entity JSON (expects keys 'extraction_metadata',
                'source_staff_info', 'profile_data', 'heritage_relevance';
                all optional, defaulting to empty dicts).
            filepath: source file, used only as a URI fallback.

        Returns:
            The person's URI, or None when the record has no name (in
            which case nothing is added to the graph).
        """
        extraction_meta = data.get('extraction_metadata', {})
        source_staff = data.get('source_staff_info', {})
        profile_data = data.get('profile_data', {})
        heritage_rel = data.get('heritage_relevance', {})

        # Get name (required)
        name = profile_data.get('name') or source_staff.get('name')
        if not name:
            return None

        # Create person URI
        person_uri = self._get_person_uri(data, filepath)

        # Type declarations (multiple ontologies for cross-vocabulary querying)
        self.graph.add((person_uri, RDF.type, SCHEMA.Person))
        self.graph.add((person_uri, RDF.type, FOAF.Person))
        self.graph.add((person_uri, RDF.type, PICO.PersonObservation))
        self.graph.add((person_uri, RDF.type, CIDOC.E21_Person))

        # Name (mirrored across vocabularies)
        self.graph.add((person_uri, SCHEMA.name, Literal(name)))
        self.graph.add((person_uri, FOAF.name, Literal(name)))
        self.graph.add((person_uri, RDFS.label, Literal(name)))
        self.graph.add((person_uri, SKOS.prefLabel, Literal(name)))

        # LinkedIn URL as sameAs
        linkedin_url = (
            extraction_meta.get('linkedin_url') or
            profile_data.get('linkedin_url')
        )
        if linkedin_url:
            try:
                self.graph.add((person_uri, SCHEMA.sameAs, URIRef(linkedin_url)))
                self.graph.add((person_uri, OWL.sameAs, URIRef(linkedin_url)))
            except Exception:
                # Malformed URL: skip the sameAs links rather than abort.
                pass

        # Headline / job title
        headline = profile_data.get('headline') or source_staff.get('headline')
        if headline:
            self.graph.add((person_uri, SCHEMA.jobTitle, Literal(headline)))
            self.graph.add((person_uri, SCHEMA.description, Literal(headline)))

        # Location
        location = profile_data.get('location', '')
        if location:
            self.graph.add((person_uri, SCHEMA.workLocation, Literal(location)))

            country_code = extract_country_code(location)
            if country_code:
                self.graph.add((person_uri, HC.countryCode, Literal(country_code)))

        # About / description
        about = profile_data.get('about')
        if about:
            self.graph.add((person_uri, SCHEMA.disambiguatingDescription, Literal(about)))

        # Profile image
        profile_image = profile_data.get('profile_image_url')
        if profile_image:
            try:
                self.graph.add((person_uri, SCHEMA.image, URIRef(profile_image)))
                self.graph.add((person_uri, FOAF.img, URIRef(profile_image)))
            except Exception:
                # Malformed image URL: skip rather than abort.
                pass

        # Heritage relevance — absent flag is treated as relevant (default True).
        is_heritage_relevant = heritage_rel.get('is_heritage_relevant', True)
        self.graph.add((person_uri, HC.heritageRelevant,
                        Literal(is_heritage_relevant, datatype=XSD.boolean)))

        # Prefer explicit heritage_types list; fall back to the single
        # heritage_type carried on the source staff record.
        heritage_types = heritage_rel.get('heritage_types', [])
        if not heritage_types and source_staff.get('heritage_type'):
            heritage_types = [source_staff.get('heritage_type')]

        for ht in heritage_types:
            if ht:
                self.graph.add((person_uri, HC.heritageType, Literal(ht)))

        # Link to custodian (employer)
        custodian_name = source_staff.get('custodian')
        if custodian_name:
            custodian_uri = self._resolve_custodian_uri(custodian_name)
            if custodian_uri:
                self.graph.add((person_uri, SCHEMA.worksFor, custodian_uri))
                self.graph.add((person_uri, ORG.memberOf, custodian_uri))

            # Also store as literal for cases where lookup fails
            self.graph.add((person_uri, HC.custodianName, Literal(custodian_name)))

        # Skills
        for skill in profile_data.get('skills', []):
            if skill:
                self.graph.add((person_uri, SCHEMA.knowsAbout, Literal(skill)))

        # Languages — entries may be dicts ({'language': ...}) or bare strings.
        for lang in profile_data.get('languages', []):
            if isinstance(lang, dict):
                lang_name = lang.get('language', '')
            else:
                lang_name = str(lang)
            if lang_name:
                self.graph.add((person_uri, SCHEMA.knowsLanguage, Literal(lang_name)))

        # Experience (as structured data, one blank node per entry)
        experience = profile_data.get('experience', [])
        if experience:
            for i, exp in enumerate(experience):
                if not isinstance(exp, dict):
                    continue

                exp_node = BNode()
                self.graph.add((exp_node, RDF.type, SCHEMA.WorkExperience))

                if exp.get('title'):
                    self.graph.add((exp_node, SCHEMA.roleName, Literal(exp['title'])))
                if exp.get('company'):
                    self.graph.add((exp_node, SCHEMA.name, Literal(exp['company'])))
                if exp.get('date_range'):
                    self.graph.add((exp_node, SCHEMA.temporal, Literal(exp['date_range'])))
                if exp.get('location'):
                    self.graph.add((exp_node, SCHEMA.workLocation, Literal(exp['location'])))
                if exp.get('description'):
                    self.graph.add((exp_node, SCHEMA.description, Literal(exp['description'])))

                self.graph.add((person_uri, SCHEMA.hasOccupation, exp_node))

        # Education (one blank node per entry)
        education = profile_data.get('education', [])
        if education:
            for edu in education:
                if not isinstance(edu, dict):
                    continue

                edu_node = BNode()
                self.graph.add((edu_node, RDF.type, SCHEMA.EducationalOccupationalCredential))

                if edu.get('degree'):
                    self.graph.add((edu_node, SCHEMA.name, Literal(edu['degree'])))
                if edu.get('institution'):
                    # NOTE(review): the institution name is stored under
                    # schema:educationalCredentialAwarded, which schema.org
                    # defines for the credential itself — confirm this
                    # property choice is intentional.
                    self.graph.add((edu_node, SCHEMA.educationalCredentialAwarded, Literal(edu['institution'])))
                if edu.get('date_range'):
                    self.graph.add((edu_node, SCHEMA.temporal, Literal(edu['date_range'])))

                self.graph.add((person_uri, SCHEMA.alumniOf, edu_node))

        # Provenance
        extraction_date = extraction_meta.get('extraction_date')
        if extraction_date:
            # NOTE(review): the raw string is typed as xsd:dateTime without
            # validation — malformed dates become ill-typed literals.
            self.graph.add((person_uri, PROV.generatedAtTime,
                            Literal(extraction_date, datatype=XSD.dateTime)))

        extraction_method = extraction_meta.get('extraction_method')
        if extraction_method:
            self.graph.add((person_uri, PROV.wasGeneratedBy, Literal(extraction_method)))

        staff_id = extraction_meta.get('staff_id')
        if staff_id:
            self.graph.add((person_uri, DCTERMS.identifier, Literal(staff_id)))
            self.graph.add((person_uri, HC.staffId, Literal(staff_id)))

        self._persons_added += 1
        return person_uri

    def add_persons_from_staff_list(self, data: dict, filepath: Path) -> list[URIRef]:
        """Convert staff list JSON to RDF triples for each person.

        Args:
            data: parsed staff-list JSON (expects 'custodian_metadata' and
                a 'staff' list of person dicts).
            filepath: source file (currently unused beyond the signature).

        Returns:
            URIs of all persons added; entries without a name are skipped.
        """
        uris = []

        custodian_meta = data.get('custodian_metadata', {})
        custodian_name = custodian_meta.get('custodian_name', '')
        custodian_slug = custodian_meta.get('custodian_slug', '')

        # Resolve custodian URI once for all staff
        custodian_uri = self._resolve_custodian_uri(custodian_name) or self._resolve_custodian_uri(custodian_slug)

        staff = data.get('staff', [])
        for person in staff:
            if not isinstance(person, dict):
                continue

            name = person.get('name')
            if not name:
                continue

            # Create person URI from linkedin_slug or staff_id
            linkedin_slug = person.get('linkedin_slug')
            staff_id = person.get('staff_id')

            if linkedin_slug:
                person_uri = HP[sanitize_for_uri(linkedin_slug)]
            elif staff_id:
                person_uri = HP[sanitize_for_uri(staff_id)]
            else:
                # Generate from name + custodian
                slug = sanitize_for_uri(f"{custodian_slug}_{name}")
                person_uri = HP[slug]

            # Type declarations (no CIDOC typing here, unlike entity records)
            self.graph.add((person_uri, RDF.type, SCHEMA.Person))
            self.graph.add((person_uri, RDF.type, FOAF.Person))
            self.graph.add((person_uri, RDF.type, PICO.PersonObservation))

            # Name
            self.graph.add((person_uri, SCHEMA.name, Literal(name)))
            self.graph.add((person_uri, FOAF.name, Literal(name)))
            self.graph.add((person_uri, RDFS.label, Literal(name)))

            # LinkedIn URL
            linkedin_url = person.get('linkedin_profile_url')
            if linkedin_url:
                try:
                    self.graph.add((person_uri, SCHEMA.sameAs, URIRef(linkedin_url)))
                except Exception:
                    # Malformed URL: skip rather than abort.
                    pass

            # Headline
            headline = person.get('headline')
            if headline:
                self.graph.add((person_uri, SCHEMA.jobTitle, Literal(headline)))

            # Heritage relevance — absent flag defaults to True.
            heritage_relevant = person.get('heritage_relevant', True)
            self.graph.add((person_uri, HC.heritageRelevant,
                            Literal(heritage_relevant, datatype=XSD.boolean)))

            heritage_type = person.get('heritage_type')
            if heritage_type:
                self.graph.add((person_uri, HC.heritageType, Literal(heritage_type)))

            # Link to custodian
            if custodian_uri:
                self.graph.add((person_uri, SCHEMA.worksFor, custodian_uri))
                self.graph.add((person_uri, ORG.memberOf, custodian_uri))

            if custodian_name:
                self.graph.add((person_uri, HC.custodianName, Literal(custodian_name)))

            # Staff ID
            if staff_id:
                self.graph.add((person_uri, HC.staffId, Literal(staff_id)))
                self.graph.add((person_uri, DCTERMS.identifier, Literal(staff_id)))

            uris.append(person_uri)
            self._persons_added += 1

        return uris

    def serialize(self, format: str = "turtle") -> bytes:
        """Serialize graph to bytes.

        rdflib's Graph.serialize returns a str when no destination is
        given, hence the explicit UTF-8 encode.
        """
        return self.graph.serialize(format=format).encode("utf-8")

    def triple_count(self) -> int:
        """Get number of triples in graph."""
        return len(self.graph)

    def persons_added(self) -> int:
        """Get number of persons added since the last reset()."""
        return self._persons_added
|
|
|
|
|
|
# On-disk cache for the custodian lookup table and its freshness window.
CACHE_FILE = PROJECT_ROOT / "data" / ".custodian_lookup_cache.json"
CACHE_MAX_AGE_HOURS = 24


def build_custodian_lookup(custodian_dir: Path = DEFAULT_CUSTODIAN_DIR, use_cache: bool = True) -> dict[str, str]:
    """
    Build custodian lookup table using filename-based extraction.

    This is MUCH faster than YAML parsing (27k files in <1s vs 2+ minutes).
    Uses GHCID from filename directly - custodian YAMLs are named by GHCID.
    """
    # Serve from the on-disk cache when it is still fresh enough.
    if use_cache and CACHE_FILE.exists():
        try:
            age_hours = (datetime.now().timestamp() - CACHE_FILE.stat().st_mtime) / 3600
            if age_hours < CACHE_MAX_AGE_HOURS:
                with open(CACHE_FILE, 'r') as fh:
                    return json.load(fh).get('lookup', {})
        except Exception:
            # Unreadable/corrupt cache: fall through and rebuild.
            pass

    mapping: dict[str, str] = {}
    if not custodian_dir.exists():
        return mapping

    # Fast extraction: GHCID from filename.
    # Files are named like: NL-NH-AMS-M-RM.yaml or NL-NH-AMS-M-RM-rijksmuseum.yaml
    yaml_files = list(custodian_dir.glob("*.yaml"))

    for yaml_path in yaml_files:
        stem = yaml_path.stem  # e.g. "NL-NH-AMS-M-RM" or "NL-NH-AMS-M-RM-rijksmuseum"
        pieces = stem.split('-')
        # Skip anything that doesn't look like a standard CC-RR-CCC-T-ABB GHCID.
        if len(pieces) < 5:
            continue

        # Index by full stem and by the bare 5-part GHCID.
        mapping[stem.lower()] = stem
        mapping['-'.join(pieces[:5]).lower()] = stem

        # A trailing name suffix gets indexed too, in both its raw form
        # and with underscores turned into spaces.
        if len(pieces) > 5:
            suffix = '-'.join(pieces[5:])
            mapping[suffix.lower()] = stem
            mapping[suffix.replace('_', ' ').lower()] = stem

    # Persist the freshly built table for subsequent runs (best-effort).
    if use_cache:
        try:
            CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
            with open(CACHE_FILE, 'w') as fh:
                json.dump({
                    'lookup': mapping,
                    'generated_at': datetime.now(timezone.utc).isoformat(),
                    'file_count': len(yaml_files)
                }, fh)
        except Exception:
            pass

    return mapping
|
|
|
|
|
|
class OxigraphPersonSyncer(BaseSyncer):
    """Sync person JSON files to Oxigraph triplestore.

    Lists entity and staff-list JSON files, converts each batch to RDF via
    PersonRDFConverter, and POSTs the Turtle serialization to the Oxigraph
    HTTP store endpoint.
    """

    # Identifier used in SyncResult records and progress reporting.
    database_name = "oxigraph_persons"

    def __init__(
        self,
        endpoint: str = OXIGRAPH_URL,
        batch_size: int = BATCH_SIZE,
        entity_dir: Path = PERSON_ENTITY_DIR,
        affiliated_dir: Path = PERSON_AFFILIATED_DIR,
        **kwargs
    ):
        """Set up the HTTP client, custodian lookup, and RDF converter.

        Args:
            endpoint: Oxigraph base URL (trailing slash stripped).
            batch_size: number of JSON files converted per upload.
            entity_dir: directory of per-person entity JSON files.
            affiliated_dir: directory of parsed staff-list JSON files.
            **kwargs: forwarded to BaseSyncer.
        """
        super().__init__(**kwargs)
        self.endpoint = endpoint.rstrip("/")
        self.batch_size = batch_size
        self.entity_dir = entity_dir
        self.affiliated_dir = affiliated_dir
        # Generous timeout: a batch of ~500 files can serialize to a large body.
        self.client = httpx.Client(timeout=120.0)

        # Build custodian lookup (always uses module defaults / cache).
        self.logger.info("Building custodian lookup table...")
        self.custodian_lookup = build_custodian_lookup()
        self.logger.info(f"Loaded {len(self.custodian_lookup)} custodian name mappings")

        self.converter = PersonRDFConverter(custodian_lookup=self.custodian_lookup)

    def check_connection(self) -> bool:
        """Check if Oxigraph is available by issuing a trivial ASK query."""
        try:
            resp = self.client.post(
                f"{self.endpoint}/query",
                headers={"Content-Type": "application/sparql-query"},
                content="ASK { ?s ?p ?o }"
            )
            return resp.status_code in (200, 204)
        except Exception as e:
            self.logger.error(f"Oxigraph connection failed: {e}")
            return False

    def get_status(self) -> dict:
        """Get Oxigraph status for person data.

        Returns a dict with 'status' plus either 'person_count' (count of
        distinct schema:Person subjects in the HP namespace) or 'error'.
        """
        try:
            # Count person triples specifically
            resp = self.client.post(
                f"{self.endpoint}/query",
                headers={
                    "Content-Type": "application/sparql-query",
                    "Accept": "application/sparql-results+json"
                },
                content="""
                PREFIX schema: <http://schema.org/>
                PREFIX hp: <https://w3id.org/heritage/person/>
                SELECT (COUNT(DISTINCT ?person) AS ?count)
                WHERE {
                    ?person a schema:Person .
                    FILTER(STRSTARTS(STR(?person), STR(hp:)))
                }
                """
            )
            if resp.status_code == 200:
                data = resp.json()
                count = int(data["results"]["bindings"][0]["count"]["value"])
                return {"status": "healthy", "person_count": count}
        except Exception as e:
            return {"status": "unavailable", "error": str(e)}
        # Non-200 response without an exception falls through to here.
        return {"status": "unknown"}

    def _upload_turtle(self, ttl_data: bytes) -> bool:
        """Upload Turtle data to Oxigraph's default graph via the store API."""
        try:
            resp = self.client.post(
                f"{self.endpoint}/store?default",
                headers={"Content-Type": "text/turtle"},
                content=ttl_data
            )
            return resp.status_code in (200, 201, 204)
        except Exception as e:
            self.logger.error(f"Failed to upload data: {e}")
            return False

    def _list_entity_files(self) -> list[Path]:
        """List all person entity JSON files (sorted; [] if dir missing)."""
        if not self.entity_dir.exists():
            self.logger.warning(f"Entity directory not found: {self.entity_dir}")
            return []
        return sorted(self.entity_dir.glob("*.json"))

    def _list_staff_files(self) -> list[Path]:
        """List all staff list JSON files (sorted; [] if dir missing)."""
        if not self.affiliated_dir.exists():
            self.logger.warning(f"Affiliated directory not found: {self.affiliated_dir}")
            return []
        return sorted(self.affiliated_dir.glob("*_staff_*.json"))

    def sync(
        self,
        limit: Optional[int] = None,
        dry_run: bool = False,
        clear: bool = False,
        entity_only: bool = False,
        staff_only: bool = False,
    ) -> SyncResult:
        """Sync all person JSON files to Oxigraph.

        Args:
            limit: cap on the combined number of files processed.
            dry_run: parse and convert but skip the connection check and upload.
            clear: accepted but not implemented — only logs a hint.
            entity_only: process only entity files.
            staff_only: process only staff-list files.

        Returns:
            SyncResult with per-file counts and detail totals.
        """
        result = SyncResult(
            database="oxigraph_persons",
            status=SyncStatus.IN_PROGRESS,
            start_time=datetime.now(timezone.utc),
        )

        # Check connection
        if not dry_run and not self.check_connection():
            result.status = SyncStatus.FAILED
            result.error_message = f"Cannot connect to Oxigraph at {self.endpoint}"
            result.end_time = datetime.now(timezone.utc)
            return result

        # Gather files to process, tagged with their type for dispatch below.
        files_to_process = []

        if not staff_only:
            entity_files = self._list_entity_files()
            files_to_process.extend([('entity', f) for f in entity_files])
            self.logger.info(f"Found {len(entity_files)} entity files")

        if not entity_only:
            staff_files = self._list_staff_files()
            files_to_process.extend([('staff', f) for f in staff_files])
            self.logger.info(f"Found {len(staff_files)} staff files")

        if limit:
            files_to_process = files_to_process[:limit]

        self.progress.total_files = len(files_to_process)
        self.progress.current_database = "oxigraph_persons"

        self.logger.info(f"Processing {len(files_to_process)} files...")

        # Clear existing person data (optional)
        # NOTE(review): --clear is accepted but not implemented; it only
        # logs a hint. Confirm whether a SPARQL DELETE should run here.
        if not dry_run and clear:
            self.logger.info("Note: Use SPARQL DELETE to clear person data if needed")

        # Process in batches
        total_triples = 0
        total_persons = 0
        processed = 0   # files successfully converted
        failed = 0      # files that failed to parse/convert/upload

        for i in range(0, len(files_to_process), self.batch_size):
            batch = files_to_process[i:i + self.batch_size]
            batch_num = i // self.batch_size + 1

            # Fresh graph per batch; uploaded (or discarded) below.
            self.converter.reset()

            for file_type, filepath in batch:
                # NOTE(review): `processed` only counts successes, so this
                # progress figure undercounts attempts once failures occur.
                self.progress.processed_files = processed + 1
                self.progress.current_file = filepath.name
                self._report_progress()

                try:
                    with open(filepath, 'r', encoding='utf-8') as f:
                        data = json.load(f)

                    # Empty/null JSON document counts as a failure.
                    if not data:
                        failed += 1
                        continue

                    if file_type == 'entity':
                        uri = self.converter.add_person_from_entity(data, filepath)
                        if uri:
                            processed += 1
                        else:
                            failed += 1
                    else:  # staff
                        uris = self.converter.add_persons_from_staff_list(data, filepath)
                        if uris:
                            processed += 1
                        else:
                            failed += 1

                except json.JSONDecodeError as e:
                    self.logger.warning(f"JSON decode error in {filepath.name}: {e}")
                    failed += 1
                    self.progress.errors.append(f"{filepath.name}: JSON decode error")
                except Exception as e:
                    self.logger.warning(f"Error processing {filepath.name}: {e}")
                    failed += 1
                    self.progress.errors.append(f"{filepath.name}: {str(e)}")

            batch_triples = self.converter.triple_count()
            batch_persons = self.converter.persons_added()
            total_triples += batch_triples
            total_persons += batch_persons

            if not dry_run:
                try:
                    ttl_data = self.converter.serialize("turtle")
                    if self._upload_turtle(ttl_data):
                        self.logger.info(
                            f"Batch {batch_num}: loaded {len(batch)} files, "
                            f"{batch_persons} persons, {batch_triples:,} triples"
                        )
                    else:
                        self.logger.error(f"Batch {batch_num}: failed to upload")
                        # NOTE(review): files in a failed batch were already
                        # tallied in `processed`, so failed + succeeded can
                        # exceed the total file count — confirm intended.
                        failed += len(batch)
                except Exception as e:
                    self.logger.error(f"Batch {batch_num}: serialization failed: {e}")
                    failed += len(batch)
            else:
                self.logger.info(
                    f"Batch {batch_num}: parsed {len(batch)} files, "
                    f"{batch_persons} persons, {batch_triples:,} triples (dry-run)"
                )

        result.records_processed = len(files_to_process)
        result.records_succeeded = processed
        result.records_failed = failed
        result.details["total_triples"] = total_triples
        result.details["total_persons"] = total_persons
        result.details["batch_size"] = self.batch_size
        result.details["dry_run"] = dry_run
        result.details["custodian_mappings"] = len(self.custodian_lookup)

        if failed == 0:
            result.status = SyncStatus.SUCCESS
        elif processed > 0:
            result.status = SyncStatus.PARTIAL
        else:
            result.status = SyncStatus.FAILED

        result.end_time = datetime.now(timezone.utc)
        return result
|
|
|
|
|
|
def main():
    """Command-line entry point: parse args, run the sync, print a summary."""
    parser = argparse.ArgumentParser(description="Sync person JSON files to Oxigraph")
    parser.add_argument("--dry-run", action="store_true", help="Parse files but don't upload")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--endpoint", default=OXIGRAPH_URL, help="Oxigraph SPARQL endpoint URL")
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE, help="Files per batch")
    parser.add_argument("--entity-only", action="store_true", help="Only process entity files")
    parser.add_argument("--staff-only", action="store_true", help="Only process staff files")
    parser.add_argument("--clear", action="store_true", help="Clear existing person data first")
    args = parser.parse_args()

    # Local import keeps logging setup out of module import time.
    import logging
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

    syncer = OxigraphPersonSyncer(endpoint=args.endpoint, batch_size=args.batch_size)

    banner = "=" * 60
    print(banner)
    print("Oxigraph Person Sync")
    print(banner)

    if not args.dry_run:
        print(f"Checking connection to {args.endpoint}...")
        status = syncer.get_status()
        print(f" Status: {status.get('status', 'unknown')}")
        if status.get('person_count'):
            print(f" Current persons: {status['person_count']:,}")

    print(f" Custodian mappings: {len(syncer.custodian_lookup):,}")

    result = syncer.sync(
        limit=args.limit,
        dry_run=args.dry_run,
        clear=args.clear,
        entity_only=args.entity_only,
        staff_only=args.staff_only,
    )

    print("\n" + banner)
    print(f"Sync Result: {result.status.value.upper()}")
    print(f" Files processed: {result.records_processed}")
    print(f" Files succeeded: {result.records_succeeded}")
    print(f" Files failed: {result.records_failed}")
    print(f" Persons synced: {result.details.get('total_persons', 0):,}")
    print(f" Triples: {result.details.get('total_triples', 0):,}")
    print(f" Duration: {result.duration_seconds:.2f}s")
    if result.error_message:
        print(f" Error: {result.error_message}")
    print(banner)
|
|
|
|
|
# Script entry point: only run the sync when executed directly, not on import.
if __name__ == "__main__":
    main()
|