# glam/scripts/sync/oxigraph_person_sync.py
# Last modified: 2025-12-23 18:08:45 +01:00
#!/usr/bin/env python3
"""
Oxigraph Person Sync Module - Sync person JSON files to Oxigraph triplestore.
This module syncs all person data (entity profiles and staff lists) to Oxigraph
as RDF triples for SPARQL querying and graph-based retrieval.
Usage:
python -m scripts.sync.oxigraph_person_sync [--dry-run] [--limit N] [--endpoint URL]
Data Sources:
- data/custodian/person/entity/*.json - Individual person profile files
- data/custodian/person/affiliated/parsed/*_staff_*.json - Staff list files
"""
import argparse
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
import httpx
import yaml
from rdflib import Graph, Namespace, Literal, URIRef, BNode
from rdflib.namespace import RDF, RDFS, XSD, SKOS, DCTERMS, FOAF, OWL
# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
from scripts.sync import BaseSyncer, SyncResult, SyncStatus, DEFAULT_CUSTODIAN_DIR
# Configuration
# Oxigraph SPARQL/Graph Store endpoint; overridable via the OXIGRAPH_URL env var.
OXIGRAPH_URL = os.getenv("OXIGRAPH_URL", "http://localhost:7878")
# Number of person JSON files converted and uploaded per batch.
BATCH_SIZE = 500
# Data directories
# Individual person profile files (one JSON per person).
PERSON_ENTITY_DIR = PROJECT_ROOT / "data" / "custodian" / "person" / "entity"
# Parsed staff-list files (many persons per JSON).
PERSON_AFFILIATED_DIR = PROJECT_ROOT / "data" / "custodian" / "person" / "affiliated" / "parsed"
# Namespaces - Using nde.nl as canonical namespace (per LinkML schema)
HC = Namespace("https://nde.nl/ontology/hc/")
HP = Namespace("https://nde.nl/ontology/hc/person/")
# NOTE(review): GHCID shares the exact namespace IRI with HC above — the
# "ghcid" prefix is bound separately in PersonRDFConverter; confirm this
# duplication is intentional.
GHCID = Namespace("https://nde.nl/ontology/hc/")
NDE = Namespace("https://nde.nl/ontology/hc/class/")
ORG = Namespace("http://www.w3.org/ns/org#")
RICO = Namespace("https://www.ica.org/standards/RiC/ontology#")
SCHEMA = Namespace("http://schema.org/")
CIDOC = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
PROV = Namespace("http://www.w3.org/ns/prov#")
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")
GEO = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
PICO = Namespace("https://personsincontext.org/model#")
PNV = Namespace("https://w3id.org/pnv#")
def sanitize_for_uri(value: str) -> str:
    """Make a string safe for use as a URI local name.

    Problematic characters are collapsed into single underscores, leading
    and trailing underscores are dropped, and the result is lower-cased.
    Empty or fully-stripped input yields "unknown".
    """
    if not value:
        return "unknown"
    cleaned = re.sub(r'[`\'\"<>{}|\\^~\[\]@#$%&*()+=,;!? ]', '_', value)
    cleaned = re.sub(r'_+', '_', cleaned).strip('_')
    if not cleaned:
        return "unknown"
    return cleaned.lower()
def normalize_location(location) -> str:
    """Normalize a location value to a single display string.

    Accepts either a pre-formatted string ("Amsterdam, North Holland,
    Netherlands") or a dict with optional 'city'/'region'/'country' keys.
    Falsy input yields the empty string; any other type is str()-ified.
    """
    if not location:
        return ""
    if isinstance(location, str):
        return location
    if isinstance(location, dict):
        # Keep city -> region -> country order, skipping missing/empty parts.
        parts = [
            str(location[key])
            for key in ('city', 'region', 'country')
            if location.get(key)
        ]
        return ", ".join(parts)
    return str(location)
def extract_country_code(location) -> Optional[str]:
    """Extract an ISO 3166-1 alpha-2 country code from a location value.

    Handles:
    - String: 'Amsterdam, North Holland, Netherlands (NL)'
    - Dict: {"city": "Amsterdam", "region": "North Holland", "country": "Netherlands"}

    Returns the two-letter code, or None when no country can be recognised.
    """
    if not location:
        return None
    # Handle dict format: prefer the explicit country field.
    if isinstance(location, dict):
        country = location.get('country', '')
        location = country if country else normalize_location(location)
    if not isinstance(location, str):
        location = str(location)
    # Try to find an explicit country code in parentheses, e.g. "(NL)".
    match = re.search(r'\(([A-Z]{2})\)', location)
    if match:
        return match.group(1)
    # Common country name -> code mappings (order matters: longer names
    # such as 'united kingdom' are listed before short aliases like 'uk').
    country_map = {
        'netherlands': 'NL', 'nederland': 'NL',
        'united states': 'US', 'usa': 'US',
        'united kingdom': 'GB', 'uk': 'GB', 'england': 'GB',
        'germany': 'DE', 'deutschland': 'DE',
        'france': 'FR',
        'belgium': 'BE', 'belgië': 'BE',
        'spain': 'ES', 'españa': 'ES',
        'italy': 'IT', 'italia': 'IT',
        'australia': 'AU',
        'canada': 'CA',
        'japan': 'JP',
        'china': 'CN',
    }
    lower_loc = location.lower()
    for country, code in country_map.items():
        # BUGFIX: match on word boundaries instead of bare substring
        # containment, which made 'uk' match 'Ukraine' (-> GB) and similar
        # false positives.
        if re.search(r'\b' + re.escape(country) + r'\b', lower_loc):
            return code
    return None
class PersonRDFConverter:
    """Convert person JSON documents into RDF triples on an rdflib Graph.

    Two input shapes are supported:
      * individual entity profile files, covering several schema variants
        (``add_person_from_entity``), and
      * staff-list files describing many persons for one custodian
        (``add_persons_from_staff_list``).

    An optional custodian lookup table (lower-cased name/slug -> GHCID
    string) lets persons be linked to their employing organisation.
    """

    def __init__(self, custodian_lookup: dict[str, str] | None = None):
        """Start with an empty graph that has all ontology prefixes bound.

        Args:
            custodian_lookup: mapping of lower-cased custodian names and
                slugs to GHCID identifiers, used by _resolve_custodian_uri.
        """
        self.graph = Graph()
        self._bind_namespaces()
        self.custodian_lookup = custodian_lookup or {}
        self._persons_added = 0

    def _bind_namespaces(self) -> None:
        """Bind all ontology namespaces (prefix -> namespace) to the graph."""
        self.graph.bind("hc", HC)
        self.graph.bind("hp", HP)
        self.graph.bind("ghcid", GHCID)
        self.graph.bind("nde", NDE)
        self.graph.bind("org", ORG)
        self.graph.bind("rico", RICO)
        self.graph.bind("schema", SCHEMA)
        self.graph.bind("cidoc", CIDOC)
        self.graph.bind("prov", PROV)
        self.graph.bind("wd", WD)
        self.graph.bind("wdt", WDT)
        self.graph.bind("geo", GEO)
        self.graph.bind("dcterms", DCTERMS)
        self.graph.bind("foaf", FOAF)
        self.graph.bind("skos", SKOS)
        self.graph.bind("owl", OWL)
        self.graph.bind("pico", PICO)
        self.graph.bind("pnv", PNV)

    def reset(self) -> None:
        """Discard all triples and start a fresh graph for the next batch."""
        self.graph = Graph()
        self._bind_namespaces()
        self._persons_added = 0

    def _get_person_uri(self, data: dict, filepath: Path) -> URIRef:
        """Generate a URI for a person.

        Preference order: LinkedIn slug (extracted from the profile URL),
        then staff_id, then the source filename stem as a last resort.
        """
        extraction_meta = data.get('extraction_metadata', {})
        profile_data = data.get('profile_data', {})
        # Try LinkedIn URL to get the slug
        linkedin_url = (
            extraction_meta.get('linkedin_url') or
            profile_data.get('linkedin_url') or
            ''
        )
        if linkedin_url:
            # Extract slug from URL like https://www.linkedin.com/in/alex-brandsen
            match = re.search(r'linkedin\.com/in/([^/?]+)', linkedin_url)
            if match:
                return HP[sanitize_for_uri(match.group(1))]
        # Fallback to staff_id
        staff_id = extraction_meta.get('staff_id', '')
        if staff_id:
            return HP[sanitize_for_uri(staff_id)]
        # Last resort: filename stem (unique per file at least)
        return HP[sanitize_for_uri(filepath.stem)]

    def _resolve_custodian_uri(self, custodian_name: str) -> URIRef | None:
        """Resolve a custodian name to its GHCID URI, or None if unknown.

        Tries an exact lower-cased match, then a hyphenated slug, then a
        fuzzy substring match in either direction.
        """
        if not custodian_name:
            return None
        name_lower = custodian_name.lower()
        if name_lower in self.custodian_lookup:
            return GHCID[sanitize_for_uri(self.custodian_lookup[name_lower])]
        # Try slug form ("royal museum" -> "royal-museum")
        slug = name_lower.replace(' ', '-')
        if slug in self.custodian_lookup:
            return GHCID[sanitize_for_uri(self.custodian_lookup[slug])]
        # Fuzzy fallback: substring containment either way.  This can
        # mis-match on very short names; recall is preferred over precision.
        for key, ghcid in self.custodian_lookup.items():
            if name_lower in key or key in name_lower:
                return GHCID[sanitize_for_uri(ghcid)]
        return None

    def add_person_from_entity(self, data: dict, filepath: Path) -> URIRef | None:
        """Convert one person entity JSON document to RDF triples.

        Handles multiple schema variants (the standard profile_data layout,
        staff-list extraction fields, and the exa_crawling_linkedin_profile
        variant).  Returns the person URI, or None when no name could be
        found — in which case nothing is added to the graph.
        """
        extraction_meta = data.get('extraction_metadata', {})
        source_staff = data.get('source_staff_info', {})
        profile_data = data.get('profile_data', {})
        heritage_rel = data.get('heritage_relevance', {})
        person_data = data.get('person', {})  # Alternative schema variant
        heritage_profile = data.get('heritage_profile', {})  # Alternative schema
        # Get name (required) - handle multiple schema variants:
        # 1. profile_data.name / profile_data.full_name (standard schema)
        # 2. source_staff_info.name (from staff list extraction)
        # 3. person.full_name (alternative schema - exa_crawling_linkedin_profile)
        name = (
            profile_data.get('name') or
            profile_data.get('full_name') or
            source_staff.get('name') or
            person_data.get('full_name') or
            person_data.get('name')
        )
        if not name:
            return None
        # Create person URI
        person_uri = self._get_person_uri(data, filepath)
        # Type declarations across the target ontologies
        self.graph.add((person_uri, RDF.type, SCHEMA.Person))
        self.graph.add((person_uri, RDF.type, FOAF.Person))
        self.graph.add((person_uri, RDF.type, PICO.PersonObservation))
        self.graph.add((person_uri, RDF.type, CIDOC.E21_Person))
        # Name in every vocabulary used for labelling/retrieval
        self.graph.add((person_uri, SCHEMA.name, Literal(name)))
        self.graph.add((person_uri, FOAF.name, Literal(name)))
        self.graph.add((person_uri, RDFS.label, Literal(name)))
        self.graph.add((person_uri, SKOS.prefLabel, Literal(name)))
        # LinkedIn URL as sameAs - check multiple schema locations
        linkedin_url = (
            extraction_meta.get('linkedin_url') or
            profile_data.get('linkedin_url') or
            person_data.get('linkedin_url')
        )
        if linkedin_url:
            try:
                self.graph.add((person_uri, SCHEMA.sameAs, URIRef(linkedin_url)))
                self.graph.add((person_uri, OWL.sameAs, URIRef(linkedin_url)))
            except Exception:
                pass  # malformed URL: skip the sameAs links, keep the person
        # Headline / job title - check multiple schema locations
        headline = (
            profile_data.get('headline') or
            source_staff.get('headline') or
            person_data.get('headline')
        )
        if headline:
            self.graph.add((person_uri, SCHEMA.jobTitle, Literal(headline)))
            self.graph.add((person_uri, SCHEMA.description, Literal(headline)))
        # Location (string or dict format) - check multiple schema locations
        location_raw = profile_data.get('location') or person_data.get('location')
        location = normalize_location(location_raw)
        if location:
            self.graph.add((person_uri, SCHEMA.workLocation, Literal(location)))
            country_code = extract_country_code(location_raw)
            if country_code:
                self.graph.add((person_uri, HC.countryCode, Literal(country_code)))
        # About / description - check multiple schema locations
        about = profile_data.get('about') or person_data.get('about')
        if about:
            self.graph.add((person_uri, SCHEMA.disambiguatingDescription, Literal(about)))
        # Profile image - check multiple schema locations
        profile_image = (
            profile_data.get('profile_image_url') or
            person_data.get('photo_url')
        )
        if profile_image:
            try:
                self.graph.add((person_uri, SCHEMA.image, URIRef(profile_image)))
                self.graph.add((person_uri, FOAF.img, URIRef(profile_image)))
            except Exception:
                pass  # malformed URL: skip the image triples
        # Heritage relevance - first value actually present wins.
        # BUGFIX: the old `a.get(key, True) or b.get(key, True)` chain could
        # never evaluate to False (an explicit False fell through to the
        # second .get whose default is True), silently overriding explicit
        # non-relevance flags.
        relevant_flag = heritage_rel.get('is_heritage_relevant')
        if relevant_flag is None:
            relevant_flag = heritage_profile.get('is_heritage_professional')
        is_heritage_relevant = True if relevant_flag is None else bool(relevant_flag)
        self.graph.add((person_uri, HC.heritageRelevant,
                        Literal(is_heritage_relevant, datatype=XSD.boolean)))
        heritage_types = (
            heritage_rel.get('heritage_types', []) or
            heritage_profile.get('heritage_types', [])
        )
        if not heritage_types and source_staff.get('heritage_type'):
            heritage_types = [source_staff.get('heritage_type')]
        for ht in heritage_types:
            if ht:
                self.graph.add((person_uri, HC.heritageType, Literal(ht)))
        # Link to custodian (employer) - check multiple schema locations
        custodian_name = (
            source_staff.get('custodian') or
            data.get('network_context', {}).get('source_custodian')
        )
        if custodian_name:
            custodian_uri = self._resolve_custodian_uri(custodian_name)
            if custodian_uri:
                self.graph.add((person_uri, SCHEMA.worksFor, custodian_uri))
                self.graph.add((person_uri, ORG.memberOf, custodian_uri))
            # Also store as literal for cases where lookup fails
            self.graph.add((person_uri, HC.custodianName, Literal(custodian_name)))
        # Skills (handle null values) - check multiple schema locations
        for skill in (profile_data.get('skills') or data.get('skills') or []):
            if skill:
                self.graph.add((person_uri, SCHEMA.knowsAbout, Literal(skill)))
        # Languages (handle null values and dict format)
        for lang in (profile_data.get('languages') or []):
            lang_name = lang.get('language', '') if isinstance(lang, dict) else str(lang)
            if lang_name:
                self.graph.add((person_uri, SCHEMA.knowsLanguage, Literal(lang_name)))
        # Experience (as structured blank nodes) - schema variants:
        # 1. profile_data.experience / profile_data.career_history (standard)
        # 2. professional_experience (alternative schema)
        experience = (
            profile_data.get('experience') or
            profile_data.get('career_history') or
            data.get('professional_experience') or
            []
        )
        for exp in experience:
            if not isinstance(exp, dict):
                continue
            exp_node = BNode()
            self.graph.add((exp_node, RDF.type, SCHEMA.WorkExperience))
            if exp.get('title'):
                self.graph.add((exp_node, SCHEMA.roleName, Literal(exp['title'])))
            # Handle both 'company' (Schema A) and 'organization' (Schema B)
            company = exp.get('company') or exp.get('organization')
            if company:
                self.graph.add((exp_node, SCHEMA.name, Literal(company)))
            # date_range/duration fields; else construct from start/end dates
            duration = exp.get('date_range') or exp.get('duration')
            if not duration and exp.get('start_date'):
                end_date = exp.get('end_date', 'Present')
                duration = f"{exp['start_date']} - {end_date}"
            if duration:
                self.graph.add((exp_node, SCHEMA.temporal, Literal(duration)))
            # Location may itself be a dict; normalize before storing
            exp_location = exp.get('location')
            if exp_location:
                exp_location_str = normalize_location(exp_location) if isinstance(exp_location, dict) else str(exp_location)
                self.graph.add((exp_node, SCHEMA.workLocation, Literal(exp_location_str)))
            if exp.get('description'):
                self.graph.add((exp_node, SCHEMA.description, Literal(exp['description'])))
            self.graph.add((person_uri, SCHEMA.hasOccupation, exp_node))
        # Education (handle null and both schema variants)
        # Schema A: profile_data.education, Schema B: root education
        for edu in (profile_data.get('education') or data.get('education') or []):
            if not isinstance(edu, dict):
                continue
            edu_node = BNode()
            self.graph.add((edu_node, RDF.type, SCHEMA.EducationalOccupationalCredential))
            if edu.get('degree'):
                self.graph.add((edu_node, SCHEMA.name, Literal(edu['degree'])))
            if edu.get('institution'):
                self.graph.add((edu_node, SCHEMA.educationalCredentialAwarded, Literal(edu['institution'])))
            # Handle field_of_study (variant B) or field (variant A)
            field = edu.get('field_of_study') or edu.get('field')
            if field:
                self.graph.add((edu_node, SCHEMA.educationalProgramName, Literal(field)))
            # Handle date_range, years, or start_year/end_year
            date_range = edu.get('date_range') or edu.get('years')
            if not date_range and edu.get('start_year'):
                end_year = edu.get('end_year', '')
                date_range = f"{edu['start_year']} - {end_year}" if end_year else str(edu['start_year'])
            if date_range:
                self.graph.add((edu_node, SCHEMA.temporal, Literal(date_range)))
            self.graph.add((person_uri, SCHEMA.alumniOf, edu_node))
        # Provenance
        extraction_date = extraction_meta.get('extraction_date')
        if extraction_date:
            self.graph.add((person_uri, PROV.generatedAtTime,
                            Literal(extraction_date, datatype=XSD.dateTime)))
        extraction_method = extraction_meta.get('extraction_method')
        if extraction_method:
            self.graph.add((person_uri, PROV.wasGeneratedBy, Literal(extraction_method)))
        staff_id = extraction_meta.get('staff_id')
        if staff_id:
            self.graph.add((person_uri, DCTERMS.identifier, Literal(staff_id)))
            self.graph.add((person_uri, HC.staffId, Literal(staff_id)))
        self._persons_added += 1
        return person_uri

    def add_persons_from_staff_list(self, data: dict, filepath: Path) -> list[URIRef]:
        """Convert a staff-list JSON document to RDF triples.

        Returns the list of person URIs added.  An empty list is a valid
        result (a staff file with zero usable entries is not an error).
        """
        uris = []
        custodian_meta = data.get('custodian_metadata', {})
        custodian_name = custodian_meta.get('custodian_name', '')
        custodian_slug = custodian_meta.get('custodian_slug', '')
        # Resolve the custodian URI once for all staff in this file
        custodian_uri = self._resolve_custodian_uri(custodian_name) or self._resolve_custodian_uri(custodian_slug)
        for person in data.get('staff', []):
            if not isinstance(person, dict):
                continue
            name = person.get('name')
            if not name:
                continue
            # Person URI from linkedin_slug, staff_id, or name + custodian
            linkedin_slug = person.get('linkedin_slug')
            staff_id = person.get('staff_id')
            if linkedin_slug:
                person_uri = HP[sanitize_for_uri(linkedin_slug)]
            elif staff_id:
                person_uri = HP[sanitize_for_uri(staff_id)]
            else:
                person_uri = HP[sanitize_for_uri(f"{custodian_slug}_{name}")]
            # Type declarations
            self.graph.add((person_uri, RDF.type, SCHEMA.Person))
            self.graph.add((person_uri, RDF.type, FOAF.Person))
            self.graph.add((person_uri, RDF.type, PICO.PersonObservation))
            # Name
            self.graph.add((person_uri, SCHEMA.name, Literal(name)))
            self.graph.add((person_uri, FOAF.name, Literal(name)))
            self.graph.add((person_uri, RDFS.label, Literal(name)))
            # LinkedIn URL
            linkedin_url = person.get('linkedin_profile_url')
            if linkedin_url:
                try:
                    self.graph.add((person_uri, SCHEMA.sameAs, URIRef(linkedin_url)))
                except Exception:
                    pass  # malformed URL: skip the sameAs link
            # Headline
            headline = person.get('headline')
            if headline:
                self.graph.add((person_uri, SCHEMA.jobTitle, Literal(headline)))
            # Heritage relevance (defaults to True when absent)
            heritage_relevant = person.get('heritage_relevant', True)
            self.graph.add((person_uri, HC.heritageRelevant,
                            Literal(heritage_relevant, datatype=XSD.boolean)))
            heritage_type = person.get('heritage_type')
            if heritage_type:
                self.graph.add((person_uri, HC.heritageType, Literal(heritage_type)))
            # Link to custodian
            if custodian_uri:
                self.graph.add((person_uri, SCHEMA.worksFor, custodian_uri))
                self.graph.add((person_uri, ORG.memberOf, custodian_uri))
            if custodian_name:
                self.graph.add((person_uri, HC.custodianName, Literal(custodian_name)))
            # Staff ID
            if staff_id:
                self.graph.add((person_uri, HC.staffId, Literal(staff_id)))
                self.graph.add((person_uri, DCTERMS.identifier, Literal(staff_id)))
            uris.append(person_uri)
            self._persons_added += 1
        return uris

    def serialize(self, format: str = "turtle") -> bytes:
        """Serialize the current graph to bytes in the given RDF format."""
        return self.graph.serialize(format=format).encode("utf-8")

    def triple_count(self) -> int:
        """Number of triples currently in the graph."""
        return len(self.graph)

    def persons_added(self) -> int:
        """Number of persons added since the last reset()."""
        return self._persons_added
# On-disk cache for the custodian lookup table, refreshed daily.
CACHE_FILE = PROJECT_ROOT / "data" / ".custodian_lookup_cache.json"
CACHE_MAX_AGE_HOURS = 24


def build_custodian_lookup(custodian_dir: Path = DEFAULT_CUSTODIAN_DIR, use_cache: bool = True) -> dict[str, str]:
    """
    Build a custodian name/slug -> GHCID lookup table from YAML filenames.

    Custodian YAML files are named by GHCID (e.g. NL-NH-AMS-M-RM.yaml or
    NL-NH-AMS-M-RM-rijksmuseum.yaml), so the identifier can be read from
    the filename directly — far faster than parsing 27k YAML files.
    A JSON cache under data/ avoids re-scanning within 24 hours.
    """
    mapping: dict[str, str] = {}
    # Serve from the on-disk cache when it is fresh enough.
    if use_cache and CACHE_FILE.exists():
        try:
            age_hours = (datetime.now().timestamp() - CACHE_FILE.stat().st_mtime) / 3600
            if age_hours < CACHE_MAX_AGE_HOURS:
                with open(CACHE_FILE, 'r') as fh:
                    return json.load(fh).get('lookup', {})
        except Exception:
            pass  # cache is best-effort; fall through to a full rescan
    if not custodian_dir.exists():
        return mapping
    yaml_files = list(custodian_dir.glob("*.yaml"))
    for path in yaml_files:
        stem = path.stem  # e.g. "NL-NH-AMS-M-RM" or "NL-NH-AMS-M-RM-rijksmuseum"
        pieces = stem.split('-')
        if len(pieces) < 5:
            continue  # not a standard GHCID-shaped filename
        # Standard GHCID shape: CC-RR-CCC-T-ABB (the first five pieces).
        base_ghcid = '-'.join(pieces[:5])
        # Index by the full stem and by the base GHCID.
        mapping[stem.lower()] = stem
        mapping[base_ghcid.lower()] = stem
        if len(pieces) > 5:
            # Trailing pieces form a human-readable name suffix; index it
            # both as-is and with underscores turned into spaces.
            suffix = '-'.join(pieces[5:])
            mapping[suffix.lower()] = stem
            mapping[suffix.replace('_', ' ').lower()] = stem
    # Persist the freshly-built table (best effort).
    if use_cache:
        try:
            CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
            with open(CACHE_FILE, 'w') as fh:
                json.dump({
                    'lookup': mapping,
                    'generated_at': datetime.now(timezone.utc).isoformat(),
                    'file_count': len(yaml_files)
                }, fh)
        except Exception:
            pass  # a cache write failure must not break the sync
    return mapping
class OxigraphPersonSyncer(BaseSyncer):
    """Sync person JSON files to Oxigraph triplestore.

    Converts entity-profile and staff-list JSON files to RDF via
    PersonRDFConverter and uploads them batch-wise over Oxigraph's
    HTTP Graph Store endpoint (/store, default graph).
    """

    database_name = "oxigraph_persons"

    def __init__(
        self,
        endpoint: str = OXIGRAPH_URL,
        batch_size: int = BATCH_SIZE,
        entity_dir: Path = PERSON_ENTITY_DIR,
        affiliated_dir: Path = PERSON_AFFILIATED_DIR,
        **kwargs
    ):
        # kwargs are forwarded to BaseSyncer (which presumably sets up
        # self.logger and self.progress — TODO confirm against scripts.sync).
        super().__init__(**kwargs)
        self.endpoint = endpoint.rstrip("/")
        self.batch_size = batch_size
        self.entity_dir = entity_dir
        self.affiliated_dir = affiliated_dir
        # Generous timeout: a batch upload of ~500 files can be large.
        self.client = httpx.Client(timeout=120.0)
        # Build custodian lookup once; shared with the converter for
        # resolving employer links.
        self.logger.info("Building custodian lookup table...")
        self.custodian_lookup = build_custodian_lookup()
        self.logger.info(f"Loaded {len(self.custodian_lookup)} custodian name mappings")
        self.converter = PersonRDFConverter(custodian_lookup=self.custodian_lookup)

    def check_connection(self) -> bool:
        """Check if Oxigraph is available via a trivial ASK query."""
        try:
            resp = self.client.post(
                f"{self.endpoint}/query",
                headers={"Content-Type": "application/sparql-query"},
                content="ASK { ?s ?p ?o }"
            )
            return resp.status_code in (200, 204)
        except Exception as e:
            self.logger.error(f"Oxigraph connection failed: {e}")
            return False

    def get_status(self) -> dict:
        """Get Oxigraph status for person data.

        Returns a dict with 'status' and, when reachable, 'person_count'
        (distinct schema:Person subjects in the hp: namespace).
        """
        try:
            # Count person triples specifically (restricted to the hp: prefix
            # so unrelated schema:Person resources are excluded).
            resp = self.client.post(
                f"{self.endpoint}/query",
                headers={
                    "Content-Type": "application/sparql-query",
                    "Accept": "application/sparql-results+json"
                },
                content="""
                    PREFIX schema: <http://schema.org/>
                    PREFIX hp: <https://nde.nl/ontology/hc/person/>
                    SELECT (COUNT(DISTINCT ?person) AS ?count)
                    WHERE {
                        ?person a schema:Person .
                        FILTER(STRSTARTS(STR(?person), STR(hp:)))
                    }
                """
            )
            if resp.status_code == 200:
                data = resp.json()
                count = int(data["results"]["bindings"][0]["count"]["value"])
                return {"status": "healthy", "person_count": count}
        except Exception as e:
            return {"status": "unavailable", "error": str(e)}
        # Reachable but non-200 response falls through to "unknown".
        return {"status": "unknown"}

    def _upload_turtle(self, ttl_data: bytes) -> bool:
        """Upload Turtle data to Oxigraph's default graph (Graph Store POST)."""
        try:
            resp = self.client.post(
                f"{self.endpoint}/store?default",
                headers={"Content-Type": "text/turtle"},
                content=ttl_data
            )
            return resp.status_code in (200, 201, 204)
        except Exception as e:
            self.logger.error(f"Failed to upload data: {e}")
            return False

    def _list_entity_files(self) -> list[Path]:
        """List all person entity JSON files (sorted for stable ordering)."""
        if not self.entity_dir.exists():
            self.logger.warning(f"Entity directory not found: {self.entity_dir}")
            return []
        return sorted(self.entity_dir.glob("*.json"))

    def _list_staff_files(self) -> list[Path]:
        """List all staff list JSON files (sorted for stable ordering)."""
        if not self.affiliated_dir.exists():
            self.logger.warning(f"Affiliated directory not found: {self.affiliated_dir}")
            return []
        return sorted(self.affiliated_dir.glob("*_staff_*.json"))

    def sync(
        self,
        limit: Optional[int] = None,
        dry_run: bool = False,
        clear: bool = False,
        entity_only: bool = False,
        staff_only: bool = False,
    ) -> SyncResult:
        """Sync all person JSON files to Oxigraph.

        Args:
            limit: cap on the total number of files processed.
            dry_run: parse and count triples but do not upload.
            clear: accepted for interface compatibility; clearing is not
                implemented here (see note below).
            entity_only / staff_only: restrict which file type is processed.

        Returns:
            A SyncResult with per-file counts and triple/person totals in
            ``details``.
        """
        result = SyncResult(
            database="oxigraph_persons",
            status=SyncStatus.IN_PROGRESS,
            start_time=datetime.now(timezone.utc),
        )
        # Check connection (skipped for dry runs, which never upload)
        if not dry_run and not self.check_connection():
            result.status = SyncStatus.FAILED
            result.error_message = f"Cannot connect to Oxigraph at {self.endpoint}"
            result.end_time = datetime.now(timezone.utc)
            return result
        # Gather files to process, tagged with their type for dispatch below.
        files_to_process = []
        if not staff_only:
            entity_files = self._list_entity_files()
            files_to_process.extend([('entity', f) for f in entity_files])
            self.logger.info(f"Found {len(entity_files)} entity files")
        if not entity_only:
            staff_files = self._list_staff_files()
            files_to_process.extend([('staff', f) for f in staff_files])
            self.logger.info(f"Found {len(staff_files)} staff files")
        if limit:
            files_to_process = files_to_process[:limit]
        self.progress.total_files = len(files_to_process)
        self.progress.current_database = "oxigraph_persons"
        self.logger.info(f"Processing {len(files_to_process)} files...")
        # Clear existing person data (optional) — currently a no-op that
        # only logs a hint; --clear does not delete anything.
        if not dry_run and clear:
            self.logger.info("Note: Use SPARQL DELETE to clear person data if needed")
        # Process in batches: the converter graph is reset per batch and the
        # whole batch is serialized and uploaded as one Turtle document.
        total_triples = 0
        total_persons = 0
        processed = 0
        failed = 0
        for i in range(0, len(files_to_process), self.batch_size):
            batch = files_to_process[i:i + self.batch_size]
            batch_num = i // self.batch_size + 1
            self.converter.reset()
            for file_type, filepath in batch:
                self.progress.processed_files = processed + 1
                self.progress.current_file = filepath.name
                self._report_progress()
                try:
                    with open(filepath, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                    if not data:
                        failed += 1
                        continue
                    if file_type == 'entity':
                        uri = self.converter.add_person_from_entity(data, filepath)
                        if uri:
                            processed += 1
                        else:
                            # No extractable name — counted as failed.
                            self.logger.debug(f"No person extracted from entity: {filepath.name}")
                            failed += 1
                    else:  # staff
                        uris = self.converter.add_persons_from_staff_list(data, filepath)
                        # Empty staff list is not a failure - it's just a file with no persons
                        # (e.g., LinkedIn company page with 0 visible employees)
                        if uris is not None:  # Explicitly check for None (error) vs [] (empty)
                            processed += 1
                        else:
                            self.logger.debug(f"Failed to process staff file: {filepath.name}")
                            failed += 1
                except json.JSONDecodeError as e:
                    self.logger.warning(f"JSON decode error in {filepath.name}: {e}")
                    failed += 1
                    self.progress.errors.append(f"{filepath.name}: JSON decode error")
                except Exception as e:
                    self.logger.warning(f"Error processing {filepath.name}: {e}")
                    failed += 1
                    self.progress.errors.append(f"{filepath.name}: {str(e)}")
            batch_triples = self.converter.triple_count()
            batch_persons = self.converter.persons_added()
            total_triples += batch_triples
            total_persons += batch_persons
            if not dry_run:
                try:
                    ttl_data = self.converter.serialize("turtle")
                    if self._upload_turtle(ttl_data):
                        self.logger.info(
                            f"Batch {batch_num}: loaded {len(batch)} files, "
                            f"{batch_persons} persons, {batch_triples:,} triples"
                        )
                    else:
                        # Upload failure fails the whole batch (files already
                        # counted as processed are also added to failed here).
                        self.logger.error(f"Batch {batch_num}: failed to upload")
                        failed += len(batch)
                except Exception as e:
                    self.logger.error(f"Batch {batch_num}: serialization failed: {e}")
                    failed += len(batch)
            else:
                self.logger.info(
                    f"Batch {batch_num}: parsed {len(batch)} files, "
                    f"{batch_persons} persons, {batch_triples:,} triples (dry-run)"
                )
        result.records_processed = len(files_to_process)
        result.records_succeeded = processed
        result.records_failed = failed
        result.details["total_triples"] = total_triples
        result.details["total_persons"] = total_persons
        result.details["batch_size"] = self.batch_size
        result.details["dry_run"] = dry_run
        result.details["custodian_mappings"] = len(self.custodian_lookup)
        # Overall status: SUCCESS only with zero failures; PARTIAL when at
        # least one file succeeded; FAILED otherwise.
        if failed == 0:
            result.status = SyncStatus.SUCCESS
        elif processed > 0:
            result.status = SyncStatus.PARTIAL
        else:
            result.status = SyncStatus.FAILED
        result.end_time = datetime.now(timezone.utc)
        return result
def main():
    """Command-line entry point: parse args, run the sync, print a summary."""
    parser = argparse.ArgumentParser(description="Sync person JSON files to Oxigraph")
    parser.add_argument("--dry-run", action="store_true", help="Parse files but don't upload")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--endpoint", default=OXIGRAPH_URL, help="Oxigraph SPARQL endpoint URL")
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE, help="Files per batch")
    parser.add_argument("--entity-only", action="store_true", help="Only process entity files")
    parser.add_argument("--staff-only", action="store_true", help="Only process staff files")
    parser.add_argument("--clear", action="store_true", help="Clear existing person data first")
    args = parser.parse_args()

    import logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    syncer = OxigraphPersonSyncer(endpoint=args.endpoint, batch_size=args.batch_size)

    banner = "=" * 60
    print(banner)
    print("Oxigraph Person Sync")
    print(banner)

    # A dry run never contacts the server, so skip the status probe.
    if not args.dry_run:
        print(f"Checking connection to {args.endpoint}...")
        status = syncer.get_status()
        print(f" Status: {status.get('status', 'unknown')}")
        if status.get('person_count'):
            print(f" Current persons: {status['person_count']:,}")
    print(f" Custodian mappings: {len(syncer.custodian_lookup):,}")

    result = syncer.sync(
        limit=args.limit,
        dry_run=args.dry_run,
        clear=args.clear,
        entity_only=args.entity_only,
        staff_only=args.staff_only,
    )

    # Final summary block
    print("\n" + banner)
    print(f"Sync Result: {result.status.value.upper()}")
    print(f" Files processed: {result.records_processed}")
    print(f" Files succeeded: {result.records_succeeded}")
    print(f" Files failed: {result.records_failed}")
    print(f" Persons synced: {result.details.get('total_persons', 0):,}")
    print(f" Triples: {result.details.get('total_triples', 0):,}")
    print(f" Duration: {result.duration_seconds:.2f}s")
    if result.error_message:
        print(f" Error: {result.error_message}")
    print(banner)


if __name__ == "__main__":
    main()