# glam/scripts/sync/oxigraph_person_sync.py
# Last modified: 2025-12-23 18:08:45 +01:00
#!/usr/bin/env python3
"""
Oxigraph Person Sync Module - Sync person JSON files to Oxigraph triplestore.
This module syncs all person data (entity profiles and staff lists) to Oxigraph
as RDF triples for SPARQL querying and graph-based retrieval.
Usage:
python -m scripts.sync.oxigraph_person_sync [--dry-run] [--limit N] [--endpoint URL]
Data Sources:
- data/custodian/person/entity/*.json - Individual person profile files
- data/custodian/person/affiliated/parsed/*_staff_*.json - Staff list files
"""
import argparse
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
import httpx
import yaml
from rdflib import Graph, Namespace, Literal, URIRef, BNode
from rdflib.namespace import RDF, RDFS, XSD, SKOS, DCTERMS, FOAF, OWL
# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
from scripts.sync import BaseSyncer, SyncResult, SyncStatus, DEFAULT_CUSTODIAN_DIR
# Configuration
# Oxigraph SPARQL/Graph Store endpoint; overridable via the OXIGRAPH_URL env var.
OXIGRAPH_URL = os.getenv("OXIGRAPH_URL", "http://localhost:7878")
# Number of person JSON files converted and uploaded per batch.
BATCH_SIZE = 500
# Data directories
# Individual person profile files (one JSON per person).
PERSON_ENTITY_DIR = PROJECT_ROOT / "data" / "custodian" / "person" / "entity"
# Parsed staff-list files (many persons per JSON).
PERSON_AFFILIATED_DIR = PROJECT_ROOT / "data" / "custodian" / "person" / "affiliated" / "parsed"
# Namespaces - Using nde.nl as canonical namespace (per LinkML schema)
HC = Namespace("https://nde.nl/ontology/hc/")
HP = Namespace("https://nde.nl/ontology/hc/person/")
# NOTE(review): GHCID shares the exact namespace IRI with HC above — the
# "ghcid" prefix is bound separately in PersonRDFConverter; confirm this
# duplication is intentional.
GHCID = Namespace("https://nde.nl/ontology/hc/")
NDE = Namespace("https://nde.nl/ontology/hc/class/")
ORG = Namespace("http://www.w3.org/ns/org#")
RICO = Namespace("https://www.ica.org/standards/RiC/ontology#")
SCHEMA = Namespace("http://schema.org/")
CIDOC = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
PROV = Namespace("http://www.w3.org/ns/prov#")
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")
GEO = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
PICO = Namespace("https://personsincontext.org/model#")
PNV = Namespace("https://w3id.org/pnv#")
def sanitize_for_uri(value: str) -> str:
    """Make a string safe for use as a URI local name.

    Problematic characters are collapsed into single underscores, leading
    and trailing underscores are dropped, and the result is lower-cased.
    Empty or fully-stripped input yields "unknown".
    """
    if not value:
        return "unknown"
    cleaned = re.sub(r'[`\'\"<>{}|\\^~\[\]@#$%&*()+=,;!? ]', '_', value)
    cleaned = re.sub(r'_+', '_', cleaned).strip('_')
    if not cleaned:
        return "unknown"
    return cleaned.lower()
def normalize_location(location) -> str:
    """Normalize a location value to a single display string.

    Accepts either a pre-formatted string ("Amsterdam, North Holland,
    Netherlands") or a dict with optional 'city'/'region'/'country' keys.
    Falsy input yields the empty string; any other type is str()-ified.
    """
    if not location:
        return ""
    if isinstance(location, str):
        return location
    if isinstance(location, dict):
        # Keep city -> region -> country order, skipping missing/empty parts.
        parts = [
            str(location[key])
            for key in ('city', 'region', 'country')
            if location.get(key)
        ]
        return ", ".join(parts)
    return str(location)
def extract_country_code(location) -> Optional[str]:
    """Extract an ISO 3166-1 alpha-2 country code from a location value.

    Handles:
    - String: 'Amsterdam, North Holland, Netherlands (NL)'
    - Dict: {"city": "Amsterdam", "region": "North Holland", "country": "Netherlands"}

    Returns the two-letter code, or None when no country can be recognised.
    """
    if not location:
        return None
    # Handle dict format: prefer the explicit country field.
    if isinstance(location, dict):
        country = location.get('country', '')
        location = country if country else normalize_location(location)
    if not isinstance(location, str):
        location = str(location)
    # Try to find an explicit country code in parentheses, e.g. "(NL)".
    match = re.search(r'\(([A-Z]{2})\)', location)
    if match:
        return match.group(1)
    # Common country name -> code mappings (order matters: longer names
    # such as 'united kingdom' are listed before short aliases like 'uk').
    country_map = {
        'netherlands': 'NL', 'nederland': 'NL',
        'united states': 'US', 'usa': 'US',
        'united kingdom': 'GB', 'uk': 'GB', 'england': 'GB',
        'germany': 'DE', 'deutschland': 'DE',
        'france': 'FR',
        'belgium': 'BE', 'belgië': 'BE',
        'spain': 'ES', 'españa': 'ES',
        'italy': 'IT', 'italia': 'IT',
        'australia': 'AU',
        'canada': 'CA',
        'japan': 'JP',
        'china': 'CN',
    }
    lower_loc = location.lower()
    for country, code in country_map.items():
        # BUGFIX: match on word boundaries instead of bare substring
        # containment, which made 'uk' match 'Ukraine' (-> GB) and similar
        # false positives.
        if re.search(r'\b' + re.escape(country) + r'\b', lower_loc):
            return code
    return None
class PersonRDFConverter:
    """Convert person JSON documents into RDF triples on an rdflib Graph.

    Two input shapes are supported:
      * individual entity profile files, covering several schema variants
        (``add_person_from_entity``), and
      * staff-list files describing many persons for one custodian
        (``add_persons_from_staff_list``).

    An optional custodian lookup table (lower-cased name/slug -> GHCID
    string) lets persons be linked to their employing organisation.
    """

    def __init__(self, custodian_lookup: dict[str, str] | None = None):
        """Start with an empty graph that has all ontology prefixes bound.

        Args:
            custodian_lookup: mapping of lower-cased custodian names and
                slugs to GHCID identifiers, used by _resolve_custodian_uri.
        """
        self.graph = Graph()
        self._bind_namespaces()
        self.custodian_lookup = custodian_lookup or {}
        self._persons_added = 0

    def _bind_namespaces(self) -> None:
        """Bind all ontology namespaces (prefix -> namespace) to the graph."""
        self.graph.bind("hc", HC)
        self.graph.bind("hp", HP)
        self.graph.bind("ghcid", GHCID)
        self.graph.bind("nde", NDE)
        self.graph.bind("org", ORG)
        self.graph.bind("rico", RICO)
        self.graph.bind("schema", SCHEMA)
        self.graph.bind("cidoc", CIDOC)
        self.graph.bind("prov", PROV)
        self.graph.bind("wd", WD)
        self.graph.bind("wdt", WDT)
        self.graph.bind("geo", GEO)
        self.graph.bind("dcterms", DCTERMS)
        self.graph.bind("foaf", FOAF)
        self.graph.bind("skos", SKOS)
        self.graph.bind("owl", OWL)
        self.graph.bind("pico", PICO)
        self.graph.bind("pnv", PNV)

    def reset(self) -> None:
        """Discard all triples and start a fresh graph for the next batch."""
        self.graph = Graph()
        self._bind_namespaces()
        self._persons_added = 0

    def _get_person_uri(self, data: dict, filepath: Path) -> URIRef:
        """Generate a URI for a person.

        Preference order: LinkedIn slug (extracted from the profile URL),
        then staff_id, then the source filename stem as a last resort.
        """
        extraction_meta = data.get('extraction_metadata', {})
        profile_data = data.get('profile_data', {})
        # Try LinkedIn URL to get the slug
        linkedin_url = (
            extraction_meta.get('linkedin_url') or
            profile_data.get('linkedin_url') or
            ''
        )
        if linkedin_url:
            # Extract slug from URL like https://www.linkedin.com/in/alex-brandsen
            match = re.search(r'linkedin\.com/in/([^/?]+)', linkedin_url)
            if match:
                return HP[sanitize_for_uri(match.group(1))]
        # Fallback to staff_id
        staff_id = extraction_meta.get('staff_id', '')
        if staff_id:
            return HP[sanitize_for_uri(staff_id)]
        # Last resort: filename stem (unique per file at least)
        return HP[sanitize_for_uri(filepath.stem)]

    def _resolve_custodian_uri(self, custodian_name: str) -> URIRef | None:
        """Resolve a custodian name to its GHCID URI, or None if unknown.

        Tries an exact lower-cased match, then a hyphenated slug, then a
        fuzzy substring match in either direction.
        """
        if not custodian_name:
            return None
        name_lower = custodian_name.lower()
        if name_lower in self.custodian_lookup:
            return GHCID[sanitize_for_uri(self.custodian_lookup[name_lower])]
        # Try slug form ("royal museum" -> "royal-museum")
        slug = name_lower.replace(' ', '-')
        if slug in self.custodian_lookup:
            return GHCID[sanitize_for_uri(self.custodian_lookup[slug])]
        # Fuzzy fallback: substring containment either way.  This can
        # mis-match on very short names; recall is preferred over precision.
        for key, ghcid in self.custodian_lookup.items():
            if name_lower in key or key in name_lower:
                return GHCID[sanitize_for_uri(ghcid)]
        return None

    def add_person_from_entity(self, data: dict, filepath: Path) -> URIRef | None:
        """Convert one person entity JSON document to RDF triples.

        Handles multiple schema variants (the standard profile_data layout,
        staff-list extraction fields, and the exa_crawling_linkedin_profile
        variant).  Returns the person URI, or None when no name could be
        found — in which case nothing is added to the graph.
        """
        extraction_meta = data.get('extraction_metadata', {})
        source_staff = data.get('source_staff_info', {})
        profile_data = data.get('profile_data', {})
        heritage_rel = data.get('heritage_relevance', {})
        person_data = data.get('person', {})  # Alternative schema variant
        heritage_profile = data.get('heritage_profile', {})  # Alternative schema
        # Get name (required) - handle multiple schema variants:
        # 1. profile_data.name / profile_data.full_name (standard schema)
        # 2. source_staff_info.name (from staff list extraction)
        # 3. person.full_name (alternative schema - exa_crawling_linkedin_profile)
        name = (
            profile_data.get('name') or
            profile_data.get('full_name') or
            source_staff.get('name') or
            person_data.get('full_name') or
            person_data.get('name')
        )
        if not name:
            return None
        # Create person URI
        person_uri = self._get_person_uri(data, filepath)
        # Type declarations across the target ontologies
        self.graph.add((person_uri, RDF.type, SCHEMA.Person))
        self.graph.add((person_uri, RDF.type, FOAF.Person))
        self.graph.add((person_uri, RDF.type, PICO.PersonObservation))
        self.graph.add((person_uri, RDF.type, CIDOC.E21_Person))
        # Name in every vocabulary used for labelling/retrieval
        self.graph.add((person_uri, SCHEMA.name, Literal(name)))
        self.graph.add((person_uri, FOAF.name, Literal(name)))
        self.graph.add((person_uri, RDFS.label, Literal(name)))
        self.graph.add((person_uri, SKOS.prefLabel, Literal(name)))
        # LinkedIn URL as sameAs - check multiple schema locations
        linkedin_url = (
            extraction_meta.get('linkedin_url') or
            profile_data.get('linkedin_url') or
            person_data.get('linkedin_url')
        )
        if linkedin_url:
            try:
                self.graph.add((person_uri, SCHEMA.sameAs, URIRef(linkedin_url)))
                self.graph.add((person_uri, OWL.sameAs, URIRef(linkedin_url)))
            except Exception:
                pass  # malformed URL: skip the sameAs links, keep the person
        # Headline / job title - check multiple schema locations
        headline = (
            profile_data.get('headline') or
            source_staff.get('headline') or
            person_data.get('headline')
        )
        if headline:
            self.graph.add((person_uri, SCHEMA.jobTitle, Literal(headline)))
            self.graph.add((person_uri, SCHEMA.description, Literal(headline)))
        # Location (string or dict format) - check multiple schema locations
        location_raw = profile_data.get('location') or person_data.get('location')
        location = normalize_location(location_raw)
        if location:
            self.graph.add((person_uri, SCHEMA.workLocation, Literal(location)))
            country_code = extract_country_code(location_raw)
            if country_code:
                self.graph.add((person_uri, HC.countryCode, Literal(country_code)))
        # About / description - check multiple schema locations
        about = profile_data.get('about') or person_data.get('about')
        if about:
            self.graph.add((person_uri, SCHEMA.disambiguatingDescription, Literal(about)))
        # Profile image - check multiple schema locations
        profile_image = (
            profile_data.get('profile_image_url') or
            person_data.get('photo_url')
        )
        if profile_image:
            try:
                self.graph.add((person_uri, SCHEMA.image, URIRef(profile_image)))
                self.graph.add((person_uri, FOAF.img, URIRef(profile_image)))
            except Exception:
                pass  # malformed URL: skip the image triples
        # Heritage relevance - first value actually present wins.
        # BUGFIX: the old `a.get(key, True) or b.get(key, True)` chain could
        # never evaluate to False (an explicit False fell through to the
        # second .get whose default is True), silently overriding explicit
        # non-relevance flags.
        relevant_flag = heritage_rel.get('is_heritage_relevant')
        if relevant_flag is None:
            relevant_flag = heritage_profile.get('is_heritage_professional')
        is_heritage_relevant = True if relevant_flag is None else bool(relevant_flag)
        self.graph.add((person_uri, HC.heritageRelevant,
                        Literal(is_heritage_relevant, datatype=XSD.boolean)))
        heritage_types = (
            heritage_rel.get('heritage_types', []) or
            heritage_profile.get('heritage_types', [])
        )
        if not heritage_types and source_staff.get('heritage_type'):
            heritage_types = [source_staff.get('heritage_type')]
        for ht in heritage_types:
            if ht:
                self.graph.add((person_uri, HC.heritageType, Literal(ht)))
        # Link to custodian (employer) - check multiple schema locations
        custodian_name = (
            source_staff.get('custodian') or
            data.get('network_context', {}).get('source_custodian')
        )
        if custodian_name:
            custodian_uri = self._resolve_custodian_uri(custodian_name)
            if custodian_uri:
                self.graph.add((person_uri, SCHEMA.worksFor, custodian_uri))
                self.graph.add((person_uri, ORG.memberOf, custodian_uri))
            # Also store as literal for cases where lookup fails
            self.graph.add((person_uri, HC.custodianName, Literal(custodian_name)))
        # Skills (handle null values) - check multiple schema locations
        for skill in (profile_data.get('skills') or data.get('skills') or []):
            if skill:
                self.graph.add((person_uri, SCHEMA.knowsAbout, Literal(skill)))
        # Languages (handle null values and dict format)
        for lang in (profile_data.get('languages') or []):
            lang_name = lang.get('language', '') if isinstance(lang, dict) else str(lang)
            if lang_name:
                self.graph.add((person_uri, SCHEMA.knowsLanguage, Literal(lang_name)))
        # Experience (as structured blank nodes) - schema variants:
        # 1. profile_data.experience / profile_data.career_history (standard)
        # 2. professional_experience (alternative schema)
        experience = (
            profile_data.get('experience') or
            profile_data.get('career_history') or
            data.get('professional_experience') or
            []
        )
        for exp in experience:
            if not isinstance(exp, dict):
                continue
            exp_node = BNode()
            self.graph.add((exp_node, RDF.type, SCHEMA.WorkExperience))
            if exp.get('title'):
                self.graph.add((exp_node, SCHEMA.roleName, Literal(exp['title'])))
            # Handle both 'company' (Schema A) and 'organization' (Schema B)
            company = exp.get('company') or exp.get('organization')
            if company:
                self.graph.add((exp_node, SCHEMA.name, Literal(company)))
            # date_range/duration fields; else construct from start/end dates
            duration = exp.get('date_range') or exp.get('duration')
            if not duration and exp.get('start_date'):
                end_date = exp.get('end_date', 'Present')
                duration = f"{exp['start_date']} - {end_date}"
            if duration:
                self.graph.add((exp_node, SCHEMA.temporal, Literal(duration)))
            # Location may itself be a dict; normalize before storing
            exp_location = exp.get('location')
            if exp_location:
                exp_location_str = normalize_location(exp_location) if isinstance(exp_location, dict) else str(exp_location)
                self.graph.add((exp_node, SCHEMA.workLocation, Literal(exp_location_str)))
            if exp.get('description'):
                self.graph.add((exp_node, SCHEMA.description, Literal(exp['description'])))
            self.graph.add((person_uri, SCHEMA.hasOccupation, exp_node))
        # Education (handle null and both schema variants)
        # Schema A: profile_data.education, Schema B: root education
        for edu in (profile_data.get('education') or data.get('education') or []):
            if not isinstance(edu, dict):
                continue
            edu_node = BNode()
            self.graph.add((edu_node, RDF.type, SCHEMA.EducationalOccupationalCredential))
            if edu.get('degree'):
                self.graph.add((edu_node, SCHEMA.name, Literal(edu['degree'])))
            if edu.get('institution'):
                self.graph.add((edu_node, SCHEMA.educationalCredentialAwarded, Literal(edu['institution'])))
            # Handle field_of_study (variant B) or field (variant A)
            field = edu.get('field_of_study') or edu.get('field')
            if field:
                self.graph.add((edu_node, SCHEMA.educationalProgramName, Literal(field)))
            # Handle date_range, years, or start_year/end_year
            date_range = edu.get('date_range') or edu.get('years')
            if not date_range and edu.get('start_year'):
                end_year = edu.get('end_year', '')
                date_range = f"{edu['start_year']} - {end_year}" if end_year else str(edu['start_year'])
            if date_range:
                self.graph.add((edu_node, SCHEMA.temporal, Literal(date_range)))
            self.graph.add((person_uri, SCHEMA.alumniOf, edu_node))
        # Provenance
        extraction_date = extraction_meta.get('extraction_date')
        if extraction_date:
            self.graph.add((person_uri, PROV.generatedAtTime,
                            Literal(extraction_date, datatype=XSD.dateTime)))
        extraction_method = extraction_meta.get('extraction_method')
        if extraction_method:
            self.graph.add((person_uri, PROV.wasGeneratedBy, Literal(extraction_method)))
        staff_id = extraction_meta.get('staff_id')
        if staff_id:
            self.graph.add((person_uri, DCTERMS.identifier, Literal(staff_id)))
            self.graph.add((person_uri, HC.staffId, Literal(staff_id)))
        self._persons_added += 1
        return person_uri

    def add_persons_from_staff_list(self, data: dict, filepath: Path) -> list[URIRef]:
        """Convert a staff-list JSON document to RDF triples.

        Returns the list of person URIs added.  An empty list is a valid
        result (a staff file with zero usable entries is not an error).
        """
        uris = []
        custodian_meta = data.get('custodian_metadata', {})
        custodian_name = custodian_meta.get('custodian_name', '')
        custodian_slug = custodian_meta.get('custodian_slug', '')
        # Resolve the custodian URI once for all staff in this file
        custodian_uri = self._resolve_custodian_uri(custodian_name) or self._resolve_custodian_uri(custodian_slug)
        for person in data.get('staff', []):
            if not isinstance(person, dict):
                continue
            name = person.get('name')
            if not name:
                continue
            # Person URI from linkedin_slug, staff_id, or name + custodian
            linkedin_slug = person.get('linkedin_slug')
            staff_id = person.get('staff_id')
            if linkedin_slug:
                person_uri = HP[sanitize_for_uri(linkedin_slug)]
            elif staff_id:
                person_uri = HP[sanitize_for_uri(staff_id)]
            else:
                person_uri = HP[sanitize_for_uri(f"{custodian_slug}_{name}")]
            # Type declarations
            self.graph.add((person_uri, RDF.type, SCHEMA.Person))
            self.graph.add((person_uri, RDF.type, FOAF.Person))
            self.graph.add((person_uri, RDF.type, PICO.PersonObservation))
            # Name
            self.graph.add((person_uri, SCHEMA.name, Literal(name)))
            self.graph.add((person_uri, FOAF.name, Literal(name)))
            self.graph.add((person_uri, RDFS.label, Literal(name)))
            # LinkedIn URL
            linkedin_url = person.get('linkedin_profile_url')
            if linkedin_url:
                try:
                    self.graph.add((person_uri, SCHEMA.sameAs, URIRef(linkedin_url)))
                except Exception:
                    pass  # malformed URL: skip the sameAs link
            # Headline
            headline = person.get('headline')
            if headline:
                self.graph.add((person_uri, SCHEMA.jobTitle, Literal(headline)))
            # Heritage relevance (defaults to True when absent)
            heritage_relevant = person.get('heritage_relevant', True)
            self.graph.add((person_uri, HC.heritageRelevant,
                            Literal(heritage_relevant, datatype=XSD.boolean)))
            heritage_type = person.get('heritage_type')
            if heritage_type:
                self.graph.add((person_uri, HC.heritageType, Literal(heritage_type)))
            # Link to custodian
            if custodian_uri:
                self.graph.add((person_uri, SCHEMA.worksFor, custodian_uri))
                self.graph.add((person_uri, ORG.memberOf, custodian_uri))
            if custodian_name:
                self.graph.add((person_uri, HC.custodianName, Literal(custodian_name)))
            # Staff ID
            if staff_id:
                self.graph.add((person_uri, HC.staffId, Literal(staff_id)))
                self.graph.add((person_uri, DCTERMS.identifier, Literal(staff_id)))
            uris.append(person_uri)
            self._persons_added += 1
        return uris

    def serialize(self, format: str = "turtle") -> bytes:
        """Serialize the current graph to bytes in the given RDF format."""
        return self.graph.serialize(format=format).encode("utf-8")

    def triple_count(self) -> int:
        """Number of triples currently in the graph."""
        return len(self.graph)

    def persons_added(self) -> int:
        """Number of persons added since the last reset()."""
        return self._persons_added
# On-disk cache for the custodian lookup table, refreshed daily.
CACHE_FILE = PROJECT_ROOT / "data" / ".custodian_lookup_cache.json"
CACHE_MAX_AGE_HOURS = 24


def build_custodian_lookup(custodian_dir: Path = DEFAULT_CUSTODIAN_DIR, use_cache: bool = True) -> dict[str, str]:
    """
    Build a custodian name/slug -> GHCID lookup table from YAML filenames.

    Custodian YAML files are named by GHCID (e.g. NL-NH-AMS-M-RM.yaml or
    NL-NH-AMS-M-RM-rijksmuseum.yaml), so the identifier can be read from
    the filename directly — far faster than parsing 27k YAML files.
    A JSON cache under data/ avoids re-scanning within 24 hours.
    """
    mapping: dict[str, str] = {}
    # Serve from the on-disk cache when it is fresh enough.
    if use_cache and CACHE_FILE.exists():
        try:
            age_hours = (datetime.now().timestamp() - CACHE_FILE.stat().st_mtime) / 3600
            if age_hours < CACHE_MAX_AGE_HOURS:
                with open(CACHE_FILE, 'r') as fh:
                    return json.load(fh).get('lookup', {})
        except Exception:
            pass  # cache is best-effort; fall through to a full rescan
    if not custodian_dir.exists():
        return mapping
    yaml_files = list(custodian_dir.glob("*.yaml"))
    for path in yaml_files:
        stem = path.stem  # e.g. "NL-NH-AMS-M-RM" or "NL-NH-AMS-M-RM-rijksmuseum"
        pieces = stem.split('-')
        if len(pieces) < 5:
            continue  # not a standard GHCID-shaped filename
        # Standard GHCID shape: CC-RR-CCC-T-ABB (the first five pieces).
        base_ghcid = '-'.join(pieces[:5])
        # Index by the full stem and by the base GHCID.
        mapping[stem.lower()] = stem
        mapping[base_ghcid.lower()] = stem
        if len(pieces) > 5:
            # Trailing pieces form a human-readable name suffix; index it
            # both as-is and with underscores turned into spaces.
            suffix = '-'.join(pieces[5:])
            mapping[suffix.lower()] = stem
            mapping[suffix.replace('_', ' ').lower()] = stem
    # Persist the freshly-built table (best effort).
    if use_cache:
        try:
            CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
            with open(CACHE_FILE, 'w') as fh:
                json.dump({
                    'lookup': mapping,
                    'generated_at': datetime.now(timezone.utc).isoformat(),
                    'file_count': len(yaml_files)
                }, fh)
        except Exception:
            pass  # a cache write failure must not break the sync
    return mapping
class OxigraphPersonSyncer(BaseSyncer):
    """Sync person JSON files to Oxigraph triplestore.

    Converts entity-profile and staff-list JSON files to RDF via
    PersonRDFConverter and uploads them batch-wise over Oxigraph's
    HTTP Graph Store endpoint (/store, default graph).
    """

    database_name = "oxigraph_persons"

    def __init__(
        self,
        endpoint: str = OXIGRAPH_URL,
        batch_size: int = BATCH_SIZE,
        entity_dir: Path = PERSON_ENTITY_DIR,
        affiliated_dir: Path = PERSON_AFFILIATED_DIR,
        **kwargs
    ):
        # kwargs are forwarded to BaseSyncer (which presumably sets up
        # self.logger and self.progress — TODO confirm against scripts.sync).
        super().__init__(**kwargs)
        self.endpoint = endpoint.rstrip("/")
        self.batch_size = batch_size
        self.entity_dir = entity_dir
        self.affiliated_dir = affiliated_dir
        # Generous timeout: a batch upload of ~500 files can be large.
        self.client = httpx.Client(timeout=120.0)
        # Build custodian lookup once; shared with the converter for
        # resolving employer links.
        self.logger.info("Building custodian lookup table...")
        self.custodian_lookup = build_custodian_lookup()
        self.logger.info(f"Loaded {len(self.custodian_lookup)} custodian name mappings")
        self.converter = PersonRDFConverter(custodian_lookup=self.custodian_lookup)

    def check_connection(self) -> bool:
        """Check if Oxigraph is available via a trivial ASK query."""
        try:
            resp = self.client.post(
                f"{self.endpoint}/query",
                headers={"Content-Type": "application/sparql-query"},
                content="ASK { ?s ?p ?o }"
            )
            return resp.status_code in (200, 204)
        except Exception as e:
            self.logger.error(f"Oxigraph connection failed: {e}")
            return False

    def get_status(self) -> dict:
        """Get Oxigraph status for person data.

        Returns a dict with 'status' and, when reachable, 'person_count'
        (distinct schema:Person subjects in the hp: namespace).
        """
        try:
            # Count person triples specifically (restricted to the hp: prefix
            # so unrelated schema:Person resources are excluded).
            resp = self.client.post(
                f"{self.endpoint}/query",
                headers={
                    "Content-Type": "application/sparql-query",
                    "Accept": "application/sparql-results+json"
                },
                content="""
                    PREFIX schema: <http://schema.org/>
                    PREFIX hp: <https://nde.nl/ontology/hc/person/>
                    SELECT (COUNT(DISTINCT ?person) AS ?count)
                    WHERE {
                        ?person a schema:Person .
                        FILTER(STRSTARTS(STR(?person), STR(hp:)))
                    }
                """
            )
            if resp.status_code == 200:
                data = resp.json()
                count = int(data["results"]["bindings"][0]["count"]["value"])
                return {"status": "healthy", "person_count": count}
        except Exception as e:
            return {"status": "unavailable", "error": str(e)}
        # Reachable but non-200 response falls through to "unknown".
        return {"status": "unknown"}

    def _upload_turtle(self, ttl_data: bytes) -> bool:
        """Upload Turtle data to Oxigraph's default graph (Graph Store POST)."""
        try:
            resp = self.client.post(
                f"{self.endpoint}/store?default",
                headers={"Content-Type": "text/turtle"},
                content=ttl_data
            )
            return resp.status_code in (200, 201, 204)
        except Exception as e:
            self.logger.error(f"Failed to upload data: {e}")
            return False

    def _list_entity_files(self) -> list[Path]:
        """List all person entity JSON files (sorted for stable ordering)."""
        if not self.entity_dir.exists():
            self.logger.warning(f"Entity directory not found: {self.entity_dir}")
            return []
        return sorted(self.entity_dir.glob("*.json"))

    def _list_staff_files(self) -> list[Path]:
        """List all staff list JSON files (sorted for stable ordering)."""
        if not self.affiliated_dir.exists():
            self.logger.warning(f"Affiliated directory not found: {self.affiliated_dir}")
            return []
        return sorted(self.affiliated_dir.glob("*_staff_*.json"))

    def sync(
        self,
        limit: Optional[int] = None,
        dry_run: bool = False,
        clear: bool = False,
        entity_only: bool = False,
        staff_only: bool = False,
    ) -> SyncResult:
        """Sync all person JSON files to Oxigraph.

        Args:
            limit: cap on the total number of files processed.
            dry_run: parse and count triples but do not upload.
            clear: accepted for interface compatibility; clearing is not
                implemented here (see note below).
            entity_only / staff_only: restrict which file type is processed.

        Returns:
            A SyncResult with per-file counts and triple/person totals in
            ``details``.
        """
        result = SyncResult(
            database="oxigraph_persons",
            status=SyncStatus.IN_PROGRESS,
            start_time=datetime.now(timezone.utc),
        )
        # Check connection (skipped for dry runs, which never upload)
        if not dry_run and not self.check_connection():
            result.status = SyncStatus.FAILED
            result.error_message = f"Cannot connect to Oxigraph at {self.endpoint}"
            result.end_time = datetime.now(timezone.utc)
            return result
        # Gather files to process, tagged with their type for dispatch below.
        files_to_process = []
        if not staff_only:
            entity_files = self._list_entity_files()
            files_to_process.extend([('entity', f) for f in entity_files])
            self.logger.info(f"Found {len(entity_files)} entity files")
        if not entity_only:
            staff_files = self._list_staff_files()
            files_to_process.extend([('staff', f) for f in staff_files])
            self.logger.info(f"Found {len(staff_files)} staff files")
        if limit:
            files_to_process = files_to_process[:limit]
        self.progress.total_files = len(files_to_process)
        self.progress.current_database = "oxigraph_persons"
        self.logger.info(f"Processing {len(files_to_process)} files...")
        # Clear existing person data (optional) — currently a no-op that
        # only logs a hint; --clear does not delete anything.
        if not dry_run and clear:
            self.logger.info("Note: Use SPARQL DELETE to clear person data if needed")
        # Process in batches: the converter graph is reset per batch and the
        # whole batch is serialized and uploaded as one Turtle document.
        total_triples = 0
        total_persons = 0
        processed = 0
        failed = 0
        for i in range(0, len(files_to_process), self.batch_size):
            batch = files_to_process[i:i + self.batch_size]
            batch_num = i // self.batch_size + 1
            self.converter.reset()
            for file_type, filepath in batch:
                self.progress.processed_files = processed + 1
                self.progress.current_file = filepath.name
                self._report_progress()
                try:
                    with open(filepath, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                    if not data:
                        failed += 1
                        continue
                    if file_type == 'entity':
                        uri = self.converter.add_person_from_entity(data, filepath)
                        if uri:
                            processed += 1
                        else:
                            # No extractable name — counted as failed.
                            self.logger.debug(f"No person extracted from entity: {filepath.name}")
                            failed += 1
                    else:  # staff
                        uris = self.converter.add_persons_from_staff_list(data, filepath)
                        # Empty staff list is not a failure - it's just a file with no persons
                        # (e.g., LinkedIn company page with 0 visible employees)
                        if uris is not None:  # Explicitly check for None (error) vs [] (empty)
                            processed += 1
                        else:
                            self.logger.debug(f"Failed to process staff file: {filepath.name}")
                            failed += 1
                except json.JSONDecodeError as e:
                    self.logger.warning(f"JSON decode error in {filepath.name}: {e}")
                    failed += 1
                    self.progress.errors.append(f"{filepath.name}: JSON decode error")
                except Exception as e:
                    self.logger.warning(f"Error processing {filepath.name}: {e}")
                    failed += 1
                    self.progress.errors.append(f"{filepath.name}: {str(e)}")
            batch_triples = self.converter.triple_count()
            batch_persons = self.converter.persons_added()
            total_triples += batch_triples
            total_persons += batch_persons
            if not dry_run:
                try:
                    ttl_data = self.converter.serialize("turtle")
                    if self._upload_turtle(ttl_data):
                        self.logger.info(
                            f"Batch {batch_num}: loaded {len(batch)} files, "
                            f"{batch_persons} persons, {batch_triples:,} triples"
                        )
                    else:
                        # Upload failure fails the whole batch (files already
                        # counted as processed are also added to failed here).
                        self.logger.error(f"Batch {batch_num}: failed to upload")
                        failed += len(batch)
                except Exception as e:
                    self.logger.error(f"Batch {batch_num}: serialization failed: {e}")
                    failed += len(batch)
            else:
                self.logger.info(
                    f"Batch {batch_num}: parsed {len(batch)} files, "
                    f"{batch_persons} persons, {batch_triples:,} triples (dry-run)"
                )
        result.records_processed = len(files_to_process)
        result.records_succeeded = processed
        result.records_failed = failed
        result.details["total_triples"] = total_triples
        result.details["total_persons"] = total_persons
        result.details["batch_size"] = self.batch_size
        result.details["dry_run"] = dry_run
        result.details["custodian_mappings"] = len(self.custodian_lookup)
        # Overall status: SUCCESS only with zero failures; PARTIAL when at
        # least one file succeeded; FAILED otherwise.
        if failed == 0:
            result.status = SyncStatus.SUCCESS
        elif processed > 0:
            result.status = SyncStatus.PARTIAL
        else:
            result.status = SyncStatus.FAILED
        result.end_time = datetime.now(timezone.utc)
        return result
def main():
    """Command-line entry point: parse args, run the sync, print a summary."""
    parser = argparse.ArgumentParser(description="Sync person JSON files to Oxigraph")
    parser.add_argument("--dry-run", action="store_true", help="Parse files but don't upload")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--endpoint", default=OXIGRAPH_URL, help="Oxigraph SPARQL endpoint URL")
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE, help="Files per batch")
    parser.add_argument("--entity-only", action="store_true", help="Only process entity files")
    parser.add_argument("--staff-only", action="store_true", help="Only process staff files")
    parser.add_argument("--clear", action="store_true", help="Clear existing person data first")
    args = parser.parse_args()

    import logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    syncer = OxigraphPersonSyncer(endpoint=args.endpoint, batch_size=args.batch_size)

    banner = "=" * 60
    print(banner)
    print("Oxigraph Person Sync")
    print(banner)

    # A dry run never contacts the server, so skip the status probe.
    if not args.dry_run:
        print(f"Checking connection to {args.endpoint}...")
        status = syncer.get_status()
        print(f" Status: {status.get('status', 'unknown')}")
        if status.get('person_count'):
            print(f" Current persons: {status['person_count']:,}")
    print(f" Custodian mappings: {len(syncer.custodian_lookup):,}")

    result = syncer.sync(
        limit=args.limit,
        dry_run=args.dry_run,
        clear=args.clear,
        entity_only=args.entity_only,
        staff_only=args.staff_only,
    )

    # Final summary block
    print("\n" + banner)
    print(f"Sync Result: {result.status.value.upper()}")
    print(f" Files processed: {result.records_processed}")
    print(f" Files succeeded: {result.records_succeeded}")
    print(f" Files failed: {result.records_failed}")
    print(f" Persons synced: {result.details.get('total_persons', 0):,}")
    print(f" Triples: {result.details.get('total_triples', 0):,}")
    print(f" Duration: {result.duration_seconds:.2f}s")
    if result.error_message:
        print(f" Error: {result.error_message}")
    print(banner)


if __name__ == "__main__":
    main()