810 lines
31 KiB
Python
810 lines
31 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Oxigraph Person Sync Module - Sync person JSON files to Oxigraph triplestore.
|
|
|
|
This module syncs all person data (entity profiles and staff lists) to Oxigraph
|
|
as RDF triples for SPARQL querying and graph-based retrieval.
|
|
|
|
Usage:
|
|
python -m scripts.sync.oxigraph_person_sync [--dry-run] [--limit N] [--endpoint URL]
|
|
|
|
Data Sources:
|
|
- data/custodian/person/entity/*.json - Individual person profile files
|
|
- data/custodian/person/affiliated/parsed/*_staff_*.json - Staff list files
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
import httpx
|
|
import yaml
|
|
from rdflib import Graph, Namespace, Literal, URIRef, BNode
|
|
from rdflib.namespace import RDF, RDFS, XSD, SKOS, DCTERMS, FOAF, OWL
|
|
|
|
# Add project root to path so `scripts.*` imports resolve when this module
# is executed directly (python -m scripts.sync.oxigraph_person_sync).
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from scripts.sync import BaseSyncer, SyncResult, SyncStatus, DEFAULT_CUSTODIAN_DIR

# Configuration
# Oxigraph SPARQL endpoint; override with the OXIGRAPH_URL environment variable.
OXIGRAPH_URL = os.getenv("OXIGRAPH_URL", "http://localhost:7878")
# Number of JSON files converted per upload batch (see OxigraphPersonSyncer.sync).
BATCH_SIZE = 500

# Data directories
# Individual person profile files (one JSON per person).
PERSON_ENTITY_DIR = PROJECT_ROOT / "data" / "custodian" / "person" / "entity"
# Parsed staff-list files (one JSON per custodian, many persons each).
PERSON_AFFILIATED_DIR = PROJECT_ROOT / "data" / "custodian" / "person" / "affiliated" / "parsed"

# Namespaces (reuse from oxigraph_sync.py + person-specific)
HC = Namespace("https://w3id.org/heritage/custodian/")   # project custodian vocabulary
HP = Namespace("https://w3id.org/heritage/person/")      # person URIs minted by this module
GHCID = Namespace("https://w3id.org/heritage/ghcid/")    # custodian identifier URIs
NDE = Namespace("https://nde.nl/ontology/hc/class/")
ORG = Namespace("http://www.w3.org/ns/org#")
RICO = Namespace("https://www.ica.org/standards/RiC/ontology#")
SCHEMA = Namespace("http://schema.org/")
CIDOC = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
PROV = Namespace("http://www.w3.org/ns/prov#")
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")
GEO = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
PICO = Namespace("https://personsincontext.org/model#")
PNV = Namespace("https://w3id.org/pnv#")
|
|
|
def sanitize_for_uri(value: str) -> str:
    """Return *value* lower-cased and safe for use as a URI local name.

    Characters that are unsafe in a URI path segment (quotes, brackets,
    punctuation, whitespace) are collapsed into single underscores.  An
    empty input, or one that strips down to nothing, yields "unknown".
    """
    if not value:
        return "unknown"
    cleaned = re.sub(r'[`\'\"<>{}|\\^~\[\]@#$%&*()+=,;!? ]', '_', value)
    cleaned = re.sub(r'_+', '_', cleaned).strip('_')
    return cleaned.lower() if cleaned else "unknown"
|
|
|
|
|
|
def extract_country_code(location: str) -> Optional[str]:
    """Extract a two-letter country code from a location string.

    Handles strings like 'Amsterdam, North Holland, Netherlands (NL)':
    an explicit '(XX)' code wins; otherwise known country names are
    matched against the text.

    Fix: country names are now matched on word boundaries instead of raw
    substring containment.  The old code returned 'GB' for 'Kyiv, Ukraine'
    (because 'uk' is a substring of 'ukraine') and 'US' for 'Jerusalem'
    (contains 'usa').

    Args:
        location: free-form location string; may be empty or None-ish.

    Returns:
        ISO 3166-1 alpha-2 code, or None when nothing can be extracted.
    """
    if not location:
        return None

    # Try to find an explicit country code in parentheses first.
    match = re.search(r'\(([A-Z]{2})\)', location)
    if match:
        return match.group(1)

    # Common country name -> code mappings (checked in insertion order).
    country_map = {
        'netherlands': 'NL', 'nederland': 'NL',
        'united states': 'US', 'usa': 'US',
        'united kingdom': 'GB', 'uk': 'GB', 'england': 'GB',
        'germany': 'DE', 'deutschland': 'DE',
        'france': 'FR',
        'belgium': 'BE', 'belgië': 'BE',
        'spain': 'ES', 'españa': 'ES',
        'italy': 'IT', 'italia': 'IT',
        'australia': 'AU',
        'canada': 'CA',
        'japan': 'JP',
        'china': 'CN',
    }

    lower_loc = location.lower()
    for country, code in country_map.items():
        # \b anchors prevent false hits such as 'uk' inside 'ukraine'.
        if re.search(r'\b' + re.escape(country) + r'\b', lower_loc):
            return code

    return None
|
|
|
|
|
|
class PersonRDFConverter:
    """Convert person JSON data to RDF triples.

    Accumulates triples into an in-memory rdflib ``Graph``.  Callers are
    expected to call ``reset()`` between upload batches.  Two input shapes
    are handled: individual entity profile files
    (``add_person_from_entity``) and custodian staff-list files
    (``add_persons_from_staff_list``).
    """

    def __init__(self, custodian_lookup: dict[str, str] | None = None):
        """Create an empty graph with all namespaces bound.

        Args:
            custodian_lookup: mapping of lower-cased custodian names/slugs
                to GHCID filename stems (see ``build_custodian_lookup``).
                When omitted, custodian URIs are never resolved and only
                the literal custodian name gets recorded.
        """
        self.graph = Graph()
        self._bind_namespaces()
        self.custodian_lookup = custodian_lookup or {}
        # Running count of persons added since the last reset().
        self._persons_added = 0

    def _bind_namespaces(self) -> None:
        """Bind all ontology namespaces to the graph (prefix -> namespace)."""
        self.graph.bind("hc", HC)
        self.graph.bind("hp", HP)
        self.graph.bind("ghcid", GHCID)
        self.graph.bind("nde", NDE)
        self.graph.bind("org", ORG)
        self.graph.bind("rico", RICO)
        self.graph.bind("schema", SCHEMA)
        self.graph.bind("cidoc", CIDOC)
        self.graph.bind("prov", PROV)
        self.graph.bind("wd", WD)
        self.graph.bind("wdt", WDT)
        self.graph.bind("geo", GEO)
        self.graph.bind("dcterms", DCTERMS)
        self.graph.bind("foaf", FOAF)
        self.graph.bind("skos", SKOS)
        self.graph.bind("owl", OWL)
        self.graph.bind("pico", PICO)
        self.graph.bind("pnv", PNV)

    def reset(self) -> None:
        """Discard all accumulated triples and start a fresh batch."""
        self.graph = Graph()
        self._bind_namespaces()
        self._persons_added = 0

    def _get_person_uri(self, data: dict, filepath: Path) -> URIRef:
        """Generate a URI for a person based on LinkedIn slug or staff_id.

        Preference order: LinkedIn profile slug from the URL, then
        ``staff_id`` from the extraction metadata, then the JSON filename
        stem as a last resort.  Always returns a URI in the HP namespace.
        """
        extraction_meta = data.get('extraction_metadata', {})
        profile_data = data.get('profile_data', {})

        # Try LinkedIn URL to get slug
        linkedin_url = (
            extraction_meta.get('linkedin_url') or
            profile_data.get('linkedin_url') or
            ''
        )

        if linkedin_url:
            # Extract slug from URL like https://www.linkedin.com/in/alex-brandsen
            match = re.search(r'linkedin\.com/in/([^/?]+)', linkedin_url)
            if match:
                slug = sanitize_for_uri(match.group(1))
                return HP[slug]

        # Fallback to staff_id
        staff_id = extraction_meta.get('staff_id', '')
        if staff_id:
            return HP[sanitize_for_uri(staff_id)]

        # Last resort: use filename
        return HP[sanitize_for_uri(filepath.stem)]

    def _resolve_custodian_uri(self, custodian_name: str) -> URIRef | None:
        """Resolve custodian name to GHCID URI.

        Tries exact lower-cased match, then the space->hyphen slug form,
        then bidirectional substring containment.

        NOTE(review): the partial-containment fallback can mis-resolve
        similarly named custodians (e.g. a name that is a substring of an
        unrelated key) — confirm this is acceptable for the data set.
        """
        if not custodian_name:
            return None

        # Try exact match first
        name_lower = custodian_name.lower()
        if name_lower in self.custodian_lookup:
            ghcid = self.custodian_lookup[name_lower]
            return GHCID[sanitize_for_uri(ghcid)]

        # Try slug form
        slug = name_lower.replace(' ', '-')
        if slug in self.custodian_lookup:
            ghcid = self.custodian_lookup[slug]
            return GHCID[sanitize_for_uri(ghcid)]

        # Try partial match
        for key, ghcid in self.custodian_lookup.items():
            if name_lower in key or key in name_lower:
                return GHCID[sanitize_for_uri(ghcid)]

        return None

    def add_person_from_entity(self, data: dict, filepath: Path) -> URIRef | None:
        """Convert person entity JSON to RDF triples.

        Args:
            data: parsed entity JSON (expects keys 'extraction_metadata',
                'source_staff_info', 'profile_data', 'heritage_relevance';
                all optional, defaulting to empty dicts).
            filepath: source file, used only as a URI fallback.

        Returns:
            The person's URI, or None when the record has no name (in
            which case nothing is added to the graph).
        """
        extraction_meta = data.get('extraction_metadata', {})
        source_staff = data.get('source_staff_info', {})
        profile_data = data.get('profile_data', {})
        heritage_rel = data.get('heritage_relevance', {})

        # Get name (required)
        name = profile_data.get('name') or source_staff.get('name')
        if not name:
            return None

        # Create person URI
        person_uri = self._get_person_uri(data, filepath)

        # Type declarations (multiple ontologies for cross-vocabulary querying)
        self.graph.add((person_uri, RDF.type, SCHEMA.Person))
        self.graph.add((person_uri, RDF.type, FOAF.Person))
        self.graph.add((person_uri, RDF.type, PICO.PersonObservation))
        self.graph.add((person_uri, RDF.type, CIDOC.E21_Person))

        # Name (mirrored across vocabularies)
        self.graph.add((person_uri, SCHEMA.name, Literal(name)))
        self.graph.add((person_uri, FOAF.name, Literal(name)))
        self.graph.add((person_uri, RDFS.label, Literal(name)))
        self.graph.add((person_uri, SKOS.prefLabel, Literal(name)))

        # LinkedIn URL as sameAs
        linkedin_url = (
            extraction_meta.get('linkedin_url') or
            profile_data.get('linkedin_url')
        )
        if linkedin_url:
            try:
                self.graph.add((person_uri, SCHEMA.sameAs, URIRef(linkedin_url)))
                self.graph.add((person_uri, OWL.sameAs, URIRef(linkedin_url)))
            except Exception:
                # Malformed URL: skip the sameAs links rather than abort.
                pass

        # Headline / job title
        headline = profile_data.get('headline') or source_staff.get('headline')
        if headline:
            self.graph.add((person_uri, SCHEMA.jobTitle, Literal(headline)))
            self.graph.add((person_uri, SCHEMA.description, Literal(headline)))

        # Location
        location = profile_data.get('location', '')
        if location:
            self.graph.add((person_uri, SCHEMA.workLocation, Literal(location)))

            country_code = extract_country_code(location)
            if country_code:
                self.graph.add((person_uri, HC.countryCode, Literal(country_code)))

        # About / description
        about = profile_data.get('about')
        if about:
            self.graph.add((person_uri, SCHEMA.disambiguatingDescription, Literal(about)))

        # Profile image
        profile_image = profile_data.get('profile_image_url')
        if profile_image:
            try:
                self.graph.add((person_uri, SCHEMA.image, URIRef(profile_image)))
                self.graph.add((person_uri, FOAF.img, URIRef(profile_image)))
            except Exception:
                # Malformed image URL: skip rather than abort.
                pass

        # Heritage relevance — absent flag is treated as relevant (default True).
        is_heritage_relevant = heritage_rel.get('is_heritage_relevant', True)
        self.graph.add((person_uri, HC.heritageRelevant,
                        Literal(is_heritage_relevant, datatype=XSD.boolean)))

        # Prefer explicit heritage_types list; fall back to the single
        # heritage_type carried on the source staff record.
        heritage_types = heritage_rel.get('heritage_types', [])
        if not heritage_types and source_staff.get('heritage_type'):
            heritage_types = [source_staff.get('heritage_type')]

        for ht in heritage_types:
            if ht:
                self.graph.add((person_uri, HC.heritageType, Literal(ht)))

        # Link to custodian (employer)
        custodian_name = source_staff.get('custodian')
        if custodian_name:
            custodian_uri = self._resolve_custodian_uri(custodian_name)
            if custodian_uri:
                self.graph.add((person_uri, SCHEMA.worksFor, custodian_uri))
                self.graph.add((person_uri, ORG.memberOf, custodian_uri))

            # Also store as literal for cases where lookup fails
            self.graph.add((person_uri, HC.custodianName, Literal(custodian_name)))

        # Skills
        for skill in profile_data.get('skills', []):
            if skill:
                self.graph.add((person_uri, SCHEMA.knowsAbout, Literal(skill)))

        # Languages — entries may be dicts ({'language': ...}) or bare strings.
        for lang in profile_data.get('languages', []):
            if isinstance(lang, dict):
                lang_name = lang.get('language', '')
            else:
                lang_name = str(lang)
            if lang_name:
                self.graph.add((person_uri, SCHEMA.knowsLanguage, Literal(lang_name)))

        # Experience (as structured data, one blank node per entry)
        experience = profile_data.get('experience', [])
        if experience:
            for i, exp in enumerate(experience):
                if not isinstance(exp, dict):
                    continue

                exp_node = BNode()
                self.graph.add((exp_node, RDF.type, SCHEMA.WorkExperience))

                if exp.get('title'):
                    self.graph.add((exp_node, SCHEMA.roleName, Literal(exp['title'])))
                if exp.get('company'):
                    self.graph.add((exp_node, SCHEMA.name, Literal(exp['company'])))
                if exp.get('date_range'):
                    self.graph.add((exp_node, SCHEMA.temporal, Literal(exp['date_range'])))
                if exp.get('location'):
                    self.graph.add((exp_node, SCHEMA.workLocation, Literal(exp['location'])))
                if exp.get('description'):
                    self.graph.add((exp_node, SCHEMA.description, Literal(exp['description'])))

                self.graph.add((person_uri, SCHEMA.hasOccupation, exp_node))

        # Education (one blank node per entry)
        education = profile_data.get('education', [])
        if education:
            for edu in education:
                if not isinstance(edu, dict):
                    continue

                edu_node = BNode()
                self.graph.add((edu_node, RDF.type, SCHEMA.EducationalOccupationalCredential))

                if edu.get('degree'):
                    self.graph.add((edu_node, SCHEMA.name, Literal(edu['degree'])))
                if edu.get('institution'):
                    # NOTE(review): the institution name is stored under
                    # schema:educationalCredentialAwarded, which schema.org
                    # defines for the credential itself — confirm this
                    # property choice is intentional.
                    self.graph.add((edu_node, SCHEMA.educationalCredentialAwarded, Literal(edu['institution'])))
                if edu.get('date_range'):
                    self.graph.add((edu_node, SCHEMA.temporal, Literal(edu['date_range'])))

                self.graph.add((person_uri, SCHEMA.alumniOf, edu_node))

        # Provenance
        extraction_date = extraction_meta.get('extraction_date')
        if extraction_date:
            # NOTE(review): the raw string is typed as xsd:dateTime without
            # validation — malformed dates become ill-typed literals.
            self.graph.add((person_uri, PROV.generatedAtTime,
                            Literal(extraction_date, datatype=XSD.dateTime)))

        extraction_method = extraction_meta.get('extraction_method')
        if extraction_method:
            self.graph.add((person_uri, PROV.wasGeneratedBy, Literal(extraction_method)))

        staff_id = extraction_meta.get('staff_id')
        if staff_id:
            self.graph.add((person_uri, DCTERMS.identifier, Literal(staff_id)))
            self.graph.add((person_uri, HC.staffId, Literal(staff_id)))

        self._persons_added += 1
        return person_uri

    def add_persons_from_staff_list(self, data: dict, filepath: Path) -> list[URIRef]:
        """Convert staff list JSON to RDF triples for each person.

        Args:
            data: parsed staff-list JSON (expects 'custodian_metadata' and
                a 'staff' list of person dicts).
            filepath: source file (currently unused beyond the signature).

        Returns:
            URIs of all persons added; entries without a name are skipped.
        """
        uris = []

        custodian_meta = data.get('custodian_metadata', {})
        custodian_name = custodian_meta.get('custodian_name', '')
        custodian_slug = custodian_meta.get('custodian_slug', '')

        # Resolve custodian URI once for all staff
        custodian_uri = self._resolve_custodian_uri(custodian_name) or self._resolve_custodian_uri(custodian_slug)

        staff = data.get('staff', [])
        for person in staff:
            if not isinstance(person, dict):
                continue

            name = person.get('name')
            if not name:
                continue

            # Create person URI from linkedin_slug or staff_id
            linkedin_slug = person.get('linkedin_slug')
            staff_id = person.get('staff_id')

            if linkedin_slug:
                person_uri = HP[sanitize_for_uri(linkedin_slug)]
            elif staff_id:
                person_uri = HP[sanitize_for_uri(staff_id)]
            else:
                # Generate from name + custodian
                slug = sanitize_for_uri(f"{custodian_slug}_{name}")
                person_uri = HP[slug]

            # Type declarations (no CIDOC typing here, unlike entity records)
            self.graph.add((person_uri, RDF.type, SCHEMA.Person))
            self.graph.add((person_uri, RDF.type, FOAF.Person))
            self.graph.add((person_uri, RDF.type, PICO.PersonObservation))

            # Name
            self.graph.add((person_uri, SCHEMA.name, Literal(name)))
            self.graph.add((person_uri, FOAF.name, Literal(name)))
            self.graph.add((person_uri, RDFS.label, Literal(name)))

            # LinkedIn URL
            linkedin_url = person.get('linkedin_profile_url')
            if linkedin_url:
                try:
                    self.graph.add((person_uri, SCHEMA.sameAs, URIRef(linkedin_url)))
                except Exception:
                    # Malformed URL: skip rather than abort.
                    pass

            # Headline
            headline = person.get('headline')
            if headline:
                self.graph.add((person_uri, SCHEMA.jobTitle, Literal(headline)))

            # Heritage relevance — absent flag defaults to True.
            heritage_relevant = person.get('heritage_relevant', True)
            self.graph.add((person_uri, HC.heritageRelevant,
                            Literal(heritage_relevant, datatype=XSD.boolean)))

            heritage_type = person.get('heritage_type')
            if heritage_type:
                self.graph.add((person_uri, HC.heritageType, Literal(heritage_type)))

            # Link to custodian
            if custodian_uri:
                self.graph.add((person_uri, SCHEMA.worksFor, custodian_uri))
                self.graph.add((person_uri, ORG.memberOf, custodian_uri))

            if custodian_name:
                self.graph.add((person_uri, HC.custodianName, Literal(custodian_name)))

            # Staff ID
            if staff_id:
                self.graph.add((person_uri, HC.staffId, Literal(staff_id)))
                self.graph.add((person_uri, DCTERMS.identifier, Literal(staff_id)))

            uris.append(person_uri)
            self._persons_added += 1

        return uris

    def serialize(self, format: str = "turtle") -> bytes:
        """Serialize graph to bytes.

        rdflib's Graph.serialize returns a str when no destination is
        given, hence the explicit UTF-8 encode.
        """
        return self.graph.serialize(format=format).encode("utf-8")

    def triple_count(self) -> int:
        """Get number of triples in graph."""
        return len(self.graph)

    def persons_added(self) -> int:
        """Get number of persons added since the last reset()."""
        return self._persons_added
|
|
|
|
|
|
# On-disk cache for the custodian lookup table and its freshness window.
CACHE_FILE = PROJECT_ROOT / "data" / ".custodian_lookup_cache.json"
CACHE_MAX_AGE_HOURS = 24


def build_custodian_lookup(custodian_dir: Path = DEFAULT_CUSTODIAN_DIR, use_cache: bool = True) -> dict[str, str]:
    """
    Build custodian lookup table using filename-based extraction.

    This is MUCH faster than YAML parsing (27k files in <1s vs 2+ minutes).
    Uses GHCID from filename directly - custodian YAMLs are named by GHCID.
    """
    # Serve from the on-disk cache when it is still fresh enough.
    if use_cache and CACHE_FILE.exists():
        try:
            age_hours = (datetime.now().timestamp() - CACHE_FILE.stat().st_mtime) / 3600
            if age_hours < CACHE_MAX_AGE_HOURS:
                with open(CACHE_FILE, 'r') as fh:
                    return json.load(fh).get('lookup', {})
        except Exception:
            # Unreadable/corrupt cache: fall through and rebuild.
            pass

    mapping: dict[str, str] = {}
    if not custodian_dir.exists():
        return mapping

    # Fast extraction: GHCID from filename.
    # Files are named like: NL-NH-AMS-M-RM.yaml or NL-NH-AMS-M-RM-rijksmuseum.yaml
    yaml_files = list(custodian_dir.glob("*.yaml"))

    for yaml_path in yaml_files:
        stem = yaml_path.stem  # e.g. "NL-NH-AMS-M-RM" or "NL-NH-AMS-M-RM-rijksmuseum"
        pieces = stem.split('-')
        # Skip anything that doesn't look like a standard CC-RR-CCC-T-ABB GHCID.
        if len(pieces) < 5:
            continue

        # Index by full stem and by the bare 5-part GHCID.
        mapping[stem.lower()] = stem
        mapping['-'.join(pieces[:5]).lower()] = stem

        # A trailing name suffix gets indexed too, in both its raw form
        # and with underscores turned into spaces.
        if len(pieces) > 5:
            suffix = '-'.join(pieces[5:])
            mapping[suffix.lower()] = stem
            mapping[suffix.replace('_', ' ').lower()] = stem

    # Persist the freshly built table for subsequent runs (best-effort).
    if use_cache:
        try:
            CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
            with open(CACHE_FILE, 'w') as fh:
                json.dump({
                    'lookup': mapping,
                    'generated_at': datetime.now(timezone.utc).isoformat(),
                    'file_count': len(yaml_files)
                }, fh)
        except Exception:
            pass

    return mapping
|
|
|
|
|
|
class OxigraphPersonSyncer(BaseSyncer):
    """Sync person JSON files to Oxigraph triplestore.

    Lists entity and staff-list JSON files, converts each batch to RDF via
    PersonRDFConverter, and POSTs the Turtle serialization to the Oxigraph
    HTTP store endpoint.
    """

    # Identifier used in SyncResult records and progress reporting.
    database_name = "oxigraph_persons"

    def __init__(
        self,
        endpoint: str = OXIGRAPH_URL,
        batch_size: int = BATCH_SIZE,
        entity_dir: Path = PERSON_ENTITY_DIR,
        affiliated_dir: Path = PERSON_AFFILIATED_DIR,
        **kwargs
    ):
        """Set up the HTTP client, custodian lookup, and RDF converter.

        Args:
            endpoint: Oxigraph base URL (trailing slash stripped).
            batch_size: number of JSON files converted per upload.
            entity_dir: directory of per-person entity JSON files.
            affiliated_dir: directory of parsed staff-list JSON files.
            **kwargs: forwarded to BaseSyncer.
        """
        super().__init__(**kwargs)
        self.endpoint = endpoint.rstrip("/")
        self.batch_size = batch_size
        self.entity_dir = entity_dir
        self.affiliated_dir = affiliated_dir
        # Generous timeout: a batch of ~500 files can serialize to a large body.
        self.client = httpx.Client(timeout=120.0)

        # Build custodian lookup (always uses module defaults / cache).
        self.logger.info("Building custodian lookup table...")
        self.custodian_lookup = build_custodian_lookup()
        self.logger.info(f"Loaded {len(self.custodian_lookup)} custodian name mappings")

        self.converter = PersonRDFConverter(custodian_lookup=self.custodian_lookup)

    def check_connection(self) -> bool:
        """Check if Oxigraph is available by issuing a trivial ASK query."""
        try:
            resp = self.client.post(
                f"{self.endpoint}/query",
                headers={"Content-Type": "application/sparql-query"},
                content="ASK { ?s ?p ?o }"
            )
            return resp.status_code in (200, 204)
        except Exception as e:
            self.logger.error(f"Oxigraph connection failed: {e}")
            return False

    def get_status(self) -> dict:
        """Get Oxigraph status for person data.

        Returns a dict with 'status' plus either 'person_count' (count of
        distinct schema:Person subjects in the HP namespace) or 'error'.
        """
        try:
            # Count person triples specifically
            resp = self.client.post(
                f"{self.endpoint}/query",
                headers={
                    "Content-Type": "application/sparql-query",
                    "Accept": "application/sparql-results+json"
                },
                content="""
                PREFIX schema: <http://schema.org/>
                PREFIX hp: <https://w3id.org/heritage/person/>
                SELECT (COUNT(DISTINCT ?person) AS ?count)
                WHERE {
                    ?person a schema:Person .
                    FILTER(STRSTARTS(STR(?person), STR(hp:)))
                }
                """
            )
            if resp.status_code == 200:
                data = resp.json()
                count = int(data["results"]["bindings"][0]["count"]["value"])
                return {"status": "healthy", "person_count": count}
        except Exception as e:
            return {"status": "unavailable", "error": str(e)}
        # Non-200 response without an exception falls through to here.
        return {"status": "unknown"}

    def _upload_turtle(self, ttl_data: bytes) -> bool:
        """Upload Turtle data to Oxigraph's default graph via the store API."""
        try:
            resp = self.client.post(
                f"{self.endpoint}/store?default",
                headers={"Content-Type": "text/turtle"},
                content=ttl_data
            )
            return resp.status_code in (200, 201, 204)
        except Exception as e:
            self.logger.error(f"Failed to upload data: {e}")
            return False

    def _list_entity_files(self) -> list[Path]:
        """List all person entity JSON files (sorted; [] if dir missing)."""
        if not self.entity_dir.exists():
            self.logger.warning(f"Entity directory not found: {self.entity_dir}")
            return []
        return sorted(self.entity_dir.glob("*.json"))

    def _list_staff_files(self) -> list[Path]:
        """List all staff list JSON files (sorted; [] if dir missing)."""
        if not self.affiliated_dir.exists():
            self.logger.warning(f"Affiliated directory not found: {self.affiliated_dir}")
            return []
        return sorted(self.affiliated_dir.glob("*_staff_*.json"))

    def sync(
        self,
        limit: Optional[int] = None,
        dry_run: bool = False,
        clear: bool = False,
        entity_only: bool = False,
        staff_only: bool = False,
    ) -> SyncResult:
        """Sync all person JSON files to Oxigraph.

        Args:
            limit: cap on the combined number of files processed.
            dry_run: parse and convert but skip the connection check and upload.
            clear: accepted but not implemented — only logs a hint.
            entity_only: process only entity files.
            staff_only: process only staff-list files.

        Returns:
            SyncResult with per-file counts and detail totals.
        """
        result = SyncResult(
            database="oxigraph_persons",
            status=SyncStatus.IN_PROGRESS,
            start_time=datetime.now(timezone.utc),
        )

        # Check connection
        if not dry_run and not self.check_connection():
            result.status = SyncStatus.FAILED
            result.error_message = f"Cannot connect to Oxigraph at {self.endpoint}"
            result.end_time = datetime.now(timezone.utc)
            return result

        # Gather files to process, tagged with their type for dispatch below.
        files_to_process = []

        if not staff_only:
            entity_files = self._list_entity_files()
            files_to_process.extend([('entity', f) for f in entity_files])
            self.logger.info(f"Found {len(entity_files)} entity files")

        if not entity_only:
            staff_files = self._list_staff_files()
            files_to_process.extend([('staff', f) for f in staff_files])
            self.logger.info(f"Found {len(staff_files)} staff files")

        if limit:
            files_to_process = files_to_process[:limit]

        self.progress.total_files = len(files_to_process)
        self.progress.current_database = "oxigraph_persons"

        self.logger.info(f"Processing {len(files_to_process)} files...")

        # Clear existing person data (optional)
        # NOTE(review): --clear is accepted but not implemented; it only
        # logs a hint. Confirm whether a SPARQL DELETE should run here.
        if not dry_run and clear:
            self.logger.info("Note: Use SPARQL DELETE to clear person data if needed")

        # Process in batches
        total_triples = 0
        total_persons = 0
        processed = 0   # files successfully converted
        failed = 0      # files that failed to parse/convert/upload

        for i in range(0, len(files_to_process), self.batch_size):
            batch = files_to_process[i:i + self.batch_size]
            batch_num = i // self.batch_size + 1

            # Fresh graph per batch; uploaded (or discarded) below.
            self.converter.reset()

            for file_type, filepath in batch:
                # NOTE(review): `processed` only counts successes, so this
                # progress figure undercounts attempts once failures occur.
                self.progress.processed_files = processed + 1
                self.progress.current_file = filepath.name
                self._report_progress()

                try:
                    with open(filepath, 'r', encoding='utf-8') as f:
                        data = json.load(f)

                    # Empty/null JSON document counts as a failure.
                    if not data:
                        failed += 1
                        continue

                    if file_type == 'entity':
                        uri = self.converter.add_person_from_entity(data, filepath)
                        if uri:
                            processed += 1
                        else:
                            failed += 1
                    else:  # staff
                        uris = self.converter.add_persons_from_staff_list(data, filepath)
                        if uris:
                            processed += 1
                        else:
                            failed += 1

                except json.JSONDecodeError as e:
                    self.logger.warning(f"JSON decode error in {filepath.name}: {e}")
                    failed += 1
                    self.progress.errors.append(f"{filepath.name}: JSON decode error")
                except Exception as e:
                    self.logger.warning(f"Error processing {filepath.name}: {e}")
                    failed += 1
                    self.progress.errors.append(f"{filepath.name}: {str(e)}")

            batch_triples = self.converter.triple_count()
            batch_persons = self.converter.persons_added()
            total_triples += batch_triples
            total_persons += batch_persons

            if not dry_run:
                try:
                    ttl_data = self.converter.serialize("turtle")
                    if self._upload_turtle(ttl_data):
                        self.logger.info(
                            f"Batch {batch_num}: loaded {len(batch)} files, "
                            f"{batch_persons} persons, {batch_triples:,} triples"
                        )
                    else:
                        self.logger.error(f"Batch {batch_num}: failed to upload")
                        # NOTE(review): files in a failed batch were already
                        # tallied in `processed`, so failed + succeeded can
                        # exceed the total file count — confirm intended.
                        failed += len(batch)
                except Exception as e:
                    self.logger.error(f"Batch {batch_num}: serialization failed: {e}")
                    failed += len(batch)
            else:
                self.logger.info(
                    f"Batch {batch_num}: parsed {len(batch)} files, "
                    f"{batch_persons} persons, {batch_triples:,} triples (dry-run)"
                )

        result.records_processed = len(files_to_process)
        result.records_succeeded = processed
        result.records_failed = failed
        result.details["total_triples"] = total_triples
        result.details["total_persons"] = total_persons
        result.details["batch_size"] = self.batch_size
        result.details["dry_run"] = dry_run
        result.details["custodian_mappings"] = len(self.custodian_lookup)

        if failed == 0:
            result.status = SyncStatus.SUCCESS
        elif processed > 0:
            result.status = SyncStatus.PARTIAL
        else:
            result.status = SyncStatus.FAILED

        result.end_time = datetime.now(timezone.utc)
        return result
|
|
|
|
|
|
def main():
    """Command-line entry point: parse args, run the sync, print a summary."""
    parser = argparse.ArgumentParser(description="Sync person JSON files to Oxigraph")
    parser.add_argument("--dry-run", action="store_true", help="Parse files but don't upload")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--endpoint", default=OXIGRAPH_URL, help="Oxigraph SPARQL endpoint URL")
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE, help="Files per batch")
    parser.add_argument("--entity-only", action="store_true", help="Only process entity files")
    parser.add_argument("--staff-only", action="store_true", help="Only process staff files")
    parser.add_argument("--clear", action="store_true", help="Clear existing person data first")
    args = parser.parse_args()

    # Local import keeps logging setup out of module import time.
    import logging
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

    syncer = OxigraphPersonSyncer(endpoint=args.endpoint, batch_size=args.batch_size)

    banner = "=" * 60
    print(banner)
    print("Oxigraph Person Sync")
    print(banner)

    if not args.dry_run:
        print(f"Checking connection to {args.endpoint}...")
        status = syncer.get_status()
        print(f" Status: {status.get('status', 'unknown')}")
        if status.get('person_count'):
            print(f" Current persons: {status['person_count']:,}")

    print(f" Custodian mappings: {len(syncer.custodian_lookup):,}")

    result = syncer.sync(
        limit=args.limit,
        dry_run=args.dry_run,
        clear=args.clear,
        entity_only=args.entity_only,
        staff_only=args.staff_only,
    )

    print("\n" + banner)
    print(f"Sync Result: {result.status.value.upper()}")
    print(f" Files processed: {result.records_processed}")
    print(f" Files succeeded: {result.records_succeeded}")
    print(f" Files failed: {result.records_failed}")
    print(f" Persons synced: {result.details.get('total_persons', 0):,}")
    print(f" Triples: {result.details.get('total_triples', 0):,}")
    print(f" Duration: {result.duration_seconds:.2f}s")
    if result.error_message:
        print(f" Error: {result.error_message}")
    print(banner)
|
|
|
|
|
# Script entry point: only run the sync when executed directly, not on import.
if __name__ == "__main__":
    main()
|