#!/usr/bin/env python3 """ Sync Heritage Custodian YAML files to Oxigraph triplestore. This script reads custodian YAML files from data/custodian/ and loads them as RDF triples into Oxigraph for SPARQL querying and graph-based retrieval. Usage: python scripts/sync_custodians_to_oxigraph.py [--endpoint ENDPOINT] [--batch-size N] Examples: # Sync to local Oxigraph python scripts/sync_custodians_to_oxigraph.py # Sync to production via SSH tunnel ssh -L 7878:127.0.0.1:7878 root@91.98.224.44 & python scripts/sync_custodians_to_oxigraph.py --endpoint http://localhost:7878 # Sync directly to production (if allowed) python scripts/sync_custodians_to_oxigraph.py --endpoint https://bronhouder.nl """ import argparse import hashlib import logging import os import re import sys import urllib.parse from datetime import datetime, timezone from io import BytesIO from pathlib import Path from typing import Any, Generator import httpx import yaml from rdflib import Graph, Namespace, Literal, URIRef, BNode from rdflib.namespace import RDF, RDFS, XSD, SKOS, DCTERMS, FOAF, OWL # Add project root to path PROJECT_ROOT = Path(__file__).parent.parent sys.path.insert(0, str(PROJECT_ROOT / "src")) logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S" ) logger = logging.getLogger(__name__) # Namespaces # IMPORTANT: hc: namespace must match sparql_templates.yaml for queries to work HC = Namespace("https://nde.nl/ontology/hc/") # Properties like institutionType, subregionCode HCC = Namespace("https://nde.nl/ontology/hc/class/") # Classes like Custodian GHCID = Namespace("https://w3id.org/heritage/ghcid/") # GHCID URIs for instances ORG = Namespace("http://www.w3.org/ns/org#") RICO = Namespace("https://www.ica.org/standards/RiC/ontology#") SCHEMA = Namespace("http://schema.org/") CIDOC = Namespace("http://www.cidoc-crm.org/cidoc-crm/") PROV = Namespace("http://www.w3.org/ns/prov#") WD = Namespace("http://www.wikidata.org/entity/") 
WDT = Namespace("http://www.wikidata.org/prop/direct/")
GEO = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")

# Map full institution-type names to single-letter codes (GLAMORCUBESFIXPHDNT).
# Module-level so the dict is built once, not on every add_custodian() call.
TYPE_NAME_TO_CODE = {
    "GALLERY": "G",
    "LIBRARY": "L",
    "ARCHIVE": "A",
    "MUSEUM": "M",
    "OFFICIAL_INSTITUTION": "O",
    "RESEARCH_CENTER": "R",
    "CORPORATION": "C",
    "UNKNOWN": "U",
    "BOTANICAL_ZOO": "B",
    "EDUCATION_PROVIDER": "E",
    "COLLECTING_SOCIETY": "S",
    "FEATURES": "F",
    "INTANGIBLE_HERITAGE_GROUP": "I",
    "MIXED": "X",
    "PERSONAL_COLLECTION": "P",
    "HOLY_SITES": "H",
    "DIGITAL_PLATFORM": "D",
    "NGO": "N",
    "TASTE_SMELL": "T",
}

# Valid single-letter institution-type codes (4th component of a GHCID).
VALID_TYPE_CODES = "GLAMORCUBESFIXPHDNT"


def sanitize_ghcid_for_uri(ghcid: str) -> str:
    """Sanitize a GHCID so it is valid as a URI local name.

    Replaces invalid URI characters (backticks, quotes, brackets, spaces,
    etc.) with underscores, collapses runs of underscores, and strips
    leading/trailing underscores.
    """
    sanitized = re.sub(r'[`\'\"<>{}|\\^~\[\]@#$%&*()+=,;!? ]', '_', ghcid)
    sanitized = re.sub(r'_+', '_', sanitized)
    return sanitized.strip('_')


def _valid_coord(value: Any) -> bool:
    """Return True if *value* is a usable coordinate component.

    Unlike a plain truthiness test, this accepts 0 / 0.0 (equator, prime
    meridian) while still rejecting None and empty strings. The previous
    truthiness checks silently dropped zero-valued coordinates.
    """
    return value is not None and value != ""


class CustodianRDFConverter:
    """Convert custodian YAML data to RDF triples."""

    def __init__(self):
        self.graph = Graph()
        self._bind_namespaces()

    def _bind_namespaces(self) -> None:
        """Bind all ontology namespace prefixes to the graph."""
        for prefix, ns in (
            ("hc", HC), ("hcc", HCC), ("ghcid", GHCID), ("org", ORG),
            ("rico", RICO), ("schema", SCHEMA), ("cidoc", CIDOC),
            ("prov", PROV), ("wd", WD), ("wdt", WDT), ("geo", GEO),
            ("dcterms", DCTERMS), ("foaf", FOAF), ("skos", SKOS),
            ("owl", OWL),
        ):
            self.graph.bind(prefix, ns)

    def reset(self) -> None:
        """Discard all triples and start a fresh graph for the next batch."""
        self.graph = Graph()
        self._bind_namespaces()

    def add_custodian(self, data: dict[str, Any], filepath: Path) -> URIRef | None:
        """Convert one custodian YAML document to RDF triples.

        Args:
            data: Parsed custodian YAML document.
            filepath: Source file path (stem used as GHCID fallback).

        Returns:
            The URIRef minted for the custodian.
        """
        # GHCID drives the URI; fall back to the filename stem when missing.
        ghcid_data = data.get("ghcid", {})
        ghcid = ghcid_data.get("ghcid_current", "") or filepath.stem

        # Sanitize GHCID for use in a URI (strip backticks, quotes, etc.)
        custodian_uri = GHCID[sanitize_ghcid_for_uri(ghcid)]

        original = data.get("original_entry", {})
        wikidata = data.get("wikidata_enrichment", {})
        custodian_name = data.get("custodian_name", {})

        # Type declarations common to every custodian
        for rdf_class in (HCC.Custodian, CIDOC.E39_Actor, ORG.Organization,
                          SCHEMA.Organization, PROV.Entity):
            self.graph.add((custodian_uri, RDF.type, rdf_class))

        self._add_institution_types(custodian_uri, data, original, ghcid)
        self._add_names(custodian_uri, custodian_name, wikidata, original)
        self._add_descriptions(custodian_uri, wikidata)
        self._add_ghcid_triples(custodian_uri, ghcid, ghcid_data, data)
        self._add_wikidata_links(custodian_uri, wikidata, original)

        coords = self._extract_coords(data, wikidata)
        if coords:
            self._add_location(custodian_uri, coords, wikidata)

        self._add_websites(custodian_uri, wikidata)
        self._add_place_relations(custodian_uri, wikidata)
        self._add_claims(custodian_uri, wikidata)

        # Processing provenance
        timestamp = data.get("processing_timestamp")
        if timestamp:
            self.graph.add((custodian_uri, PROV.generatedAtTime,
                            Literal(timestamp, datatype=XSD.dateTime)))

        return custodian_uri

    def _add_institution_types(self, uri: URIRef, data: dict[str, Any],
                               original: dict[str, Any], ghcid: str) -> None:
        """Resolve institution type(s) and add type-specific classes.

        Priority: 1) top-level institution_type (most common),
        2) original_entry.institution_type, 3) legacy original_entry.type,
        4) 4th GHCID component when it is a valid single-letter code.
        """
        inst_type_raw = (
            data.get("institution_type")
            or original.get("institution_type")
            or original.get("type")
            or None
        )

        if not inst_type_raw and ghcid:
            # GHCID format: COUNTRY-REGION-CITY-TYPE-ABBREV (e.g. NL-NH-AMS-M-RM)
            ghcid_parts = ghcid.split("-")
            if len(ghcid_parts) >= 4:
                type_from_ghcid = ghcid_parts[3]
                # Only trust single-letter codes coming from the GHCID
                if len(type_from_ghcid) == 1 and type_from_ghcid.upper() in VALID_TYPE_CODES:
                    inst_type_raw = type_from_ghcid.upper()

        # Normalize to a list so single values and lists are handled alike
        if isinstance(inst_type_raw, str):
            inst_type_raw = [inst_type_raw]
        elif not inst_type_raw:
            inst_type_raw = []

        for inst_type in inst_type_raw:
            # Convert a full name (e.g. "MUSEUM") to its code; pass codes through
            type_code = TYPE_NAME_TO_CODE.get(inst_type.upper(), inst_type) if inst_type else None
            if type_code:
                self._add_type_class(uri, type_code)

    def _add_names(self, uri: URIRef, custodian_name: dict[str, Any],
                   wikidata: dict[str, Any], original: dict[str, Any]) -> None:
        """Add preferred and alternative labels; the emic name wins."""
        name = (
            custodian_name.get("emic_name")
            or custodian_name.get("claim_value")
            or wikidata.get("wikidata_label_nl")
            or wikidata.get("wikidata_label_en")
            or original.get("name")
            or original.get("organisatie", "")
        )
        if name:
            self.graph.add((uri, SKOS.prefLabel, Literal(name, lang="nl")))
            self.graph.add((uri, SCHEMA.name, Literal(name)))
            self.graph.add((uri, FOAF.name, Literal(name)))
            self.graph.add((uri, RDFS.label, Literal(name)))

        # Alternative names from Wikidata labels (skip the preferred name)
        labels = wikidata.get("wikidata_labels") or {}
        for lang, label in labels.items():
            if label and label != name:
                # Normalize "nl-NL" / "nl_NL" to the base language tag "nl"
                lang_tag = lang.replace("-", "_").split("_")[0]
                self.graph.add((uri, SKOS.altLabel, Literal(label, lang=lang_tag)))

    def _add_descriptions(self, uri: URIRef, wikidata: dict[str, Any]) -> None:
        """Add English (preferred) or Dutch description."""
        description = (
            wikidata.get("wikidata_description_en")
            or wikidata.get("wikidata_description_nl", "")
        )
        if description:
            # NOTE(review): the nl fallback is also tagged lang="en" (as in the
            # original code) — confirm this is intended.
            self.graph.add((uri, DCTERMS.description, Literal(description, lang="en")))
            self.graph.add((uri, SCHEMA.description, Literal(description)))

    def _add_ghcid_triples(self, uri: URIRef, ghcid: str,
                           ghcid_data: dict[str, Any], data: dict[str, Any]) -> None:
        """Add the GHCID identifier, derived region codes, and settlement name."""
        if ghcid:
            self.graph.add((uri, HC.ghcid, Literal(ghcid)))
            self.graph.add((uri, DCTERMS.identifier, Literal(ghcid)))

            # Derive subregion/country codes from the GHCID
            # (COUNTRY-REGION-CITY-TYPE-ABBREV): NL-NH-AMS-M-RM -> NL-NH, NL
            ghcid_parts = ghcid.split("-")
            if len(ghcid_parts) >= 2:
                country = ghcid_parts[0]
                region = ghcid_parts[1]
                self.graph.add((uri, HC.subregionCode, Literal(f"{country}-{region}")))
                self.graph.add((uri, HC.countryCode, Literal(country)))

        # Settlement/city name — used by SPARQL templates for city-based queries
        location = data.get("location", {})
        city_name = location.get("city") or location.get("geonames_name")
        if city_name:
            self.graph.add((uri, HC.settlementName, Literal(city_name)))

        ghcid_uuid = ghcid_data.get("ghcid_uuid")
        if ghcid_uuid:
            self.graph.add((uri, HC.ghcidUUID, Literal(ghcid_uuid)))

    def _add_wikidata_links(self, uri: URIRef, wikidata: dict[str, Any],
                            original: dict[str, Any]) -> None:
        """Add sameAs links to Wikidata and external identifiers (ISIL, ...)."""
        wikidata_id = (
            wikidata.get("wikidata_entity_id")
            or original.get("wikidata_id", "")
        )
        if wikidata_id:
            wd_uri = WD[wikidata_id]
            self.graph.add((uri, OWL.sameAs, wd_uri))
            self.graph.add((uri, SCHEMA.sameAs, wd_uri))
            self.graph.add((uri, HC.wikidataId, Literal(wikidata_id)))

        # ISIL code (Wikidata takes precedence over the original entry)
        wd_identifiers = wikidata.get("wikidata_identifiers") or {}
        isil = wd_identifiers.get("isil") or original.get("isil-code_na", "")
        if isil:
            self._add_identifier(uri, "ISIL", isil)

        # Remaining identifiers from Wikidata
        for id_type, id_value in wd_identifiers.items():
            if id_type != "isil" and id_value:
                self._add_identifier(uri, id_type.upper(), str(id_value))

    @staticmethod
    def _extract_coords(data: dict[str, Any], wikidata: dict[str, Any]) -> dict | None:
        """Find the best available coordinate pair, or None.

        Priority: 1) location.latitude/longitude (canonical),
        2) location.coordinates (GeoNames format), 3) wikidata_coordinates,
        4) google_maps_enrichment.coordinates.

        Uses _valid_coord so 0.0 latitude/longitude is accepted — the
        previous truthiness test dropped coordinates on the equator or
        prime meridian.
        """
        location = data.get("location", {})
        if _valid_coord(location.get("latitude")) and _valid_coord(location.get("longitude")):
            return {
                "latitude": location["latitude"],
                "longitude": location["longitude"],
            }

        loc_coords = location.get("coordinates", {})
        if _valid_coord(loc_coords.get("latitude")) and _valid_coord(loc_coords.get("longitude")):
            return loc_coords

        wd_coords = wikidata.get("wikidata_coordinates") or {}
        if _valid_coord(wd_coords.get("latitude")) and _valid_coord(wd_coords.get("longitude")):
            return wd_coords

        gmaps = data.get("google_maps_enrichment") or {}
        gmaps_coords = gmaps.get("coordinates") or {}
        if _valid_coord(gmaps_coords.get("latitude")) and _valid_coord(gmaps_coords.get("longitude")):
            return gmaps_coords

        return None

    def _add_websites(self, uri: URIRef, wikidata: dict[str, Any]) -> None:
        """Add official website URL(s); handles a single string or a list."""
        website = wikidata.get("wikidata_official_website")
        if not website:
            return
        websites = website if isinstance(website, list) else [website]
        for url in websites:
            if not (url and isinstance(url, str)):
                continue
            # Sanitize: backslashes -> forward slashes; skip URLs that still
            # contain characters invalid in an IRI
            sanitized_url = url.replace("\\", "/")
            if any(c in sanitized_url for c in ('<', '>', '{', '}', '|', '^', '`', ' ')):
                continue
            try:
                self.graph.add((uri, SCHEMA.url, URIRef(sanitized_url)))
                self.graph.add((uri, FOAF.homepage, URIRef(sanitized_url)))
            except Exception:
                # URIRef construction may still reject malformed IRIs; skip them
                pass

    def _add_place_relations(self, uri: URIRef, wikidata: dict[str, Any]) -> None:
        """Add administrative-location and country links."""
        located_in = wikidata.get("wikidata_located_in") or {}
        if located_in.get("id"):
            loc_uri = WD[located_in["id"]]
            self.graph.add((uri, SCHEMA.containedInPlace, loc_uri))
            self.graph.add((uri, WDT.P131, loc_uri))  # P131 = located in admin entity

        # Country — both dict format {"id": "Q16"} and bare string "Q16"
        country = wikidata.get("wikidata_country")
        country_id = None
        if isinstance(country, dict) and country.get("id"):
            country_id = country["id"]
        elif isinstance(country, str) and country.startswith("Q"):
            country_id = country
        if country_id:
            country_uri = WD[country_id]
            self.graph.add((uri, SCHEMA.addressCountry, country_uri))
            self.graph.add((uri, WDT.P17, country_uri))  # P17 = country

    def _add_claims(self, uri: URIRef, wikidata: dict[str, Any]) -> None:
        """Add instance-of plus selected relations from wikidata_claims."""
        # Instance of — list of dicts and list of strings are both accepted
        for instance in wikidata.get("wikidata_instance_of") or []:
            inst_id = None
            if isinstance(instance, dict) and instance.get("id"):
                inst_id = instance["id"]
            elif isinstance(instance, str) and instance.startswith("Q"):
                inst_id = instance
            if inst_id:
                self.graph.add((uri, WDT.P31, WD[inst_id]))  # P31 = instance of

        claims = wikidata.get("wikidata_claims") or {}

        # Partnerships (P2652): {"value": [{"id": ...}, ...]}
        partners = claims.get("P2652_partnership_with") or {}
        if isinstance(partners, dict) and partners.get("value"):
            partner_values = partners["value"]
            if isinstance(partner_values, list):
                for partner in partner_values:
                    if partner.get("id"):
                        partner_uri = WD[partner["id"]]
                        self.graph.add((uri, ORG.linkedTo, partner_uri))
                        self.graph.add((uri, HC.partnerOf, partner_uri))

        # Member of
        # NOTE(review): unlike the other claims this reads "id" directly rather
        # than through "value" — confirm against the actual claim structure.
        member_of = claims.get("member_of") or {}
        if member_of.get("id"):
            self.graph.add((uri, ORG.memberOf, WD[member_of["id"]]))

        # Replaces (P1365): predecessor organization
        replaces = claims.get("P1365_replaces") or {}
        if isinstance(replaces, dict) and (replaces.get("value") or {}).get("id"):
            predecessor_uri = WD[replaces["value"]["id"]]
            self.graph.add((uri, DCTERMS.replaces, predecessor_uri))
            self.graph.add((uri, PROV.wasDerivedFrom, predecessor_uri))

        # Has parts (P527): collections, sub-organizations
        has_parts = claims.get("P527_has_part_s_") or {}
        if isinstance(has_parts, dict) and (has_parts.get("value") or {}).get("id"):
            self.graph.add((uri, DCTERMS.hasPart, WD[has_parts["value"]["id"]]))

    def _add_type_class(self, uri: URIRef, inst_type: str) -> None:
        """Add institution-type-specific RDF classes plus the code literal."""
        type_mappings = {
            "M": [SCHEMA.Museum, CIDOC["E74_Group"]],
            "A": [SCHEMA.ArchiveOrganization, RICO.CorporateBody],
            "L": [SCHEMA.Library],
            "G": [SCHEMA.Museum],  # Gallery
            "R": [SCHEMA.ResearchOrganization],
            "H": [SCHEMA.PlaceOfWorship],  # Holy sites
            "E": [SCHEMA.EducationalOrganization],
            "S": [ORG.Organization],  # Societies
            "O": [ORG.Organization],  # Official
        }
        for cls in type_mappings.get(inst_type, []):
            self.graph.add((uri, RDF.type, cls))
        # Also record the type code itself as a literal
        if inst_type:
            self.graph.add((uri, HC.institutionType, Literal(inst_type)))

    def _add_identifier(self, uri: URIRef, scheme: str, value: str) -> None:
        """Add an identifier as a CIDOC E42 blank node, plus direct shortcuts."""
        id_node = BNode()
        self.graph.add((id_node, RDF.type, CIDOC.E42_Identifier))
        self.graph.add((id_node, RICO.identifierType, Literal(scheme)))
        self.graph.add((id_node, RICO.textualValue, Literal(value)))
        self.graph.add((uri, CIDOC.P1_is_identified_by, id_node))

        # Direct properties for the most common identifier schemes
        if scheme == "ISIL":
            self.graph.add((uri, HC.isil, Literal(value)))
        elif scheme == "VIAF":
            self.graph.add((uri, HC.viaf, Literal(value)))
        elif scheme == "GND":
            self.graph.add((uri, HC.gnd, Literal(value)))

    def _add_location(self, uri: URIRef, coords: dict, wikidata: dict) -> None:
        """Attach a schema:Place blank node with WGS84 coordinates."""
        loc_node = BNode()
        self.graph.add((loc_node, RDF.type, SCHEMA.Place))
        self.graph.add((loc_node, RDF.type, GEO.SpatialThing))

        lat = coords.get("latitude")
        lon = coords.get("longitude")
        # Explicit presence check instead of truthiness: 0.0 is a valid coordinate
        if _valid_coord(lat) and _valid_coord(lon):
            self.graph.add((loc_node, GEO.lat, Literal(lat, datatype=XSD.decimal)))
            self.graph.add((loc_node, GEO.long, Literal(lon, datatype=XSD.decimal)))
            self.graph.add((loc_node, SCHEMA.latitude, Literal(lat, datatype=XSD.decimal)))
            self.graph.add((loc_node, SCHEMA.longitude, Literal(lon, datatype=XSD.decimal)))

        # City name from the Wikidata located-in entity, when labelled
        located_in = wikidata.get("wikidata_located_in") or {}
        city_label = located_in.get("label_en") or located_in.get("label_nl")
        if city_label:
            self.graph.add((loc_node, SCHEMA.addressLocality, Literal(city_label)))

        # Link the place to the custodian
        self.graph.add((uri, SCHEMA.location, loc_node))

    def serialize(self, format: str = "turtle") -> bytes:
        """Serialize the graph to bytes (rdflib returns str for text formats)."""
        return self.graph.serialize(format=format).encode("utf-8")

    def triple_count(self) -> int:
        """Number of triples currently in the graph."""
        return len(self.graph)


class OxigraphLoader:
    """Load RDF data into an Oxigraph store over its HTTP API."""

    def __init__(self, endpoint: str = "http://localhost:7878"):
        self.endpoint = endpoint.rstrip("/")
        # Generous timeout: bulk Turtle uploads of large batches can be slow
        self.client = httpx.Client(timeout=120.0)

    def check_health(self) -> bool:
        """Return True if the Oxigraph query endpoint answers a trivial ASK."""
        try:
            resp = self.client.post(
                f"{self.endpoint}/query",
                headers={"Content-Type": "application/sparql-query"},
                content="ASK { ?s ?p ?o }"
            )
            return resp.status_code in (200, 204)
        except Exception as e:
            logger.error(f"Oxigraph health check failed: {e}")
            return False

    def get_triple_count(self) -> int:
        """Return the current number of triples in the store (0 on failure)."""
        try:
            resp = self.client.post(
                f"{self.endpoint}/query",
                headers={
                    "Content-Type": "application/sparql-query",
                    "Accept": "application/sparql-results+json"
                },
                content="SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }"
            )
            if resp.status_code == 200:
                data = resp.json()
                return int(data["results"]["bindings"][0]["count"]["value"])
        except Exception as e:
            logger.error(f"Failed to get triple count: {e}")
        return 0

    def load_turtle(self, ttl_data: bytes) -> bool:
        """POST Turtle data into the default graph; True on success."""
        try:
            resp = self.client.post(
                f"{self.endpoint}/store?default",
                headers={"Content-Type": "text/turtle"},
                content=ttl_data
            )
            return resp.status_code in (200, 201, 204)
        except Exception as e:
            logger.error(f"Failed to load data: {e}")
            return False

    def clear_graph(self, graph_uri: str | None = None) -> bool:
        """Clear a named graph, or the default graph when graph_uri is None.

        NOTE: graph_uri is interpolated into the SPARQL update verbatim —
        only pass trusted values (here it comes from the CLI).
        """
        try:
            if graph_uri:
                query = f"CLEAR GRAPH <{graph_uri}>"
            else:
                query = "CLEAR DEFAULT"
            resp = self.client.post(
                f"{self.endpoint}/update",
                headers={"Content-Type": "application/sparql-update"},
                content=query
            )
            return resp.status_code in (200, 204)
        except Exception as e:
            logger.error(f"Failed to clear graph: {e}")
            return False


def iter_custodian_files(data_dir: Path) -> Generator[Path, None, None]:
    """Yield custodian YAML files in deterministic (sorted) order."""
    for filepath in sorted(data_dir.glob("*.yaml")):
        yield filepath


def load_yaml(filepath: Path) -> dict[str, Any] | None:
    """Load a YAML file, returning None (and logging) on any failure.

    Note: Large files (>1MB) are processed normally - they contain valuable
    YouTube transcripts and other enrichment data that should not be stripped.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return yaml.safe_load(f)
    except Exception as e:
        logger.warning(f"Failed to load {filepath.name}: {e}")
        return None


def main():
    """CLI entry point: parse args, convert YAML files in batches, upload."""
    parser = argparse.ArgumentParser(description="Sync custodian YAML files to Oxigraph")
    parser.add_argument("--data-dir", type=Path,
                        default=PROJECT_ROOT / "data" / "custodian",
                        help="Directory containing custodian YAML files")
    parser.add_argument("--endpoint", type=str, default="http://localhost:7878",
                        help="Oxigraph SPARQL endpoint URL")
    parser.add_argument("--batch-size", type=int, default=500,
                        help="Number of files to process per batch")
    parser.add_argument("--clear", action="store_true",
                        help="Clear existing data before loading")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit number of files to process (for testing)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Parse files but don't upload to Oxigraph")
    args = parser.parse_args()

    # Initialize
    converter = CustodianRDFConverter()
    loader = OxigraphLoader(args.endpoint)
    initial_count = 0  # Track initial triple count for statistics

    # Check Oxigraph availability (skipped entirely in dry-run mode)
    if not args.dry_run:
        logger.info(f"Connecting to Oxigraph at {args.endpoint}...")
        if not loader.check_health():
            logger.error("Cannot connect to Oxigraph. Ensure it's running and accessible.")
            sys.exit(1)
        initial_count = loader.get_triple_count()
        logger.info(f"Initial triple count: {initial_count:,}")

        if args.clear:
            logger.info("Clearing existing custodian data...")
            # Clears the default graph (custodian data lives there)
            loader.clear_graph()
            logger.info("Store cleared")

    # Count files before applying --limit so the log shows both numbers
    files = list(iter_custodian_files(args.data_dir))
    total_files = len(files)
    if args.limit:
        files = files[:args.limit]
    logger.info(f"Found {total_files:,} custodian files, processing {len(files):,}")

    # Process in batches: convert a batch, upload it, reset the graph
    processed = 0
    failed = 0
    total_triples = 0

    for i in range(0, len(files), args.batch_size):
        batch = files[i:i + args.batch_size]
        batch_num = i // args.batch_size + 1
        converter.reset()

        for filepath in batch:
            data = load_yaml(filepath)
            if data:
                try:
                    converter.add_custodian(data, filepath)
                    processed += 1
                except Exception as e:
                    logger.warning(f"Error processing {filepath.name}: {e}")
                    failed += 1
            else:
                failed += 1

        batch_triples = converter.triple_count()
        total_triples += batch_triples

        if not args.dry_run:
            # Upload batch; a failed batch is logged but does not abort the run
            try:
                ttl_data = converter.serialize("turtle")
                if loader.load_turtle(ttl_data):
                    logger.info(f"Batch {batch_num}: loaded {len(batch)} files, {batch_triples:,} triples")
                else:
                    logger.error(f"Batch {batch_num}: failed to upload")
            except Exception as e:
                logger.error(f"Batch {batch_num}: serialization failed: {e}")
                # Continue with next batch
        else:
            logger.info(f"Batch {batch_num}: parsed {len(batch)} files, {batch_triples:,} triples (dry-run)")

    # Final statistics
    logger.info("=" * 60)
    logger.info("Processing complete:")
    logger.info(f"  Files processed: {processed:,}")
    logger.info(f"  Files failed: {failed:,}")
    logger.info(f"  Total triples generated: {total_triples:,}")
    if not args.dry_run:
        final_count = loader.get_triple_count()
        added = final_count - initial_count
        logger.info(f"  Triples added: {added:,}")
        logger.info(f"  Final triple count: {final_count:,}")


if __name__ == "__main__":
    main()