#!/usr/bin/env python3
"""
NDE Enriched YAML to Heritage Custodian RDF Transformer

Converts enriched NDE heritage custodian data from YAML format to RDF
(Turtle), aligned with the Heritage Custodian ontology.

Output: data/nde/rdf/{ghcid_numeric}.ttl

Usage:
    python scripts/nde_to_hc_rdf.py              # Transform all entries
    python scripts/nde_to_hc_rdf.py --entry 0946 # Transform single entry
    python scripts/nde_to_hc_rdf.py --dry-run    # Preview without writing

Author: GLAM Data Extraction Project
Date: 2025-12-02
"""

import argparse
import logging
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Optional
from urllib.parse import quote, urlparse

import yaml
from rdflib import Graph, Literal, Namespace, URIRef
from rdflib.namespace import DCTERMS, FOAF, OWL, RDF, RDFS, SKOS, XSD

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Project paths
PROJECT_ROOT = Path(__file__).parent.parent
NDE_ENRICHED_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
RDF_OUTPUT_DIR = PROJECT_ROOT / "data" / "nde" / "rdf"

# Namespaces
HC = Namespace("https://nde.nl/ontology/hc/")
HCC = Namespace("https://nde.nl/ontology/hc/class/")
SCHEMA = Namespace("http://schema.org/")
CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
PROV = Namespace("http://www.w3.org/ns/prov#")
WD = Namespace("http://www.wikidata.org/entity/")
ORG = Namespace("http://www.w3.org/ns/org#")

# NDE type code to CustodianPrimaryTypeEnum mapping
TYPE_CODE_MAP = {
    'G': 'GALLERY',
    'L': 'LIBRARY',
    'A': 'ARCHIVE',
    'M': 'MUSEUM',
    'O': 'OFFICIAL_INSTITUTION',
    'R': 'RESEARCH_CENTER',
    'C': 'COMMERCIAL',
    'U': 'UNSPECIFIED',
    'B': 'BIO_CUSTODIAN',
    'E': 'EDUCATION_PROVIDER',
    'S': 'HERITAGE_SOCIETY',
    'F': 'FEATURE_CUSTODIAN',
    'I': 'INTANGIBLE_HERITAGE_GROUP',
    'X': 'MIXED',
    'P': 'PERSONAL_COLLECTION',
    'H': 'HOLY_SACRED_SITE',
    'D': 'DIGITAL_PLATFORM',
    'N': 'NON_PROFIT',
    'T': 'TASTE_SCENT_HERITAGE',
}

# Social media claim type to SocialMediaPlatformTypeEnum mapping
SOCIAL_CLAIM_TYPE_MAP = {
    'social_facebook': 'FACEBOOK',
    'social_instagram': 'INSTAGRAM',
    'social_linkedin': 'LINKEDIN',
    'social_youtube': 'YOUTUBE',
    'social_twitter': 'X_TWITTER',
    'social_x': 'X_TWITTER',
    'social_tiktok': 'TIKTOK',
    'social_pinterest': 'PINTEREST',
    'social_flickr': 'FLICKR',
    'social_vimeo': 'VIMEO',
    'social_threads': 'THREADS',
    'social_bluesky': 'BLUESKY',
    'social_mastodon': 'MASTODON',
}

# Identifier scheme to URIs
IDENTIFIER_SCHEME_MAP = {
    'ISIL': 'https://www.iso.org/standard/77849.html',
    'Wikidata': 'https://www.wikidata.org/wiki/',
    'VIAF': 'https://viaf.org/viaf/',
    'GND': 'https://d-nb.info/gnd/',
    'ISNI': 'https://isni.org/isni/',
    'LCNAF': 'https://id.loc.gov/authorities/names/',
    'ROR': 'https://ror.org/',
    'GHCID': 'https://nde.nl/ontology/hc/',
    'GHCID_UUID': 'urn:uuid:',
    'GHCID_UUID_SHA256': 'urn:uuid:',
    'GHCID_NUMERIC': 'https://nde.nl/ontology/hc/',
    'RECORD_ID': 'urn:uuid:',
    'Ringgold': 'https://www.ringgold.com/identify/',
}


class NDEToHCTransformer:
    """Transform NDE enriched YAML to Heritage Custodian RDF."""

    def __init__(self, dry_run: bool = False):
        # When dry_run is set, graphs are built but never written to disk.
        self.dry_run = dry_run
        self.stats = {
            'processed': 0,
            'success': 0,
            'errors': 0,
            'skipped': 0,
        }

    @staticmethod
    def _safe_uriref(url: Any) -> Optional[URIRef]:
        """Return a URIRef for *url*, or None when the value cannot form a
        valid IRI (empty, or contains whitespace that would break Turtle
        serialization). Values come from enrichment YAML and are untrusted.
        """
        if not url:
            return None
        url = str(url).strip()
        if not url or ' ' in url:
            logger.debug(f"Skipping invalid URI value: {url!r}")
            return None
        return URIRef(url)

    def _transform_file(self, entry_path: Path) -> Optional[tuple]:
        """Transform one entry file.

        Returns (Graph, ghcid_numeric) on success, or None when the entry
        is empty, lacks a GHCID, or raises during transformation. Parsing
        happens exactly once here so callers never re-read the YAML.
        """
        try:
            with open(entry_path, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            if not entry:
                logger.warning(f"Empty entry: {entry_path}")
                # Count empty files as skipped so the summary adds up.
                self.stats['skipped'] += 1
                return None

            # Get GHCID numeric for URI
            ghcid_data = entry.get('ghcid', {})
            ghcid_numeric = ghcid_data.get('ghcid_numeric')
            if not ghcid_numeric:
                logger.warning(f"No GHCID numeric in {entry_path}, skipping")
                self.stats['skipped'] += 1
                return None

            # Create RDF graph with the project's prefix bindings.
            g = Graph()
            g.bind('hc', HC)
            g.bind('hcc', HCC)
            g.bind('schema', SCHEMA)
            g.bind('crm', CRM)
            g.bind('prov', PROV)
            g.bind('foaf', FOAF)
            g.bind('skos', SKOS)
            g.bind('dcterms', DCTERMS)
            g.bind('wd', WD)
            g.bind('org', ORG)

            # Custodian hub URI
            custodian_uri = URIRef(f"{HC}{ghcid_numeric}")

            # Add type assertion
            g.add((custodian_uri, RDF.type, CRM.E39_Actor))
            g.add((custodian_uri, RDF.type, HCC.Custodian))

            # hc_id (dcterms:identifier)
            g.add((custodian_uri, DCTERMS.identifier, Literal(str(ghcid_numeric))))

            # preferred_label (skos:prefLabel)
            self._add_preferred_label(g, custodian_uri, entry)

            # custodian_type
            self._add_custodian_type(g, custodian_uri, entry)

            # identifiers (CustodianIdentifier instances)
            self._add_identifiers(g, custodian_uri, entry, ghcid_numeric)

            # digital_platform (DigitalPlatform)
            self._add_digital_platform(g, custodian_uri, entry, ghcid_numeric)

            # social_media_profiles (SocialMediaProfile)
            self._add_social_media_profiles(g, custodian_uri, entry, ghcid_numeric)

            # Location/place data (coordinates, address)
            self._add_place_data(g, custodian_uri, entry, ghcid_numeric)

            # created/modified timestamps
            self._add_timestamps(g, custodian_uri, entry)

            return g, ghcid_numeric

        except Exception as e:
            logger.error(f"Error transforming {entry_path}: {e}")
            self.stats['errors'] += 1
            return None

    def transform_entry(self, entry_path: Path) -> Optional[Graph]:
        """Transform a single NDE entry to RDF Graph.

        Kept as the public per-entry API; delegates to _transform_file.
        """
        result = self._transform_file(entry_path)
        return result[0] if result else None

    def _add_preferred_label(self, g: Graph, custodian_uri: URIRef, entry: dict):
        """Add preferred label from custodian_name or fallbacks.

        Priority: custodian_name claim > Wikidata NL/EN label > original
        NDE entry's 'organisatie' field. The chosen label is tagged @nl;
        a differing English Wikidata label is added as skos:altLabel @en.
        """
        label = None

        # Priority: custodian_name > wikidata label > original entry
        custodian_name = entry.get('custodian_name', {})
        if custodian_name:
            label = custodian_name.get('claim_value')

        if not label:
            wikidata = entry.get('wikidata_enrichment', {})
            label = wikidata.get('wikidata_label_nl') or wikidata.get('wikidata_label_en')

        if not label:
            original = entry.get('original_entry', {})
            label = original.get('organisatie')

        if label:
            g.add((custodian_uri, SKOS.prefLabel, Literal(label, lang='nl')))

        # Also add English label if available
        wikidata = entry.get('wikidata_enrichment', {})
        en_label = wikidata.get('wikidata_label_en')
        if en_label and en_label != label:
            g.add((custodian_uri, SKOS.altLabel, Literal(en_label, lang='en')))

    def _add_custodian_type(self, g: Graph, custodian_uri: URIRef, entry: dict):
        """Add custodian type(s) from the original entry's NDE type codes.

        Unknown codes map to 'UNSPECIFIED' rather than being dropped.
        """
        original = entry.get('original_entry', {})
        type_codes = original.get('type', [])
        if not type_codes:
            return

        for code in type_codes:
            type_enum = TYPE_CODE_MAP.get(code, 'UNSPECIFIED')
            g.add((custodian_uri, HC.custodian_type, Literal(type_enum)))

    def _add_identifiers(self, g: Graph, custodian_uri: URIRef, entry: dict,
                         ghcid_numeric: str):
        """Add CustodianIdentifier instances.

        Sources, in order: the entry's own 'identifiers' list, the Wikidata
        entity id (also asserted as owl:sameAs), and any external scheme
        identifiers found in the Wikidata enrichment.
        """
        # From entry.identifiers list
        identifiers = entry.get('identifiers', [])
        for ident in identifiers:
            scheme = ident.get('identifier_scheme')
            value = ident.get('identifier_value')
            if not scheme or not value:
                continue

            # Create identifier URI (one node per scheme per custodian)
            ident_uri = URIRef(f"{HC}identifier/{ghcid_numeric}/{scheme.lower()}")
            g.add((ident_uri, RDF.type, CRM.E42_Identifier))
            g.add((ident_uri, SKOS.inScheme, Literal(scheme)))
            g.add((ident_uri, SKOS.notation, Literal(str(value))))

            # Link to custodian (both directions)
            g.add((custodian_uri, CRM.P48_has_preferred_identifier, ident_uri))
            g.add((ident_uri, CRM.P48i_is_preferred_identifier_of, custodian_uri))

            # Add identifier URL if available and well-formed
            url_ref = self._safe_uriref(ident.get('identifier_url'))
            if url_ref:
                g.add((ident_uri, SCHEMA.url, url_ref))

        # Wikidata identifiers from enrichment
        wikidata = entry.get('wikidata_enrichment', {})
        wikidata_id = wikidata.get('wikidata_entity_id')
        if wikidata_id:
            wd_ident_uri = URIRef(f"{HC}identifier/{ghcid_numeric}/wikidata")
            g.add((wd_ident_uri, RDF.type, CRM.E42_Identifier))
            g.add((wd_ident_uri, SKOS.inScheme, Literal('Wikidata')))
            g.add((wd_ident_uri, SKOS.notation, Literal(wikidata_id)))
            g.add((wd_ident_uri, SCHEMA.url, URIRef(f"https://www.wikidata.org/wiki/{wikidata_id}")))
            g.add((custodian_uri, CRM.P48_has_preferred_identifier, wd_ident_uri))

            # Also add owl:sameAs to Wikidata entity
            g.add((custodian_uri, OWL.sameAs, WD[wikidata_id]))

        # Additional Wikidata identifiers (VIAF, GND, ISNI, etc.)
        wd_identifiers = wikidata.get('wikidata_identifiers', {})
        for scheme, value in wd_identifiers.items():
            scheme_upper = scheme.upper()
            # Only schemes we know how to reference are emitted.
            if scheme_upper in IDENTIFIER_SCHEME_MAP:
                ext_ident_uri = URIRef(f"{HC}identifier/{ghcid_numeric}/{scheme.lower()}")
                g.add((ext_ident_uri, RDF.type, CRM.E42_Identifier))
                g.add((ext_ident_uri, SKOS.inScheme, Literal(scheme_upper)))
                g.add((ext_ident_uri, SKOS.notation, Literal(str(value))))
                g.add((custodian_uri, CRM.P48_has_preferred_identifier, ext_ident_uri))

    def _add_digital_platform(self, g: Graph, custodian_uri: URIRef, entry: dict,
                              ghcid_numeric: str):
        """Add DigitalPlatform for the custodian's website.

        Website priority: Wikidata official website > Google Maps website.
        Also attaches online catalog URL(s) from Wikidata claim P8768.
        """
        # Get website from various sources
        wikidata = entry.get('wikidata_enrichment', {})
        website = wikidata.get('wikidata_official_website')
        if not website:
            google = entry.get('google_maps_enrichment', {})
            website = google.get('website')

        website_ref = self._safe_uriref(website)
        if not website_ref:
            return

        # Create DigitalPlatform instance
        platform_uri = URIRef(f"{HC}platform/{ghcid_numeric}/website")
        g.add((platform_uri, RDF.type, HCC.DigitalPlatform))
        g.add((platform_uri, FOAF.homepage, website_ref))
        g.add((platform_uri, SCHEMA.url, website_ref))

        # Link to custodian
        g.add((custodian_uri, FOAF.homepage, platform_uri))

        # Add online catalog URL if available
        claims = wikidata.get('wikidata_claims', {})
        catalog_claim = claims.get('P8768_online_catalog_url', {})
        catalog_values = catalog_claim.get('value') if isinstance(catalog_claim, dict) else None
        if catalog_values:
            # Normalize single value vs list of values
            if not isinstance(catalog_values, list):
                catalog_values = [catalog_values]
            for catalog_url in catalog_values:
                catalog_ref = self._safe_uriref(catalog_url)
                if catalog_ref:
                    g.add((platform_uri, HC.collection_url, catalog_ref))

    def _add_social_media_profiles(self, g: Graph, custodian_uri: URIRef,
                                   entry: dict, ghcid_numeric: str):
        """Add SocialMediaProfile instances from web_claims.

        Non-profile URLs (share links, intent URLs, URLs with query params
        or spaces) are skipped. A Wikidata P2002 Twitter/X username is
        added only when web_claims did not already supply one.
        """
        web_claims = entry.get('web_claims', {})
        claims = web_claims.get('claims', [])

        for claim in claims:
            claim_type = claim.get('claim_type', '')
            if not claim_type.startswith('social_'):
                continue

            platform_type = SOCIAL_CLAIM_TYPE_MAP.get(claim_type)
            if not platform_type:
                continue

            profile_url = claim.get('claim_value')
            if not profile_url:
                continue

            # Validate URL - skip share links, intent URLs, or URLs with query params
            if '/share?' in profile_url or '/intent/' in profile_url or '&' in profile_url:
                logger.debug(f"Skipping non-profile URL: {profile_url}")
                continue

            # Validate URL doesn't contain spaces or invalid characters
            if ' ' in profile_url:
                logger.debug(f"Skipping URL with spaces: {profile_url}")
                continue

            # Create SocialMediaProfile instance
            profile_uri = URIRef(f"{HC}social/{ghcid_numeric}/{platform_type.lower()}")
            g.add((profile_uri, RDF.type, FOAF.OnlineAccount))
            g.add((profile_uri, RDF.type, HCC.SocialMediaProfile))
            g.add((profile_uri, HC.platform_type, Literal(platform_type)))
            g.add((profile_uri, FOAF.accountServiceHomepage, URIRef(profile_url)))

            # Extract account name from URL
            account_name = self._extract_account_name(profile_url, platform_type)
            if account_name:
                g.add((profile_uri, FOAF.accountName, Literal(account_name)))

            # Add provenance from web claim
            source_ref = self._safe_uriref(claim.get('source_url'))
            if source_ref:
                g.add((profile_uri, PROV.wasDerivedFrom, source_ref))

            retrieved_on = claim.get('retrieved_on')
            if retrieved_on:
                g.add((profile_uri, PROV.generatedAtTime,
                       Literal(retrieved_on, datatype=XSD.dateTime)))

            # Link to custodian
            g.add((custodian_uri, FOAF.account, profile_uri))

        # Also check Wikidata for Twitter username
        wikidata = entry.get('wikidata_enrichment', {})
        claims_wd = wikidata.get('wikidata_claims', {})
        twitter_claim = claims_wd.get('P2002_x__twitter__username', {})
        twitter_value = twitter_claim.get('value') if isinstance(twitter_claim, dict) else None
        if twitter_value:
            # Handle both single value and list of values (take first)
            if isinstance(twitter_value, list):
                twitter_username = twitter_value[0] if twitter_value else None
            else:
                twitter_username = twitter_value

            if twitter_username:
                # Check if we already have Twitter from web_claims
                existing_twitter_uri = URIRef(f"{HC}social/{ghcid_numeric}/x_twitter")
                if (existing_twitter_uri, RDF.type, FOAF.OnlineAccount) not in g:
                    g.add((existing_twitter_uri, RDF.type, FOAF.OnlineAccount))
                    g.add((existing_twitter_uri, RDF.type, HCC.SocialMediaProfile))
                    g.add((existing_twitter_uri, HC.platform_type, Literal('X_TWITTER')))
                    g.add((existing_twitter_uri, FOAF.accountName, Literal(twitter_username)))
                    g.add((existing_twitter_uri, FOAF.accountServiceHomepage,
                           URIRef(f"https://x.com/{twitter_username}")))
                    g.add((custodian_uri, FOAF.account, existing_twitter_uri))

    def _extract_account_name(self, url: str, platform_type: str) -> Optional[str]:
        """Extract an account name from a social media profile URL.

        Heuristic: usually the last path component; special-cases
        linkedin.com/company/<name> and youtube.com/channel/<id>.
        Returns None when nothing usable can be extracted.
        """
        try:
            parsed = urlparse(url)
            path = parsed.path.strip('/')

            if platform_type in ('FACEBOOK', 'INSTAGRAM', 'LINKEDIN', 'YOUTUBE'):
                # Usually the last path component
                parts = path.split('/')
                if parts:
                    # Handle linkedin.com/company/name format
                    if platform_type == 'LINKEDIN' and len(parts) >= 2:
                        return parts[-1]
                    # Handle youtube.com/channel/ID format
                    if platform_type == 'YOUTUBE' and 'channel' in parts:
                        idx = parts.index('channel')
                        if idx + 1 < len(parts):
                            return parts[idx + 1]
                    return parts[-1] if parts[-1] else (parts[-2] if len(parts) > 1 else None)

            return path.split('/')[-1] if path else None
        except Exception:
            # Best-effort extraction; a missing account name is not an error.
            return None

    def _add_place_data(self, g: Graph, custodian_uri: URIRef, entry: dict,
                        ghcid_numeric: str):
        """Add geographic/place data.

        Coordinates come from Google Maps enrichment, falling back to
        Wikidata; a GeoNames containment link comes from the GHCID
        location resolution.
        """
        # Get coordinates from Google Maps or Wikidata
        google = entry.get('google_maps_enrichment', {})
        coords = google.get('coordinates', {})
        if not coords:
            wikidata = entry.get('wikidata_enrichment', {})
            coords = wikidata.get('wikidata_coordinates', {})

        if coords:
            lat = coords.get('latitude')
            lon = coords.get('longitude')
            # Explicit None checks: latitude/longitude 0.0 (equator/prime
            # meridian) are valid coordinates and must not be discarded.
            if lat is not None and lon is not None:
                # Create Place instance
                place_uri = URIRef(f"{HC}place/{ghcid_numeric}")
                g.add((place_uri, RDF.type, SCHEMA.Place))
                g.add((place_uri, SCHEMA.latitude, Literal(lat, datatype=XSD.decimal)))
                g.add((place_uri, SCHEMA.longitude, Literal(lon, datatype=XSD.decimal)))

                # Add address if available
                address = google.get('formatted_address')
                if address:
                    g.add((place_uri, SCHEMA.address, Literal(address)))

                # Link to custodian
                g.add((custodian_uri, CRM.P53_has_former_or_current_location, place_uri))

        # Add GeoNames ID from location resolution
        ghcid_data = entry.get('ghcid', {})
        loc_resolution = ghcid_data.get('location_resolution', {})
        geonames_id = loc_resolution.get('geonames_id')
        if geonames_id:
            geonames_uri = URIRef(f"https://sws.geonames.org/{geonames_id}/")
            g.add((custodian_uri, SCHEMA.containedInPlace, geonames_uri))

    def _add_timestamps(self, g: Graph, custodian_uri: URIRef, entry: dict):
        """Add created/modified timestamps.

        processing_timestamp -> schema:dateCreated;
        provenance.generated_at -> schema:dateModified.
        """
        processing_ts = entry.get('processing_timestamp')
        if processing_ts:
            g.add((custodian_uri, SCHEMA.dateCreated,
                   Literal(processing_ts, datatype=XSD.dateTime)))

        # Use provenance timestamp as modified
        provenance = entry.get('provenance', {})
        generated_at = provenance.get('generated_at')
        if generated_at:
            g.add((custodian_uri, SCHEMA.dateModified,
                   Literal(generated_at, datatype=XSD.dateTime)))

    def _write_graph(self, graph: Graph, ghcid_numeric: str) -> Path:
        """Serialize *graph* to the output path for *ghcid_numeric*.

        Honors dry_run (logs instead of writing). Returns the output path.
        """
        output_path = RDF_OUTPUT_DIR / f"{ghcid_numeric}.ttl"
        if not self.dry_run:
            graph.serialize(destination=str(output_path), format='turtle')
            logger.info(f"  -> Wrote {output_path.name} ({len(graph)} triples)")
        else:
            logger.info(f"  -> [DRY-RUN] Would write {output_path.name} ({len(graph)} triples)")
        return output_path

    def transform_all(self):
        """Transform all NDE enriched entries and write per-entry .ttl files."""
        # Ensure output directory exists
        RDF_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

        # Get all entry files
        entry_files = sorted(NDE_ENRICHED_DIR.glob("*.yaml"))
        total = len(entry_files)
        logger.info(f"Found {total} NDE enriched entries to transform")

        for idx, entry_path in enumerate(entry_files, 1):
            self.stats['processed'] += 1
            logger.info(f"[{idx}/{total}] Transforming {entry_path.name}")

            # _transform_file parses the YAML once and returns the GHCID,
            # so there is no second read of the entry file here.
            result = self._transform_file(entry_path)
            if result:
                graph, ghcid_numeric = result
                self._write_graph(graph, ghcid_numeric)
                self.stats['success'] += 1

        # Summary
        logger.info("=" * 60)
        logger.info("Transformation complete!")
        logger.info(f"  Processed: {self.stats['processed']}")
        logger.info(f"  Success: {self.stats['success']}")
        logger.info(f"  Skipped: {self.stats['skipped']}")
        logger.info(f"  Errors: {self.stats['errors']}")

    def transform_single(self, entry_index: str):
        """Transform a single entry by index (e.g., '0946') and print the RDF."""
        # Find the entry file
        pattern = f"{entry_index}_*.yaml"
        matches = list(NDE_ENRICHED_DIR.glob(pattern))

        if not matches:
            logger.error(f"No entry found matching {pattern}")
            return

        entry_path = matches[0]
        logger.info(f"Transforming single entry: {entry_path.name}")

        result = self._transform_file(entry_path)
        if result:
            graph, ghcid_numeric = result
            RDF_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
            self._write_graph(graph, ghcid_numeric)

            # Also print the RDF for inspection
            print("\n" + "=" * 60)
            if self.dry_run:
                print("Generated RDF (Turtle) [DRY-RUN]:")
            else:
                print("Generated RDF (Turtle):")
            print("=" * 60)
            print(graph.serialize(format='turtle'))


def main():
    """CLI entry point: parse arguments and run the transformer."""
    parser = argparse.ArgumentParser(
        description="Transform NDE enriched YAML to Heritage Custodian RDF"
    )
    parser.add_argument(
        '--entry', '-e',
        help="Transform single entry by index (e.g., '0946')"
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help="Preview without writing files"
    )
    args = parser.parse_args()

    transformer = NDEToHCTransformer(dry_run=args.dry_run)

    if args.entry:
        transformer.transform_single(args.entry)
    else:
        transformer.transform_all()


if __name__ == '__main__':
    main()