glam/scripts/nde_to_hc_rdf.py
2025-12-02 14:36:01 +01:00

588 lines
23 KiB
Python

#!/usr/bin/env python3
"""
NDE Enriched YAML to Heritage Custodian RDF Transformer
Converts enriched NDE heritage custodian data from YAML format to RDF (Turtle),
aligned with the Heritage Custodian ontology.
Output: data/nde/rdf/{ghcid_numeric}.ttl
Usage:
python scripts/nde_to_hc_rdf.py # Transform all entries
python scripts/nde_to_hc_rdf.py --entry 0946 # Transform single entry
python scripts/nde_to_hc_rdf.py --dry-run # Preview without writing
Author: GLAM Data Extraction Project
Date: 2025-12-02
"""
import argparse
import logging
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Optional
from urllib.parse import quote
import yaml
from rdflib import Graph, Literal, Namespace, URIRef
from rdflib.namespace import DCTERMS, FOAF, RDF, RDFS, SKOS, XSD
# Configure logging
# Root logger at INFO so per-entry progress and warnings are visible
# during batch runs; all module code logs through `logger` below.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Project paths
# Resolved relative to this file: the script is assumed to live one
# directory below the project root (e.g. scripts/).
PROJECT_ROOT = Path(__file__).parent.parent
NDE_ENRICHED_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"  # input YAML entries
RDF_OUTPUT_DIR = PROJECT_ROOT / "data" / "nde" / "rdf"  # output {ghcid}.ttl files
# Namespaces
HC = Namespace("https://nde.nl/ontology/hc/")  # Heritage Custodian properties + instance URIs
HCC = Namespace("https://nde.nl/ontology/hc/class/")  # Heritage Custodian classes
SCHEMA = Namespace("http://schema.org/")
CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")  # CIDOC-CRM (E39_Actor, E42_Identifier, ...)
PROV = Namespace("http://www.w3.org/ns/prov#")  # provenance of scraped claims
WD = Namespace("http://www.wikidata.org/entity/")  # owl:sameAs targets
ORG = Namespace("http://www.w3.org/ns/org#")
# NDE type code to CustodianPrimaryTypeEnum mapping
# Single-letter codes from the original NDE entry's 'type' field.
# Codes not present here degrade to 'UNSPECIFIED' (see _add_custodian_type).
TYPE_CODE_MAP = {
'G': 'GALLERY',
'L': 'LIBRARY',
'A': 'ARCHIVE',
'M': 'MUSEUM',
'O': 'OFFICIAL_INSTITUTION',
'R': 'RESEARCH_CENTER',
'C': 'COMMERCIAL',
'U': 'UNSPECIFIED',
'B': 'BIO_CUSTODIAN',
'E': 'EDUCATION_PROVIDER',
'S': 'HERITAGE_SOCIETY',
'F': 'FEATURE_CUSTODIAN',
'I': 'INTANGIBLE_HERITAGE_GROUP',
'X': 'MIXED',
'P': 'PERSONAL_COLLECTION',
'H': 'HOLY_SACRED_SITE',
'D': 'DIGITAL_PLATFORM',
'N': 'NON_PROFIT',
'T': 'TASTE_SCENT_HERITAGE',
}
# Social media claim type to SocialMediaPlatformTypeEnum mapping
# Keys are web-claim 'claim_type' values (all prefixed 'social_');
# note 'social_twitter' and 'social_x' intentionally collapse to the
# same X_TWITTER platform value.
SOCIAL_CLAIM_TYPE_MAP = {
'social_facebook': 'FACEBOOK',
'social_instagram': 'INSTAGRAM',
'social_linkedin': 'LINKEDIN',
'social_youtube': 'YOUTUBE',
'social_twitter': 'X_TWITTER',
'social_x': 'X_TWITTER',
'social_tiktok': 'TIKTOK',
'social_pinterest': 'PINTEREST',
'social_flickr': 'FLICKR',
'social_vimeo': 'VIMEO',
'social_threads': 'THREADS',
'social_bluesky': 'BLUESKY',
'social_mastodon': 'MASTODON',
}
# Identifier scheme to URIs
# Recognised identifier schemes and their base URI / URL prefixes.
# _add_identifiers only emits Wikidata-derived identifiers whose
# upper-cased scheme name appears here; several GHCID variants
# deliberately share the same base ('urn:uuid:' / the hc namespace).
IDENTIFIER_SCHEME_MAP = {
'ISIL': 'https://www.iso.org/standard/77849.html',
'Wikidata': 'https://www.wikidata.org/wiki/',
'VIAF': 'https://viaf.org/viaf/',
'GND': 'https://d-nb.info/gnd/',
'ISNI': 'https://isni.org/isni/',
'LCNAF': 'https://id.loc.gov/authorities/names/',
'ROR': 'https://ror.org/',
'GHCID': 'https://nde.nl/ontology/hc/',
'GHCID_UUID': 'urn:uuid:',
'GHCID_UUID_SHA256': 'urn:uuid:',
'GHCID_NUMERIC': 'https://nde.nl/ontology/hc/',
'RECORD_ID': 'urn:uuid:',
'Ringgold': 'https://www.ringgold.com/identify/',
}
class NDEToHCTransformer:
    """Transform NDE enriched YAML entries into Heritage Custodian RDF.

    One enriched YAML file yields one rdflib Graph centred on a custodian
    "hub" URI derived from the entry's numeric GHCID.  Running totals
    (processed / success / errors / skipped) are kept in ``self.stats``
    and reported by :meth:`transform_all`.
    """

    def __init__(self, dry_run: bool = False):
        """Initialise the transformer.

        Args:
            dry_run: When True, graphs are built and logged but no .ttl
                files are written to disk.
        """
        self.dry_run = dry_run
        # Counters summarised at the end of transform_all().
        self.stats = {
            'processed': 0,
            'success': 0,
            'errors': 0,
            'skipped': 0,
        }

    def transform_entry(self, entry_path: Path,
                        entry: Optional[dict] = None) -> Optional[Graph]:
        """Transform a single NDE entry to an RDF Graph.

        Args:
            entry_path: Path to the enriched YAML entry file.
            entry: Optional pre-parsed YAML mapping for *entry_path*.
                When supplied the file is not read again (callers that
                already loaded it avoid a second parse); when omitted the
                file is opened and parsed here.

        Returns:
            The populated Graph, or None when the entry is empty, has no
            numeric GHCID, or an error occurred (errors are logged and
            counted, never raised).
        """
        try:
            if entry is None:
                with open(entry_path, 'r', encoding='utf-8') as f:
                    entry = yaml.safe_load(f)
            if not entry:
                logger.warning(f"Empty entry: {entry_path}")
                return None

            # The numeric GHCID doubles as the URI local name and the
            # output filename, so entries without one are skipped.
            ghcid_data = entry.get('ghcid', {})
            ghcid_numeric = ghcid_data.get('ghcid_numeric')
            if not ghcid_numeric:
                logger.warning(f"No GHCID numeric in {entry_path}, skipping")
                self.stats['skipped'] += 1
                return None

            # Fresh graph with all prefixes bound for readable Turtle.
            g = Graph()
            for prefix, ns in (
                ('hc', HC), ('hcc', HCC), ('schema', SCHEMA),
                ('crm', CRM), ('prov', PROV), ('foaf', FOAF),
                ('skos', SKOS), ('dcterms', DCTERMS), ('wd', WD),
                ('org', ORG),
            ):
                g.bind(prefix, ns)

            # Custodian hub URI, typed both as a CIDOC-CRM actor and as
            # an hcc:Custodian.
            custodian_uri = URIRef(f"{HC}{ghcid_numeric}")
            g.add((custodian_uri, RDF.type, CRM.E39_Actor))
            g.add((custodian_uri, RDF.type, HCC.Custodian))
            g.add((custodian_uri, DCTERMS.identifier,
                   Literal(str(ghcid_numeric))))

            # Delegate each facet of the entry to a dedicated helper.
            self._add_preferred_label(g, custodian_uri, entry)
            self._add_custodian_type(g, custodian_uri, entry)
            self._add_identifiers(g, custodian_uri, entry, ghcid_numeric)
            self._add_digital_platform(g, custodian_uri, entry, ghcid_numeric)
            self._add_social_media_profiles(g, custodian_uri, entry,
                                            ghcid_numeric)
            self._add_place_data(g, custodian_uri, entry, ghcid_numeric)
            self._add_timestamps(g, custodian_uri, entry)
            return g
        except Exception as e:
            # Best-effort batch processing: log, count, and move on.
            logger.error(f"Error transforming {entry_path}: {e}")
            self.stats['errors'] += 1
            return None

    def _add_preferred_label(self, g: Graph, custodian_uri: URIRef,
                             entry: dict):
        """Add skos:prefLabel and, when different, an English skos:altLabel.

        Label priority: custodian_name claim > Wikidata NL/EN label >
        original NDE entry's 'organisatie' field.
        """
        label = None
        custodian_name = entry.get('custodian_name', {})
        if custodian_name:
            label = custodian_name.get('claim_value')
        if not label:
            wikidata = entry.get('wikidata_enrichment', {})
            label = (wikidata.get('wikidata_label_nl')
                     or wikidata.get('wikidata_label_en'))
        if not label:
            original = entry.get('original_entry', {})
            label = original.get('organisatie')
        if label:
            # NOTE(review): the prefLabel is tagged 'nl' even when it came
            # from the English Wikidata fallback — confirm this is wanted.
            g.add((custodian_uri, SKOS.prefLabel, Literal(label, lang='nl')))
            # Also add English label if available and distinct.
            wikidata = entry.get('wikidata_enrichment', {})
            en_label = wikidata.get('wikidata_label_en')
            if en_label and en_label != label:
                g.add((custodian_uri, SKOS.altLabel,
                       Literal(en_label, lang='en')))

    def _add_custodian_type(self, g: Graph, custodian_uri: URIRef,
                            entry: dict):
        """Map NDE single-letter type codes to custodian_type literals."""
        original = entry.get('original_entry', {})
        type_codes = original.get('type', [])
        if not type_codes:
            return
        for code in type_codes:
            # Unknown codes degrade to UNSPECIFIED instead of being dropped.
            type_enum = TYPE_CODE_MAP.get(code, 'UNSPECIFIED')
            g.add((custodian_uri, HC.custodian_type, Literal(type_enum)))

    def _add_identifiers(self, g: Graph, custodian_uri: URIRef, entry: dict,
                         ghcid_numeric: str):
        """Add CustodianIdentifier (crm:E42_Identifier) instances.

        Sources, in order: the entry's own 'identifiers' list, the
        Wikidata entity id (also emitted as owl:sameAs), and any extra
        identifiers harvested from Wikidata whose scheme is known to
        IDENTIFIER_SCHEME_MAP.
        """
        # From entry.identifiers list
        identifiers = entry.get('identifiers', [])
        for ident in identifiers:
            scheme = ident.get('identifier_scheme')
            value = ident.get('identifier_value')
            if not scheme or not value:
                continue
            # One identifier node per (custodian, scheme).
            ident_uri = URIRef(f"{HC}identifier/{ghcid_numeric}/{scheme.lower()}")
            g.add((ident_uri, RDF.type, CRM.E42_Identifier))
            g.add((ident_uri, SKOS.inScheme, Literal(scheme)))
            g.add((ident_uri, SKOS.notation, Literal(str(value))))
            # Link to custodian (both directions).
            g.add((custodian_uri, CRM.P48_has_preferred_identifier, ident_uri))
            g.add((ident_uri, CRM.P48i_is_preferred_identifier_of,
                   custodian_uri))
            # Add identifier URL if available
            url = ident.get('identifier_url')
            if url:
                g.add((ident_uri, SCHEMA.url, URIRef(url)))

        # Wikidata identifiers from enrichment
        wikidata = entry.get('wikidata_enrichment', {})
        wikidata_id = wikidata.get('wikidata_entity_id')
        if wikidata_id:
            wd_ident_uri = URIRef(f"{HC}identifier/{ghcid_numeric}/wikidata")
            g.add((wd_ident_uri, RDF.type, CRM.E42_Identifier))
            g.add((wd_ident_uri, SKOS.inScheme, Literal('Wikidata')))
            g.add((wd_ident_uri, SKOS.notation, Literal(wikidata_id)))
            g.add((wd_ident_uri, SCHEMA.url,
                   URIRef(f"https://www.wikidata.org/wiki/{wikidata_id}")))
            g.add((custodian_uri, CRM.P48_has_preferred_identifier,
                   wd_ident_uri))
            # Also assert identity with the Wikidata entity itself.
            g.add((custodian_uri,
                   URIRef("http://www.w3.org/2002/07/owl#sameAs"),
                   WD[wikidata_id]))

        # Additional Wikidata identifiers (VIAF, GND, ISNI, etc.)
        wd_identifiers = wikidata.get('wikidata_identifiers', {})
        for scheme, value in wd_identifiers.items():
            scheme_upper = scheme.upper()
            # Only emit schemes we recognise.
            if scheme_upper in IDENTIFIER_SCHEME_MAP:
                ext_ident_uri = URIRef(
                    f"{HC}identifier/{ghcid_numeric}/{scheme.lower()}")
                g.add((ext_ident_uri, RDF.type, CRM.E42_Identifier))
                g.add((ext_ident_uri, SKOS.inScheme, Literal(scheme_upper)))
                g.add((ext_ident_uri, SKOS.notation, Literal(str(value))))
                g.add((custodian_uri, CRM.P48_has_preferred_identifier,
                       ext_ident_uri))

    def _add_digital_platform(self, g: Graph, custodian_uri: URIRef,
                              entry: dict, ghcid_numeric: str):
        """Add a DigitalPlatform node for the custodian's website.

        Website priority: Wikidata official website > Google Maps website.
        Also attaches online-catalog URLs from Wikidata claim P8768.
        """
        website = None
        wikidata = entry.get('wikidata_enrichment', {})
        website = wikidata.get('wikidata_official_website')
        if not website:
            google = entry.get('google_maps_enrichment', {})
            website = google.get('website')
        if not website:
            return
        # Create DigitalPlatform instance
        platform_uri = URIRef(f"{HC}platform/{ghcid_numeric}/website")
        g.add((platform_uri, RDF.type, HCC.DigitalPlatform))
        g.add((platform_uri, FOAF.homepage, URIRef(website)))
        g.add((platform_uri, SCHEMA.url, URIRef(website)))
        # Link to custodian
        g.add((custodian_uri, FOAF.homepage, platform_uri))
        # Add online catalog URL(s) if available; the claim value may be a
        # single URL or a list of URLs.
        claims = wikidata.get('wikidata_claims', {})
        catalog_claim = claims.get('P8768_online_catalog_url', {})
        catalog_values = (catalog_claim.get('value')
                          if isinstance(catalog_claim, dict) else None)
        if catalog_values:
            if isinstance(catalog_values, list):
                for catalog_url in catalog_values:
                    if catalog_url:
                        g.add((platform_uri, HC.collection_url,
                               URIRef(catalog_url)))
            else:
                g.add((platform_uri, HC.collection_url,
                       URIRef(catalog_values)))

    def _add_social_media_profiles(self, g: Graph, custodian_uri: URIRef,
                                   entry: dict, ghcid_numeric: str):
        """Add SocialMediaProfile instances from web_claims.

        Scraped 'social_*' claims become foaf:OnlineAccount /
        hcc:SocialMediaProfile nodes with PROV provenance; a Wikidata
        Twitter username (P2002) is added only when the web claims did
        not already yield an X/Twitter profile.
        """
        web_claims = entry.get('web_claims', {})
        claims = web_claims.get('claims', [])
        for claim in claims:
            claim_type = claim.get('claim_type', '')
            if not claim_type.startswith('social_'):
                continue
            platform_type = SOCIAL_CLAIM_TYPE_MAP.get(claim_type)
            if not platform_type:
                continue
            profile_url = claim.get('claim_value')
            if not profile_url:
                continue
            # Heuristic filters: share links, intent URLs, and URLs with
            # query parameters are not stable profile pages.
            if ('/share?' in profile_url or '/intent/' in profile_url
                    or '&' in profile_url):
                logger.debug(f"Skipping non-profile URL: {profile_url}")
                continue
            # A space means the scraper captured surrounding text, not a URL.
            if ' ' in profile_url:
                logger.debug(f"Skipping URL with spaces: {profile_url}")
                continue
            # Create SocialMediaProfile instance (one per platform type).
            profile_uri = URIRef(
                f"{HC}social/{ghcid_numeric}/{platform_type.lower()}")
            g.add((profile_uri, RDF.type, FOAF.OnlineAccount))
            g.add((profile_uri, RDF.type, HCC.SocialMediaProfile))
            g.add((profile_uri, HC.platform_type, Literal(platform_type)))
            g.add((profile_uri, FOAF.accountServiceHomepage,
                   URIRef(profile_url)))
            # Extract account name from URL
            account_name = self._extract_account_name(profile_url,
                                                      platform_type)
            if account_name:
                g.add((profile_uri, FOAF.accountName, Literal(account_name)))
            # Add provenance from web claim
            source_url = claim.get('source_url')
            if source_url:
                g.add((profile_uri, PROV.wasDerivedFrom, URIRef(source_url)))
            retrieved_on = claim.get('retrieved_on')
            if retrieved_on:
                g.add((profile_uri, PROV.generatedAtTime,
                       Literal(retrieved_on, datatype=XSD.dateTime)))
            # Link to custodian
            g.add((custodian_uri, FOAF.account, profile_uri))

        # Also check Wikidata for Twitter username
        wikidata = entry.get('wikidata_enrichment', {})
        claims_wd = wikidata.get('wikidata_claims', {})
        twitter_claim = claims_wd.get('P2002_x__twitter__username', {})
        twitter_value = (twitter_claim.get('value')
                         if isinstance(twitter_claim, dict) else None)
        if twitter_value:
            # Claim value may be a single username or a list (take first).
            if isinstance(twitter_value, list):
                twitter_username = twitter_value[0] if twitter_value else None
            else:
                twitter_username = twitter_value
            if twitter_username:
                # Skip if web_claims already produced an X/Twitter profile.
                existing_twitter_uri = URIRef(
                    f"{HC}social/{ghcid_numeric}/x_twitter")
                if (existing_twitter_uri, RDF.type,
                        FOAF.OnlineAccount) not in g:
                    g.add((existing_twitter_uri, RDF.type,
                           FOAF.OnlineAccount))
                    g.add((existing_twitter_uri, RDF.type,
                           HCC.SocialMediaProfile))
                    g.add((existing_twitter_uri, HC.platform_type,
                           Literal('X_TWITTER')))
                    g.add((existing_twitter_uri, FOAF.accountName,
                           Literal(twitter_username)))
                    g.add((existing_twitter_uri, FOAF.accountServiceHomepage,
                           URIRef(f"https://x.com/{twitter_username}")))
                    g.add((custodian_uri, FOAF.account,
                           existing_twitter_uri))

    def _extract_account_name(self, url: str,
                              platform_type: str) -> Optional[str]:
        """Extract the account/handle name from a social media profile URL.

        Returns None when the URL cannot be parsed or yields no usable
        path component.
        """
        try:
            from urllib.parse import urlparse
            parsed = urlparse(url)
            path = parsed.path.strip('/')
            if platform_type in ('FACEBOOK', 'INSTAGRAM', 'LINKEDIN',
                                 'YOUTUBE'):
                # Usually the last path component is the handle.
                parts = path.split('/')
                if parts:
                    # Handle linkedin.com/company/name format
                    if platform_type == 'LINKEDIN' and len(parts) >= 2:
                        return parts[-1]
                    # Handle youtube.com/channel/ID format
                    if platform_type == 'YOUTUBE' and 'channel' in parts:
                        idx = parts.index('channel')
                        if idx + 1 < len(parts):
                            return parts[idx + 1]
                    # Fall back to the last non-empty component.
                    return parts[-1] if parts[-1] else (
                        parts[-2] if len(parts) > 1 else None)
            return path.split('/')[-1] if path else None
        except Exception:
            # Malformed URL: no account name rather than a crash.
            return None

    def _add_place_data(self, g: Graph, custodian_uri: URIRef, entry: dict,
                        ghcid_numeric: str):
        """Add geographic data: a schema:Place with coordinates/address,
        plus a GeoNames containment link when location resolution found one.

        Coordinate priority: Google Maps enrichment > Wikidata enrichment.
        """
        google = entry.get('google_maps_enrichment', {})
        coords = google.get('coordinates', {})
        if not coords:
            wikidata = entry.get('wikidata_enrichment', {})
            coords = wikidata.get('wikidata_coordinates', {})
        if coords:
            lat = coords.get('latitude')
            lon = coords.get('longitude')
            # Explicit None checks: 0.0 is a valid latitude/longitude and
            # must not be dropped by truthiness (bug fix vs `if lat and lon`).
            if lat is not None and lon is not None:
                place_uri = URIRef(f"{HC}place/{ghcid_numeric}")
                g.add((place_uri, RDF.type, SCHEMA.Place))
                g.add((place_uri, SCHEMA.latitude,
                       Literal(lat, datatype=XSD.decimal)))
                g.add((place_uri, SCHEMA.longitude,
                       Literal(lon, datatype=XSD.decimal)))
                # Address is only available from the Google Maps source.
                address = google.get('formatted_address')
                if address:
                    g.add((place_uri, SCHEMA.address, Literal(address)))
                # Link to custodian
                g.add((custodian_uri,
                       CRM.P53_has_former_or_current_location, place_uri))
        # Add GeoNames ID from location resolution
        ghcid_data = entry.get('ghcid', {})
        loc_resolution = ghcid_data.get('location_resolution', {})
        geonames_id = loc_resolution.get('geonames_id')
        if geonames_id:
            geonames_uri = URIRef(f"https://sws.geonames.org/{geonames_id}/")
            g.add((custodian_uri, SCHEMA.containedInPlace, geonames_uri))

    def _add_timestamps(self, g: Graph, custodian_uri: URIRef, entry: dict):
        """Add schema:dateCreated / schema:dateModified timestamps."""
        processing_ts = entry.get('processing_timestamp')
        if processing_ts:
            g.add((custodian_uri, SCHEMA.dateCreated,
                   Literal(processing_ts, datatype=XSD.dateTime)))
        # The provenance generation time doubles as "last modified".
        provenance = entry.get('provenance', {})
        generated_at = provenance.get('generated_at')
        if generated_at:
            g.add((custodian_uri, SCHEMA.dateModified,
                   Literal(generated_at, datatype=XSD.dateTime)))

    def transform_all(self):
        """Transform every enriched entry and log summary statistics."""
        # Ensure output directory exists
        RDF_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        entry_files = sorted(NDE_ENRICHED_DIR.glob("*.yaml"))
        total = len(entry_files)
        logger.info(f"Found {total} NDE enriched entries to transform")
        for idx, entry_path in enumerate(entry_files, 1):
            self.stats['processed'] += 1
            logger.info(f"[{idx}/{total}] Transforming {entry_path.name}")
            # Parse the YAML once and pass it down, so the file is not
            # read a second time just to recover the GHCID filename.
            try:
                with open(entry_path, 'r', encoding='utf-8') as f:
                    entry = yaml.safe_load(f)
            except Exception as e:
                logger.error(f"Error transforming {entry_path}: {e}")
                self.stats['errors'] += 1
                continue
            graph = self.transform_entry(entry_path, entry=entry)
            if graph:
                # A returned graph implies the entry had a numeric GHCID.
                ghcid_numeric = (entry or {}).get('ghcid', {}).get(
                    'ghcid_numeric')
                if ghcid_numeric:
                    output_path = RDF_OUTPUT_DIR / f"{ghcid_numeric}.ttl"
                    if not self.dry_run:
                        graph.serialize(destination=str(output_path),
                                        format='turtle')
                        logger.info(f" -> Wrote {output_path.name} "
                                    f"({len(graph)} triples)")
                    else:
                        logger.info(f" -> [DRY-RUN] Would write "
                                    f"{output_path.name} "
                                    f"({len(graph)} triples)")
                    self.stats['success'] += 1
        # Summary
        logger.info("=" * 60)
        logger.info("Transformation complete!")
        logger.info(f" Processed: {self.stats['processed']}")
        logger.info(f" Success: {self.stats['success']}")
        logger.info(f" Skipped: {self.stats['skipped']}")
        logger.info(f" Errors: {self.stats['errors']}")

    def transform_single(self, entry_index: str):
        """Transform a single entry by index (e.g., '0946').

        The index matches enriched files named '{index}_*.yaml'; the
        resulting Turtle is both written (unless dry-run) and printed.
        """
        pattern = f"{entry_index}_*.yaml"
        matches = list(NDE_ENRICHED_DIR.glob(pattern))
        if not matches:
            logger.error(f"No entry found matching {pattern}")
            return
        entry_path = matches[0]
        logger.info(f"Transforming single entry: {entry_path.name}")
        # Parse once and share with transform_entry (avoids a re-read).
        try:
            with open(entry_path, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)
        except Exception as e:
            logger.error(f"Error transforming {entry_path}: {e}")
            self.stats['errors'] += 1
            return
        graph = self.transform_entry(entry_path, entry=entry)
        if graph:
            ghcid_numeric = (entry or {}).get('ghcid', {}).get(
                'ghcid_numeric')
            if ghcid_numeric:
                RDF_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
                output_path = RDF_OUTPUT_DIR / f"{ghcid_numeric}.ttl"
                if not self.dry_run:
                    graph.serialize(destination=str(output_path),
                                    format='turtle')
                    logger.info(f"Wrote {output_path.name} "
                                f"({len(graph)} triples)")
                    # Also print the RDF
                    print("\n" + "=" * 60)
                    print("Generated RDF (Turtle):")
                    print("=" * 60)
                    print(graph.serialize(format='turtle'))
                else:
                    logger.info(f"[DRY-RUN] Would write {output_path.name}")
                    print("\n" + "=" * 60)
                    print("Generated RDF (Turtle) [DRY-RUN]:")
                    print("=" * 60)
                    print(graph.serialize(format='turtle'))
def main():
    """Command-line entry point: parse arguments and run the transformer."""
    parser = argparse.ArgumentParser(
        description="Transform NDE enriched YAML to Heritage Custodian RDF"
    )
    parser.add_argument(
        '--entry', '-e',
        help="Transform single entry by index (e.g., '0946')"
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help="Preview without writing files"
    )
    args = parser.parse_args()

    transformer = NDEToHCTransformer(dry_run=args.dry_run)
    # --entry limits the run to one file; otherwise all entries are done.
    if args.entry:
        transformer.transform_single(args.entry)
    else:
        transformer.transform_all()


if __name__ == '__main__':
    main()