# glam/scripts/export_denmark_rdf_enriched.py
# Snapshot metadata: 2025-11-19 23:25:22 +01:00 — 299 lines, 12 KiB, Python
"""
Export Danish GLAM dataset to RDF (Turtle, RDF/XML, JSON-LD)
This script converts the denmark_complete.json dataset into Linked Open Data formats
aligned with CPOV (Core Public Organisation Vocabulary), Schema.org, and heritage ontologies.
"""
import json
import re
from pathlib import Path
from typing import Dict, List, Optional
from rdflib import Graph, Namespace, Literal, URIRef, BNode
from rdflib.namespace import RDF, RDFS, SKOS, DCTERMS, FOAF, XSD, OWL
# Namespaces for the vocabularies this export writes (or binds as prefixes).
HERITAGE = Namespace("https://w3id.org/heritage/custodian/")      # project custodian vocabulary (GHCID terms)
SCHEMA = Namespace("http://schema.org/")                          # Schema.org (http form)
CPOV = Namespace("http://data.europa.eu/m8g/")                    # EU Core Public Organisation Vocabulary
TOOI = Namespace("https://identifier.overheid.nl/tooi/def/ont/")  # Dutch government ontology (prefix binding only in this script)
ORG = Namespace("http://www.w3.org/ns/org#")                      # W3C Organization Ontology
PROV = Namespace("http://www.w3.org/ns/prov#")                    # W3C PROV-O provenance
RICO = Namespace("https://www.ica.org/standards/RiC/ontology#")   # ICA Records in Contexts ontology
GEONAMES = Namespace("http://www.geonames.org/ontology#")         # GeoNames (prefix binding only in this script)
WIKIDATA = Namespace("http://www.wikidata.org/entity/")           # Wikidata entity URIs
VIAF = Namespace("http://viaf.org/viaf/")                         # VIAF authority URIs
def parse_identifier_string(identifier_str: str) -> Optional[Dict[str, str]]:
    """Extract scheme, value and URL from a repr()-style identifier string.

    The dataset stores identifiers as stringified dicts; this mines the
    single-quoted field values out with regexes rather than eval-ing them.

    Returns:
        A dict with keys 'scheme', 'value' and 'url' ('url' may be None),
        or None when the input is missing, not a string, or lacks both a
        scheme and a value.
    """
    if not identifier_str or not isinstance(identifier_str, str):
        return None

    def grab(field: str) -> Optional[str]:
        # Pull the non-empty single-quoted value of one dict field.
        match = re.search(rf"'{field}':\s*'([^']+)'", identifier_str)
        return match.group(1) if match else None

    scheme = grab('identifier_scheme')
    value = grab('identifier_value')
    if scheme is None or value is None:
        return None
    return {'scheme': scheme, 'value': value, 'url': grab('identifier_url')}
def parse_location_string(location_str: str) -> Optional[Dict]:
    """Extract address fields from a repr()-style location string.

    Looks for the four known address fields; empty quoted values are kept
    (the patterns allow '' on purpose, matching the dataset's contents).

    Returns:
        A dict with whichever of 'city', 'street_address', 'postal_code'
        and 'country' were found, or None when the input is missing, not a
        string, or contained none of the fields.
    """
    if not location_str or not isinstance(location_str, str):
        return None
    parsed = {}
    for field in ('city', 'street_address', 'postal_code', 'country'):
        match = re.search(rf"'{field}':\s*'([^']*)'", location_str)
        if match:
            parsed[field] = match.group(1)
    return parsed or None
def export_institution_to_rdf(institution: Dict, graph: Graph) -> None:
    """Add all RDF triples describing one institution to *graph*.

    The record's ``id`` field is used verbatim as the subject URI; records
    without an ``id`` are skipped silently. Work is split into one private
    helper per group of related triples.

    Args:
        institution: One institution record from the JSON dataset.
        graph: Target rdflib Graph; mutated in place.
    """
    inst_id = institution.get('id')
    if not inst_id:
        return
    inst_uri = URIRef(inst_id)
    _add_type_triples(inst_uri, institution.get('institution_type', 'UNKNOWN'), graph)
    _add_label_triples(inst_uri, institution, graph)
    _add_identifier_triples(inst_uri, institution, graph)
    _add_location_triples(inst_uri, institution, graph)
    _add_hierarchy_triples(inst_uri, institution, graph)
    _add_provenance_triples(inst_uri, institution, graph)


def _add_type_triples(inst_uri: URIRef, inst_type: str, graph: Graph) -> None:
    """Multi-type the institution with Schema.org, CPOV and RiC-O classes."""
    if inst_type == 'LIBRARY':
        graph.add((inst_uri, RDF.type, SCHEMA.Library))
        graph.add((inst_uri, RDF.type, CPOV.PublicOrganisation))
    elif inst_type == 'ARCHIVE':
        graph.add((inst_uri, RDF.type, SCHEMA.ArchiveOrganization))
        graph.add((inst_uri, RDF.type, RICO.CorporateBody))
        graph.add((inst_uri, RDF.type, CPOV.PublicOrganisation))
    elif inst_type == 'MUSEUM':
        graph.add((inst_uri, RDF.type, SCHEMA.Museum))
        graph.add((inst_uri, RDF.type, CPOV.PublicOrganisation))
    else:
        # Unknown types fall back to the generic organisation class.
        graph.add((inst_uri, RDF.type, SCHEMA.Organization))


def _add_label_triples(inst_uri: URIRef, institution: Dict, graph: Graph) -> None:
    """Name, alternative names and description as Danish-tagged literals."""
    name = institution.get('name')
    if name:
        for predicate in (SKOS.prefLabel, SCHEMA.name, RDFS.label):
            graph.add((inst_uri, predicate, Literal(name, lang='da')))
    for alt_name in institution.get('alternative_names', []) or []:
        if alt_name:
            graph.add((inst_uri, SKOS.altLabel, Literal(alt_name, lang='da')))
            graph.add((inst_uri, SCHEMA.alternateName, Literal(alt_name, lang='da')))
    description = institution.get('description')
    if description:
        graph.add((inst_uri, DCTERMS.description, Literal(description, lang='da')))
        graph.add((inst_uri, SCHEMA.description, Literal(description, lang='da')))


def _add_identifier_triples(inst_uri: URIRef, institution: Dict, graph: Graph) -> None:
    """External identifiers (ISIL, Wikidata, VIAF, website, GHCID)."""
    for identifier_data in institution.get('identifiers', []):
        identifier = (parse_identifier_string(identifier_data)
                      if isinstance(identifier_data, str) else identifier_data)
        if not identifier or not isinstance(identifier, dict):
            continue
        scheme = identifier.get('scheme')
        value = identifier.get('value')
        url = identifier.get('url')
        # Bug fix: skip identifiers without a value — the original emitted
        # Literal(None) / URIRef(".../None") triples for the ISIL, Wikidata,
        # VIAF and VIP-basen branches when 'value' was missing.
        if not value:
            continue
        if scheme == 'ISIL':
            graph.add((inst_uri, DCTERMS.identifier, Literal(value)))
            graph.add((inst_uri, SCHEMA.identifier, Literal(value)))
            if url:
                graph.add((inst_uri, SCHEMA.sameAs, URIRef(url)))
        elif scheme == 'Wikidata':
            wikidata_uri = URIRef(f"http://www.wikidata.org/entity/{value}")
            graph.add((inst_uri, SCHEMA.sameAs, wikidata_uri))
            graph.add((inst_uri, OWL.sameAs, wikidata_uri))
        elif scheme == 'VIAF':
            viaf_uri = URIRef(f"http://viaf.org/viaf/{value}")
            graph.add((inst_uri, SCHEMA.sameAs, viaf_uri))
            graph.add((inst_uri, OWL.sameAs, viaf_uri))
        elif scheme == 'Website':
            graph.add((inst_uri, SCHEMA.url, URIRef(value)))
            graph.add((inst_uri, FOAF.homepage, URIRef(value)))
        elif scheme == 'VIP-basen Library Number':
            graph.add((inst_uri, DCTERMS.identifier, Literal(value)))
    # GHCID identifiers live directly on the record, not in 'identifiers'.
    ghcid = institution.get('ghcid_current')
    if ghcid:
        graph.add((inst_uri, HERITAGE.ghcid, Literal(ghcid)))
        graph.add((inst_uri, DCTERMS.identifier, Literal(ghcid)))
    ghcid_uuid = institution.get('ghcid_uuid')
    if ghcid_uuid:
        graph.add((inst_uri, HERITAGE.ghcidUUID, Literal(ghcid_uuid)))


def _add_location_triples(inst_uri: URIRef, institution: Dict, graph: Graph) -> None:
    """Postal addresses as schema:PostalAddress blank nodes."""
    field_predicates = (
        ('city', SCHEMA.addressLocality),
        ('street_address', SCHEMA.streetAddress),
        ('postal_code', SCHEMA.postalCode),
        ('country', SCHEMA.addressCountry),
    )
    for location_data in institution.get('locations', []):
        location = (parse_location_string(location_data)
                    if isinstance(location_data, str) else location_data)
        if not location or not isinstance(location, dict):
            continue
        address_node = BNode()
        graph.add((inst_uri, SCHEMA.address, address_node))
        graph.add((address_node, RDF.type, SCHEMA.PostalAddress))
        for field, predicate in field_predicates:
            field_value = location.get(field)
            if field_value:
                graph.add((address_node, predicate, Literal(field_value)))


def _add_hierarchy_triples(inst_uri: URIRef, institution: Dict, graph: Graph) -> None:
    """Bidirectional parent/child organisation links (ORG + Schema.org)."""
    parent_org = institution.get('parent_organization')
    if parent_org:
        parent_uri = URIRef(parent_org)
        graph.add((inst_uri, ORG.subOrganizationOf, parent_uri))
        graph.add((inst_uri, SCHEMA.parentOrganization, parent_uri))
        graph.add((parent_uri, ORG.hasSubOrganization, inst_uri))


def _add_provenance_triples(inst_uri: URIRef, institution: Dict, graph: Graph) -> None:
    """PROV activity recording the data source and extraction date."""
    provenance = institution.get('provenance')
    if not provenance:
        return
    prov_node = BNode()
    graph.add((inst_uri, PROV.wasGeneratedBy, prov_node))
    graph.add((prov_node, RDF.type, PROV.Activity))
    # Provenance may arrive as a repr() string; mine it with regexes.
    if not isinstance(provenance, str):
        return
    source_match = re.search(r"'data_source':\s*<DataSourceEnum\.([^:]+)", provenance)
    if source_match:
        graph.add((prov_node, DCTERMS.source, Literal(source_match.group(1))))
    date_match = re.search(r"'extraction_date':\s*datetime\.datetime\(([^)]+)\)", provenance)
    if date_match:
        # Bug fix: the original captured the datetime arguments but discarded
        # them and hard-coded "2025-11-19". Reconstruct the real extraction
        # date from the year/month/day positional arguments instead.
        try:
            year, month, day = (int(p) for p in date_match.group(1).split(',')[:3])
        except (ValueError, TypeError):
            return  # malformed datetime repr — skip rather than fabricate a date
        graph.add((prov_node, DCTERMS.created,
                   Literal(f"{year:04d}-{month:02d}-{day:02d}", datatype=XSD.date)))
def export_denmark_to_rdf(input_json: Path, output_dir: Path, format_type: str = 'ttl') -> None:
    """Export the Danish GLAM dataset to one or more RDF serializations.

    Loads the JSON array of institutions, builds a single rdflib Graph with
    ontology-header triples plus one subgraph per institution, then writes
    it to ``output_dir / denmark_complete.<ext>``.

    Args:
        input_json: Path to the JSON array of institution records.
        output_dir: Existing directory the serialized file(s) are written to.
        format_type: 'ttl', 'rdf', 'jsonld', 'nt', or 'all' for every format.
            Unknown values fall back to Turtle (as before).
    """
    print(f"Loading dataset from {input_json}...")
    # Explicit encoding: the dataset contains Danish text and must not depend
    # on the platform's default locale encoding.
    with open(input_json, 'r', encoding='utf-8') as f:
        institutions = json.load(f)
    print(f"Creating RDF graph for {len(institutions)} institutions...")
    g = Graph()
    # Bind prefixes so the serializations stay human-readable.
    bindings = {
        'heritage': HERITAGE, 'schema': SCHEMA, 'cpov': CPOV, 'tooi': TOOI,
        'org': ORG, 'prov': PROV, 'rico': RICO, 'skos': SKOS,
        'dcterms': DCTERMS, 'foaf': FOAF, 'geonames': GEONAMES,
        'wikidata': WIKIDATA, 'viaf': VIAF, 'owl': OWL,
    }
    for prefix, namespace in bindings.items():
        g.bind(prefix, namespace)
    # Self-describing ontology header.
    ontology_uri = URIRef("https://w3id.org/heritage/custodian/ontology")
    g.add((ontology_uri, RDF.type, OWL.Ontology))
    g.add((ontology_uri, RDFS.label, Literal("Heritage Custodian Ontology")))
    g.add((ontology_uri, DCTERMS.title, Literal("Heritage Custodian Ontology")))
    g.add((ontology_uri, DCTERMS.description, Literal(
        "Ontology for describing heritage institutions (GLAM - Galleries, Libraries, Archives, Museums) worldwide"
    )))
    # Export each institution, reporting progress every 100 records.
    for i, institution in enumerate(institutions, 1):
        if i % 100 == 0:
            print(f" Processed {i}/{len(institutions)} institutions...")
        export_institution_to_rdf(institution, g)
    print(f"\n✅ Graph contains {len(g)} triples")
    # (rdflib format name, file extension) keyed by the caller-facing name.
    formats = {
        'ttl': ('turtle', '.ttl'),
        'rdf': ('xml', '.rdf'),
        'jsonld': ('json-ld', '.jsonld'),
        'nt': ('nt', '.nt'),
    }
    # Single serialization loop — the original duplicated this code in the
    # 'all' branch and the single-format branch.
    if format_type == 'all':
        selected = list(formats.items())
    else:
        selected = [(format_type, formats.get(format_type, ('turtle', '.ttl')))]
    for fmt, (rdf_format, ext) in selected:
        output_file = output_dir / f"denmark_complete{ext}"
        print(f"\nSerializing to {fmt.upper()}...")
        g.serialize(destination=output_file, format=rdf_format)
        size_mb = output_file.stat().st_size / (1024 * 1024)
        print(f" ✅ Saved to {output_file} ({size_mb:.2f} MB)")
if __name__ == '__main__':
    # Paths are relative to the repository root.
    source_json = Path('data/instances/denmark_complete_enriched.json')
    rdf_output_dir = Path('data/rdf')
    rdf_output_dir.mkdir(parents=True, exist_ok=True)
    banner = "=" * 60
    print(banner)
    print("Danish GLAM Dataset → RDF Export")
    print(banner)
    # Emit every supported serialization in one pass.
    export_denmark_to_rdf(source_json, rdf_output_dir, format_type='all')
    print("\n" + banner)
    print("✅ RDF Export Complete")
    print(banner)