glam/scripts/generate_static_rdf_for_frontend.py
2025-12-23 13:27:35 +01:00

160 lines
6.7 KiB
Python

#!/usr/bin/env python3
"""
Generate a combined static Turtle file for frontend visualization fallback.
This creates a smaller subset of the full RDF data, optimized for Force-directed
graph visualization when the SPARQL endpoint is not available.
Output: frontend/public/data/nde_heritage_custodians.ttl
"""
import os
import sys
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, SKOS, XSD, OWL, FOAF, DCTERMS
# Namespaces (matching the actual data)
CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")  # CIDOC-CRM cultural heritage ontology
HC = Namespace("https://nde.nl/ontology/hc/")  # NDE heritage-custodian property namespace
HCC = Namespace("https://nde.nl/ontology/hc/class/")  # NDE heritage-custodian class namespace
SCHEMA = Namespace("http://schema.org/") # Data uses schema1: but it's the same URI
def _copy_custodian_triples(g, custodian, combined):
    """Copy one custodian's visualization-essential triples from *g* into *combined*.

    Copies the type triple, prefLabel, custodian type, dcterms identifier,
    Wikidata sameAs links, GeoNames containedInPlace links, homepage (plus
    its schema:url), social-media accounts (plus service homepage and
    platform type), and location places (plus coordinates and address).

    Returns:
        A ``(wikidata, website, geonames)`` tuple counting the enrichment
        links that were copied for this custodian.
    """
    wikidata = website = geonames = 0

    # Type triple
    combined.add((custodian, RDF.type, CRM.E39_Actor))
    # Label (prefLabel)
    for label in g.objects(custodian, SKOS.prefLabel):
        combined.add((custodian, SKOS.prefLabel, label))
    # Custodian type
    for ctype in g.objects(custodian, HC.custodian_type):
        combined.add((custodian, HC.custodian_type, ctype))
    # GHCID numeric identifier (dcterms:identifier)
    for identifier in g.objects(custodian, DCTERMS.identifier):
        combined.add((custodian, DCTERMS.identifier, identifier))
    # Wikidata link (owl:sameAs) — keep only Wikidata targets
    for same_as in g.objects(custodian, OWL.sameAs):
        if "wikidata.org" in str(same_as):
            combined.add((custodian, OWL.sameAs, same_as))
            wikidata += 1
    # GeoNames containedInPlace (schema1:containedInPlace)
    for place_ref in g.objects(custodian, SCHEMA.containedInPlace):
        if "geonames.org" in str(place_ref):
            combined.add((custodian, SCHEMA.containedInPlace, place_ref))
            geonames += 1
    # Website (foaf:homepage) and the URL of the homepage resource
    for homepage in g.objects(custodian, FOAF.homepage):
        combined.add((custodian, FOAF.homepage, homepage))
        for url in g.objects(homepage, SCHEMA.url):
            combined.add((homepage, SCHEMA.url, url))
            website += 1
    # Social media accounts (foaf:account) with their details
    for account in g.objects(custodian, FOAF.account):
        combined.add((custodian, FOAF.account, account))
        for account_url in g.objects(account, FOAF.accountServiceHomepage):
            combined.add((account, FOAF.accountServiceHomepage, account_url))
        for platform_type in g.objects(account, HC.platform_type):
            combined.add((account, HC.platform_type, platform_type))
    # Location places and their coordinates / address details
    for place in g.objects(custodian, CRM.P53_has_former_or_current_location):
        combined.add((custodian, CRM.P53_has_former_or_current_location, place))
        for lat in g.objects(place, SCHEMA.latitude):
            combined.add((place, SCHEMA.latitude, lat))
        for lon in g.objects(place, SCHEMA.longitude):
            combined.add((place, SCHEMA.longitude, lon))
        for city in g.objects(place, SCHEMA.addressLocality):
            combined.add((place, SCHEMA.addressLocality, city))
        for address in g.objects(place, SCHEMA.address):
            combined.add((place, SCHEMA.address, address))

    return wikidata, website, geonames


def main():
    """Combine essential RDF data from all TTL files into a single visualization file.

    Scans ``data/nde/rdf`` for ``*.ttl`` files, copies only the triples the
    frontend force-directed graph needs (labels, types, identifiers,
    Wikidata/GeoNames links, websites, social media, locations) into one
    combined graph, and writes it with a header comment to
    ``frontend/public/data/nde_heritage_custodians.ttl``.

    Exits with status 1 when the source RDF directory does not exist.
    """
    rdf_dir = Path(__file__).parent.parent / "data" / "nde" / "rdf"
    output_path = (
        Path(__file__).parent.parent
        / "frontend"
        / "public"
        / "data"
        / "nde_heritage_custodians.ttl"
    )
    if not rdf_dir.exists():
        print(f"Error: RDF directory not found: {rdf_dir}")
        sys.exit(1)

    # Create combined graph with visualization-essential triples; bind the
    # prefixes so the serialized Turtle matches the source data's style.
    combined = Graph()
    combined.bind("crm", CRM)
    combined.bind("hc", HC)
    combined.bind("hcc", HCC)
    combined.bind("skos", SKOS)
    combined.bind("schema1", SCHEMA)  # Match the source data prefix
    combined.bind("owl", OWL)
    combined.bind("foaf", FOAF)
    combined.bind("dcterms", DCTERMS)
    combined.bind("wd", Namespace("http://www.wikidata.org/entity/"))

    ttl_files = list(rdf_dir.glob("*.ttl"))
    print(f"Found {len(ttl_files)} TTL files to process...")

    processed = 0
    wikidata_count = 0
    website_count = 0
    geonames_count = 0
    for ttl_file in ttl_files:
        try:
            g = Graph()
            g.parse(ttl_file, format="turtle")
            # Extract visualization-essential triples for each custodian
            for custodian in g.subjects(RDF.type, CRM.E39_Actor):
                wd, web, geo = _copy_custodian_triples(g, custodian, combined)
                wikidata_count += wd
                website_count += web
                geonames_count += geo
            # Count per file, matching the "files" progress message below.
            processed += 1
            if processed % 100 == 0:
                print(f" Processed {processed}/{len(ttl_files)} files...")
        except Exception as e:
            # Best-effort: one malformed file must not abort the whole run.
            print(f" Warning: Could not parse {ttl_file.name}: {e}")

    # Add header comment
    header = f"""# NDE Heritage Custodians - Static RDF for Frontend Visualization
# Generated from {len(ttl_files)} individual TTL files
# Contains: labels, types, locations, Wikidata links, websites, social media, GeoNames
#
# Full data available via SPARQL endpoint at http://91.98.224.44/query
#
"""
    # Serialize
    output_path.parent.mkdir(parents=True, exist_ok=True)
    turtle_content = combined.serialize(format="turtle")
    # Explicit UTF-8: labels contain non-ASCII characters and the platform
    # default encoding is not guaranteed to handle them.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(header)
        f.write(turtle_content)

    # Stats
    custodian_count = len(list(combined.subjects(RDF.type, CRM.E39_Actor)))
    triple_count = len(combined)
    file_size = output_path.stat().st_size / 1024  # KB
    print("\n✅ Generated static RDF file:")
    print(f" Path: {output_path}")
    print(f" Custodians: {custodian_count}")
    print(f" Triples: {triple_count}")
    print(f" Size: {file_size:.1f} KB")
    print("\n Data enrichment:")
    print(f" - Wikidata links: {wikidata_count}")
    print(f" - Websites: {website_count}")
    print(f" - GeoNames links: {geonames_count}")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()