glam/scripts/generate_static_rdf_for_frontend.py
2025-12-23 13:27:35 +01:00

160 lines
6.7 KiB
Python

#!/usr/bin/env python3
"""
Generate a combined static Turtle file for frontend visualization fallback.
This creates a smaller subset of the full RDF data, optimized for Force-directed
graph visualization when the SPARQL endpoint is not available.
Output: frontend/public/data/nde_heritage_custodians.ttl
"""
import os
import sys
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, SKOS, XSD, OWL, FOAF, DCTERMS
# Namespaces (matching the actual data)
CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")  # CIDOC-CRM cultural heritage ontology
HC = Namespace("https://nde.nl/ontology/hc/")  # NDE heritage-custodian property namespace
HCC = Namespace("https://nde.nl/ontology/hc/class/")  # NDE heritage-custodian class namespace
SCHEMA = Namespace("http://schema.org/") # Data uses schema1: but it's the same URI
def _copy_custodian_triples(g, custodian, combined):
    """Copy one custodian's visualization-essential triples from *g* into *combined*.

    Copies the type triple, prefLabel, custodian type, dcterms identifier,
    Wikidata sameAs links, GeoNames containedInPlace links, homepage (plus
    its schema:url), social-media accounts (plus service homepage and
    platform type), and location places (plus coordinates and address).

    Returns:
        A ``(wikidata, website, geonames)`` tuple counting the enrichment
        links that were copied for this custodian.
    """
    wikidata = website = geonames = 0

    # Type triple
    combined.add((custodian, RDF.type, CRM.E39_Actor))
    # Label (prefLabel)
    for label in g.objects(custodian, SKOS.prefLabel):
        combined.add((custodian, SKOS.prefLabel, label))
    # Custodian type
    for ctype in g.objects(custodian, HC.custodian_type):
        combined.add((custodian, HC.custodian_type, ctype))
    # GHCID numeric identifier (dcterms:identifier)
    for identifier in g.objects(custodian, DCTERMS.identifier):
        combined.add((custodian, DCTERMS.identifier, identifier))
    # Wikidata link (owl:sameAs) — keep only Wikidata targets
    for same_as in g.objects(custodian, OWL.sameAs):
        if "wikidata.org" in str(same_as):
            combined.add((custodian, OWL.sameAs, same_as))
            wikidata += 1
    # GeoNames containedInPlace (schema1:containedInPlace)
    for place_ref in g.objects(custodian, SCHEMA.containedInPlace):
        if "geonames.org" in str(place_ref):
            combined.add((custodian, SCHEMA.containedInPlace, place_ref))
            geonames += 1
    # Website (foaf:homepage) and the URL of the homepage resource
    for homepage in g.objects(custodian, FOAF.homepage):
        combined.add((custodian, FOAF.homepage, homepage))
        for url in g.objects(homepage, SCHEMA.url):
            combined.add((homepage, SCHEMA.url, url))
            website += 1
    # Social media accounts (foaf:account) with their details
    for account in g.objects(custodian, FOAF.account):
        combined.add((custodian, FOAF.account, account))
        for account_url in g.objects(account, FOAF.accountServiceHomepage):
            combined.add((account, FOAF.accountServiceHomepage, account_url))
        for platform_type in g.objects(account, HC.platform_type):
            combined.add((account, HC.platform_type, platform_type))
    # Location places and their coordinates / address details
    for place in g.objects(custodian, CRM.P53_has_former_or_current_location):
        combined.add((custodian, CRM.P53_has_former_or_current_location, place))
        for lat in g.objects(place, SCHEMA.latitude):
            combined.add((place, SCHEMA.latitude, lat))
        for lon in g.objects(place, SCHEMA.longitude):
            combined.add((place, SCHEMA.longitude, lon))
        for city in g.objects(place, SCHEMA.addressLocality):
            combined.add((place, SCHEMA.addressLocality, city))
        for address in g.objects(place, SCHEMA.address):
            combined.add((place, SCHEMA.address, address))

    return wikidata, website, geonames


def main():
    """Combine essential RDF data from all TTL files into a single visualization file.

    Scans ``data/nde/rdf`` for ``*.ttl`` files, copies only the triples the
    frontend force-directed graph needs (labels, types, identifiers,
    Wikidata/GeoNames links, websites, social media, locations) into one
    combined graph, and writes it with a header comment to
    ``frontend/public/data/nde_heritage_custodians.ttl``.

    Exits with status 1 when the source RDF directory does not exist.
    """
    rdf_dir = Path(__file__).parent.parent / "data" / "nde" / "rdf"
    output_path = (
        Path(__file__).parent.parent
        / "frontend"
        / "public"
        / "data"
        / "nde_heritage_custodians.ttl"
    )
    if not rdf_dir.exists():
        print(f"Error: RDF directory not found: {rdf_dir}")
        sys.exit(1)

    # Create combined graph with visualization-essential triples; bind the
    # prefixes so the serialized Turtle matches the source data's style.
    combined = Graph()
    combined.bind("crm", CRM)
    combined.bind("hc", HC)
    combined.bind("hcc", HCC)
    combined.bind("skos", SKOS)
    combined.bind("schema1", SCHEMA)  # Match the source data prefix
    combined.bind("owl", OWL)
    combined.bind("foaf", FOAF)
    combined.bind("dcterms", DCTERMS)
    combined.bind("wd", Namespace("http://www.wikidata.org/entity/"))

    ttl_files = list(rdf_dir.glob("*.ttl"))
    print(f"Found {len(ttl_files)} TTL files to process...")

    processed = 0
    wikidata_count = 0
    website_count = 0
    geonames_count = 0
    for ttl_file in ttl_files:
        try:
            g = Graph()
            g.parse(ttl_file, format="turtle")
            # Extract visualization-essential triples for each custodian
            for custodian in g.subjects(RDF.type, CRM.E39_Actor):
                wd, web, geo = _copy_custodian_triples(g, custodian, combined)
                wikidata_count += wd
                website_count += web
                geonames_count += geo
            # Count per file, matching the "files" progress message below.
            processed += 1
            if processed % 100 == 0:
                print(f" Processed {processed}/{len(ttl_files)} files...")
        except Exception as e:
            # Best-effort: one malformed file must not abort the whole run.
            print(f" Warning: Could not parse {ttl_file.name}: {e}")

    # Add header comment
    header = f"""# NDE Heritage Custodians - Static RDF for Frontend Visualization
# Generated from {len(ttl_files)} individual TTL files
# Contains: labels, types, locations, Wikidata links, websites, social media, GeoNames
#
# Full data available via SPARQL endpoint at http://91.98.224.44/query
#
"""
    # Serialize
    output_path.parent.mkdir(parents=True, exist_ok=True)
    turtle_content = combined.serialize(format="turtle")
    # Explicit UTF-8: labels contain non-ASCII characters and the platform
    # default encoding is not guaranteed to handle them.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(header)
        f.write(turtle_content)

    # Stats
    custodian_count = len(list(combined.subjects(RDF.type, CRM.E39_Actor)))
    triple_count = len(combined)
    file_size = output_path.stat().st_size / 1024  # KB
    print("\n✅ Generated static RDF file:")
    print(f" Path: {output_path}")
    print(f" Custodians: {custodian_count}")
    print(f" Triples: {triple_count}")
    print(f" Size: {file_size:.1f} KB")
    print("\n Data enrichment:")
    print(f" - Wikidata links: {wikidata_count}")
    print(f" - Websites: {website_count}")
    print(f" - GeoNames links: {geonames_count}")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()