#!/usr/bin/env python3
"""
Generate a combined static Turtle file for frontend visualization fallback.

This creates a smaller subset of the full RDF data, optimized for
Force-directed graph visualization when the SPARQL endpoint is not
available.

Output: frontend/public/data/nde_heritage_custodians.ttl
"""
import os
import sys
from pathlib import Path

from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, SKOS, XSD, OWL, FOAF, DCTERMS

# Namespaces (matching the actual data)
CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
HC = Namespace("https://nde.nl/ontology/hc/")
HCC = Namespace("https://nde.nl/ontology/hc/class/")
SCHEMA = Namespace("http://schema.org/")  # Data uses schema1: but it's the same URI


def _extract_custodian_triples(g, custodian, combined):
    """Copy visualization-essential triples for one custodian from g into combined.

    Copies: rdf:type, skos:prefLabel, hc:custodian_type, dcterms:identifier,
    Wikidata owl:sameAs links, GeoNames schema:containedInPlace links,
    foaf:homepage (+ schema:url), foaf:account (+ service homepage and
    hc:platform_type), and location coordinates/address.

    Returns:
        (wikidata, website, geonames) — enrichment counts for this custodian.
    """
    wikidata = website = geonames = 0

    # Type triple
    combined.add((custodian, RDF.type, CRM.E39_Actor))

    # Label (prefLabel)
    for label in g.objects(custodian, SKOS.prefLabel):
        combined.add((custodian, SKOS.prefLabel, label))

    # Custodian type
    for ctype in g.objects(custodian, HC.custodian_type):
        combined.add((custodian, HC.custodian_type, ctype))

    # GHCID numeric identifier (dcterms:identifier)
    for identifier in g.objects(custodian, DCTERMS.identifier):
        combined.add((custodian, DCTERMS.identifier, identifier))

    # Wikidata link (owl:sameAs) — keep only Wikidata targets
    for same_as in g.objects(custodian, OWL.sameAs):
        if "wikidata.org" in str(same_as):
            combined.add((custodian, OWL.sameAs, same_as))
            wikidata += 1

    # GeoNames containedInPlace (schema1:containedInPlace)
    for geo in g.objects(custodian, SCHEMA.containedInPlace):
        if "geonames.org" in str(geo):
            combined.add((custodian, SCHEMA.containedInPlace, geo))
            geonames += 1

    # Website (foaf:homepage)
    for homepage in g.objects(custodian, FOAF.homepage):
        combined.add((custodian, FOAF.homepage, homepage))
        # Also get the URL from the homepage resource
        for url in g.objects(homepage, SCHEMA.url):
            combined.add((homepage, SCHEMA.url, url))
        # Count once per homepage (not per schema:url, which may be
        # absent or repeated).
        website += 1

    # Social media accounts (foaf:account)
    for account in g.objects(custodian, FOAF.account):
        combined.add((custodian, FOAF.account, account))
        # Get account details
        for account_url in g.objects(account, FOAF.accountServiceHomepage):
            combined.add((account, FOAF.accountServiceHomepage, account_url))
        for platform_type in g.objects(account, HC.platform_type):
            combined.add((account, HC.platform_type, platform_type))

    # Location and coordinates
    for place in g.objects(custodian, CRM.P53_has_former_or_current_location):
        combined.add((custodian, CRM.P53_has_former_or_current_location, place))
        for lat in g.objects(place, SCHEMA.latitude):
            combined.add((place, SCHEMA.latitude, lat))
        for lon in g.objects(place, SCHEMA.longitude):
            combined.add((place, SCHEMA.longitude, lon))
        for city in g.objects(place, SCHEMA.addressLocality):
            combined.add((place, SCHEMA.addressLocality, city))
        for address in g.objects(place, SCHEMA.address):
            combined.add((place, SCHEMA.address, address))

    return wikidata, website, geonames


def main():
    """Combine essential RDF data from all TTL files into a single visualization file.

    Reads every .ttl file under data/nde/rdf, extracts the triples the
    frontend graph needs, and writes one combined Turtle file (with a
    comment header) to frontend/public/data/. Exits with status 1 when
    the RDF directory is missing.
    """
    rdf_dir = Path(__file__).parent.parent / "data" / "nde" / "rdf"
    output_path = (
        Path(__file__).parent.parent
        / "frontend" / "public" / "data" / "nde_heritage_custodians.ttl"
    )

    if not rdf_dir.exists():
        print(f"Error: RDF directory not found: {rdf_dir}")
        sys.exit(1)

    # Create combined graph with visualization-essential triples
    combined = Graph()
    combined.bind("crm", CRM)
    combined.bind("hc", HC)
    combined.bind("hcc", HCC)
    combined.bind("skos", SKOS)
    combined.bind("schema1", SCHEMA)  # Match the source data prefix
    combined.bind("owl", OWL)
    combined.bind("foaf", FOAF)
    combined.bind("dcterms", DCTERMS)
    combined.bind("wd", Namespace("http://www.wikidata.org/entity/"))

    ttl_files = list(rdf_dir.glob("*.ttl"))
    print(f"Found {len(ttl_files)} TTL files to process...")

    processed = 0
    wikidata_count = 0
    website_count = 0
    geonames_count = 0

    for ttl_file in ttl_files:
        try:
            g = Graph()
            g.parse(ttl_file, format="turtle")

            # Extract visualization-essential triples for each custodian
            for custodian in g.subjects(RDF.type, CRM.E39_Actor):
                wd, web, geo = _extract_custodian_triples(g, custodian, combined)
                wikidata_count += wd
                website_count += web
                geonames_count += geo

            # Count files (the progress message reports files, so the
            # counter must advance per file, not per custodian).
            processed += 1
            if processed % 100 == 0:
                print(f"  Processed {processed}/{len(ttl_files)} files...")
        except Exception as e:
            # Best-effort: a single unparseable file should not abort the run.
            print(f"  Warning: Could not parse {ttl_file.name}: {e}")

    # Add header comment
    header = f"""# NDE Heritage Custodians - Static RDF for Frontend Visualization
# Generated from {len(ttl_files)} individual TTL files
# Contains: labels, types, locations, Wikidata links, websites, social media, GeoNames
#
# Full data available via SPARQL endpoint at http://91.98.224.44/query
#
"""

    # Serialize; explicit UTF-8 so non-ASCII labels survive regardless of
    # the platform's default locale encoding.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    turtle_content = combined.serialize(format="turtle")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(header)
        f.write(turtle_content)

    # Stats
    custodian_count = len(list(combined.subjects(RDF.type, CRM.E39_Actor)))
    triple_count = len(combined)
    file_size = output_path.stat().st_size / 1024  # KB

    print(f"\n✅ Generated static RDF file:")
    print(f"   Path: {output_path}")
    print(f"   Custodians: {custodian_count}")
    print(f"   Triples: {triple_count}")
    print(f"   Size: {file_size:.1f} KB")
    print(f"\n   Data enrichment:")
    print(f"   - Wikidata links: {wikidata_count}")
    print(f"   - Websites: {website_count}")
    print(f"   - GeoNames links: {geonames_count}")


if __name__ == "__main__":
    main()