#!/usr/bin/env python3
"""
Generate a combined static Turtle file for frontend visualization fallback.

This creates a smaller subset of the full RDF data, optimized for force-directed
graph visualization when the SPARQL endpoint is not available.

Output: frontend/public/data/nde_heritage_custodians.ttl
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from rdflib import Graph, Namespace, URIRef, Literal
|
|
from rdflib.namespace import RDF, SKOS, XSD, OWL, FOAF, DCTERMS
|
|
|
|
# Namespaces (matching the actual data)
CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")  # CIDOC-CRM cultural heritage ontology
HC = Namespace("https://nde.nl/ontology/hc/")  # project-specific heritage-custodian properties
HCC = Namespace("https://nde.nl/ontology/hc/class/")  # project-specific heritage-custodian classes
SCHEMA = Namespace("http://schema.org/")  # Data uses schema1: but it's the same URI
|
def _new_combined_graph():
    """Return an empty Graph with all prefixes bound to match the source data."""
    combined = Graph()
    combined.bind("crm", CRM)
    combined.bind("hc", HC)
    combined.bind("hcc", HCC)
    combined.bind("skos", SKOS)
    combined.bind("schema1", SCHEMA)  # Match the source data prefix
    combined.bind("owl", OWL)
    combined.bind("foaf", FOAF)
    combined.bind("dcterms", DCTERMS)
    combined.bind("wd", Namespace("http://www.wikidata.org/entity/"))
    return combined


def _copy_custodian_triples(g, combined, custodian):
    """Copy the visualization-essential triples for one custodian from g into combined.

    Copies: type, skos:prefLabel, hc:custodian_type, dcterms:identifier,
    Wikidata owl:sameAs links, GeoNames schema:containedInPlace links,
    foaf:homepage (plus its schema:url), foaf:account details, and the
    crm:P53 location with its coordinates/address.

    Returns:
        (wikidata, website, geonames) tuple of enrichment counts for stats.
    """
    wikidata_count = website_count = geonames_count = 0

    # Type triple
    combined.add((custodian, RDF.type, CRM.E39_Actor))

    # Label (prefLabel)
    for label in g.objects(custodian, SKOS.prefLabel):
        combined.add((custodian, SKOS.prefLabel, label))

    # Custodian type
    for ctype in g.objects(custodian, HC.custodian_type):
        combined.add((custodian, HC.custodian_type, ctype))

    # GHCID numeric identifier (dcterms:identifier)
    for identifier in g.objects(custodian, DCTERMS.identifier):
        combined.add((custodian, DCTERMS.identifier, identifier))

    # Wikidata link (owl:sameAs) -- keep only wikidata.org targets
    for same_as in g.objects(custodian, OWL.sameAs):
        if "wikidata.org" in str(same_as):
            combined.add((custodian, OWL.sameAs, same_as))
            wikidata_count += 1

    # GeoNames containedInPlace (schema1:containedInPlace) -- geonames.org only
    for geonames in g.objects(custodian, SCHEMA.containedInPlace):
        if "geonames.org" in str(geonames):
            combined.add((custodian, SCHEMA.containedInPlace, geonames))
            geonames_count += 1

    # Website (foaf:homepage)
    for homepage in g.objects(custodian, FOAF.homepage):
        combined.add((custodian, FOAF.homepage, homepage))
        # Also get the URL from the homepage resource
        for url in g.objects(homepage, SCHEMA.url):
            combined.add((homepage, SCHEMA.url, url))
        # BUGFIX: count once per homepage, not once per schema:url triple --
        # a homepage with zero or several url triples previously skewed the count.
        website_count += 1

    # Social media accounts (foaf:account) and their details
    for account in g.objects(custodian, FOAF.account):
        combined.add((custodian, FOAF.account, account))
        for account_url in g.objects(account, FOAF.accountServiceHomepage):
            combined.add((account, FOAF.accountServiceHomepage, account_url))
        for platform_type in g.objects(account, HC.platform_type):
            combined.add((account, HC.platform_type, platform_type))

    # Location and coordinates
    for place in g.objects(custodian, CRM.P53_has_former_or_current_location):
        combined.add((custodian, CRM.P53_has_former_or_current_location, place))
        for lat in g.objects(place, SCHEMA.latitude):
            combined.add((place, SCHEMA.latitude, lat))
        for lon in g.objects(place, SCHEMA.longitude):
            combined.add((place, SCHEMA.longitude, lon))
        for city in g.objects(place, SCHEMA.addressLocality):
            combined.add((place, SCHEMA.addressLocality, city))
        for address in g.objects(place, SCHEMA.address):
            combined.add((place, SCHEMA.address, address))

    return wikidata_count, website_count, geonames_count


def main():
    """Combine essential RDF data from all TTL files into a single visualization file.

    Reads every ``*.ttl`` file under ``data/nde/rdf``, keeps only the triples the
    frontend force-directed graph needs, and writes a single Turtle file to
    ``frontend/public/data/nde_heritage_custodians.ttl``.

    Exits with status 1 when the source RDF directory does not exist.
    """
    rdf_dir = Path(__file__).parent.parent / "data" / "nde" / "rdf"
    output_path = Path(__file__).parent.parent / "frontend" / "public" / "data" / "nde_heritage_custodians.ttl"

    if not rdf_dir.exists():
        print(f"Error: RDF directory not found: {rdf_dir}")
        sys.exit(1)

    # Create combined graph with visualization-essential triples
    combined = _new_combined_graph()

    ttl_files = list(rdf_dir.glob("*.ttl"))
    print(f"Found {len(ttl_files)} TTL files to process...")

    processed = 0
    wikidata_count = 0
    website_count = 0
    geonames_count = 0

    for ttl_file in ttl_files:
        try:
            g = Graph()
            g.parse(ttl_file, format="turtle")

            # Extract visualization-essential triples for each custodian
            for custodian in g.subjects(RDF.type, CRM.E39_Actor):
                wd, web, geo = _copy_custodian_triples(g, combined, custodian)
                wikidata_count += wd
                website_count += web
                geonames_count += geo

            processed += 1
            if processed % 100 == 0:
                print(f" Processed {processed}/{len(ttl_files)} files...")

        except Exception as e:
            # Best-effort: skip unparseable files rather than aborting the run.
            print(f" Warning: Could not parse {ttl_file.name}: {e}")

    # Header comment prepended to the serialized Turtle (Turtle treats # as comment)
    header = f"""# NDE Heritage Custodians - Static RDF for Frontend Visualization
# Generated from {len(ttl_files)} individual TTL files
# Contains: labels, types, locations, Wikidata links, websites, social media, GeoNames
#
# Full data available via SPARQL endpoint at http://91.98.224.44/query
#

"""

    # Serialize
    output_path.parent.mkdir(parents=True, exist_ok=True)
    turtle_content = combined.serialize(format="turtle")

    # BUGFIX: explicit UTF-8 -- the Turtle output contains non-ASCII labels and
    # the default platform encoding (e.g. cp1252 on Windows) would fail to write it.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(header)
        f.write(turtle_content)

    # Stats
    custodian_count = len(list(combined.subjects(RDF.type, CRM.E39_Actor)))
    triple_count = len(combined)
    file_size = output_path.stat().st_size / 1024  # KB

    print(f"\n✅ Generated static RDF file:")
    print(f" Path: {output_path}")
    print(f" Custodians: {custodian_count}")
    print(f" Triples: {triple_count}")
    print(f" Size: {file_size:.1f} KB")
    print(f"\n Data enrichment:")
    print(f" - Wikidata links: {wikidata_count}")
    print(f" - Websites: {website_count}")
    print(f" - GeoNames links: {geonames_count}")
|
|
|
|
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()
|