# glam/scripts/export_denmark_rdf_enriched.py
# Snapshot metadata: 2025-11-19 23:25:22 +01:00 — 299 lines, 12 KiB, Python
"""
Export Danish GLAM dataset to RDF (Turtle, RDF/XML, JSON-LD)
This script converts the denmark_complete.json dataset into Linked Open Data formats
aligned with CPOV (Core Public Organisation Vocabulary), Schema.org, and heritage ontologies.
"""
import json
import re
from pathlib import Path
from typing import Dict, List, Optional
from rdflib import Graph, Namespace, Literal, URIRef, BNode
from rdflib.namespace import RDF, RDFS, SKOS, DCTERMS, FOAF, XSD, OWL
# Namespaces for the vocabularies this export writes (or binds as prefixes).
HERITAGE = Namespace("https://w3id.org/heritage/custodian/")      # project custodian vocabulary (GHCID terms)
SCHEMA = Namespace("http://schema.org/")                          # Schema.org (http form)
CPOV = Namespace("http://data.europa.eu/m8g/")                    # EU Core Public Organisation Vocabulary
TOOI = Namespace("https://identifier.overheid.nl/tooi/def/ont/")  # Dutch government ontology (prefix binding only in this script)
ORG = Namespace("http://www.w3.org/ns/org#")                      # W3C Organization Ontology
PROV = Namespace("http://www.w3.org/ns/prov#")                    # W3C PROV-O provenance
RICO = Namespace("https://www.ica.org/standards/RiC/ontology#")   # ICA Records in Contexts ontology
GEONAMES = Namespace("http://www.geonames.org/ontology#")         # GeoNames (prefix binding only in this script)
WIKIDATA = Namespace("http://www.wikidata.org/entity/")           # Wikidata entity URIs
VIAF = Namespace("http://viaf.org/viaf/")                         # VIAF authority URIs
def parse_identifier_string(identifier_str: str) -> Optional[Dict[str, str]]:
    """Extract scheme, value and URL from a repr()-style identifier string.

    The dataset stores identifiers as stringified dicts; this mines the
    single-quoted field values out with regexes rather than eval-ing them.

    Returns:
        A dict with keys 'scheme', 'value' and 'url' ('url' may be None),
        or None when the input is missing, not a string, or lacks both a
        scheme and a value.
    """
    if not identifier_str or not isinstance(identifier_str, str):
        return None

    def grab(field: str) -> Optional[str]:
        # Pull the non-empty single-quoted value of one dict field.
        match = re.search(rf"'{field}':\s*'([^']+)'", identifier_str)
        return match.group(1) if match else None

    scheme = grab('identifier_scheme')
    value = grab('identifier_value')
    if scheme is None or value is None:
        return None
    return {'scheme': scheme, 'value': value, 'url': grab('identifier_url')}
def parse_location_string(location_str: str) -> Optional[Dict]:
    """Extract address fields from a repr()-style location string.

    Looks for the four known address fields; empty quoted values are kept
    (the patterns allow '' on purpose, matching the dataset's contents).

    Returns:
        A dict with whichever of 'city', 'street_address', 'postal_code'
        and 'country' were found, or None when the input is missing, not a
        string, or contained none of the fields.
    """
    if not location_str or not isinstance(location_str, str):
        return None
    parsed = {}
    for field in ('city', 'street_address', 'postal_code', 'country'):
        match = re.search(rf"'{field}':\s*'([^']*)'", location_str)
        if match:
            parsed[field] = match.group(1)
    return parsed or None
def export_institution_to_rdf(institution: Dict, graph: Graph) -> None:
    """Add all RDF triples describing one institution to *graph*.

    The record's ``id`` field is used verbatim as the subject URI; records
    without an ``id`` are skipped silently. Work is split into one private
    helper per group of related triples.

    Args:
        institution: One institution record from the JSON dataset.
        graph: Target rdflib Graph; mutated in place.
    """
    inst_id = institution.get('id')
    if not inst_id:
        return
    inst_uri = URIRef(inst_id)
    _add_type_triples(inst_uri, institution.get('institution_type', 'UNKNOWN'), graph)
    _add_label_triples(inst_uri, institution, graph)
    _add_identifier_triples(inst_uri, institution, graph)
    _add_location_triples(inst_uri, institution, graph)
    _add_hierarchy_triples(inst_uri, institution, graph)
    _add_provenance_triples(inst_uri, institution, graph)


def _add_type_triples(inst_uri: URIRef, inst_type: str, graph: Graph) -> None:
    """Multi-type the institution with Schema.org, CPOV and RiC-O classes."""
    if inst_type == 'LIBRARY':
        graph.add((inst_uri, RDF.type, SCHEMA.Library))
        graph.add((inst_uri, RDF.type, CPOV.PublicOrganisation))
    elif inst_type == 'ARCHIVE':
        graph.add((inst_uri, RDF.type, SCHEMA.ArchiveOrganization))
        graph.add((inst_uri, RDF.type, RICO.CorporateBody))
        graph.add((inst_uri, RDF.type, CPOV.PublicOrganisation))
    elif inst_type == 'MUSEUM':
        graph.add((inst_uri, RDF.type, SCHEMA.Museum))
        graph.add((inst_uri, RDF.type, CPOV.PublicOrganisation))
    else:
        # Unknown types fall back to the generic organisation class.
        graph.add((inst_uri, RDF.type, SCHEMA.Organization))


def _add_label_triples(inst_uri: URIRef, institution: Dict, graph: Graph) -> None:
    """Name, alternative names and description as Danish-tagged literals."""
    name = institution.get('name')
    if name:
        for predicate in (SKOS.prefLabel, SCHEMA.name, RDFS.label):
            graph.add((inst_uri, predicate, Literal(name, lang='da')))
    for alt_name in institution.get('alternative_names', []) or []:
        if alt_name:
            graph.add((inst_uri, SKOS.altLabel, Literal(alt_name, lang='da')))
            graph.add((inst_uri, SCHEMA.alternateName, Literal(alt_name, lang='da')))
    description = institution.get('description')
    if description:
        graph.add((inst_uri, DCTERMS.description, Literal(description, lang='da')))
        graph.add((inst_uri, SCHEMA.description, Literal(description, lang='da')))


def _add_identifier_triples(inst_uri: URIRef, institution: Dict, graph: Graph) -> None:
    """External identifiers (ISIL, Wikidata, VIAF, website, GHCID)."""
    for identifier_data in institution.get('identifiers', []):
        identifier = (parse_identifier_string(identifier_data)
                      if isinstance(identifier_data, str) else identifier_data)
        if not identifier or not isinstance(identifier, dict):
            continue
        scheme = identifier.get('scheme')
        value = identifier.get('value')
        url = identifier.get('url')
        # Bug fix: skip identifiers without a value — the original emitted
        # Literal(None) / URIRef(".../None") triples for the ISIL, Wikidata,
        # VIAF and VIP-basen branches when 'value' was missing.
        if not value:
            continue
        if scheme == 'ISIL':
            graph.add((inst_uri, DCTERMS.identifier, Literal(value)))
            graph.add((inst_uri, SCHEMA.identifier, Literal(value)))
            if url:
                graph.add((inst_uri, SCHEMA.sameAs, URIRef(url)))
        elif scheme == 'Wikidata':
            wikidata_uri = URIRef(f"http://www.wikidata.org/entity/{value}")
            graph.add((inst_uri, SCHEMA.sameAs, wikidata_uri))
            graph.add((inst_uri, OWL.sameAs, wikidata_uri))
        elif scheme == 'VIAF':
            viaf_uri = URIRef(f"http://viaf.org/viaf/{value}")
            graph.add((inst_uri, SCHEMA.sameAs, viaf_uri))
            graph.add((inst_uri, OWL.sameAs, viaf_uri))
        elif scheme == 'Website':
            graph.add((inst_uri, SCHEMA.url, URIRef(value)))
            graph.add((inst_uri, FOAF.homepage, URIRef(value)))
        elif scheme == 'VIP-basen Library Number':
            graph.add((inst_uri, DCTERMS.identifier, Literal(value)))
    # GHCID identifiers live directly on the record, not in 'identifiers'.
    ghcid = institution.get('ghcid_current')
    if ghcid:
        graph.add((inst_uri, HERITAGE.ghcid, Literal(ghcid)))
        graph.add((inst_uri, DCTERMS.identifier, Literal(ghcid)))
    ghcid_uuid = institution.get('ghcid_uuid')
    if ghcid_uuid:
        graph.add((inst_uri, HERITAGE.ghcidUUID, Literal(ghcid_uuid)))


def _add_location_triples(inst_uri: URIRef, institution: Dict, graph: Graph) -> None:
    """Postal addresses as schema:PostalAddress blank nodes."""
    field_predicates = (
        ('city', SCHEMA.addressLocality),
        ('street_address', SCHEMA.streetAddress),
        ('postal_code', SCHEMA.postalCode),
        ('country', SCHEMA.addressCountry),
    )
    for location_data in institution.get('locations', []):
        location = (parse_location_string(location_data)
                    if isinstance(location_data, str) else location_data)
        if not location or not isinstance(location, dict):
            continue
        address_node = BNode()
        graph.add((inst_uri, SCHEMA.address, address_node))
        graph.add((address_node, RDF.type, SCHEMA.PostalAddress))
        for field, predicate in field_predicates:
            field_value = location.get(field)
            if field_value:
                graph.add((address_node, predicate, Literal(field_value)))


def _add_hierarchy_triples(inst_uri: URIRef, institution: Dict, graph: Graph) -> None:
    """Bidirectional parent/child organisation links (ORG + Schema.org)."""
    parent_org = institution.get('parent_organization')
    if parent_org:
        parent_uri = URIRef(parent_org)
        graph.add((inst_uri, ORG.subOrganizationOf, parent_uri))
        graph.add((inst_uri, SCHEMA.parentOrganization, parent_uri))
        graph.add((parent_uri, ORG.hasSubOrganization, inst_uri))


def _add_provenance_triples(inst_uri: URIRef, institution: Dict, graph: Graph) -> None:
    """PROV activity recording the data source and extraction date."""
    provenance = institution.get('provenance')
    if not provenance:
        return
    prov_node = BNode()
    graph.add((inst_uri, PROV.wasGeneratedBy, prov_node))
    graph.add((prov_node, RDF.type, PROV.Activity))
    # Provenance may arrive as a repr() string; mine it with regexes.
    if not isinstance(provenance, str):
        return
    source_match = re.search(r"'data_source':\s*<DataSourceEnum\.([^:]+)", provenance)
    if source_match:
        graph.add((prov_node, DCTERMS.source, Literal(source_match.group(1))))
    date_match = re.search(r"'extraction_date':\s*datetime\.datetime\(([^)]+)\)", provenance)
    if date_match:
        # Bug fix: the original captured the datetime arguments but discarded
        # them and hard-coded "2025-11-19". Reconstruct the real extraction
        # date from the year/month/day positional arguments instead.
        try:
            year, month, day = (int(p) for p in date_match.group(1).split(',')[:3])
        except (ValueError, TypeError):
            return  # malformed datetime repr — skip rather than fabricate a date
        graph.add((prov_node, DCTERMS.created,
                   Literal(f"{year:04d}-{month:02d}-{day:02d}", datatype=XSD.date)))
def export_denmark_to_rdf(input_json: Path, output_dir: Path, format_type: str = 'ttl') -> None:
    """Export the Danish GLAM dataset to one or more RDF serializations.

    Loads the JSON array of institutions, builds a single rdflib Graph with
    ontology-header triples plus one subgraph per institution, then writes
    it to ``output_dir / denmark_complete.<ext>``.

    Args:
        input_json: Path to the JSON array of institution records.
        output_dir: Existing directory the serialized file(s) are written to.
        format_type: 'ttl', 'rdf', 'jsonld', 'nt', or 'all' for every format.
            Unknown values fall back to Turtle (as before).
    """
    print(f"Loading dataset from {input_json}...")
    # Explicit encoding: the dataset contains Danish text and must not depend
    # on the platform's default locale encoding.
    with open(input_json, 'r', encoding='utf-8') as f:
        institutions = json.load(f)
    print(f"Creating RDF graph for {len(institutions)} institutions...")
    g = Graph()
    # Bind prefixes so the serializations stay human-readable.
    bindings = {
        'heritage': HERITAGE, 'schema': SCHEMA, 'cpov': CPOV, 'tooi': TOOI,
        'org': ORG, 'prov': PROV, 'rico': RICO, 'skos': SKOS,
        'dcterms': DCTERMS, 'foaf': FOAF, 'geonames': GEONAMES,
        'wikidata': WIKIDATA, 'viaf': VIAF, 'owl': OWL,
    }
    for prefix, namespace in bindings.items():
        g.bind(prefix, namespace)
    # Self-describing ontology header.
    ontology_uri = URIRef("https://w3id.org/heritage/custodian/ontology")
    g.add((ontology_uri, RDF.type, OWL.Ontology))
    g.add((ontology_uri, RDFS.label, Literal("Heritage Custodian Ontology")))
    g.add((ontology_uri, DCTERMS.title, Literal("Heritage Custodian Ontology")))
    g.add((ontology_uri, DCTERMS.description, Literal(
        "Ontology for describing heritage institutions (GLAM - Galleries, Libraries, Archives, Museums) worldwide"
    )))
    # Export each institution, reporting progress every 100 records.
    for i, institution in enumerate(institutions, 1):
        if i % 100 == 0:
            print(f" Processed {i}/{len(institutions)} institutions...")
        export_institution_to_rdf(institution, g)
    print(f"\n✅ Graph contains {len(g)} triples")
    # (rdflib format name, file extension) keyed by the caller-facing name.
    formats = {
        'ttl': ('turtle', '.ttl'),
        'rdf': ('xml', '.rdf'),
        'jsonld': ('json-ld', '.jsonld'),
        'nt': ('nt', '.nt'),
    }
    # Single serialization loop — the original duplicated this code in the
    # 'all' branch and the single-format branch.
    if format_type == 'all':
        selected = list(formats.items())
    else:
        selected = [(format_type, formats.get(format_type, ('turtle', '.ttl')))]
    for fmt, (rdf_format, ext) in selected:
        output_file = output_dir / f"denmark_complete{ext}"
        print(f"\nSerializing to {fmt.upper()}...")
        g.serialize(destination=output_file, format=rdf_format)
        size_mb = output_file.stat().st_size / (1024 * 1024)
        print(f" ✅ Saved to {output_file} ({size_mb:.2f} MB)")
if __name__ == '__main__':
    # Paths are relative to the repository root.
    source_json = Path('data/instances/denmark_complete_enriched.json')
    rdf_output_dir = Path('data/rdf')
    rdf_output_dir.mkdir(parents=True, exist_ok=True)
    banner = "=" * 60
    print(banner)
    print("Danish GLAM Dataset → RDF Export")
    print(banner)
    # Emit every supported serialization in one pass.
    export_denmark_to_rdf(source_json, rdf_output_dir, format_type='all')
    print("\n" + banner)
    print("✅ RDF Export Complete")
    print(banner)