#!/usr/bin/env python3
"""
|
|
Export Danish GLAM dataset to RDF (Turtle, RDF/XML, JSON-LD)
|
|
|
|
This script converts the denmark_complete.json dataset into Linked Open Data formats
|
|
aligned with CPOV (Core Public Organisation Vocabulary), Schema.org, and heritage ontologies.
|
|
"""
|
|
|
|
# Standard library
import json
import re
from pathlib import Path
from typing import Dict, List, Optional

# RDF tooling
from rdflib import Graph, Namespace, Literal, URIRef, BNode
from rdflib.namespace import RDF, RDFS, SKOS, DCTERMS, FOAF, XSD, OWL

# Namespaces referenced throughout the export.
HERITAGE = Namespace("https://w3id.org/heritage/custodian/")
SCHEMA = Namespace("http://schema.org/")
CPOV = Namespace("http://data.europa.eu/m8g/")
TOOI = Namespace("https://identifier.overheid.nl/tooi/def/ont/")
ORG = Namespace("http://www.w3.org/ns/org#")
PROV = Namespace("http://www.w3.org/ns/prov#")
RICO = Namespace("https://www.ica.org/standards/RiC/ontology#")
GEONAMES = Namespace("http://www.geonames.org/ontology#")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")
VIAF = Namespace("http://viaf.org/viaf/")
def parse_identifier_string(identifier_str: str) -> Optional[Dict[str, str]]:
    """Parse an identifier record out of its Python-repr string form.

    Returns a dict with 'scheme', 'value' and 'url' keys (url may be None)
    when both scheme and value are present; otherwise returns None.
    """
    if not isinstance(identifier_str, str) or not identifier_str:
        return None

    def _field(name: str) -> Optional[str]:
        # Values are single-quoted in the dataset's repr() formatting.
        match = re.search(r"'%s':\s*'([^']+)'" % name, identifier_str)
        return match.group(1) if match else None

    scheme = _field('identifier_scheme')
    value = _field('identifier_value')
    if scheme is None or value is None:
        return None

    return {
        'scheme': scheme,
        'value': value,
        'url': _field('identifier_url'),
    }
def parse_location_string(location_str: str) -> Optional[Dict]:
    """Parse a location record out of its Python-repr string form.

    Extracts any of city / street_address / postal_code / country that are
    present as single-quoted values. Returns a dict of the fields found, or
    None when the input is not a non-empty string or no field matched.
    """
    if not isinstance(location_str, str) or not location_str:
        return None

    result = {}
    # One regex per field instead of four copy-pasted blocks; '*' keeps the
    # original behavior of accepting empty-string values.
    for field in ('city', 'street_address', 'postal_code', 'country'):
        match = re.search(r"'%s':\s*'([^']*)'" % field, location_str)
        if match:
            result[field] = match.group(1)

    return result if result else None
def _add_institution_types(graph: Graph, inst_uri: URIRef, inst_type: str) -> None:
    """Assert the rdf:type triples matching the institution_type code."""
    if inst_type == 'LIBRARY':
        types = (SCHEMA.Library, CPOV.PublicOrganisation)
    elif inst_type == 'ARCHIVE':
        types = (SCHEMA.ArchiveOrganization, RICO.CorporateBody, CPOV.PublicOrganisation)
    elif inst_type == 'MUSEUM':
        types = (SCHEMA.Museum, CPOV.PublicOrganisation)
    else:
        # Unknown/unmapped types fall back to a generic organization.
        types = (SCHEMA.Organization,)
    for rdf_type in types:
        graph.add((inst_uri, RDF.type, rdf_type))


def _add_names(graph: Graph, inst_uri: URIRef, institution: Dict) -> None:
    """Add preferred/alternative labels and description (all tagged 'da')."""
    name = institution.get('name')
    if name:
        graph.add((inst_uri, SKOS.prefLabel, Literal(name, lang='da')))
        graph.add((inst_uri, SCHEMA.name, Literal(name, lang='da')))
        graph.add((inst_uri, RDFS.label, Literal(name, lang='da')))

    for alt_name in institution.get('alternative_names', []) or []:
        if alt_name:
            graph.add((inst_uri, SKOS.altLabel, Literal(alt_name, lang='da')))
            graph.add((inst_uri, SCHEMA.alternateName, Literal(alt_name, lang='da')))

    description = institution.get('description')
    if description:
        graph.add((inst_uri, DCTERMS.description, Literal(description, lang='da')))
        graph.add((inst_uri, SCHEMA.description, Literal(description, lang='da')))


def _add_identifier(graph: Graph, inst_uri: URIRef, identifier: Dict) -> None:
    """Add triples for one parsed identifier, keyed on its scheme."""
    scheme = identifier.get('scheme')
    value = identifier.get('value')
    url = identifier.get('url')

    if scheme == 'ISIL':
        graph.add((inst_uri, DCTERMS.identifier, Literal(value)))
        graph.add((inst_uri, SCHEMA.identifier, Literal(value)))
        if url:
            graph.add((inst_uri, SCHEMA.sameAs, URIRef(url)))
    elif scheme == 'Wikidata':
        wikidata_uri = URIRef(f"http://www.wikidata.org/entity/{value}")
        graph.add((inst_uri, SCHEMA.sameAs, wikidata_uri))
        graph.add((inst_uri, OWL.sameAs, wikidata_uri))
    elif scheme == 'VIAF':
        viaf_uri = URIRef(f"http://viaf.org/viaf/{value}")
        graph.add((inst_uri, SCHEMA.sameAs, viaf_uri))
        graph.add((inst_uri, OWL.sameAs, viaf_uri))
    elif scheme == 'Website':
        if value:
            graph.add((inst_uri, SCHEMA.url, URIRef(value)))
            graph.add((inst_uri, FOAF.homepage, URIRef(value)))
    elif scheme == 'VIP-basen Library Number':
        graph.add((inst_uri, DCTERMS.identifier, Literal(value)))


def _add_ghcids(graph: Graph, inst_uri: URIRef, institution: Dict) -> None:
    """Add GHCID identifier triples (current id and UUID) when present."""
    ghcid = institution.get('ghcid_current')
    if ghcid:
        graph.add((inst_uri, HERITAGE.ghcid, Literal(ghcid)))
        graph.add((inst_uri, DCTERMS.identifier, Literal(ghcid)))

    ghcid_uuid = institution.get('ghcid_uuid')
    if ghcid_uuid:
        graph.add((inst_uri, HERITAGE.ghcidUUID, Literal(ghcid_uuid)))


def _add_address(graph: Graph, inst_uri: URIRef, location: Dict) -> None:
    """Attach a schema:PostalAddress blank node built from a location dict."""
    address_node = BNode()
    graph.add((inst_uri, SCHEMA.address, address_node))
    graph.add((address_node, RDF.type, SCHEMA.PostalAddress))

    # location-dict key -> schema.org address property
    field_props = (
        ('city', SCHEMA.addressLocality),
        ('street_address', SCHEMA.streetAddress),
        ('postal_code', SCHEMA.postalCode),
        ('country', SCHEMA.addressCountry),
    )
    for key, prop in field_props:
        value = location.get(key)
        if value:
            graph.add((address_node, prop, Literal(value)))


def _add_parent_link(graph: Graph, inst_uri: URIRef, parent_org) -> None:
    """Link the institution to its parent organization (both directions)."""
    if not parent_org:
        return
    parent_uri = URIRef(parent_org)
    graph.add((inst_uri, ORG.subOrganizationOf, parent_uri))
    graph.add((inst_uri, SCHEMA.parentOrganization, parent_uri))
    graph.add((parent_uri, ORG.hasSubOrganization, inst_uri))


def _add_provenance(graph: Graph, inst_uri: URIRef, provenance) -> None:
    """Add a prov:Activity node describing how the record was produced.

    When provenance is a repr() string, the data source enum name and the
    extraction date are recovered with regexes.
    """
    if not provenance:
        return

    prov_node = BNode()
    graph.add((inst_uri, PROV.wasGeneratedBy, prov_node))
    graph.add((prov_node, RDF.type, PROV.Activity))

    if isinstance(provenance, str):
        source_match = re.search(r"'data_source':\s*<DataSourceEnum\.([^:]+)", provenance)
        if source_match:
            graph.add((prov_node, DCTERMS.source, Literal(source_match.group(1))))

        date_match = re.search(r"'extraction_date':\s*datetime\.datetime\(([^)]+)\)", provenance)
        if date_match:
            # Fix: use the captured datetime components (year, month, day)
            # instead of a hard-coded date.
            parts = [p.strip() for p in date_match.group(1).split(',')]
            try:
                year, month, day = (int(p) for p in parts[:3])
                graph.add((prov_node, DCTERMS.created,
                           Literal(f"{year:04d}-{month:02d}-{day:02d}", datatype=XSD.date)))
            except ValueError:
                # Malformed datetime repr: omit the creation date.
                pass


def export_institution_to_rdf(institution: Dict, graph: Graph) -> None:
    """Export a single institution to the RDF graph.

    Emits type, label, identifier, address, hierarchy and provenance triples.
    Records without an 'id' are skipped silently.
    """
    inst_id = institution.get('id')
    if not inst_id:
        return
    inst_uri = URIRef(inst_id)

    _add_institution_types(graph, inst_uri, institution.get('institution_type', 'UNKNOWN'))
    _add_names(graph, inst_uri, institution)

    # Identifiers may be stored either as repr() strings or as dicts.
    for identifier_data in institution.get('identifiers', []):
        identifier = (parse_identifier_string(identifier_data)
                      if isinstance(identifier_data, str) else identifier_data)
        if identifier and isinstance(identifier, dict):
            _add_identifier(graph, inst_uri, identifier)

    _add_ghcids(graph, inst_uri, institution)

    # Locations follow the same string-or-dict convention as identifiers.
    for location_data in institution.get('locations', []):
        location = (parse_location_string(location_data)
                    if isinstance(location_data, str) else location_data)
        if location and isinstance(location, dict):
            _add_address(graph, inst_uri, location)

    _add_parent_link(graph, inst_uri, institution.get('parent_organization'))
    _add_provenance(graph, inst_uri, institution.get('provenance'))
def _serialize_graph(graph: Graph, output_dir: Path, label: str,
                     rdf_format: str, ext: str) -> None:
    """Serialize the graph to output_dir/denmark_complete<ext> and report size."""
    output_file = output_dir / f"denmark_complete{ext}"
    print(f"\nSerializing to {label.upper()}...")
    graph.serialize(destination=output_file, format=rdf_format)
    size_mb = output_file.stat().st_size / (1024 * 1024)
    print(f"  ✅ Saved to {output_file} ({size_mb:.2f} MB)")


def export_denmark_to_rdf(input_json: Path, output_dir: Path, format_type: str = 'ttl') -> None:
    """Export the Danish GLAM dataset to RDF.

    Loads the institution list from input_json, builds one rdflib Graph with
    ontology metadata plus all institution triples, then serializes it to
    output_dir. format_type selects 'ttl', 'rdf', 'jsonld', 'nt', or 'all';
    unknown values fall back to Turtle.
    """
    print(f"Loading dataset from {input_json}...")
    with open(input_json, 'r', encoding='utf-8') as f:
        institutions = json.load(f)

    print(f"Creating RDF graph for {len(institutions)} institutions...")

    g = Graph()

    # Bind prefixes so serializations stay readable.
    bindings = {
        'heritage': HERITAGE, 'schema': SCHEMA, 'cpov': CPOV, 'tooi': TOOI,
        'org': ORG, 'prov': PROV, 'rico': RICO, 'skos': SKOS,
        'dcterms': DCTERMS, 'foaf': FOAF, 'geonames': GEONAMES,
        'wikidata': WIKIDATA, 'viaf': VIAF, 'owl': OWL,
    }
    for prefix, namespace in bindings.items():
        g.bind(prefix, namespace)

    # Ontology-level metadata.
    ontology_uri = URIRef("https://w3id.org/heritage/custodian/ontology")
    g.add((ontology_uri, RDF.type, OWL.Ontology))
    g.add((ontology_uri, RDFS.label, Literal("Heritage Custodian Ontology")))
    g.add((ontology_uri, DCTERMS.title, Literal("Heritage Custodian Ontology")))
    g.add((ontology_uri, DCTERMS.description, Literal(
        "Ontology for describing heritage institutions (GLAM - Galleries, Libraries, Archives, Museums) worldwide"
    )))

    # Export each institution with periodic progress output.
    for i, institution in enumerate(institutions, 1):
        if i % 100 == 0:
            print(f"  Processed {i}/{len(institutions)} institutions...")
        export_institution_to_rdf(institution, g)

    print(f"\n✅ Graph contains {len(g)} triples")

    # Supported targets: key -> (rdflib format name, file extension).
    formats = {
        'ttl': ('turtle', '.ttl'),
        'rdf': ('xml', '.rdf'),
        'jsonld': ('json-ld', '.jsonld'),
        'nt': ('nt', '.nt')
    }

    if format_type == 'all':
        for fmt, (rdf_format, ext) in formats.items():
            _serialize_graph(g, output_dir, fmt, rdf_format, ext)
    else:
        rdf_format, ext = formats.get(format_type, ('turtle', '.ttl'))
        _serialize_graph(g, output_dir, format_type, rdf_format, ext)
if __name__ == '__main__':
    # Fixed input/output layout relative to the repository root.
    dataset_path = Path('data/instances/denmark_complete.json')
    rdf_dir = Path('data/rdf')
    rdf_dir.mkdir(parents=True, exist_ok=True)

    banner = "=" * 60
    print(banner)
    print("Danish GLAM Dataset → RDF Export")
    print(banner)

    # Emit every supported serialization in one pass.
    export_denmark_to_rdf(dataset_path, rdf_dir, format_type='all')

    print("\n" + banner)
    print("✅ RDF Export Complete")
    print(banner)