""" Export Danish GLAM dataset to RDF (Turtle, RDF/XML, JSON-LD) This script converts the denmark_complete.json dataset into Linked Open Data formats aligned with CPOV (Core Public Organisation Vocabulary), Schema.org, and heritage ontologies. """ import json import re from pathlib import Path from typing import Dict, List, Optional from rdflib import Graph, Namespace, Literal, URIRef, BNode from rdflib.namespace import RDF, RDFS, SKOS, DCTERMS, FOAF, XSD, OWL # Define namespaces HERITAGE = Namespace("https://w3id.org/heritage/custodian/") SCHEMA = Namespace("http://schema.org/") CPOV = Namespace("http://data.europa.eu/m8g/") TOOI = Namespace("https://identifier.overheid.nl/tooi/def/ont/") ORG = Namespace("http://www.w3.org/ns/org#") PROV = Namespace("http://www.w3.org/ns/prov#") RICO = Namespace("https://www.ica.org/standards/RiC/ontology#") GEONAMES = Namespace("http://www.geonames.org/ontology#") WIKIDATA = Namespace("http://www.wikidata.org/entity/") VIAF = Namespace("http://viaf.org/viaf/") def parse_identifier_string(identifier_str: str) -> Optional[Dict[str, str]]: """Parse identifier from string representation.""" if not identifier_str or not isinstance(identifier_str, str): return None # Extract identifier_scheme and identifier_value using regex scheme_match = re.search(r"'identifier_scheme':\s*'([^']+)'", identifier_str) value_match = re.search(r"'identifier_value':\s*'([^']+)'", identifier_str) url_match = re.search(r"'identifier_url':\s*'([^']+)'", identifier_str) if scheme_match and value_match: return { 'scheme': scheme_match.group(1), 'value': value_match.group(1), 'url': url_match.group(1) if url_match else None } return None def parse_location_string(location_str: str) -> Optional[Dict]: """Parse location from string representation.""" if not location_str or not isinstance(location_str, str): return None result = {} # Extract fields using regex city_match = re.search(r"'city':\s*'([^']*)'", location_str) if city_match: result['city'] = city_match.group(1) address_match = re.search(r"'street_address':\s*'([^']*)'", location_str) if address_match: result['street_address'] = address_match.group(1) postal_match = re.search(r"'postal_code':\s*'([^']*)'", location_str) if postal_match: result['postal_code'] = postal_match.group(1) country_match = re.search(r"'country':\s*'([^']*)'", location_str) if country_match: result['country'] = country_match.group(1) return result if result else None def export_institution_to_rdf(institution: Dict, graph: Graph) -> None: """Export a single institution to RDF graph.""" # Institution URI inst_id = institution.get('id') if not inst_id: return inst_uri = URIRef(inst_id) # Determine RDF type based on institution_type inst_type = institution.get('institution_type', 'UNKNOWN') # Add multiple rdf:type declarations if inst_type == 'LIBRARY': graph.add((inst_uri, RDF.type, SCHEMA.Library)) graph.add((inst_uri, RDF.type, CPOV.PublicOrganisation)) elif inst_type == 'ARCHIVE': graph.add((inst_uri, RDF.type, SCHEMA.ArchiveOrganization)) graph.add((inst_uri, RDF.type, RICO.CorporateBody)) graph.add((inst_uri, RDF.type, CPOV.PublicOrganisation)) elif inst_type == 'MUSEUM': graph.add((inst_uri, RDF.type, SCHEMA.Museum)) graph.add((inst_uri, RDF.type, CPOV.PublicOrganisation)) else: graph.add((inst_uri, RDF.type, SCHEMA.Organization)) # Basic properties name = institution.get('name') if name: graph.add((inst_uri, SKOS.prefLabel, Literal(name, lang='da'))) graph.add((inst_uri, SCHEMA.name, Literal(name, lang='da'))) graph.add((inst_uri, RDFS.label, Literal(name, lang='da'))) # Alternative names alt_names = institution.get('alternative_names', []) if alt_names: for alt_name in alt_names: if alt_name: graph.add((inst_uri, SKOS.altLabel, Literal(alt_name, lang='da'))) graph.add((inst_uri, SCHEMA.alternateName, Literal(alt_name, lang='da'))) # Description description = institution.get('description') if description: graph.add((inst_uri, DCTERMS.description, Literal(description, lang='da'))) graph.add((inst_uri, SCHEMA.description, Literal(description, lang='da'))) # Identifiers identifiers = institution.get('identifiers', []) for identifier_data in identifiers: identifier = parse_identifier_string(identifier_data) if isinstance(identifier_data, str) else identifier_data if identifier and isinstance(identifier, dict): scheme = identifier.get('scheme') value = identifier.get('value') url = identifier.get('url') if scheme == 'ISIL': graph.add((inst_uri, DCTERMS.identifier, Literal(value))) graph.add((inst_uri, SCHEMA.identifier, Literal(value))) if url: graph.add((inst_uri, SCHEMA.sameAs, URIRef(url))) elif scheme == 'Wikidata': wikidata_uri = URIRef(f"http://www.wikidata.org/entity/{value}") graph.add((inst_uri, SCHEMA.sameAs, wikidata_uri)) graph.add((inst_uri, OWL.sameAs, wikidata_uri)) elif scheme == 'VIAF': viaf_uri = URIRef(f"http://viaf.org/viaf/{value}") graph.add((inst_uri, SCHEMA.sameAs, viaf_uri)) graph.add((inst_uri, OWL.sameAs, viaf_uri)) elif scheme == 'Website': if value: graph.add((inst_uri, SCHEMA.url, URIRef(value))) graph.add((inst_uri, FOAF.homepage, URIRef(value))) elif scheme == 'VIP-basen Library Number': graph.add((inst_uri, DCTERMS.identifier, Literal(value))) # GHCID identifiers ghcid = institution.get('ghcid_current') if ghcid: graph.add((inst_uri, HERITAGE.ghcid, Literal(ghcid))) graph.add((inst_uri, DCTERMS.identifier, Literal(ghcid))) ghcid_uuid = institution.get('ghcid_uuid') if ghcid_uuid: graph.add((inst_uri, HERITAGE.ghcidUUID, Literal(ghcid_uuid))) # Locations locations = institution.get('locations', []) for location_data in locations: location = parse_location_string(location_data) if isinstance(location_data, str) else location_data if location and isinstance(location, dict): # Create address node address_node = BNode() graph.add((inst_uri, SCHEMA.address, address_node)) graph.add((address_node, RDF.type, SCHEMA.PostalAddress)) city = location.get('city') if city: graph.add((address_node, SCHEMA.addressLocality, Literal(city))) street = location.get('street_address') if street: graph.add((address_node, SCHEMA.streetAddress, Literal(street))) postal = location.get('postal_code') if postal: graph.add((address_node, SCHEMA.postalCode, Literal(postal))) country = location.get('country') if country: graph.add((address_node, SCHEMA.addressCountry, Literal(country))) # Parent organization (hierarchical linking) parent_org = institution.get('parent_organization') if parent_org: parent_uri = URIRef(parent_org) graph.add((inst_uri, ORG.subOrganizationOf, parent_uri)) graph.add((inst_uri, SCHEMA.parentOrganization, parent_uri)) graph.add((parent_uri, ORG.hasSubOrganization, inst_uri)) # Provenance metadata provenance = institution.get('provenance') if provenance: prov_node = BNode() graph.add((inst_uri, PROV.wasGeneratedBy, prov_node)) graph.add((prov_node, RDF.type, PROV.Activity)) # Parse provenance if it's a string if isinstance(provenance, str): source_match = re.search(r"'data_source':\s* None: """Export Danish GLAM dataset to RDF.""" print(f"Loading dataset from {input_json}...") with open(input_json, 'r') as f: institutions = json.load(f) print(f"Creating RDF graph for {len(institutions)} institutions...") # Create graph g = Graph() # Bind namespaces g.bind('heritage', HERITAGE) g.bind('schema', SCHEMA) g.bind('cpov', CPOV) g.bind('tooi', TOOI) g.bind('org', ORG) g.bind('prov', PROV) g.bind('rico', RICO) g.bind('skos', SKOS) g.bind('dcterms', DCTERMS) g.bind('foaf', FOAF) g.bind('geonames', GEONAMES) g.bind('wikidata', WIKIDATA) g.bind('viaf', VIAF) g.bind('owl', OWL) # Add ontology metadata ontology_uri = URIRef("https://w3id.org/heritage/custodian/ontology") g.add((ontology_uri, RDF.type, OWL.Ontology)) g.add((ontology_uri, RDFS.label, Literal("Heritage Custodian Ontology"))) g.add((ontology_uri, DCTERMS.title, Literal("Heritage Custodian Ontology"))) g.add((ontology_uri, DCTERMS.description, Literal( "Ontology for describing heritage institutions (GLAM - Galleries, Libraries, Archives, Museums) worldwide" ))) # Export each institution for i, institution in enumerate(institutions, 1): if i % 100 == 0: print(f" Processed {i}/{len(institutions)} institutions...") export_institution_to_rdf(institution, g) print(f"\n✅ Graph contains {len(g)} triples") # Serialize to different formats formats = { 'ttl': ('turtle', '.ttl'), 'rdf': ('xml', '.rdf'), 'jsonld': ('json-ld', '.jsonld'), 'nt': ('nt', '.nt') } if format_type == 'all': for fmt, (rdf_format, ext) in formats.items(): output_file = output_dir / f"denmark_complete{ext}" print(f"\nSerializing to {fmt.upper()}...") g.serialize(destination=output_file, format=rdf_format) size_mb = output_file.stat().st_size / (1024 * 1024) print(f" ✅ Saved to {output_file} ({size_mb:.2f} MB)") else: rdf_format, ext = formats.get(format_type, ('turtle', '.ttl')) output_file = output_dir / f"denmark_complete{ext}" print(f"\nSerializing to {format_type.upper()}...") g.serialize(destination=output_file, format=rdf_format) size_mb = output_file.stat().st_size / (1024 * 1024) print(f" ✅ Saved to {output_file} ({size_mb:.2f} MB)") if __name__ == '__main__': input_json = Path('data/instances/denmark_complete_enriched.json') output_dir = Path('data/rdf') output_dir.mkdir(parents=True, exist_ok=True) print("=" * 60) print("Danish GLAM Dataset → RDF Export") print("=" * 60) # Export to all formats export_denmark_to_rdf(input_json, output_dir, format_type='all') print("\n" + "=" * 60) print("✅ RDF Export Complete") print("=" * 60)