#!/usr/bin/env python3
"""
Export Palestinian and Lebanese heritage institutions to JSON-LD and RDF formats.
This script reads the consolidated Palestinian heritage JSON and exports to:
- JSON-LD (with @context for linked data)
- Turtle (RDF)
- N-Triples (for bulk loading)
Usage:
python scripts/export_palestinian_rdf.py [--format jsonld|turtle|ntriples|all]
Output:
data/extracted/palestinian_heritage.jsonld
data/extracted/palestinian_heritage.ttl
data/extracted/palestinian_heritage.nt
"""
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional
# JSON-LD Context for heritage custodians
# JSON-LD @context shared by all exports: namespace prefixes, term-to-IRI
# property mappings, and class aliases referenced by TYPE_MAPPINGS.
JSONLD_CONTEXT = {
    "@context": {
        # Namespace prefixes
        "schema": "https://schema.org/",
        "wd": "http://www.wikidata.org/entity/",
        "wdt": "http://www.wikidata.org/prop/direct/",
        "viaf": "http://viaf.org/viaf/",
        "gnd": "https://d-nb.info/gnd/",
        "lcnaf": "http://id.loc.gov/authorities/names/",
        "geonames": "https://sws.geonames.org/",
        "ghcid": "https://w3id.org/heritage/custodian/",
        "geo": "http://www.w3.org/2003/01/geo/wgs84_pos#",
        "dct": "http://purl.org/dc/terms/",
        "skos": "http://www.w3.org/2004/02/skos/core#",
        "cpov": "http://data.europa.eu/m8g/",
        "prov": "http://www.w3.org/ns/prov#",
        # Properties
        "@id": "@id",
        "@type": "@type",
        "name": "schema:name",
        "nameArabic": {
            "@id": "schema:name",
            "@language": "ar"
        },
        "description": "schema:description",
        "url": "schema:url",
        "identifier": "schema:identifier",
        "latitude": "geo:lat",
        "longitude": "geo:long",
        "address": "schema:address",
        "city": "schema:addressLocality",
        "country": "schema:addressCountry",
        "region": "schema:addressRegion",
        # FIX: previously mapped to wdt:P31, which is Wikidata's
        # "instance of" property -- the wrong predicate for linking an
        # institution to its Wikidata entity. schema:sameAs matches what
        # the Turtle exporter emits; "@type": "@id" marks the value
        # (e.g. "wd:Q1234") as an IRI rather than a plain string.
        "wikidataId": {"@id": "schema:sameAs", "@type": "@id"},
        "viafId": "wdt:P214",   # P214 = VIAF ID
        "gndId": "wdt:P227",    # P227 = GND ID
        "isPartOf": "schema:isPartOf",
        "foundingDate": "schema:foundingDate",
        "dissolutionDate": "schema:dissolutionDate",
        "alternateName": "schema:alternateName",
        # Classes
        "HeritageInstitution": "cpov:PublicOrganisation",
        "Museum": "schema:Museum",
        "Library": "schema:Library",
        "Archive": "schema:ArchiveOrganization",
        "Gallery": "schema:CivicStructure",
        "ResearchCenter": "schema:ResearchOrganization",
    }
}
# Type mappings for JSON-LD
# Institution subtype -> JSON-LD @type list. Every subtype is typed as a
# generic HeritageInstitution plus one schema.org-aligned class alias
# declared in JSONLD_CONTEXT.
TYPE_MAPPINGS = {
    "museum": ["HeritageInstitution", "Museum"],
    "art_museum": ["HeritageInstitution", "Museum"],
    "archaeology_museum": ["HeritageInstitution", "Museum"],
    "library": ["HeritageInstitution", "Library"],
    "national_library": ["HeritageInstitution", "Library"],
    "public_library": ["HeritageInstitution", "Library"],
    "academic_library": ["HeritageInstitution", "Library"],
    "archive": ["HeritageInstitution", "Archive"],
    "oral_history_archive": ["HeritageInstitution", "Archive"],
    "photographic_archive": ["HeritageInstitution", "Archive"],
    "research_archive": ["HeritageInstitution", "Archive"],
    "municipal_archive": ["HeritageInstitution", "Archive"],
    "cultural_center": ["HeritageInstitution", "Gallery"],
    "theater": ["HeritageInstitution", "Gallery"],
    "research_institute": ["HeritageInstitution", "ResearchCenter"],
    "heritage_center": ["HeritageInstitution", "ResearchCenter"],
}


def convert_institution_to_jsonld(inst: Dict, base_uri: str = "https://w3id.org/heritage/custodian/") -> Dict:
    """Convert a single institution record to a JSON-LD graph node.

    Args:
        inst: Raw institution dict from the consolidated JSON. Recognized
            keys: "ghcid", "ghcid_uuid", "id", "subtype", "name",
            "name_arabic", "wikidata", "identifiers", "city", "country",
            "location", "coordinates", "website", "notes", "founded",
            "parent_wikidata", "parent_institution".
        base_uri: Namespace used to mint the node's @id.

    Returns:
        A dict suitable for inclusion in a JSON-LD @graph.
    """
    # Mint @id from the GHCID (dash segments become path segments) or fall
    # back to the local record id. `or ""` guards against an explicit null;
    # str() guards against a non-string local id.
    ghcid = inst.get("ghcid") or ""
    local_id = inst.get("id", "unknown")
    if ghcid:
        entity_id = f"{base_uri}{ghcid.lower().replace('-', '/')}"
    else:
        entity_id = f"{base_uri}{str(local_id).lower()}"

    # Unknown subtypes still get the generic HeritageInstitution type.
    subtype = inst.get("subtype", "archive")
    types = TYPE_MAPPINGS.get(subtype, ["HeritageInstitution"])

    entity = {
        "@id": entity_id,
        "@type": types,
        "name": inst.get("name"),
    }

    # Arabic name (mapped to schema:name with @language "ar" by the context).
    if inst.get("name_arabic"):
        entity["nameArabic"] = inst["name_arabic"]

    # Collect structured identifiers as schema:PropertyValue-style nodes.
    identifiers = []
    if ghcid:
        identifiers.append({
            "@type": "PropertyValue",
            "propertyID": "GHCID",
            "value": ghcid
        })
    if inst.get("ghcid_uuid"):
        identifiers.append({
            "@type": "PropertyValue",
            "propertyID": "GHCID-UUID",
            "value": inst["ghcid_uuid"]
        })

    # Wikidata link: both a direct wikidataId term and a PropertyValue entry.
    wikidata = inst.get("wikidata", {})
    if isinstance(wikidata, dict) and wikidata.get("id"):
        entity["wikidataId"] = f"wd:{wikidata['id']}"
        wd_identifier = {
            "@type": "PropertyValue",
            "propertyID": "Wikidata",
            "value": wikidata["id"],
        }
        # Only include the URL when present (previously emitted "url": null).
        if wikidata.get("url"):
            wd_identifier["url"] = wikidata["url"]
        identifiers.append(wd_identifier)

    # VIAF / GND authority identifiers. (LCNAF is declared in the context
    # but not yet populated here.)
    inst_identifiers = inst.get("identifiers", {})
    if isinstance(inst_identifiers, dict):
        if inst_identifiers.get("viaf"):
            entity["viafId"] = f"viaf:{inst_identifiers['viaf']}"
        if inst_identifiers.get("gnd"):
            entity["gndId"] = f"gnd:{inst_identifiers['gnd']}"
    if identifiers:
        entity["identifier"] = identifiers

    # Postal address -- emitted only when at least city or country is known.
    if inst.get("city") or inst.get("country"):
        address = {"@type": "PostalAddress"}
        if inst.get("city"):
            address["addressLocality"] = inst["city"]
        if inst.get("country"):
            address["addressCountry"] = inst["country"]
        if inst.get("location"):
            address["addressRegion"] = inst["location"]
        entity["address"] = address

    # Coordinates: compare against None so a 0.0 coordinate is kept
    # (the old truthiness check silently dropped it), and require BOTH
    # lat and lon before emitting either.
    coords = inst.get("coordinates", {})
    if (isinstance(coords, dict)
            and coords.get("lat") is not None
            and coords.get("lon") is not None):
        entity["latitude"] = coords["lat"]
        entity["longitude"] = coords["lon"]

    if inst.get("website"):
        entity["url"] = inst["website"]
    if inst.get("notes"):
        entity["description"] = inst["notes"]
    if inst.get("founded"):
        entity["foundingDate"] = inst["founded"]

    # Parent institution: prefer the Wikidata-linked parent, fall back to
    # the free-text parent name.
    parent_wikidata = inst.get("parent_wikidata", {})
    if isinstance(parent_wikidata, dict) and parent_wikidata.get("id"):
        entity["isPartOf"] = {
            "@id": f"wd:{parent_wikidata['id']}",
            "name": parent_wikidata.get("name")
        }
    elif inst.get("parent_institution"):
        entity["isPartOf"] = {"name": inst["parent_institution"]}

    return entity
def export_to_jsonld(data: Dict, output_path: Path) -> None:
    """Write the complete JSON-LD document (context + dataset node + institutions).

    Args:
        data: Consolidated heritage dict with "metadata" and "institutions".
        output_path: Destination file for the JSON-LD document.
    """
    # The graph opens with a dataset-level metadata node.
    meta = data["metadata"]
    graph: List[Dict] = [{
        "@id": "https://w3id.org/heritage/custodian/dataset/palestinian",
        "@type": "schema:Dataset",
        "name": meta["title"],
        "description": meta["description"],
        "version": meta["version"],
        "dateModified": meta["updated"],
        "license": "https://creativecommons.org/licenses/by-sa/4.0/"
    }]

    # Online-only platforms (ghcid_status == "skipped") are excluded.
    graph.extend(
        convert_institution_to_jsonld(inst)
        for inst in data.get("institutions", [])
        if inst.get("ghcid_status") != "skipped"
    )

    document = {**JSONLD_CONTEXT, "@graph": graph}
    with open(output_path, 'w', encoding='utf-8') as out:
        json.dump(document, out, ensure_ascii=False, indent=2)
    # Subtract 1 so the printed count excludes the dataset metadata node.
    print(f"Exported JSON-LD: {output_path} ({len(graph) - 1} institutions)")
def _turtle_escape(text: Any) -> str:
    """Escape backslashes and double quotes for a Turtle string literal."""
    return str(text).replace("\\", "\\\\").replace('"', '\\"')


def export_to_turtle(data: Dict, output_path: Path) -> None:
    """Export to Turtle RDF format.

    Fixes over the previous version:
    - @prefix declarations now carry their IRIs (the ``<...>`` parts were
      missing entirely, producing invalid Turtle).
    - Coordinates are emitted only when BOTH lat and lon are present
      (previously a present lat with a missing lon raised KeyError), and a
      0.0 coordinate is no longer dropped by a truthiness check.
    - Names are escaped so quotes/backslashes cannot break the literal.

    Args:
        data: Consolidated heritage dict with "metadata" and "institutions".
        output_path: Destination .ttl file.
    """
    institutions = data.get("institutions", [])

    lines = [
        "# Palestinian and Lebanese Heritage Institutions",
        f"# Version: {data['metadata']['version']}",
        f"# Generated: {datetime.now(timezone.utc).isoformat()}",
        "",
        # IRIs mirror the prefix table in JSONLD_CONTEXT.
        "@prefix schema: <https://schema.org/> .",
        "@prefix wd: <http://www.wikidata.org/entity/> .",
        "@prefix viaf: <http://viaf.org/viaf/> .",
        "@prefix gnd: <https://d-nb.info/gnd/> .",
        "@prefix ghcid: <https://w3id.org/heritage/custodian/> .",
        "@prefix geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> .",
        "@prefix cpov: <http://data.europa.eu/m8g/> .",
        "@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .",
        "",
    ]

    for inst in institutions:
        if inst.get("ghcid_status") == "skipped":
            continue  # online-only platform
        ghcid = inst.get("ghcid") or ""
        if not ghcid:
            continue  # Turtle export requires a mintable GHCID URI

        entity_uri = f"ghcid:{ghcid.lower().replace('-', '/')}"
        subtype = inst.get("subtype", "archive")
        # Coarse schema.org class by subtype substring; default is archive.
        if "museum" in subtype:
            schema_class = "Museum"
        elif "library" in subtype:
            schema_class = "Library"
        else:
            schema_class = "ArchiveOrganization"

        lines.append(f"# {inst.get('name')}")
        lines.append(entity_uri)
        lines.append(f'    a cpov:PublicOrganisation, schema:{schema_class} ;')
        lines.append(f'    schema:name "{_turtle_escape(inst.get("name"))}"@en ;')
        if inst.get("name_arabic"):
            lines.append(f'    schema:name "{_turtle_escape(inst["name_arabic"])}"@ar ;')
        if inst.get("website"):
            lines.append(f'    schema:url <{inst["website"]}> ;')
        # Require BOTH coordinates; `is not None` keeps a legitimate 0.0.
        coords = inst.get("coordinates", {})
        if (isinstance(coords, dict)
                and coords.get("lat") is not None
                and coords.get("lon") is not None):
            lines.append(f'    geo:lat "{coords["lat"]}"^^xsd:decimal ;')
            lines.append(f'    geo:long "{coords["lon"]}"^^xsd:decimal ;')
        wikidata = inst.get("wikidata", {})
        if isinstance(wikidata, dict) and wikidata.get("id"):
            lines.append(f'    schema:sameAs wd:{wikidata["id"]} ;')

        # Every property line above ends with " ;"; terminate the final
        # one with "." to close the subject's statement block.
        lines[-1] = lines[-1][:-2] + ' .'
        lines.append("")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
    print(f"Exported Turtle: {output_path}")
def main():
    """CLI entry point: parse arguments, load the consolidated JSON, export.

    Returns:
        Process exit code: 0 on success, 1 if the input file is missing.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Export Palestinian heritage data to RDF formats")
    parser.add_argument("--format", choices=["jsonld", "turtle", "all"], default="all",
                        help="Output format (default: all)")
    args = parser.parse_args()

    # Input lives in data/extracted/ relative to the repository root
    # (this script sits in scripts/).
    data_file = Path(__file__).parent.parent / "data" / "extracted" / "palestinian_heritage_consolidated.json"
    if not data_file.exists():
        print(f"Error: Data file not found: {data_file}")
        return 1

    print(f"Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Outputs are written next to the input file.
    output_dir = data_file.parent
    wanted = {"jsonld", "turtle"} if args.format == "all" else {args.format}
    if "jsonld" in wanted:
        export_to_jsonld(data, output_dir / "palestinian_heritage.jsonld")
    if "turtle" in wanted:
        export_to_turtle(data, output_dir / "palestinian_heritage.ttl")

    print("\nExport complete!")
    return 0


if __name__ == "__main__":
    sys.exit(main())