#!/usr/bin/env python3
"""
Export Palestinian and Lebanese heritage institutions to JSON-LD and RDF formats.

This script reads the consolidated Palestinian heritage JSON and exports to:
- JSON-LD (with @context for linked data)
- Turtle (RDF)
- N-Triples (for bulk loading)

Usage:
    python scripts/export_palestinian_rdf.py [--format jsonld|turtle|ntriples|all]

Output:
    data/extracted/palestinian_heritage.jsonld
    data/extracted/palestinian_heritage.ttl
    data/extracted/palestinian_heritage.nt
"""
|
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional

# JSON-LD @context shared by the exported document: prefix declarations,
# property shorthands, and class aliases for heritage custodians.
JSONLD_CONTEXT = {
    "@context": {
        # Prefixes
        "schema": "https://schema.org/",
        "wd": "http://www.wikidata.org/entity/",
        "wdt": "http://www.wikidata.org/prop/direct/",
        "viaf": "http://viaf.org/viaf/",
        "gnd": "https://d-nb.info/gnd/",
        "lcnaf": "http://id.loc.gov/authorities/names/",
        "geonames": "https://sws.geonames.org/",
        "ghcid": "https://w3id.org/heritage/custodian/",
        "geo": "http://www.w3.org/2003/01/geo/wgs84_pos#",
        "dct": "http://purl.org/dc/terms/",
        "skos": "http://www.w3.org/2004/02/skos/core#",
        "cpov": "http://data.europa.eu/m8g/",
        "prov": "http://www.w3.org/ns/prov#",

        # Properties
        "@id": "@id",
        "@type": "@type",
        "name": "schema:name",
        "nameArabic": {
            "@id": "schema:name",
            "@language": "ar"
        },
        "description": "schema:description",
        "url": "schema:url",
        "identifier": "schema:identifier",
        "latitude": "geo:lat",
        "longitude": "geo:long",
        "address": "schema:address",
        "city": "schema:addressLocality",
        "country": "schema:addressCountry",
        "region": "schema:addressRegion",
        # FIX: was "wdt:P31" ("instance of"), which is the wrong predicate
        # for linking a custodian to its Wikidata entity. Use schema:sameAs
        # (consistent with the Turtle export below) and expand values as IRIs.
        "wikidataId": {"@id": "schema:sameAs", "@type": "@id"},
        "viafId": "wdt:P214",
        "gndId": "wdt:P227",
        "isPartOf": "schema:isPartOf",
        "foundingDate": "schema:foundingDate",
        "dissolutionDate": "schema:dissolutionDate",
        "alternateName": "schema:alternateName",

        # Classes
        "HeritageInstitution": "cpov:PublicOrganisation",
        "Museum": "schema:Museum",
        "Library": "schema:Library",
        "Archive": "schema:ArchiveOrganization",
        "Gallery": "schema:CivicStructure",
        "ResearchCenter": "schema:ResearchOrganization",
    }
}

# Maps the consolidated JSON "subtype" field to JSON-LD @type lists.
TYPE_MAPPINGS = {
    "museum": ["HeritageInstitution", "Museum"],
    "art_museum": ["HeritageInstitution", "Museum"],
    "archaeology_museum": ["HeritageInstitution", "Museum"],
    "library": ["HeritageInstitution", "Library"],
    "national_library": ["HeritageInstitution", "Library"],
    "public_library": ["HeritageInstitution", "Library"],
    "academic_library": ["HeritageInstitution", "Library"],
    "archive": ["HeritageInstitution", "Archive"],
    "oral_history_archive": ["HeritageInstitution", "Archive"],
    "photographic_archive": ["HeritageInstitution", "Archive"],
    "research_archive": ["HeritageInstitution", "Archive"],
    "municipal_archive": ["HeritageInstitution", "Archive"],
    "cultural_center": ["HeritageInstitution", "Gallery"],
    "theater": ["HeritageInstitution", "Gallery"],
    "research_institute": ["HeritageInstitution", "ResearchCenter"],
    "heritage_center": ["HeritageInstitution", "ResearchCenter"],
}
|
def convert_institution_to_jsonld(inst: Dict, base_uri: str = "https://w3id.org/heritage/custodian/") -> Dict:
    """Convert a single institution record to a JSON-LD node.

    Args:
        inst: Institution record from the consolidated heritage JSON.
        base_uri: Namespace used to mint the node's @id.

    Returns:
        A dict ready to be appended to the document's @graph.
    """
    # Mint the @id from the GHCID (dashes become path segments), or fall
    # back to the record's local ID.
    ghcid = inst.get("ghcid", "")
    local_id = inst.get("id", "unknown")

    if ghcid:
        entity_id = f"{base_uri}{ghcid.lower().replace('-', '/')}"
    else:
        entity_id = f"{base_uri}{local_id.lower()}"

    # Map subtype to @type; unknown subtypes get the generic class only.
    subtype = inst.get("subtype", "archive")
    types = TYPE_MAPPINGS.get(subtype, ["HeritageInstitution"])

    # Build JSON-LD entity
    entity = {
        "@id": entity_id,
        "@type": types,
        "name": inst.get("name"),
    }

    # Add Arabic name if available
    if inst.get("name_arabic"):
        entity["nameArabic"] = inst["name_arabic"]

    # Collect structured identifiers (schema.org PropertyValue pattern).
    identifiers = []

    # GHCID
    if ghcid:
        identifiers.append({
            "@type": "PropertyValue",
            "propertyID": "GHCID",
            "value": ghcid
        })

    # GHCID UUID
    if inst.get("ghcid_uuid"):
        identifiers.append({
            "@type": "PropertyValue",
            "propertyID": "GHCID-UUID",
            "value": inst["ghcid_uuid"]
        })

    # Wikidata
    wikidata = inst.get("wikidata", {})
    if isinstance(wikidata, dict) and wikidata.get("id"):
        entity["wikidataId"] = f"wd:{wikidata['id']}"
        wd_identifier = {
            "@type": "PropertyValue",
            "propertyID": "Wikidata",
            "value": wikidata["id"],
        }
        # BUG FIX: only attach the URL when present — previously a missing
        # URL serialized as a literal null in the output JSON.
        if wikidata.get("url"):
            wd_identifier["url"] = wikidata["url"]
        identifiers.append(wd_identifier)

    # VIAF, GND (LCNAF has a prefix in the context but no record field yet)
    inst_identifiers = inst.get("identifiers", {})
    if isinstance(inst_identifiers, dict):
        if inst_identifiers.get("viaf"):
            entity["viafId"] = f"viaf:{inst_identifiers['viaf']}"
        if inst_identifiers.get("gnd"):
            entity["gndId"] = f"gnd:{inst_identifiers['gnd']}"

    if identifiers:
        entity["identifier"] = identifiers

    # Postal address (only when at least one locality field exists)
    if inst.get("city") or inst.get("country"):
        address = {
            "@type": "PostalAddress"
        }
        if inst.get("city"):
            address["addressLocality"] = inst["city"]
        if inst.get("country"):
            address["addressCountry"] = inst["country"]
        if inst.get("location"):
            address["addressRegion"] = inst["location"]
        entity["address"] = address

    # Coordinates.
    # BUG FIX: 0 is a valid latitude/longitude but is falsy — compare
    # against None instead of relying on truthiness.
    coords = inst.get("coordinates", {})
    if isinstance(coords, dict) and coords.get("lat") is not None and coords.get("lon") is not None:
        entity["latitude"] = coords["lat"]
        entity["longitude"] = coords["lon"]

    # Website
    if inst.get("website"):
        entity["url"] = inst["website"]

    # Description / notes
    if inst.get("notes"):
        entity["description"] = inst["notes"]

    # Founding date
    if inst.get("founded"):
        entity["foundingDate"] = inst["founded"]

    # Parent institution: prefer the Wikidata-linked parent, fall back to
    # the free-text parent name.
    parent_wikidata = inst.get("parent_wikidata", {})
    if isinstance(parent_wikidata, dict) and parent_wikidata.get("id"):
        entity["isPartOf"] = {
            "@id": f"wd:{parent_wikidata['id']}",
            "name": parent_wikidata.get("name")
        }
    elif inst.get("parent_institution"):
        entity["isPartOf"] = {
            "name": inst["parent_institution"]
        }

    return entity
|
def export_to_jsonld(data: Dict, output_path: Path) -> None:
    """Serialize the dataset to a JSON-LD document with an @graph of institutions."""
    # The first graph node describes the dataset itself.
    graph = [{
        "@id": "https://w3id.org/heritage/custodian/dataset/palestinian",
        "@type": "schema:Dataset",
        "name": data["metadata"]["title"],
        "description": data["metadata"]["description"],
        "version": data["metadata"]["version"],
        "dateModified": data["metadata"]["updated"],
        "license": "https://creativecommons.org/licenses/by-sa/4.0/"
    }]

    # Online-only platforms carry ghcid_status == "skipped" and are excluded.
    graph.extend(
        convert_institution_to_jsonld(inst)
        for inst in data.get("institutions", [])
        if inst.get("ghcid_status") != "skipped"
    )

    jsonld_doc = {**JSONLD_CONTEXT, "@graph": graph}

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(jsonld_doc, f, ensure_ascii=False, indent=2)

    # Subtract one for the dataset-metadata node when reporting the count.
    print(f"Exported JSON-LD: {output_path} ({len(graph) - 1} institutions)")
|
def export_to_turtle(data: Dict, output_path: Path) -> None:
    """Export the dataset to Turtle RDF.

    Only institutions with an assigned GHCID are serialized; records whose
    ghcid_status is "skipped" (online-only platforms) are excluded.
    """

    def ttl_escape(text) -> str:
        # Escape backslashes first, then double quotes, so names cannot
        # break out of the Turtle string literal.
        return str(text).replace('\\', '\\\\').replace('"', '\\"')

    institutions = data.get("institutions", [])

    # File header: provenance comments followed by prefix declarations.
    lines = [
        "# Palestinian and Lebanese Heritage Institutions",
        f"# Version: {data['metadata']['version']}",
        f"# Generated: {datetime.now(timezone.utc).isoformat()}",
        "",
        "@prefix schema: <https://schema.org/> .",
        "@prefix wd: <http://www.wikidata.org/entity/> .",
        "@prefix viaf: <http://viaf.org/viaf/> .",
        "@prefix gnd: <https://d-nb.info/gnd/> .",
        "@prefix ghcid: <https://w3id.org/heritage/custodian/> .",
        "@prefix geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> .",
        "@prefix cpov: <http://data.europa.eu/m8g/> .",
        "@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .",
        "",
    ]

    for inst in institutions:
        if inst.get("ghcid_status") == "skipped":
            continue

        ghcid = inst.get("ghcid", "")
        if not ghcid:
            continue

        # Mint the subject URI from the GHCID (dashes become path segments).
        entity_uri = f"ghcid:{ghcid.lower().replace('-', '/')}"
        subtype = inst.get("subtype", "archive")

        lines.append(f"# {inst.get('name')}")
        lines.append(f"{entity_uri}")
        lines.append(f'    a cpov:PublicOrganisation, schema:{"Museum" if "museum" in subtype else "Library" if "library" in subtype else "ArchiveOrganization"} ;')
        # BUG FIX: literal values are now escaped so quotes/backslashes in
        # institution names cannot produce invalid Turtle.
        lines.append(f'    schema:name "{ttl_escape(inst.get("name"))}"@en ;')

        if inst.get("name_arabic"):
            lines.append(f'    schema:name "{ttl_escape(inst["name_arabic"])}"@ar ;')

        if inst.get("website"):
            lines.append(f'    schema:url <{inst["website"]}> ;')

        # BUG FIX: 0.0 is falsy but a valid coordinate, and a record with
        # lat but no lon previously raised KeyError — require both explicitly.
        coords = inst.get("coordinates", {})
        if isinstance(coords, dict) and coords.get("lat") is not None and coords.get("lon") is not None:
            lines.append(f'    geo:lat "{coords["lat"]}"^^xsd:decimal ;')
            lines.append(f'    geo:long "{coords["lon"]}"^^xsd:decimal ;')

        wikidata = inst.get("wikidata", {})
        if isinstance(wikidata, dict) and wikidata.get("id"):
            lines.append(f'    schema:sameAs wd:{wikidata["id"]} ;')

        # Terminate the triple block: swap the trailing " ;" for " .".
        if lines[-1].endswith(' ;'):
            lines[-1] = lines[-1][:-2] + ' .'
        else:
            lines.append('    .')

        lines.append("")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))

    print(f"Exported Turtle: {output_path}")
|
def main():
    """CLI entry point: load the consolidated JSON and run the requested exports.

    Returns 0 on success, 1 if the input data file is missing.
    """
    import argparse

    arg_parser = argparse.ArgumentParser(description="Export Palestinian heritage data to RDF formats")
    arg_parser.add_argument("--format", choices=["jsonld", "turtle", "all"], default="all",
                            help="Output format (default: all)")
    options = arg_parser.parse_args()

    # Input lives alongside the outputs under data/extracted/, relative to
    # this script's parent directory.
    source = Path(__file__).parent.parent / "data" / "extracted" / "palestinian_heritage_consolidated.json"
    if not source.exists():
        print(f"Error: Data file not found: {source}")
        return 1

    print(f"Loading: {source}")
    with open(source, 'r', encoding='utf-8') as fh:
        dataset = json.load(fh)

    target_dir = source.parent

    # Run each exporter selected by --format.
    if options.format in ("jsonld", "all"):
        export_to_jsonld(dataset, target_dir / "palestinian_heritage.jsonld")
    if options.format in ("turtle", "all"):
        export_to_turtle(dataset, target_dir / "palestinian_heritage.ttl")

    print("\nExport complete!")
    return 0
|
# Exit with main()'s return code (0 on success, 1 if the data file is missing).
if __name__ == "__main__":
    sys.exit(main())
|