#!/usr/bin/env python3
"""
Export Palestinian and Lebanese heritage institutions to JSON-LD and RDF formats.

This script reads the consolidated Palestinian heritage JSON and exports to:
- JSON-LD (with @context for linked data)
- Turtle (RDF)

N-Triples export (for bulk loading) is NOT implemented by this script:
``--format`` accepts only ``jsonld``, ``turtle`` and ``all``.

Usage:
    python scripts/export_palestinian_rdf.py [--format jsonld|turtle|all]

Output:
    data/extracted/palestinian_heritage.jsonld
    data/extracted/palestinian_heritage.ttl
"""

import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional

# JSON-LD Context for heritage custodians
JSONLD_CONTEXT = {
    "@context": {
        # Prefixes
        "schema": "https://schema.org/",
        "wd": "http://www.wikidata.org/entity/",
        "wdt": "http://www.wikidata.org/prop/direct/",
        "viaf": "http://viaf.org/viaf/",
        "gnd": "https://d-nb.info/gnd/",
        "lcnaf": "http://id.loc.gov/authorities/names/",
        "geonames": "https://sws.geonames.org/",
        "ghcid": "https://w3id.org/heritage/custodian/",
        "geo": "http://www.w3.org/2003/01/geo/wgs84_pos#",
        "dct": "http://purl.org/dc/terms/",
        "skos": "http://www.w3.org/2004/02/skos/core#",
        "cpov": "http://data.europa.eu/m8g/",
        "prov": "http://www.w3.org/ns/prov#",
        # Properties
        # NOTE: "@id" and "@type" are JSON-LD keywords and are deliberately
        # not listed here; redefining keywords makes processors reject the
        # context.
        "name": "schema:name",
        "nameArabic": {"@id": "schema:name", "@language": "ar"},
        "description": "schema:description",
        "url": "schema:url",
        "identifier": "schema:identifier",
        "latitude": "geo:lat",
        "longitude": "geo:long",
        "address": "schema:address",
        "city": "schema:addressLocality",
        "country": "schema:addressCountry",
        "region": "schema:addressRegion",
        # The value is the Wikidata entity itself ("wd:Q..."), so it is an
        # identity link, matching the schema:sameAs used by the Turtle export.
        "wikidataId": {"@id": "schema:sameAs", "@type": "@id"},
        "viafId": "wdt:P214",
        "gndId": "wdt:P227",
        "isPartOf": "schema:isPartOf",
        "foundingDate": "schema:foundingDate",
        "dissolutionDate": "schema:dissolutionDate",
        "alternateName": "schema:alternateName",
        # Terms used inside nested nodes and the dataset record; without a
        # definition they would be dropped when the document is expanded.
        "propertyID": "schema:propertyID",
        "value": "schema:value",
        "addressLocality": "schema:addressLocality",
        "addressCountry": "schema:addressCountry",
        "addressRegion": "schema:addressRegion",
        "version": "schema:version",
        "dateModified": "schema:dateModified",
        "license": {"@id": "schema:license", "@type": "@id"},
        # Classes
        "HeritageInstitution": "cpov:PublicOrganisation",
        "Museum": "schema:Museum",
        "Library": "schema:Library",
        "Archive": "schema:ArchiveOrganization",
        "Gallery": "schema:CivicStructure",
        "ResearchCenter": "schema:ResearchOrganization",
        "PropertyValue": "schema:PropertyValue",
        "PostalAddress": "schema:PostalAddress",
    }
}

# Type mappings for JSON-LD
TYPE_MAPPINGS = {
    "museum": ["HeritageInstitution", "Museum"],
    "art_museum": ["HeritageInstitution", "Museum"],
    "archaeology_museum": ["HeritageInstitution", "Museum"],
    "library": ["HeritageInstitution", "Library"],
    "national_library": ["HeritageInstitution", "Library"],
    "public_library": ["HeritageInstitution", "Library"],
    "academic_library": ["HeritageInstitution", "Library"],
    "archive": ["HeritageInstitution", "Archive"],
    "oral_history_archive": ["HeritageInstitution", "Archive"],
    "photographic_archive": ["HeritageInstitution", "Archive"],
    "research_archive": ["HeritageInstitution", "Archive"],
    "municipal_archive": ["HeritageInstitution", "Archive"],
    "cultural_center": ["HeritageInstitution", "Gallery"],
    "theater": ["HeritageInstitution", "Gallery"],
    "research_institute": ["HeritageInstitution", "ResearchCenter"],
    "heritage_center": ["HeritageInstitution", "ResearchCenter"],
}

# Namespace IRIs declared at the top of the Turtle output.
_TURTLE_PREFIXES = {
    "schema": "https://schema.org/",
    "wd": "http://www.wikidata.org/entity/",
    "viaf": "http://viaf.org/viaf/",
    "gnd": "https://d-nb.info/gnd/",
    "ghcid": "https://w3id.org/heritage/custodian/",
    "geo": "http://www.w3.org/2003/01/geo/wgs84_pos#",
    "cpov": "http://data.europa.eu/m8g/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
}

# Characters that must be escaped inside a double-quoted Turtle literal.
_TURTLE_ESCAPES = str.maketrans({
    "\\": "\\\\",
    '"': '\\"',
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
})

# Characters that may not appear literally between "<" and ">" in Turtle.
_IRI_FORBIDDEN = frozenset('<>"{}|^`\\')


def _has_coordinate(value: Any) -> bool:
    """Return True when *value* is a usable coordinate (0 counts, None/"" do not)."""
    return value is not None and value != ""


def _turtle_literal(value: Any, suffix: str = "") -> str:
    """Return *value* as an escaped, double-quoted Turtle literal plus *suffix*."""
    return f'"{str(value).translate(_TURTLE_ESCAPES)}"{suffix}'


def _turtle_iri(iri: str) -> str:
    """Return *iri* wrapped in angle brackets, percent-encoding illegal characters."""
    cleaned = "".join(
        f"%{ord(ch):02X}" if ch in _IRI_FORBIDDEN or ord(ch) <= 0x20 else ch
        for ch in str(iri).strip()
    )
    return f"<{cleaned}>"


def _turtle_class_for_subtype(subtype: str) -> str:
    """Return the prefixed RDF class for *subtype*, agreeing with TYPE_MAPPINGS."""
    mapped = TYPE_MAPPINGS.get(subtype)
    if mapped and len(mapped) > 1:
        # The second entry is the specific class term; the context holds
        # its prefixed name (e.g. "Gallery" -> "schema:CivicStructure").
        return JSONLD_CONTEXT["@context"][mapped[-1]]
    # Unmapped subtypes keep the substring heuristic.
    if "museum" in subtype:
        return "schema:Museum"
    if "library" in subtype:
        return "schema:Library"
    return "schema:ArchiveOrganization"


def convert_institution_to_jsonld(
    inst: Dict, base_uri: str = "https://w3id.org/heritage/custodian/"
) -> Dict:
    """Convert a single institution to JSON-LD format.

    Args:
        inst: One institution record from the consolidated JSON.
        base_uri: Namespace the entity ``@id`` is minted in.

    Returns:
        A JSON-LD node using the terms defined in ``JSONLD_CONTEXT``.
    """
    # Generate @id from GHCID or fall back to local ID
    ghcid = inst.get("ghcid", "")
    local_id = inst.get("id", "unknown")
    if ghcid:
        entity_id = f"{base_uri}{ghcid.lower().replace('-', '/')}"
    else:
        entity_id = f"{base_uri}{str(local_id).lower()}"

    # Determine @type
    subtype = inst.get("subtype", "archive")
    types = TYPE_MAPPINGS.get(subtype, ["HeritageInstitution"])

    # Build JSON-LD entity
    entity = {
        "@id": entity_id,
        "@type": types,
        "name": inst.get("name"),
    }

    # Add Arabic name if available
    if inst.get("name_arabic"):
        entity["nameArabic"] = inst["name_arabic"]

    # Add identifiers
    identifiers = []

    # GHCID
    if ghcid:
        identifiers.append({
            "@type": "PropertyValue",
            "propertyID": "GHCID",
            "value": ghcid,
        })

    # GHCID UUID
    if inst.get("ghcid_uuid"):
        identifiers.append({
            "@type": "PropertyValue",
            "propertyID": "GHCID-UUID",
            "value": inst["ghcid_uuid"],
        })

    # Wikidata
    wikidata = inst.get("wikidata", {})
    if isinstance(wikidata, dict) and wikidata.get("id"):
        entity["wikidataId"] = f"wd:{wikidata['id']}"
        identifiers.append({
            "@type": "PropertyValue",
            "propertyID": "Wikidata",
            "value": wikidata["id"],
            "url": wikidata.get("url"),
        })

    # VIAF and GND (LCNAF has a prefix in the context but no property here)
    inst_identifiers = inst.get("identifiers", {})
    if isinstance(inst_identifiers, dict):
        if inst_identifiers.get("viaf"):
            entity["viafId"] = f"viaf:{inst_identifiers['viaf']}"
        if inst_identifiers.get("gnd"):
            entity["gndId"] = f"gnd:{inst_identifiers['gnd']}"

    if identifiers:
        entity["identifier"] = identifiers

    # Location
    if inst.get("city") or inst.get("country"):
        address = {"@type": "PostalAddress"}
        if inst.get("city"):
            address["addressLocality"] = inst["city"]
        if inst.get("country"):
            address["addressCountry"] = inst["country"]
        if inst.get("location"):
            address["addressRegion"] = inst["location"]
        entity["address"] = address

    # Coordinates: test for presence, not truthiness, so 0.0 is kept
    coords = inst.get("coordinates", {})
    if isinstance(coords, dict):
        lat, lon = coords.get("lat"), coords.get("lon")
        if _has_coordinate(lat) and _has_coordinate(lon):
            entity["latitude"] = lat
            entity["longitude"] = lon

    # Website
    if inst.get("website"):
        entity["url"] = inst["website"]

    # Description / notes
    if inst.get("notes"):
        entity["description"] = inst["notes"]

    # Founding date
    if inst.get("founded"):
        entity["foundingDate"] = inst["founded"]

    # Parent institution
    parent_wikidata = inst.get("parent_wikidata", {})
    if isinstance(parent_wikidata, dict) and parent_wikidata.get("id"):
        entity["isPartOf"] = {
            "@id": f"wd:{parent_wikidata['id']}",
            "name": parent_wikidata.get("name"),
        }
    elif inst.get("parent_institution"):
        entity["isPartOf"] = {"name": inst["parent_institution"]}

    return entity


def export_to_jsonld(data: Dict, output_path: Path) -> None:
    """Export to JSON-LD format.

    Writes a document holding the shared context and an ``@graph`` whose
    first node describes the dataset and whose remaining nodes are the
    institutions. Records with ``ghcid_status == "skipped"`` are left out.

    Raises:
        KeyError: If ``data["metadata"]`` or one of its title, description,
            version or updated fields is missing.
    """
    institutions = data.get("institutions", [])

    # Build JSON-LD document
    jsonld_doc = {**JSONLD_CONTEXT, "@graph": []}

    # Add metadata
    jsonld_doc["@graph"].append({
        "@id": "https://w3id.org/heritage/custodian/dataset/palestinian",
        "@type": "schema:Dataset",
        "name": data["metadata"]["title"],
        "description": data["metadata"]["description"],
        "version": data["metadata"]["version"],
        "dateModified": data["metadata"]["updated"],
        "license": "https://creativecommons.org/licenses/by-sa/4.0/",
    })

    # Convert each institution
    for inst in institutions:
        if inst.get("ghcid_status") == "skipped":
            continue  # Skip online-only platforms
        entity = convert_institution_to_jsonld(inst)
        jsonld_doc["@graph"].append(entity)

    # Write output
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(jsonld_doc, f, ensure_ascii=False, indent=2)

    # The first graph node is the dataset record, hence the "- 1".
    print(f"Exported JSON-LD: {output_path} ({len(jsonld_doc['@graph']) - 1} institutions)")


def export_to_turtle(data: Dict, output_path: Path) -> None:
    """Export to Turtle RDF format.

    Only institutions that have a GHCID and are not marked
    ``ghcid_status == "skipped"`` are written. Subjects are emitted as full
    IRIs because a GHCID path contains "/", which is not allowed unescaped in
    the local part of a prefixed name.

    Raises:
        KeyError: If ``data["metadata"]["version"]`` is missing.
    """
    institutions = data.get("institutions", [])

    # Build Turtle output
    lines = [
        "# Palestinian and Lebanese Heritage Institutions",
        f"# Version: {data['metadata']['version']}",
        f"# Generated: {datetime.now(timezone.utc).isoformat()}",
        "",
    ]
    lines.extend(
        f"@prefix {prefix}: <{iri}> ." for prefix, iri in _TURTLE_PREFIXES.items()
    )
    lines.append("")

    for inst in institutions:
        if inst.get("ghcid_status") == "skipped":
            continue
        ghcid = inst.get("ghcid", "")
        if not ghcid:
            continue

        # Generate URI
        entity_uri = _turtle_iri(
            _TURTLE_PREFIXES["ghcid"] + ghcid.lower().replace('-', '/')
        )
        subtype = inst.get("subtype") or "archive"
        name = inst.get("name")

        # Predicate/object pairs; joined with " ;" and closed with " ." below.
        statements = [
            f"a cpov:PublicOrganisation, {_turtle_class_for_subtype(subtype)}"
        ]
        if name:
            statements.append(f"schema:name {_turtle_literal(name, '@en')}")
        if inst.get("name_arabic"):
            statements.append(
                f"schema:name {_turtle_literal(inst['name_arabic'], '@ar')}"
            )
        if inst.get("website"):
            statements.append(f"schema:url {_turtle_iri(inst['website'])}")

        coords = inst.get("coordinates", {})
        if isinstance(coords, dict):
            lat, lon = coords.get("lat"), coords.get("lon")
            # Emit the pair only when both are present.
            if _has_coordinate(lat) and _has_coordinate(lon):
                statements.append(
                    f"geo:lat {_turtle_literal(lat, '^^xsd:decimal')}"
                )
                statements.append(
                    f"geo:long {_turtle_literal(lon, '^^xsd:decimal')}"
                )

        wikidata = inst.get("wikidata", {})
        if isinstance(wikidata, dict) and wikidata.get("id"):
            statements.append(f"schema:sameAs wd:{wikidata['id']}")

        # Collapse whitespace so a multi-line name cannot escape the comment.
        lines.append(f"# {' '.join(str(name or ghcid).split())}")
        lines.append(entity_uri)
        lines.append("    " + " ;\n    ".join(statements) + " .")
        lines.append("")

    # Write output
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))

    print(f"Exported Turtle: {output_path}")


def main() -> int:
    """Parse the command line and run the requested exports.

    Returns:
        0 on success, 1 when the consolidated input file does not exist.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Export Palestinian heritage data to RDF formats"
    )
    parser.add_argument(
        "--format",
        choices=["jsonld", "turtle", "all"],
        default="all",
        help="Output format (default: all)",
    )
    args = parser.parse_args()

    # Load data
    data_file = (
        Path(__file__).parent.parent
        / "data"
        / "extracted"
        / "palestinian_heritage_consolidated.json"
    )
    if not data_file.exists():
        print(f"Error: Data file not found: {data_file}", file=sys.stderr)
        return 1

    print(f"Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    output_dir = data_file.parent

    # Export to requested formats
    if args.format in ["jsonld", "all"]:
        export_to_jsonld(data, output_dir / "palestinian_heritage.jsonld")
    if args.format in ["turtle", "all"]:
        export_to_turtle(data, output_dir / "palestinian_heritage.ttl")

    print("\nExport complete!")
    return 0


if __name__ == "__main__":
    sys.exit(main())