#!/usr/bin/env python3
"""Create LinkML-compliant YAML instance files from Palestinian heritage data.

This script transforms the enriched Palestinian heritage consolidated JSON into
proper LinkML YAML instances following the Heritage Custodian Ontology schema.

Output:
- data/instances/palestinian_heritage_custodians.yaml (CustodianObservation instances)
- data/instances/palestinian_heritage_identifiers.yaml (CustodianIdentifier instances)
"""

import json
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Institution type mapping from GLAM-NER to LinkML CustodianType
TYPE_MAPPING = {
    "GRP.HER.MUS": "MUSEUM",
    "GRP.HER.LIB": "LIBRARY",
    "GRP.HER": "ARCHIVE",  # Default heritage group to archive
    "GRP.HER.ARC": "ARCHIVE",
    "GRP.EDU": "EDUCATION_PROVIDER",
    "GRP.EDU.UNI": "EDUCATION_PROVIDER",
    "GRP.RES": "RESEARCH_CENTER",
    "GRP.NPO": "NGO",
    "LOC.HOLY": "HOLY_SITES",
    "LOC.FEAT": "FEATURES",
}

# Subtype refinements -- consulted before TYPE_MAPPING for a more specific match
SUBTYPE_MAPPING = {
    "oral_history_archive": "ARCHIVE",
    "photographic_archive": "ARCHIVE",
    "research_archive": "ARCHIVE",
    "research_institute": "RESEARCH_CENTER",
    "archival_network": "COLLECTING_SOCIETY",
    "family_archives": "ARCHIVE",
    "heritage_center": "RESEARCH_CENTER",
    "library": "LIBRARY",
    "museum": "MUSEUM",
    "media_center": "ARCHIVE",
    "dance_archive": "ARCHIVE",
    "film_archive": "ARCHIVE",
    "university_library": "LIBRARY",
    "church_archive": "HOLY_SITES",
    "national_archive": "ARCHIVE",
    "online_archive": "DIGITAL_PLATFORM",
    "documentation_center": "ARCHIVE",
    "memorial_site": "FEATURES",
    "memory_project": "ARCHIVE",
}


def map_institution_type(inst_type: str, subtype: str | None) -> str:
    """Map institution type to LinkML CustodianType enum value.

    Args:
        inst_type: GLAM-NER main type code (e.g. "GRP.HER.MUS").
        subtype: Optional finer-grained subtype (e.g. "film_archive").

    Returns:
        A CustodianType enum value string, or "UNKNOWN" when nothing matches.
    """
    # Check subtype first for a more specific mapping
    if subtype and subtype in SUBTYPE_MAPPING:
        return SUBTYPE_MAPPING[subtype]
    # Then check main type
    if inst_type in TYPE_MAPPING:
        return TYPE_MAPPING[inst_type]
    # Default to UNKNOWN
    return "UNKNOWN"


def create_hc_id(institution: dict) -> str:
    """Create the NDE Heritage Custodian ID (a URI) from the GHCID.

    Falls back to the institution's original ``id`` (or "unknown") when no
    GHCID is present. Identifiers are lowercased for URL use.
    """
    ghcid = institution.get("ghcid", "")
    if ghcid:
        return f"https://nde.nl/ontology/hc/{ghcid.lower()}"
    # Fallback to original ID
    orig_id = institution.get("id", "unknown")
    return f"https://nde.nl/ontology/hc/{orig_id.lower()}"


def create_custodian_observation(institution: dict) -> dict:
    """Create a CustodianObservation instance from institution data.

    Uses valid AppellationTypeEnum values:
    - OFFICIAL: Official/legal name
    - VERNACULAR: Commonly used informal name
    - HISTORICAL: Historical name no longer in use
    - TRANSLATION: Translated name in another language
    - ABBREVIATION: Abbreviated form or acronym
    - ALTERNATIVE: Alternative name or variant spelling
    """
    observation = {
        "observed_name": {
            "appellation_value": institution["name"],
            "appellation_type": "OFFICIAL"  # Changed from PREFERRED_NAME
        },
        # Observation date is the run date, not a date from the source data
        "observation_date": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
        "observation_source": "Palestinian/Lebanese Heritage Institutions Dataset v2.2.0",
        "source": {
            "source_uri": "https://github.com/glam-datasets/palestinian-heritage",
            "source_date": "2025-12-05",
            "source_creator": "Archives Lab LLM Extraction + Palestinian GLAM Claims"
        },
        "confidence_score": {
            "confidence_value": institution.get("confidence", 0.9),
            "confidence_method": "LLM extraction with Wikidata cross-validation"
        }
    }

    # Arabic name: record it as a TRANSLATION variant and tag the record
    # language accordingly (single check replaces two duplicate lookups).
    if institution.get("name_arabic"):
        observation["alternative_observed_names"] = [{
            "appellation_value": institution["name_arabic"],
            "appellation_language": "ar",
            "appellation_type": "TRANSLATION"  # Changed from TRANSLATED_NAME
        }]
        observation["language"] = {"language_code": "ar"}
    else:
        observation["language"] = {"language_code": "en"}

    # Free-text context assembled from notes / location / founder
    notes_parts = []
    if institution.get("notes"):
        notes_parts.append(institution["notes"])
    if institution.get("location"):
        notes_parts.append(f"Location: {institution['location']}")
    if institution.get("founded_by"):
        notes_parts.append(f"Founded by: {institution['founded_by']}")
    if notes_parts:
        observation["observation_context"] = " | ".join(notes_parts)

    return observation


def create_custodian_name(institution: dict) -> dict:
    """Create a CustodianName instance from institution data."""
    name_entry = {
        "emic_name": institution["name"],
        "standardized_name": institution["name"],
        "name_language": "en",
        "endorsement_source": "https://github.com/glam-datasets/palestinian-heritage",
    }
    # Add Arabic name as alternative.
    # FIX: use "TRANSLATION" -- "TRANSLATED_NAME" is not a valid
    # AppellationTypeEnum value per the enum documented on
    # create_custodian_observation.
    if institution.get("name_arabic"):
        name_entry["alternative_names"] = [{
            "appellation_value": institution["name_arabic"],
            "appellation_language": "ar",
            "appellation_type": "TRANSLATION"
        }]
    return name_entry


def get_country_alpha3(alpha2: str) -> str:
    """Get ISO 3166-1 alpha-3 code from alpha-2 code.

    Only the codes appearing in this dataset are mapped; any other alpha-2
    code falls back to "<alpha2>X" so the value is still three characters.
    """
    mapping = {
        "PS": "PSE",  # Palestine
        "LB": "LBN",  # Lebanon
        "JO": "JOR",  # Jordan
        "US": "USA",  # United States
        "XX": "XXX",  # Unknown
    }
    return mapping.get(alpha2, f"{alpha2}X")  # Fallback: add X


def create_custodian_place(institution: dict, hc_id: str) -> dict | None:
    """Create a CustodianPlace instance from institution data.

    Args:
        institution: Institution data dict
        hc_id: The custodian hub ID this place refers to

    Returns:
        The place dict, or None when neither city nor coordinates are known.
    """
    if not institution.get("city") and not institution.get("coordinates"):
        return None

    place = {
        # Required: nominal place name
        "place_name": institution.get("city", institution.get("location", "Unknown location")),
        # Required: link to custodian hub
        "refers_to_custodian": hc_id,
        # Required: must have at least one observation source
        "was_derived_from": [f"obs-{institution.get('id', 'unknown')}"],
    }

    # Add country with proper structure (both alpha_2 and alpha_3 required)
    country_code = institution.get("country", "XX")
    place["country"] = {
        "alpha_2": country_code,
        "alpha_3": get_country_alpha3(country_code)
    }

    # Add settlement with proper structure (settlement_id and settlement_name required)
    if institution.get("city"):
        city_slug = institution["city"].lower().replace(" ", "-").replace("'", "")
        place["settlement"] = {
            "settlement_id": f"https://nde.nl/ontology/hc/settlement/{country_code.lower()}-{city_slug}",
            "settlement_name": institution["city"],
            "country": {
                "alpha_2": country_code,
                "alpha_3": get_country_alpha3(country_code)
            }
        }

    # Add geospatial location if coordinates available
    if institution.get("coordinates"):
        coords = institution["coordinates"]
        place["has_geospatial_location"] = [{
            "geospatial_id": f"https://nde.nl/ontology/hc/geo/{institution.get('id', 'unknown')}",
            "latitude": coords.get("lat"),
            "longitude": coords.get("lon")
        }]

    # Add place note/description
    if institution.get("location"):
        place["place_note"] = institution["location"]

    return place


def create_identifiers(institution: dict) -> list[dict]:
    """Create CustodianIdentifier instances from institution data.

    Emits, in order of preference: GHCID, GHCID_UUID, Wikidata, VIAF, GND,
    LCNAF -- each only when present on the institution record.
    """
    identifiers = []

    # GHCID as primary identifier
    if institution.get("ghcid"):
        identifiers.append({
            "identifier_scheme": "GHCID",
            "identifier_value": institution["ghcid"],
            "canonical_value": institution["ghcid"].lower()
        })

    # GHCID UUID
    if institution.get("ghcid_uuid"):
        identifiers.append({
            "identifier_scheme": "GHCID_UUID",
            "identifier_value": institution["ghcid_uuid"]
        })

    # Wikidata
    if institution.get("wikidata", {}).get("id"):
        identifiers.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": institution["wikidata"]["id"],
            "canonical_value": institution["wikidata"]["id"]
        })

    # VIAF
    if institution.get("identifiers", {}).get("viaf"):
        identifiers.append({
            "identifier_scheme": "VIAF",
            "identifier_value": institution["identifiers"]["viaf"]
        })

    # GND
    if institution.get("identifiers", {}).get("gnd"):
        identifiers.append({
            "identifier_scheme": "GND",
            "identifier_value": institution["identifiers"]["gnd"]
        })

    # LCNAF
    if institution.get("identifiers", {}).get("lcnaf"):
        identifiers.append({
            "identifier_scheme": "LCNAF",
            "identifier_value": institution["identifiers"]["lcnaf"]
        })

    return identifiers


def create_digital_platform(institution: dict, hc_id: str) -> dict | None:
    """Create a DigitalPlatform instance from institution data.

    Args:
        institution: Institution data dict
        hc_id: The custodian hub ID this platform refers to

    Returns:
        The platform dict, or None when the institution has no website.
    """
    if not institution.get("website"):
        return None

    # Generate platform ID from institution ID
    inst_id = institution.get("id", "unknown")
    platform_id = f"https://nde.nl/ontology/hc/platform/{inst_id}-website"

    return {
        # Required: unique platform identifier
        "platform_id": platform_id,
        # Required: platform name
        "platform_name": f"{institution['name']} Website",
        # Required: homepage URL
        "homepage_web_address": institution["website"],
        # Required: platform type (must be a list per schema)
        "platform_type": ["INSTITUTIONAL_WEBSITE"],
        # Required: link to custodian hub
        "refers_to_custodian": hc_id,
    }


def create_full_custodian_record(institution: dict) -> dict:
    """Create a complete Custodian hub record with all aspects."""
    hc_id = create_hc_id(institution)

    record = {
        "hc_id": hc_id,
        "preferred_label": institution["name"],
        "custodian_type": map_institution_type(
            institution.get("type", "GRP.HER"),
            institution.get("subtype")
        ),
        "identifiers": create_identifiers(institution),
    }

    # Add place if available (re-enabled after schema change to allow uriorcurie)
    place = create_custodian_place(institution, hc_id)
    if place:
        record["place_designation"] = place

    # Add digital platform if available
    platform = create_digital_platform(institution, hc_id)
    if platform:
        record["digital_platform"] = [platform]

    # Add metadata -- take a single timestamp so created == modified on
    # a fresh record (two separate now() calls could differ).
    now_iso = datetime.now(timezone.utc).isoformat()
    record["created"] = now_iso
    record["modified"] = now_iso

    return record


def create_observation_record(institution: dict) -> dict:
    """Create a CustodianObservation record for provenance tracking.

    NOTE: CustodianObservation does NOT have observation_id or derived_custodian
    slots. These are NOT part of the schema - observations link to custodians
    via ReconstructionActivity, not directly.
    """
    # Just return the observation dict without extra fields
    return create_custodian_observation(institution)


def main():
    """Main function to create LinkML instances.

    Reads the consolidated JSON, writes three YAML files under
    data/instances/, and prints summary statistics. Returns 0 on success.
    """
    # Load consolidated data
    input_path = Path("data/extracted/palestinian_heritage_consolidated.json")
    with open(input_path) as f:
        data = json.load(f)

    institutions = data.get("institutions", [])
    print(f"Processing {len(institutions)} institutions...")

    # Create output directory
    output_dir = Path("data/instances")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create Custodian hub records
    custodian_records = []
    observation_records = []

    for inst in institutions:
        # Skip online-only without GHCID
        if not inst.get("ghcid"):
            print(f" Skipping (no GHCID): {inst['name']}")
            continue

        # Create hub record
        custodian_records.append(create_full_custodian_record(inst))
        # Create observation record for provenance
        observation_records.append(create_observation_record(inst))

    print(f"Created {len(custodian_records)} custodian records")
    print(f"Created {len(observation_records)} observation records")

    # Container for validation (matches Container class in schema)
    container = {
        "custodians": custodian_records,
        "custodian_observations": observation_records
    }

    # Write YAML output
    output_path = output_dir / "palestinian_heritage_custodians.yaml"
    with open(output_path, "w") as f:
        yaml.dump(container, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(f"\nWritten to: {output_path}")

    # Also write individual files for easier inspection
    # Custodians only
    custodians_path = output_dir / "palestinian_custodians_only.yaml"
    with open(custodians_path, "w") as f:
        yaml.dump({"custodians": custodian_records}, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(f"Written to: {custodians_path}")

    # Observations only
    observations_path = output_dir / "palestinian_observations_only.yaml"
    with open(observations_path, "w") as f:
        yaml.dump({"custodian_observations": observation_records}, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(f"Written to: {observations_path}")

    # Print summary statistics
    print("\n=== Summary Statistics ===")
    type_counts = {}
    for record in custodian_records:
        ctype = record.get("custodian_type", "UNKNOWN")
        type_counts[ctype] = type_counts.get(ctype, 0) + 1

    print("\nBy Custodian Type:")
    for ctype, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {ctype}: {count}")

    with_wikidata = sum(1 for r in custodian_records
                        if any(i.get("identifier_scheme") == "Wikidata"
                               for i in r.get("identifiers", [])))
    # FIX: create_custodian_place stores coordinates under
    # "has_geospatial_location"; the old key "geospatial_place" never
    # existed, so this count was always 0.
    with_coordinates = sum(1 for r in custodian_records
                           if r.get("place_designation", {}).get("has_geospatial_location"))
    with_website = sum(1 for r in custodian_records if r.get("digital_platform"))

    print("\nEnrichment Coverage:")
    print(f" With Wikidata ID: {with_wikidata}/{len(custodian_records)}")
    print(f" With coordinates: {with_coordinates}/{len(custodian_records)}")
    print(f" With website: {with_website}/{len(custodian_records)}")

    return 0


if __name__ == "__main__":
    sys.exit(main())