glam/scripts/create_palestinian_linkml_instances.py
2025-12-06 19:50:04 +01:00

444 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Create LinkML-compliant YAML instance files from Palestinian heritage data.
This script transforms the enriched Palestinian heritage consolidated JSON
into proper LinkML YAML instances following the Heritage Custodian Ontology schema.
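Output:
- data/instances/palestinian_heritage_custodians.yaml (Custodian hub records plus CustodianObservation instances)
- data/instances/palestinian_custodians_only.yaml (Custodian hub records only)
- data/instances/palestinian_observations_only.yaml (CustodianObservation instances only)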
"""
import json
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# Institution type mapping from GLAM-NER to LinkML CustodianType.
# Keys are the GLAM-NER type codes passed in as an institution's main type;
# values are the CustodianType strings written into the output records.
# map_institution_type() consults this table only when SUBTYPE_MAPPING has no
# entry for the institution's subtype.
TYPE_MAPPING = {
    "GRP.HER.MUS": "MUSEUM",
    "GRP.HER.LIB": "LIBRARY",
    "GRP.HER": "ARCHIVE",  # Default heritage group to archive
    "GRP.HER.ARC": "ARCHIVE",
    "GRP.EDU": "EDUCATION_PROVIDER",
    "GRP.EDU.UNI": "EDUCATION_PROVIDER",
    "GRP.RES": "RESEARCH_CENTER",
    "GRP.NPO": "NGO",
    "LOC.HOLY": "HOLY_SITES",
    "LOC.FEAT": "FEATURES",
}
# Subtype refinements.
# Keys are institution subtype labels; values are CustodianType strings.
# map_institution_type() checks this table first, so a matching subtype
# overrides whatever TYPE_MAPPING would give for the main type.
SUBTYPE_MAPPING = {
    "oral_history_archive": "ARCHIVE",
    "photographic_archive": "ARCHIVE",
    "research_archive": "ARCHIVE",
    "research_institute": "RESEARCH_CENTER",
    "archival_network": "COLLECTING_SOCIETY",
    "family_archives": "ARCHIVE",
    "heritage_center": "RESEARCH_CENTER",
    "library": "LIBRARY",
    "museum": "MUSEUM",
    "media_center": "ARCHIVE",
    "dance_archive": "ARCHIVE",
    "film_archive": "ARCHIVE",
    "university_library": "LIBRARY",
    "church_archive": "HOLY_SITES",
    "national_archive": "ARCHIVE",
    "online_archive": "DIGITAL_PLATFORM",
    "documentation_center": "ARCHIVE",
    "memorial_site": "FEATURES",
    "memory_project": "ARCHIVE",
}
def map_institution_type(inst_type: str, subtype: str | None) -> str:
    """Resolve the LinkML CustodianType enum value for an institution.

    A recognised subtype takes precedence over the coarser main type; when
    neither is recognised the result is "UNKNOWN".

    Args:
        inst_type: Main institution type code, looked up in TYPE_MAPPING.
        subtype: Optional subtype label, looked up in SUBTYPE_MAPPING.

    Returns:
        The mapped CustodianType string, or "UNKNOWN".
    """
    # The subtype is the most specific signal, so it is tried first.
    refined = SUBTYPE_MAPPING.get(subtype) if subtype else None
    if refined is not None:
        return refined
    # Fall back to the main type, then to the catch-all value.
    return TYPE_MAPPING.get(inst_type, "UNKNOWN")
def create_hc_id(institution: dict) -> str:
    """Create the NDE Heritage Custodian ID (a URL) for an institution.

    The lower-cased GHCID is used when present; otherwise the record's
    original ``id`` is used, and ``"unknown"`` when that is missing as well.

    Args:
        institution: Institution data dict; the "ghcid" and "id" keys are read.

    Returns:
        A URL of the form ``https://nde.nl/ontology/hc/<identifier>``.
    """
    identifier = institution.get("ghcid")
    if not identifier:
        identifier = institution.get("id")
    # A key that is present but null/empty must still end up as "unknown";
    # a ``.get`` default alone only covers the missing-key case.
    if identifier is None or identifier == "":
        identifier = "unknown"
    # ``str()`` keeps numeric ids from failing on ``.lower()``.
    return f"https://nde.nl/ontology/hc/{str(identifier).lower()}"
def create_custodian_observation(institution: dict) -> dict:
    """Build a CustodianObservation instance from one institution record.

    Appellation types used here come from AppellationTypeEnum:
    - OFFICIAL: Official/legal name
    - VERNACULAR: Commonly used informal name
    - HISTORICAL: Historical name no longer in use
    - TRANSLATION: Translated name in another language
    - ABBREVIATION: Abbreviated form or acronym
    - ALTERNATIVE: Alternative name or variant spelling

    Args:
        institution: Institution data dict. "name" is required; "name_arabic",
            "confidence", "notes", "location" and "founded_by" are optional.

    Returns:
        The observation dict. Keys are inserted in a fixed order so the YAML
        dump (which does not sort keys) stays stable.
    """
    arabic_name = institution.get("name_arabic")

    observation = {
        "observed_name": {
            "appellation_value": institution["name"],
            "appellation_type": "OFFICIAL",
        },
        # Date of this run, in UTC.
        "observation_date": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
        "observation_source": "Palestinian/Lebanese Heritage Institutions Dataset v2.2.0",
        "source": {
            "source_uri": "https://github.com/glam-datasets/palestinian-heritage",
            "source_date": "2025-12-05",
            "source_creator": "Archives Lab LLM Extraction + Palestinian GLAM Claims",
        },
        "confidence_score": {
            "confidence_value": institution.get("confidence", 0.9),
            "confidence_method": "LLM extraction with Wikidata cross-validation",
        },
    }

    # The Arabic name, when present, is recorded as a TRANSLATION appellation.
    if arabic_name:
        observation["alternative_observed_names"] = [
            {
                "appellation_value": arabic_name,
                "appellation_language": "ar",
                "appellation_type": "TRANSLATION",
            }
        ]

    # Language is "ar" whenever an Arabic name exists, otherwise "en".
    observation["language"] = {"language_code": "ar" if arabic_name else "en"}

    # Free-text context: notes, location and founder, in that order.
    context_parts = []
    notes = institution.get("notes")
    if notes:
        context_parts.append(notes)
    location = institution.get("location")
    if location:
        context_parts.append(f"Location: {location}")
    founder = institution.get("founded_by")
    if founder:
        context_parts.append(f"Founded by: {founder}")
    if context_parts:
        observation["observation_context"] = " | ".join(context_parts)

    return observation
def create_custodian_name(institution: dict) -> dict:
    """Create a CustodianName instance from institution data.

    Args:
        institution: Institution data dict. "name" is required; "name_arabic"
            is optional.

    Returns:
        A dict with the emic and standardized name (both set to the
        institution's "name"), the name language "en", the endorsement source
        and, when an Arabic name exists, an "alternative_names" list.
    """
    name_entry = {
        "emic_name": institution["name"],
        "standardized_name": institution["name"],
        "name_language": "en",
        "endorsement_source": "https://github.com/glam-datasets/palestinian-heritage",
    }
    # Add Arabic name as alternative
    arabic_name = institution.get("name_arabic")
    if arabic_name:
        name_entry["alternative_names"] = [{
            "appellation_value": arabic_name,
            "appellation_language": "ar",
            # "TRANSLATION" is the AppellationTypeEnum value for a translated
            # name (see the enum list in create_custodian_observation); the
            # former "TRANSLATED_NAME" is not part of that enum.
            "appellation_type": "TRANSLATION",
        }]
    return name_entry
# ISO 3166-1 alpha-2 -> alpha-3 lookup for the countries this dataset is likely
# to contain. "XX"/"XXX" is the script's own placeholder for "unknown".
_ALPHA2_TO_ALPHA3 = {
    "PS": "PSE",  # Palestine
    "LB": "LBN",  # Lebanon
    "JO": "JOR",  # Jordan
    "US": "USA",  # United States
    "XX": "XXX",  # Unknown
    "IL": "ISR",  # Israel
    "SY": "SYR",  # Syria
    "EG": "EGY",  # Egypt
    "IQ": "IRQ",  # Iraq
    "SA": "SAU",  # Saudi Arabia
    "AE": "ARE",  # United Arab Emirates
    "QA": "QAT",  # Qatar
    "KW": "KWT",  # Kuwait
    "BH": "BHR",  # Bahrain
    "OM": "OMN",  # Oman
    "YE": "YEM",  # Yemen
    "TR": "TUR",  # Turkey
    "CY": "CYP",  # Cyprus
    "TN": "TUN",  # Tunisia
    "MA": "MAR",  # Morocco
    "DZ": "DZA",  # Algeria
    "LY": "LBY",  # Libya
    "GB": "GBR",  # United Kingdom
    "FR": "FRA",  # France
    "DE": "DEU",  # Germany
    "NL": "NLD",  # Netherlands
    "BE": "BEL",  # Belgium
    "CH": "CHE",  # Switzerland
    "AT": "AUT",  # Austria
    "IT": "ITA",  # Italy
    "ES": "ESP",  # Spain
    "SE": "SWE",  # Sweden
    "NO": "NOR",  # Norway
    "DK": "DNK",  # Denmark
    "CA": "CAN",  # Canada
    "AU": "AUS",  # Australia
    "ME": "MNE",  # Montenegro
}


def get_country_alpha3(alpha2: str) -> str:
    """Get ISO 3166-1 alpha-3 code from alpha-2 code.

    Args:
        alpha2: Two-letter country code. Case and surrounding whitespace are
            ignored; a null/empty value is treated as unknown.

    Returns:
        The matching alpha-3 code, or the placeholder "XXX" when the code is
        not in the lookup table.
    """
    normalized = (alpha2 or "").strip().upper()
    # Unmapped codes resolve to the "unknown" placeholder. Appending "X" to the
    # alpha-2 code (the previous fallback) fabricates codes and can even name a
    # different country, e.g. "ME" (Montenegro) -> "MEX" (Mexico).
    return _ALPHA2_TO_ALPHA3.get(normalized, "XXX")
def create_custodian_place(institution: dict, hc_id: str) -> dict | None:
    """Create a CustodianPlace instance from institution data.

    Args:
        institution: Institution data dict. "city", "coordinates" (a mapping
            with "lat"/"lon"), "location", "country" and "id" are read.
        hc_id: The custodian hub ID this place refers to.

    Returns:
        The place dict, or None when the institution has neither a city nor
        coordinates.
    """
    city = institution.get("city")
    coordinates = institution.get("coordinates")
    if not city and not coordinates:
        return None

    inst_id = institution.get("id", "unknown")
    # ``or`` instead of a ``.get`` default: a key that is present but
    # null/empty must still fall back, otherwise ``.lower()`` below fails.
    country_code = institution.get("country") or "XX"

    def country_block() -> dict:
        # A fresh dict per use: sharing one object between two slots would make
        # the YAML dumper emit an anchor/alias pair instead of two plain maps.
        return {
            "alpha_2": country_code,
            "alpha_3": get_country_alpha3(country_code),
        }

    place = {
        # Required: nominal place name. A record can get here with only
        # coordinates, so an empty city falls through to the location text
        # and finally to a fixed placeholder.
        "place_name": city or institution.get("location") or "Unknown location",
        # Required: link to custodian hub
        "refers_to_custodian": hc_id,
        # Required: must have at least one observation source
        "was_derived_from": [f"obs-{inst_id}"],
        # Country with both alpha_2 and alpha_3, as the schema requires
        "country": country_block(),
    }

    # Add settlement with proper structure (settlement_id and settlement_name required)
    if city:
        city_slug = city.lower().replace(" ", "-").replace("'", "")
        place["settlement"] = {
            "settlement_id": f"https://nde.nl/ontology/hc/settlement/{country_code.lower()}-{city_slug}",
            "settlement_name": city,
            "country": country_block(),
        }

    # Add geospatial location if coordinates available
    if coordinates:
        place["has_geospatial_location"] = [{
            "geospatial_id": f"https://nde.nl/ontology/hc/geo/{inst_id}",
            "latitude": coordinates.get("lat"),
            "longitude": coordinates.get("lon"),
        }]

    # Add place note/description
    if institution.get("location"):
        place["place_note"] = institution["location"]

    return place
def create_identifiers(institution: dict) -> list[dict]:
    """Collect CustodianIdentifier instances for one institution.

    Identifiers are emitted in a fixed order: GHCID, GHCID_UUID, Wikidata,
    then the authority-file IDs VIAF, GND and LCNAF. Only values that are
    present and truthy produce an entry.

    Args:
        institution: Institution data dict; reads "ghcid", "ghcid_uuid",
            "wikidata" (mapping with "id") and "identifiers" (mapping with
            "viaf", "gnd", "lcnaf").

    Returns:
        A list of identifier dicts, possibly empty.
    """
    found = []

    # GHCID is the primary identifier; its canonical form is lower-cased.
    ghcid = institution.get("ghcid")
    if ghcid:
        found.append({
            "identifier_scheme": "GHCID",
            "identifier_value": ghcid,
            "canonical_value": ghcid.lower(),
        })

    ghcid_uuid = institution.get("ghcid_uuid")
    if ghcid_uuid:
        found.append({
            "identifier_scheme": "GHCID_UUID",
            "identifier_value": ghcid_uuid,
        })

    # Wikidata IDs are stored unchanged as their own canonical value.
    wikidata_id = institution.get("wikidata", {}).get("id")
    if wikidata_id:
        found.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wikidata_id,
            "canonical_value": wikidata_id,
        })

    # Authority files all share the same two-field shape.
    authority_ids = institution.get("identifiers", {})
    for scheme, key in (("VIAF", "viaf"), ("GND", "gnd"), ("LCNAF", "lcnaf")):
        value = authority_ids.get(key)
        if value:
            found.append({
                "identifier_scheme": scheme,
                "identifier_value": value,
            })

    return found
def create_digital_platform(institution: dict, hc_id: str) -> dict | None:
"""Create a DigitalPlatform instance from institution data.
Args:
institution: Institution data dict
hc_id: The custodian hub ID this platform refers to
"""
if not institution.get("website"):
return None
# Generate platform ID from institution ID
inst_id = institution.get("id", "unknown")
platform_id = f"https://nde.nl/ontology/hc/platform/{inst_id}-website"
return {
# Required: unique platform identifier
"platform_id": platform_id,
# Required: platform name
"platform_name": f"{institution['name']} Website",
# Required: homepage URL
"homepage_web_address": institution["website"],
# Required: platform type (must be a list per schema)
"platform_type": ["INSTITUTIONAL_WEBSITE"],
# Required: link to custodian hub
"refers_to_custodian": hc_id,
}
def create_full_custodian_record(institution: dict) -> dict:
    """Assemble the Custodian hub record for one institution.

    The record holds the hub ID, label, custodian type and identifiers, plus
    a place designation and a digital platform when those can be built, and
    finally creation/modification timestamps in UTC.

    Args:
        institution: Institution data dict; "name" is required.

    Returns:
        The hub record dict.
    """
    hub_id = create_hc_id(institution)

    record = {
        "hc_id": hub_id,
        "preferred_label": institution["name"],
        # A missing main type is treated as the generic heritage group.
        "custodian_type": map_institution_type(
            institution.get("type", "GRP.HER"),
            institution.get("subtype"),
        ),
        "identifiers": create_identifiers(institution),
    }

    # Optional aspects are attached only when the builders return something.
    if place := create_custodian_place(institution, hub_id):
        record["place_designation"] = place
    if platform := create_digital_platform(institution, hub_id):
        # The platform is stored as a one-element list.
        record["digital_platform"] = [platform]

    # Metadata timestamps
    record["created"] = datetime.now(timezone.utc).isoformat()
    record["modified"] = datetime.now(timezone.utc).isoformat()
    return record
def create_observation_record(institution: dict) -> dict:
    """Produce the CustodianObservation record used for provenance tracking.

    NOTE: CustodianObservation has no observation_id or derived_custodian
    slots in the schema; observations are linked to custodians through
    ReconstructionActivity rather than directly. The observation is therefore
    returned as built, with no extra fields added.

    Args:
        institution: Institution data dict.

    Returns:
        The observation dict from create_custodian_observation().
    """
    observation = create_custodian_observation(institution)
    return observation
def _write_yaml(path: Path, payload: dict) -> None:
    """Write *payload* to *path* as block-style YAML, keeping key order."""
    # Explicit UTF-8: allow_unicode=True emits Arabic names verbatim, which a
    # non-UTF-8 platform default encoding may be unable to encode.
    with open(path, "w", encoding="utf-8") as f:
        yaml.dump(payload, f, default_flow_style=False, allow_unicode=True, sort_keys=False)


def main() -> int:
    """Create the LinkML instance files from the consolidated JSON.

    Reads data/extracted/palestinian_heritage_consolidated.json, builds one
    Custodian hub record and one CustodianObservation per institution that
    has a GHCID, writes three YAML files under data/instances/ and prints
    summary statistics.

    Returns:
        0, which the entry point uses as the process exit status.
    """
    # Load consolidated data (explicit UTF-8: the data contains Arabic names)
    input_path = Path("data/extracted/palestinian_heritage_consolidated.json")
    with open(input_path, encoding="utf-8") as f:
        data = json.load(f)
    institutions = data.get("institutions", [])
    print(f"Processing {len(institutions)} institutions...")

    # Create output directory
    output_dir = Path("data/instances")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create Custodian hub records and their provenance observations
    custodian_records = []
    observation_records = []
    for inst in institutions:
        # Skip online-only without GHCID
        if not inst.get("ghcid"):
            # ``.get`` so that a nameless record cannot crash the skip message.
            print(f"  Skipping (no GHCID): {inst.get('name', '<unnamed>')}")
            continue
        custodian_records.append(create_full_custodian_record(inst))
        observation_records.append(create_observation_record(inst))
    print(f"Created {len(custodian_records)} custodian records")
    print(f"Created {len(observation_records)} observation records")

    # Container for validation (matches Container class in schema)
    container = {
        "custodians": custodian_records,
        "custodian_observations": observation_records,
    }
    output_path = output_dir / "palestinian_heritage_custodians.yaml"
    _write_yaml(output_path, container)
    print(f"\nWritten to: {output_path}")

    # Also write individual files for easier inspection
    custodians_path = output_dir / "palestinian_custodians_only.yaml"
    _write_yaml(custodians_path, {"custodians": custodian_records})
    print(f"Written to: {custodians_path}")

    observations_path = output_dir / "palestinian_observations_only.yaml"
    _write_yaml(observations_path, {"custodian_observations": observation_records})
    print(f"Written to: {observations_path}")

    # Print summary statistics
    print("\n=== Summary Statistics ===")
    type_counts = {}
    for record in custodian_records:
        ctype = record.get("custodian_type", "UNKNOWN")
        type_counts[ctype] = type_counts.get(ctype, 0) + 1
    print("\nBy Custodian Type:")
    # Most frequent type first.
    for ctype, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"  {ctype}: {count}")

    total = len(custodian_records)
    with_wikidata = sum(
        1 for r in custodian_records
        if any(i.get("identifier_scheme") == "Wikidata" for i in r.get("identifiers", []))
    )
    # The place builder stores coordinates under "has_geospatial_location";
    # checking any other key reports zero coverage regardless of the data.
    with_coordinates = sum(
        1 for r in custodian_records
        if r.get("place_designation", {}).get("has_geospatial_location")
    )
    with_website = sum(1 for r in custodian_records if r.get("digital_platform"))
    print("\nEnrichment Coverage:")
    print(f"  With Wikidata ID: {with_wikidata}/{total}")
    print(f"  With coordinates: {with_coordinates}/{total}")
    print(f"  With website: {with_website}/{total}")
    return 0
if __name__ == "__main__":
    # Raise SystemExit directly so main()'s return value becomes the process
    # exit status. The bare ``exit()`` helper is installed by the ``site``
    # module for interactive use and is missing when Python runs with ``-S``.
    raise SystemExit(main())