#!/usr/bin/env python3
"""
Create LinkML-compliant YAML instance files from Palestinian heritage data.

This script transforms the enriched Palestinian heritage consolidated JSON
into proper LinkML YAML instances following the Heritage Custodian Ontology schema.

Output:
- data/instances/palestinian_heritage_custodians.yaml (CustodianObservation instances)
- data/instances/palestinian_heritage_identifiers.yaml (CustodianIdentifier instances)
"""
|
|
|
|
import json
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# Institution type mapping from GLAM-NER to LinkML CustodianType
|
|
TYPE_MAPPING = {
|
|
"GRP.HER.MUS": "MUSEUM",
|
|
"GRP.HER.LIB": "LIBRARY",
|
|
"GRP.HER": "ARCHIVE", # Default heritage group to archive
|
|
"GRP.HER.ARC": "ARCHIVE",
|
|
"GRP.EDU": "EDUCATION_PROVIDER",
|
|
"GRP.EDU.UNI": "EDUCATION_PROVIDER",
|
|
"GRP.RES": "RESEARCH_CENTER",
|
|
"GRP.NPO": "NGO",
|
|
"LOC.HOLY": "HOLY_SITES",
|
|
"LOC.FEAT": "FEATURES",
|
|
}
|
|
|
|
# Subtype refinements
|
|
SUBTYPE_MAPPING = {
|
|
"oral_history_archive": "ARCHIVE",
|
|
"photographic_archive": "ARCHIVE",
|
|
"research_archive": "ARCHIVE",
|
|
"research_institute": "RESEARCH_CENTER",
|
|
"archival_network": "COLLECTING_SOCIETY",
|
|
"family_archives": "ARCHIVE",
|
|
"heritage_center": "RESEARCH_CENTER",
|
|
"library": "LIBRARY",
|
|
"museum": "MUSEUM",
|
|
"media_center": "ARCHIVE",
|
|
"dance_archive": "ARCHIVE",
|
|
"film_archive": "ARCHIVE",
|
|
"university_library": "LIBRARY",
|
|
"church_archive": "HOLY_SITES",
|
|
"national_archive": "ARCHIVE",
|
|
"online_archive": "DIGITAL_PLATFORM",
|
|
"documentation_center": "ARCHIVE",
|
|
"memorial_site": "FEATURES",
|
|
"memory_project": "ARCHIVE",
|
|
}
|
|
|
|
|
|
def map_institution_type(inst_type: str, subtype: str | None) -> str:
|
|
"""Map institution type to LinkML CustodianType enum value."""
|
|
# Check subtype first for more specific mapping
|
|
if subtype and subtype in SUBTYPE_MAPPING:
|
|
return SUBTYPE_MAPPING[subtype]
|
|
|
|
# Then check main type
|
|
if inst_type in TYPE_MAPPING:
|
|
return TYPE_MAPPING[inst_type]
|
|
|
|
# Default to UNKNOWN
|
|
return "UNKNOWN"
|
|
|
|
|
|
def create_hc_id(institution: dict) -> str:
    """Build the NDE Heritage Custodian URI for an institution.

    Prefers the GHCID when present; otherwise falls back to the
    institution's original ``id`` (or "unknown"). The identifier is
    lowercased for use in the URL either way.
    """
    identifier = institution.get("ghcid") or institution.get("id", "unknown")
    return f"https://nde.nl/ontology/hc/{identifier.lower()}"
|
|
|
|
|
|
def create_custodian_observation(institution: dict) -> dict:
    """Build a CustodianObservation instance from raw institution data.

    Uses valid AppellationTypeEnum values:
    - OFFICIAL: Official/legal name
    - VERNACULAR: Commonly used informal name
    - HISTORICAL: Historical name no longer in use
    - TRANSLATION: Translated name in another language
    - ABBREVIATION: Abbreviated form or acronym
    - ALTERNATIVE: Alternative name or variant spelling
    """
    arabic_name = institution.get("name_arabic")

    observation = {
        "observed_name": {
            "appellation_value": institution["name"],
            # OFFICIAL replaced the former PREFERRED_NAME enum value.
            "appellation_type": "OFFICIAL",
        },
        "observation_date": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
        "observation_source": "Palestinian/Lebanese Heritage Institutions Dataset v2.2.0",
        "source": {
            "source_uri": "https://github.com/glam-datasets/palestinian-heritage",
            "source_date": "2025-12-05",
            "source_creator": "Archives Lab LLM Extraction + Palestinian GLAM Claims",
        },
        "confidence_score": {
            "confidence_value": institution.get("confidence", 0.9),
            "confidence_method": "LLM extraction with Wikidata cross-validation",
        },
    }

    # Record the Arabic form as a TRANSLATION-typed alternative name.
    if arabic_name:
        observation["alternative_observed_names"] = [{
            "appellation_value": arabic_name,
            "appellation_language": "ar",
            "appellation_type": "TRANSLATION",
        }]

    # Primary language: Arabic when an Arabic name exists, English otherwise.
    observation["language"] = {"language_code": "ar" if arabic_name else "en"}

    # Assemble free-text context from notes, location and founder, if any.
    context_parts = []
    if institution.get("notes"):
        context_parts.append(institution["notes"])
    if institution.get("location"):
        context_parts.append(f"Location: {institution['location']}")
    if institution.get("founded_by"):
        context_parts.append(f"Founded by: {institution['founded_by']}")
    if context_parts:
        observation["observation_context"] = " | ".join(context_parts)

    return observation
|
|
|
|
|
|
def create_custodian_name(institution: dict) -> dict:
    """Create a CustodianName instance from institution data.

    Args:
        institution: Institution data dict (must contain "name"; may
            contain "name_arabic").

    Returns:
        A CustodianName dict with the English name as both emic and
        standardized form, plus the Arabic form as an alternative when
        available.
    """
    name_entry = {
        "emic_name": institution["name"],
        "standardized_name": institution["name"],
        "name_language": "en",
        "endorsement_source": "https://github.com/glam-datasets/palestinian-heritage",
    }

    # Add Arabic name as alternative
    if institution.get("name_arabic"):
        name_entry["alternative_names"] = [{
            "appellation_value": institution["name_arabic"],
            "appellation_language": "ar",
            # Fixed: "TRANSLATED_NAME" is not a valid AppellationTypeEnum
            # value — the schema's enum uses "TRANSLATION" (the same fix
            # was already applied in create_custodian_observation).
            "appellation_type": "TRANSLATION",
        }]

    return name_entry
|
|
|
|
|
|
def get_country_alpha3(alpha2: str) -> str:
    """Translate an ISO 3166-1 alpha-2 code into its alpha-3 equivalent.

    Only the countries occurring in this dataset are covered; any other
    code falls back to the alpha-2 value with an "X" appended so the
    result is still three characters long.
    """
    alpha3_codes = {
        "PS": "PSE",  # Palestine
        "LB": "LBN",  # Lebanon
        "JO": "JOR",  # Jordan
        "US": "USA",  # United States
        "XX": "XXX",  # Unknown
    }
    try:
        return alpha3_codes[alpha2]
    except KeyError:
        return f"{alpha2}X"  # Fallback: add X
|
|
|
|
|
|
def create_custodian_place(institution: dict, hc_id: str) -> dict | None:
    """Create a CustodianPlace instance from institution data.

    Args:
        institution: Institution data dict
        hc_id: The custodian hub ID this place refers to

    Returns:
        The place dict, or None when neither a city nor coordinates
        are available.
    """
    if not institution.get("city") and not institution.get("coordinates"):
        return None

    inst_id = institution.get("id", "unknown")
    country_code = institution.get("country", "XX")

    def _country() -> dict:
        # Schema requires alpha_2 and alpha_3 together; built fresh per
        # use so the place and settlement carry independent dicts.
        return {
            "alpha_2": country_code,
            "alpha_3": get_country_alpha3(country_code),
        }

    place = {
        # Required: nominal place name
        "place_name": institution.get("city", institution.get("location", "Unknown location")),
        # Required: link to custodian hub
        "refers_to_custodian": hc_id,
        # Required: must have at least one observation source
        "was_derived_from": [f"obs-{inst_id}"],
        "country": _country(),
    }

    # Settlement needs settlement_id + settlement_name; the ID is a slug
    # of the city (lowercase, hyphens for spaces, apostrophes dropped).
    if institution.get("city"):
        city = institution["city"]
        slug = city.lower().replace(" ", "-").replace("'", "")
        place["settlement"] = {
            "settlement_id": f"https://nde.nl/ontology/hc/settlement/{country_code.lower()}-{slug}",
            "settlement_name": city,
            "country": _country(),
        }

    # Geospatial location, when lat/lon coordinates are present.
    coords = institution.get("coordinates")
    if coords:
        place["has_geospatial_location"] = [{
            "geospatial_id": f"https://nde.nl/ontology/hc/geo/{inst_id}",
            "latitude": coords.get("lat"),
            "longitude": coords.get("lon"),
        }]

    # Free-text location string doubles as the place note.
    if institution.get("location"):
        place["place_note"] = institution["location"]

    return place
|
|
|
|
|
|
def create_identifiers(institution: dict) -> list[dict]:
    """Create CustodianIdentifier instances from institution data."""
    identifiers: list[dict] = []

    # GHCID is the primary identifier; its canonical form is lowercased.
    ghcid = institution.get("ghcid")
    if ghcid:
        identifiers.append({
            "identifier_scheme": "GHCID",
            "identifier_value": ghcid,
            "canonical_value": ghcid.lower(),
        })

    # GHCID UUID
    ghcid_uuid = institution.get("ghcid_uuid")
    if ghcid_uuid:
        identifiers.append({
            "identifier_scheme": "GHCID_UUID",
            "identifier_value": ghcid_uuid,
        })

    # Wikidata QIDs are already canonical.
    wikidata_id = institution.get("wikidata", {}).get("id")
    if wikidata_id:
        identifiers.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wikidata_id,
            "canonical_value": wikidata_id,
        })

    # Authority-file identifiers share one shape; emit in fixed order
    # (VIAF, GND, LCNAF) to match prior output ordering.
    external = institution.get("identifiers", {})
    for key, scheme in (("viaf", "VIAF"), ("gnd", "GND"), ("lcnaf", "LCNAF")):
        value = external.get(key)
        if value:
            identifiers.append({
                "identifier_scheme": scheme,
                "identifier_value": value,
            })

    return identifiers
|
|
|
|
|
|
def create_digital_platform(institution: dict, hc_id: str) -> dict | None:
|
|
"""Create a DigitalPlatform instance from institution data.
|
|
|
|
Args:
|
|
institution: Institution data dict
|
|
hc_id: The custodian hub ID this platform refers to
|
|
"""
|
|
if not institution.get("website"):
|
|
return None
|
|
|
|
# Generate platform ID from institution ID
|
|
inst_id = institution.get("id", "unknown")
|
|
platform_id = f"https://nde.nl/ontology/hc/platform/{inst_id}-website"
|
|
|
|
return {
|
|
# Required: unique platform identifier
|
|
"platform_id": platform_id,
|
|
# Required: platform name
|
|
"platform_name": f"{institution['name']} Website",
|
|
# Required: homepage URL
|
|
"homepage_web_address": institution["website"],
|
|
# Required: platform type (must be a list per schema)
|
|
"platform_type": ["INSTITUTIONAL_WEBSITE"],
|
|
# Required: link to custodian hub
|
|
"refers_to_custodian": hc_id,
|
|
}
|
|
|
|
|
|
def create_full_custodian_record(institution: dict) -> dict:
    """Create a complete Custodian hub record with all aspects.

    Assembles the hub ID, preferred label, mapped custodian type and
    identifiers, then attaches the optional place and digital-platform
    aspects plus creation metadata.

    Args:
        institution: Institution data dict (must contain "name").

    Returns:
        The Custodian hub record dict.
    """
    hc_id = create_hc_id(institution)

    record = {
        "hc_id": hc_id,
        "preferred_label": institution["name"],
        "custodian_type": map_institution_type(
            institution.get("type", "GRP.HER"),
            institution.get("subtype")
        ),
        "identifiers": create_identifiers(institution),
    }

    # Add place if available (re-enabled after schema change to allow uriorcurie)
    place = create_custodian_place(institution, hc_id)
    if place:
        record["place_designation"] = place

    # Add digital platform if available
    platform = create_digital_platform(institution, hc_id)
    if platform:
        record["digital_platform"] = [platform]

    # Use a single timestamp so created == modified on a freshly built
    # record (two separate now() calls could differ by microseconds).
    now = datetime.now(timezone.utc).isoformat()
    record["created"] = now
    record["modified"] = now

    return record
|
|
|
|
|
|
def create_observation_record(institution: dict) -> dict:
    """Create a CustodianObservation record for provenance tracking.

    Thin wrapper around create_custodian_observation; no extra fields
    are added.

    Args:
        institution: Raw institution data dict (must contain "name").

    Returns:
        The CustodianObservation dict.

    NOTE: CustodianObservation does NOT have observation_id or derived_custodian slots.
    These are NOT part of the schema - observations link to custodians via
    ReconstructionActivity, not directly.
    """
    # Just return the observation dict without extra fields
    return create_custodian_observation(institution)
|
|
|
|
|
|
def main():
    """Main function to create LinkML instances.

    Reads the consolidated JSON dataset, builds Custodian hub records and
    CustodianObservation records for every institution with a GHCID,
    writes three YAML files under data/instances/, and prints summary
    statistics. Returns 0 on success (used as the process exit code).
    """
    # Load consolidated data
    input_path = Path("data/extracted/palestinian_heritage_consolidated.json")

    # Explicit UTF-8: the dataset contains Arabic names, so we must not
    # depend on the platform's default locale encoding.
    with open(input_path, encoding="utf-8") as f:
        data = json.load(f)

    institutions = data.get("institutions", [])

    print(f"Processing {len(institutions)} institutions...")

    # Create output directory
    output_dir = Path("data/instances")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create Custodian hub records
    custodian_records = []
    observation_records = []

    for inst in institutions:
        # Skip online-only without GHCID
        if not inst.get("ghcid"):
            print(f" Skipping (no GHCID): {inst['name']}")
            continue

        # Create hub record
        custodian_records.append(create_full_custodian_record(inst))

        # Create observation record for provenance
        observation_records.append(create_observation_record(inst))

    print(f"Created {len(custodian_records)} custodian records")
    print(f"Created {len(observation_records)} observation records")

    # Container for validation (matches Container class in schema)
    container = {
        "custodians": custodian_records,
        "custodian_observations": observation_records
    }

    # Shared yaml.dump options for all three output files.
    dump_opts = {"default_flow_style": False, "allow_unicode": True, "sort_keys": False}

    # Write YAML output
    output_path = output_dir / "palestinian_heritage_custodians.yaml"
    with open(output_path, "w", encoding="utf-8") as f:
        yaml.dump(container, f, **dump_opts)

    print(f"\nWritten to: {output_path}")

    # Also write individual files for easier inspection

    # Custodians only
    custodians_path = output_dir / "palestinian_custodians_only.yaml"
    with open(custodians_path, "w", encoding="utf-8") as f:
        yaml.dump({"custodians": custodian_records}, f, **dump_opts)
    print(f"Written to: {custodians_path}")

    # Observations only
    observations_path = output_dir / "palestinian_observations_only.yaml"
    with open(observations_path, "w", encoding="utf-8") as f:
        yaml.dump({"custodian_observations": observation_records}, f, **dump_opts)
    print(f"Written to: {observations_path}")

    # Print summary statistics
    print("\n=== Summary Statistics ===")

    type_counts = {}
    for record in custodian_records:
        ctype = record.get("custodian_type", "UNKNOWN")
        type_counts[ctype] = type_counts.get(ctype, 0) + 1

    print("\nBy Custodian Type:")
    for ctype, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {ctype}: {count}")

    with_wikidata = sum(1 for r in custodian_records
                        if any(i.get("identifier_scheme") == "Wikidata"
                               for i in r.get("identifiers", [])))
    # BUG FIX: the place dict stores coordinates under the key
    # "has_geospatial_location" (see create_custodian_place), not
    # "geospatial_place" — the old key made this count always 0.
    with_coordinates = sum(1 for r in custodian_records
                           if r.get("place_designation", {}).get("has_geospatial_location"))
    with_website = sum(1 for r in custodian_records
                       if r.get("digital_platform"))

    print("\nEnrichment Coverage:")
    print(f" With Wikidata ID: {with_wikidata}/{len(custodian_records)}")
    print(f" With coordinates: {with_coordinates}/{len(custodian_records)}")
    print(f" With website: {with_website}/{len(custodian_records)}")

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Raise SystemExit directly: the exit() builtin is injected by the
    # `site` module and is not guaranteed to exist (e.g. `python -S`).
    raise SystemExit(main())
|