From 18874e6070a714d18c48616fb1e5484833697d44 Mon Sep 17 00:00:00 2001
From: kempersc <sckemper@mailfence.com>
Date: Sun, 7 Dec 2025 19:21:14 +0100
Subject: [PATCH] fix(scripts): normalize org_type codes in DuckLake loader

- Handle single-letter GLAM type codes (G, L, A, M, O, R, C, etc.)
- Handle legacy GRP.HER.* format
- Support compound types like 'M,F' -> 'MUSEUM,FEATURES'
- Make the optional `limit` type hint explicit (`int | None`, requires Python 3.10+)
---
 scripts/load_custodians_to_ducklake.py | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/scripts/load_custodians_to_ducklake.py b/scripts/load_custodians_to_ducklake.py
index bde5663e25..8a6bf48c0d 100644
--- a/scripts/load_custodians_to_ducklake.py
+++ b/scripts/load_custodians_to_ducklake.py
@@ -117,14 +117,35 @@ def extract_top_level_fields(data: dict) -> dict:
         
         # Normalize institution type names to standard codes
         type_normalize = {
+            # Full names to shorter standard codes
             "OFFICIAL_INSTITUTION": "OFFICIAL",
             "RESEARCH_CENTER": "RESEARCH",
             "BOTANICAL_ZOO": "BOTANICAL",
             "EDUCATION_PROVIDER": "EDUCATION",
             "COLLECTING_SOCIETY": "SOCIETY",
             "INTANGIBLE_HERITAGE_GROUP": "INTANGIBLE",
+            # Single-letter codes to full names
+            "G": "GALLERY", "L": "LIBRARY", "A": "ARCHIVE", "M": "MUSEUM",
+            "O": "OFFICIAL", "R": "RESEARCH", "C": "CORPORATION", "U": "UNKNOWN",
+            "B": "BOTANICAL", "E": "EDUCATION", "S": "SOCIETY", "F": "FEATURES",
+            "I": "INTANGIBLE", "X": "MIXED", "P": "PERSONAL", "H": "HOLY_SITES",
+            "D": "DIGITAL", "N": "NGO", "T": "TASTE_SMELL",
+            # Legacy GRP.HER format
+            "GRP.HER.GAL": "GALLERY", "GRP.HER.LIB": "LIBRARY", "GRP.HER.ARC": "ARCHIVE",
+            "GRP.HER.MUS": "MUSEUM", "GRP.HER.DIG": "DIGITAL", "GRP.HER.MIX": "MIXED",
+            "GRP.HER": "UNKNOWN",
         }
-        record["org_type"] = type_normalize.get(inst_type.upper(), inst_type.upper()) if inst_type else ""
+        
+        # Handle compound types (e.g., "M,F" -> "MUSEUM,FEATURES")
+        def normalize_type(t: str) -> str:
+            """Map each comma-separated type code in t via type_normalize; unknown codes pass through upper-cased."""
+            if not t:
+                return ""
+            # Drop empty segments so "M," or "M,,F" cannot leave stray commas in org_type
+            parts = [p.strip() for p in t.upper().split(",") if p.strip()]
+            return ",".join(type_normalize.get(p, p) for p in parts)
+        
+        record["org_type"] = normalize_type(inst_type)
         
         record["wikidata_id"] = original.get("wikidata_id", "")
         record["original_entry_json"] = json.dumps(original, ensure_ascii=False, default=str)
@@ -240,7 +261,7 @@ def extract_top_level_fields(data: dict) -> dict:
     return record
 
 
-def load_yaml_files(directory: Path, limit: int = None) -> list[dict]:
+def load_yaml_files(directory: Path, limit: int | None = None) -> list[dict]:
     """Load all YAML files from directory and convert to records."""
     records = []
     yaml_files = sorted(directory.glob("*.yaml"))