From 18874e6070a714d18c48616fb1e5484833697d44 Mon Sep 17 00:00:00 2001 From: kempersc Date: Sun, 7 Dec 2025 19:21:14 +0100 Subject: [PATCH] fix(scripts): normalize org_type codes in DuckLake loader - Handle single-letter GLAM type codes (G, L, A, M, O, R, C, etc.) - Handle legacy GRP.HER.* format - Support compound types like 'M,F' -> 'MUSEUM,FEATURES' - Fix type hint syntax for Python 3.10+ --- scripts/load_custodians_to_ducklake.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/scripts/load_custodians_to_ducklake.py b/scripts/load_custodians_to_ducklake.py index bde5663e25..8a6bf48c0d 100644 --- a/scripts/load_custodians_to_ducklake.py +++ b/scripts/load_custodians_to_ducklake.py @@ -117,14 +117,35 @@ def extract_top_level_fields(data: dict) -> dict: # Normalize institution type names to standard codes type_normalize = { + # Full names to shorter standard codes "OFFICIAL_INSTITUTION": "OFFICIAL", "RESEARCH_CENTER": "RESEARCH", "BOTANICAL_ZOO": "BOTANICAL", "EDUCATION_PROVIDER": "EDUCATION", "COLLECTING_SOCIETY": "SOCIETY", "INTANGIBLE_HERITAGE_GROUP": "INTANGIBLE", + # Single-letter codes to full names + "G": "GALLERY", "L": "LIBRARY", "A": "ARCHIVE", "M": "MUSEUM", + "O": "OFFICIAL", "R": "RESEARCH", "C": "CORPORATION", "U": "UNKNOWN", + "B": "BOTANICAL", "E": "EDUCATION", "S": "SOCIETY", "F": "FEATURES", + "I": "INTANGIBLE", "X": "MIXED", "P": "PERSONAL", "H": "HOLY_SITES", + "D": "DIGITAL", "N": "NGO", "T": "TASTE_SMELL", + # Legacy GRP.HER format + "GRP.HER.GAL": "GALLERY", "GRP.HER.LIB": "LIBRARY", "GRP.HER.ARC": "ARCHIVE", + "GRP.HER.MUS": "MUSEUM", "GRP.HER.DIG": "DIGITAL", "GRP.HER.MIX": "MIXED", + "GRP.HER": "UNKNOWN", } - record["org_type"] = type_normalize.get(inst_type.upper(), inst_type.upper()) if inst_type else "" + + # Handle compound types (e.g., "M,F" -> "MUSEUM,FEATURES") + def normalize_type(t: str) -> str: + if not t: + return "" + # Split compound types and normalize each + parts = [p.strip() for p in t.upper().split(",")] + normalized = [type_normalize.get(p, p) for p in parts] + return ",".join(normalized) + + record["org_type"] = normalize_type(inst_type) record["wikidata_id"] = original.get("wikidata_id", "") record["original_entry_json"] = json.dumps(original, ensure_ascii=False, default=str) @@ -240,7 +261,7 @@ def extract_top_level_fields(data: dict) -> dict: return record -def load_yaml_files(directory: Path, limit: int = None) -> list[dict]: +def load_yaml_files(directory: Path, limit: int | None = None) -> list[dict]: """Load all YAML files from directory and convert to records.""" records = [] yaml_files = sorted(directory.glob("*.yaml"))