fix(scripts): normalize org_type codes in DuckLake loader

- Handle single-letter GLAM type codes (G, L, A, M, O, R, C, etc.)
- Handle legacy GRP.HER.* format
- Support compound types like 'M,F' -> 'MUSEUM,FEATURES'
- Fix type hint for the optional `limit` parameter of load_yaml_files: `int = None` -> `int | None = None` (union syntax, Python 3.10+)
This commit is contained in:
kempersc 2025-12-07 19:21:14 +01:00
parent 6d66e67bf4
commit 18874e6070

View file

@ -117,14 +117,35 @@ def extract_top_level_fields(data: dict) -> dict:
# Normalize institution type names to standard codes
type_normalize = {
# Full names to shorter standard codes
"OFFICIAL_INSTITUTION": "OFFICIAL",
"RESEARCH_CENTER": "RESEARCH",
"BOTANICAL_ZOO": "BOTANICAL",
"EDUCATION_PROVIDER": "EDUCATION",
"COLLECTING_SOCIETY": "SOCIETY",
"INTANGIBLE_HERITAGE_GROUP": "INTANGIBLE",
# Single-letter codes to full names
"G": "GALLERY", "L": "LIBRARY", "A": "ARCHIVE", "M": "MUSEUM",
"O": "OFFICIAL", "R": "RESEARCH", "C": "CORPORATION", "U": "UNKNOWN",
"B": "BOTANICAL", "E": "EDUCATION", "S": "SOCIETY", "F": "FEATURES",
"I": "INTANGIBLE", "X": "MIXED", "P": "PERSONAL", "H": "HOLY_SITES",
"D": "DIGITAL", "N": "NGO", "T": "TASTE_SMELL",
# Legacy GRP.HER format
"GRP.HER.GAL": "GALLERY", "GRP.HER.LIB": "LIBRARY", "GRP.HER.ARC": "ARCHIVE",
"GRP.HER.MUS": "MUSEUM", "GRP.HER.DIG": "DIGITAL", "GRP.HER.MIX": "MIXED",
"GRP.HER": "UNKNOWN",
}
record["org_type"] = type_normalize.get(inst_type.upper(), inst_type.upper()) if inst_type else ""
# Handle compound types (e.g., "M,F" -> "MUSEUM,FEATURES")
def normalize_type(t: str) -> str:
    """Normalize a (possibly compound) institution type string.

    The value is upper-cased and split on commas; each segment is stripped
    of surrounding whitespace and looked up in ``type_normalize``. Segments
    without a mapping are kept as their upper-cased text. Empty segments,
    as produced by stray or trailing commas ("M,", "M,,F", " , "), are
    dropped so the result never contains dangling separators.

    Args:
        t: Raw type value, e.g. "M", "m, f" or "GRP.HER.MUS".

    Returns:
        The normalized segments joined with ",", or "" when ``t`` is falsy
        or contains no non-empty segment.
    """
    if not t:
        return ""
    # str.split(",") yields "" for adjacent/trailing commas; skip those
    # so "M," becomes "MUSEUM" rather than "MUSEUM,".
    segments = (piece.strip() for piece in t.upper().split(","))
    return ",".join(type_normalize.get(seg, seg) for seg in segments if seg)
record["org_type"] = normalize_type(inst_type)
record["wikidata_id"] = original.get("wikidata_id", "")
record["original_entry_json"] = json.dumps(original, ensure_ascii=False, default=str)
@ -240,7 +261,7 @@ def extract_top_level_fields(data: dict) -> dict:
return record
def load_yaml_files(directory: Path, limit: int = None) -> list[dict]:
def load_yaml_files(directory: Path, limit: int | None = None) -> list[dict]:
"""Load all YAML files from directory and convert to records."""
records = []
yaml_files = sorted(directory.glob("*.yaml"))