fix(scripts): normalize org_type codes in DuckLake loader
- Handle single-letter GLAM type codes (G, L, A, M, O, R, C, etc.)
- Handle legacy GRP.HER.* format
- Support compound types like 'M,F' -> 'MUSEUM,FEATURES'
- Fix type hint syntax for Python 3.10+
This commit is contained in:
parent
6d66e67bf4
commit
18874e6070
1 changed file with 23 additions and 2 deletions
|
|
@ -117,14 +117,35 @@ def extract_top_level_fields(data: dict) -> dict:
|
|||
|
||||
# Normalize institution type names to standard codes
|
||||
type_normalize = {
|
||||
# Full names to shorter standard codes
|
||||
"OFFICIAL_INSTITUTION": "OFFICIAL",
|
||||
"RESEARCH_CENTER": "RESEARCH",
|
||||
"BOTANICAL_ZOO": "BOTANICAL",
|
||||
"EDUCATION_PROVIDER": "EDUCATION",
|
||||
"COLLECTING_SOCIETY": "SOCIETY",
|
||||
"INTANGIBLE_HERITAGE_GROUP": "INTANGIBLE",
|
||||
# Single-letter codes to standard codes
|
||||
"G": "GALLERY", "L": "LIBRARY", "A": "ARCHIVE", "M": "MUSEUM",
|
||||
"O": "OFFICIAL", "R": "RESEARCH", "C": "CORPORATION", "U": "UNKNOWN",
|
||||
"B": "BOTANICAL", "E": "EDUCATION", "S": "SOCIETY", "F": "FEATURES",
|
||||
"I": "INTANGIBLE", "X": "MIXED", "P": "PERSONAL", "H": "HOLY_SITES",
|
||||
"D": "DIGITAL", "N": "NGO", "T": "TASTE_SMELL",
|
||||
# Legacy GRP.HER format
|
||||
"GRP.HER.GAL": "GALLERY", "GRP.HER.LIB": "LIBRARY", "GRP.HER.ARC": "ARCHIVE",
|
||||
"GRP.HER.MUS": "MUSEUM", "GRP.HER.DIG": "DIGITAL", "GRP.HER.MIX": "MIXED",
|
||||
"GRP.HER": "UNKNOWN",
|
||||
}
|
||||
record["org_type"] = type_normalize.get(inst_type.upper(), inst_type.upper()) if inst_type else ""
|
||||
|
||||
# Handle compound types (e.g., "M,F" -> "MUSEUM,FEATURES")
|
||||
def normalize_type(t: str) -> str:
    """Normalize a possibly compound institution type string.

    The input is upper-cased and split on commas. Each piece is stripped
    and looked up in ``type_normalize``; a piece with no mapping is kept
    as its upper-cased self (e.g. "M,F" -> "MUSEUM,FEATURES").

    Empty pieces caused by stray, doubled or trailing commas ("M,",
    "M, ,F") are dropped, so the result never contains empty segments.

    Returns "" for empty input.
    """
    if not t:
        return ""
    # str.split(",") yields empty strings for adjacent or trailing
    # separators, so filter them out after stripping whitespace.
    parts = [p.strip() for p in t.upper().split(",")]
    return ",".join(type_normalize.get(p, p) for p in parts if p)
|
||||
|
||||
record["org_type"] = normalize_type(inst_type)
|
||||
|
||||
record["wikidata_id"] = original.get("wikidata_id", "")
|
||||
record["original_entry_json"] = json.dumps(original, ensure_ascii=False, default=str)
|
||||
|
|
@ -240,7 +261,7 @@ def extract_top_level_fields(data: dict) -> dict:
|
|||
return record
|
||||
|
||||
|
||||
def load_yaml_files(directory: Path, limit: int = None) -> list[dict]:
|
||||
def load_yaml_files(directory: Path, limit: int | None = None) -> list[dict]:
|
||||
"""Load all YAML files from directory and convert to records."""
|
||||
records = []
|
||||
yaml_files = sorted(directory.glob("*.yaml"))
|
||||
|
|
|
|||
Loading…
Reference in a new issue