""" Schema-Driven Label Loading for RAG Templates Per Rule 41: Labels MUST be resolved at runtime from schema/reference files, NOT hardcoded in templates or code. This module loads multilingual labels for: - Institution types (from CustodianType schema + InstitutionTypeCodeEnum) - Subregions (from ISO 3166-2 JSON reference files) - Countries (from ISO 3166-1 reference data) Usage: from schema_labels import get_label_resolver resolver = get_label_resolver() label_nl = resolver.get_institution_type_label("M", "nl") # "musea" label_en = resolver.get_subregion_label("NL-NH", "en") # "North Holland" """ import json import logging import os from functools import lru_cache from pathlib import Path from typing import Dict, Optional import yaml logger = logging.getLogger(__name__) # Base paths - detect deployment location automatically def _detect_paths(): """ Detect correct paths for schema and reference files. Supports two deployment scenarios: 1. Development: /Users/.../glam/backend/rag/schema_labels.py - Schema at: /Users/.../glam/schemas/20251121/linkml/ - Reference at: /Users/.../glam/data/reference/ 2. Production: /opt/glam-backend/rag/schema_labels.py - Schema at: /opt/glam-backend/rag/schemas/20251121/linkml/ (if deployed) - Reference at: /opt/glam-backend/rag/data/reference/ """ script_dir = Path(__file__).parent # .../rag/ # Try different possible locations for reference data possible_reference_paths = [ script_dir / "data" / "reference", # Production: /opt/glam-backend/rag/data/reference script_dir.parent.parent / "data" / "reference", # Dev: /Users/.../glam/data/reference Path("/opt/glam-backend/rag/data/reference"), # Explicit production path ] reference_path = None for path in possible_reference_paths: if path.exists() and list(path.glob("iso_3166_2_*.json")): reference_path = path break if reference_path is None: # Fallback to first option even if doesn't exist (will use fallback labels) reference_path = possible_reference_paths[0] logger.warning(f"No reference path found with ISO 3166-2 files, using {reference_path}") # Try different possible locations for schema possible_schema_paths = [ script_dir / "schemas" / "20251121" / "linkml", # Production (if schemas deployed) script_dir.parent.parent / "schemas" / "20251121" / "linkml", # Dev monorepo Path("/opt/glam-backend/rag/schemas/20251121/linkml"), # Explicit production ] schema_path = None for path in possible_schema_paths: if path.exists(): schema_path = path break if schema_path is None: # Fallback - will use fallback labels for institution types schema_path = possible_schema_paths[0] logger.warning(f"No schema path found, using {schema_path}") return schema_path, reference_path SCHEMA_PATH, REFERENCE_PATH = _detect_paths() class SchemaLabelResolver: """ Loads and resolves labels from schema and reference files. Labels are loaded once at initialization and cached for performance. """ def __init__( self, schema_path: Optional[Path] = None, reference_path: Optional[Path] = None ): self.schema_path = schema_path or SCHEMA_PATH self.reference_path = reference_path or REFERENCE_PATH # Lazy-loaded label dictionaries self._institution_type_labels: Optional[Dict[str, Dict[str, str]]] = None self._subregion_labels: Optional[Dict[str, Dict[str, str]]] = None self._country_labels: Optional[Dict[str, Dict[str, str]]] = None @property def institution_type_labels(self) -> Dict[str, Dict[str, str]]: """Load institution type labels from schema on first access.""" if self._institution_type_labels is None: self._institution_type_labels = self._load_institution_type_labels() return self._institution_type_labels @property def subregion_labels(self) -> Dict[str, Dict[str, str]]: """Load subregion labels from reference files on first access.""" if self._subregion_labels is None: self._subregion_labels = self._load_subregion_labels() return self._subregion_labels def _load_institution_type_labels(self) -> Dict[str, Dict[str, str]]: """ Load institution type labels from InstitutionTypeCodeEnum. Returns dict like: {"M": {"nl": "musea", "en": "museums", "de": "Museen"}, ...} """ labels: Dict[str, Dict[str, str]] = {} # Primary source: InstitutionTypeCodeEnum with descriptions enum_path = self.schema_path / "modules" / "enums" / "InstitutionTypeCodeEnum.yaml" # Fallback labels derived from CustodianType subclass naming # These match the schema's glamorcubesfixphdnt_code slot fallback_labels = { "G": {"en": "galleries", "nl": "galerijen", "de": "Galerien"}, "L": {"en": "libraries", "nl": "bibliotheken", "de": "Bibliotheken"}, "A": {"en": "archives", "nl": "archieven", "de": "Archive"}, "M": {"en": "museums", "nl": "musea", "de": "Museen"}, "O": {"en": "official institutions", "nl": "overheidsinstellingen", "de": "Behörden"}, "R": {"en": "research centers", "nl": "onderzoekscentra", "de": "Forschungszentren"}, "C": {"en": "corporate archives", "nl": "bedrijfsarchieven", "de": "Unternehmensarchive"}, "U": {"en": "institutions", "nl": "instellingen", "de": "Einrichtungen"}, "B": {"en": "botanical gardens and zoos", "nl": "botanische tuinen en dierentuinen", "de": "botanische Gärten und Zoos"}, "E": {"en": "education providers", "nl": "onderwijsinstellingen", "de": "Bildungseinrichtungen"}, "S": {"en": "heritage societies", "nl": "heemkundige kringen", "de": "Heimatvereine"}, "F": {"en": "features", "nl": "monumenten", "de": "Denkmäler"}, "I": {"en": "intangible heritage groups", "nl": "immaterieel erfgoedgroepen", "de": "immaterielles Kulturerbe"}, "X": {"en": "mixed institutions", "nl": "gecombineerde instellingen", "de": "gemischte Einrichtungen"}, "P": {"en": "personal collections", "nl": "privéverzamelingen", "de": "Privatsammlungen"}, "H": {"en": "holy sites", "nl": "religieuze erfgoedsites", "de": "religiöse Stätten"}, "D": {"en": "digital platforms", "nl": "digitale platforms", "de": "digitale Plattformen"}, "N": {"en": "heritage NGOs", "nl": "erfgoedorganisaties", "de": "Kulturerbe-NGOs"}, "T": {"en": "taste/smell heritage", "nl": "culinair erfgoed", "de": "kulinarisches Erbe"}, } try: if enum_path.exists(): with open(enum_path, 'r', encoding='utf-8') as f: enum_data = yaml.safe_load(f) # Extract labels from enum descriptions permissible_values = enum_data.get('enums', {}).get('InstitutionTypeCodeEnum', {}).get('permissible_values', {}) for code, value_info in permissible_values.items(): description = value_info.get('description', '') # Use description as English label, fallback for other languages labels[code] = { "en": description.lower() + "s" if description else fallback_labels.get(code, {}).get("en", code), "nl": fallback_labels.get(code, {}).get("nl", code), "de": fallback_labels.get(code, {}).get("de", code), } logger.info(f"Loaded {len(labels)} institution type labels from schema") else: logger.warning(f"Schema file not found: {enum_path}, using fallback labels") labels = fallback_labels except Exception as e: logger.error(f"Error loading institution type labels: {e}, using fallback") labels = fallback_labels # Ensure all codes have labels for code in "GLAMORCUBESFIXPHDNT": if code not in labels: labels[code] = fallback_labels.get(code, {"en": code, "nl": code, "de": code}) return labels def _load_subregion_labels(self) -> Dict[str, Dict[str, str]]: """ Load subregion labels from ISO 3166-2 JSON files. Returns dict like: {"NL-NH": {"nl": "Noord-Holland", "en": "North Holland"}, ...} """ labels: Dict[str, Dict[str, str]] = {} # Load all iso_3166_2_*.json files try: for json_file in self.reference_path.glob("iso_3166_2_*.json"): country_code = json_file.stem.replace("iso_3166_2_", "").upper() with open(json_file, 'r', encoding='utf-8') as f: data = json.load(f) provinces = data.get('provinces', {}) # Build reverse lookup: code -> names code_to_names: Dict[str, Dict[str, str]] = {} for name, subdivision_code in provinces.items(): full_code = f"{country_code}-{subdivision_code}" if full_code not in code_to_names: code_to_names[full_code] = {} # Detect language from name characteristics # Dutch names often have hyphenated prefixes like "Noord-" or "Zuid-" if any(name.startswith(prefix) for prefix in ["Noord", "Zuid", "Oost", "West"]): code_to_names[full_code]["nl"] = name elif name.startswith("North") or name.startswith("South"): code_to_names[full_code]["en"] = name else: # Default: use for both if no language-specific version exists if "nl" not in code_to_names[full_code]: code_to_names[full_code]["nl"] = name if "en" not in code_to_names[full_code]: code_to_names[full_code]["en"] = name labels.update(code_to_names) logger.debug(f"Loaded {len(provinces)} subregion labels from {json_file.name}") logger.info(f"Loaded {len(labels)} total subregion labels from reference files") except Exception as e: logger.error(f"Error loading subregion labels: {e}") return labels def get_institution_type_label(self, code: str, language: str = "en") -> str: """Get human-readable label for institution type code.""" labels = self.institution_type_labels.get(code, {}) return labels.get(language, labels.get("en", code)) def get_subregion_label(self, code: str, language: str = "en") -> str: """Get human-readable label for subregion code.""" labels = self.subregion_labels.get(code, {}) return labels.get(language, labels.get("en", code)) def get_all_institution_type_labels(self, language: str = "en") -> Dict[str, str]: """Get all institution type labels for a language (for template interpolation).""" return { code: self.get_institution_type_label(code, language) for code in self.institution_type_labels } def get_all_subregion_labels(self, language: str = "en") -> Dict[str, str]: """Get all subregion labels for a language (for template interpolation).""" return { code: self.get_subregion_label(code, language) for code in self.subregion_labels } # Singleton instance for efficient reuse _label_resolver: Optional[SchemaLabelResolver] = None def get_label_resolver( schema_path: Optional[Path] = None, reference_path: Optional[Path] = None ) -> SchemaLabelResolver: """ Get the singleton label resolver instance. Creates a new instance if paths are specified, otherwise returns cached instance. """ global _label_resolver if schema_path or reference_path or _label_resolver is None: _label_resolver = SchemaLabelResolver(schema_path, reference_path) return _label_resolver # Convenience functions for common use cases def get_institution_type_labels_nl() -> Dict[str, str]: """Get Dutch labels for all institution types.""" return get_label_resolver().get_all_institution_type_labels("nl") def get_institution_type_labels_en() -> Dict[str, str]: """Get English labels for all institution types.""" return get_label_resolver().get_all_institution_type_labels("en") def get_subregion_labels() -> Dict[str, str]: """Get default (Dutch) labels for all subregions.""" return get_label_resolver().get_all_subregion_labels("nl")