300 lines
13 KiB
Python
300 lines
13 KiB
Python
"""
Schema-Driven Label Loading for RAG Templates

Per Rule 41: Labels MUST be resolved at runtime from schema/reference files,
NOT hardcoded in templates or code.

This module loads multilingual labels for:
- Institution types (from CustodianType schema + InstitutionTypeCodeEnum)
- Subregions (from ISO 3166-2 JSON reference files)
- Countries (from ISO 3166-1 reference data)

Usage:
    from schema_labels import get_label_resolver

    resolver = get_label_resolver()
    label_nl = resolver.get_institution_type_label("M", "nl")  # "musea"
    label_en = resolver.get_subregion_label("NL-NH", "en")  # "North Holland"
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
from functools import lru_cache
|
|
from pathlib import Path
|
|
from typing import Dict, Optional
|
|
|
|
import yaml
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Base paths - detect deployment location automatically
|
|
def _detect_paths():
    """
    Locate the schema and reference-data directories for this deployment.

    Candidate directories are probed in order and the first match wins:

    1. Next to this module (production layout, e.g.
       /opt/glam-backend/rag/data/reference and
       /opt/glam-backend/rag/schemas/20251121/linkml).
    2. Two directory levels above this module (development monorepo layout,
       e.g. .../glam/data/reference and .../glam/schemas/20251121/linkml).
    3. The explicit production locations under /opt/glam-backend/rag.

    A reference directory only qualifies when it exists and contains at least
    one ``iso_3166_2_*.json`` file; a schema directory only has to exist.
    When no candidate qualifies, the first candidate is returned anyway and a
    warning is logged.

    Returns:
        A ``(schema_path, reference_path)`` tuple of ``Path`` objects.
    """
    module_dir = Path(__file__).parent  # .../rag/
    two_levels_up = module_dir.parent.parent

    # --- reference data -----------------------------------------------------
    reference_candidates = [
        module_dir / "data" / "reference",  # Production: beside this module
        two_levels_up / "data" / "reference",  # Dev: monorepo root
        Path("/opt/glam-backend/rag/data/reference"),  # Explicit production path
    ]
    reference_path = next(
        (
            candidate
            for candidate in reference_candidates
            if candidate.exists() and any(candidate.glob("iso_3166_2_*.json"))
        ),
        None,
    )
    if reference_path is None:
        # Nothing usable was found: hand back the first candidate regardless.
        reference_path = reference_candidates[0]
        logger.warning(f"No reference path found with ISO 3166-2 files, using {reference_path}")

    # --- schema -------------------------------------------------------------
    schema_candidates = [
        module_dir / "schemas" / "20251121" / "linkml",  # Production (if schemas deployed)
        two_levels_up / "schemas" / "20251121" / "linkml",  # Dev monorepo
        Path("/opt/glam-backend/rag/schemas/20251121/linkml"),  # Explicit production
    ]
    schema_path = next(
        (candidate for candidate in schema_candidates if candidate.exists()),
        None,
    )
    if schema_path is None:
        # Same fallback policy as for the reference directory.
        schema_path = schema_candidates[0]
        logger.warning(f"No schema path found, using {schema_path}")

    return schema_path, reference_path
|
|
|
|
|
|
# Resolved once at import time. SchemaLabelResolver falls back to these
# module-level defaults when its constructor receives no explicit paths.
SCHEMA_PATH, REFERENCE_PATH = _detect_paths()
|
|
|
|
|
|
class SchemaLabelResolver:
    """
    Load and resolve multilingual labels from schema and reference files.

    Two label tables are exposed:

    - institution type labels, read from ``InstitutionTypeCodeEnum.yaml``
      below ``schema_path`` (built-in fallback labels are used when that file
      is missing or cannot be processed);
    - subregion labels, read from every ``iso_3166_2_*.json`` file found in
      ``reference_path``.

    Each table is loaded on first access and then cached on the instance.
    """

    # Every single-letter institution type code that must end up with a label.
    _INSTITUTION_TYPE_CODES = "GLAMORCUBESFIXPHDNT"

    # Fallback labels derived from CustodianType subclass naming.
    # These match the schema's glamorcubesfixphdnt_code slot. They are used
    # wholesale when the enum file is unavailable, and always supply the
    # Dutch/German labels (the enum file only contributes English text).
    _FALLBACK_INSTITUTION_TYPE_LABELS: Dict[str, Dict[str, str]] = {
        "G": {"en": "galleries", "nl": "galerijen", "de": "Galerien"},
        "L": {"en": "libraries", "nl": "bibliotheken", "de": "Bibliotheken"},
        "A": {"en": "archives", "nl": "archieven", "de": "Archive"},
        "M": {"en": "museums", "nl": "musea", "de": "Museen"},
        "O": {"en": "official institutions", "nl": "overheidsinstellingen", "de": "Behörden"},
        "R": {"en": "research centers", "nl": "onderzoekscentra", "de": "Forschungszentren"},
        "C": {"en": "corporate archives", "nl": "bedrijfsarchieven", "de": "Unternehmensarchive"},
        "U": {"en": "institutions", "nl": "instellingen", "de": "Einrichtungen"},
        "B": {"en": "botanical gardens and zoos", "nl": "botanische tuinen en dierentuinen", "de": "botanische Gärten und Zoos"},
        "E": {"en": "education providers", "nl": "onderwijsinstellingen", "de": "Bildungseinrichtungen"},
        "S": {"en": "heritage societies", "nl": "heemkundige kringen", "de": "Heimatvereine"},
        "F": {"en": "features", "nl": "monumenten", "de": "Denkmäler"},
        "I": {"en": "intangible heritage groups", "nl": "immaterieel erfgoedgroepen", "de": "immaterielles Kulturerbe"},
        "X": {"en": "mixed institutions", "nl": "gecombineerde instellingen", "de": "gemischte Einrichtungen"},
        "P": {"en": "personal collections", "nl": "privéverzamelingen", "de": "Privatsammlungen"},
        "H": {"en": "holy sites", "nl": "religieuze erfgoedsites", "de": "religiöse Stätten"},
        "D": {"en": "digital platforms", "nl": "digitale platforms", "de": "digitale Plattformen"},
        "N": {"en": "heritage NGOs", "nl": "erfgoedorganisaties", "de": "Kulturerbe-NGOs"},
        "T": {"en": "taste/smell heritage", "nl": "culinair erfgoed", "de": "kulinarisches Erbe"},
    }

    # Name prefixes used to guess the language of a subdivision name.
    # "West" only counts as Dutch when hyphenated ("West-Vlaanderen"); a bare
    # "West" prefix would also capture English names such as "West Virginia".
    _DUTCH_NAME_PREFIXES = ("Noord", "Zuid", "Oost", "West-")
    _ENGLISH_NAME_PREFIXES = ("North", "South")

    def __init__(
        self,
        schema_path: Optional[Path] = None,
        reference_path: Optional[Path] = None
    ):
        """
        Create a resolver; no files are read until a label table is accessed.

        Args:
            schema_path: Directory containing ``modules/enums/``. Defaults to
                the module-level ``SCHEMA_PATH``. A ``str`` is accepted too.
            reference_path: Directory containing ``iso_3166_2_*.json`` files.
                Defaults to the module-level ``REFERENCE_PATH``. A ``str`` is
                accepted too.
        """
        # Coerce to Path so that "/" joins and .glob() also work for str input.
        self.schema_path = Path(schema_path) if schema_path else SCHEMA_PATH
        self.reference_path = Path(reference_path) if reference_path else REFERENCE_PATH

        # Lazy-loaded label dictionaries
        self._institution_type_labels: Optional[Dict[str, Dict[str, str]]] = None
        self._subregion_labels: Optional[Dict[str, Dict[str, str]]] = None
        # NOTE(review): nothing in this class populates or reads this table.
        self._country_labels: Optional[Dict[str, Dict[str, str]]] = None

    @property
    def institution_type_labels(self) -> Dict[str, Dict[str, str]]:
        """Load institution type labels from schema on first access."""
        if self._institution_type_labels is None:
            self._institution_type_labels = self._load_institution_type_labels()
        return self._institution_type_labels

    @property
    def subregion_labels(self) -> Dict[str, Dict[str, str]]:
        """Load subregion labels from reference files on first access."""
        if self._subregion_labels is None:
            self._subregion_labels = self._load_subregion_labels()
        return self._subregion_labels

    @classmethod
    def _fallback_institution_type_labels(cls) -> Dict[str, Dict[str, str]]:
        """Return a fresh copy of the fallback table so callers may mutate it."""
        return {
            code: dict(names)
            for code, names in cls._FALLBACK_INSTITUTION_TYPE_LABELS.items()
        }

    @staticmethod
    def _pluralize_en(noun: str) -> str:
        """Pluralize an English noun: consonant + "y" -> "ies", otherwise + "s"."""
        if len(noun) >= 2 and noun.endswith("y") and noun[-2] not in "aeiou":
            return noun[:-1] + "ies"
        return noun + "s"

    @classmethod
    def _labels_from_enum_data(
        cls,
        enum_data,
        fallback: Dict[str, Dict[str, str]],
    ) -> Dict[str, Dict[str, str]]:
        """
        Build per-code labels from parsed ``InstitutionTypeCodeEnum`` YAML.

        The English label is the pluralized, lower-cased description of each
        permissible value; Dutch and German labels come from ``fallback``.
        Missing levels (empty file, value without a body or description) are
        tolerated and resolved through ``fallback`` or, failing that, the code.
        """
        enums = (enum_data or {}).get("enums") or {}
        enum_def = enums.get("InstitutionTypeCodeEnum") or {}
        permissible_values = enum_def.get("permissible_values") or {}

        labels: Dict[str, Dict[str, str]] = {}
        for code, value_info in permissible_values.items():
            # "M:" with no body loads as None; block scalars carry a trailing
            # newline, hence the strip().
            description = ((value_info or {}).get("description") or "").strip()
            curated = fallback.get(code, {})
            if description:
                english = cls._pluralize_en(description.lower())
            else:
                english = curated.get("en", code)
            labels[code] = {
                "en": english,
                "nl": curated.get("nl", code),
                "de": curated.get("de", code),
            }
        return labels

    def _load_institution_type_labels(self) -> Dict[str, Dict[str, str]]:
        """
        Load institution type labels from InstitutionTypeCodeEnum.

        Falls back to the built-in labels when the enum file is missing or
        cannot be read/parsed, and guarantees an entry for every code in
        ``GLAMORCUBESFIXPHDNT``.

        Returns dict like:
            {"M": {"nl": "musea", "en": "museums", "de": "Museen"}, ...}
        """
        fallback = self._fallback_institution_type_labels()

        # Primary source: InstitutionTypeCodeEnum with descriptions
        enum_path = self.schema_path / "modules" / "enums" / "InstitutionTypeCodeEnum.yaml"

        try:
            if enum_path.exists():
                with open(enum_path, "r", encoding="utf-8") as f:
                    enum_data = yaml.safe_load(f)
                labels = self._labels_from_enum_data(enum_data, fallback)
                logger.info("Loaded %d institution type labels from schema", len(labels))
            else:
                logger.warning("Schema file not found: %s, using fallback labels", enum_path)
                labels = fallback
        except Exception as e:
            # Deliberately best-effort: any I/O, YAML or shape problem must
            # not prevent label resolution, so use the built-in table.
            logger.error("Error loading institution type labels: %s, using fallback", e)
            labels = fallback

        # Ensure all codes have labels
        for code in self._INSTITUTION_TYPE_CODES:
            if code not in labels:
                labels[code] = fallback.get(code, {"en": code, "nl": code, "de": code})

        return labels

    @classmethod
    def _classify_subregion_name(cls, name: str) -> Optional[str]:
        """Guess a subdivision name's language: "nl", "en", or None if unknown."""
        if name.startswith(cls._DUTCH_NAME_PREFIXES):
            return "nl"
        if name.startswith(cls._ENGLISH_NAME_PREFIXES):
            return "en"
        return None

    @classmethod
    def _build_subregion_labels(
        cls,
        country_code: str,
        provinces: Dict[str, str],
    ) -> Dict[str, Dict[str, str]]:
        """
        Invert a ``{name: subdivision_code}`` mapping into per-code labels.

        Names whose language can be guessed are stored under that language;
        any other name fills whichever of "nl"/"en" is still empty for its
        code.
        """
        code_to_names: Dict[str, Dict[str, str]] = {}
        for name, subdivision_code in provinces.items():
            full_code = f"{country_code}-{subdivision_code}"
            names = code_to_names.setdefault(full_code, {})

            language = cls._classify_subregion_name(name)
            if language is not None:
                names[language] = name
            else:
                # Default: use for both if no language-specific version exists
                names.setdefault("nl", name)
                names.setdefault("en", name)
        return code_to_names

    def _load_subregion_labels(self) -> Dict[str, Dict[str, str]]:
        """
        Load subregion labels from ISO 3166-2 JSON files.

        The country code is taken from the file name
        (``iso_3166_2_<cc>.json``) and each file's ``provinces`` mapping of
        name -> subdivision code is inverted. A file that cannot be read or
        processed is logged and skipped without affecting the other files.

        Returns dict like:
            {"NL-NH": {"nl": "Noord-Holland", "en": "North Holland"}, ...}
        """
        labels: Dict[str, Dict[str, str]] = {}

        try:
            # Sorted so the load order does not depend on the filesystem.
            json_files = sorted(self.reference_path.glob("iso_3166_2_*.json"))
        except Exception as e:
            logger.error("Error loading subregion labels: %s", e)
            return labels

        for json_file in json_files:
            country_code = json_file.stem.replace("iso_3166_2_", "").upper()
            try:
                with open(json_file, "r", encoding="utf-8") as f:
                    data = json.load(f)
                provinces = data.get("provinces", {})
                # Build first, merge after: a failing file contributes nothing.
                labels.update(self._build_subregion_labels(country_code, provinces))
            except Exception as e:
                # Best-effort per file so one bad file cannot hide the rest.
                logger.error("Error loading subregion labels from %s: %s", json_file.name, e)
                continue
            logger.debug("Loaded %d subregion labels from %s", len(provinces), json_file.name)

        logger.info("Loaded %d total subregion labels from reference files", len(labels))
        return labels

    @staticmethod
    def _pick_label(labels: Dict[str, str], code: str, language: str) -> str:
        """
        Choose the best label: requested language, then English, then any
        other available language, and finally the raw code itself.
        """
        if language in labels:
            return labels[language]
        if "en" in labels:
            return labels["en"]
        return next(iter(labels.values()), code)

    def get_institution_type_label(self, code: str, language: str = "en") -> str:
        """Get human-readable label for institution type code (code if unknown)."""
        return self._pick_label(self.institution_type_labels.get(code, {}), code, language)

    def get_subregion_label(self, code: str, language: str = "en") -> str:
        """Get human-readable label for subregion code (code if unknown)."""
        return self._pick_label(self.subregion_labels.get(code, {}), code, language)

    def get_all_institution_type_labels(self, language: str = "en") -> Dict[str, str]:
        """Get all institution type labels for a language (for template interpolation)."""
        return {
            code: self.get_institution_type_label(code, language)
            for code in self.institution_type_labels
        }

    def get_all_subregion_labels(self, language: str = "en") -> Dict[str, str]:
        """Get all subregion labels for a language (for template interpolation)."""
        return {
            code: self.get_subregion_label(code, language)
            for code in self.subregion_labels
        }
|
|
|
|
|
|
# Singleton instance for efficient reuse
|
|
# Starts out empty; get_label_resolver() creates and stores the instance.
_label_resolver: Optional[SchemaLabelResolver] = None
|
|
|
|
|
|
def get_label_resolver(
    schema_path: Optional[Path] = None,
    reference_path: Optional[Path] = None
) -> SchemaLabelResolver:
    """
    Return the shared label resolver, building it when necessary.

    Called without arguments, the cached instance is returned (and created
    on first use). When either path is supplied, a new resolver is built
    with those paths and it replaces the cached instance, so later calls
    without arguments return that new resolver.

    Args:
        schema_path: Optional schema directory passed to SchemaLabelResolver.
        reference_path: Optional reference-data directory passed to
            SchemaLabelResolver.

    Returns:
        The module-wide SchemaLabelResolver instance.
    """
    global _label_resolver

    # Fast path: nothing custom requested and an instance already exists.
    if not (schema_path or reference_path) and _label_resolver is not None:
        return _label_resolver

    _label_resolver = SchemaLabelResolver(schema_path, reference_path)
    return _label_resolver
|
|
|
|
|
|
# Convenience functions for common use cases
|
|
def get_institution_type_labels_nl() -> Dict[str, str]:
    """Return a code -> Dutch label mapping covering every institution type."""
    resolver = get_label_resolver()
    dutch_labels = resolver.get_all_institution_type_labels("nl")
    return dutch_labels
|
|
|
|
|
|
def get_institution_type_labels_en() -> Dict[str, str]:
    """Return a code -> English label mapping covering every institution type."""
    resolver = get_label_resolver()
    english_labels = resolver.get_all_institution_type_labels("en")
    return english_labels
|
|
|
|
|
|
def get_subregion_labels() -> Dict[str, str]:
    """Return a code -> label mapping for all subregions, in Dutch (the default)."""
    resolver = get_label_resolver()
    dutch_labels = resolver.get_all_subregion_labels("nl")
    return dutch_labels
|