# glam/backend/rag/schema_labels.py
# (300 lines, 13 KiB, Python)

"""
Schema-Driven Label Loading for RAG Templates
Per Rule 41: Labels MUST be resolved at runtime from schema/reference files,
NOT hardcoded in templates or code.
This module loads multilingual labels for:
- Institution types (from CustodianType schema + InstitutionTypeCodeEnum)
- Subregions (from ISO 3166-2 JSON reference files)
- Countries (from ISO 3166-1 reference data)
Usage:
from schema_labels import get_label_resolver
resolver = get_label_resolver()
label_nl = resolver.get_institution_type_label("M", "nl") # "musea"
label_en = resolver.get_subregion_label("NL-NH", "en") # "North Holland"
"""
import json
import logging
import os
from functools import lru_cache
from pathlib import Path
from typing import Dict, Optional
import yaml
logger = logging.getLogger(__name__)
# Base paths - detect deployment location automatically
def _detect_paths():
    """
    Work out where the schema and reference files live for this deployment.

    Two layouts are probed, in this order of preference:

    1. Production: /opt/glam-backend/rag/schema_labels.py
       - Schema at: /opt/glam-backend/rag/schemas/20251121/linkml/ (if deployed)
       - Reference at: /opt/glam-backend/rag/data/reference/
    2. Development: /Users/.../glam/backend/rag/schema_labels.py
       - Schema at: /Users/.../glam/schemas/20251121/linkml/
       - Reference at: /Users/.../glam/data/reference/

    Returns:
        A ``(schema_path, reference_path)`` tuple. When no candidate
        qualifies, the first candidate is returned anyway and a warning is
        logged; label loading then relies on its own fallbacks.
    """
    rag_dir = Path(__file__).parent  # .../rag/
    repo_root = rag_dir.parent.parent  # two levels above .../rag/

    # Reference data: a directory only counts if it actually holds
    # ISO 3166-2 files, not merely if it exists.
    reference_candidates = [
        rag_dir / "data" / "reference",  # Production, relative to this file
        repo_root / "data" / "reference",  # Dev monorepo
        Path("/opt/glam-backend/rag/data/reference"),  # Explicit production path
    ]
    reference_path = next(
        (
            candidate
            for candidate in reference_candidates
            if candidate.exists() and any(candidate.glob("iso_3166_2_*.json"))
        ),
        None,
    )
    if reference_path is None:
        # Nothing usable found: keep the first option so callers get a Path.
        reference_path = reference_candidates[0]
        logger.warning(f"No reference path found with ISO 3166-2 files, using {reference_path}")

    # Schema: existence of the directory is enough.
    schema_candidates = [
        rag_dir / "schemas" / "20251121" / "linkml",  # Production (if schemas deployed)
        repo_root / "schemas" / "20251121" / "linkml",  # Dev monorepo
        Path("/opt/glam-backend/rag/schemas/20251121/linkml"),  # Explicit production
    ]
    schema_path = next(
        (candidate for candidate in schema_candidates if candidate.exists()),
        None,
    )
    if schema_path is None:
        # Institution type labels will come from the built-in fallback table.
        schema_path = schema_candidates[0]
        logger.warning(f"No schema path found, using {schema_path}")

    return schema_path, reference_path
# Resolved once, when this module is imported. SchemaLabelResolver uses these
# as defaults when it is constructed without explicit paths.
SCHEMA_PATH, REFERENCE_PATH = _detect_paths()
class SchemaLabelResolver:
    """
    Load and resolve multilingual labels from schema and reference files.

    Label tables are built lazily on first access and then cached on the
    instance, so the files are read at most once per resolver.

    Args:
        schema_path: Directory holding the LinkML schema (the enum file is
            looked up at ``modules/enums/InstitutionTypeCodeEnum.yaml``
            beneath it). Defaults to the module-level ``SCHEMA_PATH``.
        reference_path: Directory holding ``iso_3166_2_*.json`` files.
            Defaults to the module-level ``REFERENCE_PATH``.
    """

    # Every institution type code that must always resolve to a label.
    _INSTITUTION_TYPE_CODES = "GLAMORCUBESFIXPHDNT"

    # Fallback labels derived from CustodianType subclass naming.
    # These match the schema's glamorcubesfixphdnt_code slot.
    _FALLBACK_INSTITUTION_TYPE_LABELS: Dict[str, Dict[str, str]] = {
        "G": {"en": "galleries", "nl": "galerijen", "de": "Galerien"},
        "L": {"en": "libraries", "nl": "bibliotheken", "de": "Bibliotheken"},
        "A": {"en": "archives", "nl": "archieven", "de": "Archive"},
        "M": {"en": "museums", "nl": "musea", "de": "Museen"},
        "O": {"en": "official institutions", "nl": "overheidsinstellingen", "de": "Behörden"},
        "R": {"en": "research centers", "nl": "onderzoekscentra", "de": "Forschungszentren"},
        "C": {"en": "corporate archives", "nl": "bedrijfsarchieven", "de": "Unternehmensarchive"},
        "U": {"en": "institutions", "nl": "instellingen", "de": "Einrichtungen"},
        "B": {
            "en": "botanical gardens and zoos",
            "nl": "botanische tuinen en dierentuinen",
            "de": "botanische Gärten und Zoos",
        },
        "E": {"en": "education providers", "nl": "onderwijsinstellingen", "de": "Bildungseinrichtungen"},
        "S": {"en": "heritage societies", "nl": "heemkundige kringen", "de": "Heimatvereine"},
        "F": {"en": "features", "nl": "monumenten", "de": "Denkmäler"},
        "I": {
            "en": "intangible heritage groups",
            "nl": "immaterieel erfgoedgroepen",
            "de": "immaterielles Kulturerbe",
        },
        "X": {"en": "mixed institutions", "nl": "gecombineerde instellingen", "de": "gemischte Einrichtungen"},
        "P": {"en": "personal collections", "nl": "privéverzamelingen", "de": "Privatsammlungen"},
        "H": {"en": "holy sites", "nl": "religieuze erfgoedsites", "de": "religiöse Stätten"},
        "D": {"en": "digital platforms", "nl": "digitale platforms", "de": "digitale Plattformen"},
        "N": {"en": "heritage NGOs", "nl": "erfgoedorganisaties", "de": "Kulturerbe-NGOs"},
        "T": {"en": "taste/smell heritage", "nl": "culinair erfgoed", "de": "kulinarisches Erbe"},
    }

    # Name prefixes used to guess the language of a subdivision name.
    # English is tested first: "West " (with a space) and "Western" are
    # English, while a bare "West" prefix (e.g. "West-Vlaanderen") is Dutch.
    _ENGLISH_NAME_PREFIXES = ("North", "South", "East", "West ", "Western")
    _DUTCH_NAME_PREFIXES = ("Noord", "Zuid", "Oost", "West")

    def __init__(
        self,
        schema_path: Optional[Path] = None,
        reference_path: Optional[Path] = None
    ):
        # Path(...) also accepts plain strings, so callers may pass either.
        self.schema_path = Path(schema_path) if schema_path else SCHEMA_PATH
        self.reference_path = Path(reference_path) if reference_path else REFERENCE_PATH
        # Lazy-loaded label dictionaries
        self._institution_type_labels: Optional[Dict[str, Dict[str, str]]] = None
        self._subregion_labels: Optional[Dict[str, Dict[str, str]]] = None
        # Reserved for country labels; nothing in this class populates it.
        self._country_labels: Optional[Dict[str, Dict[str, str]]] = None

    @property
    def institution_type_labels(self) -> Dict[str, Dict[str, str]]:
        """Institution type labels keyed by code, loaded on first access."""
        if self._institution_type_labels is None:
            self._institution_type_labels = self._load_institution_type_labels()
        return self._institution_type_labels

    @property
    def subregion_labels(self) -> Dict[str, Dict[str, str]]:
        """Subregion labels keyed by ISO 3166-2 code, loaded on first access."""
        if self._subregion_labels is None:
            self._subregion_labels = self._load_subregion_labels()
        return self._subregion_labels

    @classmethod
    def _fallback_institution_type_labels(cls) -> Dict[str, Dict[str, str]]:
        """Return a fresh copy of the fallback table so callers cannot mutate the shared one."""
        return {
            code: dict(names)
            for code, names in cls._FALLBACK_INSTITUTION_TYPE_LABELS.items()
        }

    @staticmethod
    def _english_label_from_description(description: str) -> str:
        """Turn a schema description into a lowercase plural English label."""
        # YAML block scalars carry a trailing newline; strip before appending.
        text = description.strip().lower()
        return text if text.endswith("s") else text + "s"

    @staticmethod
    def _pick_label(labels: Dict[str, str], language: str, code: str) -> str:
        """Choose a label: requested language, then English, then any language, then the code."""
        if language in labels:
            return labels[language]
        if "en" in labels:
            return labels["en"]
        # A label in another language is more useful to a reader than the raw code.
        return next(iter(labels.values()), code)

    def _load_institution_type_labels(self) -> Dict[str, Dict[str, str]]:
        """
        Load institution type labels from InstitutionTypeCodeEnum.

        The English label comes from each permissible value's description;
        Dutch and German always come from the fallback table. If the enum
        file is missing or cannot be read, the fallback table is used as-is.

        Returns dict like:
            {"M": {"nl": "musea", "en": "museums", "de": "Museen"}, ...}
        """
        fallback = self._fallback_institution_type_labels()
        labels: Dict[str, Dict[str, str]] = {}
        # Primary source: InstitutionTypeCodeEnum with descriptions
        enum_path = self.schema_path / "modules" / "enums" / "InstitutionTypeCodeEnum.yaml"

        try:
            if enum_path.exists():
                with open(enum_path, "r", encoding="utf-8") as f:
                    enum_data = yaml.safe_load(f) or {}
                enum_def = (enum_data.get("enums") or {}).get("InstitutionTypeCodeEnum") or {}
                permissible_values = enum_def.get("permissible_values") or {}
                for code, value_info in permissible_values.items():
                    # A permissible value may be declared without a body
                    # (parsed as None); treat that as "no description".
                    description = (
                        value_info.get("description")
                        if isinstance(value_info, dict)
                        else None
                    )
                    code_fallback = fallback.get(code, {})
                    if isinstance(description, str) and description.strip():
                        english = self._english_label_from_description(description)
                    else:
                        english = code_fallback.get("en", code)
                    labels[code] = {
                        "en": english,
                        "nl": code_fallback.get("nl", code),
                        "de": code_fallback.get("de", code),
                    }
                logger.info("Loaded %d institution type labels from schema", len(labels))
            else:
                logger.warning("Schema file not found: %s, using fallback labels", enum_path)
                labels = fallback
        except Exception as e:
            # Deliberately broad: label loading is best-effort and must not
            # take the caller down; the fallback table is always usable.
            logger.error("Error loading institution type labels: %s, using fallback", e)
            labels = fallback

        # Ensure all codes have labels
        for code in self._INSTITUTION_TYPE_CODES:
            if code not in labels:
                labels[code] = fallback.get(code, {"en": code, "nl": code, "de": code})
        return labels

    @classmethod
    def _group_names_by_code(cls, country_code: str, provinces: Dict[str, str]) -> Dict[str, Dict[str, str]]:
        """Invert one file's name -> code mapping into full code -> {language: name}."""
        code_to_names: Dict[str, Dict[str, str]] = {}
        for name, subdivision_code in provinces.items():
            names = code_to_names.setdefault(f"{country_code}-{subdivision_code}", {})
            # Guess the language from the compass-direction prefix.
            if name.startswith(cls._ENGLISH_NAME_PREFIXES):
                names["en"] = name
            elif name.startswith(cls._DUTCH_NAME_PREFIXES):
                names["nl"] = name
            else:
                # No language hint: use for both unless a specific one exists.
                names.setdefault("nl", name)
                names.setdefault("en", name)
        return code_to_names

    def _load_subregion_labels(self) -> Dict[str, Dict[str, str]]:
        """
        Load subregion labels from ISO 3166-2 JSON files.

        Each ``iso_3166_2_<cc>.json`` file is read for a ``provinces``
        mapping of name -> subdivision code. A file that cannot be read or
        parsed is logged and skipped; the remaining files are still loaded.

        Returns dict like:
            {"NL-NH": {"nl": "Noord-Holland", "en": "North Holland"}, ...}
        """
        labels: Dict[str, Dict[str, str]] = {}
        try:
            # Sorted so the result does not depend on filesystem listing order.
            json_files = sorted(self.reference_path.glob("iso_3166_2_*.json"))
        except OSError as e:
            logger.error("Error loading subregion labels: %s", e)
            return labels

        for json_file in json_files:
            country_code = json_file.stem.replace("iso_3166_2_", "").upper()
            try:
                with open(json_file, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers both malformed JSON and bad encoding.
                logger.error("Error loading subregion labels from %s: %s", json_file.name, e)
                continue

            provinces = data.get("provinces", {}) if isinstance(data, dict) else None
            if not isinstance(provinces, dict):
                logger.warning("Unexpected structure in %s, skipping", json_file.name)
                continue

            labels.update(self._group_names_by_code(country_code, provinces))
            logger.debug("Loaded %d subregion labels from %s", len(provinces), json_file.name)

        logger.info("Loaded %d total subregion labels from reference files", len(labels))
        return labels

    def get_institution_type_label(self, code: str, language: str = "en") -> str:
        """
        Get human-readable label for institution type code.

        Falls back to English, then to any available language, and finally
        to ``code`` itself when nothing is known about it.
        """
        return self._pick_label(self.institution_type_labels.get(code, {}), language, code)

    def get_subregion_label(self, code: str, language: str = "en") -> str:
        """
        Get human-readable label for subregion code (e.g. "NL-NH").

        Falls back to English, then to any available language, and finally
        to ``code`` itself when nothing is known about it.
        """
        return self._pick_label(self.subregion_labels.get(code, {}), language, code)

    def get_all_institution_type_labels(self, language: str = "en") -> Dict[str, str]:
        """Get all institution type labels for a language (for template interpolation)."""
        return {
            code: self.get_institution_type_label(code, language)
            for code in self.institution_type_labels
        }

    def get_all_subregion_labels(self, language: str = "en") -> Dict[str, str]:
        """Get all subregion labels for a language (for template interpolation)."""
        return {
            code: self.get_subregion_label(code, language)
            for code in self.subregion_labels
        }
# Module-wide resolver shared by all callers that do not ask for custom paths.
_label_resolver: Optional[SchemaLabelResolver] = None


def get_label_resolver(
    schema_path: Optional[Path] = None,
    reference_path: Optional[Path] = None
) -> SchemaLabelResolver:
    """
    Return the shared label resolver, building it when necessary.

    With no arguments the cached instance is returned (and created on the
    first call). When either path is given, a new resolver is built from
    those paths and it also replaces the cached instance, so later
    argument-less calls receive that new resolver.
    """
    global _label_resolver
    custom_paths_requested = bool(schema_path or reference_path)
    if _label_resolver is not None and not custom_paths_requested:
        return _label_resolver
    _label_resolver = SchemaLabelResolver(schema_path, reference_path)
    return _label_resolver
# Convenience functions for common use cases
def get_institution_type_labels_nl() -> Dict[str, str]:
    """Return a code -> Dutch label mapping covering every institution type."""
    resolver = get_label_resolver()
    return resolver.get_all_institution_type_labels("nl")
def get_institution_type_labels_en() -> Dict[str, str]:
    """Return a code -> English label mapping covering every institution type."""
    resolver = get_label_resolver()
    return resolver.get_all_institution_type_labels("en")
def get_subregion_labels() -> Dict[str, str]:
    """Return a code -> label mapping for all subregions, using Dutch as the default language."""
    resolver = get_label_resolver()
    return resolver.get_all_subregion_labels("nl")