glam/scripts/extract-types-vocab.ts
kempersc ad74d8379e feat(scripts): improve types-vocab extraction to derive all vocabulary from schema
- Remove hardcoded type mappings, derive dynamically from LinkML
- Extract keywords from annotations, structured_aliases, and comments
- Add rename_plural_slot.py utility for schema slot renaming
2026-01-10 15:37:52 +01:00

720 lines
25 KiB
TypeScript

#!/usr/bin/env node
/**
* extract-types-vocab.ts
*
* Extracts vocabulary DYNAMICALLY from LinkML schema files for two-tier semantic routing.
*
* **IMPORTANT**: This script derives ALL vocabulary from the LinkML schema - no hardcoding!
*
* Sources:
* - Base types: schemas/20251121/linkml/modules/classes/*Type.yaml (19 GLAMORCUBESFIXPHDNT types)
* - Subtypes: classes that `is_a` a base type (e.g., MunicipalArchive is_a ArchiveOrganizationType)
* - Keywords: annotations.skos:prefLabel, annotations.skos:altLabel, structured_aliases, keywords, comments
* - RecordSetTypes: *RecordSetTypes.yaml files
*
* Output: apps/archief-assistent/public/types-vocab.json
*
* Usage:
* npx tsx scripts/extract-types-vocab.ts
* npx tsx scripts/extract-types-vocab.ts --skip-embeddings # Skip embedding generation
*
* See: .opencode/rules/ontology-driven-cache-segmentation.md
*/
import { readFileSync, writeFileSync, readdirSync, existsSync, mkdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { parse as parseYaml } from 'yaml';
// ESM compatibility for __dirname (import.meta.url -> filesystem path)
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// ============================================================================
// Configuration
// ============================================================================
// Directory containing the LinkML class module YAML files (schema 20251121).
const SCHEMA_DIR = join(__dirname, '../schemas/20251121/linkml/modules/classes');
// Destination of the generated vocabulary JSON consumed by the web app.
const OUTPUT_FILE = join(__dirname, '../apps/archief-assistent/public/types-vocab.json');
// Local embedding service; used as fallback when no OpenAI key is configured.
const EMBEDDING_API_URL = process.env.EMBEDDING_API_URL || 'http://localhost:8000/api/embed';
// When set, embeddings are generated via the OpenAI API instead of the local service.
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
const OPENAI_EMBEDDING_MODEL = 'text-embedding-3-small';
const OPENAI_EMBEDDING_DIMENSIONS = 1536;
// ============================================================================
// Types
// ============================================================================
/**
 * Reverse-lookup entry: maps a single vocabulary term back to the base type,
 * subtype, or record set type it was extracted from.
 * NOTE: termLog is keyed by term, so a term shared by several classes keeps
 * only the last writer's entry.
 */
interface TermLogEntry {
  /** One-letter GLAMORCUBESFIXPHDNT code (e.g. 'A'). */
  typeCode: string;
  /** Base type class name (e.g. 'ArchiveOrganizationType'). */
  typeName: string;
  /** UPPER_SNAKE_CASE subtype name, when the term came from a subtype. */
  subtypeName?: string;
  /** Original CamelCase subtype class name. */
  subtypeClassName?: string;
  /** Wikidata QID of the source class, if one was found in its mappings. */
  wikidataId?: string;
  /** UPPER_SNAKE_CASE record set type name, when the term came from one. */
  recordSetType?: string;
  /** Language code the term was tagged with or detected as. */
  lang: string;
}
/** Vocabulary data for a single subtype class (a class with `is_a` -> base type). */
interface SubtypeInfo {
  /** Original CamelCase class name from the schema. */
  className: string;
  /** Wikidata QID from exact/broad mappings, if any. */
  wikidataId?: string;
  /** All keywords across languages, deduplicated and space-joined. */
  accumulatedTerms: string;
  /** Language code -> lowercase terms extracted from the class definition. */
  keywords: Record<string, string[]>;
}
/** Vocabulary data for one GLAMORCUBESFIXPHDNT base type. */
interface TypeInfo {
  /** One-letter GLAMORCUBESFIXPHDNT code. */
  code: string;
  /** Base type class name (e.g. 'MuseumType'). */
  className: string;
  /** Wikidata QID of the base class, if any. */
  baseWikidataId?: string;
  /** Space-joined terms; after step 5 this includes all subtype terms too. */
  accumulatedTerms: string;
  /** Language code -> lowercase terms of the base class itself. */
  keywords: Record<string, string[]>;
  /** UPPER_SNAKE_CASE subtype name -> subtype vocabulary data. */
  subtypes: Record<string, SubtypeInfo>;
}
/** Vocabulary data for a record set type (from *RecordSetTypes.yaml files). */
interface RecordSetTypeInfo {
  /** Original CamelCase class name. */
  className: string;
  /** All keywords across languages, deduplicated and space-joined. */
  accumulatedTerms: string;
  /** Language code -> lowercase terms. */
  keywords: Record<string, string[]>;
}
/**
 * Top-level shape of the generated types-vocab.json, used by the app for
 * two-tier semantic routing (tier 1 = base types, tier 2 = subtypes).
 */
interface TypesVocabulary {
  /** ISO timestamp of generation (set from new Date().toISOString()). */
  version: string;
  /** Schema snapshot the vocabulary was derived from (e.g. '20251121'). */
  schemaVersion: string;
  /** Embedding model name (OpenAI model, or local fallback model). */
  embeddingModel: string;
  /** Vector dimensionality matching embeddingModel (1536 or 384). */
  embeddingDimensions: number;
  /** Base type class name -> embedding of its accumulated terms. */
  tier1Embeddings: Record<string, number[]>;
  /** Type code -> subtype name -> embedding of that subtype's terms. */
  tier2Embeddings: Record<string, Record<string, number[]>>;
  /** Term -> provenance entry (last writer wins on duplicate terms). */
  termLog: Record<string, TermLogEntry>;
  /** Type code -> base type vocabulary data. */
  institutionTypes: Record<string, TypeInfo>;
  /** Flat lookup keyed as '<code>.<SUBTYPE_NAME>'. */
  institutionSubtypes: Record<string, SubtypeInfo>;
  /** UPPER_SNAKE_CASE record set type name -> vocabulary data. */
  recordSetTypes: Record<string, RecordSetTypeInfo>;
}
/**
 * A single LinkML class definition as read from a YAML module file.
 * Field names mirror the LinkML slots (snake_case in YAML -> camelCase here).
 */
interface ParsedClass {
  className: string;
  description?: string;
  /** Parent class name from the LinkML `is_a` slot. */
  isA?: string;
  keywords?: string[];
  /** Language-tagged aliases from `structured_aliases`. */
  structuredAliases?: Array<{ literal_form: string; in_language?: string }>;
  /** Annotation map; skos:prefLabel / skos:altLabel are mined for keywords. */
  annotations?: Record<string, string>;
  /** `exact_mappings`; 'wd:'/'wikidata:'-prefixed entries carry Wikidata QIDs. */
  exactMappings?: string[];
  /** `broad_mappings`; secondary source for Wikidata QIDs. */
  broadMappings?: string[];
  /** Free-text comments; "term (xx)" patterns are mined for keywords. */
  comments?: string[];
}
// ============================================================================
// GLAMORCUBESFIXPHDNT Type Discovery
// Dynamically discovers base types from schema files
// ============================================================================
/**
 * Discovers the GLAMORCUBESFIXPHDNT base type files present in the schema
 * directory and maps each class name to its single-letter type code.
 *
 * Base type files match `*Type.yaml` (but NOT `*Types.yaml`). The letter
 * codes are fixed by the taxonomy, so they live in a lookup table here;
 * only types whose YAML file actually exists end up in the returned map.
 *
 * @returns Map of base type class name (e.g. 'ArchiveOrganizationType')
 *          to its one-letter code (e.g. 'A').
 */
function discoverBaseTypes(): Map<string, string> {
  const typeMap = new Map<string, string>();
  // Standard GLAMORCUBESFIXPHDNT mappings; the code is determined by the
  // class's position in the taxonomy.
  const knownMappings: Record<string, string> = {
    'ArchiveOrganizationType': 'A',
    'BioCustodianType': 'B',
    'CommercialOrganizationType': 'C',
    'DigitalPlatformType': 'D',
    'EducationProviderType': 'E',
    'FeatureCustodianType': 'F',
    'GalleryType': 'G',
    'HolySacredSiteType': 'H',
    'IntangibleHeritageGroupType': 'I',
    'LibraryType': 'L',
    'MuseumType': 'M',
    'NonProfitType': 'N',
    'OfficialInstitutionType': 'O',
    'PersonalCollectionType': 'P',
    'ResearchOrganizationType': 'R',
    'HeritageSocietyType': 'S',
    'TasteScentHeritageType': 'T',
    'UnspecifiedType': 'U',
    'MixedCustodianType': 'X',
  };
  // Find all *Type.yaml files (not *Types.yaml)
  const files = readdirSync(SCHEMA_DIR);
  const typeFiles = files.filter(f => f.endsWith('Type.yaml') && !f.endsWith('Types.yaml'));
  for (const file of typeFiles) {
    const typeName = file.replace('.yaml', '');
    if (knownMappings[typeName]) {
      typeMap.set(typeName, knownMappings[typeName]);
    } else {
      // Fix: previously unmapped *Type.yaml files were skipped silently, so a
      // new base type added to the schema would vanish without a trace.
      console.warn(`Warning: ${file} has no known GLAMORCUBESFIXPHDNT code mapping; skipping`);
    }
  }
  return typeMap;
}
// ============================================================================
// YAML Parsing
// ============================================================================
/**
 * Reads and parses one YAML file.
 * Returns null (after logging a warning) when the file is missing or invalid,
 * so callers can simply skip unparseable modules.
 */
function parseYamlFile(filePath: string): Record<string, unknown> | null {
  try {
    return parseYaml(readFileSync(filePath, 'utf-8'));
  } catch (error) {
    console.warn(`Warning: Could not parse ${filePath}: ${error}`);
    return null;
  }
}
/**
 * Converts the `classes` section of a parsed LinkML YAML document into a list
 * of ParsedClass records, mapping snake_case slots to camelCase fields.
 * Entries whose definition is not an object are skipped.
 */
function extractClassesFromYaml(yamlData: Record<string, unknown>): ParsedClass[] {
  const classesSection = yamlData.classes as Record<string, unknown> | undefined;
  if (!classesSection) return [];
  const result: ParsedClass[] = [];
  for (const [className, rawDef] of Object.entries(classesSection)) {
    if (rawDef === null || typeof rawDef !== 'object') continue;
    const def = rawDef as Record<string, unknown>;
    result.push({
      className,
      description: def.description as string | undefined,
      isA: def.is_a as string | undefined,
      keywords: def.keywords as string[] | undefined,
      structuredAliases: def.structured_aliases as Array<{ literal_form: string; in_language?: string }> | undefined,
      annotations: def.annotations as Record<string, string> | undefined,
      exactMappings: def.exact_mappings as string[] | undefined,
      broadMappings: def.broad_mappings as string[] | undefined,
      comments: def.comments as string[] | undefined,
    });
  }
  return result;
}
/**
 * Extracts a Wikidata QID from a class definition's mappings.
 *
 * exact_mappings are preferred over broad_mappings; within each list the
 * first entry prefixed 'wd:' or 'wikidata:' wins.
 *
 * @returns The bare QID (e.g. 'Q166118'), or undefined when none is present.
 */
function extractWikidataId(parsedClass: ParsedClass): string | undefined {
  // Check exact_mappings first (most precise association)
  if (parsedClass.exactMappings) {
    for (const mapping of parsedClass.exactMappings) {
      if (mapping.startsWith('wd:') || mapping.startsWith('wikidata:')) {
        return mapping.replace(/^(wd:|wikidata:)/, '');
      }
    }
  }
  // Check broad_mappings. Fix: accept the same prefixes as exact_mappings —
  // the original only handled 'wd:' here, silently dropping 'wikidata:' IDs.
  if (parsedClass.broadMappings) {
    for (const mapping of parsedClass.broadMappings) {
      if (mapping.startsWith('wd:') || mapping.startsWith('wikidata:')) {
        return mapping.replace(/^(wd:|wikidata:)/, '');
      }
    }
  }
  return undefined;
}
/**
 * Extracts multilingual keywords for a class from every vocabulary source the
 * schema provides:
 * - annotations['skos:prefLabel'] - primary label (optionally '@xx' tagged)
 * - annotations['skos:altLabel']  - comma-separated alternatives
 * - structured_aliases            - language-tagged aliases
 * - keywords                      - plain keyword array (language detected)
 * - comments                      - "term (xx)" patterns
 * - the class name itself         - CamelCase split into an English phrase
 *
 * @returns Map of language code -> lowercase, deduplicated terms.
 */
function extractKeywordsFromClass(parsedClass: ParsedClass): Record<string, string[]> {
  const keywords: Record<string, string[]> = {};
  // 1. Extract from annotations (skos:prefLabel, skos:altLabel)
  if (parsedClass.annotations) {
    const prefLabel = parsedClass.annotations['skos:prefLabel'];
    if (prefLabel) {
      // Could be "Municipal Archive" or "Municipal Archive@en"
      const [text, lang] = parseLanguageTag(prefLabel);
      keywords[lang] = keywords[lang] || [];
      keywords[lang].push(text.toLowerCase());
    }
    const altLabel = parsedClass.annotations['skos:altLabel'];
    if (altLabel) {
      // Comma-separated: "Stadtarchiv, Gemeindearchiv, City Archive"
      const labels = altLabel.split(',').map(s => s.trim());
      for (const label of labels) {
        // Fix: parseLanguageTag() defaults untagged labels to 'en', so the
        // old `lang || detectLanguage(text)` fallback was dead code and every
        // untagged altLabel was filed under 'en'. Match the '@xx' tag
        // explicitly and only run detection when the label is truly untagged.
        const tagged = label.match(/^(.+)@(\w{2})$/);
        const text = tagged ? tagged[1].trim() : label;
        const lang = tagged ? tagged[2] : detectLanguage(text);
        keywords[lang] = keywords[lang] || [];
        keywords[lang].push(text.toLowerCase());
      }
    }
  }
  // 2. Extract from structured_aliases (language-tagged; default 'en')
  if (parsedClass.structuredAliases) {
    for (const alias of parsedClass.structuredAliases) {
      const lang = alias.in_language || 'en';
      keywords[lang] = keywords[lang] || [];
      keywords[lang].push(alias.literal_form.toLowerCase());
    }
  }
  // 3. Extract from keywords array (language heuristically detected)
  if (parsedClass.keywords) {
    for (const kw of parsedClass.keywords) {
      const lang = detectLanguage(kw);
      keywords[lang] = keywords[lang] || [];
      keywords[lang].push(kw.toLowerCase());
    }
  }
  // 4. Extract from comments (often contain "term (lang)" patterns)
  if (parsedClass.comments) {
    for (const comment of parsedClass.comments) {
      // Match patterns like "Stadtarchiv (de)" or "archivo municipal (es)"
      const match = comment.match(/^([^(]+)\s*\((\w{2})\)$/);
      if (match) {
        const [, text, lang] = match;
        keywords[lang] = keywords[lang] || [];
        keywords[lang].push(text.trim().toLowerCase());
      }
    }
  }
  // 5. Convert class name to keywords: MunicipalArchive -> "municipal archive"
  const classNameWords = parsedClass.className
    .replace(/([A-Z])/g, ' $1')
    .trim()
    .toLowerCase();
  keywords['en'] = keywords['en'] || [];
  if (!keywords['en'].includes(classNameWords)) {
    keywords['en'].push(classNameWords);
  }
  // Deduplicate all per-language arrays
  for (const lang of Object.keys(keywords)) {
    keywords[lang] = [...new Set(keywords[lang])];
  }
  return keywords;
}
/**
 * Splits an optional trailing language tag off a label.
 * "Museum@en" -> ["Museum", "en"]; untagged input defaults to English.
 */
function parseLanguageTag(text: string): [string, string] {
  const tagged = /^(.+)@(\w{2})$/.exec(text);
  return tagged ? [tagged[1].trim(), tagged[2]] : [text.trim(), 'en'];
}
/**
 * Heuristic language detection based on characteristic letter patterns and
 * domain words. Checked in a fixed order (nl, de, fr, es); anything that
 * matches nothing is treated as English.
 */
function detectLanguage(text: string): string {
  const lowerText = text.toLowerCase();
  const heuristics: Array<[string, RegExp]> = [
    ['nl', /ij|sch|cht|aa|ee|oo|uu|archief|museum|bibliotheek/i],
    ['de', /archiv(?!e)|bibliothek|museum|ß|ä|ö|ü/i],
    ['fr', /archives|musée|bibliothèque|é|è|ê|ç/i],
    ['es', /archivo|museo|biblioteca|ñ|á|é|í|ó|ú/i],
  ];
  const hit = heuristics.find(([, pattern]) => pattern.test(lowerText));
  return hit ? hit[0] : 'en';
}
/**
 * Flattens a keywords map into one space-separated string of unique terms,
 * preserving first-seen order across languages.
 */
function accumulateTerms(keywords: Record<string, string[]>): string {
  const unique = new Set<string>(Object.values(keywords).flat());
  return [...unique].join(' ');
}
/**
 * Converts a CamelCase class name to UPPER_SNAKE_CASE.
 * Underscores are inserted only at lower->upper boundaries, so
 * MunicipalArchive -> MUNICIPAL_ARCHIVE.
 */
function toUpperSnakeCase(className: string): string {
  const snake = className.replace(/([a-z])([A-Z])/g, '$1_$2');
  return snake.toUpperCase();
}
// ============================================================================
// Subtype Discovery
// Find all classes that inherit from base types
// ============================================================================
/**
 * Discovers all subtype classes that inherit from a base type.
 * Scans every class .yaml file (skipping *Type.yaml / *Types.yaml and
 * RecordSetTypes files) and resolves each class's `is_a` parent against the
 * base-type map.
 *
 * @param baseTypes Map of base type class name -> GLAMORCUBESFIXPHDNT code.
 * @returns Map of subtype class name -> its class name, base type, and code.
 */
function discoverSubtypes(baseTypes: Map<string, string>): Map<string, { className: string; baseType: string; code: string }> {
  const subtypes = new Map<string, { className: string; baseType: string; code: string }>();
  const files = readdirSync(SCHEMA_DIR);
  const yamlFiles = files.filter(f => f.endsWith('.yaml'));
  for (const file of yamlFiles) {
    // Skip *Types.yaml and *Type.yaml files (those are enums/base types)
    if (file.endsWith('Types.yaml') || file.endsWith('Type.yaml')) continue;
    // Skip RecordSetTypes files for now (handled separately)
    if (file.includes('RecordSetTypes')) continue;
    const filePath = join(SCHEMA_DIR, file);
    const yamlData = parseYamlFile(filePath);
    if (!yamlData) continue;
    for (const cls of extractClassesFromYaml(yamlData)) {
      if (!cls.isA) continue;
      // O(1) Map lookup replaces the previous linear scan over all base types.
      const code = baseTypes.get(cls.isA);
      if (code !== undefined) {
        subtypes.set(cls.className, {
          className: cls.className,
          baseType: cls.isA,
          code,
        });
      }
    }
  }
  return subtypes;
}
// ============================================================================
// Embedding Generation
// ============================================================================
async function generateEmbedding(text: string, skipEmbeddings: boolean): Promise<number[]> {
if (skipEmbeddings) {
return [];
}
// Use OpenAI API if key is available
if (OPENAI_API_KEY) {
try {
const response = await fetch('https://api.openai.com/v1/embeddings', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${OPENAI_API_KEY}`,
},
body: JSON.stringify({
input: text,
model: OPENAI_EMBEDDING_MODEL,
}),
});
if (!response.ok) {
const errorBody = await response.text();
console.warn(`OpenAI API error: ${response.status} - ${errorBody}`);
return [];
}
const data = await response.json() as { data: Array<{ embedding: number[] }> };
return data.data?.[0]?.embedding || [];
} catch (error) {
console.warn(`OpenAI embedding generation failed: ${error}`);
return [];
}
}
// Fallback to local embedding API
try {
const response = await fetch(EMBEDDING_API_URL, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ text }),
});
if (!response.ok) {
console.warn(`Embedding API error: ${response.status}`);
return [];
}
const data = await response.json() as { embedding: number[] };
return data.embedding || [];
} catch (error) {
console.warn(`Embedding generation failed: ${error}`);
return [];
}
}
// ============================================================================
// Main Processing
// ============================================================================
/**
 * Builds the full TypesVocabulary by scanning the schema directory.
 *
 * Steps:
 *  1. Discover the GLAMORCUBESFIXPHDNT base types.
 *  2. Parse each base *Type.yaml file for keywords and Wikidata IDs.
 *  3. Discover subtype classes (is_a -> base type) and attach them.
 *  4. Parse *RecordSetTypes.yaml files.
 *  5. Merge subtype terms into each base type's accumulated term string.
 *  6. Generate tier-1 (base type) and tier-2 (subtype) embeddings.
 *
 * Reads `--skip-embeddings` from process.argv to bypass embedding generation.
 *
 * @returns The completed vocabulary, ready to be serialized to JSON.
 */
async function processTypeFiles(): Promise<TypesVocabulary> {
  const skipEmbeddings = process.argv.includes('--skip-embeddings');
  console.log('🔍 Scanning schema directory:', SCHEMA_DIR);
  console.log(`📊 Embedding generation: ${skipEmbeddings ? 'SKIPPED' : 'ENABLED'}`);
  if (!skipEmbeddings && OPENAI_API_KEY) {
    console.log(`📊 Using OpenAI model: ${OPENAI_EMBEDDING_MODEL}`);
  }
  // Model metadata depends on the API key: OpenAI (1536 dims) vs the local
  // fallback model (384 dims).
  const vocabulary: TypesVocabulary = {
    version: new Date().toISOString(),
    schemaVersion: '20251121',
    embeddingModel: OPENAI_API_KEY ? OPENAI_EMBEDDING_MODEL : 'paraphrase-multilingual-MiniLM-L12-v2',
    embeddingDimensions: OPENAI_API_KEY ? OPENAI_EMBEDDING_DIMENSIONS : 384,
    tier1Embeddings: {},
    tier2Embeddings: {},
    termLog: {},
    institutionTypes: {},
    institutionSubtypes: {},
    recordSetTypes: {},
  };
  // Step 1: Discover base types from schema
  console.log('\n📁 Discovering GLAMORCUBESFIXPHDNT base types from schema...');
  const baseTypes = discoverBaseTypes();
  console.log(` Found ${baseTypes.size} base types: ${[...baseTypes.keys()].join(', ')}`);
  // Step 2: Process base Type files — one YAML file per base type, containing
  // the class of the same name.
  console.log('\n📄 Processing base Type files...');
  for (const [typeName, code] of baseTypes.entries()) {
    const filePath = join(SCHEMA_DIR, `${typeName}.yaml`);
    if (!existsSync(filePath)) {
      console.log(` ⚠️ File not found: ${typeName}.yaml`);
      continue;
    }
    const yamlData = parseYamlFile(filePath);
    if (!yamlData) continue;
    const classes = extractClassesFromYaml(yamlData);
    const baseClass = classes.find(c => c.className === typeName);
    if (!baseClass) {
      console.log(` ⚠️ No base class found in ${typeName}.yaml`);
      continue;
    }
    const typeKeywords = extractKeywordsFromClass(baseClass);
    const wikidataId = extractWikidataId(baseClass);
    const typeInfo: TypeInfo = {
      code,
      className: typeName,
      baseWikidataId: wikidataId,
      accumulatedTerms: accumulateTerms(typeKeywords),
      keywords: typeKeywords,
      subtypes: {},
    };
    // Add base type keywords to the term log.
    // NOTE: termLog is keyed by term, so a duplicate term overwrites the
    // earlier entry (last writer wins).
    for (const [lang, terms] of Object.entries(typeKeywords)) {
      for (const term of terms) {
        vocabulary.termLog[term] = {
          typeCode: code,
          typeName,
          wikidataId,
          lang,
        };
      }
    }
    vocabulary.institutionTypes[code] = typeInfo;
    console.log(`${code}: ${typeName} - ${Object.values(typeKeywords).flat().length} terms`);
  }
  // Step 3: Discover and process subtypes (classes with is_a -> base type)
  console.log('\n📂 Discovering subtypes from schema...');
  const subtypeMap = discoverSubtypes(baseTypes);
  console.log(` Found ${subtypeMap.size} subtype classes`);
  for (const [className, { baseType, code }] of subtypeMap.entries()) {
    // Subtype classes are expected in a file named after the class; absent
    // files are skipped silently.
    const filePath = join(SCHEMA_DIR, `${className}.yaml`);
    if (!existsSync(filePath)) continue;
    const yamlData = parseYamlFile(filePath);
    if (!yamlData) continue;
    const classes = extractClassesFromYaml(yamlData);
    const subtypeClass = classes.find(c => c.className === className);
    if (!subtypeClass) continue;
    const subtypeKeywords = extractKeywordsFromClass(subtypeClass);
    const wikidataId = extractWikidataId(subtypeClass);
    const subtypeName = toUpperSnakeCase(className);
    const subtypeInfo: SubtypeInfo = {
      className,
      wikidataId,
      accumulatedTerms: accumulateTerms(subtypeKeywords),
      keywords: subtypeKeywords,
    };
    // Add to parent type's subtypes (parent may be missing if its base file
    // failed to parse in step 2).
    if (vocabulary.institutionTypes[code]) {
      vocabulary.institutionTypes[code].subtypes[subtypeName] = subtypeInfo;
    }
    // Also store in flat institutionSubtypes for quick lookup by 'CODE.NAME'.
    vocabulary.institutionSubtypes[`${code}.${subtypeName}`] = subtypeInfo;
    // Add subtype keywords to term log (may overwrite base-type entries for
    // shared terms).
    for (const [lang, terms] of Object.entries(subtypeKeywords)) {
      for (const term of terms) {
        vocabulary.termLog[term] = {
          typeCode: code,
          typeName: baseType,
          subtypeName,
          subtypeClassName: className,
          wikidataId,
          lang,
        };
      }
    }
  }
  // Count subtypes per type (log only)
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    const subtypeCount = Object.keys(typeInfo.subtypes).length;
    if (subtypeCount > 0) {
      console.log(`${code}: ${typeInfo.className} - ${subtypeCount} subtypes, ${Object.values(typeInfo.subtypes).reduce((sum, s) => sum + Object.values(s.keywords).flat().length, 0)} subtype terms`);
    }
  }
  // Step 4: Process RecordSetTypes files
  console.log('\n📁 Processing RecordSetTypes files...');
  const files = readdirSync(SCHEMA_DIR);
  const recordSetTypesFiles = files.filter(f => f.endsWith('RecordSetTypes.yaml'));
  for (const file of recordSetTypesFiles) {
    const filePath = join(SCHEMA_DIR, file);
    const yamlData = parseYamlFile(filePath);
    if (!yamlData) continue;
    const classes = extractClassesFromYaml(yamlData);
    for (const cls of classes) {
      const rstName = toUpperSnakeCase(cls.className);
      const keywords = extractKeywordsFromClass(cls);
      const rstInfo: RecordSetTypeInfo = {
        className: cls.className,
        accumulatedTerms: accumulateTerms(keywords),
        keywords,
      };
      vocabulary.recordSetTypes[rstName] = rstInfo;
      // Add to term log (associate with Archives primarily — hardcoded 'A')
      for (const [lang, terms] of Object.entries(keywords)) {
        for (const term of terms) {
          vocabulary.termLog[term] = {
            typeCode: 'A',
            typeName: 'ArchiveOrganizationType',
            recordSetType: rstName,
            lang,
          };
        }
      }
    }
  }
  console.log(` ✅ Extracted ${Object.keys(vocabulary.recordSetTypes).length} record set types`);
  // Step 5: Accumulate all terms for each type (base + subtypes), deduplicated
  // at word granularity.
  console.log('\n📊 Accumulating terms per type...');
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    const allTypeTerms: string[] = [];
    allTypeTerms.push(typeInfo.accumulatedTerms);
    for (const subtype of Object.values(typeInfo.subtypes)) {
      allTypeTerms.push(subtype.accumulatedTerms);
    }
    typeInfo.accumulatedTerms = [...new Set(allTypeTerms.join(' ').split(' ').filter(Boolean))].join(' ');
  }
  // Step 6: Generate embeddings (sequential awaits; keeps API request rate low)
  console.log('\n🧮 Generating Tier 1 embeddings (base types)...');
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    const embedding = await generateEmbedding(typeInfo.accumulatedTerms, skipEmbeddings);
    vocabulary.tier1Embeddings[typeInfo.className] = embedding;
    console.log(`${typeInfo.className}: ${embedding.length} dimensions`);
  }
  console.log('\n🧮 Generating Tier 2 embeddings (subtypes)...');
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    vocabulary.tier2Embeddings[code] = {};
    for (const [subtypeName, subtypeInfo] of Object.entries(typeInfo.subtypes)) {
      const embedding = await generateEmbedding(subtypeInfo.accumulatedTerms, skipEmbeddings);
      vocabulary.tier2Embeddings[code][subtypeName] = embedding;
    }
    if (Object.keys(typeInfo.subtypes).length > 0) {
      console.log(`${typeInfo.className}: ${Object.keys(typeInfo.subtypes).length} subtype embeddings`);
    }
  }
  return vocabulary;
}
// ============================================================================
// Main Entry Point
// ============================================================================
/**
 * Entry point: extracts the vocabulary, ensures the output directory exists,
 * writes types-vocab.json, and prints a summary.
 */
async function main() {
  console.log('═══════════════════════════════════════════════════════════════');
  console.log(' TypesVocabulary Extraction Script (Schema-Driven)');
  console.log(' Ontology-Driven Cache Segmentation (Rule 46)');
  console.log('═══════════════════════════════════════════════════════════════\n');
  const vocabulary = await processTypeFiles();
  // Ensure output directory exists
  const outputDir = dirname(OUTPUT_FILE);
  if (!existsSync(outputDir)) {
    mkdirSync(outputDir, { recursive: true });
  }
  // Write output
  writeFileSync(OUTPUT_FILE, JSON.stringify(vocabulary, null, 2));
  console.log('\n═══════════════════════════════════════════════════════════════');
  console.log(' Summary');
  console.log('═══════════════════════════════════════════════════════════════');
  console.log(` 📊 Institution Types: ${Object.keys(vocabulary.institutionTypes).length}`);
  console.log(` 📊 Total Subtypes: ${Object.values(vocabulary.institutionTypes).reduce((sum, t) => sum + Object.keys(t.subtypes).length, 0)}`);
  console.log(` 📊 Record Set Types: ${Object.keys(vocabulary.recordSetTypes).length}`);
  console.log(` 📊 Term Log Entries: ${Object.keys(vocabulary.termLog).length}`);
  console.log(` 📊 Tier 1 Embeddings: ${Object.keys(vocabulary.tier1Embeddings).length}`);
  console.log(` 📊 Tier 2 Embeddings: ${Object.values(vocabulary.tier2Embeddings).reduce((sum, t) => sum + Object.keys(t).length, 0)}`);
  console.log(`\n ✅ Output written to: ${OUTPUT_FILE}`);
  console.log('═══════════════════════════════════════════════════════════════\n');
}
// Fix: `main().catch(console.error)` logged failures but still exited with
// code 0, hiding broken runs from CI. Mark the process as failed instead.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});