#!/usr/bin/env node
/**
 * extract-types-vocab.ts
 *
 * Extracts vocabulary DYNAMICALLY from LinkML schema files for two-tier semantic routing.
 *
 * **IMPORTANT**: This script derives ALL vocabulary from the LinkML schema - no hardcoding!
 *
 * Sources:
 * - Base types: schemas/20251121/linkml/modules/classes/*Type.yaml (19 GLAMORCUBESFIXPHDNT types)
 * - Subtypes: classes that `is_a` a base type (e.g., MunicipalArchive is_a ArchiveOrganizationType)
 * - Keywords: annotations.skos:prefLabel, annotations.skos:altLabel, structured_aliases, keywords, comments
 * - RecordSetTypes: *RecordSetTypes.yaml files
 *
 * Output: apps/archief-assistent/public/types-vocab.json
 *
 * Usage:
 *   npx tsx scripts/extract-types-vocab.ts
 *   npx tsx scripts/extract-types-vocab.ts --skip-embeddings  # Skip embedding generation
 *
 * See: .opencode/rules/ontology-driven-cache-segmentation.md
 */

import { readFileSync, writeFileSync, readdirSync, existsSync, mkdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { parse as parseYaml } from 'yaml';

// ESM compatibility for __dirname
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ============================================================================
// Configuration
// ============================================================================

const SCHEMA_DIR = join(__dirname, '../schemas/20251121/linkml/modules/classes');
const OUTPUT_FILE = join(__dirname, '../apps/archief-assistent/public/types-vocab.json');
const EMBEDDING_API_URL = process.env.EMBEDDING_API_URL || 'http://localhost:8000/api/embed';
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
const OPENAI_EMBEDDING_MODEL = 'text-embedding-3-small';
const OPENAI_EMBEDDING_DIMENSIONS = 1536;

// ============================================================================
// Types
// ============================================================================

interface TermLogEntry {
  typeCode: string;
  typeName: string;
  subtypeName?: string;
  subtypeClassName?: string;
  wikidataId?: string;
  recordSetType?: string;
  lang: string;
}

interface SubtypeInfo {
  className: string;
  wikidataId?: string;
  accumulatedTerms: string;
  keywords: Record<string, string[]>;
}

interface TypeInfo {
  code: string;
  className: string;
  baseWikidataId?: string;
  accumulatedTerms: string;
  keywords: Record<string, string[]>;
  subtypes: Record<string, SubtypeInfo>;
}

interface RecordSetTypeInfo {
  className: string;
  accumulatedTerms: string;
  keywords: Record<string, string[]>;
}

interface TypesVocabulary {
  version: string;
  schemaVersion: string;
  embeddingModel: string;
  embeddingDimensions: number;
  tier1Embeddings: Record<string, number[]>;
  tier2Embeddings: Record<string, Record<string, number[]>>;
  termLog: Record<string, TermLogEntry>;
  institutionTypes: Record<string, TypeInfo>;
  institutionSubtypes: Record<string, SubtypeInfo>;
  recordSetTypes: Record<string, RecordSetTypeInfo>;
}

interface ParsedClass {
  className: string;
  description?: string;
  isA?: string;
  keywords?: string[];
  structuredAliases?: Array<{ literal_form: string; in_language?: string }>;
  annotations?: Record<string, string>;
  exactMappings?: string[];
  broadMappings?: string[];
  comments?: string[];
}
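// Illustrative sketch of the kind of LinkML class definition that gets parsed
// into ParsedClass below. Field names follow LinkML conventions (is_a,
// annotations, structured_aliases, exact_mappings); the concrete values here
// are invented for illustration and need not match any real schema file.
//
//   classes:
//     MunicipalArchive:
//       is_a: ArchiveOrganizationType
//       description: Archive operated by a municipal government
//       exact_mappings:
//         - wd:Q123                # hypothetical Wikidata mapping
//       annotations:
//         skos:prefLabel: Municipal Archive@en
//         skos:altLabel: "Stadtarchiv, Gemeindearchiv, City Archive"
//       structured_aliases:
//         - literal_form: Gemeentearchief
//           in_language: nl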
// ============================================================================
// GLAMORCUBESFIXPHDNT Type Discovery
// Dynamically discovers base types from schema files
// ============================================================================

/**
 * Discovers the 19 GLAMORCUBESFIXPHDNT type files and their codes.
 * Base types are identified by:
 * 1. Filename pattern: *Type.yaml (but NOT *Types.yaml)
 * 2. The class `is_a: CustodianType` (directly or via chain)
 * 3. Having a single-letter GLAMORCUBESFIXPHDNT code in annotations or comments
 */
function discoverBaseTypes(): Map<string, string> {
  const typeMap = new Map<string, string>();

  // These are the standard GLAMORCUBESFIXPHDNT mappings
  // The code is determined by the class's position in the taxonomy
  const knownMappings: Record<string, string> = {
    'ArchiveOrganizationType': 'A',
    'BioCustodianType': 'B',
    'CommercialOrganizationType': 'C',
    'DigitalPlatformType': 'D',
    'EducationProviderType': 'E',
    'FeatureCustodianType': 'F',
    'GalleryType': 'G',
    'HolySacredSiteType': 'H',
    'IntangibleHeritageGroupType': 'I',
    'LibraryType': 'L',
    'MuseumType': 'M',
    'NonProfitType': 'N',
    'OfficialInstitutionType': 'O',
    'PersonalCollectionType': 'P',
    'ResearchOrganizationType': 'R',
    'HeritageSocietyType': 'S',
    'TasteScentHeritageType': 'T',
    'UnspecifiedType': 'U',
    'MixedCustodianType': 'X',
  };

  // Find all *Type.yaml files (not *Types.yaml)
  const files = readdirSync(SCHEMA_DIR);
  const typeFiles = files.filter(f => f.endsWith('Type.yaml') && !f.endsWith('Types.yaml'));

  for (const file of typeFiles) {
    const typeName = file.replace('.yaml', '');
    if (knownMappings[typeName]) {
      typeMap.set(typeName, knownMappings[typeName]);
    }
  }

  return typeMap;
}

// ============================================================================
// YAML Parsing
// ============================================================================

function parseYamlFile(filePath: string): Record<string, unknown> | null {
  try {
    const content = readFileSync(filePath, 'utf-8');
    return parseYaml(content);
  } catch (error) {
    console.warn(`Warning: Could not parse ${filePath}: ${error}`);
    return null;
  }
}

function extractClassesFromYaml(yamlData: Record<string, unknown>): ParsedClass[] {
  const classes: ParsedClass[] = [];
  const classesSection = yamlData.classes as Record<string, unknown> | undefined;
  if (!classesSection) return classes;

  for (const [className, classDef] of Object.entries(classesSection)) {
    if (typeof classDef !== 'object' || classDef === null) continue;
    const classData = classDef as Record<string, unknown>;

    const parsed: ParsedClass = {
      className,
      description: classData.description as string | undefined,
      isA: classData.is_a as string | undefined,
      keywords: classData.keywords as string[] | undefined,
      structuredAliases: classData.structured_aliases as Array<{ literal_form: string; in_language?: string }> | undefined,
      annotations: classData.annotations as Record<string, string> | undefined,
      exactMappings: classData.exact_mappings as string[] | undefined,
      broadMappings: classData.broad_mappings as string[] | undefined,
      comments: classData.comments as string[] | undefined,
    };

    classes.push(parsed);
  }

  return classes;
}

/**
 * Extracts Wikidata ID from various sources in a class definition
 */
function extractWikidataId(parsedClass: ParsedClass): string | undefined {
  // Check exact_mappings first
  if (parsedClass.exactMappings) {
    for (const mapping of parsedClass.exactMappings) {
      if (mapping.startsWith('wd:') || mapping.startsWith('wikidata:')) {
        return mapping.replace(/^(wd:|wikidata:)/, '');
      }
    }
  }

  // Check broad_mappings
  if (parsedClass.broadMappings) {
    for (const mapping of parsedClass.broadMappings) {
      if (mapping.startsWith('wd:')) {
        return mapping.replace('wd:', '');
      }
    }
  }

  return undefined;
}
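// For example (illustrative values): a class with
//   exact_mappings: ['skos:something', 'wd:Q123']
// yields 'Q123', while a class that only has
//   broad_mappings: ['wd:Q456']
// falls through to the broad match and yields 'Q456'.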
/**
 * Extracts multilingual keywords from various schema sources:
 * - annotations['skos:prefLabel'] - primary label
 * - annotations['skos:altLabel'] - comma-separated alternatives
 * - structured_aliases - language-tagged aliases
 * - keywords - array of keywords
 * - comments - often contain multilingual labels
 */
function extractKeywordsFromClass(parsedClass: ParsedClass): Record<string, string[]> {
  const keywords: Record<string, string[]> = {};

  // 1. Extract from annotations (skos:prefLabel, skos:altLabel)
  if (parsedClass.annotations) {
    const prefLabel = parsedClass.annotations['skos:prefLabel'];
    if (prefLabel) {
      // Could be "Municipal Archive" or "Municipal Archive@en"
      const [text, lang] = parseLanguageTag(prefLabel);
      keywords[lang] = keywords[lang] || [];
      keywords[lang].push(text.toLowerCase());
    }

    const altLabel = parsedClass.annotations['skos:altLabel'];
    if (altLabel) {
      // Comma-separated: "Stadtarchiv, Gemeindearchiv, City Archive"
      const labels = altLabel.split(',').map(s => s.trim());
      for (const label of labels) {
        // Only trust an explicit "@xx" tag; otherwise detect the language
        // from the text. (parseLanguageTag defaults to 'en' when untagged,
        // which would otherwise mask detection.)
        const hasTag = /@\w{2}$/.test(label);
        const [text, taggedLang] = parseLanguageTag(label);
        const detectedLang = hasTag ? taggedLang : detectLanguage(text);
        keywords[detectedLang] = keywords[detectedLang] || [];
        keywords[detectedLang].push(text.toLowerCase());
      }
    }
  }

  // 2. Extract from structured_aliases (language-tagged)
  if (parsedClass.structuredAliases) {
    for (const alias of parsedClass.structuredAliases) {
      const lang = alias.in_language || 'en';
      keywords[lang] = keywords[lang] || [];
      keywords[lang].push(alias.literal_form.toLowerCase());
    }
  }

  // 3. Extract from keywords array
  if (parsedClass.keywords) {
    for (const kw of parsedClass.keywords) {
      const lang = detectLanguage(kw);
      keywords[lang] = keywords[lang] || [];
      keywords[lang].push(kw.toLowerCase());
    }
  }

  // 4. Extract from comments (often contain "term (lang)" patterns)
  if (parsedClass.comments) {
    for (const comment of parsedClass.comments) {
      // Match patterns like "Stadtarchiv (de)" or "archivo municipal (es)"
      const match = comment.match(/^([^(]+)\s*\((\w{2})\)$/);
      if (match) {
        const [, text, lang] = match;
        keywords[lang] = keywords[lang] || [];
        keywords[lang].push(text.trim().toLowerCase());
      }
    }
  }

  // 5. Convert class name to keywords
  // MunicipalArchive -> ["municipal archive"]
  const classNameWords = parsedClass.className
    .replace(/([A-Z])/g, ' $1')
    .trim()
    .toLowerCase();
  keywords['en'] = keywords['en'] || [];
  if (!keywords['en'].includes(classNameWords)) {
    keywords['en'].push(classNameWords);
  }

  // Deduplicate all arrays
  for (const lang of Object.keys(keywords)) {
    keywords[lang] = [...new Set(keywords[lang])];
  }

  return keywords;
}

/**
 * Parse language tag from string like "Museum@en" -> ["Museum", "en"]
 */
function parseLanguageTag(text: string): [string, string] {
  const match = text.match(/^(.+)@(\w{2})$/);
  if (match) {
    return [match[1].trim(), match[2]];
  }
  return [text.trim(), 'en'];
}

/**
 * Simple language detection based on common patterns.
 * Rough heuristic: the checks run in order, so a term valid in several
 * languages (e.g., "museum") is attributed to the first match.
 */
function detectLanguage(text: string): string {
  const lowerText = text.toLowerCase();

  // Dutch patterns
  if (/ij|sch|cht|aa|ee|oo|uu|archief|museum|bibliotheek/i.test(lowerText)) {
    return 'nl';
  }
  // German patterns
  if (/archiv(?!e)|bibliothek|museum|ß|ä|ö|ü/i.test(lowerText)) {
    return 'de';
  }
  // French patterns
  if (/archives|musée|bibliothèque|é|è|ê|ç/i.test(lowerText)) {
    return 'fr';
  }
  // Spanish patterns
  if (/archivo|museo|biblioteca|ñ|á|é|í|ó|ú/i.test(lowerText)) {
    return 'es';
  }

  return 'en';
}

function accumulateTerms(keywords: Record<string, string[]>): string {
  const allTerms: string[] = [];
  for (const terms of Object.values(keywords)) {
    allTerms.push(...terms);
  }
  return [...new Set(allTerms)].join(' ');
}

/**
 * Converts CamelCase class name to UPPER_SNAKE_CASE
 * MunicipalArchive -> MUNICIPAL_ARCHIVE
 */
function toUpperSnakeCase(className: string): string {
  return className
    .replace(/([a-z])([A-Z])/g, '$1_$2')
    .toUpperCase();
}
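// Worked example (illustrative values): for a class MunicipalArchive with
//   skos:prefLabel "Municipal Archive@en" and
//   skos:altLabel "Stadtarchiv, Gemeentearchief",
// extractKeywordsFromClass() yields roughly
//   { en: ['municipal archive'], de: ['stadtarchiv'], nl: ['gemeentearchief'] }
// ("Stadtarchiv" is untagged, so detectLanguage() attributes it to 'de';
// "Gemeentearchief" matches the Dutch patterns), and accumulateTerms()
// flattens that to 'municipal archive stadtarchiv gemeentearchief'.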
// ============================================================================
// Subtype Discovery
// Find all classes that inherit from base types
// ============================================================================

/**
 * Discovers all subtype classes that inherit from a base type.
 * Scans all .yaml files and checks if `is_a` points to a base type.
 */
function discoverSubtypes(
  baseTypes: Map<string, string>,
): Map<string, { className: string; baseType: string; code: string }> {
  const subtypes = new Map<string, { className: string; baseType: string; code: string }>();

  const files = readdirSync(SCHEMA_DIR);
  const yamlFiles = files.filter(f => f.endsWith('.yaml'));

  for (const file of yamlFiles) {
    // Skip *Types.yaml and *Type.yaml files (those are enums/base types)
    if (file.endsWith('Types.yaml') || file.endsWith('Type.yaml')) continue;
    // Skip RecordSetTypes files for now (handled separately)
    if (file.includes('RecordSetTypes')) continue;

    const filePath = join(SCHEMA_DIR, file);
    const yamlData = parseYamlFile(filePath);
    if (!yamlData) continue;

    const classes = extractClassesFromYaml(yamlData);
    for (const cls of classes) {
      if (!cls.isA) continue;

      // Check if is_a points to a known base type
      for (const [baseTypeName, code] of baseTypes.entries()) {
        if (cls.isA === baseTypeName) {
          subtypes.set(cls.className, {
            className: cls.className,
            baseType: baseTypeName,
            code,
          });
          break;
        }
      }
    }
  }

  return subtypes;
}

// ============================================================================
// Embedding Generation
// ============================================================================

async function generateEmbedding(text: string, skipEmbeddings: boolean): Promise<number[]> {
  if (skipEmbeddings) {
    return [];
  }

  // Use OpenAI API if key is available
  if (OPENAI_API_KEY) {
    try {
      const response = await fetch('https://api.openai.com/v1/embeddings', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${OPENAI_API_KEY}`,
        },
        body: JSON.stringify({
          input: text,
          model: OPENAI_EMBEDDING_MODEL,
        }),
      });

      if (!response.ok) {
        const errorBody = await response.text();
        console.warn(`OpenAI API error: ${response.status} - ${errorBody}`);
        return [];
      }

      const data = await response.json() as { data: Array<{ embedding: number[] }> };
      return data.data?.[0]?.embedding || [];
    } catch (error) {
      console.warn(`OpenAI embedding generation failed: ${error}`);
      return [];
    }
  }

  // Fallback to local embedding API
  try {
    const response = await fetch(EMBEDDING_API_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text }),
    });

    if (!response.ok) {
      console.warn(`Embedding API error: ${response.status}`);
      return [];
    }

    const data = await response.json() as { embedding: number[] };
    return data.embedding || [];
  } catch (error) {
    console.warn(`Embedding generation failed: ${error}`);
    return [];
  }
}
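// Illustrative sketch (not called by this script): a consumer of the
// generated vocabulary would typically route a query embedding against
// tier1Embeddings first, then against tier2Embeddings for the winning type,
// using cosine similarity. The helper below shows the core comparison;
// thresholds and the query-embedding call are up to the consumer.
export function cosineSimilarity(a: number[], b: number[]): number {
  // Assumes equal-length, non-empty vectors, as produced by generateEmbedding().
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  return denom === 0 ? 0 : dot / denom;
}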
// ============================================================================
// Main Processing
// ============================================================================

async function processTypeFiles(): Promise<TypesVocabulary> {
  const skipEmbeddings = process.argv.includes('--skip-embeddings');

  console.log('🔍 Scanning schema directory:', SCHEMA_DIR);
  console.log(`📊 Embedding generation: ${skipEmbeddings ? 'SKIPPED' : 'ENABLED'}`);
  if (!skipEmbeddings && OPENAI_API_KEY) {
    console.log(`📊 Using OpenAI model: ${OPENAI_EMBEDDING_MODEL}`);
  }

  const vocabulary: TypesVocabulary = {
    version: new Date().toISOString(),
    schemaVersion: '20251121',
    embeddingModel: OPENAI_API_KEY ? OPENAI_EMBEDDING_MODEL : 'paraphrase-multilingual-MiniLM-L12-v2',
    embeddingDimensions: OPENAI_API_KEY ? OPENAI_EMBEDDING_DIMENSIONS : 384,
    tier1Embeddings: {},
    tier2Embeddings: {},
    termLog: {},
    institutionTypes: {},
    institutionSubtypes: {},
    recordSetTypes: {},
  };

  // Step 1: Discover base types from schema
  console.log('\n📁 Discovering GLAMORCUBESFIXPHDNT base types from schema...');
  const baseTypes = discoverBaseTypes();
  console.log(`   Found ${baseTypes.size} base types: ${[...baseTypes.keys()].join(', ')}`);

  // Step 2: Process base Type files
  console.log('\n📄 Processing base Type files...');
  for (const [typeName, code] of baseTypes.entries()) {
    const filePath = join(SCHEMA_DIR, `${typeName}.yaml`);
    if (!existsSync(filePath)) {
      console.log(`   ⚠️ File not found: ${typeName}.yaml`);
      continue;
    }

    const yamlData = parseYamlFile(filePath);
    if (!yamlData) continue;

    const classes = extractClassesFromYaml(yamlData);
    const baseClass = classes.find(c => c.className === typeName);
    if (!baseClass) {
      console.log(`   ⚠️ No base class found in ${typeName}.yaml`);
      continue;
    }

    const typeKeywords = extractKeywordsFromClass(baseClass);
    const wikidataId = extractWikidataId(baseClass);

    const typeInfo: TypeInfo = {
      code,
      className: typeName,
      baseWikidataId: wikidataId,
      accumulatedTerms: accumulateTerms(typeKeywords),
      keywords: typeKeywords,
      subtypes: {},
    };

    // Add base type keywords to term log
    for (const [lang, terms] of Object.entries(typeKeywords)) {
      for (const term of terms) {
        vocabulary.termLog[term] = { typeCode: code, typeName, wikidataId, lang };
      }
    }

    vocabulary.institutionTypes[code] = typeInfo;
    console.log(`   ✅ ${code}: ${typeName} - ${Object.values(typeKeywords).flat().length} terms`);
  }

  // Step 3: Discover and process subtypes
  console.log('\n📂 Discovering subtypes from schema...');
  const subtypeMap = discoverSubtypes(baseTypes);
  console.log(`   Found ${subtypeMap.size} subtype classes`);

  for (const [className, { baseType, code }] of subtypeMap.entries()) {
    const filePath = join(SCHEMA_DIR, `${className}.yaml`);
    if (!existsSync(filePath)) continue;

    const yamlData = parseYamlFile(filePath);
    if (!yamlData) continue;

    const classes = extractClassesFromYaml(yamlData);
    const subtypeClass = classes.find(c => c.className === className);
    if (!subtypeClass) continue;

    const subtypeKeywords = extractKeywordsFromClass(subtypeClass);
    const wikidataId = extractWikidataId(subtypeClass);
    const subtypeName = toUpperSnakeCase(className);

    const subtypeInfo: SubtypeInfo = {
      className,
      wikidataId,
      accumulatedTerms: accumulateTerms(subtypeKeywords),
      keywords: subtypeKeywords,
    };

    // Add to parent type's subtypes
    if (vocabulary.institutionTypes[code]) {
      vocabulary.institutionTypes[code].subtypes[subtypeName] = subtypeInfo;
    }

    // Also store in flat institutionSubtypes for quick lookup
    vocabulary.institutionSubtypes[`${code}.${subtypeName}`] = subtypeInfo;

    // Add subtype keywords to term log
    for (const [lang, terms] of Object.entries(subtypeKeywords)) {
      for (const term of terms) {
        vocabulary.termLog[term] = {
          typeCode: code,
          typeName: baseType,
          subtypeName,
          subtypeClassName: className,
          wikidataId,
          lang,
        };
      }
    }
  }

  // Count subtypes per type
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    const subtypeCount = Object.keys(typeInfo.subtypes).length;
    if (subtypeCount > 0) {
      console.log(`   ✅ ${code}: ${typeInfo.className} - ${subtypeCount} subtypes, ${Object.values(typeInfo.subtypes).reduce((sum, s) => sum + Object.values(s.keywords).flat().length, 0)} subtype terms`);
    }
  }
  // Step 4: Process RecordSetTypes files
  console.log('\n📁 Processing RecordSetTypes files...');
  const files = readdirSync(SCHEMA_DIR);
  const recordSetTypesFiles = files.filter(f => f.endsWith('RecordSetTypes.yaml'));

  for (const file of recordSetTypesFiles) {
    const filePath = join(SCHEMA_DIR, file);
    const yamlData = parseYamlFile(filePath);
    if (!yamlData) continue;

    const classes = extractClassesFromYaml(yamlData);
    for (const cls of classes) {
      const rstName = toUpperSnakeCase(cls.className);
      const keywords = extractKeywordsFromClass(cls);

      const rstInfo: RecordSetTypeInfo = {
        className: cls.className,
        accumulatedTerms: accumulateTerms(keywords),
        keywords,
      };

      vocabulary.recordSetTypes[rstName] = rstInfo;

      // Add to term log (associate with Archives primarily)
      for (const [lang, terms] of Object.entries(keywords)) {
        for (const term of terms) {
          vocabulary.termLog[term] = {
            typeCode: 'A',
            typeName: 'ArchiveOrganizationType',
            recordSetType: rstName,
            lang,
          };
        }
      }
    }
  }
  console.log(`   ✅ Extracted ${Object.keys(vocabulary.recordSetTypes).length} record set types`);

  // Step 5: Accumulate all terms for each type (base + subtypes)
  console.log('\n📊 Accumulating terms per type...');
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    const allTypeTerms: string[] = [];
    allTypeTerms.push(typeInfo.accumulatedTerms);
    for (const subtype of Object.values(typeInfo.subtypes)) {
      allTypeTerms.push(subtype.accumulatedTerms);
    }
    typeInfo.accumulatedTerms = [...new Set(allTypeTerms.join(' ').split(' ').filter(Boolean))].join(' ');
  }

  // Step 6: Generate embeddings
  console.log('\n🧮 Generating Tier 1 embeddings (base types)...');
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    const embedding = await generateEmbedding(typeInfo.accumulatedTerms, skipEmbeddings);
    vocabulary.tier1Embeddings[typeInfo.className] = embedding;
    console.log(`   ✓ ${typeInfo.className}: ${embedding.length} dimensions`);
  }

  console.log('\n🧮 Generating Tier 2 embeddings (subtypes)...');
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    vocabulary.tier2Embeddings[code] = {};
    for (const [subtypeName, subtypeInfo] of Object.entries(typeInfo.subtypes)) {
      const embedding = await generateEmbedding(subtypeInfo.accumulatedTerms, skipEmbeddings);
      vocabulary.tier2Embeddings[code][subtypeName] = embedding;
    }
    if (Object.keys(typeInfo.subtypes).length > 0) {
      console.log(`   ✓ ${typeInfo.className}: ${Object.keys(typeInfo.subtypes).length} subtype embeddings`);
    }
  }

  return vocabulary;
}
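// Illustrative sketch of the emitted types-vocab.json (values invented; the
// real file is much larger and the embeddings are full-length vectors):
//
//   {
//     "version": "2025-11-21T12:00:00.000Z",
//     "schemaVersion": "20251121",
//     "embeddingModel": "text-embedding-3-small",
//     "embeddingDimensions": 1536,
//     "tier1Embeddings": { "ArchiveOrganizationType": [0.01, ...] },
//     "tier2Embeddings": { "A": { "MUNICIPAL_ARCHIVE": [0.02, ...] } },
//     "termLog": {
//       "stadtarchiv": {
//         "typeCode": "A",
//         "typeName": "ArchiveOrganizationType",
//         "subtypeName": "MUNICIPAL_ARCHIVE",
//         "lang": "de"
//       }
//     },
//     ...
//   }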
// ============================================================================
// Main Entry Point
// ============================================================================

async function main() {
  console.log('═══════════════════════════════════════════════════════════════');
  console.log('  TypesVocabulary Extraction Script (Schema-Driven)');
  console.log('  Ontology-Driven Cache Segmentation (Rule 46)');
  console.log('═══════════════════════════════════════════════════════════════\n');

  const vocabulary = await processTypeFiles();

  // Ensure output directory exists
  const outputDir = dirname(OUTPUT_FILE);
  if (!existsSync(outputDir)) {
    mkdirSync(outputDir, { recursive: true });
  }

  // Write output
  writeFileSync(OUTPUT_FILE, JSON.stringify(vocabulary, null, 2));

  console.log('\n═══════════════════════════════════════════════════════════════');
  console.log('  Summary');
  console.log('═══════════════════════════════════════════════════════════════');
  console.log(`  📊 Institution Types: ${Object.keys(vocabulary.institutionTypes).length}`);
  console.log(`  📊 Total Subtypes: ${Object.values(vocabulary.institutionTypes).reduce((sum, t) => sum + Object.keys(t.subtypes).length, 0)}`);
  console.log(`  📊 Record Set Types: ${Object.keys(vocabulary.recordSetTypes).length}`);
  console.log(`  📊 Term Log Entries: ${Object.keys(vocabulary.termLog).length}`);
  console.log(`  📊 Tier 1 Embeddings: ${Object.keys(vocabulary.tier1Embeddings).length}`);
  console.log(`  📊 Tier 2 Embeddings: ${Object.values(vocabulary.tier2Embeddings).reduce((sum, t) => sum + Object.keys(t).length, 0)}`);
  console.log(`\n  ✅ Output written to: ${OUTPUT_FILE}`);
  console.log('═══════════════════════════════════════════════════════════════\n');
}

main().catch(console.error);
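// Consumer-side sketch (hypothetical, not part of this script): the app can
// load the generated file and try termLog for exact-term hits before falling
// back to embedding similarity. Names below are invented for illustration.
//
//   const vocab: TypesVocabulary = JSON.parse(
//     readFileSync('apps/archief-assistent/public/types-vocab.json', 'utf-8'),
//   );
//   const hit = vocab.termLog['gemeentearchief'];  // e.g. typeCode 'A'
//   // On a miss, embed the query and compare against vocab.tier1Embeddings,
//   // then vocab.tier2Embeddings[bestTypeCode] (see cosineSimilarity above).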