#!/usr/bin/env node
/**
 * extract-types-vocab.ts
 *
 * Extracts vocabulary from LinkML *Type.yaml and *Types.yaml schema files
 * and generates embeddings for two-tier semantic routing.
 *
 * Output: apps/archief-assistent/public/types-vocab.json
 *
 * Usage:
 *   npx tsx scripts/extract-types-vocab.ts
 *   npx tsx scripts/extract-types-vocab.ts --skip-embeddings  # Skip embedding generation
 *
 * See: .opencode/rules/ontology-driven-cache-segmentation.md
 */

import { readFileSync, writeFileSync, readdirSync, existsSync, mkdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { parse as parseYaml } from 'yaml';

// ESM compatibility for __dirname
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ============================================================================
// Configuration
// ============================================================================

/** Directory holding the LinkML class modules that are scanned for *Type(s).yaml files. */
const SCHEMA_DIR = join(__dirname, '../schemas/20251121/linkml/modules/classes');

/** Destination of the generated vocabulary JSON. */
const OUTPUT_FILE = join(__dirname, '../apps/archief-assistent/public/types-vocab.json');

/** Embedding endpoint; overridable through the EMBEDDING_API_URL environment variable. */
const EMBEDDING_API_URL = process.env.EMBEDDING_API_URL || 'http://localhost:8000/api/embed';

/** GLAMORCUBESFIXPHDNT code mapping: schema file / class name -> single-letter type code. */
const TYPE_FILE_TO_CODE: Record<string, string> = {
  'ArchiveOrganizationType': 'A',
  'BioCustodianType': 'B',
  'CommercialOrganizationType': 'C',
  'DigitalPlatformType': 'D',
  'EducationProviderType': 'E',
  'FeatureCustodianType': 'F',
  'GalleryType': 'G',
  'HolySacredSiteType': 'H',
  'IntangibleHeritageGroupType': 'I',
  'LibraryType': 'L',
  'MuseumType': 'M',
  'NonProfitType': 'N',
  'OfficialInstitutionType': 'O',
  'PersonalCollectionType': 'P',
  'ResearchOrganizationType': 'R',
  'HeritageSocietyType': 'S',
  'TasteScentHeritageType': 'T',
  'UnspecifiedType': 'U',
  'MixedCustodianType': 'X',
};

/**
 * Base vocabulary for the 19 GLAMORCUBESFIXPHDNT types.
 * These are common terms users search for that aren't in the LinkML schema keywords.
 *
 * Keys are either a bare type code (`'M'`) or `CODE.SUBTYPE_NAME` (`'M.ART_MUSEUM'`);
 * values map a language code to the search terms in that language.
 */
const BASE_TYPE_KEYWORDS: Record<string, Record<string, string[]>> = {
  'A': {
    nl: ['archief', 'archieven', 'archivaris', 'archiveren', 'archiefstuk', 'archiefstukken'],
    en: ['archive', 'archives', 'archivist', 'archival'],
    de: ['archiv', 'archive'],
    fr: ['archives', 'archiviste'],
  },
  // Archive subtypes - commonly searched terms for specific archive categories
  'A.MUNICIPAL_ARCHIVE': {
    nl: ['gemeentearchief', 'gemeentearchieven', 'stadsarchief', 'stadsarchieven'],
    en: ['municipal archive', 'municipal archives', 'city archive', 'city archives'],
    de: ['stadtarchiv', 'kommunalarchiv'],
    fr: ['archives municipales', 'archives communales'],
  },
  'A.REGIONAL_ARCHIVE': {
    nl: ['regionaal archief', 'regionale archieven', 'streekarchief', 'streekarchieven', 'provinciaal archief'],
    en: ['regional archive', 'regional archives', 'provincial archive'],
    de: ['regionalarchiv', 'landesarchiv'],
    fr: ['archives régionales', 'archives départementales'],
  },
  'A.NATIONAL_ARCHIVE': {
    nl: ['nationaal archief', 'rijksarchief', 'nationale archieven'],
    en: ['national archive', 'national archives', 'state archive'],
    de: ['nationalarchiv', 'bundesarchiv'],
    fr: ['archives nationales'],
  },
  'A.CHURCH_ARCHIVE': {
    nl: ['kerkarchief', 'kerkarchieven', 'parochiearchief', 'kerkelijke archieven'],
    en: ['church archive', 'church archives', 'parish archive', 'ecclesiastical archive'],
    de: ['kirchenarchiv', 'pfarrarchiv'],
    fr: ['archives paroissiales', 'archives ecclésiastiques'],
  },
  'A.BUSINESS_ARCHIVE': {
    nl: ['bedrijfsarchief', 'bedrijfsarchieven', 'ondernemingsarchief'],
    en: ['business archive', 'business archives', 'corporate archive', 'company archive'],
    de: ['unternehmensarchiv', 'firmenarchiv', 'wirtschaftsarchiv'],
    fr: ['archives d\'entreprise', 'archives économiques'],
  },
  'A.UNIVERSITY_ARCHIVE': {
    nl: ['universiteitsarchief', 'universitaire archieven', 'academisch archief'],
    en: ['university archive', 'university archives', 'academic archive'],
    de: ['universitätsarchiv', 'hochschularchiv'],
    fr: ['archives universitaires'],
  },
  'A.FAMILY_ARCHIVE': {
    nl: ['familiearchief', 'familiearchieven', 'huisarchief'],
    en: ['family archive', 'family archives', 'house archive', 'estate archive'],
    de: ['familienarchiv', 'hausarchiv'],
    fr: ['archives familiales'],
  },
  'A.NOTARIAL_ARCHIVE': {
    nl: ['notarieel archief', 'notariële archieven', 'notarisarchief'],
    en: ['notarial archive', 'notarial archives', 'notary archive'],
    de: ['notariatsarchiv'],
    fr: ['archives notariales'],
  },
  'B': {
    nl: ['dierentuin', 'dierentuinen', 'zoo', 'botanische tuin', 'arboretum', 'aquarium'],
    en: ['zoo', 'zoos', 'botanical garden', 'botanical gardens', 'arboretum', 'aquarium'],
    de: ['zoo', 'botanischer garten'],
    fr: ['zoo', 'jardin botanique'],
  },
  'C': {
    nl: ['bedrijfsarchief', 'bedrijfsarchieven', 'bedrijfscollectie', 'ondernemingsarchief'],
    en: ['corporate archive', 'corporate archives', 'business archive', 'company archive'],
    de: ['unternehmensarchiv', 'firmenarchiv'],
    fr: ['archives d\'entreprise'],
  },
  'D': {
    nl: ['digitaal platform', 'digitale platformen', 'online archief', 'digitale bibliotheek'],
    en: ['digital platform', 'digital platforms', 'online archive', 'digital library'],
    de: ['digitale plattform', 'online-archiv'],
    fr: ['plateforme numérique', 'archives numériques'],
  },
  'E': {
    nl: ['universiteit', 'universiteiten', 'hogeschool', 'hogescholen', 'onderwijsinstelling', 'school', 'scholen'],
    en: ['university', 'universities', 'college', 'colleges', 'school', 'schools', 'educational institution'],
    de: ['universität', 'hochschule', 'schule'],
    fr: ['université', 'école', 'établissement scolaire'],
  },
  'F': {
    nl: ['monument', 'monumenten', 'standbeeld', 'standbeelden', 'gedenkteken', 'begraafplaats', 'begraafplaatsen'],
    en: ['monument', 'monuments', 'statue', 'statues', 'memorial', 'cemetery', 'cemeteries'],
    de: ['denkmal', 'denkmäler', 'statue', 'friedhof'],
    fr: ['monument', 'monuments', 'statue', 'cimetière'],
  },
  'G': {
    nl: ['galerie', 'galerij', 'galerijen', 'kunstgalerie', 'kunstgalerij', 'kunsthal'],
    en: ['gallery', 'galleries', 'art gallery', 'art galleries', 'kunsthalle'],
    de: ['galerie', 'galerien', 'kunstgalerie', 'kunsthalle'],
    fr: ['galerie', 'galeries', 'galerie d\'art'],
  },
  'H': {
    nl: ['kerk', 'kerken', 'kathedraal', 'kapel', 'moskee', 'synagoge', 'tempel', 'klooster', 'abdij'],
    en: ['church', 'churches', 'cathedral', 'chapel', 'mosque', 'synagogue', 'temple', 'monastery', 'abbey'],
    de: ['kirche', 'kathedrale', 'kapelle', 'moschee', 'synagoge', 'tempel', 'kloster', 'abtei'],
    fr: ['église', 'cathédrale', 'chapelle', 'mosquée', 'synagogue', 'temple', 'monastère', 'abbaye'],
  },
  'I': {
    nl: ['immaterieel erfgoed', 'tradities', 'folklore', 'volkscultuur'],
    en: ['intangible heritage', 'traditions', 'folklore', 'oral history'],
    de: ['immaterielles erbe', 'traditionen', 'folklore'],
    fr: ['patrimoine immatériel', 'traditions', 'folklore'],
  },
  'L': {
    nl: ['bibliotheek', 'bibliotheken', 'bieb', 'boekerij', 'mediatheek', 'leeszaal'],
    en: ['library', 'libraries', 'public library', 'reading room'],
    de: ['bibliothek', 'bibliotheken', 'bücherei'],
    fr: ['bibliothèque', 'bibliothèques', 'médiathèque'],
  },
  // Library subtypes - commonly searched terms for specific library categories
  'L.PUBLIC_LIBRARY': {
    nl: ['openbare bibliotheek', 'openbare bibliotheken', 'stadsbibliotheek', 'gemeentebibliotheek'],
    en: ['public library', 'public libraries', 'city library'],
    de: ['öffentliche bibliothek', 'stadtbibliothek', 'stadtbücherei'],
    fr: ['bibliothèque publique', 'bibliothèque municipale'],
  },
  'L.ACADEMIC_LIBRARY': {
    nl: ['universiteitsbibliotheek', 'wetenschappelijke bibliotheek', 'academische bibliotheek', 'hogeschoolbibliotheek'],
    en: ['academic library', 'university library', 'research library'],
    de: ['universitätsbibliothek', 'wissenschaftliche bibliothek', 'hochschulbibliothek'],
    fr: ['bibliothèque universitaire', 'bibliothèque académique'],
  },
  'L.NATIONAL_LIBRARY': {
    nl: ['nationale bibliotheek', 'koninklijke bibliotheek', 'kb'],
    en: ['national library', 'royal library'],
    de: ['nationalbibliothek', 'staatsbibliothek'],
    fr: ['bibliothèque nationale'],
  },
  'L.SPECIAL_LIBRARY': {
    nl: ['speciale bibliotheek', 'vakbibliotheek', 'gespecialiseerde bibliotheek'],
    en: ['special library', 'specialized library', 'subject library'],
    de: ['spezialbibliothek', 'fachbibliothek'],
    fr: ['bibliothèque spécialisée'],
  },
  'L.SCHOOL_LIBRARY': {
    nl: ['schoolbibliotheek', 'schoolbibliotheken', 'schoolmediatheek'],
    en: ['school library', 'school libraries'],
    de: ['schulbibliothek', 'schulbücherei'],
    fr: ['bibliothèque scolaire'],
  },
  'L.CHILDRENS_LIBRARY': {
    nl: ['jeugdbibliotheek', 'kinderbibliotheek', 'jeugdafdeling'],
    en: ['children\'s library', 'youth library', 'kids library'],
    de: ['kinderbibliothek', 'jugendbibliothek'],
    fr: ['bibliothèque jeunesse', 'bibliothèque pour enfants'],
  },
  'L.THEOLOGICAL_LIBRARY': {
    nl: ['theologische bibliotheek', 'kerkelijke bibliotheek', 'kloosterbibliotheek'],
    en: ['theological library', 'religious library', 'monastery library'],
    de: ['theologische bibliothek', 'klosterbibliothek'],
    fr: ['bibliothèque théologique', 'bibliothèque monastique'],
  },
  'L.MUSIC_LIBRARY': {
    nl: ['muziekbibliotheek', 'muziekcollectie'],
    en: ['music library', 'music collection'],
    de: ['musikbibliothek'],
    fr: ['bibliothèque musicale', 'médiathèque musicale'],
  },
  'M': {
    nl: ['museum', 'musea', 'museums', 'tentoonstellingsruimte', 'expositieruimte'],
    en: ['museum', 'museums', 'exhibition space'],
    de: ['museum', 'museen'],
    fr: ['musée', 'musées'],
  },
  // Museum subtypes - commonly searched terms that map to specific museum categories
  'M.ART_MUSEUM': {
    nl: ['kunstmuseum', 'kunstmusea', 'kunstcollectie'],
    en: ['art museum', 'art museums', 'fine art museum'],
    de: ['kunstmuseum', 'kunstmuseen'],
    fr: ['musée d\'art', 'musée des beaux-arts'],
  },
  'M.NATURAL_HISTORY_MUSEUM': {
    nl: ['natuurhistorisch museum', 'natuurmuseum', 'natuurmusea'],
    en: ['natural history museum', 'natural history museums'],
    de: ['naturkundemuseum', 'naturhistorisches museum'],
    fr: ['musée d\'histoire naturelle'],
  },
  'M.SCIENCE_MUSEUM': {
    nl: ['wetenschapsmuseum', 'techniekmuseum', 'wetenschapsmusea'],
    en: ['science museum', 'science museums', 'technology museum'],
    de: ['wissenschaftsmuseum', 'technikmuseum'],
    fr: ['musée des sciences', 'musée de la technique'],
  },
  'M.HISTORY_MUSEUM': {
    nl: ['historisch museum', 'geschiedenismuseum', 'historische musea'],
    en: ['history museum', 'history museums', 'historical museum'],
    de: ['geschichtsmuseum', 'historisches museum'],
    fr: ['musée d\'histoire', 'musée historique'],
  },
  'M.OPEN_AIR_MUSEUM': {
    nl: ['openluchtmuseum', 'openluchtmusea', 'freilichtmuseum'],
    en: ['open-air museum', 'open air museum', 'outdoor museum'],
    de: ['freilichtmuseum', 'freiluftmuseum'],
    fr: ['musée de plein air', 'écomusée'],
  },
  'M.ARCHAEOLOGICAL_MUSEUM': {
    nl: ['archeologisch museum', 'oudheidkundig museum', 'archeologische musea'],
    en: ['archaeological museum', 'archaeology museum'],
    de: ['archäologisches museum'],
    fr: ['musée archéologique'],
  },
  'M.MARITIME_MUSEUM': {
    nl: ['maritiem museum', 'scheepvaartmuseum', 'maritieme musea'],
    en: ['maritime museum', 'naval museum', 'shipping museum'],
    de: ['schifffahrtsmuseum', 'maritimes museum'],
    fr: ['musée maritime', 'musée de la marine'],
  },
  'M.MILITARY_MUSEUM': {
    nl: ['militair museum', 'legermuseum', 'oorlogsmuseum', 'militaire musea'],
    en: ['military museum', 'war museum', 'army museum'],
    de: ['militärmuseum', 'heeresmuseum', 'kriegsmuseum'],
    fr: ['musée militaire', 'musée de l\'armée'],
  },
  'M.ETHNOGRAPHIC_MUSEUM': {
    nl: ['volkenkundig museum', 'etnografisch museum', 'volkenkundige musea'],
    en: ['ethnographic museum', 'ethnography museum', 'anthropology museum'],
    de: ['völkerkundemuseum', 'ethnologisches museum'],
    fr: ['musée ethnographique', 'musée d\'ethnologie'],
  },
  'M.FOLK_MUSEUM': {
    nl: ['volkskundemuseum', 'heemkundig museum', 'folkloristisch museum'],
    en: ['folk museum', 'folklore museum', 'heritage museum'],
    de: ['volkskundemuseum', 'heimatmuseum'],
    fr: ['musée du folklore', 'musée des traditions populaires'],
  },
  'M.LOCAL_HISTORY_MUSEUM': {
    nl: ['streekmuseum', 'stadsmuseum', 'gemeentemuseum', 'lokaal museum'],
    en: ['local history museum', 'city museum', 'regional museum'],
    de: ['heimatmuseum', 'stadtmuseum', 'regionalmuseum'],
    fr: ['musée local', 'musée régional', 'musée de la ville'],
  },
  'M.TRANSPORT_MUSEUM': {
    nl: ['vervoermuseum', 'spoorwegmuseum', 'automuseum', 'trammuseum'],
    en: ['transport museum', 'railway museum', 'automobile museum', 'car museum'],
    de: ['verkehrsmuseum', 'eisenbahnmuseum', 'automuseum'],
    fr: ['musée des transports', 'musée du chemin de fer', 'musée de l\'automobile'],
  },
  'M.CHILDRENS_MUSEUM': {
    nl: ['kindermuseum', 'kindermusea'],
    en: ['children\'s museum', 'kids museum'],
    de: ['kindermuseum'],
    fr: ['musée pour enfants'],
  },
  'N': {
    nl: ['stichting', 'stichtingen', 'ngo', 'non-profit', 'goed doel'],
    en: ['foundation', 'foundations', 'ngo', 'non-profit', 'nonprofit', 'charity'],
    de: ['stiftung', 'stiftungen', 'ngo'],
    fr: ['fondation', 'fondations', 'ong'],
  },
  'O': {
    // Note: Archive-specific terms (gemeentearchief, nationaal archief, etc.) are now mapped to
    // archive subtypes (A.MUNICIPAL_ARCHIVE, A.NATIONAL_ARCHIVE, etc.) for more precise cache segmentation
    nl: ['overheidsinstelling', 'rijksdienst', 'erfgoeddienst', 'erfgoedinstelling', 'rijksoverheid'],
    en: ['government institution', 'heritage agency', 'heritage service'],
    de: ['staatliche einrichtung', 'denkmalamt', 'kulturamt'],
    fr: ['institution gouvernementale', 'service du patrimoine'],
  },
  'P': {
    nl: ['privécollectie', 'privéverzameling', 'particuliere collectie', 'verzamelaar'],
    en: ['private collection', 'personal collection', 'collector', 'private archive'],
    de: ['privatsammlung', 'privatarchiv', 'sammler'],
    fr: ['collection privée', 'collection personnelle', 'collectionneur'],
  },
  'R': {
    nl: ['onderzoeksinstituut', 'kenniscentrum', 'documentatiecentrum', 'studiecentrum'],
    en: ['research institute', 'research center', 'knowledge center', 'documentation center', 'study center'],
    de: ['forschungsinstitut', 'forschungszentrum', 'dokumentationszentrum'],
    fr: ['institut de recherche', 'centre de recherche', 'centre de documentation'],
  },
  'S': {
    nl: ['historische vereniging', 'heemkundige kring', 'oudheidkundige vereniging', 'genootschap', 'erfgoedvereniging'],
    en: ['historical society', 'heritage society', 'antiquarian society', 'local history society'],
    de: ['geschichtsverein', 'heimatverein', 'altertumsverein'],
    fr: ['société historique', 'société d\'histoire', 'association patrimoniale'],
  },
  'T': {
    nl: ['culinair erfgoed', 'gastronomisch erfgoed', 'parfumerie', 'distilleerderij', 'brouwerij'],
    en: ['culinary heritage', 'gastronomic heritage', 'perfumery', 'distillery', 'brewery', 'taste heritage'],
    de: ['kulinarisches erbe', 'gastronomisches erbe', 'parfümerie', 'destillerie', 'brauerei'],
    fr: ['patrimoine culinaire', 'patrimoine gastronomique', 'parfumerie', 'distillerie', 'brasserie'],
  },
  'U': {
    nl: ['onbekend', 'niet gespecificeerd'],
    en: ['unknown', 'unspecified', 'unclassified'],
    de: ['unbekannt', 'nicht spezifiziert'],
    fr: ['inconnu', 'non spécifié'],
  },
  'X': {
    nl: ['gemengd', 'gecombineerd', 'museum en archief', 'archief en bibliotheek'],
    en: ['mixed', 'combined', 'museum and archive', 'archive and library'],
    de: ['gemischt', 'kombiniert'],
    fr: ['mixte', 'combiné'],
  },
};

// ============================================================================
// Types
// ============================================================================

/** Where a single vocabulary term came from: its type, optional subtype/record set type, and language. */
interface TermLogEntry {
  typeCode: string;
  typeName: string;
  subtypeName?: string;
  recordSetType?: string;
  wikidata?: string;
  lang: string;
}

/** Vocabulary extracted for one subtype class of an institution type. */
interface SubtypeInfo {
  className: string;
  wikidata?: string;
  /** All distinct terms of the subtype joined with single spaces (embedding input). */
  accumulatedTerms: string;
  /** Terms grouped by language code. */
  keywords: Record<string, string[]>;
}

/** Vocabulary extracted for one institution type, including its subtypes. */
interface TypeInfo {
  code: string;
  className: string;
  baseWikidata?: string;
  /** Distinct words of the base type and all of its subtypes, space separated (embedding input). */
  accumulatedTerms: string;
  /** Terms grouped by language code. */
  keywords: Record<string, string[]>;
  /** Subtypes keyed by UPPER_SNAKE_CASE subtype name. */
  subtypes: Record<string, SubtypeInfo>;
}

/** Vocabulary extracted for one record set type class. */
interface RecordSetTypeInfo {
  className: string;
  accumulatedTerms: string;
  keywords: Record<string, string[]>;
}

/** Shape of the generated types-vocab.json document. */
interface TypesVocabulary {
  version: string;
  schemaVersion: string;
  embeddingModel: string;
  embeddingDimensions: number;
  /** Embedding per institution type, keyed by class name. */
  tier1Embeddings: Record<string, number[]>;
  /** Embedding per subtype, keyed by type code and then by subtype name. */
  tier2Embeddings: Record<string, Record<string, number[]>>;
  /** Lookup from a term to the type information it was extracted from. */
  termLog: Record<string, TermLogEntry>;
  /** Institution types keyed by type code. */
  institutionTypes: Record<string, TypeInfo>;
  /** Record set types keyed by UPPER_SNAKE_CASE name. */
  recordSetTypes: Record<string, RecordSetTypeInfo>;
}

/** The subset of a LinkML class definition that this script reads. */
interface ParsedClass {
  className: string;
  description?: string;
  keywords?: string[];
  structuredAliases?: Array<{ literal_form: string; in_language?: string }>;
  wikidataEntity?: string;
  isSubtypeOf?: string;
}

// ============================================================================
// YAML Parsing
// ============================================================================

/** Narrows an unknown value to a plain key/value mapping (arrays and null are rejected). */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Returns the string members of `value` when it is an array, otherwise `undefined`. */
function asStringArray(value: unknown): string[] | undefined {
  if (!Array.isArray(value)) return undefined;
  return value.filter((item): item is string => typeof item === 'string');
}

/** Returns `value` when it is a string, otherwise `undefined`. */
function asString(value: unknown): string | undefined {
  return typeof value === 'string' ? value : undefined;
}

/** Keeps only the alias entries that actually carry a string `literal_form`. */
function parseStructuredAliases(value: unknown): ParsedClass['structuredAliases'] {
  if (!Array.isArray(value)) return undefined;

  const aliases: Array<{ literal_form: string; in_language?: string }> = [];
  for (const item of value) {
    if (!isRecord(item) || typeof item.literal_form !== 'string') continue;
    const alias: { literal_form: string; in_language?: string } = { literal_form: item.literal_form };
    if (typeof item.in_language === 'string') alias.in_language = item.in_language;
    aliases.push(alias);
  }
  return aliases;
}

/** CURIE prefixes that mark a mapping as a Wikidata entity. */
const WIKIDATA_PREFIX = /^(wd:|wikidata:)/;

/** Returns the id of the first Wikidata mapping in `mappings` with its prefix removed. */
function findWikidataId(mappings: unknown): string | undefined {
  const mapping = asStringArray(mappings)?.find((m) => WIKIDATA_PREFIX.test(m));
  return mapping?.replace(WIKIDATA_PREFIX, '');
}

/**
 * Reads and parses a YAML file.
 *
 * @param filePath - Path of the YAML file to read.
 * @returns The parsed top-level mapping, or `null` when the file cannot be read or
 *          parsed, is empty, or does not contain a mapping at the top level.
 */
function parseYamlFile(filePath: string): Record<string, unknown> | null {
  try {
    const parsed: unknown = parseYaml(readFileSync(filePath, 'utf-8'));
    if (isRecord(parsed)) return parsed;
    if (parsed != null) {
      console.warn(`Warning: ${filePath} does not contain a YAML mapping`);
    }
    return null;
  } catch (error) {
    console.warn(`Warning: Could not parse ${filePath}: ${error}`);
    return null;
  }
}

/**
 * Extracts the class definitions from the `classes` section of a parsed schema.
 *
 * Abstract classes are skipped unless their name ends in `Type`. The Wikidata
 * entity is resolved with this precedence: `exact_mappings`, then
 * `slot_usage.wikidata_entity.equals_string`, then `broad_mappings`.
 *
 * @param yamlData - Parsed top-level mapping of a schema file.
 * @returns One entry per usable class; empty when there is no `classes` mapping.
 */
function extractClassesFromYaml(yamlData: Record<string, unknown>): ParsedClass[] {
  const classesSection = yamlData.classes;
  if (!isRecord(classesSection)) return [];

  const classes: ParsedClass[] = [];
  for (const [className, classData] of Object.entries(classesSection)) {
    if (!isRecord(classData)) continue;

    // Skip abstract base classes (except the main Type class)
    if (classData.abstract === true && !className.endsWith('Type')) continue;

    const slotUsage = classData.slot_usage;
    const wikidataSlot = isRecord(slotUsage) ? slotUsage.wikidata_entity : undefined;
    const fromSlotUsage = isRecord(wikidataSlot) ? asString(wikidataSlot.equals_string) : undefined;

    classes.push({
      className,
      description: asString(classData.description),
      keywords: asStringArray(classData.keywords),
      structuredAliases: parseStructuredAliases(classData.structured_aliases),
      isSubtypeOf: asString(classData.is_a),
      wikidataEntity:
        findWikidataId(classData.exact_mappings) ||
        fromSlotUsage ||
        findWikidataId(classData.broad_mappings),
    });
  }
  return classes;
}

/**
 * Rough Dutch detector for untagged keywords: the `ij` digraph, `sch`/`cht`
 * clusters and doubled vowels. It is a heuristic and will also match some
 * English words (e.g. "school").
 */
const DUTCH_PATTERN = /ij|sch|cht|aa|ee|oo|uu/i;

/**
 * Collects the lower-cased search terms of a class, grouped by language.
 *
 * Sources, in order: the untagged `keywords` (sorted into `nl`/`en` by
 * {@link DUTCH_PATTERN}), the language-tagged structured aliases (defaulting to
 * `en`), and finally the class name itself split on capitals.
 *
 * @param parsedClass - Class to extract terms from.
 * @returns Terms per language code; `en` is always present.
 */
function extractKeywordsFromClass(parsedClass: ParsedClass): Record<string, string[]> {
  const keywords: Record<string, string[]> = {};

  /** Returns the term list for `lang`, creating it on first use. */
  const bucket = (lang: string): string[] => {
    const existing = keywords[lang];
    if (existing) return existing;
    const created: string[] = [];
    keywords[lang] = created;
    return created;
  };

  // 1. Extract from keywords array (usually language-agnostic, assume Dutch/English)
  if (parsedClass.keywords) {
    // Both buckets exist as soon as a keywords array is present, even if one stays empty.
    const dutch = bucket('nl');
    const english = bucket('en');
    for (const keyword of parsedClass.keywords) {
      (DUTCH_PATTERN.test(keyword) ? dutch : english).push(keyword.toLowerCase());
    }
  }

  // 2. Extract from structured_aliases (language-tagged)
  for (const alias of parsedClass.structuredAliases ?? []) {
    bucket(alias.in_language || 'en').push(alias.literal_form.toLowerCase());
  }

  // 3. Convert class name to a single phrase: MunicipalArchive -> "municipal archive"
  const classNamePhrase = parsedClass.className
    .replace(/([A-Z])/g, ' $1')
    .trim()
    .toLowerCase()
    .split(/\s+/)
    .join(' ');
  bucket('en').push(classNamePhrase);

  return keywords;
}

/**
 * Flattens the terms of every language into one space-separated string,
 * keeping the first occurrence of each distinct term.
 */
function accumulateTerms(keywords: Record<string, string[]>): string {
  return [...new Set(Object.values(keywords).flat())].join(' ');
}

// ============================================================================
// Embedding Generation
// ============================================================================

/**
 * Requests an embedding vector for `text` from EMBEDDING_API_URL.
 *
 * Failures are deliberately non-fatal: a warning is logged and an empty vector
 * is returned so that the vocabulary can still be written.
 *
 * @param text - Text to embed.
 * @param skipEmbeddings - When true no request is made.
 * @returns The `embedding` array of the JSON response, or `[]` when skipped,
 *          when the request fails, or when the response has no such array.
 */
async function generateEmbedding(text: string, skipEmbeddings: boolean): Promise<number[]> {
  if (skipEmbeddings) {
    // Return empty placeholder
    return [];
  }

  try {
    const response = await fetch(EMBEDDING_API_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text }),
    });

    if (!response.ok) {
      console.warn(`Embedding API error: ${response.status}`);
      return [];
    }

    const data: unknown = await response.json();
    const embedding = isRecord(data) ? data.embedding : undefined;
    return Array.isArray(embedding) ? (embedding as number[]) : [];
  } catch (error) {
    console.warn(`Embedding generation failed: ${error}`);
    return [];
  }
}
// ============================================================================
// Main Processing
// ============================================================================

/** Converts a CamelCase class name to UPPER_SNAKE_CASE, e.g. MunicipalArchive -> MUNICIPAL_ARCHIVE. */
function toUpperSnakeCase(className: string): string {
  return className.replace(/([a-z])([A-Z])/g, '$1_$2').toUpperCase();
}

/**
 * Builds the complete vocabulary from the schema files in SCHEMA_DIR.
 *
 * Steps, in order:
 *  1. every `*Type.yaml` listed in TYPE_FILE_TO_CODE becomes an institution type,
 *     enriched with BASE_TYPE_KEYWORDS and the classes of its `*Types.yaml` sibling;
 *  2. BASE_TYPE_KEYWORDS terms that are not in the term log yet are added to it;
 *  3. classes of every `*RecordSetTypes.yaml` become record set types;
 *  4. tier 1 (per type) and tier 2 (per subtype) embeddings are generated.
 *
 * Passing `--skip-embeddings` on the command line leaves all embeddings empty.
 *
 * @returns The populated vocabulary, ready to be serialised.
 */
async function processTypeFiles(): Promise<TypesVocabulary> {
  const skipEmbeddings = process.argv.includes('--skip-embeddings');

  console.log('🔍 Scanning schema directory:', SCHEMA_DIR);
  console.log(`📊 Embedding generation: ${skipEmbeddings ? 'SKIPPED' : 'ENABLED'}`);

  const vocabulary: TypesVocabulary = {
    version: new Date().toISOString(),
    schemaVersion: '20251121',
    embeddingModel: 'paraphrase-multilingual-MiniLM-L12-v2',
    embeddingDimensions: 384,
    tier1Embeddings: {},
    tier2Embeddings: {},
    termLog: {},
    institutionTypes: {},
    recordSetTypes: {},
  };

  // Sorted so that the generated file (key order and which entry wins when two
  // files contribute the same term) does not depend on directory enumeration order.
  const files = readdirSync(SCHEMA_DIR).sort();

  // Find all *Type.yaml files (base types); a name ending in "Type.yaml" can never end in "Types.yaml".
  const typeFiles = files.filter((f) => f.endsWith('Type.yaml'));
  const typesFiles = files.filter((f) => f.endsWith('Types.yaml'));

  console.log(`\n📁 Found ${typeFiles.length} Type files and ${typesFiles.length} Types files`);

  // Process base Type files
  for (const file of typeFiles) {
    const typeName = file.replace('.yaml', '');
    const code = TYPE_FILE_TO_CODE[typeName];

    if (!code) {
      console.log(` ⏭️ Skipping ${typeName} (not in GLAMORCUBESFIXPHDNT)`);
      continue;
    }

    console.log(`\n📄 Processing ${typeName} (${code})`);

    const yamlData = parseYamlFile(join(SCHEMA_DIR, file));
    if (!yamlData) continue;

    const baseClass = extractClassesFromYaml(yamlData).find((c) => c.className === typeName);
    if (!baseClass) {
      console.log(` ⚠️ No base class found in ${file}`);
      continue;
    }

    // Initialize type info
    const typeInfo: TypeInfo = {
      code,
      className: typeName,
      baseWikidata: baseClass.wikidataEntity,
      accumulatedTerms: '',
      keywords: extractKeywordsFromClass(baseClass),
      subtypes: {},
    };

    // Merge BASE_TYPE_KEYWORDS into typeInfo.keywords (deduplicated per language)
    for (const [lang, terms] of Object.entries(BASE_TYPE_KEYWORDS[code] ?? {})) {
      typeInfo.keywords[lang] = [...new Set([...(typeInfo.keywords[lang] ?? []), ...terms])];
    }

    // Look for corresponding Types file (subtypes)
    const subtypesFileName = file.replace('Type.yaml', 'Types.yaml');
    const subtypesFilePath = join(SCHEMA_DIR, subtypesFileName);

    if (existsSync(subtypesFilePath)) {
      console.log(` 📂 Processing subtypes from ${subtypesFileName}`);

      const subtypesYaml = parseYamlFile(subtypesFilePath);
      const subtypeClasses = subtypesYaml ? extractClassesFromYaml(subtypesYaml) : [];

      for (const subclass of subtypeClasses) {
        const subtypeName = toUpperSnakeCase(subclass.className);
        const subtypeKeywords = extractKeywordsFromClass(subclass);

        typeInfo.subtypes[subtypeName] = {
          className: subclass.className,
          wikidata: subclass.wikidataEntity,
          accumulatedTerms: accumulateTerms(subtypeKeywords),
          keywords: subtypeKeywords,
        };

        // Add to term log
        for (const [lang, terms] of Object.entries(subtypeKeywords)) {
          for (const term of terms) {
            vocabulary.termLog[term] = {
              typeCode: code,
              typeName,
              subtypeName,
              wikidata: subclass.wikidataEntity,
              lang,
            };
          }
        }

        console.log(` ✓ ${subclass.className}: ${Object.values(subtypeKeywords).flat().length} terms`);
      }
    }

    // Accumulate all terms for this type (base + all subtypes), deduplicated word by word
    const allTypeTerms = [
      accumulateTerms(typeInfo.keywords),
      ...Object.values(typeInfo.subtypes).map((subtype) => subtype.accumulatedTerms),
    ];
    typeInfo.accumulatedTerms = [...new Set(allTypeTerms.join(' ').split(' '))].join(' ');

    // Add base type keywords to term log. This runs after the subtypes, so a term
    // shared with a subtype ends up attributed to the base type.
    for (const [lang, terms] of Object.entries(typeInfo.keywords)) {
      for (const term of terms) {
        vocabulary.termLog[term] = { typeCode: code, typeName, lang };
      }
    }

    vocabulary.institutionTypes[code] = typeInfo;
    console.log(` ✅ ${typeName}: ${Object.keys(typeInfo.subtypes).length} subtypes, ${typeInfo.accumulatedTerms.split(' ').length} total terms`);
  }

  // Add BASE_TYPE_KEYWORDS to termLog (common user search terms not in LinkML schemas)
  console.log('\n📖 Adding base vocabulary keywords to termLog...');

  // Reverse lookup built once; the first type name registered for a code wins.
  const typeNameByCode = new Map<string, string>();
  for (const [name, typeCode] of Object.entries(TYPE_FILE_TO_CODE)) {
    if (!typeNameByCode.has(typeCode)) typeNameByCode.set(typeCode, name);
  }

  let baseTermCount = 0;
  for (const [code, langKeywords] of Object.entries(BASE_TYPE_KEYWORDS)) {
    // Handle subtype codes like 'M.ART_MUSEUM' vs base codes like 'M'
    const [baseCode, subtypeName] = code.split('.');
    const typeName = baseCode ? typeNameByCode.get(baseCode) : undefined;
    if (!baseCode || !typeName) continue;

    for (const [lang, terms] of Object.entries(langKeywords)) {
      for (const term of terms) {
        // Don't overwrite existing entries from schema (they have more specific info)
        if (vocabulary.termLog[term]) continue;

        const entry: TermLogEntry = { typeCode: baseCode, typeName, lang };
        // Add subtype info if this is a subtype entry
        if (subtypeName) {
          entry.subtypeName = subtypeName;
        }
        vocabulary.termLog[term] = entry;
        baseTermCount++;
      }
    }
  }
  console.log(` ✅ Added ${baseTermCount} base vocabulary terms (including subtype keywords)`);

  // Process RecordSetTypes files
  console.log('\n📁 Processing RecordSetTypes files...');
  const recordSetTypesFiles = files.filter((f) => f.endsWith('RecordSetTypes.yaml'));

  for (const file of recordSetTypesFiles) {
    const yamlData = parseYamlFile(join(SCHEMA_DIR, file));
    if (!yamlData) continue;

    for (const cls of extractClassesFromYaml(yamlData)) {
      // Skip abstract base classes: names ending in RecordSetType that do not
      // mention Fonds, Series or Collection.
      const isConcrete = ['Fonds', 'Series', 'Collection'].some((part) => cls.className.includes(part));
      if (cls.className.endsWith('RecordSetType') && !isConcrete) {
        continue;
      }

      const rstName = toUpperSnakeCase(cls.className);
      const keywords = extractKeywordsFromClass(cls);

      vocabulary.recordSetTypes[rstName] = {
        className: cls.className,
        accumulatedTerms: accumulateTerms(keywords),
        keywords,
      };

      // Add to term log
      for (const [lang, terms] of Object.entries(keywords)) {
        for (const term of terms) {
          vocabulary.termLog[term] = {
            typeCode: 'A', // Most record set types are archive-related
            typeName: 'ArchiveOrganizationType',
            recordSetType: rstName,
            lang,
          };
        }
      }
    }
  }
  console.log(` ✅ Extracted ${Object.keys(vocabulary.recordSetTypes).length} record set types`);

  // Generate Tier 1 embeddings (Types file level). Requests are issued one at a
  // time, exactly one per institution type.
  console.log('\n🧮 Generating Tier 1 embeddings (Types files)...');
  for (const typeInfo of Object.values(vocabulary.institutionTypes)) {
    const embedding = await generateEmbedding(typeInfo.accumulatedTerms, skipEmbeddings);
    vocabulary.tier1Embeddings[typeInfo.className] = embedding;
    console.log(` ✓ ${typeInfo.className}: ${embedding.length} dimensions`);
  }

  // Generate Tier 2 embeddings (individual subtypes)
  console.log('\n🧮 Generating Tier 2 embeddings (subtypes)...');
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    const subtypeEmbeddings: Record<string, number[]> = {};
    vocabulary.tier2Embeddings[code] = subtypeEmbeddings;

    for (const [subtypeName, subtypeInfo] of Object.entries(typeInfo.subtypes)) {
      subtypeEmbeddings[subtypeName] = await generateEmbedding(subtypeInfo.accumulatedTerms, skipEmbeddings);
    }
    console.log(` ✓ ${typeInfo.className}: ${Object.keys(typeInfo.subtypes).length} subtype embeddings`);
  }

  return vocabulary;
}

// ============================================================================
// Main Entry Point
// ============================================================================

/**
 * Runs the extraction, writes the vocabulary to OUTPUT_FILE as pretty-printed
 * JSON and prints a summary of what was extracted.
 */
async function main(): Promise<void> {
  const rule = '═══════════════════════════════════════════════════════════════';

  console.log(rule);
  console.log(' TypesVocabulary Extraction Script');
  console.log(' Ontology-Driven Cache Segmentation (Rule 46)');
  console.log(`${rule}\n`);

  const vocabulary = await processTypeFiles();

  // Ensure output directory exists; with `recursive` this is a no-op when it already does.
  mkdirSync(dirname(OUTPUT_FILE), { recursive: true });

  // Write output
  writeFileSync(OUTPUT_FILE, JSON.stringify(vocabulary, null, 2));

  const institutionTypes = Object.values(vocabulary.institutionTypes);
  const subtypeCount = institutionTypes.reduce((sum, t) => sum + Object.keys(t.subtypes).length, 0);
  const tier2Count = Object.values(vocabulary.tier2Embeddings).reduce((sum, t) => sum + Object.keys(t).length, 0);

  console.log(`\n${rule}`);
  console.log(' Summary');
  console.log(rule);
  console.log(` 📊 Institution Types: ${institutionTypes.length}`);
  console.log(` 📊 Total Subtypes: ${subtypeCount}`);
  console.log(` 📊 Record Set Types: ${Object.keys(vocabulary.recordSetTypes).length}`);
  console.log(` 📊 Term Log Entries: ${Object.keys(vocabulary.termLog).length}`);
  console.log(` 📊 Tier 1 Embeddings: ${Object.keys(vocabulary.tier1Embeddings).length}`);
  console.log(` 📊 Tier 2 Embeddings: ${tier2Count}`);
  console.log(`\n ✅ Output written to: ${OUTPUT_FILE}`);
  console.log(`${rule}\n`);
}

// Report the failure AND exit non-zero so that build pipelines notice it.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});