#!/usr/bin/env node
/**
 * extract-types-vocab.ts
 *
 * Extracts vocabulary from LinkML *Type.yaml and *Types.yaml schema files
 * and generates embeddings for two-tier semantic routing.
 *
 * Output: apps/archief-assistent/public/types-vocab.json
 *
 * Usage:
 *   npx tsx scripts/extract-types-vocab.ts
 *   npx tsx scripts/extract-types-vocab.ts --skip-embeddings  # Skip embedding generation
 *
 * See: .opencode/rules/ontology-driven-cache-segmentation.md
 */
|
|
|
|
import { readFileSync, writeFileSync, readdirSync, existsSync, mkdirSync } from 'node:fs';
import { basename, join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { parse as parseYaml } from 'yaml';
|
|
|
|
// ESM compatibility: ES modules do not provide __filename/__dirname,
// so both are derived from this module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ============================================================================
// Configuration
// ============================================================================

/** Directory that is scanned for LinkML *Type.yaml / *Types.yaml class modules. */
const SCHEMA_DIR = join(__dirname, '../schemas/20251121/linkml/modules/classes');

/** Path of the generated vocabulary JSON file. */
const OUTPUT_FILE = join(__dirname, '../apps/archief-assistent/public/types-vocab.json');

/**
 * Endpoint that embedding requests are POSTed to. Can be overridden with the
 * EMBEDDING_API_URL environment variable; `||` (not `??`) is intentional so an
 * empty variable also falls back to the default.
 */
const EMBEDDING_API_URL = process.env.EMBEDDING_API_URL || 'http://localhost:8000/api/embed';
|
|
|
|
/**
 * GLAMORCUBESFIXPHDNT code mapping.
 *
 * Maps the base name of a `*Type.yaml` schema file (which is also the name of
 * the base class expected inside it) to its single-letter institution type code.
 * Type files whose name is not listed here are skipped during processing.
 */
const TYPE_FILE_TO_CODE: Record<string, string> = {
  'ArchiveOrganizationType': 'A',
  'BioCustodianType': 'B',
  'CommercialOrganizationType': 'C',
  'DigitalPlatformType': 'D',
  'EducationProviderType': 'E',
  'FeatureCustodianType': 'F',
  'GalleryType': 'G',
  'HolySacredSiteType': 'H',
  'IntangibleHeritageGroupType': 'I',
  'LibraryType': 'L',
  'MuseumType': 'M',
  'NonProfitType': 'N',
  'OfficialInstitutionType': 'O',
  'PersonalCollectionType': 'P',
  'ResearchOrganizationType': 'R',
  'HeritageSocietyType': 'S',
  'TasteScentHeritageType': 'T',
  'UnspecifiedType': 'U',
  'MixedCustodianType': 'X',
};
|
|
|
|
/**
 * Base vocabulary for the 19 GLAMORCUBESFIXPHDNT types.
 * These are common terms users search for that aren't in the LinkML schema keywords.
 *
 * Key scheme:
 *   - 'M'              -> keywords for a base institution type (single-letter code)
 *   - 'M.ART_MUSEUM'   -> keywords for a subtype, written as '<code>.<SUBTYPE_NAME>'
 *
 * Each value maps a language code (nl, en, de, fr) to lower-case search terms.
 * Entries are consulted in declaration order and the first mapping of a term
 * wins when the term log is filled, so order matters for terms listed twice.
 */
const BASE_TYPE_KEYWORDS: Record<string, Record<string, string[]>> = {
  'A': {
    nl: ['archief', 'archieven', 'archivaris', 'archiveren', 'archiefstuk', 'archiefstukken'],
    en: ['archive', 'archives', 'archivist', 'archival'],
    de: ['archiv', 'archive'],
    fr: ['archives', 'archiviste'],
  },
  // Archive subtypes - commonly searched terms for specific archive categories
  'A.MUNICIPAL_ARCHIVE': {
    nl: ['gemeentearchief', 'gemeentearchieven', 'stadsarchief', 'stadsarchieven'],
    en: ['municipal archive', 'municipal archives', 'city archive', 'city archives'],
    de: ['stadtarchiv', 'kommunalarchiv'],
    fr: ['archives municipales', 'archives communales'],
  },
  'A.REGIONAL_ARCHIVE': {
    nl: ['regionaal archief', 'regionale archieven', 'streekarchief', 'streekarchieven', 'provinciaal archief'],
    en: ['regional archive', 'regional archives', 'provincial archive'],
    de: ['regionalarchiv', 'landesarchiv'],
    fr: ['archives régionales', 'archives départementales'],
  },
  'A.NATIONAL_ARCHIVE': {
    nl: ['nationaal archief', 'rijksarchief', 'nationale archieven'],
    en: ['national archive', 'national archives', 'state archive'],
    de: ['nationalarchiv', 'bundesarchiv'],
    fr: ['archives nationales'],
  },
  'A.CHURCH_ARCHIVE': {
    nl: ['kerkarchief', 'kerkarchieven', 'parochiearchief', 'kerkelijke archieven'],
    en: ['church archive', 'church archives', 'parish archive', 'ecclesiastical archive'],
    de: ['kirchenarchiv', 'pfarrarchiv'],
    fr: ['archives paroissiales', 'archives ecclésiastiques'],
  },
  'A.BUSINESS_ARCHIVE': {
    nl: ['bedrijfsarchief', 'bedrijfsarchieven', 'ondernemingsarchief'],
    en: ['business archive', 'business archives', 'corporate archive', 'company archive'],
    de: ['unternehmensarchiv', 'firmenarchiv', 'wirtschaftsarchiv'],
    fr: ['archives d\'entreprise', 'archives économiques'],
  },
  'A.UNIVERSITY_ARCHIVE': {
    nl: ['universiteitsarchief', 'universitaire archieven', 'academisch archief'],
    en: ['university archive', 'university archives', 'academic archive'],
    de: ['universitätsarchiv', 'hochschularchiv'],
    fr: ['archives universitaires'],
  },
  'A.FAMILY_ARCHIVE': {
    nl: ['familiearchief', 'familiearchieven', 'huisarchief'],
    en: ['family archive', 'family archives', 'house archive', 'estate archive'],
    de: ['familienarchiv', 'hausarchiv'],
    fr: ['archives familiales'],
  },
  'A.NOTARIAL_ARCHIVE': {
    nl: ['notarieel archief', 'notariële archieven', 'notarisarchief'],
    en: ['notarial archive', 'notarial archives', 'notary archive'],
    de: ['notariatsarchiv'],
    fr: ['archives notariales'],
  },
  'B': {
    nl: ['dierentuin', 'dierentuinen', 'zoo', 'botanische tuin', 'arboretum', 'aquarium'],
    en: ['zoo', 'zoos', 'botanical garden', 'botanical gardens', 'arboretum', 'aquarium'],
    de: ['zoo', 'botanischer garten'],
    fr: ['zoo', 'jardin botanique'],
  },
  'C': {
    nl: ['bedrijfsarchief', 'bedrijfsarchieven', 'bedrijfscollectie', 'ondernemingsarchief'],
    en: ['corporate archive', 'corporate archives', 'business archive', 'company archive'],
    de: ['unternehmensarchiv', 'firmenarchiv'],
    fr: ['archives d\'entreprise'],
  },
  'D': {
    nl: ['digitaal platform', 'digitale platformen', 'online archief', 'digitale bibliotheek'],
    en: ['digital platform', 'digital platforms', 'online archive', 'digital library'],
    de: ['digitale plattform', 'online-archiv'],
    fr: ['plateforme numérique', 'archives numériques'],
  },
  'E': {
    nl: ['universiteit', 'universiteiten', 'hogeschool', 'hogescholen', 'onderwijsinstelling', 'school', 'scholen'],
    en: ['university', 'universities', 'college', 'colleges', 'school', 'schools', 'educational institution'],
    de: ['universität', 'hochschule', 'schule'],
    fr: ['université', 'école', 'établissement scolaire'],
  },
  'F': {
    nl: ['monument', 'monumenten', 'standbeeld', 'standbeelden', 'gedenkteken', 'begraafplaats', 'begraafplaatsen'],
    en: ['monument', 'monuments', 'statue', 'statues', 'memorial', 'cemetery', 'cemeteries'],
    de: ['denkmal', 'denkmäler', 'statue', 'friedhof'],
    fr: ['monument', 'monuments', 'statue', 'cimetière'],
  },
  'G': {
    nl: ['galerie', 'galerij', 'galerijen', 'kunstgalerie', 'kunstgalerij', 'kunsthal'],
    en: ['gallery', 'galleries', 'art gallery', 'art galleries', 'kunsthalle'],
    de: ['galerie', 'galerien', 'kunstgalerie', 'kunsthalle'],
    fr: ['galerie', 'galeries', 'galerie d\'art'],
  },
  'H': {
    nl: ['kerk', 'kerken', 'kathedraal', 'kapel', 'moskee', 'synagoge', 'tempel', 'klooster', 'abdij'],
    en: ['church', 'churches', 'cathedral', 'chapel', 'mosque', 'synagogue', 'temple', 'monastery', 'abbey'],
    de: ['kirche', 'kathedrale', 'kapelle', 'moschee', 'synagoge', 'tempel', 'kloster', 'abtei'],
    fr: ['église', 'cathédrale', 'chapelle', 'mosquée', 'synagogue', 'temple', 'monastère', 'abbaye'],
  },
  'I': {
    nl: ['immaterieel erfgoed', 'tradities', 'folklore', 'volkscultuur'],
    en: ['intangible heritage', 'traditions', 'folklore', 'oral history'],
    de: ['immaterielles erbe', 'traditionen', 'folklore'],
    fr: ['patrimoine immatériel', 'traditions', 'folklore'],
  },
  'L': {
    nl: ['bibliotheek', 'bibliotheken', 'bieb', 'boekerij', 'mediatheek', 'leeszaal'],
    en: ['library', 'libraries', 'public library', 'reading room'],
    de: ['bibliothek', 'bibliotheken', 'bücherei'],
    fr: ['bibliothèque', 'bibliothèques', 'médiathèque'],
  },
  // Library subtypes - commonly searched terms for specific library categories
  'L.PUBLIC_LIBRARY': {
    nl: ['openbare bibliotheek', 'openbare bibliotheken', 'stadsbibliotheek', 'gemeentebibliotheek'],
    en: ['public library', 'public libraries', 'city library'],
    de: ['öffentliche bibliothek', 'stadtbibliothek', 'stadtbücherei'],
    fr: ['bibliothèque publique', 'bibliothèque municipale'],
  },
  'L.ACADEMIC_LIBRARY': {
    nl: ['universiteitsbibliotheek', 'wetenschappelijke bibliotheek', 'academische bibliotheek', 'hogeschoolbibliotheek'],
    en: ['academic library', 'university library', 'research library'],
    de: ['universitätsbibliothek', 'wissenschaftliche bibliothek', 'hochschulbibliothek'],
    fr: ['bibliothèque universitaire', 'bibliothèque académique'],
  },
  'L.NATIONAL_LIBRARY': {
    nl: ['nationale bibliotheek', 'koninklijke bibliotheek', 'kb'],
    en: ['national library', 'royal library'],
    de: ['nationalbibliothek', 'staatsbibliothek'],
    fr: ['bibliothèque nationale'],
  },
  'L.SPECIAL_LIBRARY': {
    nl: ['speciale bibliotheek', 'vakbibliotheek', 'gespecialiseerde bibliotheek'],
    en: ['special library', 'specialized library', 'subject library'],
    de: ['spezialbibliothek', 'fachbibliothek'],
    fr: ['bibliothèque spécialisée'],
  },
  'L.SCHOOL_LIBRARY': {
    nl: ['schoolbibliotheek', 'schoolbibliotheken', 'schoolmediatheek'],
    en: ['school library', 'school libraries'],
    de: ['schulbibliothek', 'schulbücherei'],
    fr: ['bibliothèque scolaire'],
  },
  'L.CHILDRENS_LIBRARY': {
    nl: ['jeugdbibliotheek', 'kinderbibliotheek', 'jeugdafdeling'],
    en: ['children\'s library', 'youth library', 'kids library'],
    de: ['kinderbibliothek', 'jugendbibliothek'],
    fr: ['bibliothèque jeunesse', 'bibliothèque pour enfants'],
  },
  'L.THEOLOGICAL_LIBRARY': {
    nl: ['theologische bibliotheek', 'kerkelijke bibliotheek', 'kloosterbibliotheek'],
    en: ['theological library', 'religious library', 'monastery library'],
    de: ['theologische bibliothek', 'klosterbibliothek'],
    fr: ['bibliothèque théologique', 'bibliothèque monastique'],
  },
  'L.MUSIC_LIBRARY': {
    nl: ['muziekbibliotheek', 'muziekcollectie'],
    en: ['music library', 'music collection'],
    de: ['musikbibliothek'],
    fr: ['bibliothèque musicale', 'médiathèque musicale'],
  },
  'M': {
    nl: ['museum', 'musea', 'museums', 'tentoonstellingsruimte', 'expositieruimte'],
    en: ['museum', 'museums', 'exhibition space'],
    de: ['museum', 'museen'],
    fr: ['musée', 'musées'],
  },
  // Museum subtypes - commonly searched terms that map to specific museum categories
  'M.ART_MUSEUM': {
    nl: ['kunstmuseum', 'kunstmusea', 'kunstcollectie'],
    en: ['art museum', 'art museums', 'fine art museum'],
    de: ['kunstmuseum', 'kunstmuseen'],
    fr: ['musée d\'art', 'musée des beaux-arts'],
  },
  'M.NATURAL_HISTORY_MUSEUM': {
    nl: ['natuurhistorisch museum', 'natuurmuseum', 'natuurmusea'],
    en: ['natural history museum', 'natural history museums'],
    de: ['naturkundemuseum', 'naturhistorisches museum'],
    fr: ['musée d\'histoire naturelle'],
  },
  'M.SCIENCE_MUSEUM': {
    nl: ['wetenschapsmuseum', 'techniekmuseum', 'wetenschapsmusea'],
    en: ['science museum', 'science museums', 'technology museum'],
    de: ['wissenschaftsmuseum', 'technikmuseum'],
    fr: ['musée des sciences', 'musée de la technique'],
  },
  'M.HISTORY_MUSEUM': {
    nl: ['historisch museum', 'geschiedenismuseum', 'historische musea'],
    en: ['history museum', 'history museums', 'historical museum'],
    de: ['geschichtsmuseum', 'historisches museum'],
    fr: ['musée d\'histoire', 'musée historique'],
  },
  'M.OPEN_AIR_MUSEUM': {
    nl: ['openluchtmuseum', 'openluchtmusea', 'freilichtmuseum'],
    en: ['open-air museum', 'open air museum', 'outdoor museum'],
    de: ['freilichtmuseum', 'freiluftmuseum'],
    fr: ['musée de plein air', 'écomusée'],
  },
  'M.ARCHAEOLOGICAL_MUSEUM': {
    nl: ['archeologisch museum', 'oudheidkundig museum', 'archeologische musea'],
    en: ['archaeological museum', 'archaeology museum'],
    de: ['archäologisches museum'],
    fr: ['musée archéologique'],
  },
  'M.MARITIME_MUSEUM': {
    nl: ['maritiem museum', 'scheepvaartmuseum', 'maritieme musea'],
    en: ['maritime museum', 'naval museum', 'shipping museum'],
    de: ['schifffahrtsmuseum', 'maritimes museum'],
    fr: ['musée maritime', 'musée de la marine'],
  },
  'M.MILITARY_MUSEUM': {
    nl: ['militair museum', 'legermuseum', 'oorlogsmuseum', 'militaire musea'],
    en: ['military museum', 'war museum', 'army museum'],
    de: ['militärmuseum', 'heeresmuseum', 'kriegsmuseum'],
    fr: ['musée militaire', 'musée de l\'armée'],
  },
  'M.ETHNOGRAPHIC_MUSEUM': {
    nl: ['volkenkundig museum', 'etnografisch museum', 'volkenkundige musea'],
    en: ['ethnographic museum', 'ethnography museum', 'anthropology museum'],
    de: ['völkerkundemuseum', 'ethnologisches museum'],
    fr: ['musée ethnographique', 'musée d\'ethnologie'],
  },
  'M.FOLK_MUSEUM': {
    nl: ['volkskundemuseum', 'heemkundig museum', 'folkloristisch museum'],
    en: ['folk museum', 'folklore museum', 'heritage museum'],
    de: ['volkskundemuseum', 'heimatmuseum'],
    fr: ['musée du folklore', 'musée des traditions populaires'],
  },
  'M.LOCAL_HISTORY_MUSEUM': {
    nl: ['streekmuseum', 'stadsmuseum', 'gemeentemuseum', 'lokaal museum'],
    en: ['local history museum', 'city museum', 'regional museum'],
    de: ['heimatmuseum', 'stadtmuseum', 'regionalmuseum'],
    fr: ['musée local', 'musée régional', 'musée de la ville'],
  },
  'M.TRANSPORT_MUSEUM': {
    nl: ['vervoermuseum', 'spoorwegmuseum', 'automuseum', 'trammuseum'],
    en: ['transport museum', 'railway museum', 'automobile museum', 'car museum'],
    de: ['verkehrsmuseum', 'eisenbahnmuseum', 'automuseum'],
    fr: ['musée des transports', 'musée du chemin de fer', 'musée de l\'automobile'],
  },
  'M.CHILDRENS_MUSEUM': {
    nl: ['kindermuseum', 'kindermusea'],
    en: ['children\'s museum', 'kids museum'],
    de: ['kindermuseum'],
    fr: ['musée pour enfants'],
  },
  'N': {
    nl: ['stichting', 'stichtingen', 'ngo', 'non-profit', 'goed doel'],
    en: ['foundation', 'foundations', 'ngo', 'non-profit', 'nonprofit', 'charity'],
    de: ['stiftung', 'stiftungen', 'ngo'],
    fr: ['fondation', 'fondations', 'ong'],
  },
  'O': {
    // Note: Archive-specific terms (gemeentearchief, nationaal archief, etc.) are now mapped to
    // archive subtypes (A.MUNICIPAL_ARCHIVE, A.NATIONAL_ARCHIVE, etc.) for more precise cache segmentation
    nl: ['overheidsinstelling', 'rijksdienst', 'erfgoeddienst', 'erfgoedinstelling', 'rijksoverheid'],
    en: ['government institution', 'heritage agency', 'heritage service'],
    de: ['staatliche einrichtung', 'denkmalamt', 'kulturamt'],
    fr: ['institution gouvernementale', 'service du patrimoine'],
  },
  'P': {
    nl: ['privécollectie', 'privéverzameling', 'particuliere collectie', 'verzamelaar'],
    en: ['private collection', 'personal collection', 'collector', 'private archive'],
    de: ['privatsammlung', 'privatarchiv', 'sammler'],
    fr: ['collection privée', 'collection personnelle', 'collectionneur'],
  },
  'R': {
    nl: ['onderzoeksinstituut', 'kenniscentrum', 'documentatiecentrum', 'studiecentrum'],
    en: ['research institute', 'research center', 'knowledge center', 'documentation center', 'study center'],
    de: ['forschungsinstitut', 'forschungszentrum', 'dokumentationszentrum'],
    fr: ['institut de recherche', 'centre de recherche', 'centre de documentation'],
  },
  'S': {
    nl: ['historische vereniging', 'heemkundige kring', 'oudheidkundige vereniging', 'genootschap', 'erfgoedvereniging'],
    en: ['historical society', 'heritage society', 'antiquarian society', 'local history society'],
    de: ['geschichtsverein', 'heimatverein', 'altertumsverein'],
    fr: ['société historique', 'société d\'histoire', 'association patrimoniale'],
  },
  'T': {
    nl: ['culinair erfgoed', 'gastronomisch erfgoed', 'parfumerie', 'distilleerderij', 'brouwerij'],
    en: ['culinary heritage', 'gastronomic heritage', 'perfumery', 'distillery', 'brewery', 'taste heritage'],
    de: ['kulinarisches erbe', 'gastronomisches erbe', 'parfümerie', 'destillerie', 'brauerei'],
    fr: ['patrimoine culinaire', 'patrimoine gastronomique', 'parfumerie', 'distillerie', 'brasserie'],
  },
  'U': {
    nl: ['onbekend', 'niet gespecificeerd'],
    en: ['unknown', 'unspecified', 'unclassified'],
    de: ['unbekannt', 'nicht spezifiziert'],
    fr: ['inconnu', 'non spécifié'],
  },
  'X': {
    nl: ['gemengd', 'gecombineerd', 'museum en archief', 'archief en bibliotheek'],
    en: ['mixed', 'combined', 'museum and archive', 'archive and library'],
    de: ['gemischt', 'kombiniert'],
    fr: ['mixte', 'combiné'],
  },
};
|
|
|
|
// ============================================================================
// Types
// ============================================================================
|
|
|
|
/** What a single vocabulary term (the key in `termLog`) resolves to. */
interface TermLogEntry {
  /** Single-letter institution type code, e.g. 'M'. */
  typeCode: string;
  /** Name of the base type class, e.g. 'MuseumType'. */
  typeName: string;
  /** UPPER_SNAKE_CASE subtype name when the term belongs to a subtype. */
  subtypeName?: string;
  /** UPPER_SNAKE_CASE record set type name when the term belongs to one. */
  recordSetType?: string;
  /** Wikidata identifier taken from the schema class, when one was found. */
  wikidata?: string;
  /** Language code the term was filed under (e.g. 'nl', 'en'). */
  lang: string;
}

/** Vocabulary extracted for one subtype class of an institution type. */
interface SubtypeInfo {
  className: string;
  wikidata?: string;
  /** All unique keywords of the subtype joined with single spaces (embedding input). */
  accumulatedTerms: string;
  /** Keywords grouped by language code. */
  keywords: Record<string, string[]>;
}

/** Vocabulary extracted for one base institution type and its subtypes. */
interface TypeInfo {
  /** Single-letter institution type code. */
  code: string;
  className: string;
  baseWikidata?: string;
  /** Space-joined, de-duplicated terms of the base type and all of its subtypes. */
  accumulatedTerms: string;
  /** Keywords of the base type grouped by language code. */
  keywords: Record<string, string[]>;
  /** Subtypes keyed by UPPER_SNAKE_CASE subtype name. */
  subtypes: Record<string, SubtypeInfo>;
}

/** Vocabulary extracted for one record set type class. */
interface RecordSetTypeInfo {
  className: string;
  /** All unique keywords joined with single spaces. */
  accumulatedTerms: string;
  /** Keywords grouped by language code. */
  keywords: Record<string, string[]>;
}

/** Shape of the generated types-vocab.json document. */
interface TypesVocabulary {
  /** ISO timestamp of the moment the vocabulary was generated. */
  version: string;
  schemaVersion: string;
  embeddingModel: string;
  embeddingDimensions: number;
  /** Tier 1 embeddings keyed by base type class name. */
  tier1Embeddings: Record<string, number[]>;
  /** Tier 2 embeddings keyed by type code, then by subtype name. */
  tier2Embeddings: Record<string, Record<string, number[]>>;
  /** Lookup from a lower-case term to the type it belongs to. */
  termLog: Record<string, TermLogEntry>;
  /** Institution types keyed by single-letter type code. */
  institutionTypes: Record<string, TypeInfo>;
  /** Record set types keyed by UPPER_SNAKE_CASE name. */
  recordSetTypes: Record<string, RecordSetTypeInfo>;
}

/** The fields of a LinkML class definition that this script reads. */
interface ParsedClass {
  className: string;
  description?: string;
  keywords?: string[];
  structuredAliases?: Array<{ literal_form: string; in_language?: string }>;
  wikidataEntity?: string;
  /** Value of the class's `is_a` field. */
  isSubtypeOf?: string;
}
|
|
|
|
// ============================================================================
// YAML Parsing
// ============================================================================
|
|
|
|
/**
 * Reads a YAML file and returns its root mapping.
 *
 * This is best-effort: any failure is reported with `console.warn` and turned
 * into `null` so that the caller can skip the file and carry on.
 *
 * @param filePath Path of the YAML file to read.
 * @returns The parsed root mapping, or `null` when the file cannot be read,
 *          cannot be parsed, or its root is not a mapping (empty document,
 *          scalar or sequence).
 */
function parseYamlFile(filePath: string): Record<string, unknown> | null {
  try {
    const content = readFileSync(filePath, 'utf-8');
    const parsed: unknown = parseYaml(content);

    // The parser can legitimately return null, a scalar or an array; none of
    // those matches the declared return type, so reject them here rather than
    // letting them flow on disguised as a record.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      console.warn(`Warning: ${filePath} does not contain a YAML mapping at its root`);
      return null;
    }

    return parsed as Record<string, unknown>;
  } catch (error) {
    console.warn(`Warning: Could not parse ${filePath}: ${error}`);
    return null;
  }
}
|
|
|
|
/**
 * Collects the class definitions found under the `classes` key of a parsed
 * LinkML schema document.
 *
 * Abstract classes are skipped unless their name ends in "Type" (the main type
 * class of a file is abstract but still needed). For every remaining class the
 * description, keywords, structured aliases, `is_a` parent and a Wikidata
 * identifier are picked up. The Wikidata identifier is resolved in this order
 * of precedence: `exact_mappings`, then `slot_usage.wikidata_entity.equals_string`,
 * then `broad_mappings`.
 *
 * @param yamlData Root mapping of a parsed schema file.
 * @returns One entry per usable class, in document order; empty when the
 *          document has no `classes` mapping.
 */
function extractClassesFromYaml(yamlData: Record<string, unknown>): ParsedClass[] {
  const classes: ParsedClass[] = [];
  const classesSection = yamlData.classes;

  if (typeof classesSection !== 'object' || classesSection === null) return classes;

  // Both CURIE prefixes are in use for Wikidata entities.
  const wikidataPrefix = /^(wd:|wikidata:)/;

  // Returns the identifier of the first Wikidata CURIE in a mapping list,
  // ignoring entries that are not strings.
  const findWikidataId = (mappings: unknown): string | undefined => {
    if (!Array.isArray(mappings)) return undefined;
    const curie = mappings.find(
      (m): m is string => typeof m === 'string' && wikidataPrefix.test(m),
    );
    return curie === undefined ? undefined : curie.replace(wikidataPrefix, '');
  };

  for (const [className, classDef] of Object.entries(classesSection)) {
    if (typeof classDef !== 'object' || classDef === null) continue;

    const classData = classDef as Record<string, unknown>;

    // Skip abstract base classes (except the main Type class)
    if (classData.abstract === true && !className.endsWith('Type')) continue;

    const parsed: ParsedClass = {
      className,
      description: classData.description as string | undefined,
      keywords: classData.keywords as string[] | undefined,
      structuredAliases: classData.structured_aliases as
        | Array<{ literal_form: string; in_language?: string }>
        | undefined,
      isSubtypeOf: classData.is_a as string | undefined,
    };

    // Wikidata entity pinned through slot_usage
    const slotUsage = classData.slot_usage as Record<string, unknown> | undefined;
    if (slotUsage?.wikidata_entity) {
      const wdSlot = slotUsage.wikidata_entity as Record<string, unknown>;
      parsed.wikidataEntity = wdSlot.equals_string as string | undefined;
    }

    // An exact mapping overrides whatever slot_usage provided
    const exactId = findWikidataId(classData.exact_mappings);
    if (exactId !== undefined) {
      parsed.wikidataEntity = exactId;
    }

    // A broad mapping is only a fallback
    if (!parsed.wikidataEntity) {
      const broadId = findWikidataId(classData.broad_mappings);
      if (broadId !== undefined) {
        parsed.wikidataEntity = broadId;
      }
    }

    classes.push(parsed);
  }

  return classes;
}
|
|
|
|
/**
 * Builds the per-language keyword lists for one schema class.
 *
 * Terms come from three places, in this order:
 *   1. the untagged `keywords` list, split between 'nl' and 'en' by a spelling
 *      heuristic;
 *   2. `structured_aliases`, filed under their `in_language` tag ('en' when
 *      the tag is absent);
 *   3. the class name itself, split into words and filed under 'en'
 *      (MunicipalArchive -> "municipal archive").
 *
 * All terms are lower-cased. Duplicates are not removed here.
 *
 * @param parsedClass Class definition to harvest terms from.
 * @returns Terms grouped by language code; always contains an 'en' list.
 */
function extractKeywordsFromClass(parsedClass: ParsedClass): Record<string, string[]> {
  const keywords: Record<string, string[]> = {};

  // Letter sequences that are typical for Dutch spelling. This is only a
  // heuristic: English words such as "school" or "book" still match it.
  // Note the digraph `ij`, not the character class `[ij]`, which would flag
  // every word containing an "i" or a "j".
  const dutchPattern = /ij|sch|cht|aa|ee|oo|uu/i;

  // 1. Untagged keywords: guess between Dutch and English
  if (Array.isArray(parsedClass.keywords)) {
    keywords['nl'] = keywords['nl'] || [];
    keywords['en'] = keywords['en'] || [];
    for (const kw of parsedClass.keywords) {
      if (typeof kw !== 'string') continue;
      const lang = dutchPattern.test(kw) ? 'nl' : 'en';
      keywords[lang].push(kw.toLowerCase());
    }
  }

  // 2. Language-tagged structured aliases
  if (Array.isArray(parsedClass.structuredAliases)) {
    for (const alias of parsedClass.structuredAliases) {
      // An alias without a literal form carries no term; skip it instead of
      // failing on `.toLowerCase()`.
      if (!alias || typeof alias.literal_form !== 'string') continue;
      const lang = alias.in_language || 'en';
      keywords[lang] = keywords[lang] || [];
      keywords[lang].push(alias.literal_form.toLowerCase());
    }
  }

  // 3. Class name as a phrase. Runs of capitals are kept together so that an
  //    acronym stays one word: NGOType -> "ngo type", not "n g o type".
  const classNamePhrase = parsedClass.className
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
    .trim()
    .toLowerCase()
    .split(/\s+/)
    .join(' ');

  keywords['en'] = keywords['en'] || [];
  keywords['en'].push(classNamePhrase);

  return keywords;
}
|
|
|
|
/**
 * Flattens per-language keyword lists into a single space-separated string.
 *
 * Terms keep the order in which they are first encountered (languages in
 * object key order, terms in list order); later duplicates are dropped.
 *
 * @param keywords Terms grouped by language code.
 * @returns The unique terms joined with single spaces; '' when there are none.
 */
function accumulateTerms(keywords: Record<string, string[]>): string {
  const uniqueTerms = new Set(Object.values(keywords).flat());
  return [...uniqueTerms].join(' ');
}
|
|
|
|
// ============================================================================
// Embedding Generation
// ============================================================================
|
|
|
|
/**
 * Requests an embedding vector for a piece of text from the embedding API.
 *
 * The call is best-effort: it never rejects. Every failure (network error,
 * non-2xx status, malformed payload) is reported with `console.warn` and
 * yields an empty vector so that the rest of the vocabulary can still be
 * written.
 *
 * @param text Text to embed; sent as `{ "text": ... }` in a JSON POST body.
 * @param skipEmbeddings When true no request is made at all.
 * @returns The `embedding` array from the response, or `[]` when skipped or
 *          when anything went wrong.
 */
async function generateEmbedding(text: string, skipEmbeddings: boolean): Promise<number[]> {
  if (skipEmbeddings) {
    // Return empty placeholder
    return [];
  }

  try {
    const response = await fetch(EMBEDDING_API_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text }),
    });

    if (!response.ok) {
      console.warn(`Embedding API error: ${response.status}`);
      return [];
    }

    const data: unknown = await response.json();
    const embedding = (data as { embedding?: unknown } | null)?.embedding;

    // Only a plain array of numbers may end up in the output file; anything
    // else (missing field, error string, nested structure) is discarded.
    if (!Array.isArray(embedding) || !embedding.every((value) => typeof value === 'number')) {
      console.warn('Embedding API returned no usable "embedding" array');
      return [];
    }

    return embedding;
  } catch (error) {
    console.warn(`Embedding generation failed: ${error}`);
    return [];
  }
}
|
|
|
|
// ============================================================================
// Main Processing
// ============================================================================
|
|
|
|
/**
 * Builds the complete types vocabulary from the schema directory.
 *
 * Steps:
 *   1. For every `*Type.yaml` file listed in TYPE_FILE_TO_CODE: read the base
 *      class, merge in the hand-written BASE_TYPE_KEYWORDS for its code, and
 *      read subtypes from the sibling `*Types.yaml` file when it exists.
 *   2. Add all BASE_TYPE_KEYWORDS terms (base and subtype keys) to the term
 *      log without replacing entries that already exist.
 *   3. Read record set types from every `*RecordSetTypes.yaml` file.
 *   4. Generate Tier 1 (one per institution type) and Tier 2 (one per subtype)
 *      embeddings, unless `--skip-embeddings` was passed on the command line.
 *
 * The term log is keyed by term, so when the same term is produced more than
 * once the last write in steps 1 and 3 wins. Files are therefore processed in
 * sorted order to keep the result independent of directory listing order.
 *
 * @returns The populated vocabulary; embeddings are empty arrays when skipped
 *          or when the embedding API failed.
 * @throws When the schema directory cannot be listed.
 */
async function processTypeFiles(): Promise<TypesVocabulary> {
  const skipEmbeddings = process.argv.includes('--skip-embeddings');

  // CamelCase -> UPPER_SNAKE_CASE, e.g. "ArtMuseum" -> "ART_MUSEUM"
  const toUpperSnakeCase = (name: string): string =>
    name.replace(/([a-z])([A-Z])/g, '$1_$2').toUpperCase();

  console.log('🔍 Scanning schema directory:', SCHEMA_DIR);
  console.log(`📊 Embedding generation: ${skipEmbeddings ? 'SKIPPED' : 'ENABLED'}`);

  const vocabulary: TypesVocabulary = {
    version: new Date().toISOString(),
    schemaVersion: '20251121',
    embeddingModel: 'paraphrase-multilingual-MiniLM-L12-v2',
    embeddingDimensions: 384,
    tier1Embeddings: {},
    tier2Embeddings: {},
    termLog: {},
    institutionTypes: {},
    recordSetTypes: {},
  };

  // Directory listing order is platform dependent; sort it so that repeated
  // runs on different machines overwrite term log entries in the same order.
  const files = readdirSync(SCHEMA_DIR).sort();
  // A name ending in "Type.yaml" can never end in "Types.yaml", so one test
  // is enough to separate base type files from subtype files.
  const typeFiles = files.filter(f => f.endsWith('Type.yaml'));
  const typesFiles = files.filter(f => f.endsWith('Types.yaml'));

  console.log(`\n📁 Found ${typeFiles.length} Type files and ${typesFiles.length} Types files`);

  // Process base Type files
  for (const file of typeFiles) {
    const typeName = file.replace('.yaml', '');
    const code = TYPE_FILE_TO_CODE[typeName];

    if (!code) {
      console.log(` ⏭️ Skipping ${typeName} (not in GLAMORCUBESFIXPHDNT)`);
      continue;
    }

    console.log(`\n📄 Processing ${typeName} (${code})`);

    const yamlData = parseYamlFile(join(SCHEMA_DIR, file));
    if (!yamlData) continue;

    const baseClass = extractClassesFromYaml(yamlData).find(c => c.className === typeName);

    if (!baseClass) {
      console.log(` ⚠️ No base class found in ${file}`);
      continue;
    }

    // Initialize type info
    const typeInfo: TypeInfo = {
      code,
      className: typeName,
      baseWikidata: baseClass.wikidataEntity,
      accumulatedTerms: '',
      keywords: extractKeywordsFromClass(baseClass),
      subtypes: {},
    };

    // Merge BASE_TYPE_KEYWORDS into typeInfo.keywords (de-duplicated per language)
    const baseKeywords = BASE_TYPE_KEYWORDS[code];
    if (baseKeywords) {
      for (const [lang, terms] of Object.entries(baseKeywords)) {
        typeInfo.keywords[lang] = [...new Set([...(typeInfo.keywords[lang] || []), ...terms])];
      }
    }

    // Look for corresponding Types file (subtypes)
    const subtypesFilePath = join(SCHEMA_DIR, file.replace('Type.yaml', 'Types.yaml'));

    if (existsSync(subtypesFilePath)) {
      // basename() instead of splitting on '/' so the log is also correct on
      // platforms that use a different path separator.
      console.log(` 📂 Processing subtypes from ${basename(subtypesFilePath)}`);
      const subtypesYaml = parseYamlFile(subtypesFilePath);

      if (subtypesYaml) {
        for (const subclass of extractClassesFromYaml(subtypesYaml)) {
          const subtypeName = toUpperSnakeCase(subclass.className);
          const subtypeKeywords = extractKeywordsFromClass(subclass);

          typeInfo.subtypes[subtypeName] = {
            className: subclass.className,
            wikidata: subclass.wikidataEntity,
            accumulatedTerms: accumulateTerms(subtypeKeywords),
            keywords: subtypeKeywords,
          };

          // Add to term log
          for (const [lang, terms] of Object.entries(subtypeKeywords)) {
            for (const term of terms) {
              vocabulary.termLog[term] = {
                typeCode: code,
                typeName,
                subtypeName,
                wikidata: subclass.wikidataEntity,
                lang,
              };
            }
          }

          console.log(` ✓ ${subclass.className}: ${Object.values(subtypeKeywords).flat().length} terms`);
        }
      }
    }

    // Accumulate all terms for this type (base + all subtypes). The combined
    // string is split on spaces before de-duplication, so multi-word terms
    // contribute their individual words here.
    const allTypeTerms: string[] = [
      accumulateTerms(typeInfo.keywords),
      ...Object.values(typeInfo.subtypes).map(subtype => subtype.accumulatedTerms),
    ];
    typeInfo.accumulatedTerms = [...new Set(allTypeTerms.join(' ').split(' '))].join(' ');

    // Add base type keywords to term log. This runs after the subtypes, so a
    // term shared with a subtype ends up pointing at the base type.
    for (const [lang, terms] of Object.entries(typeInfo.keywords)) {
      for (const term of terms) {
        vocabulary.termLog[term] = {
          typeCode: code,
          typeName,
          lang,
        };
      }
    }

    vocabulary.institutionTypes[code] = typeInfo;
    console.log(` ✅ ${typeName}: ${Object.keys(typeInfo.subtypes).length} subtypes, ${typeInfo.accumulatedTerms.split(' ').length} total terms`);
  }

  // Add BASE_TYPE_KEYWORDS to termLog (common user search terms not in LinkML schemas)
  console.log('\n📖 Adding base vocabulary keywords to termLog...');
  let baseTermCount = 0;
  for (const [code, langKeywords] of Object.entries(BASE_TYPE_KEYWORDS)) {
    // Handle subtype codes like 'M.ART_MUSEUM' vs base codes like 'M':
    // the part after the dot is undefined for a base code.
    const [baseCode, subtypeName] = code.split('.');

    const typeName = Object.entries(TYPE_FILE_TO_CODE).find(([, c]) => c === baseCode)?.[0];
    if (!typeName) continue;

    for (const [lang, terms] of Object.entries(langKeywords)) {
      for (const term of terms) {
        // Don't overwrite existing entries from schema (they have more specific info)
        if (vocabulary.termLog[term]) continue;

        const entry: TermLogEntry = {
          typeCode: baseCode,
          typeName,
          lang,
        };

        // Add subtype info if this is a subtype entry
        if (subtypeName) {
          entry.subtypeName = subtypeName;
        }

        vocabulary.termLog[term] = entry;
        baseTermCount++;
      }
    }
  }
  console.log(` ✅ Added ${baseTermCount} base vocabulary terms (including subtype keywords)`);

  // Process RecordSetTypes files
  console.log('\n📁 Processing RecordSetTypes files...');
  const recordSetTypesFiles = files.filter(f => f.endsWith('RecordSetTypes.yaml'));

  for (const file of recordSetTypesFiles) {
    const yamlData = parseYamlFile(join(SCHEMA_DIR, file));
    if (!yamlData) continue;

    for (const cls of extractClassesFromYaml(yamlData)) {
      // Skip abstract base classes: names ending in "RecordSetType" are only
      // kept when they mention Fonds, Series or Collection.
      const isConcreteKind = ['Fonds', 'Series', 'Collection'].some(kind => cls.className.includes(kind));
      if (cls.className.endsWith('RecordSetType') && !isConcreteKind) {
        continue;
      }

      const rstName = toUpperSnakeCase(cls.className);
      const keywords = extractKeywordsFromClass(cls);

      vocabulary.recordSetTypes[rstName] = {
        className: cls.className,
        accumulatedTerms: accumulateTerms(keywords),
        keywords,
      };

      // Add to term log
      for (const [lang, terms] of Object.entries(keywords)) {
        for (const term of terms) {
          vocabulary.termLog[term] = {
            typeCode: 'A', // Most record set types are archive-related
            typeName: 'ArchiveOrganizationType',
            recordSetType: rstName,
            lang,
          };
        }
      }
    }
  }

  console.log(` ✅ Extracted ${Object.keys(vocabulary.recordSetTypes).length} record set types`);

  // Generate Tier 1 embeddings (Types file level), keyed by class name.
  // Requests are issued one at a time.
  console.log('\n🧮 Generating Tier 1 embeddings (Types files)...');
  for (const typeInfo of Object.values(vocabulary.institutionTypes)) {
    const embedding = await generateEmbedding(typeInfo.accumulatedTerms, skipEmbeddings);
    vocabulary.tier1Embeddings[typeInfo.className] = embedding;
    console.log(` ✓ ${typeInfo.className}: ${embedding.length} dimensions`);
  }

  // Generate Tier 2 embeddings (individual subtypes), keyed by type code
  console.log('\n🧮 Generating Tier 2 embeddings (subtypes)...');
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    const subtypeEmbeddings: Record<string, number[]> = {};
    vocabulary.tier2Embeddings[code] = subtypeEmbeddings;

    for (const [subtypeName, subtypeInfo] of Object.entries(typeInfo.subtypes)) {
      subtypeEmbeddings[subtypeName] = await generateEmbedding(subtypeInfo.accumulatedTerms, skipEmbeddings);
    }

    console.log(` ✓ ${typeInfo.className}: ${Object.keys(typeInfo.subtypes).length} subtype embeddings`);
  }

  return vocabulary;
}
|
|
|
|
// ============================================================================
// Main Entry Point
// ============================================================================
|
|
|
|
/**
 * Script entry point: builds the vocabulary, writes it to OUTPUT_FILE as
 * pretty-printed JSON and prints a summary of what was extracted.
 *
 * @throws When the schema directory cannot be read or the output file cannot
 *         be written.
 */
async function main(): Promise<void> {
  console.log('═══════════════════════════════════════════════════════════════');
  console.log(' TypesVocabulary Extraction Script');
  console.log(' Ontology-Driven Cache Segmentation (Rule 46)');
  console.log('═══════════════════════════════════════════════════════════════\n');

  const vocabulary = await processTypeFiles();

  // Ensure output directory exists. With `recursive: true` this is a no-op
  // for an existing directory, so no separate existence check is needed.
  mkdirSync(dirname(OUTPUT_FILE), { recursive: true });

  // Write output
  writeFileSync(OUTPUT_FILE, JSON.stringify(vocabulary, null, 2));

  const institutionTypes = Object.values(vocabulary.institutionTypes);
  const subtypeCount = institutionTypes.reduce((sum, t) => sum + Object.keys(t.subtypes).length, 0);
  const tier2Count = Object.values(vocabulary.tier2Embeddings).reduce((sum, t) => sum + Object.keys(t).length, 0);

  console.log('\n═══════════════════════════════════════════════════════════════');
  console.log(' Summary');
  console.log('═══════════════════════════════════════════════════════════════');
  console.log(` 📊 Institution Types: ${institutionTypes.length}`);
  console.log(` 📊 Total Subtypes: ${subtypeCount}`);
  console.log(` 📊 Record Set Types: ${Object.keys(vocabulary.recordSetTypes).length}`);
  console.log(` 📊 Term Log Entries: ${Object.keys(vocabulary.termLog).length}`);
  console.log(` 📊 Tier 1 Embeddings: ${Object.keys(vocabulary.tier1Embeddings).length}`);
  console.log(` 📊 Tier 2 Embeddings: ${tier2Count}`);
  console.log(`\n ✅ Output written to: ${OUTPUT_FILE}`);
  console.log('═══════════════════════════════════════════════════════════════\n');
}

// Report a failure AND signal it through the exit code, so that shells and CI
// pipelines calling this script can tell that no vocabulary was produced.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|