Implements Rule 46: Ontology-Driven Cache Segmentation

Semantic Cache Enhancements:
- Add institutionSubtype, recordSetType, wikidataEntity to ExtractedEntities
- Add extractionMethod field to track vocabulary vs regex extraction
- Implement async extractEntitiesWithVocabulary() using term log
- Maintain sync regex fallback for cache key generation (<5ms)

Build Pipeline:
- Add prebuild hook to regenerate types-vocab.json from LinkML schemas
- Extract vocabulary from *Type.yaml and *Types.yaml schema files
- Generate GLAMORCUBESFIXPHDNT code mappings automatically

New Script: scripts/extract-types-vocab.ts
- Extracts vocabulary from LinkML schemas
- Supports --skip-embeddings flag for faster builds
- Outputs to apps/archief-assistent/public/types-vocab.json

This enables richer cache segmentation using ontology-derived subtypes
(e.g., 'MUNICIPAL_ARCHIVE', 'ART_MUSEUM') instead of just top-level
GLAMORCUBESFIXPHDNT codes.
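
As context for reviewers, a minimal sketch of the consumer side. Names and
signatures here are assumptions, not this repo's actual cache API; it only
illustrates how a term-log hit yields the new segmentation fields, with the
sync regex path as fallback:

    type TermLog = Record<string, {
      typeCode: string;
      subtypeName?: string;
      recordSetType?: string;
      wikidata?: string;
    }>;

    interface ExtractedEntities {
      institutionType?: string;     // GLAMORCUBESFIXPHDNT code, e.g. 'M'
      institutionSubtype?: string;  // ontology-derived, e.g. 'ART_MUSEUM'
      recordSetType?: string;
      wikidataEntity?: string;
      extractionMethod: 'vocabulary' | 'regex';
    }

    async function extractEntitiesWithVocabulary(
      query: string,
      termLog: TermLog,  // loaded once from public/types-vocab.json
    ): Promise<ExtractedEntities> {
      // Single-token lookup for brevity; multi-word terms such as
      // 'municipal archive' would need n-gram matching in practice.
      for (const token of query.toLowerCase().split(/\s+/)) {
        const hit = termLog[token];
        if (hit) {
          return {
            institutionType: hit.typeCode,
            institutionSubtype: hit.subtypeName,
            recordSetType: hit.recordSetType,
            wikidataEntity: hit.wikidata,
            extractionMethod: 'vocabulary',
          };
        }
      }
      // No vocabulary hit: callers fall back to the sync regex extractor (<5ms).
      return { extractionMethod: 'regex' };
    }
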
#!/usr/bin/env node
/**
 * extract-types-vocab.ts
 *
 * Extracts vocabulary from LinkML *Type.yaml and *Types.yaml schema files
 * and generates embeddings for two-tier semantic routing.
 *
 * Output: apps/archief-assistent/public/types-vocab.json
 *
 * Usage:
 *   npx tsx scripts/extract-types-vocab.ts
 *   npx tsx scripts/extract-types-vocab.ts --skip-embeddings  # Skip embedding generation
 *
 * See: .opencode/rules/ontology-driven-cache-segmentation.md
 */

import { readFileSync, writeFileSync, readdirSync, existsSync, mkdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { parse as parseYaml } from 'yaml';

// ESM compatibility for __dirname
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ============================================================================
// Configuration
// ============================================================================

const SCHEMA_DIR = join(__dirname, '../schemas/20251121/linkml/modules/classes');
const OUTPUT_FILE = join(__dirname, '../apps/archief-assistent/public/types-vocab.json');
const EMBEDDING_API_URL = process.env.EMBEDDING_API_URL || 'http://localhost:8000/api/embed';

// GLAMORCUBESFIXPHDNT code mapping
const TYPE_FILE_TO_CODE: Record<string, string> = {
  'ArchiveOrganizationType': 'A',
  'BioCustodianType': 'B',
  'CommercialOrganizationType': 'C',
  'DigitalPlatformType': 'D',
  'EducationProviderType': 'E',
  'FeatureCustodianType': 'F',
  'GalleryType': 'G',
  'HolySacredSiteType': 'H',
  'IntangibleHeritageGroupType': 'I',
  'LibraryType': 'L',
  'MuseumType': 'M',
  'NonProfitType': 'N',
  'OfficialInstitutionType': 'O',
  'PersonalCollectionType': 'P',
  'ResearchOrganizationType': 'R',
  'HeritageSocietyType': 'S',
  'TasteScentHeritageType': 'T',
  'UnspecifiedType': 'U',
  'MixedCustodianType': 'X',
};

// ============================================================================
// Types
// ============================================================================

interface TermLogEntry {
  typeCode: string;
  typeName: string;
  subtypeName?: string;
  recordSetType?: string;
  wikidata?: string;
  lang: string;
}

interface SubtypeInfo {
  className: string;
  wikidata?: string;
  accumulatedTerms: string;
  keywords: Record<string, string[]>;
}

interface TypeInfo {
  code: string;
  className: string;
  baseWikidata?: string;
  accumulatedTerms: string;
  keywords: Record<string, string[]>;
  subtypes: Record<string, SubtypeInfo>;
}

interface RecordSetTypeInfo {
  className: string;
  accumulatedTerms: string;
  keywords: Record<string, string[]>;
}

interface TypesVocabulary {
  version: string;
  schemaVersion: string;
  embeddingModel: string;
  embeddingDimensions: number;
  tier1Embeddings: Record<string, number[]>;
  tier2Embeddings: Record<string, Record<string, number[]>>;
  termLog: Record<string, TermLogEntry>;
  institutionTypes: Record<string, TypeInfo>;
  recordSetTypes: Record<string, RecordSetTypeInfo>;
}
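
// Illustrative excerpt of the emitted types-vocab.json (field names match the
// interfaces above; the values here are made up):
//
// {
//   "schemaVersion": "20251121",
//   "embeddingModel": "paraphrase-multilingual-MiniLM-L12-v2",
//   "embeddingDimensions": 384,
//   "termLog": {
//     "gemeentearchief": {
//       "typeCode": "A", "typeName": "ArchiveOrganizationType",
//       "subtypeName": "MUNICIPAL_ARCHIVE", "lang": "nl"
//     }
//   },
//   ...
// }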

interface ParsedClass {
  className: string;
  description?: string;
  keywords?: string[];
  structuredAliases?: Array<{ literal_form: string; in_language?: string }>;
  wikidataEntity?: string;
  isSubtypeOf?: string;
}

// ============================================================================
// YAML Parsing
// ============================================================================

function parseYamlFile(filePath: string): Record<string, unknown> | null {
  try {
    const content = readFileSync(filePath, 'utf-8');
    return parseYaml(content);
  } catch (error) {
    console.warn(`Warning: Could not parse ${filePath}: ${error}`);
    return null;
  }
}

function extractClassesFromYaml(yamlData: Record<string, unknown>): ParsedClass[] {
  const classes: ParsedClass[] = [];
  const classesSection = yamlData.classes as Record<string, unknown> | undefined;

  if (!classesSection) return classes;

  for (const [className, classDef] of Object.entries(classesSection)) {
    if (typeof classDef !== 'object' || classDef === null) continue;

    const classData = classDef as Record<string, unknown>;

    // Skip abstract base classes (except the main Type class)
    if (classData.abstract === true && !className.endsWith('Type')) continue;

    const parsed: ParsedClass = {
      className,
      description: classData.description as string | undefined,
      keywords: classData.keywords as string[] | undefined,
      structuredAliases: classData.structured_aliases as Array<{ literal_form: string; in_language?: string }> | undefined,
      isSubtypeOf: classData.is_a as string | undefined,
    };

    // Extract wikidata entity from slot_usage or mappings
    const slotUsage = classData.slot_usage as Record<string, unknown> | undefined;
    if (slotUsage?.wikidata_entity) {
      const wdSlot = slotUsage.wikidata_entity as Record<string, unknown>;
      parsed.wikidataEntity = wdSlot.equals_string as string | undefined;
    }

    // Check exact_mappings for Wikidata
    const exactMappings = classData.exact_mappings as string[] | undefined;
    if (exactMappings) {
      const wdMapping = exactMappings.find(m => m.startsWith('wd:') || m.startsWith('wikidata:'));
      if (wdMapping) {
        parsed.wikidataEntity = wdMapping.replace(/^(wd:|wikidata:)/, '');
      }
    }

    // Check broad_mappings for Wikidata
    const broadMappings = classData.broad_mappings as string[] | undefined;
    if (broadMappings && !parsed.wikidataEntity) {
      const wdMapping = broadMappings.find(m => m.startsWith('wd:'));
      if (wdMapping) {
        parsed.wikidataEntity = wdMapping.replace('wd:', '');
      }
    }

    classes.push(parsed);
  }

  return classes;
}
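
// For orientation: a hypothetical (made-up) LinkML class of the shape this
// parser expects:
//
//   classes:
//     MunicipalArchive:
//       is_a: ArchiveOrganizationType
//       description: Archive operated by a municipality
//       keywords: [gemeentearchief, stadsarchief]
//       exact_mappings: [wd:Q123456]   # fictitious ID
//
// would be extracted as:
//
//   { className: 'MunicipalArchive', isSubtypeOf: 'ArchiveOrganizationType',
//     keywords: ['gemeentearchief', 'stadsarchief'], wikidataEntity: 'Q123456', ... }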

function extractKeywordsFromClass(parsedClass: ParsedClass): Record<string, string[]> {
  const keywords: Record<string, string[]> = {};

  // 1. Extract from keywords array (usually language-agnostic, assume Dutch/English)
  if (parsedClass.keywords) {
    keywords['nl'] = keywords['nl'] || [];
    keywords['en'] = keywords['en'] || [];
    for (const kw of parsedClass.keywords) {
      // Simple heuristic: the 'ij' digraph, clusters like 'sch'/'cht', and
      // doubled vowels are characteristically Dutch spellings (imperfect:
      // English words such as 'school' also match).
      const isDutch = /ij|sch|cht|aa|ee|oo|uu/i.test(kw);
      if (isDutch) {
        keywords['nl'].push(kw.toLowerCase());
      } else {
        keywords['en'].push(kw.toLowerCase());
      }
    }
  }

  // 2. Extract from structured_aliases (language-tagged)
  if (parsedClass.structuredAliases) {
    for (const alias of parsedClass.structuredAliases) {
      const lang = alias.in_language || 'en';
      keywords[lang] = keywords[lang] || [];
      keywords[lang].push(alias.literal_form.toLowerCase());
    }
  }

  // 3. Convert class name to a keyword phrase:
  // MunicipalArchive -> "municipal archive"
  const classNameWords = parsedClass.className
    .replace(/([A-Z])/g, ' $1')
    .trim()
    .toLowerCase()
    .split(/\s+/);

  keywords['en'] = keywords['en'] || [];
  keywords['en'].push(classNameWords.join(' '));

  return keywords;
}

function accumulateTerms(keywords: Record<string, string[]>): string {
  const allTerms: string[] = [];
  for (const terms of Object.values(keywords)) {
    allTerms.push(...terms);
  }
  return [...new Set(allTerms)].join(' ');
}
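
// Example: accumulateTerms({ nl: ['archief'], en: ['archive', 'archief'] })
// returns 'archief archive'; duplicates are removed across languages.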

// ============================================================================
// Embedding Generation
// ============================================================================

async function generateEmbedding(text: string, skipEmbeddings: boolean): Promise<number[]> {
  if (skipEmbeddings) {
    // Return empty placeholder
    return [];
  }

  try {
    const response = await fetch(EMBEDDING_API_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text }),
    });

    if (!response.ok) {
      console.warn(`Embedding API error: ${response.status}`);
      return [];
    }

    const data = await response.json();
    return data.embedding || [];
  } catch (error) {
    console.warn(`Embedding generation failed: ${error}`);
    return [];
  }
}
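
// Sketch of how a consumer might use the two embedding tiers for routing.
// Not part of this script: it assumes a query embedding from the same model
// and plain cosine similarity (tier 1 picks the institution type, tier 2
// narrows to a subtype within it).
function cosine(a: number[], b: number[]): number {
  let dot = 0, na = 0, nb = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    na += a[i] * a[i];
    nb += b[i] * b[i];
  }
  return dot / (Math.sqrt(na) * Math.sqrt(nb) || 1);
}

function routeQuery(queryEmbedding: number[], vocab: TypesVocabulary): { code?: string; subtype?: string } {
  // Tier 1: best-matching institution type (tier1Embeddings is keyed by class name)
  let best: { code?: string; score: number } = { score: -Infinity };
  for (const [code, info] of Object.entries(vocab.institutionTypes)) {
    const emb = vocab.tier1Embeddings[info.className];
    if (!emb?.length) continue;
    const score = cosine(queryEmbedding, emb);
    if (score > best.score) best = { code, score };
  }
  if (!best.code) return {};

  // Tier 2: best subtype within the winning type (tier2Embeddings is keyed by code)
  let subtype: string | undefined;
  let subtypeScore = -Infinity;
  for (const [name, emb] of Object.entries(vocab.tier2Embeddings[best.code] ?? {})) {
    if (!emb.length) continue;
    const score = cosine(queryEmbedding, emb);
    if (score > subtypeScore) { subtypeScore = score; subtype = name; }
  }
  return { code: best.code, subtype };
}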

// ============================================================================
// Main Processing
// ============================================================================

async function processTypeFiles(): Promise<TypesVocabulary> {
  const skipEmbeddings = process.argv.includes('--skip-embeddings');

  console.log('🔍 Scanning schema directory:', SCHEMA_DIR);
  console.log(`📊 Embedding generation: ${skipEmbeddings ? 'SKIPPED' : 'ENABLED'}`);

  const vocabulary: TypesVocabulary = {
    version: new Date().toISOString(),
    schemaVersion: '20251121',
    embeddingModel: 'paraphrase-multilingual-MiniLM-L12-v2',
    embeddingDimensions: 384,
    tier1Embeddings: {},
    tier2Embeddings: {},
    termLog: {},
    institutionTypes: {},
    recordSetTypes: {},
  };

  // Find all *Type.yaml files (base types)
  const files = readdirSync(SCHEMA_DIR);
  const typeFiles = files.filter(f => f.endsWith('Type.yaml') && !f.endsWith('Types.yaml'));
  const typesFiles = files.filter(f => f.endsWith('Types.yaml'));

  console.log(`\n📁 Found ${typeFiles.length} Type files and ${typesFiles.length} Types files`);

  // Process base Type files
  for (const file of typeFiles) {
    const typeName = file.replace('.yaml', '');
    const code = TYPE_FILE_TO_CODE[typeName];

    if (!code) {
      console.log(`  ⏭️ Skipping ${typeName} (not in GLAMORCUBESFIXPHDNT)`);
      continue;
    }

    console.log(`\n📄 Processing ${typeName} (${code})`);

    const filePath = join(SCHEMA_DIR, file);
    const yamlData = parseYamlFile(filePath);
    if (!yamlData) continue;

    const classes = extractClassesFromYaml(yamlData);
    const baseClass = classes.find(c => c.className === typeName);

    if (!baseClass) {
      console.log(`  ⚠️ No base class found in ${file}`);
      continue;
    }

    // Initialize type info
    const typeInfo: TypeInfo = {
      code,
      className: typeName,
      baseWikidata: baseClass.wikidataEntity,
      accumulatedTerms: '',
      keywords: extractKeywordsFromClass(baseClass),
      subtypes: {},
    };

    // Look for corresponding Types file (subtypes)
    const subtypesFilePath = join(SCHEMA_DIR, file.replace('Type.yaml', 'Types.yaml'));

    if (existsSync(subtypesFilePath)) {
      console.log(`  📂 Processing subtypes from ${subtypesFilePath.split('/').pop()}`);
      const subtypesYaml = parseYamlFile(subtypesFilePath);
      if (subtypesYaml) {
        const subtypeClasses = extractClassesFromYaml(subtypesYaml);

        for (const subclass of subtypeClasses) {
          // Convert CamelCase to UPPER_SNAKE_CASE
          const subtypeName = subclass.className
            .replace(/([a-z])([A-Z])/g, '$1_$2')
            .toUpperCase();
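          // e.g. 'MunicipalArchive' -> 'MUNICIPAL_ARCHIVE', the ontology-derived
          // segment key format cited in the commit message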
          const subtypeKeywords = extractKeywordsFromClass(subclass);

          const subtypeInfo: SubtypeInfo = {
            className: subclass.className,
            wikidata: subclass.wikidataEntity,
            accumulatedTerms: accumulateTerms(subtypeKeywords),
            keywords: subtypeKeywords,
          };

          typeInfo.subtypes[subtypeName] = subtypeInfo;

          // Add to term log
          for (const [lang, terms] of Object.entries(subtypeKeywords)) {
            for (const term of terms) {
              vocabulary.termLog[term] = {
                typeCode: code,
                typeName,
                subtypeName,
                wikidata: subclass.wikidataEntity,
                lang,
              };
            }
          }

          console.log(`    ✓ ${subclass.className}: ${Object.values(subtypeKeywords).flat().length} terms`);
        }
      }
    }

    // Accumulate all terms for this type (base + all subtypes)
    const allTypeTerms: string[] = [];
    allTypeTerms.push(accumulateTerms(typeInfo.keywords));
    for (const subtype of Object.values(typeInfo.subtypes)) {
      allTypeTerms.push(subtype.accumulatedTerms);
    }
    typeInfo.accumulatedTerms = [...new Set(allTypeTerms.join(' ').split(' '))].join(' ');

    // Add base type keywords to term log
    for (const [lang, terms] of Object.entries(typeInfo.keywords)) {
      for (const term of terms) {
        vocabulary.termLog[term] = {
          typeCode: code,
          typeName,
          lang,
        };
      }
    }

    vocabulary.institutionTypes[code] = typeInfo;
    console.log(`  ✅ ${typeName}: ${Object.keys(typeInfo.subtypes).length} subtypes, ${typeInfo.accumulatedTerms.split(' ').length} total terms`);
  }

  // Process RecordSetTypes files
  console.log('\n📁 Processing RecordSetTypes files...');
  const recordSetTypesFiles = files.filter(f => f.endsWith('RecordSetTypes.yaml'));

  for (const file of recordSetTypesFiles) {
    const filePath = join(SCHEMA_DIR, file);
    const yamlData = parseYamlFile(filePath);
    if (!yamlData) continue;

    const classes = extractClassesFromYaml(yamlData);

    for (const cls of classes) {
      // Skip abstract base classes
      if (cls.className.endsWith('RecordSetType') && !cls.className.includes('Fonds') &&
          !cls.className.includes('Series') && !cls.className.includes('Collection')) {
        continue;
      }

      // Convert CamelCase to UPPER_SNAKE_CASE
      const rstName = cls.className
        .replace(/([a-z])([A-Z])/g, '$1_$2')
        .toUpperCase();
      const keywords = extractKeywordsFromClass(cls);

      const rstInfo: RecordSetTypeInfo = {
        className: cls.className,
        accumulatedTerms: accumulateTerms(keywords),
        keywords,
      };

      vocabulary.recordSetTypes[rstName] = rstInfo;

      // Add to term log
      for (const [lang, terms] of Object.entries(keywords)) {
        for (const term of terms) {
          vocabulary.termLog[term] = {
            typeCode: 'A', // Most record set types are archive-related
            typeName: 'ArchiveOrganizationType',
            recordSetType: rstName,
            lang,
          };
        }
      }
    }
  }

  console.log(`  ✅ Extracted ${Object.keys(vocabulary.recordSetTypes).length} record set types`);

  // Generate Tier 1 embeddings (Types file level)
  console.log('\n🧮 Generating Tier 1 embeddings (Types files)...');
  for (const typeInfo of Object.values(vocabulary.institutionTypes)) {
    const embedding = await generateEmbedding(typeInfo.accumulatedTerms, skipEmbeddings);
    vocabulary.tier1Embeddings[typeInfo.className] = embedding;
    console.log(`  ✓ ${typeInfo.className}: ${embedding.length} dimensions`);
  }

  // Generate Tier 2 embeddings (individual subtypes)
  console.log('\n🧮 Generating Tier 2 embeddings (subtypes)...');
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    vocabulary.tier2Embeddings[code] = {};

    for (const [subtypeName, subtypeInfo] of Object.entries(typeInfo.subtypes)) {
      const embedding = await generateEmbedding(subtypeInfo.accumulatedTerms, skipEmbeddings);
      vocabulary.tier2Embeddings[code][subtypeName] = embedding;
    }

    console.log(`  ✓ ${typeInfo.className}: ${Object.keys(typeInfo.subtypes).length} subtype embeddings`);
  }

  return vocabulary;
}

// ============================================================================
// Main Entry Point
// ============================================================================

async function main() {
  console.log('═══════════════════════════════════════════════════════════════');
  console.log('  TypesVocabulary Extraction Script');
  console.log('  Ontology-Driven Cache Segmentation (Rule 46)');
  console.log('═══════════════════════════════════════════════════════════════\n');

  const vocabulary = await processTypeFiles();

  // Ensure output directory exists
  const outputDir = dirname(OUTPUT_FILE);
  if (!existsSync(outputDir)) {
    mkdirSync(outputDir, { recursive: true });
  }

  // Write output
  writeFileSync(OUTPUT_FILE, JSON.stringify(vocabulary, null, 2));

  console.log('\n═══════════════════════════════════════════════════════════════');
  console.log('  Summary');
  console.log('═══════════════════════════════════════════════════════════════');
  console.log(`  📊 Institution Types: ${Object.keys(vocabulary.institutionTypes).length}`);
  console.log(`  📊 Total Subtypes: ${Object.values(vocabulary.institutionTypes).reduce((sum, t) => sum + Object.keys(t.subtypes).length, 0)}`);
  console.log(`  📊 Record Set Types: ${Object.keys(vocabulary.recordSetTypes).length}`);
  console.log(`  📊 Term Log Entries: ${Object.keys(vocabulary.termLog).length}`);
  console.log(`  📊 Tier 1 Embeddings: ${Object.keys(vocabulary.tier1Embeddings).length}`);
  console.log(`  📊 Tier 2 Embeddings: ${Object.values(vocabulary.tier2Embeddings).reduce((sum, t) => sum + Object.keys(t).length, 0)}`);
  console.log(`\n  ✅ Output written to: ${OUTPUT_FILE}`);
  console.log('═══════════════════════════════════════════════════════════════\n');
}
main().catch(console.error);
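
// Note: the commit message wires this script into the build via a prebuild
// hook. A plausible package.json wiring (assumed here, not shown in this file):
//
//   "scripts": {
//     "prebuild": "tsx scripts/extract-types-vocab.ts",
//     ...
//   }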