diff --git a/frontend/src/lib/ontology/ontology-loader.ts b/frontend/src/lib/ontology/ontology-loader.ts new file mode 100644 index 0000000000..5272e8a47b --- /dev/null +++ b/frontend/src/lib/ontology/ontology-loader.ts @@ -0,0 +1,1476 @@ +/** + * Ontology Loader Service + * + * Parses RDF/OWL/TTL ontology files and extracts: + * - Classes (rdfs:Class, owl:Class) + * - Properties (rdf:Property, owl:ObjectProperty, owl:DatatypeProperty) + * - Metadata (labels, descriptions, comments) + * - Hierarchy (subClassOf, subPropertyOf) + * + * ## Ontology Classification (Guarino 1998) + * + * This loader uses Guarino's canonical ontology classification by generality level: + * + * 1. **Top-Level (Upper/Foundational)**: Domain-independent ontologies providing + * very general concepts (space, time, object, event, agent). Examples: PROV-O, + * SKOS, Dublin Core, FOAF, ORG. + * + * 2. **Domain**: Ontologies specific to a knowledge domain, specializing top-level + * concepts. Examples: CIDOC-CRM (cultural heritage), BIBFRAME (libraries), + * RiC-O (archives), FIBO (finance). + * + * 3. **Utility**: Cross-cutting modules that provide specialized capabilities + * composable with other ontologies. Examples: GEO (coordinates), TIME + * (temporal), VCard (contacts). + * + * 4. **Application**: Application profiles combining multiple ontologies for + * specific use cases. Most specialized, narrowest scope. Examples: CPOV + * (EU public orgs), DCAT-AP, Heritage Custodian Ontology. + * + * 5. **Mapping**: Alignment files providing crosswalks between ontologies + * (owl:equivalentClass, skos:exactMatch). Not ontologies per se, but + * interoperability artifacts. + * + * Reference: Guarino, N. (1998). 
"Formal Ontology in Information Systems" + * See also: https://bfmartins.gitlab.io/o4oa/content/classification.html + */ + +/** + * Ontology category based on Guarino's classification by generality level + */ +export type OntologyCategory = + | 'top-level' // Upper/foundational - domain-independent (PROV-O, SKOS, Dublin Core) + | 'domain' // Domain-specific (CIDOC-CRM, BIBFRAME, RiC-O) + | 'utility' // Cross-cutting modules (GEO, TIME, VCard) + | 'application' // Application profiles (CPOV, Heritage Custodian) + | 'mapping'; // Alignment/crosswalk files + +// Ontology file metadata +export interface OntologyFile { + name: string; + path: string; + format: 'ttl' | 'rdf' | 'owl' | 'jsonld' | 'csv'; + category: OntologyCategory; + /** Brief description of the ontology's purpose */ + description?: string; + /** Namespace URI(s) for this ontology - used for URI-to-ontology lookup */ + namespaces?: string[]; +} + +/** + * Language-tagged literal value + * Stores text with optional language tag (e.g., "Dataset"@en) + */ +export interface LangString { + value: string; + lang?: string; +} + +/** + * Get the preferred label from a set of language-tagged labels + * Priority: en > first available > empty string + */ +export function getPreferredLabel(labels: LangString[]): string { + if (labels.length === 0) return ''; + const enLabel = labels.find(l => l.lang === 'en'); + if (enLabel) return enLabel.value; + // Return first label that has no lang tag, or first label + const noLang = labels.find(l => !l.lang); + return noLang?.value || labels[0].value; +} + +/** + * Get translations (all non-English labels) + */ +export function getTranslations(labels: LangString[]): LangString[] { + return labels.filter(l => l.lang && l.lang !== 'en'); +} + +// Parsed ontology class +export interface OntologyClass { + uri: string; + /** Primary label (English preferred) */ + label: string; + /** All labels with language tags */ + labels?: LangString[]; + /** Primary description (English 
preferred) */ + description?: string; + /** All descriptions with language tags */ + descriptions?: LangString[]; + /** Primary comment (English preferred) */ + comment?: string; + /** All comments with language tags */ + comments?: LangString[]; + subClassOf?: string[]; + equivalentClass?: string[]; + disjointWith?: string[]; + properties?: string[]; + isAbstract?: boolean; + deprecated?: boolean; +} + +// Parsed ontology property +export interface OntologyProperty { + uri: string; + /** Primary label (English preferred) */ + label: string; + /** All labels with language tags */ + labels?: LangString[]; + /** Primary description (English preferred) */ + description?: string; + /** All descriptions with language tags */ + descriptions?: LangString[]; + /** Primary comment (English preferred) */ + comment?: string; + /** All comments with language tags */ + comments?: LangString[]; + domain?: string[]; + range?: string[]; + subPropertyOf?: string[]; + equivalentProperty?: string[]; + propertyType: 'object' | 'datatype' | 'annotation' | 'unknown'; + functional?: boolean; + inverseFunctional?: boolean; + transitive?: boolean; + symmetric?: boolean; + deprecated?: boolean; +} + +// Parsed ontology individual/instance +export interface OntologyIndividual { + uri: string; + label: string; + labels?: LangString[]; + types: string[]; + description?: string; + descriptions?: LangString[]; +} + +// Complete parsed ontology +export interface ParsedOntology { + uri?: string; + title?: string; + description?: string; + version?: string; + creators?: string[]; + license?: string; + prefixes: Record; + classes: OntologyClass[]; + properties: OntologyProperty[]; + individuals: OntologyIndividual[]; + imports?: string[]; + rawContent?: string; +} + +// Well-known namespaces +const NAMESPACES: Record = { + rdf: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', + rdfs: 'http://www.w3.org/2000/01/rdf-schema#', + owl: 'http://www.w3.org/2002/07/owl#', + xsd: 
'http://www.w3.org/2001/XMLSchema#', + dc: 'http://purl.org/dc/elements/1.1/', + dcterms: 'http://purl.org/dc/terms/', + skos: 'http://www.w3.org/2004/02/skos/core#', + foaf: 'http://xmlns.com/foaf/0.1/', + schema: 'http://schema.org/', + prov: 'http://www.w3.org/ns/prov#', + org: 'http://www.w3.org/ns/org#', +}; + +// Class type URIs +const CLASS_TYPES = [ + 'http://www.w3.org/2000/01/rdf-schema#Class', + 'http://www.w3.org/2002/07/owl#Class', +]; + +// Property type URIs +const PROPERTY_TYPES = [ + 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property', + 'http://www.w3.org/2002/07/owl#ObjectProperty', + 'http://www.w3.org/2002/07/owl#DatatypeProperty', + 'http://www.w3.org/2002/07/owl#AnnotationProperty', +]; + +/** + * List of known ontology files with metadata + * + * Categorized using Guarino's classification by generality level. + */ +export const ONTOLOGY_FILES: OntologyFile[] = [ + // ============================================================================ + // TOP-LEVEL ONTOLOGIES (Upper/Foundational) + // Domain-independent, provide very general concepts applicable across all domains + // ============================================================================ + { + name: 'PROV-O', + path: 'prov-o.rdf', + format: 'ttl', // Note: file is actually Turtle despite .rdf extension + category: 'top-level', + description: 'W3C Provenance Ontology - who did what, when, how', + namespaces: ['http://www.w3.org/ns/prov#'] + }, + // Note: prov.ttl removed as duplicate of prov-o.rdf (same PROV namespace, fuller version) + { + name: 'SKOS', + path: 'skos.rdf', + format: 'rdf', + category: 'top-level', + description: 'Simple Knowledge Organization System - concepts, labels, hierarchies', + namespaces: ['http://www.w3.org/2004/02/skos/core#'] + }, + { + name: 'FOAF', + path: 'foaf.ttl', + format: 'ttl', + category: 'top-level', + description: 'Friend of a Friend - people, organizations, social networks', + namespaces: ['http://xmlns.com/foaf/0.1/'] + }, + { + 
name: 'Dublin Core Elements', + path: 'dublin_core_elements.rdf', + format: 'rdf', + category: 'top-level', + description: 'Core metadata elements (title, creator, date, subject)', + namespaces: ['http://purl.org/dc/elements/1.1/', 'http://purl.org/dc/terms/'] + }, + { + name: 'Schema.org', + path: 'schemaorg.owl', + format: 'owl', + category: 'top-level', + description: 'Web-scale structured data vocabulary', + namespaces: ['http://schema.org/', 'https://schema.org/'] + }, + { + name: 'ORG Ontology', + path: 'org.rdf', + format: 'rdf', + category: 'top-level', + description: 'W3C organizational structures', + namespaces: ['http://www.w3.org/ns/org#'] + }, + { + name: 'DCAT 3', + path: 'dcat3.ttl', + format: 'ttl', + category: 'top-level', + description: 'Data Catalog Vocabulary - datasets and data services', + namespaces: ['http://www.w3.org/ns/dcat#'] + }, + + // ============================================================================ + // DOMAIN ONTOLOGIES + // Specific to a knowledge domain, specialize top-level concepts + // ============================================================================ + { + name: 'CIDOC-CRM v7.1.3', + path: 'CIDOC_CRM_v7.1.3.rdf', + format: 'rdf', + category: 'domain', + description: 'ISO 21127 - Cultural heritage events, actors, objects', + namespaces: ['http://www.cidoc-crm.org/cidoc-crm/', 'https://www.cidoc-crm.org/cidoc-crm/'] + }, + { + name: 'RiC-O 1.1', + path: 'RiC-O_1-1.rdf', + format: 'rdf', + category: 'domain', + description: 'Records in Contexts - ICA archival description standard', + namespaces: ['https://www.ica.org/standards/RiC/ontology#'] + }, + { + name: 'BIBFRAME', + path: 'bibframe.rdf', + format: 'rdf', + category: 'domain', + description: 'Library of Congress bibliographic description', + namespaces: ['http://id.loc.gov/ontologies/bibframe/'] + }, + { + name: 'PREMIS 3', + path: 'premis3.owl', + format: 'owl', + category: 'domain', + description: 'Digital preservation metadata', + namespaces: 
['http://www.loc.gov/premis/rdf/v3/'] + }, + { + name: 'CRMgeo', + path: 'CRMgeo_v1_2.rdfs', + format: 'rdf', + category: 'domain', + description: 'CIDOC-CRM extension for spatio-temporal modeling', + namespaces: ['http://www.ics.forth.gr/isl/CRMgeo/'] + }, + { + name: 'PiCo', + path: 'pico.ttl', + format: 'ttl', + category: 'domain', + description: 'Prosopography - person observations, staff roles', + namespaces: ['https://pico.hypotheses.org/the-pico-ontology#'] + }, + { + name: 'TOOI', + path: 'tooiont.ttl', + format: 'ttl', + category: 'domain', + description: 'Dutch government organizational ontology', + namespaces: ['https://identifier.overheid.nl/tooi/def/ont/'] + }, + { + name: 'OASIS', + path: 'oasis.owl', + format: 'owl', + category: 'domain', + description: 'Organization for the Advancement of Structured Information Standards', + namespaces: ['http://www.oasis-open.org/'] + }, + { + name: 'OMRSE', + path: 'omrse.owl', + format: 'owl', + category: 'domain', + description: 'Ontology of Medically Related Social Entities', + namespaces: ['http://purl.obolibrary.org/obo/OMRSE_'] + }, + { + name: 'ERA Ontology', + path: 'era_ontology.ttl', + format: 'ttl', + category: 'domain', + description: 'European Railway Agency ontology', + namespaces: ['http://data.europa.eu/949/'] + }, + { + name: 'EBG Ontology', + path: 'ebg-ontology.ttl', + format: 'ttl', + category: 'domain', + description: 'European Business Graph ontology', + namespaces: ['http://data.businessgraph.io/ontology#'] + }, + { + name: 'FIBO', + path: 'fibo.rdf', + format: 'rdf', + category: 'domain', + description: 'Financial Industry Business Ontology', + namespaces: ['https://spec.edmcouncil.org/fibo/ontology/'] + }, + { + name: 'GLEIF Base', + path: 'gleif_base.ttl', + format: 'ttl', + category: 'domain', + description: 'Global Legal Entity Identifier Foundation - base ontology', + namespaces: ['https://www.gleif.org/ontology/Base/'] + }, + { + name: 'GLEIF L1', + path: 'gleif_l1.ttl', + format: 
'ttl', + category: 'domain', + description: 'GLEIF Level 1 - LEI data', + namespaces: ['https://www.gleif.org/ontology/L1/'] + }, + { + name: 'GLEIF L2', + path: 'gleif_l2.ttl', + format: 'ttl', + category: 'domain', + description: 'GLEIF Level 2 - relationship data', + namespaces: ['https://www.gleif.org/ontology/L2/'] + }, + { + name: 'GLEIF Legal Form', + path: 'gleif_legal_form.ttl', + format: 'ttl', + category: 'domain', + description: 'GLEIF Entity Legal Forms vocabulary', + namespaces: ['https://www.gleif.org/ontology/EntityLegalForm/'] + }, + { + name: 'GLEIF RA', + path: 'gleif_ra.ttl', + format: 'ttl', + category: 'domain', + description: 'GLEIF Registration Authorities', + namespaces: ['https://www.gleif.org/ontology/RegistrationAuthority/'] + }, + + // ============================================================================ + // UTILITY ONTOLOGIES (Cross-cutting modules) + // Specialized capabilities composable with other ontologies + // ============================================================================ + { + name: 'GEO', + path: 'geo.ttl', + format: 'ttl', + category: 'utility', + description: 'WGS84 geographic coordinates', + namespaces: ['http://www.w3.org/2003/01/geo/wgs84_pos#'] + }, + { + name: 'TIME', + path: 'time.rdf', + format: 'rdf', + category: 'utility', + description: 'OWL-Time - temporal intervals, instants, durations', + namespaces: ['http://www.w3.org/2006/time#'] + }, + { + name: 'VCard', + path: 'vcard.rdf', + format: 'rdf', + category: 'utility', + description: 'Contact information (addresses, phones, emails)', + namespaces: ['http://www.w3.org/2006/vcard/ns#'] + }, + { + name: 'PAV', + path: 'pav.rdf', + format: 'rdf', + category: 'utility', + description: 'Provenance, Authoring and Versioning', + namespaces: ['http://purl.org/pav/'] + }, + { + name: 'DOAP', + path: 'doap.rdf', + format: 'rdf', + category: 'utility', + description: 'Description of a Project - software project metadata', + namespaces: 
['http://usefulinc.com/ns/doap#'] + }, + { + name: 'Hydra', + path: 'hydra_cg.jsonld', + format: 'jsonld', + category: 'utility', + description: 'Hypermedia-driven Web APIs vocabulary', + namespaces: ['http://www.w3.org/ns/hydra/core#'] + }, + { + name: 'RegOrg', + path: 'regorg.ttl', + format: 'ttl', + category: 'utility', + description: 'Registered organizations vocabulary', + namespaces: ['http://www.w3.org/ns/regorg#'] + }, + + // ============================================================================ + // APPLICATION ONTOLOGIES (Application Profiles) + // Combine multiple ontologies for specific use cases - most specialized + // ============================================================================ + { + name: 'CPOV (Core Public Org)', + path: 'core-public-organisation-ap.ttl', + format: 'ttl', + category: 'application', + description: 'EU application profile for public sector organizations', + namespaces: ['http://data.europa.eu/m8g/'] + }, + // Note: CPOV JSON-LD removed - use TTL version above (same content, better parsing) + // Heritage Custodian Ontology - this project's application profile + // Combines CIDOC-CRM, PiCo, RiC-O, PROV-O, CPOV for heritage institution modeling + // { + // name: 'Heritage Custodian', + // path: 'schemas/20251121/rdf/custodian_multi_aspect.owl.ttl', + // format: 'ttl', + // category: 'application', + // description: 'GLAM heritage custodian institutions - multi-aspect temporal modeling' + // }, + + // ============================================================================ + // MAPPING FILES (Alignments/Crosswalks) + // Not ontologies per se - provide equivalence mappings between ontologies + // ============================================================================ + // Note: dbpedia_ontology.owl removed - file contains only placeholder text + // The DBpedia ontology is available at: https://dbpedia.org/ontology/ + { + name: 'DBpedia Classes Sample', + path: 'dbpedia_classes_sample.ttl', + format: 'ttl', + 
category: 'mapping', + description: 'Sample of DBpedia class hierarchy' + }, + { + name: 'DBpedia Heritage Classes', + path: 'dbpedia_heritage_classes.ttl', + format: 'ttl', + category: 'mapping', + description: 'DBpedia classes relevant to cultural heritage' + }, + { + name: 'DBpedia-Wikidata Mappings', + path: 'dbpedia_wikidata_mappings.ttl', + format: 'ttl', + category: 'mapping', + description: 'Equivalence mappings between DBpedia and Wikidata' + }, + { + name: 'WOD Thing', + path: 'wod_thing.ttl', + format: 'ttl', + category: 'mapping', + description: 'Web of Data Thing alignments' + }, +]; + +/** + * Load raw ontology content from file + */ +export async function loadOntologyRaw(path: string): Promise { + const response = await fetch(`/ontology/${path}`); + if (!response.ok) { + throw new Error(`Failed to load ontology: ${path}`); + } + return response.text(); +} + +/** + * Parse ontology content based on format + */ +export async function loadOntology(path: string): Promise { + const content = await loadOntologyRaw(path); + const format = detectFormat(content, path); + + switch (format) { + case 'ttl': + return parseTurtleOntology(content); + case 'rdf': + case 'owl': + return parseRdfXmlOntology(content); + case 'jsonld': + return parseJsonLdOntology(content); + default: + throw new Error(`Unsupported format: ${format}`); + } +} + +/** + * Detect format from content (overrides extension if needed) + * Some .rdf files are actually Turtle format + */ +function detectFormat(content: string, path: string): string { + const ext = path.split('.').pop()?.toLowerCase() || ''; + + // First check if content starts with JSON + const trimmedContent = content.trim(); + if (trimmedContent.startsWith('{') || trimmedContent.startsWith('[')) { + return 'jsonld'; + } + + // Check if content looks like Turtle (has @prefix or PREFIX) + // This catches .rdf files that are actually Turtle format + if (trimmedContent.startsWith('@prefix') || + trimmedContent.startsWith('PREFIX') 
|| + trimmedContent.startsWith('@base') || + trimmedContent.startsWith('# baseURI') || + trimmedContent.startsWith('# generated from')) { + return 'ttl'; + } + + // Check if content starts with XML declaration or RDF root element + if (trimmedContent.startsWith(' = { ...NAMESPACES }; + const classes: Map = new Map(); + const properties: Map = new Map(); + const individuals: Map = new Map(); + + const lines = content.split('\n'); + let currentSubject: string | null = null; + let currentTriples: Array<{ predicate: string; object: string }> = []; + let lastPredicate: string | null = null; // Track last predicate for comma continuations + let blankNodeDepth = 0; // Track depth of blank node blocks to skip + + // First pass: extract prefixes + for (const line of lines) { + const trimmed = line.trim(); + if (trimmed.startsWith('@prefix') || trimmed.startsWith('PREFIX')) { + const match = trimmed.match(/@?prefix\s+(\w*):\s*<([^>]+)>/i); + if (match) { + prefixes[match[1] || ''] = match[2]; + } + } + } + + // Second pass: parse triples + for (const line of lines) { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith('#') || trimmed.startsWith('@prefix') || trimmed.startsWith('PREFIX')) { + continue; + } + + // Track blank node depth across lines + // Count opening and closing brackets (outside quotes) + let inQuotes = false; + for (const char of trimmed) { + if (char === '"' || char === "'") { + inQuotes = !inQuotes; + } else if (!inQuotes) { + if (char === '[') blankNodeDepth++; + else if (char === ']') blankNodeDepth--; + } + } + + // Skip lines that are entirely within a blank node block + // (but process lines that start or end a blank node block for proper depth tracking) + if (blankNodeDepth > 0 && !trimmed.includes('[') && !trimmed.includes(']')) { + continue; + } + // Also skip lines that only close a blank node + if (trimmed === ']' || trimmed === '] ;' || trimmed === '] ,' || trimmed === '] .') { + continue; + } + + // Handle continuation lines 
(start with predicate after whitespace in original) + // These lines start with 'a ' (for rdf:type) or a predicate term (like rdfs:label) + const isIndentedLine = line.length > 0 && /^\s+/.test(line) && !trimmed.startsWith(';') && !trimmed.startsWith(','); + + // Handle subject declaration + if (!trimmed.startsWith(';') && !trimmed.startsWith(',') && !isIndentedLine) { + // Process previous subject if exists + if (currentSubject && currentTriples.length > 0) { + processSubject(currentSubject, currentTriples, prefixes, classes, properties, individuals); + } + + // Start new subject + const parts = splitTurtleLine(trimmed, prefixes); + if (parts.length >= 3) { + // Full triple on one line: subject predicate object + currentSubject = expandUri(parts[0], prefixes); + // Handle 'a' shorthand for rdf:type + const predicate = parts[1] === 'a' ? NAMESPACES.rdf + 'type' : expandUri(parts[1], prefixes); + lastPredicate = predicate; // Track for comma continuations + currentTriples = [{ predicate, object: parts.slice(2).join(' ') }]; + } else if (parts.length === 1 && !trimmed.endsWith('.')) { + // Subject alone on a line (DCAT3 style): dcat:Catalog + currentSubject = expandUri(parts[0], prefixes); + currentTriples = []; + lastPredicate = null; + } + } else if (trimmed.startsWith(';') && currentSubject) { + // Continuation with semicolon: additional predicate-object for same subject + const parts = splitTurtleLine(trimmed.substring(1).trim(), prefixes); + if (parts.length >= 2) { + // Handle 'a' shorthand for rdf:type + const predicate = parts[0] === 'a' ? 
NAMESPACES.rdf + 'type' : expandUri(parts[0], prefixes); + lastPredicate = predicate; // Track for comma continuations + currentTriples.push({ predicate, object: parts.slice(1).join(' ') }); + } + } else if (isIndentedLine && currentSubject) { + // Indented line with predicate-object (DCAT3 style): " a rdfs:Class ;" + // OR comma-continuation with additional object: " owl:ObjectProperty;" + const parts = splitTurtleLine(trimmed, prefixes); + if (parts.length >= 2) { + // Handle 'a' shorthand for rdf:type + const predicate = parts[0] === 'a' ? NAMESPACES.rdf + 'type' : expandUri(parts[0], prefixes); + lastPredicate = predicate; // Track for comma continuations + currentTriples.push({ predicate, object: parts.slice(1).join(' ') }); + } else if (parts.length === 1 && lastPredicate) { + // Single value on indented line - this is a comma-continuation + // Example: "owl:ObjectProperty;" after "a rdf:Property," + // The previous line ended with comma, so this value belongs to lastPredicate + currentTriples.push({ predicate: lastPredicate, object: parts[0] }); + } + } + } + + // Process last subject + if (currentSubject && currentTriples.length > 0) { + processSubject(currentSubject, currentTriples, prefixes, classes, properties, individuals); + } + + return { + prefixes, + classes: Array.from(classes.values()), + properties: Array.from(properties.values()), + individuals: Array.from(individuals.values()), + rawContent: content, + }; +} + +/** + * Split a Turtle line into parts, handling quoted strings and blank nodes + */ +function splitTurtleLine(line: string, _prefixes: Record): string[] { + const parts: string[] = []; + let current = ''; + let inQuotes = false; + let quoteChar = ''; + let bracketDepth = 0; // Track blank node depth + + for (let i = 0; i < line.length; i++) { + const char = line[i]; + + // Handle blank node brackets - skip content inside [...] 
blocks + if (!inQuotes && char === '[') { + // If we have content before the bracket, push it + if (current) { + parts.push(current); + current = ''; + } + bracketDepth++; + continue; + } + if (!inQuotes && char === ']') { + bracketDepth--; + continue; + } + + // Skip everything inside blank node brackets + if (bracketDepth > 0) { + continue; + } + + if (!inQuotes && (char === '"' || char === "'")) { + inQuotes = true; + quoteChar = char; + current += char; + } else if (inQuotes && char === quoteChar && line[i - 1] !== '\\') { + inQuotes = false; + current += char; + } else if (!inQuotes && /\s/.test(char)) { + if (current) { + parts.push(current); + current = ''; + } + } else if (!inQuotes && (char === '.' || char === ';' || char === ',')) { + if (current) { + parts.push(current); + current = ''; + } + } else { + current += char; + } + } + + if (current) { + parts.push(current); + } + + return parts; +} + +/** + * Process a subject and its triples + */ +function processSubject( + subject: string, + triples: Array<{ predicate: string; object: string }>, + prefixes: Record, + classes: Map, + properties: Map, + individuals: Map +): void { + const types: string[] = []; + const labels: LangString[] = []; + const descriptions: LangString[] = []; + const comments: LangString[] = []; + const subClassOf: string[] = []; + const subPropertyOf: string[] = []; + const domain: string[] = []; + const range: string[] = []; + + for (const triple of triples) { + const { predicate, object } = triple; + const expandedObject = expandUri(object, prefixes); + + if (predicate === NAMESPACES.rdf + 'type') { + types.push(expandedObject); + } else if (predicate === NAMESPACES.rdfs + 'label') { + const langStr = extractLiteralWithLang(object); + if (langStr) labels.push(langStr); + } else if (predicate === NAMESPACES.rdfs + 'comment') { + const langStr = extractLiteralWithLang(object); + if (langStr) comments.push(langStr); + } else if (predicate === NAMESPACES.dcterms + 'description' || 
predicate === NAMESPACES.dc + 'description') { + const langStr = extractLiteralWithLang(object); + if (langStr) descriptions.push(langStr); + } else if (predicate === NAMESPACES.skos + 'definition') { + const langStr = extractLiteralWithLang(object); + if (langStr) descriptions.push(langStr); + } else if (predicate === NAMESPACES.rdfs + 'subClassOf') { + subClassOf.push(expandedObject); + } else if (predicate === NAMESPACES.rdfs + 'subPropertyOf') { + subPropertyOf.push(expandedObject); + } else if (predicate === NAMESPACES.rdfs + 'domain') { + domain.push(expandedObject); + } else if (predicate === NAMESPACES.rdfs + 'range') { + range.push(expandedObject); + } + } + + // Determine if this is a class, property, or individual + const isClass = types.some(t => CLASS_TYPES.includes(t)); + const isProperty = types.some(t => PROPERTY_TYPES.includes(t)); + + // Get preferred values (English or first available) + const label = getPreferredLabel(labels) || getLocalName(subject); + const description = getPreferredLabel(descriptions); + const comment = getPreferredLabel(comments); + + if (isClass) { + classes.set(subject, { + uri: subject, + label, + labels: labels.length > 0 ? labels : undefined, + description: description || comment || undefined, + descriptions: descriptions.length > 0 ? descriptions : undefined, + comment: comment || undefined, + comments: comments.length > 0 ? comments : undefined, + subClassOf: subClassOf.length > 0 ? subClassOf : undefined, + }); + } else if (isProperty) { + const propertyType = types.includes(NAMESPACES.owl + 'ObjectProperty') + ? 'object' + : types.includes(NAMESPACES.owl + 'DatatypeProperty') + ? 'datatype' + : types.includes(NAMESPACES.owl + 'AnnotationProperty') + ? 'annotation' + : 'unknown'; + + properties.set(subject, { + uri: subject, + label, + labels: labels.length > 0 ? labels : undefined, + description: description || comment || undefined, + descriptions: descriptions.length > 0 ? 
descriptions : undefined, + comment: comment || undefined, + comments: comments.length > 0 ? comments : undefined, + propertyType, + domain: domain.length > 0 ? domain : undefined, + range: range.length > 0 ? range : undefined, + subPropertyOf: subPropertyOf.length > 0 ? subPropertyOf : undefined, + }); + } else if (types.length > 0) { + individuals.set(subject, { + uri: subject, + label, + labels: labels.length > 0 ? labels : undefined, + types, + description: description || comment || undefined, + descriptions: descriptions.length > 0 ? descriptions : undefined, + }); + } +} + +/** + * Parse RDF/XML format ontology + */ +function parseRdfXmlOntology(content: string): ParsedOntology { + const prefixes: Record = { ...NAMESPACES }; + const classes: Map = new Map(); + const properties: Map = new Map(); + const individuals: Map = new Map(); + + try { + const parser = new DOMParser(); + const doc = parser.parseFromString(content, 'application/xml'); + + // Check for parse errors + const parseError = doc.querySelector('parsererror'); + if (parseError) { + console.warn('XML parse error, attempting lenient parsing'); + return parseTurtleOntology(content); // Fallback to turtle parser + } + + // Extract namespace prefixes from root element + const root = doc.documentElement; + for (const attr of Array.from(root.attributes)) { + if (attr.name.startsWith('xmlns:')) { + const prefix = attr.name.substring(6); + prefixes[prefix] = attr.value; + } else if (attr.name === 'xmlns') { + prefixes[''] = attr.value; + } + } + + // Find all class definitions + // Deduplicate by URI (different query methods may return different Element objects for same URI) + const classElementsByUri = new Map(); + const collectClassElement = (elem: Element) => { + const uri = elem.getAttribute('rdf:about') || elem.getAttribute('about') || ''; + if (!uri) return; + + // Keep the element with more child elements (richer metadata) + const existing = classElementsByUri.get(uri); + if (!existing || 
        elem.childElementCount > existing.childElementCount) {
        classElementsByUri.set(uri, elem);
      }
    };

    // Collect class elements via three query strategies: a plain tag-name match,
    // then namespace-aware lookups for rdfs:Class and owl:Class. The same URI can
    // surface in more than one pass; collectClassElement above dedupes by URI,
    // keeping whichever element carries more child elements (richer metadata).
    for (const elem of doc.querySelectorAll('Class')) {
      collectClassElement(elem);
    }
    for (const elem of doc.getElementsByTagNameNS(NAMESPACES.rdfs, 'Class')) {
      collectClassElement(elem);
    }
    for (const elem of doc.getElementsByTagNameNS(NAMESPACES.owl, 'Class')) {
      collectClassElement(elem);
    }

    // Also find rdf:Description elements with rdf:type owl:Class or rdfs:Class.
    // This "typed node via rdf:type child" style is used by SKOS and other ontologies.
    for (const elem of doc.getElementsByTagNameNS(NAMESPACES.rdf, 'Description')) {
      const typeElements = elem.getElementsByTagNameNS(NAMESPACES.rdf, 'type');
      for (const typeElem of Array.from(typeElements)) {
        // rdf:resource may appear with or without its prefix depending on the DOM parser.
        const typeUri = typeElem.getAttribute('rdf:resource') || typeElem.getAttribute('resource') || '';
        if (typeUri === NAMESPACES.owl + 'Class' || typeUri === NAMESPACES.rdfs + 'Class') {
          collectClassElement(elem);
          break;
        }
      }
    }

    // Build an OntologyClass record for every deduplicated class element.
    for (const elem of classElementsByUri.values()) {
      const uri = elem.getAttribute('rdf:about') || elem.getAttribute('about') || '';

      // Collect all labels, comments, descriptions with language tags
      const labels = getElementTextsWithLang(elem, 'label', NAMESPACES.rdfs);
      const comments = getElementTextsWithLang(elem, 'comment', NAMESPACES.rdfs);
      const descriptions = [
        ...getElementTextsWithLang(elem, 'description', NAMESPACES.dc),
        ...getElementTextsWithLang(elem, 'description', NAMESPACES.dcterms),
        ...getElementTextsWithLang(elem, 'definition', NAMESPACES.skos), // SKOS uses skos:definition
      ];

      // Get preferred (English or first) values; fall back to the URI's local name
      // when the element carries no rdfs:label at all.
      const label = getPreferredLabel(labels) || getLocalName(uri);
      const comment = getPreferredLabel(comments);
      const description = getPreferredLabel(descriptions);

      const subClassOf = getElementRefs(elem, 'subClassOf', NAMESPACES.rdfs);

      // Optional fields are set to undefined (not []) when empty, so they are
      // dropped from JSON serialization and cheap to test for truthiness.
      classes.set(uri, {
        uri,
        label,
        labels: labels.length > 0 ? labels : undefined,
        description: description || comment || undefined,
        descriptions: descriptions.length > 0 ? descriptions : undefined,
        comment: comment || undefined,
        comments: comments.length > 0 ? comments : undefined,
        subClassOf: subClassOf.length > 0 ? subClassOf : undefined,
      });
    }

    // Find all property definitions
    // Deduplicate by URI (different query methods may return different Element objects for same URI)
    type PropertyType = 'object' | 'datatype' | 'annotation' | 'unknown';
    // NOTE(review): the Map below has no type arguments in this copy — generic
    // parameters (likely string keys to { elem, propType } entries) appear to have
    // been stripped from the text; confirm against the upstream file.
    const propertyElementsByUri = new Map();
    const collectPropertyElement = (elem: Element, propType: PropertyType) => {
      const uri = elem.getAttribute('rdf:about') || elem.getAttribute('about') || '';
      if (!uri) return;

      // Keep the element with more child elements (richer metadata)
      const existing = propertyElementsByUri.get(uri);
      if (!existing || elem.childElementCount > existing.elem.childElementCount) {
        propertyElementsByUri.set(uri, { elem, propType });
      }
    };

    // Same multi-pass collection as for classes: generic rdf:Property first
    // ('unknown' type), then the three OWL property flavors with their type tagged.
    for (const elem of doc.querySelectorAll('Property')) {
      collectPropertyElement(elem, 'unknown');
    }
    for (const elem of doc.getElementsByTagNameNS(NAMESPACES.rdf, 'Property')) {
      collectPropertyElement(elem, 'unknown');
    }
    for (const elem of doc.getElementsByTagNameNS(NAMESPACES.owl, 'ObjectProperty')) {
      collectPropertyElement(elem, 'object');
    }
    for (const elem of doc.getElementsByTagNameNS(NAMESPACES.owl, 'DatatypeProperty')) {
      collectPropertyElement(elem, 'datatype');
    }
    for (const elem of doc.getElementsByTagNameNS(NAMESPACES.owl, 'AnnotationProperty')) {
      collectPropertyElement(elem, 'annotation');
    }

    // Also find rdf:Description elements with rdf:type rdf:Property, owl:ObjectProperty, etc.
    // This is used by Dublin Core and other ontologies
    for (const elem of doc.getElementsByTagNameNS(NAMESPACES.rdf, 'Description')) {
      const typeElements = elem.getElementsByTagNameNS(NAMESPACES.rdf, 'type');
      for (const typeElem of Array.from(typeElements)) {
        const typeUri = typeElem.getAttribute('rdf:resource') || typeElem.getAttribute('resource') || '';
        if (typeUri === NAMESPACES.rdf + 'Property') {
          collectPropertyElement(elem, 'unknown');
          break;
        } else if (typeUri === NAMESPACES.owl + 'ObjectProperty') {
          collectPropertyElement(elem, 'object');
          break;
        } else if (typeUri === NAMESPACES.owl + 'DatatypeProperty') {
          collectPropertyElement(elem, 'datatype');
          break;
        } else if (typeUri === NAMESPACES.owl + 'AnnotationProperty') {
          collectPropertyElement(elem, 'annotation');
          break;
        }
      }
    }

    // Build an OntologyProperty record for every deduplicated property element.
    for (const [uri, { elem, propType }] of propertyElementsByUri.entries()) {
      // Collect all labels, comments, descriptions with language tags
      const labels = getElementTextsWithLang(elem, 'label', NAMESPACES.rdfs);
      const comments = getElementTextsWithLang(elem, 'comment', NAMESPACES.rdfs);
      const descriptions = [
        ...getElementTextsWithLang(elem, 'description', NAMESPACES.dc),
        ...getElementTextsWithLang(elem, 'description', NAMESPACES.dcterms),
        ...getElementTextsWithLang(elem, 'definition', NAMESPACES.skos), // SKOS uses skos:definition
      ];

      // Get preferred (English or first) values
      const label = getPreferredLabel(labels) || getLocalName(uri);
      const comment = getPreferredLabel(comments);
      const description = getPreferredLabel(descriptions);

      const domain = getElementRefs(elem, 'domain', NAMESPACES.rdfs);
      const range = getElementRefs(elem, 'range', NAMESPACES.rdfs);
      const subPropertyOf = getElementRefs(elem, 'subPropertyOf', NAMESPACES.rdfs);

      // Use propType from the collection, fall back to tag name detection
      // (covers elements found only by the generic querySelectorAll pass).
      const tagName = elem.localName || elem.nodeName;
      const propertyType = propType !== 'unknown' ?
propType
        : tagName === 'ObjectProperty' ? 'object'
        : tagName === 'DatatypeProperty' ? 'datatype'
        : tagName === 'AnnotationProperty' ? 'annotation'
        : 'unknown';

      properties.set(uri, {
        uri,
        label,
        labels: labels.length > 0 ? labels : undefined,
        description: description || comment || undefined,
        descriptions: descriptions.length > 0 ? descriptions : undefined,
        comment: comment || undefined,
        comments: comments.length > 0 ? comments : undefined,
        propertyType,
        domain: domain.length > 0 ? domain : undefined,
        range: range.length > 0 ? range : undefined,
        subPropertyOf: subPropertyOf.length > 0 ? subPropertyOf : undefined,
      });
    }

    return {
      prefixes,
      classes: Array.from(classes.values()),
      properties: Array.from(properties.values()),
      individuals: Array.from(individuals.values()),
      rawContent: content,
    };
  } catch (error) {
    // Malformed XML never propagates: log and return an empty (but well-formed)
    // ParsedOntology so callers can render "no entities" instead of crashing.
    console.error('Error parsing RDF/XML:', error);
    return {
      prefixes,
      classes: [],
      properties: [],
      individuals: [],
      rawContent: content,
    };
  }
}

/**
 * Parse a JSON-LD format ontology.
 *
 * Reads prefix mappings from `@context`, then walks `@graph` (or `defines`,
 * the Hydra vocabulary convention) and classifies each node carrying an `@id`
 * as a class or a property based on its `@type` values. Nodes that are neither
 * are ignored; `individuals` is never populated by this parser.
 *
 * Never throws: on invalid JSON it logs and returns an empty ParsedOntology
 * that still carries the raw content.
 *
 * NOTE(review): the bare `Record` / `Map` annotations below are missing their
 * generic arguments in this copy (likely `Record<string, string>` and
 * `Map<string, …>`) — the type parameters appear to have been stripped from
 * the text; confirm against the upstream file.
 */
function parseJsonLdOntology(content: string): ParsedOntology {
  const prefixes: Record = { ...NAMESPACES };
  const classes: Map = new Map();
  const properties: Map = new Map();
  const individuals: Map = new Map();

  try {
    const data = JSON.parse(content);

    // Extract context prefixes. Only a plain-object @context is supported;
    // array-form contexts and keyword entries (@vocab, @base, ...) are skipped.
    if (data['@context']) {
      const context = data['@context'];
      if (typeof context === 'object' && !Array.isArray(context)) {
        for (const [key, value] of Object.entries(context)) {
          if (typeof value === 'string' && !key.startsWith('@')) {
            prefixes[key] = value;
          }
        }
      }
    }

    // Process graph (or defines for Hydra-style JSON-LD); a document without
    // either is treated as a single-node graph.
    const graph = data['@graph'] || data['defines'] || [data];

    for (const item of graph) {
      // Blank nodes / nodes without an @id cannot be keyed by URI — skip them.
      if (!item['@id']) continue;

      const uri = item['@id'];
      const types = normalizeArray(item['@type'] || []).filter((t): t is string => typeof t === 'string');

      // Collect all language variants; both prefixed (rdfs:label) and
      // term-form (label) keys are accepted, prefixed winning when present.
      const labels = extractJsonLdLiteralsWithLang(item['rdfs:label'] || item['label']);
      const comments = extractJsonLdLiteralsWithLang(item['rdfs:comment'] || item['comment']);
      const descriptions = extractJsonLdLiteralsWithLang(item['dcterms:description'] || item['description']);

      // Get preferred (English or first) values
      const label = getPreferredLabel(labels) || getLocalName(uri);
      const comment = getPreferredLabel(comments);
      const description = getPreferredLabel(descriptions);

      // Check for class types (including hydra:Class for Hydra vocabularies).
      // Compact forms are matched literally; anything else is expanded via the
      // collected prefixes and checked against CLASS_TYPES.
      const isClass = types.some((t) =>
        t === 'rdfs:Class' || t === 'owl:Class' || t === 'hydra:Class' ||
        CLASS_TYPES.includes(expandUri(t, prefixes))
      );

      // Check for property types (including hydra:Link for Hydra vocabularies)
      const isProperty = types.some((t) =>
        t.includes('Property') || t === 'hydra:Link' ||
        PROPERTY_TYPES.includes(expandUri(t, prefixes))
      );

      if (isClass) {
        // Extract subClassOf: accept plain URI strings or node references ({'@id': ...}).
        const subClassOf = normalizeArray(item['rdfs:subClassOf'] || item['subClassOf'] || [])
          .map((s: unknown) => typeof s === 'string' ? s : (s as Record)?.['@id'])
          .filter((s): s is string => typeof s === 'string');

        classes.set(uri, {
          uri,
          label,
          labels: labels.length > 0 ? labels : undefined,
          description: description || comment || undefined,
          descriptions: descriptions.length > 0 ? descriptions : undefined,
          comment: comment || undefined,
          comments: comments.length > 0 ? comments : undefined,
          subClassOf: subClassOf.length > 0 ? subClassOf : undefined,
        });
      } else if (isProperty) {
        // Extract domain and range (same string-or-node-reference handling as above).
        const domain = normalizeArray(item['rdfs:domain'] || item['domain'] || [])
          .map((d: unknown) => typeof d === 'string' ? d : (d as Record)?.['@id'])
          .filter((d): d is string => typeof d === 'string');
        const range = normalizeArray(item['rdfs:range'] || item['range'] || [])
          .map((r: unknown) => typeof r === 'string' ? r : (r as Record)?.['@id'])
          .filter((r): r is string => typeof r === 'string');
        const subPropertyOf = normalizeArray(item['rdfs:subPropertyOf'] || item['subPropertyOf'] || [])
          .map((s: unknown) => typeof s === 'string' ? s : (s as Record)?.['@id'])
          .filter((s): s is string => typeof s === 'string');

        properties.set(uri, {
          uri,
          label,
          labels: labels.length > 0 ? labels : undefined,
          description: description || comment || undefined,
          descriptions: descriptions.length > 0 ? descriptions : undefined,
          comment: comment || undefined,
          comments: comments.length > 0 ? comments : undefined,
          // Only compact type names are recognized here; expanded owl:* URIs
          // fall through to 'unknown'.
          propertyType: types.includes('owl:ObjectProperty') || types.includes('hydra:Link') ? 'object'
            : types.includes('owl:DatatypeProperty') ? 'datatype'
            : 'unknown',
          domain: domain.length > 0 ? domain : undefined,
          range: range.length > 0 ? range : undefined,
          subPropertyOf: subPropertyOf.length > 0 ? subPropertyOf : undefined,
        });
      }
    }

    return {
      prefixes,
      classes: Array.from(classes.values()),
      properties: Array.from(properties.values()),
      individuals: Array.from(individuals.values()),
      rawContent: content,
    };
  } catch (error) {
    // Invalid JSON never propagates: log and return an empty result.
    console.error('Error parsing JSON-LD:', error);
    return {
      prefixes,
      classes: [],
      properties: [],
      individuals: [],
      rawContent: content,
    };
  }
}

/**
 * Expand a prefixed URI (CURIE) to a full URI.
 *
 * Already-absolute http(s) URIs are returned unchanged (after trimming).
 * For `prefix:localName` forms the prefix is looked up in `prefixes`;
 * unknown prefixes and everything else are returned as-is.
 *
 * NOTE(review): the regex below is `/^$/` in this copy, which matches only the
 * empty string and therefore never removes anything, yet the comment says it
 * removes angle brackets. The angle-bracket characters of the original pattern
 * (presumably something like stripping a leading `<` and trailing `>`) appear
 * to have been stripped from this text — confirm against the upstream file.
 */
export function expandUri(uri: string, prefixes: Record): string {
  if (!uri) return uri;

  // Remove angle brackets if present
  let cleaned = uri.trim().replace(/^$/, '');

  // Check if it's already a full URI
  if (cleaned.startsWith('http://') || cleaned.startsWith('https://')) {
    return cleaned;
  }

  // Handle prefixed URIs (colonIndex > 0 so a leading ':' is not treated as a prefix)
  const colonIndex = cleaned.indexOf(':');
  if (colonIndex > 0) {
    const prefix = cleaned.substring(0, colonIndex);
    const localName = cleaned.substring(colonIndex + 1);

    if (prefixes[prefix]) {
      return prefixes[prefix] + localName;
    }
  }

  return cleaned;
}

/**
 * Compact a full URI to `prefix:localName` form using the first prefix whose
 * namespace is a prefix of the URI (iteration order of `prefixes` decides ties).
 * An empty-string prefix yields the bare local name. Unmatched URIs are
 * returned unchanged.
 */
export function compactUri(uri: string, prefixes: Record): string {
  for (const [prefix, namespace] of Object.entries(prefixes)) {
    if (uri.startsWith(namespace)) {
      const localName = uri.substring(namespace.length);
      return prefix ? `${prefix}:${localName}` : localName;
    }
  }
  return uri;
}

/**
 * Get the local name from a URI: the part after the last '#' or '/',
 * or the whole string when neither separator is present.
 */
export function getLocalName(uri: string): string {
  const hashIndex = uri.lastIndexOf('#');
  const slashIndex = uri.lastIndexOf('/');
  const index = Math.max(hashIndex, slashIndex);
  return index >= 0 ? uri.substring(index + 1) : uri;
}

/**
 * Extract literal with language tag from Turtle string
 * Handles: "Dataset"@en, "Conjunto de datos"@es, "Dataset" (no lang), plain text
 * Returns: { value: "Dataset", lang: "en" } or { value: "Dataset" } for no lang
 *
 * The three regexes use the /s flag so quoted values may span newlines, and
 * lazy capture so the closing quote is the first one found. Returns null for
 * URIs, blank nodes, and any unquoted text containing ':' (treated as a
 * prefixed name rather than a literal).
 */
function extractLiteralWithLang(raw: string): LangString | null {
  if (!raw) return null;

  const trimmed = raw.trim();

  // Match: "text"@lang or 'text'@lang (lang may carry subtags, e.g. en-GB)
  const langMatch = trimmed.match(/^["'](.+?)["']@(\w+(?:-\w+)*)$/s);
  if (langMatch) {
    return { value: langMatch[1], lang: langMatch[2] };
  }

  // Match: "text"^^xsd:type (typed literal without lang) — the datatype is discarded
  const typedMatch = trimmed.match(/^["'](.+?)["']\^\^.+$/s);
  if (typedMatch) {
    return { value: typedMatch[1] };
  }

  // Match: "text" or 'text' (quoted but no lang tag)
  const quotedMatch = trimmed.match(/^["'](.+?)["']$/s);
  if (quotedMatch) {
    return { value: quotedMatch[1] };
  }

  // Plain text (no quotes) - skip URIs and blank nodes
  if (!trimmed.startsWith('<') && !trimmed.startsWith('_:') && !trimmed.includes(':')) {
    return { value: trimmed };
  }

  return null;
}

/**
 * Extract all language-tagged literals from JSON-LD value
 * Handles: single value, array of values, language maps
 *
 * Arrays are flattened recursively; bare strings become untagged LangStrings;
 * value objects contribute their @value with the optional @language.
 *
 * NOTE(review): the `@value` check below is truthy, so value objects whose
 * @value is '' or 0 are silently dropped — possibly intentional for labels,
 * but worth confirming.
 *
 * NOTE(review): the bare `Record` cast below is missing its generic arguments
 * in this copy (likely `Record<string, unknown>`) — confirm against upstream.
 */
function extractJsonLdLiteralsWithLang(value: unknown): LangString[] {
  if (!value) return [];

  const results: LangString[] = [];

  // Handle array of values
  if (Array.isArray(value)) {
    for (const item of value) {
      results.push(...extractJsonLdLiteralsWithLang(item));
    }
    return results;
  }

  // Handle string
  if (typeof value === 'string') {
    return [{ value }];
  }

  // Handle object with @value and optional @language
  if (typeof value === 'object') {
    const obj = value as Record;

    if (obj['@value']) {
      const lang = obj['@language'] as string | undefined;
      results.push(lang ? { value: obj['@value'] as string, lang } : { value: obj['@value'] as string });
    }
  }

  return results;
}

/**
 * Normalize to array: arrays pass through, falsy values become [],
 * anything else is wrapped in a single-element array.
 */
function normalizeArray(value: unknown): unknown[] {
  if (Array.isArray(value)) return value;
  if (value) return [value];
  return [];
}

/**
 * Get all text content with language tags from XML elements
 * Handles xml:lang attributes for multilingual support
 *
 * Looks at direct and nested descendants named `localName` in `namespace`,
 * trims each element's text, and skips empty ones. The `lang` fallback
 * attribute covers parsers that don't expose the xml: prefix.
 */
function getElementTextsWithLang(parent: Element, localName: string, namespace: string): LangString[] {
  const results: LangString[] = [];
  const elements = parent.getElementsByTagNameNS(namespace, localName);

  for (const elem of Array.from(elements)) {
    const text = elem.textContent?.trim();
    if (text) {
      const lang = elem.getAttribute('xml:lang') || elem.getAttribute('lang');
      results.push(lang ?
{ value: text, lang } : { value: text });
    }
  }

  return results;
}

/**
 * Get resource references from XML element
 *
 * For each child named `localName` in `namespace`, collects either the direct
 * rdf:resource/resource attribute, or — when absent — the rdf:about/about of
 * any nested owl:Class / rdfs:Class elements (covers Schema.org-style nested
 * class and owl:unionOf constructs).
 *
 * NOTE(review): the two comments below originally illustrated the XML shapes
 * (e.g. an element with an rdf:resource attribute vs. a nested Class node);
 * the angle-bracketed examples appear to have been stripped from this copy.
 */
function getElementRefs(parent: Element, localName: string, namespace: string): string[] {
  const refs: string[] = [];
  const elements = parent.getElementsByTagNameNS(namespace, localName);

  for (const elem of Array.from(elements)) {
    // Direct reference:
    const ref = elem.getAttribute('rdf:resource') || elem.getAttribute('resource');
    if (ref) {
      refs.push(ref);
      continue;
    }

    // Nested class reference:
    // Also handles unionOf constructs in Schema.org style
    const nestedClasses = [
      ...Array.from(elem.getElementsByTagNameNS(NAMESPACES.owl, 'Class')),
      ...Array.from(elem.getElementsByTagNameNS(NAMESPACES.rdfs, 'Class')),
    ];

    for (const nested of nestedClasses) {
      const nestedRef = nested.getAttribute('rdf:about') || nested.getAttribute('about');
      if (nestedRef) {
        refs.push(nestedRef);
      }
    }
  }

  return refs;
}

/**
 * Find a class or property by URI across all ontologies
 *
 * Linear scan in map-iteration order; within each ontology classes are
 * checked before properties, and the first exact URI match wins.
 * Returns null when no ontology defines the URI.
 *
 * NOTE(review): the bare `Map` parameter type is missing its generic arguments
 * in this copy (likely `Map<string, ParsedOntology>`) — confirm against upstream.
 */
export function findEntityByUri(
  uri: string,
  ontologies: Map
): { ontology: string; entity: OntologyClass | OntologyProperty; type: 'class' | 'property' } | null {
  for (const [name, ontology] of ontologies) {
    // Check classes
    const cls = ontology.classes.find(c => c.uri === uri);
    if (cls) {
      return { ontology: name, entity: cls, type: 'class' };
    }

    // Check properties
    const prop = ontology.properties.find(p => p.uri === uri);
    if (prop) {
      return { ontology: name, entity: prop, type: 'property' };
    }
  }

  return null;
}

/**
 * Get the ontology file by its registry name (exact match in ONTOLOGY_FILES).
 */
export function getOntologyFileByName(name: string): OntologyFile | undefined {
  return ONTOLOGY_FILES.find(f => f.name === name);
}

/**
 * Get all registered ontology files belonging to one Guarino category.
 */
export function getOntologyFilesByCategory(category: OntologyFile['category']): OntologyFile[] {
  return ONTOLOGY_FILES.filter(f => f.category === category);
}

/**
 * Find the ontology file that contains a given URI based on namespace matching.
 * Returns the first matching ontology, or undefined if no match found.
 *
 * Matching is a case-insensitive prefix test against each registered
 * namespace, so ONTOLOGY_FILES ordering decides ties between overlapping
 * namespaces. Files without a `namespaces` list can never match.
 */
export function getOntologyFileByUri(uri: string): OntologyFile | undefined {
  // Normalize the URI for comparison
  const normalizedUri = uri.toLowerCase();

  for (const file of ONTOLOGY_FILES) {
    if (file.namespaces) {
      for (const ns of file.namespaces) {
        const normalizedNs = ns.toLowerCase();
        if (normalizedUri.startsWith(normalizedNs)) {
          return file;
        }
      }
    }
  }

  return undefined;
}

/**
 * Get the namespace from a full URI (everything before the local name)
 *
 * Prefers the last '#' over the last '/' and includes the separator in the
 * returned namespace. Returns undefined when the URI has no separator past
 * index 0 (a separator at position 0 would yield an empty namespace).
 */
export function getNamespaceFromUri(uri: string): string | undefined {
  // Try to find the last # or / to separate namespace from local name
  const hashIndex = uri.lastIndexOf('#');
  if (hashIndex > 0) {
    return uri.substring(0, hashIndex + 1);
  }

  const slashIndex = uri.lastIndexOf('/');
  if (slashIndex > 0) {
    return uri.substring(0, slashIndex + 1);
  }

  return undefined;
}