From ca219340f28ffc7cd59f40f4a122d34d959be9b1 Mon Sep 17 00:00:00 2001 From: kempersc Date: Fri, 26 Dec 2025 14:30:31 +0100 Subject: [PATCH] enrich entries --- docs/GRAPH_SCORE_INHERITANCE.md | 292 ++++++ frontend/package.json | 10 +- .../schemas/20251121/linkml/manifest.json | 2 +- .../components/query/ConversationPanel.css | 103 +++ .../components/query/ConversationPanel.tsx | 35 + .../uml/CustodianTypeIndicator3D.tsx | 36 +- .../src/lib/linkml/linkml-schema-service.ts | 18 + frontend/src/lib/schema-custodian-mapping.ts | 47 +- frontend/src/pages/LinkMLViewerPage.tsx | 19 +- node_modules/.modules.yaml | 2 +- node_modules/.pnpm/lock.yaml | 10 +- .../node_modules/lucide-react/LICENSE | 39 + .../node_modules/lucide-react/README.md | 73 ++ .../node_modules/lucide-react/dynamic.mjs | 10 + .../lucide-react/dynamicIconImports.mjs | 1 + .../node_modules/lucide-react/package.json | 74 ++ .../node_modules/react | 1 + node_modules/.pnpm/node_modules/lucide-react | 2 +- pnpm-lock.yaml | 10 +- scripts/discover_custodian_websites.py | 561 ++++++++++++ scripts/discover_websites_crawl4ai.py | 150 ++++ scripts/enrich_custodian_logos_crawl4ai.py | 17 +- scripts/index_persons_qdrant.py | 75 +- src/glam_extractor/api/hybrid_retriever.py | 846 ++++++++++++++++-- 24 files changed, 2304 insertions(+), 129 deletions(-) create mode 100644 docs/GRAPH_SCORE_INHERITANCE.md create mode 100644 node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/LICENSE create mode 100644 node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/README.md create mode 100644 node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamic.mjs create mode 100644 node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamicIconImports.mjs create mode 100644 node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/package.json create mode 120000 node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/react create mode 100644 scripts/discover_custodian_websites.py create mode 100644 scripts/discover_websites_crawl4ai.py diff --git a/docs/GRAPH_SCORE_INHERITANCE.md b/docs/GRAPH_SCORE_INHERITANCE.md new file mode 100644 index 0000000000..16c07dd934 --- /dev/null +++ b/docs/GRAPH_SCORE_INHERITANCE.md @@ -0,0 +1,292 @@ +# Graph Score Inheritance in Hybrid Retrieval + +## Overview + +The Heritage RAG system uses a **hybrid retrieval** approach that combines: +1. **Vector search** (semantic similarity via embeddings) +2. **Knowledge graph expansion** (SPARQL-based relationship discovery) + +This document explains the **graph score inheritance** feature that ensures vector search results benefit from knowledge graph relationships. + +## The Problem + +Before graph score inheritance, the hybrid retrieval had a scoring gap: + +| Result Source | Vector Score | Graph Score | Combined Score | +|---------------|--------------|-------------|----------------| +| Vector search results | 0.5-0.8 | **0.0** | 0.35-0.56 | +| Graph expansion results | 0.0 | 0.5-0.8 | 0.15-0.24 | + +**Why this happened:** +- Vector search finds institutions semantically similar to the query +- Graph expansion finds **different** institutions (same city/type) with different GHCIDs +- Since GHCIDs don't match, no direct merging occurs +- Vector results always dominate because `combined = 0.7 * vector + 0.3 * graph` + +**Example before fix:** +``` +Query: "Archieven in Amsterdam" + +1. Stadsarchief Amsterdam | V:0.659 G:0.000 C:0.461 +2. Noord-Hollands Archief | V:0.675 G:0.000 C:0.472 +3. The Black Archives | V:0.636 G:0.000 C:0.445 +``` + +The graph expansion was finding related institutions in Amsterdam, but that information wasn't reflected in the scores. + +## The Solution: Graph Score Inheritance + +Vector results now **inherit** graph scores from related institutions found via graph expansion. + +### How It Works + +``` +1. Vector Search + └── Returns: [Inst_A, Inst_B, Inst_C] with vector_scores + +2. Graph Expansion (for top 5 vector results) + └── For Inst_A in Amsterdam: + └── SPARQL finds: [Inst_X, Inst_Y] also in Amsterdam + └── These get graph_score=0.8 (same_city) + └── They track: related_institutions=[Inst_A.ghcid] + +3. Inheritance Calculation + └── Inst_A inherits from [Inst_X, Inst_Y]: + inherited_score = avg([0.8, 0.8]) * 0.5 = 0.4 + └── Inst_A.graph_score = max(0.0, 0.4) = 0.4 + +4. Combined Scoring + └── Inst_A.combined = 0.7 * vector + 0.3 * 0.4 = higher rank! +``` + +### Inheritance Factor + +```python +INHERITANCE_FACTOR = 0.5 # Inherit 50% of related institutions' graph scores +``` + +This means: +- Same-city institutions (graph_score=0.8) → inherited score of **0.40** +- Same-type institutions (graph_score=0.5) → inherited score of **0.25** + +## Implementation Details + +### File Location + +``` +/Users/kempersc/apps/glam/src/glam_extractor/api/hybrid_retriever.py +``` + +### Key Method: `_combine_and_rank()` + +Located at lines ~1539-1671, this method: + +1. **Creates lookup by GHCID** for merging +2. **Handles direct merges** when graph result GHCID matches vector result +3. **Builds inheritance map** tracking which vector results each graph result was expanded from +4. **Applies inheritance** calculating inherited scores for vector results +5. **Computes combined scores** with the formula: `0.7 * vector + 0.3 * graph` + +### Code Structure + +```python +def _combine_and_rank( + self, + vector_results: list[RetrievedInstitution], + graph_results: list[RetrievedInstitution], + k: int +) -> list[RetrievedInstitution]: + """Combine vector and graph results with weighted scoring and graph inheritance.""" + + # 1. Create lookup by GHCID + results_by_ghcid: dict[str, RetrievedInstitution] = {} + vector_ghcids = set() + + # 2. Add vector results + for inst in vector_results: + results_by_ghcid[inst.ghcid] = inst + vector_ghcids.add(inst.ghcid) + + # 3. Build inheritance map: vector_ghcid -> [(related_ghcid, graph_score, reason)] + inheritance_map: dict[str, list[tuple[str, float, str]]] = {g: [] for g in vector_ghcids} + + for inst in graph_results: + if inst.ghcid in results_by_ghcid: + # Direct merge + existing = results_by_ghcid[inst.ghcid] + existing.graph_score = max(existing.graph_score, inst.graph_score) + else: + # New from graph - track for inheritance + results_by_ghcid[inst.ghcid] = inst + for seed_ghcid in inst.related_institutions: + if seed_ghcid in inheritance_map: + inheritance_map[seed_ghcid].append( + (inst.ghcid, inst.graph_score, inst.expansion_reason) + ) + + # 4. Apply inheritance + INHERITANCE_FACTOR = 0.5 + for vector_ghcid, related_list in inheritance_map.items(): + if related_list: + inst = results_by_ghcid[vector_ghcid] + related_scores = [score for _, score, _ in related_list] + inherited_score = (sum(related_scores) / len(related_scores)) * INHERITANCE_FACTOR + inst.graph_score = max(inst.graph_score, inherited_score) + + # 5. Calculate combined scores + for inst in results_by_ghcid.values(): + inst.combined_score = ( + self.vector_weight * inst.vector_score + + self.graph_weight * inst.graph_score + ) + + return sorted(results_by_ghcid.values(), key=lambda x: x.combined_score, reverse=True)[:k] +``` + +### Graph Expansion Scores + +The `_expand_via_graph()` method assigns these base scores: + +| Expansion Type | Graph Score | SPARQL Pattern | +|----------------|-------------|----------------| +| Same city | 0.8 | `?s schema:location ?loc . ?loc hc:cityCode ?cityCode` | +| Same institution type | 0.5 | `?s hc:institutionType ?type` | + +## Results + +### Before (Graph Score = 0.0) + +``` +Query: "Welke musea zijn er in Utrecht?" + +1. Centraal Museum | V:0.589 G:0.000 C:0.412 +2. Museum Speelklok | V:0.591 G:0.000 C:0.414 +3. Universiteitsmuseum Utrecht | V:0.641 G:0.000 C:0.449 +``` + +### After (Graph Score Inherited) + +``` +Query: "Welke musea zijn er in Utrecht?" + +1. Universiteitsmuseum Utrecht | V:0.641 G:0.400 C:0.569 +2. Museum Speelklok | V:0.591 G:0.400 C:0.534 +3. Centraal Museum | V:0.589 G:0.400 C:0.532 +``` + +**Key improvements:** +- Graph scores now **0.400** (inherited from same-city museums) +- Combined scores **increased by ~25%** (0.412 → 0.532) +- Ranking now considers **geographic relevance** + +### More Examples + +``` +Query: "Bibliotheken in Den Haag" + +1. Centrale Bibliotheek | V:0.697 G:0.400 C:0.608 +2. Koninklijke Bibliotheek | V:0.676 G:0.400 C:0.593 +3. Huis van het Boek | V:0.630 G:0.400 C:0.561 +4. Bibliotheek Hoeksche Waard | V:0.613 G:0.400 C:0.549 +5. Centrale Bibliotheek (other) | V:0.623 G:0.000 C:0.436 <- No inheritance (different city) +``` + +## Configuration + +### Weights (in `HybridRetriever.__init__`) + +```python +self.vector_weight = 0.7 # Semantic similarity importance +self.graph_weight = 0.3 # Knowledge graph importance +``` + +### Inheritance Factor + +```python +INHERITANCE_FACTOR = 0.5 # In _combine_and_rank() +``` + +**Tuning considerations:** +- Higher factor (0.6-0.8): Stronger influence from graph relationships +- Lower factor (0.3-0.4): More conservative, vector similarity dominates +- Current value (0.5): Balanced approach + +## Logging + +The implementation includes detailed logging for debugging: + +```python +# INFO level (always visible) +logger.info(f"Graph inheritance applied to {len(inheritance_boosts)} vector results: {ghcids}...") + +# DEBUG level (when LOG_LEVEL=DEBUG) +logger.debug(f"Inheritance: {ghcid} graph_score: {old:.3f} -> {new:.3f} (from {n} related)") +``` + +**Check logs on production:** +```bash +ssh root@91.98.224.44 "journalctl -u glam-rag-api --since '5 minutes ago' | grep -i inheritance" +``` + +## API Response Structure + +The graph score is exposed in the API response: + +```json +{ + "retrieved_results": [ + { + "ghcid": "NL-UT-UTR-M-CM", + "name": "Centraal Museum", + "scores": { + "vector": 0.589, + "graph": 0.400, // <-- Now populated via inheritance + "combined": 0.532 + }, + "related_institutions": ["NL-UT-UTR-M-MS", "NL-UT-UTR-M-UMUU"] + } + ] +} +``` + +## Deployment + +**File to deploy:** +```bash +scp /Users/kempersc/apps/glam/src/glam_extractor/api/hybrid_retriever.py \ + root@91.98.224.44:/opt/glam-backend/rag/glam_extractor/api/ +``` + +**Restart service:** +```bash +ssh root@91.98.224.44 "systemctl restart glam-rag-api" +``` + +**Verify:** +```bash +curl -s -X POST 'https://archief.support/api/rag/dspy/query' \ + -H 'Content-Type: application/json' \ + -d '{"question": "Musea in Rotterdam", "language": "nl"}' | \ + python3 -c "import sys,json; r=json.load(sys.stdin)['retrieved_results']; print('\n'.join(f\"{x['name'][:30]:30} G:{x['scores']['graph']:.2f}\" for x in r[:5]))" +``` + +## Related Files + +| File | Purpose | +|------|---------| +| `hybrid_retriever.py` | Main implementation with `_combine_and_rank()` | +| `dspy_heritage_rag.py` | RAG pipeline that calls `retriever.search()` | +| `main.py` | FastAPI endpoints serving the RAG API | + +## Future Improvements + +1. **Dynamic inheritance factor**: Adjust based on query type (geographic vs. thematic) +2. **Multi-hop expansion**: Inherit from institutions 2+ hops away +3. **Weighted inheritance**: Weight by relationship type (same_city=0.8, same_type=0.5) +4. **Negative inheritance**: Penalize results unrelated to graph findings + +--- + +**Last Updated:** 2025-12-24 +**Implemented:** 2025-12-23 +**Status:** Production (archief.support) diff --git a/frontend/package.json b/frontend/package.json index 9b49371dea..5e85f74dbd 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -16,14 +16,14 @@ "test:coverage": "vitest run --coverage" }, "dependencies": { - "@glam/api-client": "workspace:*", - "@glam/hooks": "workspace:*", - "@glam/theme": "workspace:*", - "@glam/ui": "workspace:*", "@codemirror/lang-javascript": "^6.2.4", "@duckdb/duckdb-wasm": "^1.31.0", "@emotion/react": "^11.14.0", "@emotion/styled": "^11.14.1", + "@glam/api-client": "workspace:*", + "@glam/hooks": "workspace:*", + "@glam/theme": "workspace:*", + "@glam/ui": "workspace:*", "@mui/icons-material": "^7.3.6", "@mui/material": "^7.3.5", "@tanstack/react-query": "^5.90.10", @@ -45,7 +45,7 @@ "fdir": "^6.5.0", "js-yaml": "^4.1.1", "lodash": "^4.17.21", - "lucide-react": "^0.561.0", + "lucide-react": "^0.562.0", "maplibre-gl": "^5.14.0", "mermaid": "^11.4.0", "n3": "^1.26.0", diff --git a/frontend/public/schemas/20251121/linkml/manifest.json b/frontend/public/schemas/20251121/linkml/manifest.json index a8ea872d2f..7495688a84 100644 --- a/frontend/public/schemas/20251121/linkml/manifest.json +++ b/frontend/public/schemas/20251121/linkml/manifest.json @@ -1,5 +1,5 @@ { - "generated": "2025-12-23T16:58:31.474Z", + "generated": "2025-12-25T12:42:29.931Z", "version": "1.0.0", "categories": [ { diff --git a/frontend/src/components/query/ConversationPanel.css b/frontend/src/components/query/ConversationPanel.css index b5eec5700e..2cf41fc5c6 100644 --- a/frontend/src/components/query/ConversationPanel.css +++ b/frontend/src/components/query/ConversationPanel.css @@ -872,3 +872,106 @@ padding: 0.125rem 0.375rem; } } + +/* ========================================================================== + Chain-of-Thought Reasoning Display (GLM 4.7 Interleaved Thinking) + ========================================================================== */ + +.conversation-panel__reasoning { + margin: 0.75rem 0; + border: 1px solid var(--border-color, #e0e0e0); + border-radius: 0.5rem; + overflow: hidden; + background: var(--surface-secondary, #fafafa); +} + +.conversation-panel__reasoning-toggle { + display: flex; + align-items: center; + gap: 0.5rem; + padding: 0.5rem 0.75rem; + background: var(--surface-secondary, #f5f5f5); + cursor: pointer; + font-size: 0.8125rem; + color: var(--text-secondary, #757575); + border: none; + width: 100%; + transition: background-color 0.2s; + list-style: none; /* Remove default marker */ +} + +.conversation-panel__reasoning-toggle::-webkit-details-marker { + display: none; /* Hide default arrow in WebKit browsers */ +} + +.conversation-panel__reasoning-toggle::before { + content: '▶'; + font-size: 0.625rem; + transition: transform 0.2s; +} + +.conversation-panel__reasoning[open] .conversation-panel__reasoning-toggle::before { + transform: rotate(90deg); +} + +.conversation-panel__reasoning-toggle:hover { + background: var(--surface-tertiary, #eeeeee); +} + +.conversation-panel__reasoning-toggle svg { + color: var(--primary-color, #1976d2); + flex-shrink: 0; +} + +.conversation-panel__reasoning-content { + padding: 0.75rem; + background: var(--surface-code, #1e1e1e); + max-height: 300px; + overflow-y: auto; + border-top: 1px solid var(--border-color, #e0e0e0); +} + +.conversation-panel__reasoning-content pre { + margin: 0; + white-space: pre-wrap; + word-break: break-word; + font-size: 0.75rem; + line-height: 1.5; + color: var(--text-code, #d4d4d4); + font-family: 'Fira Code', 'Monaco', 'Consolas', monospace; +} + +/* Scrollbar styling for reasoning content */ +.conversation-panel__reasoning-content::-webkit-scrollbar { + width: 6px; +} + +.conversation-panel__reasoning-content::-webkit-scrollbar-track { + background: #2d2d2d; +} + +.conversation-panel__reasoning-content::-webkit-scrollbar-thumb { + background: #555; + border-radius: 3px; +} + +.conversation-panel__reasoning-content::-webkit-scrollbar-thumb:hover { + background: #777; +} + +/* Responsive: Reasoning section */ +@media (max-width: 768px) { + .conversation-panel__reasoning-toggle { + font-size: 0.75rem; + padding: 0.375rem 0.5rem; + } + + .conversation-panel__reasoning-content { + max-height: 200px; + padding: 0.5rem; + } + + .conversation-panel__reasoning-content pre { + font-size: 0.6875rem; + } +} diff --git a/frontend/src/components/query/ConversationPanel.tsx b/frontend/src/components/query/ConversationPanel.tsx index 1c25db7763..c012bedae9 100644 --- a/frontend/src/components/query/ConversationPanel.tsx +++ b/frontend/src/components/query/ConversationPanel.tsx @@ -164,6 +164,9 @@ const TEXT = { sourcesUsed: { nl: 'Bronnen gebruikt', en: 'Sources used' }, llmProvider: { nl: 'Model', en: 'Model' }, answer: { nl: 'Antwoord', en: 'Answer' }, + showReasoning: { nl: 'Toon redenering', en: 'Show reasoning' }, + hideReasoning: { nl: 'Verberg redenering', en: 'Hide reasoning' }, + reasoningTitle: { nl: 'Denkproces', en: 'Thinking Process' }, }; // Example questions to help users get started - shorter list @@ -180,6 +183,21 @@ const EXAMPLE_QUESTIONS = { ], }; +// LLM Response Metadata - matches backend LLMResponseMetadata model +interface LLMResponseMetadata { + content?: string; + reasoning_content?: string; // GLM 4.7 chain-of-thought reasoning + model?: string; + provider?: string; // zai, anthropic, openai, groq + prompt_tokens?: number; + completion_tokens?: number; + total_tokens?: number; + thinking_mode?: string; // enabled, disabled, interleaved + latency_ms?: number; + cached?: boolean; + finish_reason?: string; +} + interface Message { id: string; role: 'user' | 'assistant'; @@ -192,6 +210,7 @@ interface Message { error?: string; errorCode?: string; llmProviderUsed?: string; // Which LLM provider generated this response + llmResponse?: LLMResponseMetadata; // Full LLM response metadata including chain-of-thought } interface HistoryItem { @@ -351,6 +370,7 @@ export const ConversationPanel: React.FC = ({ onQueryGen answer: string; sourcesUsed: string[]; llmProviderUsed?: string; + llmResponse?: LLMResponseMetadata; // Full LLM response with reasoning_content }> => { // Determine API endpoint based on environment const hostname = window.location.hostname; @@ -395,6 +415,7 @@ export const ConversationPanel: React.FC = ({ onQueryGen answer: data.answer || data.explanation || '', sourcesUsed: data.sources_used || selectedSources, llmProviderUsed: data.llm_provider_used, + llmResponse: data.llm_response, // Pass through chain-of-thought metadata }; }; @@ -445,6 +466,7 @@ export const ConversationPanel: React.FC = ({ onQueryGen sparql: result.sparql, sourcesUsed: result.sourcesUsed, llmProviderUsed: result.llmProviderUsed, + llmResponse: result.llmResponse, isLoading: false, } : msg @@ -928,6 +950,19 @@ export const ConversationPanel: React.FC = ({ onQueryGen <>

{message.content}

+ {/* Chain-of-Thought Reasoning (GLM 4.7 Interleaved Thinking) */} + {message.llmResponse?.reasoning_content && ( +
+ + + {t('showReasoning')} + +
+
{message.llmResponse.reasoning_content}
+
+
+ )} + {/* Sources Used Badges */} {message.sourcesUsed && message.sourcesUsed.length > 0 && (
diff --git a/frontend/src/components/uml/CustodianTypeIndicator3D.tsx b/frontend/src/components/uml/CustodianTypeIndicator3D.tsx index 5707191f59..e44ebb1e4d 100644 --- a/frontend/src/components/uml/CustodianTypeIndicator3D.tsx +++ b/frontend/src/components/uml/CustodianTypeIndicator3D.tsx @@ -196,9 +196,9 @@ function createGLAMPolyhedronGeometry(radius: number = 1): THREE.BufferGeometry * Creates text sprites positioned at the center of each icosahedron face. * * Label visibility behavior: - * - Relevant types: Always visible (opacity 1) - these are the types passed in highlightTypes - * - Non-relevant types: Only visible when expanded (opacity 0 when collapsed) - * - If highlightTypes is empty (universal), all labels are shown when expanded only + * - Empty array (no annotation): NO labels shown at all (blank cube) + * - 19+ types (universal): All labels shown when expanded only + * - Specific types (1-18): Only those letters shown (always visible) */ function createFaceLabels( geometry: THREE.BufferGeometry, @@ -209,14 +209,16 @@ function createFaceLabels( const positions = geometry.getAttribute('position'); const faceCount = positions.count / 3; const highlightSet = new Set(highlightTypes); - const isUniversal = highlightTypes.length === 0 || highlightTypes.length >= 19; + const hasNoAnnotation = highlightTypes.length === 0; + const isUniversal = highlightTypes.length >= 19; for (let faceIndex = 0; faceIndex < Math.min(faceCount, 20); faceIndex++) { const typeIndex = faceIndex % 19; const typeCode = CUSTODIAN_TYPE_CODES[typeIndex]; // Determine if this type is relevant (highlighted) - const isRelevant = highlightTypes.length === 0 || highlightSet.has(typeCode); + // Empty array = no annotation = nothing is relevant + const isRelevant = !hasNoAnnotation && (isUniversal || highlightSet.has(typeCode)); // Calculate face center (average of 3 vertices) const v0 = new THREE.Vector3( @@ -559,7 +561,7 @@ export const CustodianTypeIndicator3D: React.FC = // Tooltip text const tooltipText = useMemo(() => { - if (types.length === 0) return 'Heritage Custodian Types (GLAMORCUBESFIXPHDNT)'; + if (types.length === 0) return 'No custodian types'; return types .map(code => getCustodianTypeByCode(code)?.label[language] ?? code) .join(', '); @@ -667,23 +669,27 @@ export const CustodianTypeIndicator3D: React.FC = // Update label visibility based on expanded state and highlighted types // Label visibility rules: - // - Non-universal elements (1-18 types): Show relevant letters only (both collapsed and expanded) - // - Universal elements (19 types or empty): Show all letters only when expanded + // - No annotation (empty array, length 0): Show NO letters (blank cube) + // - Universal annotation (19+ types): Show all letters only when expanded + // - Specific types (1-18 types): Show ONLY those letters (both collapsed and expanded) if (labelsGroupRef.current) { const highlightSet = new Set(types); - const isUniversal = types.length === 0 || types.length >= 19; + const hasNoAnnotation = types.length === 0; + const isUniversal = types.length >= 19; labelsGroupRef.current.children.forEach((child) => { if (child instanceof THREE.Sprite && child.userData.typeCode) { const typeCode = child.userData.typeCode as CustodianTypeCode; - const isRelevant = types.length === 0 || highlightSet.has(typeCode); + const isRelevant = highlightSet.has(typeCode); - if (isUniversal) { - // Universal elements: Show all letters only when expanded + if (hasNoAnnotation) { + // No annotation: Show NO letters at all (blank cube) + child.material.opacity = 0; + } else if (isUniversal) { + // Universal annotation (19+ types): Show all letters only when expanded child.material.opacity = isExpanded ? 1 : 0; } else { - // Non-universal elements: Show ONLY relevant letters (hidden otherwise) - // When expanded, relevant letters get full opacity + // Specific types (1-18): Show ONLY relevant letters (hidden otherwise) if (isRelevant) { child.material.opacity = 1; // Relevant letters always visible } else { @@ -1172,7 +1178,7 @@ export const CustodianTypeIndicator3DFallback: React.FC { - if (types.length === 0) return 'Heritage Custodian Types'; + if (types.length === 0) return 'No custodian types'; return types .map(code => getCustodianTypeByCode(code)?.label[language] ?? code) .join(', '); diff --git a/frontend/src/lib/linkml/linkml-schema-service.ts b/frontend/src/lib/linkml/linkml-schema-service.ts index bfc8947e42..9d17b76c44 100644 --- a/frontend/src/lib/linkml/linkml-schema-service.ts +++ b/frontend/src/lib/linkml/linkml-schema-service.ts @@ -1071,6 +1071,24 @@ class LinkMLSchemaService { return this.parseCustodianTypesAnnotation(slot.annotations.custodian_types); } + /** + * Get all classes that use a given slot + * Returns array of class names that have this slot in their slots array + */ + async getClassesUsingSlot(slotName: string): Promise { + await this.initialize(); + const classes: string[] = []; + + for (const [className, schema] of this.classSchemas.entries()) { + const classDef = schema.classes?.[className]; + if (classDef?.slots?.includes(slotName)) { + classes.push(className); + } + } + + return classes; + } + /** * Get custodian_types annotation from an enum definition * Returns null if annotation not found diff --git a/frontend/src/lib/schema-custodian-mapping.ts b/frontend/src/lib/schema-custodian-mapping.ts index 395bf4533f..5ef49f41a0 100644 --- a/frontend/src/lib/schema-custodian-mapping.ts +++ b/frontend/src/lib/schema-custodian-mapping.ts @@ -260,7 +260,11 @@ function validateCustodianTypes(types: string[]): CustodianTypeCode[] { * Priority: * 1. Read from LinkML schema annotations (custodian_types) * 2. Fall back to static mapping - * 3. Default to all types (universal) + * 3. Default to EMPTY ARRAY (no types assigned) - cube will show no letters + * + * NOTE: We return [] instead of DEFAULT_CUSTODIAN_TYPES when no annotation exists + * because classes without explicit custodian_types annotations should NOT display + * all 19 letters on the cube. Only classes with explicit annotations should show letters. */ export async function getCustodianTypesForClassAsync(className: string): Promise { try { @@ -276,15 +280,23 @@ export async function getCustodianTypesForClassAsync(className: string): Promise console.warn(`[CustodianMapping] Error reading annotations for class ${className}:`, error); } - // Fall back to static mapping - return CLASS_TO_CUSTODIAN_TYPE[className] || DEFAULT_CUSTODIAN_TYPES; + // Fall back to static mapping, or empty array if no mapping exists + // Empty array means "no custodian types assigned" - cube will show no letters + return CLASS_TO_CUSTODIAN_TYPE[className] || []; } /** * Get custodian types for a schema slot (async version) + * + * Priority: + * 1. Read from slot's own LinkML schema annotations (custodian_types) + * 2. Inherit from parent class(es) that use this slot + * 3. Fall back to static mapping + * 4. Return empty array (no types assigned - cube shows no letters) */ export async function getCustodianTypesForSlotAsync(slotName: string): Promise { try { + // 1. Try slot's own annotation first const annotationTypes = await linkmlSchemaService.getSlotCustodianTypes(slotName); if (annotationTypes && annotationTypes.length > 0) { const validated = validateCustodianTypes(annotationTypes); @@ -292,15 +304,38 @@ export async function getCustodianTypesForSlotAsync(slotName: string): Promise 0) { + const inheritedTypes = new Set(); + for (const className of parentClasses) { + const classTypes = await linkmlSchemaService.getClassCustodianTypes(className); + if (classTypes && classTypes.length > 0) { + const validated = validateCustodianTypes(classTypes); + validated.forEach(t => inheritedTypes.add(t)); + } + } + if (inheritedTypes.size > 0) { + return Array.from(inheritedTypes); + } + } } catch (error) { console.warn(`[CustodianMapping] Error reading annotations for slot ${slotName}:`, error); } - return SLOT_TO_CUSTODIAN_TYPE[slotName] || DEFAULT_CUSTODIAN_TYPES; + // 3. Fall back to static mapping, or empty array if no mapping exists + // Empty array means "no custodian types assigned" - cube will show no letters + return SLOT_TO_CUSTODIAN_TYPE[slotName] || []; } /** * Get custodian types for a schema enum (async version) + * + * Priority: + * 1. Read from enum's LinkML schema annotations (custodian_types) + * 2. Fall back to static mapping + * 3. Return empty array (no types assigned - cube shows no letters) */ export async function getCustodianTypesForEnumAsync(enumName: string): Promise { try { @@ -315,5 +350,7 @@ export async function getCustodianTypesForEnumAsync(enumName: string): Promise { const mainContentRef = useRef(null); // Schema loading progress tracking - const { progress: schemaProgress, isLoading: isSchemaServiceLoading } = useSchemaLoadingProgress(); + const { progress: schemaProgress, isLoading: isSchemaServiceLoading, isComplete: isSchemaServiceComplete } = useSchemaLoadingProgress(); // Handler for filtering by custodian type (clicking polyhedron face or legend item) // Multi-select toggle behavior: clicking type adds/removes from set @@ -881,17 +881,32 @@ const LinkMLViewerPage: React.FC = () => { // Load custodian types from schema annotations when schema changes // This pre-loads types asynchronously so they're available for rendering + // IMPORTANT: Wait for schema service to complete loading before fetching custodian types + // to avoid race condition where annotations aren't available yet useEffect(() => { if (!schema) { setCustodianTypesLoaded(false); return; } + // Don't load custodian types until schema service has finished loading all class files + // This prevents the race condition where we try to read annotations before they're loaded + if (!isSchemaServiceComplete) { + console.log('[LinkMLViewerPage] Waiting for schema service to complete before loading custodian types...'); + return; + } + const loadCustodianTypes = async () => { const classes = extractClasses(schema); const slots = extractSlots(schema); const enums = extractEnums(schema); + console.log('[LinkMLViewerPage] Schema service complete, loading custodian types for', { + classes: classes.length, + slots: slots.length, + enums: enums.length + }); + // Load types for all classes in parallel const classTypesPromises = classes.map(async (cls) => { const types = await getCustodianTypesForClassAsync(cls.name); @@ -951,7 +966,7 @@ const LinkMLViewerPage: React.FC = () => { }; loadCustodianTypes(); - }, [schema]); + }, [schema, isSchemaServiceComplete]); const toggleSection = (section: string) => { setExpandedSections(prev => { diff --git a/node_modules/.modules.yaml b/node_modules/.modules.yaml index 72fe6ed40d..c2b2597015 100644 --- a/node_modules/.modules.yaml +++ b/node_modules/.modules.yaml @@ -987,7 +987,7 @@ hoistedDependencies: loose-envify: private lru-cache@11.2.4: lru-cache: private - lucide-react@0.561.0(react@19.2.3): + lucide-react@0.562.0(react@19.2.3): lucide-react: private lz-string@1.5.0: lz-string: private diff --git a/node_modules/.pnpm/lock.yaml b/node_modules/.pnpm/lock.yaml index d9bdb1092f..d74fd5f526 100644 --- a/node_modules/.pnpm/lock.yaml +++ b/node_modules/.pnpm/lock.yaml @@ -169,8 +169,8 @@ importers: specifier: ^4.17.21 version: 4.17.21 lucide-react: - specifier: ^0.561.0 - version: 0.561.0(react@19.2.3) + specifier: ^0.562.0 + version: 0.562.0(react@19.2.3) maplibre-gl: specifier: ^5.14.0 version: 5.15.0 @@ -2507,8 +2507,8 @@ packages: lru-cache@5.1.1: resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==} - lucide-react@0.561.0: - resolution: {integrity: sha512-Y59gMY38tl4/i0qewcqohPdEbieBy7SovpBL9IFebhc2mDd8x4PZSOsiFRkpPcOq6bj1r/mjH/Rk73gSlIJP2A==} + lucide-react@0.562.0: + resolution: {integrity: sha512-82hOAu7y0dbVuFfmO4bYF1XEwYk/mEbM5E+b1jgci/udUBEE/R7LF5Ip0CCEmXe8AybRM8L+04eP+LGZeDvkiw==} peerDependencies: react: ^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0 @@ -5660,7 +5660,7 @@ snapshots: dependencies: yallist: 3.1.1 - lucide-react@0.561.0(react@19.2.3): + lucide-react@0.562.0(react@19.2.3): dependencies: react: 19.2.3 diff --git a/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/LICENSE b/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/LICENSE new file mode 100644 index 0000000000..46e6962181 --- /dev/null +++ b/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/LICENSE @@ -0,0 +1,39 @@ +ISC License + +Copyright (c) for portions of Lucide are held by Cole Bemis 2013-2023 as part of Feather (MIT). All other copyright (c) for Lucide are held by Lucide Contributors 2025. + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +--- + +The MIT License (MIT) (for portions derived from Feather) + +Copyright (c) 2013-2023 Cole Bemis + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/README.md b/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/README.md new file mode 100644 index 0000000000..8d02efe968 --- /dev/null +++ b/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/README.md @@ -0,0 +1,73 @@ +

+ + Lucide icon library for React applications. + +

+ +

+Lucide icon library for React applications. +

+ +
+ + [![npm](https://img.shields.io/npm/v/lucide-react?color=blue)](https://www.npmjs.com/package/lucide-react) + ![NPM Downloads](https://img.shields.io/npm/dw/lucide-react) + [![GitHub](https://img.shields.io/github/license/lucide-icons/lucide)](https://lucide.dev/license) +
+ +

+ About + · + Icons + · + Documentation + · + License +

+ +# Lucide React + +Implementation of the lucide icon library for React applications. + +## Installation + +```sh +pnpm add lucide-react +``` + +```sh +npm install lucide-react +``` + +```sh +yarn add lucide-react +``` + +```sh +bun add lucide-react +``` + +## Documentation + +For full documentation, visit [lucide.dev](https://lucide.dev/guide/packages/lucide-react) + +## Community + +Join the [Discord server](https://discord.gg/EH6nSts) to chat with the maintainers and other users. + +## License + +Lucide is licensed under the ISC license. See [LICENSE](https://lucide.dev/license). + +## Sponsors + + + Powered by Vercel + + +DigitalOcean Referral Badge + +### Awesome backers 🍺 + +Scipress sponsor badge +pdfme sponsor badge diff --git a/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamic.mjs b/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamic.mjs new file mode 100644 index 0000000000..29e1076def --- /dev/null +++ b/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamic.mjs @@ -0,0 +1,10 @@ +/** + * @license lucide-react v0.562.0 - ISC + * + * This source code is licensed under the ISC license. + * See the LICENSE file in the root directory of this source tree. + */ + +export { default as DynamicIcon, iconNames } from './dist/esm/DynamicIcon.js'; +export { default as dynamicIconImports } from './dist/esm/dynamicIconImports.js'; +//# sourceMappingURL=dynamic.mjs.map diff --git a/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamicIconImports.mjs b/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamicIconImports.mjs new file mode 100644 index 0000000000..7a725d5b50 --- /dev/null +++ b/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamicIconImports.mjs @@ -0,0 +1 @@ +export { default } from './dist/esm/dynamicIconImports.js'; diff --git a/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/package.json b/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/package.json new file mode 100644 index 0000000000..29e9a09b18 --- /dev/null +++ b/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/package.json @@ -0,0 +1,74 @@ +{ + "name": "lucide-react", + "description": "A Lucide icon library package for React applications.", + "version": "0.562.0", + "license": "ISC", + "homepage": "https://lucide.dev", + "bugs": "https://github.com/lucide-icons/lucide/issues", + "repository": { + "type": "git", + "url": "https://github.com/lucide-icons/lucide.git", + "directory": "packages/lucide-react" + }, + "keywords": [ + "Lucide", + "React", + "Feather", + "Icons", + "Icon", + "SVG", + "Feather Icons", + "Fontawesome", + "Font Awesome" + ], + "author": "Eric Fennis", + "amdName": "lucide-react", + "main": "dist/cjs/lucide-react.js", + "main:umd": "dist/umd/lucide-react.js", + "module": "dist/esm/lucide-react.js", + "unpkg": "dist/umd/lucide-react.min.js", + "typings": "dist/lucide-react.d.ts", + "sideEffects": false, + "files": [ + "dist", + "dynamic.mjs", + "dynamic.js.map", + "dynamic.d.ts", + "dynamicIconImports.mjs", + "dynamicIconImports.js.map", + "dynamicIconImports.d.ts" + ], + "devDependencies": { + "@testing-library/jest-dom": "^6.1.6", + "@testing-library/react": "^14.1.2", + "@types/react": "^18.2.37", + "@vitejs/plugin-react": "^4.4.1", + "jest-serializer-html": "^7.1.0", + "react": "18.2.0", + "react-dom": "18.2.0", + "rollup": "^4.53.3", + "rollup-plugin-dts": "^6.2.3", + "rollup-plugin-preserve-directives": "^0.4.0", + "typescript": "^5.8.3", + "vite": "^7.2.4", + "vitest": "^4.0.12", + "@lucide/shared": "1.0.0", + "@lucide/rollup-plugins": "1.0.0", + "@lucide/build-icons": "1.1.0" + }, + "peerDependencies": { + "react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0" + }, + "scripts": { + "build": "pnpm clean && pnpm copy:license && pnpm build:icons && pnpm typecheck && pnpm build:bundles", + "copy:license": "cp ../../LICENSE ./LICENSE", + "clean": "rm -rf dist && rm -rf stats && rm -rf ./src/icons/*.ts && rm -f dynamic.* && rm -f dynamicIconImports.d.ts", + "build:icons": "build-icons --output=./src --templateSrc=./scripts/exportTemplate.mts --renderUniqueKey --withAliases --withDynamicImports --separateAliasesFile --separateAliasesFileIgnore=fingerprint --aliasesFileExtension=.ts --iconFileExtension=.ts --exportFileName=index.ts", + "build:bundles": "rollup -c ./rollup.config.mjs", + "typecheck": "tsc", + "typecheck:watch": "tsc -w", + "test": "pnpm build:icons && vitest run", + "test:watch": "vitest watch", + "version": "pnpm version --git-tag-version=false" + } +} \ No newline at end of file diff --git a/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/react b/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/react new file mode 120000 index 0000000000..01bc808511 --- /dev/null +++ b/node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/react @@ -0,0 +1 @@ +../../react@19.2.3/node_modules/react \ No newline at end of file diff --git a/node_modules/.pnpm/node_modules/lucide-react b/node_modules/.pnpm/node_modules/lucide-react index 9ba6205c2d..311e1120c7 120000 --- a/node_modules/.pnpm/node_modules/lucide-react +++ b/node_modules/.pnpm/node_modules/lucide-react @@ -1 +1 @@ -../lucide-react@0.561.0_react@19.2.3/node_modules/lucide-react \ No newline at end of file +../lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react \ No newline at end of file diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index d9bdb1092f..d74fd5f526 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -169,8 +169,8 @@ importers: specifier: ^4.17.21 version: 4.17.21 lucide-react: - specifier: ^0.561.0 - version: 0.561.0(react@19.2.3) + specifier: ^0.562.0 + version: 0.562.0(react@19.2.3) maplibre-gl: specifier: ^5.14.0 version: 5.15.0 @@ -2507,8 +2507,8 @@ packages: lru-cache@5.1.1: resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==} - lucide-react@0.561.0: - resolution: {integrity: sha512-Y59gMY38tl4/i0qewcqohPdEbieBy7SovpBL9IFebhc2mDd8x4PZSOsiFRkpPcOq6bj1r/mjH/Rk73gSlIJP2A==} + lucide-react@0.562.0: + resolution: {integrity: sha512-82hOAu7y0dbVuFfmO4bYF1XEwYk/mEbM5E+b1jgci/udUBEE/R7LF5Ip0CCEmXe8AybRM8L+04eP+LGZeDvkiw==} peerDependencies: react: ^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0 @@ -5660,7 +5660,7 @@ snapshots: dependencies: yallist: 3.1.1 - lucide-react@0.561.0(react@19.2.3): + lucide-react@0.562.0(react@19.2.3): dependencies: react: 19.2.3 diff --git a/scripts/discover_custodian_websites.py b/scripts/discover_custodian_websites.py new file mode 100644 index 0000000000..a4a69ce03b --- /dev/null +++ b/scripts/discover_custodian_websites.py @@ -0,0 +1,561 @@ +#!/usr/bin/env python3 +""" +Discover website URLs for custodian YAML files that are missing them. + +This script uses web search (via DuckDuckGo or Google) to find official websites +for heritage institutions based on their name and location. + +Search strategy: +1. Search for institution name + city + country +2. Search for institution name + "official website" +3. Search for institution name + institution type (museum, library, archive) + +Output: +- Updates custodian YAML files with discovered website URLs +- Stores provenance for discovered URLs + +Usage: + python scripts/discover_custodian_websites.py [options] + +Options: + --dry-run Show what would be discovered without modifying files + --limit N Process only first N files (for testing) + --file PATH Process a single specific file + --country CODE Filter by country code (e.g., JP, CZ) + --resume Resume from last checkpoint + +Requirements: + pip install duckduckgo-search pyyaml httpx +""" + +import argparse +import asyncio +import json +import logging +import re +import sys +import time +from datetime import datetime, timezone +from pathlib import Path +from urllib.parse import urlparse + +import yaml + +try: + from duckduckgo_search import DDGS +except ImportError: + print("Please install duckduckgo-search: pip install duckduckgo-search") + sys.exit(1) + +try: + import httpx +except ImportError: + print("Please install httpx: pip install httpx") + sys.exit(1) + +# Logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Configuration +CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian" +CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_checkpoint.json" +REQUEST_DELAY = 3.0 # seconds between searches (be nice to search engines) + +# Domain blacklist (not actual institution websites) +DOMAIN_BLACKLIST = { + 'wikipedia.org', 'wikidata.org', 'wikimedia.org', + 'facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com', + 'youtube.com', 'tiktok.com', 'pinterest.com', + 'tripadvisor.com', 'tripadvisor.jp', 'yelp.com', + 'google.com', 'google.co.jp', 'maps.google.com', + 'amazon.com', 'amazon.co.jp', 'ebay.com', + 'booking.com', 'expedia.com', 'hotels.com', + 'foursquare.com', 'bing.com', 'yahoo.com', + 'findagrave.com', 'ancestry.com', 'familysearch.org', + 'academia.edu', 'researchgate.net', + 'timeanddate.com', 'weather.com', +} + +# Domain preferences (prefer these TLDs for official sites) +PREFERRED_TLDS = { + 'JP': ['.go.jp', '.lg.jp', '.ac.jp', '.or.jp', '.jp'], + 'CZ': ['.cz', '.gov.cz'], + 'NL': ['.nl', '.gov.nl'], + 'BE': ['.be', '.gov.be'], + 'DE': ['.de', '.gov.de'], + 'AT': ['.at', '.gv.at'], + 'CH': ['.ch', '.admin.ch'], +} + + +def get_custodian_name(entry: dict) -> str | None: + """Extract institution name from entry.""" + # Priority 1: Emic name (native language official name) + if entry.get('custodian_name', {}).get('emic_name'): + return entry['custodian_name']['emic_name'] + + # Priority 2: Wikidata native language label (ja, zh, ko, etc.) + wikidata = entry.get('wikidata_enrichment', {}) + country = get_country_from_entry(entry) + + # Map country to preferred label language + country_lang_map = { + 'JP': 'ja', + 'CN': 'zh', + 'KR': 'ko', + 'TW': 'zh', + 'TH': 'th', + 'VN': 'vi', + 'RU': 'ru', + 'GR': 'el', + 'IL': 'he', + 'SA': 'ar', + 'IR': 'fa', + } + + if country in country_lang_map: + lang = country_lang_map[country] + native_label = wikidata.get(f'wikidata_label_{lang}') or wikidata.get('wikidata_labels', {}).get(lang) + if native_label: + return native_label + + # Priority 3: Claim value + if entry.get('custodian_name', {}).get('claim_value'): + return entry['custodian_name']['claim_value'] + + # Priority 4: Original entry name + if entry.get('original_entry', {}).get('name'): + return entry['original_entry']['name'] + + # Priority 5: Organisatie (Dutch) + if entry.get('original_entry', {}).get('organisatie'): + return entry['original_entry']['organisatie'] + + return None + + +def get_country_from_entry(entry: dict) -> str | None: + """Extract country code from entry.""" + # Check location.country + if entry.get('location', {}).get('country'): + return entry['location']['country'] + + # Check original_entry.locations + if entry.get('original_entry', {}).get('locations'): + loc = entry['original_entry']['locations'][0] + if loc.get('country'): + return loc['country'] + + return None + + +def get_location_info(entry: dict) -> dict: + """Extract location information from entry.""" + location = {} + + # Check original_entry.locations + if entry.get('original_entry', {}).get('locations'): + loc = entry['original_entry']['locations'][0] + location['city'] = loc.get('city') + location['region'] = loc.get('region') + location['country'] = loc.get('country') + location['street_address'] = loc.get('street_address') + + # Check original_entry directly + if not location.get('city'): + orig = entry.get('original_entry', {}) + location['city'] = orig.get('city') or orig.get('plaats') + location['country'] = orig.get('country') + + return location + + +def get_institution_type(entry: dict) -> str | None: + """Get institution type for search refinement.""" + inst_type = entry.get('original_entry', {}).get('institution_type') + if inst_type: + type_map = { + 'LIBRARY': 'library', + 'MUSEUM': 'museum', + 'ARCHIVE': 'archive', + 'GALLERY': 'gallery', + 'RESEARCH_CENTER': 'research center', + 'EDUCATION_PROVIDER': 'university', + } + return type_map.get(inst_type) + return None + + +def has_website(entry: dict) -> bool: + """Check if entry already has a website.""" + # Check various website fields + if entry.get('original_entry', {}).get('webadres_organisatie'): + return True + + # Check identifiers + for ident in entry.get('original_entry', {}).get('identifiers', []): + if ident.get('identifier_scheme') == 'Website': + return True + + # Check enrichment fields + if entry.get('website_discovery', {}).get('website_url'): + return True + if entry.get('wikidata_enrichment', {}).get('wikidata_official_website'): + return True + if entry.get('google_maps_enrichment', {}).get('website'): + return True + + return False + + +def is_valid_website(url: str, country: str | None = None) -> bool: + """Check if URL is a valid institutional website.""" + if not url: + return False + + try: + parsed = urlparse(url) + domain = parsed.netloc.lower() + + # Remove www prefix + if domain.startswith('www.'): + domain = domain[4:] + + # Check blacklist + for blacklisted in DOMAIN_BLACKLIST: + if blacklisted in domain: + return False + + return True + except Exception: + return False + + +def score_website(url: str, country: str, name: str) -> int: + """Score a website URL based on likelihood of being official site.""" + score = 0 + + try: + parsed = urlparse(url) + domain = parsed.netloc.lower() + + # Prefer country-specific TLDs + preferred = PREFERRED_TLDS.get(country, []) + for i, tld in enumerate(preferred): + if domain.endswith(tld): + score += (len(preferred) - i) * 10 + break + + # Prefer HTTPS + if parsed.scheme == 'https': + score += 5 + + # Prefer shorter paths (homepage vs deep link) + path_depth = len([p for p in parsed.path.split('/') if p]) + score -= path_depth * 2 + + # Check if institution name words appear in domain + name_words = set(re.findall(r'\w+', name.lower())) + domain_words = set(re.findall(r'\w+', domain)) + common_words = name_words & domain_words + score += len(common_words) * 5 + + except Exception: + pass + + return score + + +def search_for_website(name: str, location: dict, inst_type: str | None = None) -> list[dict]: + """Search for institution website using DuckDuckGo.""" + results = [] + + # Build search queries + queries = [] + + city = location.get('city', '') + country = location.get('country', '') + + # Primary query: name + city + if city: + queries.append(f'"{name}" {city}') + + # Secondary query: name + country + institution type + if inst_type: + queries.append(f'"{name}" {country} {inst_type} official') + + # Tertiary: just the name with "official website" + queries.append(f'"{name}" official website') + + ddgs = DDGS() + + for query in queries[:2]: # Limit to 2 queries per institution + try: + search_results = list(ddgs.text(query, max_results=5)) + + for r in search_results: + url = r.get('href') or r.get('url') + if url and is_valid_website(url, country): + results.append({ + 'url': url, + 'title': r.get('title', ''), + 'snippet': r.get('body', ''), + 'query': query, + 'score': score_website(url, country, name) + }) + + time.sleep(1) # Rate limit between queries + + except Exception as e: + logger.warning(f"Search error for '{query}': {e}") + time.sleep(2) + + # Sort by score and deduplicate + seen_domains = set() + unique_results = [] + for r in sorted(results, key=lambda x: -x['score']): + domain = urlparse(r['url']).netloc.lower() + if domain not in seen_domains: + seen_domains.add(domain) + unique_results.append(r) + + return unique_results[:3] # Return top 3 unique results + + +async def verify_website(url: str) -> dict: + """Verify that a website is accessible and get basic info.""" + result = { + 'accessible': False, + 'final_url': url, + 'status_code': None, + 'title': None, + } + + try: + async with httpx.AsyncClient(follow_redirects=True, timeout=15.0) as client: + response = await client.get(url) + result['accessible'] = response.status_code == 200 + result['status_code'] = response.status_code + result['final_url'] = str(response.url) + + # Extract title + if result['accessible']: + match = re.search(r']*>([^<]+)', response.text, re.I) + if match: + result['title'] = match.group(1).strip() + + except Exception as e: + logger.debug(f"Failed to verify {url}: {e}") + + return result + + +def load_checkpoint() -> dict: + """Load progress checkpoint.""" + if CHECKPOINT_FILE.exists(): + with open(CHECKPOINT_FILE, 'r') as f: + return json.load(f) + return {'processed_files': [], 'found_count': 0, 'not_found_count': 0} + + +def save_checkpoint(checkpoint: dict): + """Save progress checkpoint.""" + with open(CHECKPOINT_FILE, 'w') as f: + json.dump(checkpoint, f, indent=2) + + +def update_custodian_file(filepath: Path, website_url: str, discovery_info: dict) -> bool: + """Update custodian YAML file with discovered website.""" + try: + with open(filepath, 'r', encoding='utf-8') as f: + entry = yaml.safe_load(f) + + if not entry: + return False + + # Add website discovery section + entry['website_discovery'] = { + 'website_url': website_url, + 'discovery_date': datetime.now(timezone.utc).isoformat(), + 'discovery_method': 'duckduckgo_search', + 'search_query': discovery_info.get('query', ''), + 'confidence_score': min(discovery_info.get('score', 0) / 50, 1.0), # Normalize to 0-1 + 'verification': { + 'accessible': discovery_info.get('verification', {}).get('accessible', False), + 'page_title': discovery_info.get('verification', {}).get('title'), + 'final_url': discovery_info.get('verification', {}).get('final_url'), + } + } + + with open(filepath, 'w', encoding='utf-8') as f: + yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + return True + + except Exception as e: + logger.error(f"Failed to update {filepath}: {e}") + return False + + +async def process_file(filepath: Path, dry_run: bool = False) -> dict: + """Process a single custodian file.""" + result = { + 'filename': filepath.name, + 'status': 'skipped', + 'website': None, + } + + try: + with open(filepath, 'r', encoding='utf-8') as f: + entry = yaml.safe_load(f) + + if not entry: + result['status'] = 'empty' + return result + + # Skip if already has website + if has_website(entry): + result['status'] = 'has_website' + return result + + # Get institution info + name = get_custodian_name(entry) + if not name: + result['status'] = 'no_name' + return result + + location = get_location_info(entry) + inst_type = get_institution_type(entry) + country = location.get('country', filepath.name[:2]) + + logger.info(f"Searching for: {name} ({location.get('city', 'unknown city')}, {country})") + + # Search for website + search_results = search_for_website(name, location, inst_type) + + if not search_results: + result['status'] = 'not_found' + return result + + # Verify top result + best = search_results[0] + verification = await verify_website(best['url']) + best['verification'] = verification + + if verification['accessible']: + result['website'] = verification['final_url'] + result['status'] = 'found' + result['discovery_info'] = best + + if not dry_run: + update_custodian_file(filepath, verification['final_url'], best) + logger.info(f" → Found: {verification['final_url']}") + else: + # Try second result if first is inaccessible + if len(search_results) > 1: + second = search_results[1] + verification2 = await verify_website(second['url']) + if verification2['accessible']: + second['verification'] = verification2 + result['website'] = verification2['final_url'] + result['status'] = 'found' + result['discovery_info'] = second + + if not dry_run: + update_custodian_file(filepath, verification2['final_url'], second) + logger.info(f" → Found (2nd): {verification2['final_url']}") + else: + result['status'] = 'inaccessible' + else: + result['status'] = 'inaccessible' + + except Exception as e: + result['status'] = 'error' + result['error'] = str(e) + logger.error(f"Error processing {filepath}: {e}") + + return result + + +async def main(): + parser = argparse.ArgumentParser(description='Discover websites for custodian files') + parser.add_argument('--dry-run', action='store_true', help='Show what would be discovered') + parser.add_argument('--limit', type=int, help='Process only first N files') + parser.add_argument('--file', type=str, help='Process a single specific file') + parser.add_argument('--country', type=str, help='Filter by country code (e.g., JP, CZ)') + parser.add_argument('--resume', action='store_true', help='Resume from checkpoint') + + args = parser.parse_args() + + # Get files to process + if args.file: + files = [Path(args.file)] + else: + pattern = f"{args.country}-*.yaml" if args.country else "*.yaml" + files = sorted(CUSTODIAN_DIR.glob(pattern)) + + # Filter out non-custodian files + files = [f for f in files if f.name[0].isupper() and '-' in f.name] + + # Load checkpoint + checkpoint = load_checkpoint() if args.resume else {'processed_files': [], 'found_count': 0, 'not_found_count': 0} + processed_set = set(checkpoint['processed_files']) + + if args.resume: + files = [f for f in files if f.name not in processed_set] + logger.info(f"Resuming: {len(processed_set)} files already processed, {len(files)} remaining") + + # Apply limit + if args.limit: + files = files[:args.limit] + + logger.info(f"Processing {len(files)} custodian files...") + + # Process files + found_count = checkpoint.get('found_count', 0) + not_found_count = checkpoint.get('not_found_count', 0) + + for i, filepath in enumerate(files): + result = await process_file(filepath, args.dry_run) + + # Update counts + if result['status'] == 'found': + found_count += 1 + elif result['status'] in ('not_found', 'inaccessible'): + not_found_count += 1 + + # Update checkpoint + if not args.dry_run: + checkpoint['processed_files'].append(filepath.name) + checkpoint['found_count'] = found_count + checkpoint['not_found_count'] = not_found_count + + if (i + 1) % 10 == 0: + save_checkpoint(checkpoint) + + # Progress update + if (i + 1) % 10 == 0: + logger.info(f"Progress: {i + 1}/{len(files)} - Found: {found_count}, Not found: {not_found_count}") + + # Rate limiting + time.sleep(REQUEST_DELAY) + + # Final checkpoint save + if not args.dry_run: + save_checkpoint(checkpoint) + + # Summary + logger.info(f"\n{'='*50}") + logger.info(f"Discovery complete!") + logger.info(f" Files processed: {len(files)}") + logger.info(f" Websites found: {found_count}") + logger.info(f" Not found: {not_found_count}") + logger.info(f"{'='*50}") + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/scripts/discover_websites_crawl4ai.py b/scripts/discover_websites_crawl4ai.py new file mode 100644 index 0000000000..26551ea1e7 --- /dev/null +++ b/scripts/discover_websites_crawl4ai.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +""" +Simplified Website Discovery for Custodians using crawl4ai. +Discovers websites by: +1. Searching DuckDuckGo +2. Verifying with crawl4ai +3. Updating YAML files with discovered URLs +""" +import asyncio +import httpx +import json +import logging +import re +import sys +from datetime import datetime, timezone +from pathlib import Path +from urllib.parse import urljoin, urlparse +import yaml + +# Logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Configuration +CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian" +CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_crawl4ai_checkpoint.json" +REQUEST_DELAY = 3.0 # seconds between requests +DUCKDUCKGO_SEARCH = "https://duckduckgo.com/html/?q=" + +async def discover_websites(name, city, country): + """Search DuckDuckGo and verify websites.""" + logger.info(f"Searching for: {name}") + + # Simple search - use .format() to avoid f-string issues + city_part = f" {city}" if city else "" + query = f"{name}{city_part}" if city_part else f"{name}" + + # Search DuckDuckGo + search_url = f"{DUCKDUCKGO_SEARCH}{query.replace(' ', '+')}" + + try: + async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client: + response = await client.get(search_url) + if response.status_code not in [200, 202]: + logger.warning(f"Search failed: {response.status_code}") + return None + + html = response.text + links = [] + for match in re.finditer(r']+href="([^"]+)"[^"]*"([^"]*")\s*>([^<]+)', html, re.I): + href = match.group(1).replace('&', '&').replace('<', '<').replace('>', '>') + if href: + links.append({'url': href, 'title': match.group(3)}) + + if not links: + logger.info(f"No results found") + return None + + logger.info(f"Found {len(links)} candidates, verifying...") + + verified = [] + for link in sorted(links, key=lambda x: len(x['title'])): + try: + async with httpx.AsyncClient(timeout=15.0) as client: + verify_response = await client.get(link['url']) + if verify_response.status_code == 200: + logger.info(f"Verified: {link['url']}") + verified.append({ + 'url': link['url'], + 'title': link['title'], + 'status': 'found' + }) + else: + logger.debug(f"Verification failed for {link['url']}") + except Exception: + logger.debug(f"Verification error for {link['url']}") + + if verified: + best = verified[0] + logger.info(f"Best candidate: {best['url']}") + return { + 'status': 'found', + 'message': f"Discovered and verified: {best['url']}", + 'website_url': best['url'], + 'title': best.get('title'), + } + else: + logger.info(f"No valid websites found") + return { + 'status': 'not_found', + 'message': 'No valid results found' + } + + except Exception as e: + logger.error(f"Search error: {e}") + return None + +def update_custodian_file(filepath, website_url, title): + """Update custodian YAML file with discovered website.""" + try: + with open(filepath, 'r', encoding='utf-8') as f: + entry = yaml.safe_load(f) + if not entry: + logger.error(f"Invalid file: {filepath}") + return False + + # Add website discovery section + entry['website_discovery'] = { + 'website_url': website_url, + 'discovery_date': datetime.now(timezone.utc).isoformat(), + 'discovery_method': 'crawl4ai_search_and_verify', + 'title': title, + 'confidence_score': 0.0, # Will be updated if verification succeeds + } + + with open(filepath, 'w', encoding='utf-8') as f: + yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False) + + logger.info(f"Updated: {filepath}") + return True + except Exception as e: + logger.error(f"Failed to update {filepath}: {e}") + return False + +async def main(): + files = sorted(CUSTODIAN_DIR.glob("JP-*.yaml"))[:1] # Test with 1 file + + logger.info(f"Processing {len(files)} custodian files...") + + for filepath in files: + name = Path(filepath).stem.replace('_', ' ') + logger.info(f"Processing: {name}") + + url = await discover_websites(name, None, 'JP') + + if url: + website_url = url.get('website_url') or url.get('url') + title = url.get('title') + if update_custodian_file(filepath, website_url, title): + logger.info(f" → Discovered: {website_url}") + else: + logger.info(f"No website found") + + logger.info("Done!") + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/scripts/enrich_custodian_logos_crawl4ai.py b/scripts/enrich_custodian_logos_crawl4ai.py index 06004cb861..e45a770cbd 100644 --- a/scripts/enrich_custodian_logos_crawl4ai.py +++ b/scripts/enrich_custodian_logos_crawl4ai.py @@ -75,19 +75,26 @@ REQUEST_DELAY = 2.0 # seconds between requests def get_website_url(entry: dict) -> str | None: """Extract website URL from custodian entry.""" - # Priority 1: Original entry webadres + # Priority 1: Original entry webadres (Dutch ISIL format) if entry.get('original_entry', {}).get('webadres_organisatie'): url = entry['original_entry']['webadres_organisatie'] if url and url.strip() and url.strip().lower() not in ('null', 'none', ''): return normalize_url(url.strip()) - # Priority 2: Museum register website + # Priority 2: Website in identifiers array (Czech ISIL and ARON format) + for ident in entry.get('original_entry', {}).get('identifiers', []): + if ident.get('identifier_scheme') == 'Website': + url = ident.get('identifier_value') or ident.get('identifier_url') + if url and url.strip(): + return normalize_url(url.strip()) + + # Priority 3: Museum register website if entry.get('museum_register_enrichment', {}).get('website_url'): url = entry['museum_register_enrichment']['website_url'] if url and url.strip(): return normalize_url(url.strip()) - # Priority 3: Wikidata official website + # Priority 4: Wikidata official website if entry.get('wikidata_enrichment', {}).get('wikidata_official_website'): url = entry['wikidata_enrichment']['wikidata_official_website'] # Handle list of URLs (take first one) @@ -96,13 +103,13 @@ def get_website_url(entry: dict) -> str | None: if url and isinstance(url, str) and url.strip(): return normalize_url(url.strip()) - # Priority 4: Google Maps website + # Priority 5: Google Maps website if entry.get('google_maps_enrichment', {}).get('website'): url = entry['google_maps_enrichment']['website'] if url and url.strip(): return normalize_url(url.strip()) - # Priority 5: Web enrichment source URL + # Priority 6: Web enrichment source URL if entry.get('web_enrichment', {}).get('source_url'): url = entry['web_enrichment']['source_url'] if url and url.strip(): diff --git a/scripts/index_persons_qdrant.py b/scripts/index_persons_qdrant.py index 63df6c221e..34cac9acc8 100644 --- a/scripts/index_persons_qdrant.py +++ b/scripts/index_persons_qdrant.py @@ -54,9 +54,22 @@ def extract_person_text(data: dict[str, Any]) -> str: parts = [] profile = data.get("profile_data", {}) + person = data.get("person", {}) + source_staff = data.get("source_staff_info", {}) + extraction = data.get("extraction_metadata", {}) - # Full name (primary identifier) - name = profile.get("full_name", "") + # Full name - check ALL possible locations in order of priority + name = ( + profile.get("full_name") or + profile.get("name") or + person.get("full_name") or + person.get("name") or + source_staff.get("name") or + source_staff.get("person_name") or + extraction.get("person_name") or + data.get("name") or + "" + ) if name: parts.append(f"Name: {name}") @@ -259,13 +272,21 @@ def extract_metadata(data: dict[str, Any], filepath: Path) -> dict[str, Any]: } profile = data.get("profile_data", {}) + person = data.get("person", {}) + source_staff = data.get("source_staff_info", {}) extraction = data.get("extraction_metadata", {}) - # Full name - check multiple possible field names + # Full name - check ALL possible field names (same as extract_person_text) name = ( - profile.get("name", "") or - profile.get("full_name", "") or - data.get("name", "") + profile.get("full_name") or + profile.get("name") or + person.get("full_name") or + person.get("name") or + source_staff.get("name") or + source_staff.get("person_name") or + extraction.get("person_name") or + data.get("name") or + "" ) if name: metadata["name"] = name @@ -414,16 +435,19 @@ def find_person_files(data_dir: Path) -> list[Path]: class PersonRetriever: - """Qdrant retriever specifically for person entities.""" + """Qdrant retriever specifically for person entities. + + Uses MiniLM (384-dim) embeddings by default for consistency with + the hybrid_retriever.py query-time embedding model. + """ def __init__( self, host: str = "localhost", port: int = 6333, collection_name: str = "heritage_persons", - embedding_model: str = "text-embedding-3-small", - embedding_dim: int = 1536, - api_key: str | None = None, + embedding_model: str = "all-MiniLM-L6-v2", # MiniLM for local embeddings + embedding_dim: int = 384, # MiniLM output dimension url: str | None = None, https: bool = False, prefix: str | None = None, @@ -434,7 +458,7 @@ class PersonRetriever: self.collection_name = collection_name self.embedding_model = embedding_model self.embedding_dim = embedding_dim - self.api_key = api_key or os.getenv("OPENAI_API_KEY") + # MiniLM model runs locally, no API key needed # Initialize Qdrant client if url: @@ -451,25 +475,23 @@ class PersonRetriever: else: self.client = QdrantClient(host=host, port=port, timeout=60) - self._openai_client = None + self._sentence_model = None @property - def openai_client(self): - """Lazy-load OpenAI client.""" - if self._openai_client is None: - import openai - self._openai_client = openai.OpenAI(api_key=self.api_key) - return self._openai_client + def sentence_model(self): + """Lazy-load SentenceTransformer model.""" + if self._sentence_model is None: + from sentence_transformers import SentenceTransformer + logger.info(f"Loading embedding model: {self.embedding_model}") + self._sentence_model = SentenceTransformer(self.embedding_model) + return self._sentence_model def _get_embeddings_batch(self, texts: list[str]) -> list[list[float]]: - """Get embedding vectors for multiple texts.""" + """Get embedding vectors for multiple texts using MiniLM.""" if not texts: return [] - response = self.openai_client.embeddings.create( - input=texts, - model=self.embedding_model - ) - return [item.embedding for item in sorted(response.data, key=lambda x: x.index)] + embeddings = self.sentence_model.encode(texts, show_progress_bar=False) + return embeddings.tolist() def ensure_collection(self) -> None: """Ensure the collection exists, create if not.""" @@ -655,10 +677,7 @@ def main(): logger.info(f" Metadata: {list(doc['metadata'].keys())}") sys.exit(0) - # Check for OpenAI API key - if not os.getenv("OPENAI_API_KEY"): - logger.error("OPENAI_API_KEY environment variable is required for embeddings") - sys.exit(1) + # Note: MiniLM model runs locally, no API key needed # Create retriever if args.url: diff --git a/src/glam_extractor/api/hybrid_retriever.py b/src/glam_extractor/api/hybrid_retriever.py index ead40947fb..4191442599 100644 --- a/src/glam_extractor/api/hybrid_retriever.py +++ b/src/glam_extractor/api/hybrid_retriever.py @@ -36,6 +36,8 @@ Example usage: import hashlib import logging import os +import time +from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field from typing import Any, TYPE_CHECKING @@ -128,9 +130,150 @@ class RetrievedInstitution: } +# =================================================================== +# Linked Data URI Generation Utilities +# =================================================================== +# Generate stable ontology-aligned URIs for Person and PersonObservation +# following the LinkML schema at schemas/20251121/linkml/ +# Namespace: https://nde.nl/ontology/hc/ +# =================================================================== + +import re +import unicodedata + +# Ontology namespaces +ONTOLOGY_BASE = "https://nde.nl/ontology/hc" +PERSON_HUB_PREFIX = f"{ONTOLOGY_BASE}/person" +PERSON_OBS_PREFIX = f"{ONTOLOGY_BASE}/person-obs" +CUSTODIAN_PREFIX = f"{ONTOLOGY_BASE}/custodian" + +# JSON-LD context for person search responses +PERSON_JSONLD_CONTEXT = { + "@vocab": f"{ONTOLOGY_BASE}/", + "schema": "http://schema.org/", + "pico": "https://personsincontext.org/model#", + "prov": "http://www.w3.org/ns/prov#", + "foaf": "http://xmlns.com/foaf/0.1/", + "name": "schema:name", + "jobTitle": "schema:jobTitle", + "affiliation": "schema:affiliation", + "sameAs": "schema:sameAs", + "refers_to_person": "pico:observationOf", + "observation_source": "prov:hadPrimarySource", +} + + +def generate_slug(text: str) -> str: + """Generate URL-safe slug from text. + + Examples: + "Kitty Bogte" → "kitty-bogte" + "Dr. Jane Smith" → "dr-jane-smith" + "Taco Dibbits" → "taco-dibbits" + """ + if not text: + return "unknown" + + # Normalize unicode (NFD decomposition) and remove diacritics + normalized = unicodedata.normalize('NFD', text) + ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn') + + # Convert to lowercase + lowercase = ascii_text.lower() + + # Replace non-alphanumeric with hyphens + slug = re.sub(r'[^a-z0-9]+', '-', lowercase) + + # Collapse multiple hyphens and strip leading/trailing + slug = re.sub(r'-+', '-', slug).strip('-') + + return slug or "unknown" + + +def generate_role_slug(headline: str | None) -> str: + """Generate role slug from job title/headline. + + Examples: + "Programmer/curator" → "programmer-curator" + "Senior Archivist" → "senior-archivist" + None → "staff" + """ + if not headline: + return "staff" + return generate_slug(headline) + + +def generate_person_hub_uri(name: str, linkedin_slug: str | None = None) -> str: + """Generate Person hub URI (abstract identity). + + Format: https://nde.nl/ontology/hc/person/{person-slug} + + Uses LinkedIn slug if available for stability, otherwise derives from name. + + Examples: + generate_person_hub_uri("Kitty Bogte", "kittybogte") + → "https://nde.nl/ontology/hc/person/kittybogte" + generate_person_hub_uri("Dr. Jane Smith") + → "https://nde.nl/ontology/hc/person/dr-jane-smith" + """ + if linkedin_slug: + slug = linkedin_slug + else: + slug = generate_slug(name) + + return f"{PERSON_HUB_PREFIX}/{slug}" + + +def generate_observation_uri( + custodian_slug: str | None, + person_name: str, + role_slug: str | None = None, + linkedin_slug: str | None = None +) -> str: + """Generate PersonObservation URI. + + Format: https://nde.nl/ontology/hc/person-obs/{custodian-slug}/{person-slug}/{role-slug} + + Examples: + generate_observation_uri("nl-ga-nationaal-archief", "Kitty Bogte", "programmer-curator") + → "https://nde.nl/ontology/hc/person-obs/nl-ga-nationaal-archief/kitty-bogte/programmer-curator" + """ + custodian = custodian_slug or "unknown-custodian" + person = linkedin_slug or generate_slug(person_name) + role = role_slug or "staff" + + return f"{PERSON_OBS_PREFIX}/{custodian}/{person}/{role}" + + +def generate_custodian_uri(custodian_slug: str | None, ghcid: str | None = None) -> str | None: + """Generate Custodian URI. + + Format: https://nde.nl/ontology/hc/custodian/{ghcid-or-slug} + """ + if ghcid: + return f"{CUSTODIAN_PREFIX}/{ghcid}" + elif custodian_slug: + return f"{CUSTODIAN_PREFIX}/{custodian_slug}" + return None + + +def extract_linkedin_slug(linkedin_url: str | None) -> str | None: + """Extract slug from LinkedIn URL. + + Examples: + "https://www.linkedin.com/in/kittybogte" → "kittybogte" + "https://linkedin.com/in/jane-smith-12345" → "jane-smith-12345" + """ + if not linkedin_url: + return None + + match = re.search(r'linkedin\.com/in/([^/?]+)', linkedin_url) + return match.group(1) if match else None + + @dataclass class RetrievedPerson: - """A retrieved person/staff member with search scores.""" + """A retrieved person/staff member with search scores and linked data URIs.""" person_id: str name: str @@ -148,9 +291,42 @@ class RetrievedPerson: source_type: str | None = None # "staff_list" or "entity_profile" linkedin_url: str | None = None - def to_dict(self) -> dict[str, Any]: - """Convert to dictionary for API responses.""" - return { + # Linked data fields (generated) + linkedin_profile_path: str | None = None # Path to entity JSON file + + @property + def linkedin_slug(self) -> str | None: + """Extract LinkedIn slug from URL.""" + return extract_linkedin_slug(self.linkedin_url) + + @property + def person_hub_uri(self) -> str: + """Generate Person hub URI (abstract identity).""" + return generate_person_hub_uri(self.name, self.linkedin_slug) + + @property + def observation_uri(self) -> str: + """Generate PersonObservation URI.""" + role_slug = generate_role_slug(self.headline) + return generate_observation_uri( + self.custodian_slug, + self.name, + role_slug, + self.linkedin_slug + ) + + @property + def custodian_uri(self) -> str | None: + """Generate Custodian URI.""" + return generate_custodian_uri(self.custodian_slug) + + def to_dict(self, include_jsonld: bool = True) -> dict[str, Any]: + """Convert to dictionary for API responses. + + Args: + include_jsonld: If True, include JSON-LD linked data fields (@id, @type, etc.) + """ + result = { "person_id": self.person_id, "name": self.name, "scores": { @@ -169,6 +345,26 @@ class RetrievedPerson: "linkedin_url": self.linkedin_url, } } + + if include_jsonld: + # Add JSON-LD linked data fields + result["@id"] = self.observation_uri + result["@type"] = "pico:PersonObservation" + result["refers_to_person"] = self.person_hub_uri + + # Add custodian affiliation if available + if self.custodian_uri: + result["unit_affiliation"] = self.custodian_uri + + # Add schema:sameAs for LinkedIn URL + if self.linkedin_url: + result["schema:sameAs"] = self.linkedin_url + + # Add linkedin_profile_path if available + if self.linkedin_profile_path: + result["linkedin_profile_path"] = self.linkedin_profile_path + + return result # Query type detection patterns @@ -254,6 +450,103 @@ def get_province_code(province_name: str | None) -> str | None: return None return DUTCH_PROVINCE_CODES.get(province_name.lower().strip()) +def looks_like_person_name(query: str) -> bool: + """Detect if query looks like a person's name for name-boosted search. + + A query looks like a person name if it: + - Contains 2-4 capitalized words (first/last name pattern) + - Does NOT contain common non-name words (institutions, locations, etc.) + - Does NOT contain question words (who, what, where, etc.) + + Args: + query: Search query string + + Returns: + True if query appears to be a person name + + Examples: + >>> looks_like_person_name("Kitty Bogte") + True + >>> looks_like_person_name("Who works at the Rijksmuseum?") + False + >>> looks_like_person_name("archivist at Nationaal Archief") + False + """ + # Skip if query contains question words or common phrases + non_name_indicators = [ + # Question words + "who", "what", "where", "which", "how", "why", + "wie", "wat", "waar", "welk", "hoe", "waarom", + # Role/job indicators + "works at", "working at", "werkt bij", "werkzaam", + "archivist", "curator", "director", "librarian", + "archivaris", "directeur", "bibliothecaris", + # Prepositions indicating context + " at ", " in ", " of ", " for ", " the ", + " bij ", " in ", " van ", " voor ", " de ", " het ", + # Punctuation that indicates non-name queries + "?", "!", + ] + + query_lower = query.lower() + for indicator in non_name_indicators: + if indicator in query_lower: + return False + + # Check for capitalized word pattern (typical of names) + words = query.strip().split() + if len(words) < 2 or len(words) > 4: + return False + + # Check if words look like name components (capitalized or all letters) + capitalized_count = sum(1 for w in words if w[0].isupper() and w.isalpha()) + + # Most name words should be capitalized + return capitalized_count >= len(words) - 1 # Allow one lowercase (e.g., "van", "de") + + +def calculate_name_match_boost(query: str, name: str) -> float: + """Calculate a score boost for name matching. + + Uses case-insensitive substring matching to boost results where + the query matches part or all of the person's name. + + Args: + query: Search query (potential name) + name: Person's name from search result + + Returns: + Boost factor (1.0 = no boost, >1.0 = boosted) + - 3.0: Exact match (case-insensitive) + - 2.5: Query contains full name or name contains full query + - 2.0: Partial match (first or last name matches) + - 1.0: No match + """ + query_lower = query.lower().strip() + name_lower = name.lower().strip() + + # Exact match + if query_lower == name_lower: + return 3.0 + + # Query is substring of name or vice versa + if query_lower in name_lower or name_lower in query_lower: + return 2.5 + + # Check for partial matches (first or last name) + query_parts = set(query_lower.split()) + name_parts = set(name_lower.split()) + + # How many query parts match name parts? + matching_parts = query_parts & name_parts + if matching_parts: + # More matching parts = higher boost + match_ratio = len(matching_parts) / max(len(query_parts), len(name_parts)) + return 1.0 + match_ratio # 1.5-2.0 range for partial matches + + return 1.0 # No boost + + def detect_query_type(query: str, dspy_entity_type: str | None = None) -> str: """Detect if query is about institutions or persons. @@ -529,29 +822,43 @@ class SPARQLClient: def __init__( self, endpoint: str = DEFAULT_SPARQL_ENDPOINT, - timeout: float = DEFAULT_SPARQL_TIMEOUT + timeout: float = DEFAULT_SPARQL_TIMEOUT, + max_connections: int = 20 # Allow concurrent connections for parallel queries ): self.endpoint = endpoint self.timeout = timeout + self.max_connections = max_connections self._client: httpx.Client | None = None @property def client(self) -> httpx.Client: - """Lazy-initialize HTTP client.""" + """Lazy-initialize HTTP client with connection pooling.""" if self._client is None: - self._client = httpx.Client(timeout=self.timeout) + # Configure connection pool for parallel SPARQL queries + limits = httpx.Limits( + max_keepalive_connections=self.max_connections, + max_connections=self.max_connections, + keepalive_expiry=30.0 # Keep connections alive for reuse + ) + self._client = httpx.Client( + timeout=self.timeout, + limits=limits, + http2=False # HTTP/1.1 is often faster for small queries + ) return self._client - def query(self, sparql: str) -> list[dict[str, Any]]: + def query(self, sparql: str, log_timing: bool = False) -> list[dict[str, Any]]: """Execute SPARQL query and return results. Args: sparql: SPARQL query string + log_timing: Whether to log query execution time Returns: List of result bindings as dictionaries """ full_query = SPARQL_PREFIXES + sparql + start_time = time.time() if log_timing else 0 try: response = self.client.post( @@ -572,6 +879,10 @@ class SPARQLClient: row[key] = value.get("value", "") results.append(row) + if log_timing: + duration_ms = (time.time() - start_time) * 1000 + logger.debug(f"SPARQL query completed: {len(results)} results in {duration_ms:.0f}ms") + return results except httpx.HTTPError as e: @@ -1060,9 +1371,220 @@ class HybridRetriever: # Return up to k results return filtered[:k] + def _build_batched_expansion_query( + self, + seed_institutions: list[RetrievedInstitution], + exclude_ghcids: set[str], + limit_per_expansion: int = 5 + ) -> tuple[str, dict[str, dict]]: + """Build a single SPARQL query with UNION clauses for all expansions. + + DEDUPLICATES by city code and type+country to avoid redundant query patterns. + For example, if 5 seeds are all from Amsterdam with type MUSEUM, we only + create ONE city expansion (for AMS) and ONE type expansion (for NL + M), + not 10 redundant UNIONs. + + Args: + seed_institutions: Seed institutions to expand from + exclude_ghcids: GHCIDs to exclude from results + limit_per_expansion: Max results per expansion type + + Returns: + Tuple of (SPARQL query string, expansion_metadata dict) + expansion_metadata maps expansion_key -> {seed, type, city/type_code} + """ + unions = [] + expansion_metadata = {} + + # Track unique patterns to avoid duplicate queries + seen_city_codes: set[str] = set() + seen_type_patterns: set[str] = set() # "country-type_code" pattern + + seeds_to_expand = seed_institutions[:5] + city_idx = 0 + type_idx = 0 + + for seed in seeds_to_expand: + # City expansion - deduplicate by city code + if seed.city: + city_code = seed.city[:3].upper() + if city_code not in seen_city_codes: + seen_city_codes.add(city_code) + expansion_key = f"city_{city_idx}" + city_idx += 1 + unions.append(f""" + {{ + SELECT ?s ?name ?ghcid ?type ("{expansion_key}" AS ?expansion_key) WHERE {{ + ?s a hcc:Custodian ; + skos:prefLabel ?name ; + hc:ghcid ?ghcid . + FILTER(CONTAINS(?ghcid, "-{city_code}-")) + OPTIONAL {{ ?s hc:institutionType ?type }} + }} + LIMIT {limit_per_expansion + len(exclude_ghcids)} + }} + """) + expansion_metadata[expansion_key] = { + "seed": seed, + "type": "city", + "city": seed.city, + "city_code": city_code + } + + # Type expansion - deduplicate by country + type_code pattern + if seed.institution_type and seed.country: + type_code = get_custodian_type_to_heritage_code().get(seed.institution_type, "") + if type_code: + pattern_key = f"{seed.country}-{type_code}" + if pattern_key not in seen_type_patterns: + seen_type_patterns.add(pattern_key) + expansion_key = f"type_{type_idx}" + type_idx += 1 + unions.append(f""" + {{ + SELECT ?s ?name ?ghcid ?city ("{expansion_key}" AS ?expansion_key) WHERE {{ + ?s a hcc:Custodian ; + skos:prefLabel ?name ; + hc:ghcid ?ghcid . + FILTER(STRSTARTS(?ghcid, "{seed.country}-")) + FILTER(CONTAINS(?ghcid, "-{type_code}-")) + OPTIONAL {{ ?s schema:location ?city }} + }} + LIMIT {limit_per_expansion + len(exclude_ghcids)} + }} + """) + expansion_metadata[expansion_key] = { + "seed": seed, + "type": "type", + "institution_type": seed.institution_type, + "type_code": type_code, + "country": seed.country + } + + if not unions: + return "", {} + + # Log deduplication stats + logger.info(f"Batched SPARQL: {len(unions)} UNIONs (deduplicated from max {len(seeds_to_expand) * 2}). " + f"Unique cities: {seen_city_codes}, Unique types: {seen_type_patterns}") + + # Combine all unions into a single query + query = f""" + SELECT ?s ?name ?ghcid ?type ?city ?expansion_key WHERE {{ + {" UNION ".join(unions)} + }} + """ + + return query, expansion_metadata + + def _graph_expand_batched( + self, + seed_institutions: list[RetrievedInstitution] + ) -> list[RetrievedInstitution]: + """Expand seed results using a SINGLE batched SPARQL query. + + This is a significant optimization over the parallel ThreadPoolExecutor + approach. Instead of 10 HTTP requests (even in parallel), we execute + ONE SPARQL query with UNION clauses. + + Performance comparison: + - Sequential: 10 queries × ~100ms = 4+ seconds + - Parallel (ThreadPool): ~500ms-1s (limited by GIL/connection pool) + - Batched (this method): ONE query ~150-300ms + + Args: + seed_institutions: Initial vector search results + + Returns: + Additional institutions found via graph expansion + """ + start_time = time.time() + exclude_ghcids = {inst.ghcid for inst in seed_institutions} + expanded = [] + seen_ghcids = set(exclude_ghcids) + + # Build batched query + query, expansion_metadata = self._build_batched_expansion_query( + seed_institutions, exclude_ghcids, limit_per_expansion=self.k_expand + ) + + if not query: + logger.debug("No graph expansion tasks to execute") + return expanded + + # Execute single batched query + query_start = time.time() + results = self.sparql_client.query(query) + query_duration = (time.time() - query_start) * 1000 + + logger.debug(f"Batched SPARQL query: {len(results)} raw results in {query_duration:.0f}ms") + + # Group results by expansion_key + results_by_expansion: dict[str, list[dict]] = {} + for row in results: + exp_key = row.get("expansion_key", "") + if exp_key: + if exp_key not in results_by_expansion: + results_by_expansion[exp_key] = [] + results_by_expansion[exp_key].append(row) + + # Process results, filtering and creating RetrievedInstitution objects + for exp_key, rows in results_by_expansion.items(): + if exp_key not in expansion_metadata: + continue + + meta = expansion_metadata[exp_key] + seed = meta["seed"] + exp_type = meta["type"] + + count = 0 + for row in rows: + ghcid = row.get("ghcid", "") + if not ghcid or ghcid in seen_ghcids: + continue + + if count >= self.k_expand: + break + + seen_ghcids.add(ghcid) + count += 1 + + if exp_type == "city": + expanded.append(RetrievedInstitution( + ghcid=ghcid, + name=row.get("name", ""), + uri=row.get("s", ""), + graph_score=0.8, # High score for same city + institution_type=row.get("type"), + expansion_reason="same_city", + related_institutions=[seed.ghcid] + )) + elif exp_type == "type": + expanded.append(RetrievedInstitution( + ghcid=ghcid, + name=row.get("name", ""), + uri=row.get("s", ""), + graph_score=0.5, # Medium score for same type + institution_type=seed.institution_type, + city=row.get("city"), + expansion_reason="same_type", + related_institutions=[seed.ghcid] + )) + + logger.debug(f"Expansion {exp_key}: {count} results for {seed.ghcid}") + + total_time = (time.time() - start_time) * 1000 + logger.info(f"Graph expansion (batched): 1 query, {len(results)} raw results, " + f"{len(expanded)} expanded in {total_time:.0f}ms") + + return expanded + def _expand_by_city(self, city: str, exclude_ghcids: set[str], limit: int = 5) -> list[dict]: """Find other institutions in the same city via SPARQL. + Note: This method is kept for backwards compatibility and direct calls. + For batch operations, use _graph_expand_batched() instead. + Args: city: City name to search for exclude_ghcids: GHCIDs to exclude from results @@ -1078,12 +1600,12 @@ class HybridRetriever: SELECT ?s ?name ?ghcid ?type WHERE {{ ?s a hcc:Custodian ; skos:prefLabel ?name ; - ghc:ghcid ?ghcid . + hc:ghcid ?ghcid . # Match city in GHCID (format: CC-RR-CCC-T-ABBR) FILTER(CONTAINS(?ghcid, "-{city[:3].upper()}-")) - OPTIONAL {{ ?s hc:institution_type ?type }} + OPTIONAL {{ ?s hc:institutionType ?type }} }} LIMIT {limit + len(exclude_ghcids)} """ @@ -1126,13 +1648,13 @@ class HybridRetriever: SELECT ?s ?name ?ghcid ?city WHERE {{ ?s a hcc:Custodian ; skos:prefLabel ?name ; - ghc:ghcid ?ghcid . + hc:ghcid ?ghcid . # Match country and type in GHCID FILTER(STRSTARTS(?ghcid, "{country}-")) FILTER(CONTAINS(?ghcid, "-{type_code}-")) - OPTIONAL {{ ?s hc:city ?city }} + OPTIONAL {{ ?s schema:location ?city }} }} LIMIT {limit + len(exclude_ghcids)} """ @@ -1167,10 +1689,10 @@ class HybridRetriever: SELECT ?s ?name ?ghcid ?type WHERE {{ ?s a hcc:Custodian ; skos:prefLabel ?name ; - ghc:ghcid ?ghcid ; + hc:ghcid ?ghcid ; wdt:P17 wd:{wikidata_country} . - OPTIONAL {{ ?s hc:institution_type ?type }} + OPTIONAL {{ ?s hc:institutionType ?type }} }} LIMIT {limit + len(exclude_ghcids)} """ @@ -1189,59 +1711,171 @@ class HybridRetriever: def _graph_expand( self, - seed_institutions: list[RetrievedInstitution] + seed_institutions: list[RetrievedInstitution], + use_batched: bool = True ) -> list[RetrievedInstitution]: """Expand seed results using knowledge graph relationships. + By default uses batched SPARQL (single query with UNION) for best performance. + Falls back to parallel ThreadPoolExecutor if batched fails. + + Performance comparison: + - Sequential: 10 queries × ~100ms = 4+ seconds + - Parallel (ThreadPool): ~500ms-3s (limited by GIL/connection pool) + - Batched (UNION query): ONE query ~150-300ms ← DEFAULT + + Args: + seed_institutions: Initial vector search results + use_batched: If True (default), use batched SPARQL query. + If False, use parallel ThreadPoolExecutor. + + Returns: + Additional institutions found via graph expansion + """ + if use_batched: + try: + return self._graph_expand_batched(seed_institutions) + except Exception as e: + logger.warning(f"Batched graph expansion failed, falling back to parallel: {e}") + # Fall through to parallel implementation + + return self._graph_expand_parallel(seed_institutions) + + def _graph_expand_parallel( + self, + seed_institutions: list[RetrievedInstitution] + ) -> list[RetrievedInstitution]: + """Expand seed results using parallel SPARQL queries (fallback method). + + Uses ThreadPoolExecutor to parallelize SPARQL queries. This is slower than + the batched approach but serves as a fallback. + Args: seed_institutions: Initial vector search results Returns: Additional institutions found via graph expansion """ + start_time = time.time() exclude_ghcids = {inst.ghcid for inst in seed_institutions} expanded = [] seen_ghcids = set(exclude_ghcids) - for seed in seed_institutions[:5]: # Expand top 5 seeds - # Expansion 1: Same city + # Prepare all expansion tasks + # Each task is a tuple: (task_type, seed, query_params) + tasks = [] + seeds_to_expand = seed_institutions[:5] # Expand top 5 seeds + + for seed in seeds_to_expand: + # City expansion task if seed.city: - city_results = self._expand_by_city( - seed.city, seen_ghcids, limit=self.k_expand - ) - for row in city_results: - ghcid = row.get("ghcid", "") - if ghcid and ghcid not in seen_ghcids: - seen_ghcids.add(ghcid) - expanded.append(RetrievedInstitution( - ghcid=ghcid, - name=row.get("name", ""), - uri=row.get("s", ""), - graph_score=0.8, # High score for same city - institution_type=row.get("type"), - expansion_reason="same_city", - related_institutions=[seed.ghcid] - )) + tasks.append(("city", seed, {"city": seed.city})) - # Expansion 2: Same type in same country + # Type expansion task if seed.institution_type and seed.country: - type_results = self._expand_by_type( - seed.institution_type, seed.country, seen_ghcids, limit=self.k_expand - ) - for row in type_results: - ghcid = row.get("ghcid", "") - if ghcid and ghcid not in seen_ghcids: - seen_ghcids.add(ghcid) - expanded.append(RetrievedInstitution( - ghcid=ghcid, - name=row.get("name", ""), - uri=row.get("s", ""), - graph_score=0.5, # Medium score for same type - institution_type=seed.institution_type, - city=row.get("city"), - expansion_reason="same_type", - related_institutions=[seed.ghcid] - )) + tasks.append(("type", seed, { + "institution_type": seed.institution_type, + "country": seed.country + })) + + if not tasks: + logger.debug("No graph expansion tasks to execute") + return expanded + + # Execute SPARQL queries in parallel + # Use min(10, len(tasks)) workers to avoid over-parallelization + max_workers = min(10, len(tasks)) + + def execute_expansion(task): + """Execute a single expansion task and return results with metadata.""" + task_type, seed, params = task + task_start = time.time() + + try: + if task_type == "city": + results = self._expand_by_city( + params["city"], exclude_ghcids, limit=self.k_expand + ) + return { + "task_type": task_type, + "seed": seed, + "results": results, + "duration_ms": (time.time() - task_start) * 1000 + } + elif task_type == "type": + results = self._expand_by_type( + params["institution_type"], + params["country"], + exclude_ghcids, + limit=self.k_expand + ) + return { + "task_type": task_type, + "seed": seed, + "results": results, + "duration_ms": (time.time() - task_start) * 1000 + } + except Exception as e: + logger.warning(f"Graph expansion task failed: {task_type} for {seed.ghcid}: {e}") + return { + "task_type": task_type, + "seed": seed, + "results": [], + "duration_ms": (time.time() - task_start) * 1000, + "error": str(e) + } + + # Run all tasks in parallel + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(execute_expansion, task): task for task in tasks} + + for future in as_completed(futures): + result = future.result() + if result is None: + continue + + task_type = result["task_type"] + seed = result["seed"] + rows = result["results"] + duration = result.get("duration_ms", 0) + + logger.debug(f"Graph expansion {task_type} for {seed.ghcid}: " + f"{len(rows)} results in {duration:.0f}ms") + + # Process results based on task type + if task_type == "city": + for row in rows: + ghcid = row.get("ghcid", "") + if ghcid and ghcid not in seen_ghcids: + seen_ghcids.add(ghcid) + expanded.append(RetrievedInstitution( + ghcid=ghcid, + name=row.get("name", ""), + uri=row.get("s", ""), + graph_score=0.8, # High score for same city + institution_type=row.get("type"), + expansion_reason="same_city", + related_institutions=[seed.ghcid] + )) + elif task_type == "type": + for row in rows: + ghcid = row.get("ghcid", "") + if ghcid and ghcid not in seen_ghcids: + seen_ghcids.add(ghcid) + expanded.append(RetrievedInstitution( + ghcid=ghcid, + name=row.get("name", ""), + uri=row.get("s", ""), + graph_score=0.5, # Medium score for same type + institution_type=seed.institution_type, + city=row.get("city"), + expansion_reason="same_type", + related_institutions=[seed.ghcid] + )) + + total_time = (time.time() - start_time) * 1000 + logger.info(f"Graph expansion completed: {len(tasks)} queries, " + f"{len(expanded)} results in {total_time:.0f}ms (parallel)") return expanded @@ -1251,7 +1885,13 @@ class HybridRetriever: graph_results: list[RetrievedInstitution], k: int ) -> list[RetrievedInstitution]: - """Combine vector and graph results with weighted scoring. + """Combine vector and graph results with weighted scoring and graph inheritance. + + This method implements a hybrid scoring approach: + 1. Direct merge: If a graph result matches a vector result (same GHCID), + the graph_score is directly applied + 2. Graph inheritance: Vector results inherit a portion of graph scores from + related institutions found via graph expansion (same city/type) Args: vector_results: Results from vector search @@ -1261,25 +1901,95 @@ class HybridRetriever: Returns: Combined and ranked results """ + # Debug logging for investigation + logger.debug(f"Combining {len(vector_results)} vector + {len(graph_results)} graph results") + # Create lookup by GHCID for merging results_by_ghcid: dict[str, RetrievedInstitution] = {} + # Track which vector GHCIDs we have for inheritance + vector_ghcids = set() + # Add vector results for inst in vector_results: if inst.ghcid: results_by_ghcid[inst.ghcid] = inst + vector_ghcids.add(inst.ghcid) + logger.debug(f" Vector: {inst.ghcid} ({inst.name[:30] if inst.name else '?'}...) " + f"v={inst.vector_score:.3f} g={inst.graph_score:.3f}") + + # Track direct merges and inheritance candidates + direct_merges = 0 + inheritance_boosts = [] + + # Merge graph results and build inheritance map + # inheritance_map: vector_ghcid -> list of (related_ghcid, graph_score, reason) + inheritance_map: dict[str, list[tuple[str, float, str]]] = {g: [] for g in vector_ghcids} - # Merge graph results for inst in graph_results: + logger.debug(f" Graph: {inst.ghcid} ({inst.name[:30] if inst.name else '?'}...) " + f"g={inst.graph_score:.3f} reason={inst.expansion_reason} " + f"related_to={inst.related_institutions}") + if inst.ghcid in results_by_ghcid: - # Combine scores if already present + # Direct merge: graph result matches existing vector result existing = results_by_ghcid[inst.ghcid] + old_graph_score = existing.graph_score existing.graph_score = max(existing.graph_score, inst.graph_score) existing.related_institutions.extend(inst.related_institutions) if inst.expansion_reason: existing.expansion_reason = inst.expansion_reason + direct_merges += 1 + logger.debug(f" -> Direct merge! {inst.ghcid} graph_score: {old_graph_score:.3f} -> {existing.graph_score:.3f}") else: + # New institution from graph expansion results_by_ghcid[inst.ghcid] = inst + + # Build inheritance: this graph result was expanded FROM a vector result + # The related_institutions field contains the seed GHCID(s) it was expanded from + for seed_ghcid in inst.related_institutions: + if seed_ghcid in inheritance_map: + inheritance_map[seed_ghcid].append( + (inst.ghcid, inst.graph_score, inst.expansion_reason or "related") + ) + + logger.debug(f"Direct merges: {direct_merges}") + + # Apply graph score inheritance to vector results + # Vector results inherit a portion of graph scores from their related institutions + INHERITANCE_FACTOR = 0.5 # Inherit 50% of related institutions' graph scores + + for vector_ghcid, related_list in inheritance_map.items(): + if related_list and vector_ghcid in results_by_ghcid: + inst = results_by_ghcid[vector_ghcid] + + # Calculate inherited score: average of related graph scores * inheritance factor + related_scores = [score for _, score, _ in related_list] + inherited_score = (sum(related_scores) / len(related_scores)) * INHERITANCE_FACTOR + + old_graph_score = inst.graph_score + # Inherit: take max of current graph_score and inherited score + inst.graph_score = max(inst.graph_score, inherited_score) + + if inst.graph_score > old_graph_score: + # Track related institutions for context + related_ghcids = [ghcid for ghcid, _, _ in related_list] + inst.related_institutions.extend(related_ghcids[:3]) # Add up to 3 related + + inheritance_boosts.append({ + "ghcid": vector_ghcid, + "name": inst.name, + "old_graph": old_graph_score, + "new_graph": inst.graph_score, + "inherited_from": len(related_list), + "reasons": list(set(r for _, _, r in related_list)) + }) + logger.debug(f" Inheritance: {vector_ghcid} graph_score: {old_graph_score:.3f} -> " + f"{inst.graph_score:.3f} (from {len(related_list)} related institutions)") + + if inheritance_boosts: + logger.info(f"Graph inheritance applied to {len(inheritance_boosts)} vector results: " + f"{[b['ghcid'] for b in inheritance_boosts[:3]]}...") # Calculate combined scores for inst in results_by_ghcid.values(): @@ -1295,6 +2005,12 @@ class HybridRetriever: reverse=True ) + # Log top results for debugging + logger.debug(f"Top {min(5, len(ranked))} combined results:") + for i, inst in enumerate(ranked[:5]): + logger.debug(f" {i+1}. {inst.ghcid} ({inst.name[:25] if inst.name else '?'}...) " + f"combined={inst.combined_score:.3f} (v={inst.vector_score:.3f}, g={inst.graph_score:.3f})") + return ranked[:k] def _get_person_collection_vector_size(self) -> int | None: @@ -1418,8 +2134,20 @@ class HybridRetriever: richness_boost = 0.7 + 0.3 * richness_score person.combined_score = person.vector_score * richness_boost + # Apply name-matching boost for queries that look like person names + # This ensures that searching for "Kitty Bogte" returns Kitty Bogte first, + # even if vector similarity ranks other Dutch names higher + if looks_like_person_name(query) and person.name: + name_boost = calculate_name_match_boost(query, person.name) + if name_boost > 1.0: + logger.debug(f"Name match boost {name_boost}x for '{person.name}' (query: '{query}')") + person.combined_score *= name_boost + persons.append(person) + # Re-sort by combined score after name boosting + persons.sort(key=lambda p: p.combined_score, reverse=True) + return persons def search_persons( @@ -1462,9 +2190,15 @@ class HybridRetriever: logger.info(f"Person search for: {query[:50]}... (model: {using or 'auto'}, role_category: {target_role_category}, custodian_type: {target_custodian_type})") - # Over-fetch to allow for post-filtering by role category + # Over-fetch to allow for post-filtering and name boosting + # - Base multiplier: 2x for general queries + # - Role category filter: 3x (need more candidates for keyword filtering) + # - Name queries: fetch minimum 100 to ensure name boost can find exact matches + # (vector similarity often ranks similar-sounding names higher than exact matches) + is_name_query = looks_like_person_name(query) fetch_multiplier = 3 if target_role_category else 2 - results = self._person_vector_search(query, k * fetch_multiplier, using=using, filter_conditions=filter_conditions) + fetch_count = max(k * fetch_multiplier, 100 if is_name_query else 0) + results = self._person_vector_search(query, fetch_count, using=using, filter_conditions=filter_conditions) logger.info(f"Found {len(results)} person results after Qdrant filtering") # Apply role category post-filtering (keyword-based since not indexed)