enrich entries
This commit is contained in:
parent
59963c8d3f
commit
ca219340f2
24 changed files with 2304 additions and 129 deletions
292
docs/GRAPH_SCORE_INHERITANCE.md
Normal file
292
docs/GRAPH_SCORE_INHERITANCE.md
Normal file
|
|
@ -0,0 +1,292 @@
|
|||
# Graph Score Inheritance in Hybrid Retrieval
|
||||
|
||||
## Overview
|
||||
|
||||
The Heritage RAG system uses a **hybrid retrieval** approach that combines:
|
||||
1. **Vector search** (semantic similarity via embeddings)
|
||||
2. **Knowledge graph expansion** (SPARQL-based relationship discovery)
|
||||
|
||||
This document explains the **graph score inheritance** feature that ensures vector search results benefit from knowledge graph relationships.
|
||||
|
||||
## The Problem
|
||||
|
||||
Before graph score inheritance, the hybrid retrieval had a scoring gap:
|
||||
|
||||
| Result Source | Vector Score | Graph Score | Combined Score |
|
||||
|---------------|--------------|-------------|----------------|
|
||||
| Vector search results | 0.5-0.8 | **0.0** | 0.35-0.56 |
|
||||
| Graph expansion results | 0.0 | 0.5-0.8 | 0.15-0.24 |
|
||||
|
||||
**Why this happened:**
|
||||
- Vector search finds institutions semantically similar to the query
|
||||
- Graph expansion finds **different** institutions (same city/type) with different GHCIDs
|
||||
- Since GHCIDs don't match, no direct merging occurs
|
||||
- Vector results always dominate because `combined = 0.7 * vector + 0.3 * graph`
|
||||
|
||||
**Example before fix:**
|
||||
```
|
||||
Query: "Archieven in Amsterdam"
|
||||
|
||||
1. Stadsarchief Amsterdam | V:0.659 G:0.000 C:0.461
|
||||
2. Noord-Hollands Archief | V:0.675 G:0.000 C:0.472
|
||||
3. The Black Archives | V:0.636 G:0.000 C:0.445
|
||||
```
|
||||
|
||||
The graph expansion was finding related institutions in Amsterdam, but that information wasn't reflected in the scores.
|
||||
|
||||
## The Solution: Graph Score Inheritance
|
||||
|
||||
Vector results now **inherit** graph scores from related institutions found via graph expansion.
|
||||
|
||||
### How It Works
|
||||
|
||||
```
|
||||
1. Vector Search
|
||||
└── Returns: [Inst_A, Inst_B, Inst_C] with vector_scores
|
||||
|
||||
2. Graph Expansion (for top 5 vector results)
|
||||
└── For Inst_A in Amsterdam:
|
||||
└── SPARQL finds: [Inst_X, Inst_Y] also in Amsterdam
|
||||
└── These get graph_score=0.8 (same_city)
|
||||
└── They track: related_institutions=[Inst_A.ghcid]
|
||||
|
||||
3. Inheritance Calculation
|
||||
└── Inst_A inherits from [Inst_X, Inst_Y]:
|
||||
inherited_score = avg([0.8, 0.8]) * 0.5 = 0.4
|
||||
└── Inst_A.graph_score = max(0.0, 0.4) = 0.4
|
||||
|
||||
4. Combined Scoring
|
||||
└── Inst_A.combined = 0.7 * vector + 0.3 * 0.4 = higher rank!
|
||||
```
|
||||
|
||||
### Inheritance Factor
|
||||
|
||||
```python
|
||||
INHERITANCE_FACTOR = 0.5 # Inherit 50% of related institutions' graph scores
|
||||
```
|
||||
|
||||
This means:
|
||||
- Same-city institutions (graph_score=0.8) → inherited score of **0.40**
|
||||
- Same-type institutions (graph_score=0.5) → inherited score of **0.25**
|
||||
|
||||
## Implementation Details
|
||||
|
||||
### File Location
|
||||
|
||||
```
|
||||
/Users/kempersc/apps/glam/src/glam_extractor/api/hybrid_retriever.py
|
||||
```
|
||||
|
||||
### Key Method: `_combine_and_rank()`
|
||||
|
||||
Located at lines ~1539-1671, this method:
|
||||
|
||||
1. **Creates lookup by GHCID** for merging
|
||||
2. **Handles direct merges** when graph result GHCID matches vector result
|
||||
3. **Builds inheritance map** tracking which vector results each graph result was expanded from
|
||||
4. **Applies inheritance** calculating inherited scores for vector results
|
||||
5. **Computes combined scores** with the formula: `0.7 * vector + 0.3 * graph`
|
||||
|
||||
### Code Structure
|
||||
|
||||
```python
|
||||
def _combine_and_rank(
|
||||
self,
|
||||
vector_results: list[RetrievedInstitution],
|
||||
graph_results: list[RetrievedInstitution],
|
||||
k: int
|
||||
) -> list[RetrievedInstitution]:
|
||||
"""Combine vector and graph results with weighted scoring and graph inheritance."""
|
||||
|
||||
# 1. Create lookup by GHCID
|
||||
results_by_ghcid: dict[str, RetrievedInstitution] = {}
|
||||
vector_ghcids = set()
|
||||
|
||||
# 2. Add vector results
|
||||
for inst in vector_results:
|
||||
results_by_ghcid[inst.ghcid] = inst
|
||||
vector_ghcids.add(inst.ghcid)
|
||||
|
||||
# 3. Build inheritance map: vector_ghcid -> [(related_ghcid, graph_score, reason)]
|
||||
inheritance_map: dict[str, list[tuple[str, float, str]]] = {g: [] for g in vector_ghcids}
|
||||
|
||||
for inst in graph_results:
|
||||
if inst.ghcid in results_by_ghcid:
|
||||
# Direct merge
|
||||
existing = results_by_ghcid[inst.ghcid]
|
||||
existing.graph_score = max(existing.graph_score, inst.graph_score)
|
||||
else:
|
||||
# New from graph - track for inheritance
|
||||
results_by_ghcid[inst.ghcid] = inst
|
||||
for seed_ghcid in inst.related_institutions:
|
||||
if seed_ghcid in inheritance_map:
|
||||
inheritance_map[seed_ghcid].append(
|
||||
(inst.ghcid, inst.graph_score, inst.expansion_reason)
|
||||
)
|
||||
|
||||
# 4. Apply inheritance
|
||||
INHERITANCE_FACTOR = 0.5
|
||||
for vector_ghcid, related_list in inheritance_map.items():
|
||||
if related_list:
|
||||
inst = results_by_ghcid[vector_ghcid]
|
||||
related_scores = [score for _, score, _ in related_list]
|
||||
inherited_score = (sum(related_scores) / len(related_scores)) * INHERITANCE_FACTOR
|
||||
inst.graph_score = max(inst.graph_score, inherited_score)
|
||||
|
||||
# 5. Calculate combined scores
|
||||
for inst in results_by_ghcid.values():
|
||||
inst.combined_score = (
|
||||
self.vector_weight * inst.vector_score +
|
||||
self.graph_weight * inst.graph_score
|
||||
)
|
||||
|
||||
return sorted(results_by_ghcid.values(), key=lambda x: x.combined_score, reverse=True)[:k]
|
||||
```
|
||||
|
||||
### Graph Expansion Scores
|
||||
|
||||
The `_expand_via_graph()` method assigns these base scores:
|
||||
|
||||
| Expansion Type | Graph Score | SPARQL Pattern |
|
||||
|----------------|-------------|----------------|
|
||||
| Same city | 0.8 | `?s schema:location ?loc . ?loc hc:cityCode ?cityCode` |
|
||||
| Same institution type | 0.5 | `?s hc:institutionType ?type` |
|
||||
|
||||
## Results
|
||||
|
||||
### Before (Graph Score = 0.0)
|
||||
|
||||
```
|
||||
Query: "Welke musea zijn er in Utrecht?"
|
||||
|
||||
1. Centraal Museum | V:0.589 G:0.000 C:0.412
|
||||
2. Museum Speelklok | V:0.591 G:0.000 C:0.414
|
||||
3. Universiteitsmuseum Utrecht | V:0.641 G:0.000 C:0.449
|
||||
```
|
||||
|
||||
### After (Graph Score Inherited)
|
||||
|
||||
```
|
||||
Query: "Welke musea zijn er in Utrecht?"
|
||||
|
||||
1. Universiteitsmuseum Utrecht | V:0.641 G:0.400 C:0.569
|
||||
2. Museum Speelklok | V:0.591 G:0.400 C:0.534
|
||||
3. Centraal Museum | V:0.589 G:0.400 C:0.532
|
||||
```
|
||||
|
||||
**Key improvements:**
|
||||
- Graph scores now **0.400** (inherited from same-city museums)
|
||||
- Combined scores **increased by ~25%** (0.412 → 0.532)
|
||||
- Ranking now considers **geographic relevance**
|
||||
|
||||
### More Examples
|
||||
|
||||
```
|
||||
Query: "Bibliotheken in Den Haag"
|
||||
|
||||
1. Centrale Bibliotheek | V:0.697 G:0.400 C:0.608
|
||||
2. Koninklijke Bibliotheek | V:0.676 G:0.400 C:0.593
|
||||
3. Huis van het Boek | V:0.630 G:0.400 C:0.561
|
||||
4. Bibliotheek Hoeksche Waard | V:0.613 G:0.400 C:0.549
|
||||
5. Centrale Bibliotheek (other) | V:0.623 G:0.000 C:0.436 <- No inheritance (different city)
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Weights (in `HybridRetriever.__init__`)
|
||||
|
||||
```python
|
||||
self.vector_weight = 0.7 # Semantic similarity importance
|
||||
self.graph_weight = 0.3 # Knowledge graph importance
|
||||
```
|
||||
|
||||
### Inheritance Factor
|
||||
|
||||
```python
|
||||
INHERITANCE_FACTOR = 0.5 # In _combine_and_rank()
|
||||
```
|
||||
|
||||
**Tuning considerations:**
|
||||
- Higher factor (0.6-0.8): Stronger influence from graph relationships
|
||||
- Lower factor (0.3-0.4): More conservative, vector similarity dominates
|
||||
- Current value (0.5): Balanced approach
|
||||
|
||||
## Logging
|
||||
|
||||
The implementation includes detailed logging for debugging:
|
||||
|
||||
```python
|
||||
# INFO level (always visible)
|
||||
logger.info(f"Graph inheritance applied to {len(inheritance_boosts)} vector results: {ghcids}...")
|
||||
|
||||
# DEBUG level (when LOG_LEVEL=DEBUG)
|
||||
logger.debug(f"Inheritance: {ghcid} graph_score: {old:.3f} -> {new:.3f} (from {n} related)")
|
||||
```
|
||||
|
||||
**Check logs on production:**
|
||||
```bash
|
||||
ssh root@91.98.224.44 "journalctl -u glam-rag-api --since '5 minutes ago' | grep -i inheritance"
|
||||
```
|
||||
|
||||
## API Response Structure
|
||||
|
||||
The graph score is exposed in the API response:
|
||||
|
||||
```json
|
||||
{
|
||||
"retrieved_results": [
|
||||
{
|
||||
"ghcid": "NL-UT-UTR-M-CM",
|
||||
"name": "Centraal Museum",
|
||||
"scores": {
|
||||
"vector": 0.589,
|
||||
"graph": 0.400, // <-- Now populated via inheritance
|
||||
"combined": 0.532
|
||||
},
|
||||
"related_institutions": ["NL-UT-UTR-M-MS", "NL-UT-UTR-M-UMUU"]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Deployment
|
||||
|
||||
**File to deploy:**
|
||||
```bash
|
||||
scp /Users/kempersc/apps/glam/src/glam_extractor/api/hybrid_retriever.py \
|
||||
root@91.98.224.44:/opt/glam-backend/rag/glam_extractor/api/
|
||||
```
|
||||
|
||||
**Restart service:**
|
||||
```bash
|
||||
ssh root@91.98.224.44 "systemctl restart glam-rag-api"
|
||||
```
|
||||
|
||||
**Verify:**
|
||||
```bash
|
||||
curl -s -X POST 'https://archief.support/api/rag/dspy/query' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"question": "Musea in Rotterdam", "language": "nl"}' | \
|
||||
python3 -c "import sys,json; r=json.load(sys.stdin)['retrieved_results']; print('\n'.join(f\"{x['name'][:30]:30} G:{x['scores']['graph']:.2f}\" for x in r[:5]))"
|
||||
```
|
||||
|
||||
## Related Files
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `hybrid_retriever.py` | Main implementation with `_combine_and_rank()` |
|
||||
| `dspy_heritage_rag.py` | RAG pipeline that calls `retriever.search()` |
|
||||
| `main.py` | FastAPI endpoints serving the RAG API |
|
||||
|
||||
## Future Improvements
|
||||
|
||||
1. **Dynamic inheritance factor**: Adjust based on query type (geographic vs. thematic)
|
||||
2. **Multi-hop expansion**: Inherit from institutions 2+ hops away
|
||||
3. **Weighted inheritance**: Weight by relationship type (same_city=0.8, same_type=0.5)
|
||||
4. **Negative inheritance**: Penalize results unrelated to graph findings
|
||||
|
||||
---
|
||||
|
||||
**Last Updated:** 2025-12-24
|
||||
**Implemented:** 2025-12-23
|
||||
**Status:** Production (archief.support)
|
||||
|
|
@ -16,14 +16,14 @@
|
|||
"test:coverage": "vitest run --coverage"
|
||||
},
|
||||
"dependencies": {
|
||||
"@glam/api-client": "workspace:*",
|
||||
"@glam/hooks": "workspace:*",
|
||||
"@glam/theme": "workspace:*",
|
||||
"@glam/ui": "workspace:*",
|
||||
"@codemirror/lang-javascript": "^6.2.4",
|
||||
"@duckdb/duckdb-wasm": "^1.31.0",
|
||||
"@emotion/react": "^11.14.0",
|
||||
"@emotion/styled": "^11.14.1",
|
||||
"@glam/api-client": "workspace:*",
|
||||
"@glam/hooks": "workspace:*",
|
||||
"@glam/theme": "workspace:*",
|
||||
"@glam/ui": "workspace:*",
|
||||
"@mui/icons-material": "^7.3.6",
|
||||
"@mui/material": "^7.3.5",
|
||||
"@tanstack/react-query": "^5.90.10",
|
||||
|
|
@ -45,7 +45,7 @@
|
|||
"fdir": "^6.5.0",
|
||||
"js-yaml": "^4.1.1",
|
||||
"lodash": "^4.17.21",
|
||||
"lucide-react": "^0.561.0",
|
||||
"lucide-react": "^0.562.0",
|
||||
"maplibre-gl": "^5.14.0",
|
||||
"mermaid": "^11.4.0",
|
||||
"n3": "^1.26.0",
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"generated": "2025-12-23T16:58:31.474Z",
|
||||
"generated": "2025-12-25T12:42:29.931Z",
|
||||
"version": "1.0.0",
|
||||
"categories": [
|
||||
{
|
||||
|
|
|
|||
|
|
@ -872,3 +872,106 @@
|
|||
padding: 0.125rem 0.375rem;
|
||||
}
|
||||
}
|
||||
|
||||
/* ==========================================================================
|
||||
Chain-of-Thought Reasoning Display (GLM 4.7 Interleaved Thinking)
|
||||
========================================================================== */
|
||||
|
||||
.conversation-panel__reasoning {
|
||||
margin: 0.75rem 0;
|
||||
border: 1px solid var(--border-color, #e0e0e0);
|
||||
border-radius: 0.5rem;
|
||||
overflow: hidden;
|
||||
background: var(--surface-secondary, #fafafa);
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning-toggle {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
padding: 0.5rem 0.75rem;
|
||||
background: var(--surface-secondary, #f5f5f5);
|
||||
cursor: pointer;
|
||||
font-size: 0.8125rem;
|
||||
color: var(--text-secondary, #757575);
|
||||
border: none;
|
||||
width: 100%;
|
||||
transition: background-color 0.2s;
|
||||
list-style: none; /* Remove default marker */
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning-toggle::-webkit-details-marker {
|
||||
display: none; /* Hide default arrow in WebKit browsers */
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning-toggle::before {
|
||||
content: '▶';
|
||||
font-size: 0.625rem;
|
||||
transition: transform 0.2s;
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning[open] .conversation-panel__reasoning-toggle::before {
|
||||
transform: rotate(90deg);
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning-toggle:hover {
|
||||
background: var(--surface-tertiary, #eeeeee);
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning-toggle svg {
|
||||
color: var(--primary-color, #1976d2);
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning-content {
|
||||
padding: 0.75rem;
|
||||
background: var(--surface-code, #1e1e1e);
|
||||
max-height: 300px;
|
||||
overflow-y: auto;
|
||||
border-top: 1px solid var(--border-color, #e0e0e0);
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning-content pre {
|
||||
margin: 0;
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
font-size: 0.75rem;
|
||||
line-height: 1.5;
|
||||
color: var(--text-code, #d4d4d4);
|
||||
font-family: 'Fira Code', 'Monaco', 'Consolas', monospace;
|
||||
}
|
||||
|
||||
/* Scrollbar styling for reasoning content */
|
||||
.conversation-panel__reasoning-content::-webkit-scrollbar {
|
||||
width: 6px;
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning-content::-webkit-scrollbar-track {
|
||||
background: #2d2d2d;
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning-content::-webkit-scrollbar-thumb {
|
||||
background: #555;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning-content::-webkit-scrollbar-thumb:hover {
|
||||
background: #777;
|
||||
}
|
||||
|
||||
/* Responsive: Reasoning section */
|
||||
@media (max-width: 768px) {
|
||||
.conversation-panel__reasoning-toggle {
|
||||
font-size: 0.75rem;
|
||||
padding: 0.375rem 0.5rem;
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning-content {
|
||||
max-height: 200px;
|
||||
padding: 0.5rem;
|
||||
}
|
||||
|
||||
.conversation-panel__reasoning-content pre {
|
||||
font-size: 0.6875rem;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -164,6 +164,9 @@ const TEXT = {
|
|||
sourcesUsed: { nl: 'Bronnen gebruikt', en: 'Sources used' },
|
||||
llmProvider: { nl: 'Model', en: 'Model' },
|
||||
answer: { nl: 'Antwoord', en: 'Answer' },
|
||||
showReasoning: { nl: 'Toon redenering', en: 'Show reasoning' },
|
||||
hideReasoning: { nl: 'Verberg redenering', en: 'Hide reasoning' },
|
||||
reasoningTitle: { nl: 'Denkproces', en: 'Thinking Process' },
|
||||
};
|
||||
|
||||
// Example questions to help users get started - shorter list
|
||||
|
|
@ -180,6 +183,21 @@ const EXAMPLE_QUESTIONS = {
|
|||
],
|
||||
};
|
||||
|
||||
// LLM Response Metadata - matches backend LLMResponseMetadata model
|
||||
interface LLMResponseMetadata {
|
||||
content?: string;
|
||||
reasoning_content?: string; // GLM 4.7 chain-of-thought reasoning
|
||||
model?: string;
|
||||
provider?: string; // zai, anthropic, openai, groq
|
||||
prompt_tokens?: number;
|
||||
completion_tokens?: number;
|
||||
total_tokens?: number;
|
||||
thinking_mode?: string; // enabled, disabled, interleaved
|
||||
latency_ms?: number;
|
||||
cached?: boolean;
|
||||
finish_reason?: string;
|
||||
}
|
||||
|
||||
interface Message {
|
||||
id: string;
|
||||
role: 'user' | 'assistant';
|
||||
|
|
@ -192,6 +210,7 @@ interface Message {
|
|||
error?: string;
|
||||
errorCode?: string;
|
||||
llmProviderUsed?: string; // Which LLM provider generated this response
|
||||
llmResponse?: LLMResponseMetadata; // Full LLM response metadata including chain-of-thought
|
||||
}
|
||||
|
||||
interface HistoryItem {
|
||||
|
|
@ -351,6 +370,7 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
|
|||
answer: string;
|
||||
sourcesUsed: string[];
|
||||
llmProviderUsed?: string;
|
||||
llmResponse?: LLMResponseMetadata; // Full LLM response with reasoning_content
|
||||
}> => {
|
||||
// Determine API endpoint based on environment
|
||||
const hostname = window.location.hostname;
|
||||
|
|
@ -395,6 +415,7 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
|
|||
answer: data.answer || data.explanation || '',
|
||||
sourcesUsed: data.sources_used || selectedSources,
|
||||
llmProviderUsed: data.llm_provider_used,
|
||||
llmResponse: data.llm_response, // Pass through chain-of-thought metadata
|
||||
};
|
||||
};
|
||||
|
||||
|
|
@ -445,6 +466,7 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
|
|||
sparql: result.sparql,
|
||||
sourcesUsed: result.sourcesUsed,
|
||||
llmProviderUsed: result.llmProviderUsed,
|
||||
llmResponse: result.llmResponse,
|
||||
isLoading: false,
|
||||
}
|
||||
: msg
|
||||
|
|
@ -928,6 +950,19 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
|
|||
<>
|
||||
<p>{message.content}</p>
|
||||
|
||||
{/* Chain-of-Thought Reasoning (GLM 4.7 Interleaved Thinking) */}
|
||||
{message.llmResponse?.reasoning_content && (
|
||||
<details className="conversation-panel__reasoning">
|
||||
<summary className="conversation-panel__reasoning-toggle">
|
||||
<Sparkles size={14} />
|
||||
<span>{t('showReasoning')}</span>
|
||||
</summary>
|
||||
<div className="conversation-panel__reasoning-content">
|
||||
<pre>{message.llmResponse.reasoning_content}</pre>
|
||||
</div>
|
||||
</details>
|
||||
)}
|
||||
|
||||
{/* Sources Used Badges */}
|
||||
{message.sourcesUsed && message.sourcesUsed.length > 0 && (
|
||||
<div className="conversation-panel__sources-used">
|
||||
|
|
|
|||
|
|
@ -196,9 +196,9 @@ function createGLAMPolyhedronGeometry(radius: number = 1): THREE.BufferGeometry
|
|||
* Creates text sprites positioned at the center of each icosahedron face.
|
||||
*
|
||||
* Label visibility behavior:
|
||||
* - Relevant types: Always visible (opacity 1) - these are the types passed in highlightTypes
|
||||
* - Non-relevant types: Only visible when expanded (opacity 0 when collapsed)
|
||||
* - If highlightTypes is empty (universal), all labels are shown when expanded only
|
||||
* - Empty array (no annotation): NO labels shown at all (blank cube)
|
||||
* - 19+ types (universal): All labels shown when expanded only
|
||||
* - Specific types (1-18): Only those letters shown (always visible)
|
||||
*/
|
||||
function createFaceLabels(
|
||||
geometry: THREE.BufferGeometry,
|
||||
|
|
@ -209,14 +209,16 @@ function createFaceLabels(
|
|||
const positions = geometry.getAttribute('position');
|
||||
const faceCount = positions.count / 3;
|
||||
const highlightSet = new Set(highlightTypes);
|
||||
const isUniversal = highlightTypes.length === 0 || highlightTypes.length >= 19;
|
||||
const hasNoAnnotation = highlightTypes.length === 0;
|
||||
const isUniversal = highlightTypes.length >= 19;
|
||||
|
||||
for (let faceIndex = 0; faceIndex < Math.min(faceCount, 20); faceIndex++) {
|
||||
const typeIndex = faceIndex % 19;
|
||||
const typeCode = CUSTODIAN_TYPE_CODES[typeIndex];
|
||||
|
||||
// Determine if this type is relevant (highlighted)
|
||||
const isRelevant = highlightTypes.length === 0 || highlightSet.has(typeCode);
|
||||
// Empty array = no annotation = nothing is relevant
|
||||
const isRelevant = !hasNoAnnotation && (isUniversal || highlightSet.has(typeCode));
|
||||
|
||||
// Calculate face center (average of 3 vertices)
|
||||
const v0 = new THREE.Vector3(
|
||||
|
|
@ -559,7 +561,7 @@ export const CustodianTypeIndicator3D: React.FC<CustodianTypeIndicator3DProps> =
|
|||
|
||||
// Tooltip text
|
||||
const tooltipText = useMemo(() => {
|
||||
if (types.length === 0) return 'Heritage Custodian Types (GLAMORCUBESFIXPHDNT)';
|
||||
if (types.length === 0) return 'No custodian types';
|
||||
return types
|
||||
.map(code => getCustodianTypeByCode(code)?.label[language] ?? code)
|
||||
.join(', ');
|
||||
|
|
@ -667,23 +669,27 @@ export const CustodianTypeIndicator3D: React.FC<CustodianTypeIndicator3DProps> =
|
|||
|
||||
// Update label visibility based on expanded state and highlighted types
|
||||
// Label visibility rules:
|
||||
// - Non-universal elements (1-18 types): Show relevant letters only (both collapsed and expanded)
|
||||
// - Universal elements (19 types or empty): Show all letters only when expanded
|
||||
// - No annotation (empty array, length 0): Show NO letters (blank cube)
|
||||
// - Universal annotation (19+ types): Show all letters only when expanded
|
||||
// - Specific types (1-18 types): Show ONLY those letters (both collapsed and expanded)
|
||||
if (labelsGroupRef.current) {
|
||||
const highlightSet = new Set(types);
|
||||
const isUniversal = types.length === 0 || types.length >= 19;
|
||||
const hasNoAnnotation = types.length === 0;
|
||||
const isUniversal = types.length >= 19;
|
||||
|
||||
labelsGroupRef.current.children.forEach((child) => {
|
||||
if (child instanceof THREE.Sprite && child.userData.typeCode) {
|
||||
const typeCode = child.userData.typeCode as CustodianTypeCode;
|
||||
const isRelevant = types.length === 0 || highlightSet.has(typeCode);
|
||||
const isRelevant = highlightSet.has(typeCode);
|
||||
|
||||
if (isUniversal) {
|
||||
// Universal elements: Show all letters only when expanded
|
||||
if (hasNoAnnotation) {
|
||||
// No annotation: Show NO letters at all (blank cube)
|
||||
child.material.opacity = 0;
|
||||
} else if (isUniversal) {
|
||||
// Universal annotation (19+ types): Show all letters only when expanded
|
||||
child.material.opacity = isExpanded ? 1 : 0;
|
||||
} else {
|
||||
// Non-universal elements: Show ONLY relevant letters (hidden otherwise)
|
||||
// When expanded, relevant letters get full opacity
|
||||
// Specific types (1-18): Show ONLY relevant letters (hidden otherwise)
|
||||
if (isRelevant) {
|
||||
child.material.opacity = 1; // Relevant letters always visible
|
||||
} else {
|
||||
|
|
@ -1172,7 +1178,7 @@ export const CustodianTypeIndicator3DFallback: React.FC<CustodianTypeIndicator3D
|
|||
const color = config?.color ?? '#888888';
|
||||
|
||||
const tooltipText = useMemo(() => {
|
||||
if (types.length === 0) return 'Heritage Custodian Types';
|
||||
if (types.length === 0) return 'No custodian types';
|
||||
return types
|
||||
.map(code => getCustodianTypeByCode(code)?.label[language] ?? code)
|
||||
.join(', ');
|
||||
|
|
|
|||
|
|
@ -1071,6 +1071,24 @@ class LinkMLSchemaService {
|
|||
return this.parseCustodianTypesAnnotation(slot.annotations.custodian_types);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all classes that use a given slot
|
||||
* Returns array of class names that have this slot in their slots array
|
||||
*/
|
||||
async getClassesUsingSlot(slotName: string): Promise<string[]> {
|
||||
await this.initialize();
|
||||
const classes: string[] = [];
|
||||
|
||||
for (const [className, schema] of this.classSchemas.entries()) {
|
||||
const classDef = schema.classes?.[className];
|
||||
if (classDef?.slots?.includes(slotName)) {
|
||||
classes.push(className);
|
||||
}
|
||||
}
|
||||
|
||||
return classes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get custodian_types annotation from an enum definition
|
||||
* Returns null if annotation not found
|
||||
|
|
|
|||
|
|
@ -260,7 +260,11 @@ function validateCustodianTypes(types: string[]): CustodianTypeCode[] {
|
|||
* Priority:
|
||||
* 1. Read from LinkML schema annotations (custodian_types)
|
||||
* 2. Fall back to static mapping
|
||||
* 3. Default to all types (universal)
|
||||
* 3. Default to EMPTY ARRAY (no types assigned) - cube will show no letters
|
||||
*
|
||||
* NOTE: We return [] instead of DEFAULT_CUSTODIAN_TYPES when no annotation exists
|
||||
* because classes without explicit custodian_types annotations should NOT display
|
||||
* all 19 letters on the cube. Only classes with explicit annotations should show letters.
|
||||
*/
|
||||
export async function getCustodianTypesForClassAsync(className: string): Promise<CustodianTypeCode[]> {
|
||||
try {
|
||||
|
|
@ -276,15 +280,23 @@ export async function getCustodianTypesForClassAsync(className: string): Promise
|
|||
console.warn(`[CustodianMapping] Error reading annotations for class ${className}:`, error);
|
||||
}
|
||||
|
||||
// Fall back to static mapping
|
||||
return CLASS_TO_CUSTODIAN_TYPE[className] || DEFAULT_CUSTODIAN_TYPES;
|
||||
// Fall back to static mapping, or empty array if no mapping exists
|
||||
// Empty array means "no custodian types assigned" - cube will show no letters
|
||||
return CLASS_TO_CUSTODIAN_TYPE[className] || [];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get custodian types for a schema slot (async version)
|
||||
*
|
||||
* Priority:
|
||||
* 1. Read from slot's own LinkML schema annotations (custodian_types)
|
||||
* 2. Inherit from parent class(es) that use this slot
|
||||
* 3. Fall back to static mapping
|
||||
* 4. Return empty array (no types assigned - cube shows no letters)
|
||||
*/
|
||||
export async function getCustodianTypesForSlotAsync(slotName: string): Promise<CustodianTypeCode[]> {
|
||||
try {
|
||||
// 1. Try slot's own annotation first
|
||||
const annotationTypes = await linkmlSchemaService.getSlotCustodianTypes(slotName);
|
||||
if (annotationTypes && annotationTypes.length > 0) {
|
||||
const validated = validateCustodianTypes(annotationTypes);
|
||||
|
|
@ -292,15 +304,38 @@ export async function getCustodianTypesForSlotAsync(slotName: string): Promise<C
|
|||
return validated;
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Try to inherit from parent class(es) that use this slot
|
||||
const parentClasses = await linkmlSchemaService.getClassesUsingSlot(slotName);
|
||||
if (parentClasses.length > 0) {
|
||||
const inheritedTypes = new Set<CustodianTypeCode>();
|
||||
for (const className of parentClasses) {
|
||||
const classTypes = await linkmlSchemaService.getClassCustodianTypes(className);
|
||||
if (classTypes && classTypes.length > 0) {
|
||||
const validated = validateCustodianTypes(classTypes);
|
||||
validated.forEach(t => inheritedTypes.add(t));
|
||||
}
|
||||
}
|
||||
if (inheritedTypes.size > 0) {
|
||||
return Array.from(inheritedTypes);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn(`[CustodianMapping] Error reading annotations for slot ${slotName}:`, error);
|
||||
}
|
||||
|
||||
return SLOT_TO_CUSTODIAN_TYPE[slotName] || DEFAULT_CUSTODIAN_TYPES;
|
||||
// 3. Fall back to static mapping, or empty array if no mapping exists
|
||||
// Empty array means "no custodian types assigned" - cube will show no letters
|
||||
return SLOT_TO_CUSTODIAN_TYPE[slotName] || [];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get custodian types for a schema enum (async version)
|
||||
*
|
||||
* Priority:
|
||||
* 1. Read from enum's LinkML schema annotations (custodian_types)
|
||||
* 2. Fall back to static mapping
|
||||
* 3. Return empty array (no types assigned - cube shows no letters)
|
||||
*/
|
||||
export async function getCustodianTypesForEnumAsync(enumName: string): Promise<CustodianTypeCode[]> {
|
||||
try {
|
||||
|
|
@ -315,5 +350,7 @@ export async function getCustodianTypesForEnumAsync(enumName: string): Promise<C
|
|||
console.warn(`[CustodianMapping] Error reading annotations for enum ${enumName}:`, error);
|
||||
}
|
||||
|
||||
return ENUM_TO_CUSTODIAN_TYPE[enumName] || DEFAULT_CUSTODIAN_TYPES;
|
||||
// Fall back to static mapping, or empty array if no mapping exists
|
||||
// Empty array means "no custodian types assigned" - cube will show no letters
|
||||
return ENUM_TO_CUSTODIAN_TYPE[enumName] || [];
|
||||
}
|
||||
|
|
|
|||
|
|
@ -732,7 +732,7 @@ const LinkMLViewerPage: React.FC = () => {
|
|||
const mainContentRef = useRef<HTMLElement>(null);
|
||||
|
||||
// Schema loading progress tracking
|
||||
const { progress: schemaProgress, isLoading: isSchemaServiceLoading } = useSchemaLoadingProgress();
|
||||
const { progress: schemaProgress, isLoading: isSchemaServiceLoading, isComplete: isSchemaServiceComplete } = useSchemaLoadingProgress();
|
||||
|
||||
// Handler for filtering by custodian type (clicking polyhedron face or legend item)
|
||||
// Multi-select toggle behavior: clicking type adds/removes from set
|
||||
|
|
@ -881,17 +881,32 @@ const LinkMLViewerPage: React.FC = () => {
|
|||
|
||||
// Load custodian types from schema annotations when schema changes
|
||||
// This pre-loads types asynchronously so they're available for rendering
|
||||
// IMPORTANT: Wait for schema service to complete loading before fetching custodian types
|
||||
// to avoid race condition where annotations aren't available yet
|
||||
useEffect(() => {
|
||||
if (!schema) {
|
||||
setCustodianTypesLoaded(false);
|
||||
return;
|
||||
}
|
||||
|
||||
// Don't load custodian types until schema service has finished loading all class files
|
||||
// This prevents the race condition where we try to read annotations before they're loaded
|
||||
if (!isSchemaServiceComplete) {
|
||||
console.log('[LinkMLViewerPage] Waiting for schema service to complete before loading custodian types...');
|
||||
return;
|
||||
}
|
||||
|
||||
const loadCustodianTypes = async () => {
|
||||
const classes = extractClasses(schema);
|
||||
const slots = extractSlots(schema);
|
||||
const enums = extractEnums(schema);
|
||||
|
||||
console.log('[LinkMLViewerPage] Schema service complete, loading custodian types for', {
|
||||
classes: classes.length,
|
||||
slots: slots.length,
|
||||
enums: enums.length
|
||||
});
|
||||
|
||||
// Load types for all classes in parallel
|
||||
const classTypesPromises = classes.map(async (cls) => {
|
||||
const types = await getCustodianTypesForClassAsync(cls.name);
|
||||
|
|
@ -951,7 +966,7 @@ const LinkMLViewerPage: React.FC = () => {
|
|||
};
|
||||
|
||||
loadCustodianTypes();
|
||||
}, [schema]);
|
||||
}, [schema, isSchemaServiceComplete]);
|
||||
|
||||
const toggleSection = (section: string) => {
|
||||
setExpandedSections(prev => {
|
||||
|
|
|
|||
2
node_modules/.modules.yaml
generated
vendored
2
node_modules/.modules.yaml
generated
vendored
|
|
@ -987,7 +987,7 @@ hoistedDependencies:
|
|||
loose-envify: private
|
||||
lru-cache@11.2.4:
|
||||
lru-cache: private
|
||||
lucide-react@0.561.0(react@19.2.3):
|
||||
lucide-react@0.562.0(react@19.2.3):
|
||||
lucide-react: private
|
||||
lz-string@1.5.0:
|
||||
lz-string: private
|
||||
|
|
|
|||
10
node_modules/.pnpm/lock.yaml
generated
vendored
10
node_modules/.pnpm/lock.yaml
generated
vendored
|
|
@ -169,8 +169,8 @@ importers:
|
|||
specifier: ^4.17.21
|
||||
version: 4.17.21
|
||||
lucide-react:
|
||||
specifier: ^0.561.0
|
||||
version: 0.561.0(react@19.2.3)
|
||||
specifier: ^0.562.0
|
||||
version: 0.562.0(react@19.2.3)
|
||||
maplibre-gl:
|
||||
specifier: ^5.14.0
|
||||
version: 5.15.0
|
||||
|
|
@ -2507,8 +2507,8 @@ packages:
|
|||
lru-cache@5.1.1:
|
||||
resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==}
|
||||
|
||||
lucide-react@0.561.0:
|
||||
resolution: {integrity: sha512-Y59gMY38tl4/i0qewcqohPdEbieBy7SovpBL9IFebhc2mDd8x4PZSOsiFRkpPcOq6bj1r/mjH/Rk73gSlIJP2A==}
|
||||
lucide-react@0.562.0:
|
||||
resolution: {integrity: sha512-82hOAu7y0dbVuFfmO4bYF1XEwYk/mEbM5E+b1jgci/udUBEE/R7LF5Ip0CCEmXe8AybRM8L+04eP+LGZeDvkiw==}
|
||||
peerDependencies:
|
||||
react: ^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0
|
||||
|
||||
|
|
@ -5660,7 +5660,7 @@ snapshots:
|
|||
dependencies:
|
||||
yallist: 3.1.1
|
||||
|
||||
lucide-react@0.561.0(react@19.2.3):
|
||||
lucide-react@0.562.0(react@19.2.3):
|
||||
dependencies:
|
||||
react: 19.2.3
|
||||
|
||||
|
|
|
|||
39
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/LICENSE
generated
vendored
Normal file
39
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/LICENSE
generated
vendored
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
ISC License
|
||||
|
||||
Copyright (c) for portions of Lucide are held by Cole Bemis 2013-2023 as part of Feather (MIT). All other copyright (c) for Lucide are held by Lucide Contributors 2025.
|
||||
|
||||
Permission to use, copy, modify, and/or distribute this software for any
|
||||
purpose with or without fee is hereby granted, provided that the above
|
||||
copyright notice and this permission notice appear in all copies.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
---
|
||||
|
||||
The MIT License (MIT) (for portions derived from Feather)
|
||||
|
||||
Copyright (c) 2013-2023 Cole Bemis
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
73
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/README.md
generated
vendored
Normal file
73
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/README.md
generated
vendored
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
<p align="center">
|
||||
<a href="https://github.com/lucide-icons/lucide">
|
||||
<img src="https://lucide.dev/package-logos/lucide-react.svg" alt="Lucide icon library for React applications." width="540">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
Lucide icon library for React applications.
|
||||
</p>
|
||||
|
||||
<div align="center">
|
||||
|
||||
[](https://www.npmjs.com/package/lucide-react)
|
||||

|
||||
[](https://lucide.dev/license)
|
||||
</div>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://lucide.dev/guide/">About</a>
|
||||
·
|
||||
<a href="https://lucide.dev/icons/">Icons</a>
|
||||
·
|
||||
<a href="https://lucide.dev/guide/packages/lucide-react">Documentation</a>
|
||||
·
|
||||
<a href="https://lucide.dev/license">License</a>
|
||||
</p>
|
||||
|
||||
# Lucide React
|
||||
|
||||
Implementation of the lucide icon library for React applications.
|
||||
|
||||
## Installation
|
||||
|
||||
```sh
|
||||
pnpm add lucide-react
|
||||
```
|
||||
|
||||
```sh
|
||||
npm install lucide-react
|
||||
```
|
||||
|
||||
```sh
|
||||
yarn add lucide-react
|
||||
```
|
||||
|
||||
```sh
|
||||
bun add lucide-react
|
||||
```
|
||||
|
||||
## Documentation
|
||||
|
||||
For full documentation, visit [lucide.dev](https://lucide.dev/guide/packages/lucide-react)
|
||||
|
||||
## Community
|
||||
|
||||
Join the [Discord server](https://discord.gg/EH6nSts) to chat with the maintainers and other users.
|
||||
|
||||
## License
|
||||
|
||||
Lucide is licensed under the ISC license. See [LICENSE](https://lucide.dev/license).
|
||||
|
||||
## Sponsors
|
||||
|
||||
<a href="https://vercel.com?utm_source=lucide&utm_campaign=oss">
|
||||
<img src="https://lucide.dev/vercel.svg" alt="Powered by Vercel" width="200" />
|
||||
</a>
|
||||
|
||||
<a href="https://www.digitalocean.com/?refcode=b0877a2caebd&utm_campaign=Referral_Invite&utm_medium=Referral_Program&utm_source=badge"><img src="https://lucide.dev/digitalocean.svg" width="200" alt="DigitalOcean Referral Badge" /></a>
|
||||
|
||||
### Awesome backers 🍺
|
||||
|
||||
<a href="https://www.scipress.io?utm_source=lucide"><img src="https://lucide.dev/sponsors/scipress.svg" width="180" alt="Scipress sponsor badge" /></a>
|
||||
<a href="https://github.com/pdfme/pdfme"><img src="https://lucide.dev/sponsors/pdfme.svg" width="180" alt="pdfme sponsor badge" /></a>
|
||||
10
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamic.mjs
generated
vendored
Normal file
10
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamic.mjs
generated
vendored
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
/**
|
||||
* @license lucide-react v0.562.0 - ISC
|
||||
*
|
||||
* This source code is licensed under the ISC license.
|
||||
* See the LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
export { default as DynamicIcon, iconNames } from './dist/esm/DynamicIcon.js';
|
||||
export { default as dynamicIconImports } from './dist/esm/dynamicIconImports.js';
|
||||
//# sourceMappingURL=dynamic.mjs.map
|
||||
1
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamicIconImports.mjs
generated
vendored
Normal file
1
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamicIconImports.mjs
generated
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
export { default } from './dist/esm/dynamicIconImports.js';
|
||||
74
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/package.json
generated
vendored
Normal file
74
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/package.json
generated
vendored
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
{
|
||||
"name": "lucide-react",
|
||||
"description": "A Lucide icon library package for React applications.",
|
||||
"version": "0.562.0",
|
||||
"license": "ISC",
|
||||
"homepage": "https://lucide.dev",
|
||||
"bugs": "https://github.com/lucide-icons/lucide/issues",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/lucide-icons/lucide.git",
|
||||
"directory": "packages/lucide-react"
|
||||
},
|
||||
"keywords": [
|
||||
"Lucide",
|
||||
"React",
|
||||
"Feather",
|
||||
"Icons",
|
||||
"Icon",
|
||||
"SVG",
|
||||
"Feather Icons",
|
||||
"Fontawesome",
|
||||
"Font Awesome"
|
||||
],
|
||||
"author": "Eric Fennis",
|
||||
"amdName": "lucide-react",
|
||||
"main": "dist/cjs/lucide-react.js",
|
||||
"main:umd": "dist/umd/lucide-react.js",
|
||||
"module": "dist/esm/lucide-react.js",
|
||||
"unpkg": "dist/umd/lucide-react.min.js",
|
||||
"typings": "dist/lucide-react.d.ts",
|
||||
"sideEffects": false,
|
||||
"files": [
|
||||
"dist",
|
||||
"dynamic.mjs",
|
||||
"dynamic.js.map",
|
||||
"dynamic.d.ts",
|
||||
"dynamicIconImports.mjs",
|
||||
"dynamicIconImports.js.map",
|
||||
"dynamicIconImports.d.ts"
|
||||
],
|
||||
"devDependencies": {
|
||||
"@testing-library/jest-dom": "^6.1.6",
|
||||
"@testing-library/react": "^14.1.2",
|
||||
"@types/react": "^18.2.37",
|
||||
"@vitejs/plugin-react": "^4.4.1",
|
||||
"jest-serializer-html": "^7.1.0",
|
||||
"react": "18.2.0",
|
||||
"react-dom": "18.2.0",
|
||||
"rollup": "^4.53.3",
|
||||
"rollup-plugin-dts": "^6.2.3",
|
||||
"rollup-plugin-preserve-directives": "^0.4.0",
|
||||
"typescript": "^5.8.3",
|
||||
"vite": "^7.2.4",
|
||||
"vitest": "^4.0.12",
|
||||
"@lucide/shared": "1.0.0",
|
||||
"@lucide/rollup-plugins": "1.0.0",
|
||||
"@lucide/build-icons": "1.1.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "pnpm clean && pnpm copy:license && pnpm build:icons && pnpm typecheck && pnpm build:bundles",
|
||||
"copy:license": "cp ../../LICENSE ./LICENSE",
|
||||
"clean": "rm -rf dist && rm -rf stats && rm -rf ./src/icons/*.ts && rm -f dynamic.* && rm -f dynamicIconImports.d.ts",
|
||||
"build:icons": "build-icons --output=./src --templateSrc=./scripts/exportTemplate.mts --renderUniqueKey --withAliases --withDynamicImports --separateAliasesFile --separateAliasesFileIgnore=fingerprint --aliasesFileExtension=.ts --iconFileExtension=.ts --exportFileName=index.ts",
|
||||
"build:bundles": "rollup -c ./rollup.config.mjs",
|
||||
"typecheck": "tsc",
|
||||
"typecheck:watch": "tsc -w",
|
||||
"test": "pnpm build:icons && vitest run",
|
||||
"test:watch": "vitest watch",
|
||||
"version": "pnpm version --git-tag-version=false"
|
||||
}
|
||||
}
|
||||
1
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/react
generated
vendored
Symbolic link
1
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/react
generated
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
|||
../../react@19.2.3/node_modules/react
|
||||
2
node_modules/.pnpm/node_modules/lucide-react
generated
vendored
2
node_modules/.pnpm/node_modules/lucide-react
generated
vendored
|
|
@ -1 +1 @@
|
|||
../lucide-react@0.561.0_react@19.2.3/node_modules/lucide-react
|
||||
../lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react
|
||||
|
|
@ -169,8 +169,8 @@ importers:
|
|||
specifier: ^4.17.21
|
||||
version: 4.17.21
|
||||
lucide-react:
|
||||
specifier: ^0.561.0
|
||||
version: 0.561.0(react@19.2.3)
|
||||
specifier: ^0.562.0
|
||||
version: 0.562.0(react@19.2.3)
|
||||
maplibre-gl:
|
||||
specifier: ^5.14.0
|
||||
version: 5.15.0
|
||||
|
|
@ -2507,8 +2507,8 @@ packages:
|
|||
lru-cache@5.1.1:
|
||||
resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==}
|
||||
|
||||
lucide-react@0.561.0:
|
||||
resolution: {integrity: sha512-Y59gMY38tl4/i0qewcqohPdEbieBy7SovpBL9IFebhc2mDd8x4PZSOsiFRkpPcOq6bj1r/mjH/Rk73gSlIJP2A==}
|
||||
lucide-react@0.562.0:
|
||||
resolution: {integrity: sha512-82hOAu7y0dbVuFfmO4bYF1XEwYk/mEbM5E+b1jgci/udUBEE/R7LF5Ip0CCEmXe8AybRM8L+04eP+LGZeDvkiw==}
|
||||
peerDependencies:
|
||||
react: ^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0
|
||||
|
||||
|
|
@ -5660,7 +5660,7 @@ snapshots:
|
|||
dependencies:
|
||||
yallist: 3.1.1
|
||||
|
||||
lucide-react@0.561.0(react@19.2.3):
|
||||
lucide-react@0.562.0(react@19.2.3):
|
||||
dependencies:
|
||||
react: 19.2.3
|
||||
|
||||
|
|
|
|||
561
scripts/discover_custodian_websites.py
Normal file
561
scripts/discover_custodian_websites.py
Normal file
|
|
@ -0,0 +1,561 @@
|
|||
#!/usr/bin/env python3
"""
Discover website URLs for custodian YAML files that are missing them.

This script uses web search (via DuckDuckGo or Google) to find official websites
for heritage institutions based on their name and location.

Search strategy:
1. Search for institution name + city + country
2. Search for institution name + "official website"
3. Search for institution name + institution type (museum, library, archive)

Output:
- Updates custodian YAML files with discovered website URLs
- Stores provenance for discovered URLs

Usage:
    python scripts/discover_custodian_websites.py [options]

Options:
    --dry-run       Show what would be discovered without modifying files
    --limit N       Process only first N files (for testing)
    --file PATH     Process a single specific file
    --country CODE  Filter by country code (e.g., JP, CZ)
    --resume        Resume from last checkpoint

Requirements:
    pip install duckduckgo-search pyyaml httpx
"""

import argparse
import asyncio
import json
import logging
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

import yaml

# Third-party requirements: fail fast with an install hint when missing.
try:
    from duckduckgo_search import DDGS
except ImportError:
    print("Please install duckduckgo-search: pip install duckduckgo-search")
    sys.exit(1)

try:
    import httpx
except ImportError:
    print("Please install httpx: pip install httpx")
    sys.exit(1)

# Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_checkpoint.json"
REQUEST_DELAY = 3.0  # seconds between searches (be nice to search engines)

# Domain blacklist (not actual institution websites):
# aggregators, social media, commerce and genealogy sites that often
# outrank an institution's own homepage in search results.
DOMAIN_BLACKLIST = {
    'wikipedia.org', 'wikidata.org', 'wikimedia.org',
    'facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com',
    'youtube.com', 'tiktok.com', 'pinterest.com',
    'tripadvisor.com', 'tripadvisor.jp', 'yelp.com',
    'google.com', 'google.co.jp', 'maps.google.com',
    'amazon.com', 'amazon.co.jp', 'ebay.com',
    'booking.com', 'expedia.com', 'hotels.com',
    'foursquare.com', 'bing.com', 'yahoo.com',
    'findagrave.com', 'ancestry.com', 'familysearch.org',
    'academia.edu', 'researchgate.net',
    'timeanddate.com', 'weather.com',
}

# Domain preferences (prefer these TLDs for official sites).
# Per country, listed best-first; score_website() rewards earlier entries.
PREFERRED_TLDS = {
    'JP': ['.go.jp', '.lg.jp', '.ac.jp', '.or.jp', '.jp'],
    'CZ': ['.cz', '.gov.cz'],
    'NL': ['.nl', '.gov.nl'],
    'BE': ['.be', '.gov.be'],
    'DE': ['.de', '.gov.de'],
    'AT': ['.at', '.gv.at'],
    'CH': ['.ch', '.admin.ch'],
}
|
||||
|
||||
|
||||
def get_custodian_name(entry: dict) -> str | None:
|
||||
"""Extract institution name from entry."""
|
||||
# Priority 1: Emic name (native language official name)
|
||||
if entry.get('custodian_name', {}).get('emic_name'):
|
||||
return entry['custodian_name']['emic_name']
|
||||
|
||||
# Priority 2: Wikidata native language label (ja, zh, ko, etc.)
|
||||
wikidata = entry.get('wikidata_enrichment', {})
|
||||
country = get_country_from_entry(entry)
|
||||
|
||||
# Map country to preferred label language
|
||||
country_lang_map = {
|
||||
'JP': 'ja',
|
||||
'CN': 'zh',
|
||||
'KR': 'ko',
|
||||
'TW': 'zh',
|
||||
'TH': 'th',
|
||||
'VN': 'vi',
|
||||
'RU': 'ru',
|
||||
'GR': 'el',
|
||||
'IL': 'he',
|
||||
'SA': 'ar',
|
||||
'IR': 'fa',
|
||||
}
|
||||
|
||||
if country in country_lang_map:
|
||||
lang = country_lang_map[country]
|
||||
native_label = wikidata.get(f'wikidata_label_{lang}') or wikidata.get('wikidata_labels', {}).get(lang)
|
||||
if native_label:
|
||||
return native_label
|
||||
|
||||
# Priority 3: Claim value
|
||||
if entry.get('custodian_name', {}).get('claim_value'):
|
||||
return entry['custodian_name']['claim_value']
|
||||
|
||||
# Priority 4: Original entry name
|
||||
if entry.get('original_entry', {}).get('name'):
|
||||
return entry['original_entry']['name']
|
||||
|
||||
# Priority 5: Organisatie (Dutch)
|
||||
if entry.get('original_entry', {}).get('organisatie'):
|
||||
return entry['original_entry']['organisatie']
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_country_from_entry(entry: dict) -> str | None:
|
||||
"""Extract country code from entry."""
|
||||
# Check location.country
|
||||
if entry.get('location', {}).get('country'):
|
||||
return entry['location']['country']
|
||||
|
||||
# Check original_entry.locations
|
||||
if entry.get('original_entry', {}).get('locations'):
|
||||
loc = entry['original_entry']['locations'][0]
|
||||
if loc.get('country'):
|
||||
return loc['country']
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_location_info(entry: dict) -> dict:
    """Collect city/region/country/street-address details for *entry*.

    Prefers the first record in ``original_entry.locations``; when that
    yields no city, falls back to flat fields on the original entry
    ('plaats' is the Dutch-source city field).
    """
    info: dict = {}
    original = entry.get('original_entry', {})

    locations = original.get('locations')
    if locations:
        first = locations[0]
        for key in ('city', 'region', 'country', 'street_address'):
            info[key] = first.get(key)

    if not info.get('city'):
        info['city'] = original.get('city') or original.get('plaats')
        info['country'] = original.get('country')

    return info
|
||||
|
||||
|
||||
def get_institution_type(entry: dict) -> str | None:
|
||||
"""Get institution type for search refinement."""
|
||||
inst_type = entry.get('original_entry', {}).get('institution_type')
|
||||
if inst_type:
|
||||
type_map = {
|
||||
'LIBRARY': 'library',
|
||||
'MUSEUM': 'museum',
|
||||
'ARCHIVE': 'archive',
|
||||
'GALLERY': 'gallery',
|
||||
'RESEARCH_CENTER': 'research center',
|
||||
'EDUCATION_PROVIDER': 'university',
|
||||
}
|
||||
return type_map.get(inst_type)
|
||||
return None
|
||||
|
||||
|
||||
def has_website(entry: dict) -> bool:
    """Return True when any known field already records a website URL."""
    original = entry.get('original_entry', {})

    if original.get('webadres_organisatie'):
        return True

    # Identifier records may carry a 'Website' scheme.
    if any(ident.get('identifier_scheme') == 'Website'
           for ident in original.get('identifiers', [])):
        return True

    # URLs harvested by earlier enrichment passes.
    enrichment_paths = (
        ('website_discovery', 'website_url'),
        ('wikidata_enrichment', 'wikidata_official_website'),
        ('google_maps_enrichment', 'website'),
    )
    return any(entry.get(section, {}).get(field)
               for section, field in enrichment_paths)
|
||||
|
||||
|
||||
def is_valid_website(url: str, country: str | None = None) -> bool:
    """Check whether *url* looks like an institution's own website.

    Rejects empty or unparseable URLs and any URL whose host is (or is a
    subdomain of) a blacklisted aggregator/social-media domain.

    The *country* parameter is accepted for interface compatibility; it is
    not currently used in validation.
    """
    if not url:
        return False

    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()

        # Normalise away a leading "www." prefix.
        if domain.startswith('www.'):
            domain = domain[4:]

        # Exact or dotted-suffix match only.  The previous substring test
        # ("google.com" in domain) also rejected unrelated hosts such as
        # "lagoogle.com".
        for blacklisted in DOMAIN_BLACKLIST:
            if domain == blacklisted or domain.endswith('.' + blacklisted):
                return False

        return True
    except Exception:
        return False
|
||||
|
||||
|
||||
def score_website(url: str, country: str, name: str) -> int:
    """Heuristically rank *url* as a candidate official site.

    Higher scores favour country-preferred TLDs, HTTPS, shallow URL paths
    (homepages) and domains that share words with the institution name.
    Unparseable URLs score 0.
    """
    points = 0

    try:
        parsed = urlparse(url)
        host = parsed.netloc.lower()

        # Country-specific TLDs, best match first (earlier entries in the
        # preference list are worth more).
        tlds = PREFERRED_TLDS.get(country, [])
        for rank, suffix in enumerate(tlds):
            if host.endswith(suffix):
                points += (len(tlds) - rank) * 10
                break

        if parsed.scheme == 'https':
            points += 5

        # Penalise deep links: the homepage is the better canonical URL.
        depth = sum(1 for segment in parsed.path.split('/') if segment)
        points -= depth * 2

        # Reward word overlap between the institution name and the domain.
        name_tokens = set(re.findall(r'\w+', name.lower()))
        host_tokens = set(re.findall(r'\w+', host))
        points += 5 * len(name_tokens & host_tokens)

    except Exception:
        pass

    return points
|
||||
|
||||
|
||||
def search_for_website(name: str, location: dict, inst_type: str | None = None) -> list[dict]:
    """Query DuckDuckGo for candidate official websites.

    Builds up to three queries (name+city, name+country+type, and
    name+"official website"), runs at most the first two, filters out
    invalid/blacklisted URLs, and returns the three best-scoring
    candidates with at most one per domain.
    """
    city = location.get('city', '')
    country = location.get('country', '')

    queries: list[str] = []
    if city:
        queries.append(f'"{name}" {city}')
    if inst_type:
        queries.append(f'"{name}" {country} {inst_type} official')
    queries.append(f'"{name}" official website')

    candidates: list[dict] = []
    ddgs = DDGS()

    for query in queries[:2]:  # at most two searches per institution
        try:
            for hit in list(ddgs.text(query, max_results=5)):
                url = hit.get('href') or hit.get('url')
                if not (url and is_valid_website(url, country)):
                    continue
                candidates.append({
                    'url': url,
                    'title': hit.get('title', ''),
                    'snippet': hit.get('body', ''),
                    'query': query,
                    'score': score_website(url, country, name),
                })

            time.sleep(1)  # be polite between queries

        except Exception as e:
            logger.warning(f"Search error for '{query}': {e}")
            time.sleep(2)

    # Keep only the highest-scoring candidate for each domain.
    seen_hosts: set[str] = set()
    deduped: list[dict] = []
    for candidate in sorted(candidates, key=lambda c: -c['score']):
        host = urlparse(candidate['url']).netloc.lower()
        if host not in seen_hosts:
            seen_hosts.add(host)
            deduped.append(candidate)

    return deduped[:3]
|
||||
|
||||
|
||||
async def verify_website(url: str) -> dict:
    """Fetch *url* (following redirects) and report accessibility details.

    Returns a dict with 'accessible' (HTTP 200), 'final_url' after
    redirects, 'status_code', and the page 'title' when readable.
    Network failures yield the default inaccessible result.
    """
    outcome = {
        'accessible': False,
        'final_url': url,
        'status_code': None,
        'title': None,
    }

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=15.0) as client:
            response = await client.get(url)
            outcome['status_code'] = response.status_code
            outcome['final_url'] = str(response.url)
            outcome['accessible'] = response.status_code == 200

            # Grab the <title> text for later confidence checks.
            if outcome['accessible']:
                title_match = re.search(r'<title[^>]*>([^<]+)</title>', response.text, re.I)
                if title_match:
                    outcome['title'] = title_match.group(1).strip()

    except Exception as e:
        logger.debug(f"Failed to verify {url}: {e}")

    return outcome
|
||||
|
||||
|
||||
def load_checkpoint() -> dict:
    """Read the saved progress checkpoint, or return a fresh empty one."""
    if not CHECKPOINT_FILE.exists():
        return {'processed_files': [], 'found_count': 0, 'not_found_count': 0}
    return json.loads(CHECKPOINT_FILE.read_text())
|
||||
|
||||
|
||||
def save_checkpoint(checkpoint: dict):
    """Persist the progress checkpoint as indented JSON."""
    CHECKPOINT_FILE.write_text(json.dumps(checkpoint, indent=2))
|
||||
|
||||
|
||||
def update_custodian_file(filepath: Path, website_url: str, discovery_info: dict) -> bool:
    """Write the discovered website (with provenance) into a custodian YAML.

    Adds a 'website_discovery' section recording the URL, discovery date,
    method, search query, a normalized confidence score, and verification
    details.  Returns True on success, False when the file is empty or
    cannot be read/written.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            return False

        verification = discovery_info.get('verification', {})
        entry['website_discovery'] = {
            'website_url': website_url,
            'discovery_date': datetime.now(timezone.utc).isoformat(),
            'discovery_method': 'duckduckgo_search',
            'search_query': discovery_info.get('query', ''),
            # Raw heuristic scores top out around 50; clamp to [0, 1].
            'confidence_score': min(discovery_info.get('score', 0) / 50, 1.0),
            'verification': {
                'accessible': verification.get('accessible', False),
                'page_title': verification.get('title'),
                'final_url': verification.get('final_url'),
            },
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        return True

    except Exception as e:
        logger.error(f"Failed to update {filepath}: {e}")
        return False
|
||||
|
||||
|
||||
async def process_file(filepath: Path, dry_run: bool = False) -> dict:
    """Process a single custodian file.

    Loads the YAML entry, skips it when a website is already recorded or
    when no usable name can be derived, otherwise searches for candidate
    websites and verifies the top one (falling back to the second
    candidate if the first is unreachable).  Unless *dry_run* is set, a
    verified URL is written back into the file.

    Returns a result dict with 'filename' and 'status' (one of: skipped,
    empty, has_website, no_name, not_found, found, inaccessible, error),
    plus 'website' and 'discovery_info' when a site was found.
    """
    result = {
        'filename': filepath.name,
        'status': 'skipped',
        'website': None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            result['status'] = 'empty'
            return result

        # Skip if already has website
        if has_website(entry):
            result['status'] = 'has_website'
            return result

        # Get institution info
        name = get_custodian_name(entry)
        if not name:
            result['status'] = 'no_name'
            return result

        location = get_location_info(entry)
        inst_type = get_institution_type(entry)
        # Fall back to the filename's country prefix (e.g. "JP-...") when
        # the entry itself carries no country.
        country = location.get('country', filepath.name[:2])

        logger.info(f"Searching for: {name} ({location.get('city', 'unknown city')}, {country})")

        # Search for website
        search_results = search_for_website(name, location, inst_type)

        if not search_results:
            result['status'] = 'not_found'
            return result

        # Verify top result
        best = search_results[0]
        verification = await verify_website(best['url'])
        best['verification'] = verification

        if verification['accessible']:
            result['website'] = verification['final_url']
            result['status'] = 'found'
            result['discovery_info'] = best

            if not dry_run:
                update_custodian_file(filepath, verification['final_url'], best)
            logger.info(f" → Found: {verification['final_url']}")
        else:
            # Try second result if first is inaccessible
            if len(search_results) > 1:
                second = search_results[1]
                verification2 = await verify_website(second['url'])
                if verification2['accessible']:
                    second['verification'] = verification2
                    result['website'] = verification2['final_url']
                    result['status'] = 'found'
                    result['discovery_info'] = second

                    if not dry_run:
                        update_custodian_file(filepath, verification2['final_url'], second)
                    logger.info(f" → Found (2nd): {verification2['final_url']}")
                else:
                    result['status'] = 'inaccessible'
            else:
                result['status'] = 'inaccessible'

    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
        logger.error(f"Error processing {filepath}: {e}")

    return result
|
||||
|
||||
|
||||
async def main():
    """CLI entry point: discover websites for custodian YAML files.

    Parses command-line options, selects the files to process (optionally
    filtered by country or resumed from a checkpoint), runs discovery per
    file with rate limiting, and prints a summary.
    """
    parser = argparse.ArgumentParser(description='Discover websites for custodian files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be discovered')
    parser.add_argument('--limit', type=int, help='Process only first N files')
    parser.add_argument('--file', type=str, help='Process a single specific file')
    parser.add_argument('--country', type=str, help='Filter by country code (e.g., JP, CZ)')
    parser.add_argument('--resume', action='store_true', help='Resume from checkpoint')

    args = parser.parse_args()

    # Get files to process
    if args.file:
        # An explicitly named file is processed as-is.  Previously the
        # custodian-filename filter below was also applied here and could
        # silently drop the requested file.
        files = [Path(args.file)]
    else:
        pattern = f"{args.country}-*.yaml" if args.country else "*.yaml"
        files = sorted(CUSTODIAN_DIR.glob(pattern))
        # Custodian files look like "JP-....yaml"; skip everything else.
        files = [f for f in files if f.name[0].isupper() and '-' in f.name]

    # Load checkpoint
    checkpoint = load_checkpoint() if args.resume else {'processed_files': [], 'found_count': 0, 'not_found_count': 0}
    processed_set = set(checkpoint['processed_files'])

    if args.resume:
        files = [f for f in files if f.name not in processed_set]
        logger.info(f"Resuming: {len(processed_set)} files already processed, {len(files)} remaining")

    # Apply limit
    if args.limit:
        files = files[:args.limit]

    logger.info(f"Processing {len(files)} custodian files...")

    # Process files
    found_count = checkpoint.get('found_count', 0)
    not_found_count = checkpoint.get('not_found_count', 0)

    for i, filepath in enumerate(files):
        result = await process_file(filepath, args.dry_run)

        # Update counts
        if result['status'] == 'found':
            found_count += 1
        elif result['status'] in ('not_found', 'inaccessible'):
            not_found_count += 1

        # Update checkpoint
        if not args.dry_run:
            checkpoint['processed_files'].append(filepath.name)
            checkpoint['found_count'] = found_count
            checkpoint['not_found_count'] = not_found_count

            if (i + 1) % 10 == 0:
                save_checkpoint(checkpoint)

        # Progress update
        if (i + 1) % 10 == 0:
            logger.info(f"Progress: {i + 1}/{len(files)} - Found: {found_count}, Not found: {not_found_count}")

        # Rate limiting: only needed when a network search actually ran,
        # and done with asyncio.sleep — the previous time.sleep() call
        # blocked the event loop inside this coroutine.
        if result['status'] not in ('has_website', 'empty', 'no_name', 'skipped'):
            await asyncio.sleep(REQUEST_DELAY)

    # Final checkpoint save
    if not args.dry_run:
        save_checkpoint(checkpoint)

    # Summary
    logger.info(f"\n{'='*50}")
    logger.info(f"Discovery complete!")
    logger.info(f"  Files processed: {len(files)}")
    logger.info(f"  Websites found: {found_count}")
    logger.info(f"  Not found: {not_found_count}")
    logger.info(f"{'='*50}")


if __name__ == '__main__':
    asyncio.run(main())
|
||||
150
scripts/discover_websites_crawl4ai.py
Normal file
150
scripts/discover_websites_crawl4ai.py
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simplified Website Discovery for Custodians using crawl4ai.
|
||||
Discovers websites by:
|
||||
1. Searching DuckDuckGo
|
||||
2. Verifying with crawl4ai
|
||||
3. Updating YAML files with discovered URLs
|
||||
"""
|
||||
import asyncio
|
||||
import httpx
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import yaml
|
||||
|
||||
# Logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration
|
||||
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
|
||||
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_crawl4ai_checkpoint.json"
|
||||
REQUEST_DELAY = 3.0 # seconds between requests
|
||||
DUCKDUCKGO_SEARCH = "https://duckduckgo.com/html/?q="
|
||||
|
||||
async def discover_websites(name, city, country):
    """Search DuckDuckGo for an institution's website and verify candidates.

    Args:
        name: Institution name to search for.
        city: Optional city used to narrow the search (may be None/empty).
        country: Country code; currently unused, kept for interface stability.

    Returns:
        A dict with ``status == 'found'`` (including ``website_url`` and
        ``title``) or ``status == 'not_found'``; ``None`` when the search
        request itself failed or produced no links.
    """
    import html  # stdlib: decode HTML entities (&amp; etc.) in extracted hrefs

    logger.info(f"Searching for: {name}")

    # Build the search query, appending the city when one is supplied.
    query = f"{name} {city}" if city else name

    # Query DuckDuckGo's HTML endpoint directly.
    search_url = f"{DUCKDUCKGO_SEARCH}{query.replace(' ', '+')}"

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client:
            response = await client.get(search_url)
            if response.status_code not in [200, 202]:
                logger.warning(f"Search failed: {response.status_code}")
                return None

            page = response.text
            links = []
            # Extract result anchors. The previous regex had a malformed
            # second group (stray quote), and the entity handling was a chain
            # of no-op .replace() calls; html.unescape() does it correctly.
            for match in re.finditer(r'<a[^>]+href="([^"]+)"[^>]*>([^<]+)</a>', page, re.I):
                href = html.unescape(match.group(1))
                if href:
                    links.append({'url': href, 'title': match.group(2)})

            if not links:
                logger.info(f"No results found")
                return None

            logger.info(f"Found {len(links)} candidates, verifying...")

            verified = []
            # Shorter titles first: they tend to be the institution homepage
            # rather than a deep page.
            for link in sorted(links, key=lambda x: len(x['title'])):
                try:
                    async with httpx.AsyncClient(timeout=15.0) as verify_client:
                        verify_response = await verify_client.get(link['url'])
                        if verify_response.status_code == 200:
                            logger.info(f"Verified: {link['url']}")
                            verified.append({
                                'url': link['url'],
                                'title': link['title'],
                                'status': 'found'
                            })
                        else:
                            logger.debug(f"Verification failed for {link['url']}")
                except Exception:
                    # Best-effort verification: an unreachable candidate is
                    # simply skipped, not fatal.
                    logger.debug(f"Verification error for {link['url']}")

            if verified:
                best = verified[0]
                logger.info(f"Best candidate: {best['url']}")
                return {
                    'status': 'found',
                    'message': f"Discovered and verified: {best['url']}",
                    'website_url': best['url'],
                    'title': best.get('title'),
                }
            logger.info(f"No valid websites found")
            return {
                'status': 'not_found',
                'message': 'No valid results found'
            }

    except Exception as e:
        logger.error(f"Search error: {e}")
        return None
|
||||
|
||||
def update_custodian_file(filepath, website_url, title):
    """Record a discovered website in a custodian YAML file.

    Loads the YAML record, attaches a ``website_discovery`` section with the
    discovery metadata, and writes the file back in place.

    Returns:
        True when the file was updated; False when the file was invalid or
        any step failed.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            record = yaml.safe_load(handle)

        if not record:
            logger.error(f"Invalid file: {filepath}")
            return False

        # Attach discovery metadata under its own section.
        discovery = {
            'website_url': website_url,
            'discovery_date': datetime.now(timezone.utc).isoformat(),
            'discovery_method': 'crawl4ai_search_and_verify',
            'title': title,
            'confidence_score': 0.0,  # Will be updated if verification succeeds
        }
        record['website_discovery'] = discovery

        with open(filepath, 'w', encoding='utf-8') as handle:
            yaml.dump(record, handle, allow_unicode=True, default_flow_style=False, sort_keys=False)

        logger.info(f"Updated: {filepath}")
        return True
    except Exception as e:
        logger.error(f"Failed to update {filepath}: {e}")
        return False
|
||||
|
||||
async def main():
    """Discover and record websites for custodian YAML files.

    Iterates the selected custodian files, searches for each institution's
    website, and updates the YAML file only when a verified website was found.
    """
    # NOTE(review): the [:1] slice limits the run to a single file and looks
    # like a debug leftover — confirm before a full production run.
    files = sorted(CUSTODIAN_DIR.glob("JP-*.yaml"))[:1]  # Test with 1 file

    logger.info(f"Processing {len(files)} custodian files...")

    for filepath in files:
        # Derive a human-readable name from the file name (underscores → spaces).
        name = filepath.stem.replace('_', ' ')
        logger.info(f"Processing: {name}")

        result = await discover_websites(name, None, 'JP')

        # BUGFIX: discover_websites returns a truthy dict even when its
        # status is 'not_found'; the old `if url:` check then wrote
        # website_url: None into the YAML file. Only a 'found' result
        # counts as a discovery.
        if result and result.get('status') == 'found':
            website_url = result.get('website_url') or result.get('url')
            title = result.get('title')
            if update_custodian_file(filepath, website_url, title):
                logger.info(f" → Discovered: {website_url}")
        else:
            logger.info(f"No website found")

    logger.info("Done!")
|
||||
|
||||
# Script entry point: run the async main() on a fresh asyncio event loop.
if __name__ == '__main__':
    asyncio.run(main())
|
||||
|
|
@ -75,19 +75,26 @@ REQUEST_DELAY = 2.0 # seconds between requests
|
|||
|
||||
def get_website_url(entry: dict) -> str | None:
|
||||
"""Extract website URL from custodian entry."""
|
||||
# Priority 1: Original entry webadres
|
||||
# Priority 1: Original entry webadres (Dutch ISIL format)
|
||||
if entry.get('original_entry', {}).get('webadres_organisatie'):
|
||||
url = entry['original_entry']['webadres_organisatie']
|
||||
if url and url.strip() and url.strip().lower() not in ('null', 'none', ''):
|
||||
return normalize_url(url.strip())
|
||||
|
||||
# Priority 2: Museum register website
|
||||
# Priority 2: Website in identifiers array (Czech ISIL and ARON format)
|
||||
for ident in entry.get('original_entry', {}).get('identifiers', []):
|
||||
if ident.get('identifier_scheme') == 'Website':
|
||||
url = ident.get('identifier_value') or ident.get('identifier_url')
|
||||
if url and url.strip():
|
||||
return normalize_url(url.strip())
|
||||
|
||||
# Priority 3: Museum register website
|
||||
if entry.get('museum_register_enrichment', {}).get('website_url'):
|
||||
url = entry['museum_register_enrichment']['website_url']
|
||||
if url and url.strip():
|
||||
return normalize_url(url.strip())
|
||||
|
||||
# Priority 3: Wikidata official website
|
||||
# Priority 4: Wikidata official website
|
||||
if entry.get('wikidata_enrichment', {}).get('wikidata_official_website'):
|
||||
url = entry['wikidata_enrichment']['wikidata_official_website']
|
||||
# Handle list of URLs (take first one)
|
||||
|
|
@ -96,13 +103,13 @@ def get_website_url(entry: dict) -> str | None:
|
|||
if url and isinstance(url, str) and url.strip():
|
||||
return normalize_url(url.strip())
|
||||
|
||||
# Priority 4: Google Maps website
|
||||
# Priority 5: Google Maps website
|
||||
if entry.get('google_maps_enrichment', {}).get('website'):
|
||||
url = entry['google_maps_enrichment']['website']
|
||||
if url and url.strip():
|
||||
return normalize_url(url.strip())
|
||||
|
||||
# Priority 5: Web enrichment source URL
|
||||
# Priority 6: Web enrichment source URL
|
||||
if entry.get('web_enrichment', {}).get('source_url'):
|
||||
url = entry['web_enrichment']['source_url']
|
||||
if url and url.strip():
|
||||
|
|
|
|||
|
|
@ -54,9 +54,22 @@ def extract_person_text(data: dict[str, Any]) -> str:
|
|||
parts = []
|
||||
|
||||
profile = data.get("profile_data", {})
|
||||
person = data.get("person", {})
|
||||
source_staff = data.get("source_staff_info", {})
|
||||
extraction = data.get("extraction_metadata", {})
|
||||
|
||||
# Full name (primary identifier)
|
||||
name = profile.get("full_name", "")
|
||||
# Full name - check ALL possible locations in order of priority
|
||||
name = (
|
||||
profile.get("full_name") or
|
||||
profile.get("name") or
|
||||
person.get("full_name") or
|
||||
person.get("name") or
|
||||
source_staff.get("name") or
|
||||
source_staff.get("person_name") or
|
||||
extraction.get("person_name") or
|
||||
data.get("name") or
|
||||
""
|
||||
)
|
||||
if name:
|
||||
parts.append(f"Name: {name}")
|
||||
|
||||
|
|
@ -259,13 +272,21 @@ def extract_metadata(data: dict[str, Any], filepath: Path) -> dict[str, Any]:
|
|||
}
|
||||
|
||||
profile = data.get("profile_data", {})
|
||||
person = data.get("person", {})
|
||||
source_staff = data.get("source_staff_info", {})
|
||||
extraction = data.get("extraction_metadata", {})
|
||||
|
||||
# Full name - check multiple possible field names
|
||||
# Full name - check ALL possible field names (same as extract_person_text)
|
||||
name = (
|
||||
profile.get("name", "") or
|
||||
profile.get("full_name", "") or
|
||||
data.get("name", "")
|
||||
profile.get("full_name") or
|
||||
profile.get("name") or
|
||||
person.get("full_name") or
|
||||
person.get("name") or
|
||||
source_staff.get("name") or
|
||||
source_staff.get("person_name") or
|
||||
extraction.get("person_name") or
|
||||
data.get("name") or
|
||||
""
|
||||
)
|
||||
if name:
|
||||
metadata["name"] = name
|
||||
|
|
@ -414,16 +435,19 @@ def find_person_files(data_dir: Path) -> list[Path]:
|
|||
|
||||
|
||||
class PersonRetriever:
|
||||
"""Qdrant retriever specifically for person entities."""
|
||||
"""Qdrant retriever specifically for person entities.
|
||||
|
||||
Uses MiniLM (384-dim) embeddings by default for consistency with
|
||||
the hybrid_retriever.py query-time embedding model.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host: str = "localhost",
|
||||
port: int = 6333,
|
||||
collection_name: str = "heritage_persons",
|
||||
embedding_model: str = "text-embedding-3-small",
|
||||
embedding_dim: int = 1536,
|
||||
api_key: str | None = None,
|
||||
embedding_model: str = "all-MiniLM-L6-v2", # MiniLM for local embeddings
|
||||
embedding_dim: int = 384, # MiniLM output dimension
|
||||
url: str | None = None,
|
||||
https: bool = False,
|
||||
prefix: str | None = None,
|
||||
|
|
@ -434,7 +458,7 @@ class PersonRetriever:
|
|||
self.collection_name = collection_name
|
||||
self.embedding_model = embedding_model
|
||||
self.embedding_dim = embedding_dim
|
||||
self.api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
# MiniLM model runs locally, no API key needed
|
||||
|
||||
# Initialize Qdrant client
|
||||
if url:
|
||||
|
|
@ -451,25 +475,23 @@ class PersonRetriever:
|
|||
else:
|
||||
self.client = QdrantClient(host=host, port=port, timeout=60)
|
||||
|
||||
self._openai_client = None
|
||||
self._sentence_model = None
|
||||
|
||||
@property
|
||||
def openai_client(self):
|
||||
"""Lazy-load OpenAI client."""
|
||||
if self._openai_client is None:
|
||||
import openai
|
||||
self._openai_client = openai.OpenAI(api_key=self.api_key)
|
||||
return self._openai_client
|
||||
def sentence_model(self):
|
||||
"""Lazy-load SentenceTransformer model."""
|
||||
if self._sentence_model is None:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
logger.info(f"Loading embedding model: {self.embedding_model}")
|
||||
self._sentence_model = SentenceTransformer(self.embedding_model)
|
||||
return self._sentence_model
|
||||
|
||||
def _get_embeddings_batch(self, texts: list[str]) -> list[list[float]]:
|
||||
"""Get embedding vectors for multiple texts."""
|
||||
"""Get embedding vectors for multiple texts using MiniLM."""
|
||||
if not texts:
|
||||
return []
|
||||
response = self.openai_client.embeddings.create(
|
||||
input=texts,
|
||||
model=self.embedding_model
|
||||
)
|
||||
return [item.embedding for item in sorted(response.data, key=lambda x: x.index)]
|
||||
embeddings = self.sentence_model.encode(texts, show_progress_bar=False)
|
||||
return embeddings.tolist()
|
||||
|
||||
def ensure_collection(self) -> None:
|
||||
"""Ensure the collection exists, create if not."""
|
||||
|
|
@ -655,10 +677,7 @@ def main():
|
|||
logger.info(f" Metadata: {list(doc['metadata'].keys())}")
|
||||
sys.exit(0)
|
||||
|
||||
# Check for OpenAI API key
|
||||
if not os.getenv("OPENAI_API_KEY"):
|
||||
logger.error("OPENAI_API_KEY environment variable is required for embeddings")
|
||||
sys.exit(1)
|
||||
# Note: MiniLM model runs locally, no API key needed
|
||||
|
||||
# Create retriever
|
||||
if args.url:
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue