enrich entries
This commit is contained in:
parent
59963c8d3f
commit
ca219340f2
24 changed files with 2304 additions and 129 deletions
292
docs/GRAPH_SCORE_INHERITANCE.md
Normal file
292
docs/GRAPH_SCORE_INHERITANCE.md
Normal file
|
|
@ -0,0 +1,292 @@
|
||||||
|
# Graph Score Inheritance in Hybrid Retrieval
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The Heritage RAG system uses a **hybrid retrieval** approach that combines:
|
||||||
|
1. **Vector search** (semantic similarity via embeddings)
|
||||||
|
2. **Knowledge graph expansion** (SPARQL-based relationship discovery)
|
||||||
|
|
||||||
|
This document explains the **graph score inheritance** feature that ensures vector search results benefit from knowledge graph relationships.
|
||||||
|
|
||||||
|
## The Problem
|
||||||
|
|
||||||
|
Before graph score inheritance, the hybrid retrieval had a scoring gap:
|
||||||
|
|
||||||
|
| Result Source | Vector Score | Graph Score | Combined Score |
|
||||||
|
|---------------|--------------|-------------|----------------|
|
||||||
|
| Vector search results | 0.5-0.8 | **0.0** | 0.35-0.56 |
|
||||||
|
| Graph expansion results | 0.0 | 0.5-0.8 | 0.15-0.24 |
|
||||||
|
|
||||||
|
**Why this happened:**
|
||||||
|
- Vector search finds institutions semantically similar to the query
|
||||||
|
- Graph expansion finds **different** institutions (same city/type) with different GHCIDs
|
||||||
|
- Since GHCIDs don't match, no direct merging occurs
|
||||||
|
- Vector results always dominate because `combined = 0.7 * vector + 0.3 * graph`
|
||||||
|
|
||||||
|
**Example before fix:**
|
||||||
|
```
|
||||||
|
Query: "Archieven in Amsterdam"
|
||||||
|
|
||||||
|
1. Stadsarchief Amsterdam | V:0.659 G:0.000 C:0.461
|
||||||
|
2. Noord-Hollands Archief | V:0.675 G:0.000 C:0.472
|
||||||
|
3. The Black Archives | V:0.636 G:0.000 C:0.445
|
||||||
|
```
|
||||||
|
|
||||||
|
The graph expansion was finding related institutions in Amsterdam, but that information wasn't reflected in the scores.
|
||||||
|
|
||||||
|
## The Solution: Graph Score Inheritance
|
||||||
|
|
||||||
|
Vector results now **inherit** graph scores from related institutions found via graph expansion.
|
||||||
|
|
||||||
|
### How It Works
|
||||||
|
|
||||||
|
```
|
||||||
|
1. Vector Search
|
||||||
|
└── Returns: [Inst_A, Inst_B, Inst_C] with vector_scores
|
||||||
|
|
||||||
|
2. Graph Expansion (for top 5 vector results)
|
||||||
|
└── For Inst_A in Amsterdam:
|
||||||
|
└── SPARQL finds: [Inst_X, Inst_Y] also in Amsterdam
|
||||||
|
└── These get graph_score=0.8 (same_city)
|
||||||
|
└── They track: related_institutions=[Inst_A.ghcid]
|
||||||
|
|
||||||
|
3. Inheritance Calculation
|
||||||
|
└── Inst_A inherits from [Inst_X, Inst_Y]:
|
||||||
|
inherited_score = avg([0.8, 0.8]) * 0.5 = 0.4
|
||||||
|
└── Inst_A.graph_score = max(0.0, 0.4) = 0.4
|
||||||
|
|
||||||
|
4. Combined Scoring
|
||||||
|
└── Inst_A.combined = 0.7 * vector + 0.3 * 0.4 = higher rank!
|
||||||
|
```
|
||||||
|
|
||||||
|
### Inheritance Factor
|
||||||
|
|
||||||
|
```python
|
||||||
|
INHERITANCE_FACTOR = 0.5 # Inherit 50% of related institutions' graph scores
|
||||||
|
```
|
||||||
|
|
||||||
|
This means:
|
||||||
|
- Same-city institutions (graph_score=0.8) → inherited score of **0.40**
|
||||||
|
- Same-type institutions (graph_score=0.5) → inherited score of **0.25**
|
||||||
|
|
||||||
|
## Implementation Details
|
||||||
|
|
||||||
|
### File Location
|
||||||
|
|
||||||
|
```
|
||||||
|
/Users/kempersc/apps/glam/src/glam_extractor/api/hybrid_retriever.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Key Method: `_combine_and_rank()`
|
||||||
|
|
||||||
|
Located at lines ~1539-1671, this method:
|
||||||
|
|
||||||
|
1. **Creates lookup by GHCID** for merging
|
||||||
|
2. **Handles direct merges** when graph result GHCID matches vector result
|
||||||
|
3. **Builds inheritance map** tracking which vector results each graph result was expanded from
|
||||||
|
4. **Applies inheritance** calculating inherited scores for vector results
|
||||||
|
5. **Computes combined scores** with the formula: `0.7 * vector + 0.3 * graph`
|
||||||
|
|
||||||
|
### Code Structure
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _combine_and_rank(
|
||||||
|
self,
|
||||||
|
vector_results: list[RetrievedInstitution],
|
||||||
|
graph_results: list[RetrievedInstitution],
|
||||||
|
k: int
|
||||||
|
) -> list[RetrievedInstitution]:
|
||||||
|
"""Combine vector and graph results with weighted scoring and graph inheritance."""
|
||||||
|
|
||||||
|
# 1. Create lookup by GHCID
|
||||||
|
results_by_ghcid: dict[str, RetrievedInstitution] = {}
|
||||||
|
vector_ghcids = set()
|
||||||
|
|
||||||
|
# 2. Add vector results
|
||||||
|
for inst in vector_results:
|
||||||
|
results_by_ghcid[inst.ghcid] = inst
|
||||||
|
vector_ghcids.add(inst.ghcid)
|
||||||
|
|
||||||
|
# 3. Build inheritance map: vector_ghcid -> [(related_ghcid, graph_score, reason)]
|
||||||
|
inheritance_map: dict[str, list[tuple[str, float, str]]] = {g: [] for g in vector_ghcids}
|
||||||
|
|
||||||
|
for inst in graph_results:
|
||||||
|
if inst.ghcid in results_by_ghcid:
|
||||||
|
# Direct merge
|
||||||
|
existing = results_by_ghcid[inst.ghcid]
|
||||||
|
existing.graph_score = max(existing.graph_score, inst.graph_score)
|
||||||
|
else:
|
||||||
|
# New from graph - track for inheritance
|
||||||
|
results_by_ghcid[inst.ghcid] = inst
|
||||||
|
for seed_ghcid in inst.related_institutions:
|
||||||
|
if seed_ghcid in inheritance_map:
|
||||||
|
inheritance_map[seed_ghcid].append(
|
||||||
|
(inst.ghcid, inst.graph_score, inst.expansion_reason)
|
||||||
|
)
|
||||||
|
|
||||||
|
# 4. Apply inheritance
|
||||||
|
INHERITANCE_FACTOR = 0.5
|
||||||
|
for vector_ghcid, related_list in inheritance_map.items():
|
||||||
|
if related_list:
|
||||||
|
inst = results_by_ghcid[vector_ghcid]
|
||||||
|
related_scores = [score for _, score, _ in related_list]
|
||||||
|
inherited_score = (sum(related_scores) / len(related_scores)) * INHERITANCE_FACTOR
|
||||||
|
inst.graph_score = max(inst.graph_score, inherited_score)
|
||||||
|
|
||||||
|
# 5. Calculate combined scores
|
||||||
|
for inst in results_by_ghcid.values():
|
||||||
|
inst.combined_score = (
|
||||||
|
self.vector_weight * inst.vector_score +
|
||||||
|
self.graph_weight * inst.graph_score
|
||||||
|
)
|
||||||
|
|
||||||
|
return sorted(results_by_ghcid.values(), key=lambda x: x.combined_score, reverse=True)[:k]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Graph Expansion Scores
|
||||||
|
|
||||||
|
The `_expand_via_graph()` method assigns these base scores:
|
||||||
|
|
||||||
|
| Expansion Type | Graph Score | SPARQL Pattern |
|
||||||
|
|----------------|-------------|----------------|
|
||||||
|
| Same city | 0.8 | `?s schema:location ?loc . ?loc hc:cityCode ?cityCode` |
|
||||||
|
| Same institution type | 0.5 | `?s hc:institutionType ?type` |
|
||||||
|
|
||||||
|
## Results
|
||||||
|
|
||||||
|
### Before (Graph Score = 0.0)
|
||||||
|
|
||||||
|
```
|
||||||
|
Query: "Welke musea zijn er in Utrecht?"
|
||||||
|
|
||||||
|
1. Centraal Museum | V:0.589 G:0.000 C:0.412
|
||||||
|
2. Museum Speelklok | V:0.591 G:0.000 C:0.414
|
||||||
|
3. Universiteitsmuseum Utrecht | V:0.641 G:0.000 C:0.449
|
||||||
|
```
|
||||||
|
|
||||||
|
### After (Graph Score Inherited)
|
||||||
|
|
||||||
|
```
|
||||||
|
Query: "Welke musea zijn er in Utrecht?"
|
||||||
|
|
||||||
|
1. Universiteitsmuseum Utrecht | V:0.641 G:0.400 C:0.569
|
||||||
|
2. Museum Speelklok | V:0.591 G:0.400 C:0.534
|
||||||
|
3. Centraal Museum | V:0.589 G:0.400 C:0.532
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key improvements:**
|
||||||
|
- Graph scores now **0.400** (inherited from same-city museums)
|
||||||
|
- Combined scores **increased by ~25%** (0.412 → 0.532)
|
||||||
|
- Ranking now considers **geographic relevance**
|
||||||
|
|
||||||
|
### More Examples
|
||||||
|
|
||||||
|
```
|
||||||
|
Query: "Bibliotheken in Den Haag"
|
||||||
|
|
||||||
|
1. Centrale Bibliotheek | V:0.697 G:0.400 C:0.608
|
||||||
|
2. Koninklijke Bibliotheek | V:0.676 G:0.400 C:0.593
|
||||||
|
3. Huis van het Boek | V:0.630 G:0.400 C:0.561
|
||||||
|
4. Bibliotheek Hoeksche Waard | V:0.613 G:0.400 C:0.549
|
||||||
|
5. Centrale Bibliotheek (other) | V:0.623 G:0.000 C:0.436 <- No inheritance (different city)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Weights (in `HybridRetriever.__init__`)
|
||||||
|
|
||||||
|
```python
|
||||||
|
self.vector_weight = 0.7 # Semantic similarity importance
|
||||||
|
self.graph_weight = 0.3 # Knowledge graph importance
|
||||||
|
```
|
||||||
|
|
||||||
|
### Inheritance Factor
|
||||||
|
|
||||||
|
```python
|
||||||
|
INHERITANCE_FACTOR = 0.5 # In _combine_and_rank()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Tuning considerations:**
|
||||||
|
- Higher factor (0.6-0.8): Stronger influence from graph relationships
|
||||||
|
- Lower factor (0.3-0.4): More conservative, vector similarity dominates
|
||||||
|
- Current value (0.5): Balanced approach
|
||||||
|
|
||||||
|
## Logging
|
||||||
|
|
||||||
|
The implementation includes detailed logging for debugging:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# INFO level (always visible)
|
||||||
|
logger.info(f"Graph inheritance applied to {len(inheritance_boosts)} vector results: {ghcids}...")
|
||||||
|
|
||||||
|
# DEBUG level (when LOG_LEVEL=DEBUG)
|
||||||
|
logger.debug(f"Inheritance: {ghcid} graph_score: {old:.3f} -> {new:.3f} (from {n} related)")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Check logs on production:**
|
||||||
|
```bash
|
||||||
|
ssh root@91.98.224.44 "journalctl -u glam-rag-api --since '5 minutes ago' | grep -i inheritance"
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Response Structure
|
||||||
|
|
||||||
|
The graph score is exposed in the API response:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"retrieved_results": [
|
||||||
|
{
|
||||||
|
"ghcid": "NL-UT-UTR-M-CM",
|
||||||
|
"name": "Centraal Museum",
|
||||||
|
"scores": {
|
||||||
|
"vector": 0.589,
|
||||||
|
"graph": 0.400, // <-- Now populated via inheritance
|
||||||
|
"combined": 0.532
|
||||||
|
},
|
||||||
|
"related_institutions": ["NL-UT-UTR-M-MS", "NL-UT-UTR-M-UMUU"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Deployment
|
||||||
|
|
||||||
|
**File to deploy:**
|
||||||
|
```bash
|
||||||
|
scp /Users/kempersc/apps/glam/src/glam_extractor/api/hybrid_retriever.py \
|
||||||
|
root@91.98.224.44:/opt/glam-backend/rag/glam_extractor/api/
|
||||||
|
```
|
||||||
|
|
||||||
|
**Restart service:**
|
||||||
|
```bash
|
||||||
|
ssh root@91.98.224.44 "systemctl restart glam-rag-api"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Verify:**
|
||||||
|
```bash
|
||||||
|
curl -s -X POST 'https://archief.support/api/rag/dspy/query' \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d '{"question": "Musea in Rotterdam", "language": "nl"}' | \
|
||||||
|
python3 -c "import sys,json; r=json.load(sys.stdin)['retrieved_results']; print('\n'.join(f\"{x['name'][:30]:30} G:{x['scores']['graph']:.2f}\" for x in r[:5]))"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Related Files
|
||||||
|
|
||||||
|
| File | Purpose |
|
||||||
|
|------|---------|
|
||||||
|
| `hybrid_retriever.py` | Main implementation with `_combine_and_rank()` |
|
||||||
|
| `dspy_heritage_rag.py` | RAG pipeline that calls `retriever.search()` |
|
||||||
|
| `main.py` | FastAPI endpoints serving the RAG API |
|
||||||
|
|
||||||
|
## Future Improvements
|
||||||
|
|
||||||
|
1. **Dynamic inheritance factor**: Adjust based on query type (geographic vs. thematic)
|
||||||
|
2. **Multi-hop expansion**: Inherit from institutions 2+ hops away
|
||||||
|
3. **Weighted inheritance**: Weight by relationship type (same_city=0.8, same_type=0.5)
|
||||||
|
4. **Negative inheritance**: Penalize results unrelated to graph findings
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** 2025-12-24
|
||||||
|
**Implemented:** 2025-12-23
|
||||||
|
**Status:** Production (archief.support)
|
||||||
|
|
@ -16,14 +16,14 @@
|
||||||
"test:coverage": "vitest run --coverage"
|
"test:coverage": "vitest run --coverage"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@glam/api-client": "workspace:*",
|
|
||||||
"@glam/hooks": "workspace:*",
|
|
||||||
"@glam/theme": "workspace:*",
|
|
||||||
"@glam/ui": "workspace:*",
|
|
||||||
"@codemirror/lang-javascript": "^6.2.4",
|
"@codemirror/lang-javascript": "^6.2.4",
|
||||||
"@duckdb/duckdb-wasm": "^1.31.0",
|
"@duckdb/duckdb-wasm": "^1.31.0",
|
||||||
"@emotion/react": "^11.14.0",
|
"@emotion/react": "^11.14.0",
|
||||||
"@emotion/styled": "^11.14.1",
|
"@emotion/styled": "^11.14.1",
|
||||||
|
"@glam/api-client": "workspace:*",
|
||||||
|
"@glam/hooks": "workspace:*",
|
||||||
|
"@glam/theme": "workspace:*",
|
||||||
|
"@glam/ui": "workspace:*",
|
||||||
"@mui/icons-material": "^7.3.6",
|
"@mui/icons-material": "^7.3.6",
|
||||||
"@mui/material": "^7.3.5",
|
"@mui/material": "^7.3.5",
|
||||||
"@tanstack/react-query": "^5.90.10",
|
"@tanstack/react-query": "^5.90.10",
|
||||||
|
|
@ -45,7 +45,7 @@
|
||||||
"fdir": "^6.5.0",
|
"fdir": "^6.5.0",
|
||||||
"js-yaml": "^4.1.1",
|
"js-yaml": "^4.1.1",
|
||||||
"lodash": "^4.17.21",
|
"lodash": "^4.17.21",
|
||||||
"lucide-react": "^0.561.0",
|
"lucide-react": "^0.562.0",
|
||||||
"maplibre-gl": "^5.14.0",
|
"maplibre-gl": "^5.14.0",
|
||||||
"mermaid": "^11.4.0",
|
"mermaid": "^11.4.0",
|
||||||
"n3": "^1.26.0",
|
"n3": "^1.26.0",
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
{
|
{
|
||||||
"generated": "2025-12-23T16:58:31.474Z",
|
"generated": "2025-12-25T12:42:29.931Z",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"categories": [
|
"categories": [
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -872,3 +872,106 @@
|
||||||
padding: 0.125rem 0.375rem;
|
padding: 0.125rem 0.375rem;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ==========================================================================
|
||||||
|
Chain-of-Thought Reasoning Display (GLM 4.7 Interleaved Thinking)
|
||||||
|
========================================================================== */
|
||||||
|
|
||||||
|
.conversation-panel__reasoning {
|
||||||
|
margin: 0.75rem 0;
|
||||||
|
border: 1px solid var(--border-color, #e0e0e0);
|
||||||
|
border-radius: 0.5rem;
|
||||||
|
overflow: hidden;
|
||||||
|
background: var(--surface-secondary, #fafafa);
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning-toggle {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 0.5rem;
|
||||||
|
padding: 0.5rem 0.75rem;
|
||||||
|
background: var(--surface-secondary, #f5f5f5);
|
||||||
|
cursor: pointer;
|
||||||
|
font-size: 0.8125rem;
|
||||||
|
color: var(--text-secondary, #757575);
|
||||||
|
border: none;
|
||||||
|
width: 100%;
|
||||||
|
transition: background-color 0.2s;
|
||||||
|
list-style: none; /* Remove default marker */
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning-toggle::-webkit-details-marker {
|
||||||
|
display: none; /* Hide default arrow in WebKit browsers */
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning-toggle::before {
|
||||||
|
content: '▶';
|
||||||
|
font-size: 0.625rem;
|
||||||
|
transition: transform 0.2s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning[open] .conversation-panel__reasoning-toggle::before {
|
||||||
|
transform: rotate(90deg);
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning-toggle:hover {
|
||||||
|
background: var(--surface-tertiary, #eeeeee);
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning-toggle svg {
|
||||||
|
color: var(--primary-color, #1976d2);
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning-content {
|
||||||
|
padding: 0.75rem;
|
||||||
|
background: var(--surface-code, #1e1e1e);
|
||||||
|
max-height: 300px;
|
||||||
|
overflow-y: auto;
|
||||||
|
border-top: 1px solid var(--border-color, #e0e0e0);
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning-content pre {
|
||||||
|
margin: 0;
|
||||||
|
white-space: pre-wrap;
|
||||||
|
word-break: break-word;
|
||||||
|
font-size: 0.75rem;
|
||||||
|
line-height: 1.5;
|
||||||
|
color: var(--text-code, #d4d4d4);
|
||||||
|
font-family: 'Fira Code', 'Monaco', 'Consolas', monospace;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Scrollbar styling for reasoning content */
|
||||||
|
.conversation-panel__reasoning-content::-webkit-scrollbar {
|
||||||
|
width: 6px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning-content::-webkit-scrollbar-track {
|
||||||
|
background: #2d2d2d;
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning-content::-webkit-scrollbar-thumb {
|
||||||
|
background: #555;
|
||||||
|
border-radius: 3px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning-content::-webkit-scrollbar-thumb:hover {
|
||||||
|
background: #777;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Responsive: Reasoning section */
|
||||||
|
@media (max-width: 768px) {
|
||||||
|
.conversation-panel__reasoning-toggle {
|
||||||
|
font-size: 0.75rem;
|
||||||
|
padding: 0.375rem 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning-content {
|
||||||
|
max-height: 200px;
|
||||||
|
padding: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.conversation-panel__reasoning-content pre {
|
||||||
|
font-size: 0.6875rem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -164,6 +164,9 @@ const TEXT = {
|
||||||
sourcesUsed: { nl: 'Bronnen gebruikt', en: 'Sources used' },
|
sourcesUsed: { nl: 'Bronnen gebruikt', en: 'Sources used' },
|
||||||
llmProvider: { nl: 'Model', en: 'Model' },
|
llmProvider: { nl: 'Model', en: 'Model' },
|
||||||
answer: { nl: 'Antwoord', en: 'Answer' },
|
answer: { nl: 'Antwoord', en: 'Answer' },
|
||||||
|
showReasoning: { nl: 'Toon redenering', en: 'Show reasoning' },
|
||||||
|
hideReasoning: { nl: 'Verberg redenering', en: 'Hide reasoning' },
|
||||||
|
reasoningTitle: { nl: 'Denkproces', en: 'Thinking Process' },
|
||||||
};
|
};
|
||||||
|
|
||||||
// Example questions to help users get started - shorter list
|
// Example questions to help users get started - shorter list
|
||||||
|
|
@ -180,6 +183,21 @@ const EXAMPLE_QUESTIONS = {
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// LLM Response Metadata - matches backend LLMResponseMetadata model
|
||||||
|
interface LLMResponseMetadata {
|
||||||
|
content?: string;
|
||||||
|
reasoning_content?: string; // GLM 4.7 chain-of-thought reasoning
|
||||||
|
model?: string;
|
||||||
|
provider?: string; // zai, anthropic, openai, groq
|
||||||
|
prompt_tokens?: number;
|
||||||
|
completion_tokens?: number;
|
||||||
|
total_tokens?: number;
|
||||||
|
thinking_mode?: string; // enabled, disabled, interleaved
|
||||||
|
latency_ms?: number;
|
||||||
|
cached?: boolean;
|
||||||
|
finish_reason?: string;
|
||||||
|
}
|
||||||
|
|
||||||
interface Message {
|
interface Message {
|
||||||
id: string;
|
id: string;
|
||||||
role: 'user' | 'assistant';
|
role: 'user' | 'assistant';
|
||||||
|
|
@ -192,6 +210,7 @@ interface Message {
|
||||||
error?: string;
|
error?: string;
|
||||||
errorCode?: string;
|
errorCode?: string;
|
||||||
llmProviderUsed?: string; // Which LLM provider generated this response
|
llmProviderUsed?: string; // Which LLM provider generated this response
|
||||||
|
llmResponse?: LLMResponseMetadata; // Full LLM response metadata including chain-of-thought
|
||||||
}
|
}
|
||||||
|
|
||||||
interface HistoryItem {
|
interface HistoryItem {
|
||||||
|
|
@ -351,6 +370,7 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
|
||||||
answer: string;
|
answer: string;
|
||||||
sourcesUsed: string[];
|
sourcesUsed: string[];
|
||||||
llmProviderUsed?: string;
|
llmProviderUsed?: string;
|
||||||
|
llmResponse?: LLMResponseMetadata; // Full LLM response with reasoning_content
|
||||||
}> => {
|
}> => {
|
||||||
// Determine API endpoint based on environment
|
// Determine API endpoint based on environment
|
||||||
const hostname = window.location.hostname;
|
const hostname = window.location.hostname;
|
||||||
|
|
@ -395,6 +415,7 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
|
||||||
answer: data.answer || data.explanation || '',
|
answer: data.answer || data.explanation || '',
|
||||||
sourcesUsed: data.sources_used || selectedSources,
|
sourcesUsed: data.sources_used || selectedSources,
|
||||||
llmProviderUsed: data.llm_provider_used,
|
llmProviderUsed: data.llm_provider_used,
|
||||||
|
llmResponse: data.llm_response, // Pass through chain-of-thought metadata
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -445,6 +466,7 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
|
||||||
sparql: result.sparql,
|
sparql: result.sparql,
|
||||||
sourcesUsed: result.sourcesUsed,
|
sourcesUsed: result.sourcesUsed,
|
||||||
llmProviderUsed: result.llmProviderUsed,
|
llmProviderUsed: result.llmProviderUsed,
|
||||||
|
llmResponse: result.llmResponse,
|
||||||
isLoading: false,
|
isLoading: false,
|
||||||
}
|
}
|
||||||
: msg
|
: msg
|
||||||
|
|
@ -928,6 +950,19 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
|
||||||
<>
|
<>
|
||||||
<p>{message.content}</p>
|
<p>{message.content}</p>
|
||||||
|
|
||||||
|
{/* Chain-of-Thought Reasoning (GLM 4.7 Interleaved Thinking) */}
|
||||||
|
{message.llmResponse?.reasoning_content && (
|
||||||
|
<details className="conversation-panel__reasoning">
|
||||||
|
<summary className="conversation-panel__reasoning-toggle">
|
||||||
|
<Sparkles size={14} />
|
||||||
|
<span>{t('showReasoning')}</span>
|
||||||
|
</summary>
|
||||||
|
<div className="conversation-panel__reasoning-content">
|
||||||
|
<pre>{message.llmResponse.reasoning_content}</pre>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Sources Used Badges */}
|
{/* Sources Used Badges */}
|
||||||
{message.sourcesUsed && message.sourcesUsed.length > 0 && (
|
{message.sourcesUsed && message.sourcesUsed.length > 0 && (
|
||||||
<div className="conversation-panel__sources-used">
|
<div className="conversation-panel__sources-used">
|
||||||
|
|
|
||||||
|
|
@ -196,9 +196,9 @@ function createGLAMPolyhedronGeometry(radius: number = 1): THREE.BufferGeometry
|
||||||
* Creates text sprites positioned at the center of each icosahedron face.
|
* Creates text sprites positioned at the center of each icosahedron face.
|
||||||
*
|
*
|
||||||
* Label visibility behavior:
|
* Label visibility behavior:
|
||||||
* - Relevant types: Always visible (opacity 1) - these are the types passed in highlightTypes
|
* - Empty array (no annotation): NO labels shown at all (blank cube)
|
||||||
* - Non-relevant types: Only visible when expanded (opacity 0 when collapsed)
|
* - 19+ types (universal): All labels shown when expanded only
|
||||||
* - If highlightTypes is empty (universal), all labels are shown when expanded only
|
* - Specific types (1-18): Only those letters shown (always visible)
|
||||||
*/
|
*/
|
||||||
function createFaceLabels(
|
function createFaceLabels(
|
||||||
geometry: THREE.BufferGeometry,
|
geometry: THREE.BufferGeometry,
|
||||||
|
|
@ -209,14 +209,16 @@ function createFaceLabels(
|
||||||
const positions = geometry.getAttribute('position');
|
const positions = geometry.getAttribute('position');
|
||||||
const faceCount = positions.count / 3;
|
const faceCount = positions.count / 3;
|
||||||
const highlightSet = new Set(highlightTypes);
|
const highlightSet = new Set(highlightTypes);
|
||||||
const isUniversal = highlightTypes.length === 0 || highlightTypes.length >= 19;
|
const hasNoAnnotation = highlightTypes.length === 0;
|
||||||
|
const isUniversal = highlightTypes.length >= 19;
|
||||||
|
|
||||||
for (let faceIndex = 0; faceIndex < Math.min(faceCount, 20); faceIndex++) {
|
for (let faceIndex = 0; faceIndex < Math.min(faceCount, 20); faceIndex++) {
|
||||||
const typeIndex = faceIndex % 19;
|
const typeIndex = faceIndex % 19;
|
||||||
const typeCode = CUSTODIAN_TYPE_CODES[typeIndex];
|
const typeCode = CUSTODIAN_TYPE_CODES[typeIndex];
|
||||||
|
|
||||||
// Determine if this type is relevant (highlighted)
|
// Determine if this type is relevant (highlighted)
|
||||||
const isRelevant = highlightTypes.length === 0 || highlightSet.has(typeCode);
|
// Empty array = no annotation = nothing is relevant
|
||||||
|
const isRelevant = !hasNoAnnotation && (isUniversal || highlightSet.has(typeCode));
|
||||||
|
|
||||||
// Calculate face center (average of 3 vertices)
|
// Calculate face center (average of 3 vertices)
|
||||||
const v0 = new THREE.Vector3(
|
const v0 = new THREE.Vector3(
|
||||||
|
|
@ -559,7 +561,7 @@ export const CustodianTypeIndicator3D: React.FC<CustodianTypeIndicator3DProps> =
|
||||||
|
|
||||||
// Tooltip text
|
// Tooltip text
|
||||||
const tooltipText = useMemo(() => {
|
const tooltipText = useMemo(() => {
|
||||||
if (types.length === 0) return 'Heritage Custodian Types (GLAMORCUBESFIXPHDNT)';
|
if (types.length === 0) return 'No custodian types';
|
||||||
return types
|
return types
|
||||||
.map(code => getCustodianTypeByCode(code)?.label[language] ?? code)
|
.map(code => getCustodianTypeByCode(code)?.label[language] ?? code)
|
||||||
.join(', ');
|
.join(', ');
|
||||||
|
|
@ -667,23 +669,27 @@ export const CustodianTypeIndicator3D: React.FC<CustodianTypeIndicator3DProps> =
|
||||||
|
|
||||||
// Update label visibility based on expanded state and highlighted types
|
// Update label visibility based on expanded state and highlighted types
|
||||||
// Label visibility rules:
|
// Label visibility rules:
|
||||||
// - Non-universal elements (1-18 types): Show relevant letters only (both collapsed and expanded)
|
// - No annotation (empty array, length 0): Show NO letters (blank cube)
|
||||||
// - Universal elements (19 types or empty): Show all letters only when expanded
|
// - Universal annotation (19+ types): Show all letters only when expanded
|
||||||
|
// - Specific types (1-18 types): Show ONLY those letters (both collapsed and expanded)
|
||||||
if (labelsGroupRef.current) {
|
if (labelsGroupRef.current) {
|
||||||
const highlightSet = new Set(types);
|
const highlightSet = new Set(types);
|
||||||
const isUniversal = types.length === 0 || types.length >= 19;
|
const hasNoAnnotation = types.length === 0;
|
||||||
|
const isUniversal = types.length >= 19;
|
||||||
|
|
||||||
labelsGroupRef.current.children.forEach((child) => {
|
labelsGroupRef.current.children.forEach((child) => {
|
||||||
if (child instanceof THREE.Sprite && child.userData.typeCode) {
|
if (child instanceof THREE.Sprite && child.userData.typeCode) {
|
||||||
const typeCode = child.userData.typeCode as CustodianTypeCode;
|
const typeCode = child.userData.typeCode as CustodianTypeCode;
|
||||||
const isRelevant = types.length === 0 || highlightSet.has(typeCode);
|
const isRelevant = highlightSet.has(typeCode);
|
||||||
|
|
||||||
if (isUniversal) {
|
if (hasNoAnnotation) {
|
||||||
// Universal elements: Show all letters only when expanded
|
// No annotation: Show NO letters at all (blank cube)
|
||||||
|
child.material.opacity = 0;
|
||||||
|
} else if (isUniversal) {
|
||||||
|
// Universal annotation (19+ types): Show all letters only when expanded
|
||||||
child.material.opacity = isExpanded ? 1 : 0;
|
child.material.opacity = isExpanded ? 1 : 0;
|
||||||
} else {
|
} else {
|
||||||
// Non-universal elements: Show ONLY relevant letters (hidden otherwise)
|
// Specific types (1-18): Show ONLY relevant letters (hidden otherwise)
|
||||||
// When expanded, relevant letters get full opacity
|
|
||||||
if (isRelevant) {
|
if (isRelevant) {
|
||||||
child.material.opacity = 1; // Relevant letters always visible
|
child.material.opacity = 1; // Relevant letters always visible
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -1172,7 +1178,7 @@ export const CustodianTypeIndicator3DFallback: React.FC<CustodianTypeIndicator3D
|
||||||
const color = config?.color ?? '#888888';
|
const color = config?.color ?? '#888888';
|
||||||
|
|
||||||
const tooltipText = useMemo(() => {
|
const tooltipText = useMemo(() => {
|
||||||
if (types.length === 0) return 'Heritage Custodian Types';
|
if (types.length === 0) return 'No custodian types';
|
||||||
return types
|
return types
|
||||||
.map(code => getCustodianTypeByCode(code)?.label[language] ?? code)
|
.map(code => getCustodianTypeByCode(code)?.label[language] ?? code)
|
||||||
.join(', ');
|
.join(', ');
|
||||||
|
|
|
||||||
|
|
@ -1071,6 +1071,24 @@ class LinkMLSchemaService {
|
||||||
return this.parseCustodianTypesAnnotation(slot.annotations.custodian_types);
|
return this.parseCustodianTypesAnnotation(slot.annotations.custodian_types);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get all classes that use a given slot
|
||||||
|
* Returns array of class names that have this slot in their slots array
|
||||||
|
*/
|
||||||
|
async getClassesUsingSlot(slotName: string): Promise<string[]> {
|
||||||
|
await this.initialize();
|
||||||
|
const classes: string[] = [];
|
||||||
|
|
||||||
|
for (const [className, schema] of this.classSchemas.entries()) {
|
||||||
|
const classDef = schema.classes?.[className];
|
||||||
|
if (classDef?.slots?.includes(slotName)) {
|
||||||
|
classes.push(className);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return classes;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get custodian_types annotation from an enum definition
|
* Get custodian_types annotation from an enum definition
|
||||||
* Returns null if annotation not found
|
* Returns null if annotation not found
|
||||||
|
|
|
||||||
|
|
@ -260,7 +260,11 @@ function validateCustodianTypes(types: string[]): CustodianTypeCode[] {
|
||||||
* Priority:
|
* Priority:
|
||||||
* 1. Read from LinkML schema annotations (custodian_types)
|
* 1. Read from LinkML schema annotations (custodian_types)
|
||||||
* 2. Fall back to static mapping
|
* 2. Fall back to static mapping
|
||||||
* 3. Default to all types (universal)
|
* 3. Default to EMPTY ARRAY (no types assigned) - cube will show no letters
|
||||||
|
*
|
||||||
|
* NOTE: We return [] instead of DEFAULT_CUSTODIAN_TYPES when no annotation exists
|
||||||
|
* because classes without explicit custodian_types annotations should NOT display
|
||||||
|
* all 19 letters on the cube. Only classes with explicit annotations should show letters.
|
||||||
*/
|
*/
|
||||||
export async function getCustodianTypesForClassAsync(className: string): Promise<CustodianTypeCode[]> {
|
export async function getCustodianTypesForClassAsync(className: string): Promise<CustodianTypeCode[]> {
|
||||||
try {
|
try {
|
||||||
|
|
@ -276,15 +280,23 @@ export async function getCustodianTypesForClassAsync(className: string): Promise
|
||||||
console.warn(`[CustodianMapping] Error reading annotations for class ${className}:`, error);
|
console.warn(`[CustodianMapping] Error reading annotations for class ${className}:`, error);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fall back to static mapping
|
// Fall back to static mapping, or empty array if no mapping exists
|
||||||
return CLASS_TO_CUSTODIAN_TYPE[className] || DEFAULT_CUSTODIAN_TYPES;
|
// Empty array means "no custodian types assigned" - cube will show no letters
|
||||||
|
return CLASS_TO_CUSTODIAN_TYPE[className] || [];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get custodian types for a schema slot (async version)
|
* Get custodian types for a schema slot (async version)
|
||||||
|
*
|
||||||
|
* Priority:
|
||||||
|
* 1. Read from slot's own LinkML schema annotations (custodian_types)
|
||||||
|
* 2. Inherit from parent class(es) that use this slot
|
||||||
|
* 3. Fall back to static mapping
|
||||||
|
* 4. Return empty array (no types assigned - cube shows no letters)
|
||||||
*/
|
*/
|
||||||
export async function getCustodianTypesForSlotAsync(slotName: string): Promise<CustodianTypeCode[]> {
|
export async function getCustodianTypesForSlotAsync(slotName: string): Promise<CustodianTypeCode[]> {
|
||||||
try {
|
try {
|
||||||
|
// 1. Try slot's own annotation first
|
||||||
const annotationTypes = await linkmlSchemaService.getSlotCustodianTypes(slotName);
|
const annotationTypes = await linkmlSchemaService.getSlotCustodianTypes(slotName);
|
||||||
if (annotationTypes && annotationTypes.length > 0) {
|
if (annotationTypes && annotationTypes.length > 0) {
|
||||||
const validated = validateCustodianTypes(annotationTypes);
|
const validated = validateCustodianTypes(annotationTypes);
|
||||||
|
|
@ -292,15 +304,38 @@ export async function getCustodianTypesForSlotAsync(slotName: string): Promise<C
|
||||||
return validated;
|
return validated;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 2. Try to inherit from parent class(es) that use this slot
|
||||||
|
const parentClasses = await linkmlSchemaService.getClassesUsingSlot(slotName);
|
||||||
|
if (parentClasses.length > 0) {
|
||||||
|
const inheritedTypes = new Set<CustodianTypeCode>();
|
||||||
|
for (const className of parentClasses) {
|
||||||
|
const classTypes = await linkmlSchemaService.getClassCustodianTypes(className);
|
||||||
|
if (classTypes && classTypes.length > 0) {
|
||||||
|
const validated = validateCustodianTypes(classTypes);
|
||||||
|
validated.forEach(t => inheritedTypes.add(t));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (inheritedTypes.size > 0) {
|
||||||
|
return Array.from(inheritedTypes);
|
||||||
|
}
|
||||||
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.warn(`[CustodianMapping] Error reading annotations for slot ${slotName}:`, error);
|
console.warn(`[CustodianMapping] Error reading annotations for slot ${slotName}:`, error);
|
||||||
}
|
}
|
||||||
|
|
||||||
return SLOT_TO_CUSTODIAN_TYPE[slotName] || DEFAULT_CUSTODIAN_TYPES;
|
// 3. Fall back to static mapping, or empty array if no mapping exists
|
||||||
|
// Empty array means "no custodian types assigned" - cube will show no letters
|
||||||
|
return SLOT_TO_CUSTODIAN_TYPE[slotName] || [];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get custodian types for a schema enum (async version)
|
* Get custodian types for a schema enum (async version)
|
||||||
|
*
|
||||||
|
* Priority:
|
||||||
|
* 1. Read from enum's LinkML schema annotations (custodian_types)
|
||||||
|
* 2. Fall back to static mapping
|
||||||
|
* 3. Return empty array (no types assigned - cube shows no letters)
|
||||||
*/
|
*/
|
||||||
export async function getCustodianTypesForEnumAsync(enumName: string): Promise<CustodianTypeCode[]> {
|
export async function getCustodianTypesForEnumAsync(enumName: string): Promise<CustodianTypeCode[]> {
|
||||||
try {
|
try {
|
||||||
|
|
@ -315,5 +350,7 @@ export async function getCustodianTypesForEnumAsync(enumName: string): Promise<C
|
||||||
console.warn(`[CustodianMapping] Error reading annotations for enum ${enumName}:`, error);
|
console.warn(`[CustodianMapping] Error reading annotations for enum ${enumName}:`, error);
|
||||||
}
|
}
|
||||||
|
|
||||||
return ENUM_TO_CUSTODIAN_TYPE[enumName] || DEFAULT_CUSTODIAN_TYPES;
|
// Fall back to static mapping, or empty array if no mapping exists
|
||||||
|
// Empty array means "no custodian types assigned" - cube will show no letters
|
||||||
|
return ENUM_TO_CUSTODIAN_TYPE[enumName] || [];
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -732,7 +732,7 @@ const LinkMLViewerPage: React.FC = () => {
|
||||||
const mainContentRef = useRef<HTMLElement>(null);
|
const mainContentRef = useRef<HTMLElement>(null);
|
||||||
|
|
||||||
// Schema loading progress tracking
|
// Schema loading progress tracking
|
||||||
const { progress: schemaProgress, isLoading: isSchemaServiceLoading } = useSchemaLoadingProgress();
|
const { progress: schemaProgress, isLoading: isSchemaServiceLoading, isComplete: isSchemaServiceComplete } = useSchemaLoadingProgress();
|
||||||
|
|
||||||
// Handler for filtering by custodian type (clicking polyhedron face or legend item)
|
// Handler for filtering by custodian type (clicking polyhedron face or legend item)
|
||||||
// Multi-select toggle behavior: clicking type adds/removes from set
|
// Multi-select toggle behavior: clicking type adds/removes from set
|
||||||
|
|
@ -881,17 +881,32 @@ const LinkMLViewerPage: React.FC = () => {
|
||||||
|
|
||||||
// Load custodian types from schema annotations when schema changes
|
// Load custodian types from schema annotations when schema changes
|
||||||
// This pre-loads types asynchronously so they're available for rendering
|
// This pre-loads types asynchronously so they're available for rendering
|
||||||
|
// IMPORTANT: Wait for schema service to complete loading before fetching custodian types
|
||||||
|
// to avoid race condition where annotations aren't available yet
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (!schema) {
|
if (!schema) {
|
||||||
setCustodianTypesLoaded(false);
|
setCustodianTypesLoaded(false);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Don't load custodian types until schema service has finished loading all class files
|
||||||
|
// This prevents the race condition where we try to read annotations before they're loaded
|
||||||
|
if (!isSchemaServiceComplete) {
|
||||||
|
console.log('[LinkMLViewerPage] Waiting for schema service to complete before loading custodian types...');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
const loadCustodianTypes = async () => {
|
const loadCustodianTypes = async () => {
|
||||||
const classes = extractClasses(schema);
|
const classes = extractClasses(schema);
|
||||||
const slots = extractSlots(schema);
|
const slots = extractSlots(schema);
|
||||||
const enums = extractEnums(schema);
|
const enums = extractEnums(schema);
|
||||||
|
|
||||||
|
console.log('[LinkMLViewerPage] Schema service complete, loading custodian types for', {
|
||||||
|
classes: classes.length,
|
||||||
|
slots: slots.length,
|
||||||
|
enums: enums.length
|
||||||
|
});
|
||||||
|
|
||||||
// Load types for all classes in parallel
|
// Load types for all classes in parallel
|
||||||
const classTypesPromises = classes.map(async (cls) => {
|
const classTypesPromises = classes.map(async (cls) => {
|
||||||
const types = await getCustodianTypesForClassAsync(cls.name);
|
const types = await getCustodianTypesForClassAsync(cls.name);
|
||||||
|
|
@ -951,7 +966,7 @@ const LinkMLViewerPage: React.FC = () => {
|
||||||
};
|
};
|
||||||
|
|
||||||
loadCustodianTypes();
|
loadCustodianTypes();
|
||||||
}, [schema]);
|
}, [schema, isSchemaServiceComplete]);
|
||||||
|
|
||||||
const toggleSection = (section: string) => {
|
const toggleSection = (section: string) => {
|
||||||
setExpandedSections(prev => {
|
setExpandedSections(prev => {
|
||||||
|
|
|
||||||
2
node_modules/.modules.yaml
generated
vendored
2
node_modules/.modules.yaml
generated
vendored
|
|
@ -987,7 +987,7 @@ hoistedDependencies:
|
||||||
loose-envify: private
|
loose-envify: private
|
||||||
lru-cache@11.2.4:
|
lru-cache@11.2.4:
|
||||||
lru-cache: private
|
lru-cache: private
|
||||||
lucide-react@0.561.0(react@19.2.3):
|
lucide-react@0.562.0(react@19.2.3):
|
||||||
lucide-react: private
|
lucide-react: private
|
||||||
lz-string@1.5.0:
|
lz-string@1.5.0:
|
||||||
lz-string: private
|
lz-string: private
|
||||||
|
|
|
||||||
10
node_modules/.pnpm/lock.yaml
generated
vendored
10
node_modules/.pnpm/lock.yaml
generated
vendored
|
|
@ -169,8 +169,8 @@ importers:
|
||||||
specifier: ^4.17.21
|
specifier: ^4.17.21
|
||||||
version: 4.17.21
|
version: 4.17.21
|
||||||
lucide-react:
|
lucide-react:
|
||||||
specifier: ^0.561.0
|
specifier: ^0.562.0
|
||||||
version: 0.561.0(react@19.2.3)
|
version: 0.562.0(react@19.2.3)
|
||||||
maplibre-gl:
|
maplibre-gl:
|
||||||
specifier: ^5.14.0
|
specifier: ^5.14.0
|
||||||
version: 5.15.0
|
version: 5.15.0
|
||||||
|
|
@ -2507,8 +2507,8 @@ packages:
|
||||||
lru-cache@5.1.1:
|
lru-cache@5.1.1:
|
||||||
resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==}
|
resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==}
|
||||||
|
|
||||||
lucide-react@0.561.0:
|
lucide-react@0.562.0:
|
||||||
resolution: {integrity: sha512-Y59gMY38tl4/i0qewcqohPdEbieBy7SovpBL9IFebhc2mDd8x4PZSOsiFRkpPcOq6bj1r/mjH/Rk73gSlIJP2A==}
|
resolution: {integrity: sha512-82hOAu7y0dbVuFfmO4bYF1XEwYk/mEbM5E+b1jgci/udUBEE/R7LF5Ip0CCEmXe8AybRM8L+04eP+LGZeDvkiw==}
|
||||||
peerDependencies:
|
peerDependencies:
|
||||||
react: ^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0
|
react: ^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0
|
||||||
|
|
||||||
|
|
@ -5660,7 +5660,7 @@ snapshots:
|
||||||
dependencies:
|
dependencies:
|
||||||
yallist: 3.1.1
|
yallist: 3.1.1
|
||||||
|
|
||||||
lucide-react@0.561.0(react@19.2.3):
|
lucide-react@0.562.0(react@19.2.3):
|
||||||
dependencies:
|
dependencies:
|
||||||
react: 19.2.3
|
react: 19.2.3
|
||||||
|
|
||||||
|
|
|
||||||
39
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/LICENSE
generated
vendored
Normal file
39
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/LICENSE
generated
vendored
Normal file
|
|
@ -0,0 +1,39 @@
|
||||||
|
ISC License
|
||||||
|
|
||||||
|
Copyright (c) for portions of Lucide are held by Cole Bemis 2013-2023 as part of Feather (MIT). All other copyright (c) for Lucide are held by Lucide Contributors 2025.
|
||||||
|
|
||||||
|
Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
purpose with or without fee is hereby granted, provided that the above
|
||||||
|
copyright notice and this permission notice appear in all copies.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
|
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
|
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||||
|
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
|
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||||
|
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
The MIT License (MIT) (for portions derived from Feather)
|
||||||
|
|
||||||
|
Copyright (c) 2013-2023 Cole Bemis
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
73
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/README.md
generated
vendored
Normal file
73
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/README.md
generated
vendored
Normal file
|
|
@ -0,0 +1,73 @@
|
||||||
|
<p align="center">
|
||||||
|
<a href="https://github.com/lucide-icons/lucide">
|
||||||
|
<img src="https://lucide.dev/package-logos/lucide-react.svg" alt="Lucide icon library for React applications." width="540">
|
||||||
|
</a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
Lucide icon library for React applications.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
[](https://www.npmjs.com/package/lucide-react)
|
||||||
|

|
||||||
|
[](https://lucide.dev/license)
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<a href="https://lucide.dev/guide/">About</a>
|
||||||
|
·
|
||||||
|
<a href="https://lucide.dev/icons/">Icons</a>
|
||||||
|
·
|
||||||
|
<a href="https://lucide.dev/guide/packages/lucide-react">Documentation</a>
|
||||||
|
·
|
||||||
|
<a href="https://lucide.dev/license">License</a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
# Lucide React
|
||||||
|
|
||||||
|
Implementation of the lucide icon library for React applications.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```sh
|
||||||
|
pnpm add lucide-react
|
||||||
|
```
|
||||||
|
|
||||||
|
```sh
|
||||||
|
npm install lucide-react
|
||||||
|
```
|
||||||
|
|
||||||
|
```sh
|
||||||
|
yarn add lucide-react
|
||||||
|
```
|
||||||
|
|
||||||
|
```sh
|
||||||
|
bun add lucide-react
|
||||||
|
```
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
For full documentation, visit [lucide.dev](https://lucide.dev/guide/packages/lucide-react)
|
||||||
|
|
||||||
|
## Community
|
||||||
|
|
||||||
|
Join the [Discord server](https://discord.gg/EH6nSts) to chat with the maintainers and other users.
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
Lucide is licensed under the ISC license. See [LICENSE](https://lucide.dev/license).
|
||||||
|
|
||||||
|
## Sponsors
|
||||||
|
|
||||||
|
<a href="https://vercel.com?utm_source=lucide&utm_campaign=oss">
|
||||||
|
<img src="https://lucide.dev/vercel.svg" alt="Powered by Vercel" width="200" />
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<a href="https://www.digitalocean.com/?refcode=b0877a2caebd&utm_campaign=Referral_Invite&utm_medium=Referral_Program&utm_source=badge"><img src="https://lucide.dev/digitalocean.svg" width="200" alt="DigitalOcean Referral Badge" /></a>
|
||||||
|
|
||||||
|
### Awesome backers 🍺
|
||||||
|
|
||||||
|
<a href="https://www.scipress.io?utm_source=lucide"><img src="https://lucide.dev/sponsors/scipress.svg" width="180" alt="Scipress sponsor badge" /></a>
|
||||||
|
<a href="https://github.com/pdfme/pdfme"><img src="https://lucide.dev/sponsors/pdfme.svg" width="180" alt="pdfme sponsor badge" /></a>
|
||||||
10
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamic.mjs
generated
vendored
Normal file
10
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamic.mjs
generated
vendored
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
/**
|
||||||
|
* @license lucide-react v0.562.0 - ISC
|
||||||
|
*
|
||||||
|
* This source code is licensed under the ISC license.
|
||||||
|
* See the LICENSE file in the root directory of this source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
export { default as DynamicIcon, iconNames } from './dist/esm/DynamicIcon.js';
|
||||||
|
export { default as dynamicIconImports } from './dist/esm/dynamicIconImports.js';
|
||||||
|
//# sourceMappingURL=dynamic.mjs.map
|
||||||
1
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamicIconImports.mjs
generated
vendored
Normal file
1
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/dynamicIconImports.mjs
generated
vendored
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
export { default } from './dist/esm/dynamicIconImports.js';
|
||||||
74
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/package.json
generated
vendored
Normal file
74
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react/package.json
generated
vendored
Normal file
|
|
@ -0,0 +1,74 @@
|
||||||
|
{
|
||||||
|
"name": "lucide-react",
|
||||||
|
"description": "A Lucide icon library package for React applications.",
|
||||||
|
"version": "0.562.0",
|
||||||
|
"license": "ISC",
|
||||||
|
"homepage": "https://lucide.dev",
|
||||||
|
"bugs": "https://github.com/lucide-icons/lucide/issues",
|
||||||
|
"repository": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/lucide-icons/lucide.git",
|
||||||
|
"directory": "packages/lucide-react"
|
||||||
|
},
|
||||||
|
"keywords": [
|
||||||
|
"Lucide",
|
||||||
|
"React",
|
||||||
|
"Feather",
|
||||||
|
"Icons",
|
||||||
|
"Icon",
|
||||||
|
"SVG",
|
||||||
|
"Feather Icons",
|
||||||
|
"Fontawesome",
|
||||||
|
"Font Awesome"
|
||||||
|
],
|
||||||
|
"author": "Eric Fennis",
|
||||||
|
"amdName": "lucide-react",
|
||||||
|
"main": "dist/cjs/lucide-react.js",
|
||||||
|
"main:umd": "dist/umd/lucide-react.js",
|
||||||
|
"module": "dist/esm/lucide-react.js",
|
||||||
|
"unpkg": "dist/umd/lucide-react.min.js",
|
||||||
|
"typings": "dist/lucide-react.d.ts",
|
||||||
|
"sideEffects": false,
|
||||||
|
"files": [
|
||||||
|
"dist",
|
||||||
|
"dynamic.mjs",
|
||||||
|
"dynamic.js.map",
|
||||||
|
"dynamic.d.ts",
|
||||||
|
"dynamicIconImports.mjs",
|
||||||
|
"dynamicIconImports.js.map",
|
||||||
|
"dynamicIconImports.d.ts"
|
||||||
|
],
|
||||||
|
"devDependencies": {
|
||||||
|
"@testing-library/jest-dom": "^6.1.6",
|
||||||
|
"@testing-library/react": "^14.1.2",
|
||||||
|
"@types/react": "^18.2.37",
|
||||||
|
"@vitejs/plugin-react": "^4.4.1",
|
||||||
|
"jest-serializer-html": "^7.1.0",
|
||||||
|
"react": "18.2.0",
|
||||||
|
"react-dom": "18.2.0",
|
||||||
|
"rollup": "^4.53.3",
|
||||||
|
"rollup-plugin-dts": "^6.2.3",
|
||||||
|
"rollup-plugin-preserve-directives": "^0.4.0",
|
||||||
|
"typescript": "^5.8.3",
|
||||||
|
"vite": "^7.2.4",
|
||||||
|
"vitest": "^4.0.12",
|
||||||
|
"@lucide/shared": "1.0.0",
|
||||||
|
"@lucide/rollup-plugins": "1.0.0",
|
||||||
|
"@lucide/build-icons": "1.1.0"
|
||||||
|
},
|
||||||
|
"peerDependencies": {
|
||||||
|
"react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0"
|
||||||
|
},
|
||||||
|
"scripts": {
|
||||||
|
"build": "pnpm clean && pnpm copy:license && pnpm build:icons && pnpm typecheck && pnpm build:bundles",
|
||||||
|
"copy:license": "cp ../../LICENSE ./LICENSE",
|
||||||
|
"clean": "rm -rf dist && rm -rf stats && rm -rf ./src/icons/*.ts && rm -f dynamic.* && rm -f dynamicIconImports.d.ts",
|
||||||
|
"build:icons": "build-icons --output=./src --templateSrc=./scripts/exportTemplate.mts --renderUniqueKey --withAliases --withDynamicImports --separateAliasesFile --separateAliasesFileIgnore=fingerprint --aliasesFileExtension=.ts --iconFileExtension=.ts --exportFileName=index.ts",
|
||||||
|
"build:bundles": "rollup -c ./rollup.config.mjs",
|
||||||
|
"typecheck": "tsc",
|
||||||
|
"typecheck:watch": "tsc -w",
|
||||||
|
"test": "pnpm build:icons && vitest run",
|
||||||
|
"test:watch": "vitest watch",
|
||||||
|
"version": "pnpm version --git-tag-version=false"
|
||||||
|
}
|
||||||
|
}
|
||||||
1
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/react
generated
vendored
Symbolic link
1
node_modules/.pnpm/lucide-react@0.562.0_react@19.2.3/node_modules/react
generated
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../react@19.2.3/node_modules/react
|
||||||
2
node_modules/.pnpm/node_modules/lucide-react
generated
vendored
2
node_modules/.pnpm/node_modules/lucide-react
generated
vendored
|
|
@ -1 +1 @@
|
||||||
../lucide-react@0.561.0_react@19.2.3/node_modules/lucide-react
|
../lucide-react@0.562.0_react@19.2.3/node_modules/lucide-react
|
||||||
|
|
@ -169,8 +169,8 @@ importers:
|
||||||
specifier: ^4.17.21
|
specifier: ^4.17.21
|
||||||
version: 4.17.21
|
version: 4.17.21
|
||||||
lucide-react:
|
lucide-react:
|
||||||
specifier: ^0.561.0
|
specifier: ^0.562.0
|
||||||
version: 0.561.0(react@19.2.3)
|
version: 0.562.0(react@19.2.3)
|
||||||
maplibre-gl:
|
maplibre-gl:
|
||||||
specifier: ^5.14.0
|
specifier: ^5.14.0
|
||||||
version: 5.15.0
|
version: 5.15.0
|
||||||
|
|
@ -2507,8 +2507,8 @@ packages:
|
||||||
lru-cache@5.1.1:
|
lru-cache@5.1.1:
|
||||||
resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==}
|
resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==}
|
||||||
|
|
||||||
lucide-react@0.561.0:
|
lucide-react@0.562.0:
|
||||||
resolution: {integrity: sha512-Y59gMY38tl4/i0qewcqohPdEbieBy7SovpBL9IFebhc2mDd8x4PZSOsiFRkpPcOq6bj1r/mjH/Rk73gSlIJP2A==}
|
resolution: {integrity: sha512-82hOAu7y0dbVuFfmO4bYF1XEwYk/mEbM5E+b1jgci/udUBEE/R7LF5Ip0CCEmXe8AybRM8L+04eP+LGZeDvkiw==}
|
||||||
peerDependencies:
|
peerDependencies:
|
||||||
react: ^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0
|
react: ^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0
|
||||||
|
|
||||||
|
|
@ -5660,7 +5660,7 @@ snapshots:
|
||||||
dependencies:
|
dependencies:
|
||||||
yallist: 3.1.1
|
yallist: 3.1.1
|
||||||
|
|
||||||
lucide-react@0.561.0(react@19.2.3):
|
lucide-react@0.562.0(react@19.2.3):
|
||||||
dependencies:
|
dependencies:
|
||||||
react: 19.2.3
|
react: 19.2.3
|
||||||
|
|
||||||
|
|
|
||||||
561
scripts/discover_custodian_websites.py
Normal file
561
scripts/discover_custodian_websites.py
Normal file
|
|
@ -0,0 +1,561 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Discover website URLs for custodian YAML files that are missing them.
|
||||||
|
|
||||||
|
This script uses web search (via DuckDuckGo or Google) to find official websites
|
||||||
|
for heritage institutions based on their name and location.
|
||||||
|
|
||||||
|
Search strategy:
|
||||||
|
1. Search for institution name + city + country
|
||||||
|
2. Search for institution name + "official website"
|
||||||
|
3. Search for institution name + institution type (museum, library, archive)
|
||||||
|
|
||||||
|
Output:
|
||||||
|
- Updates custodian YAML files with discovered website URLs
|
||||||
|
- Stores provenance for discovered URLs
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/discover_custodian_websites.py [options]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--dry-run Show what would be discovered without modifying files
|
||||||
|
--limit N Process only first N files (for testing)
|
||||||
|
--file PATH Process a single specific file
|
||||||
|
--country CODE Filter by country code (e.g., JP, CZ)
|
||||||
|
--resume Resume from last checkpoint
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
pip install duckduckgo-search pyyaml httpx
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
try:
|
||||||
|
from duckduckgo_search import DDGS
|
||||||
|
except ImportError:
|
||||||
|
print("Please install duckduckgo-search: pip install duckduckgo-search")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
try:
|
||||||
|
import httpx
|
||||||
|
except ImportError:
|
||||||
|
print("Please install httpx: pip install httpx")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
|
||||||
|
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_checkpoint.json"
|
||||||
|
REQUEST_DELAY = 3.0 # seconds between searches (be nice to search engines)
|
||||||
|
|
||||||
|
# Domain blacklist (not actual institution websites)
|
||||||
|
DOMAIN_BLACKLIST = {
|
||||||
|
'wikipedia.org', 'wikidata.org', 'wikimedia.org',
|
||||||
|
'facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com',
|
||||||
|
'youtube.com', 'tiktok.com', 'pinterest.com',
|
||||||
|
'tripadvisor.com', 'tripadvisor.jp', 'yelp.com',
|
||||||
|
'google.com', 'google.co.jp', 'maps.google.com',
|
||||||
|
'amazon.com', 'amazon.co.jp', 'ebay.com',
|
||||||
|
'booking.com', 'expedia.com', 'hotels.com',
|
||||||
|
'foursquare.com', 'bing.com', 'yahoo.com',
|
||||||
|
'findagrave.com', 'ancestry.com', 'familysearch.org',
|
||||||
|
'academia.edu', 'researchgate.net',
|
||||||
|
'timeanddate.com', 'weather.com',
|
||||||
|
}
|
||||||
|
|
||||||
|
# Domain preferences (prefer these TLDs for official sites)
|
||||||
|
PREFERRED_TLDS = {
|
||||||
|
'JP': ['.go.jp', '.lg.jp', '.ac.jp', '.or.jp', '.jp'],
|
||||||
|
'CZ': ['.cz', '.gov.cz'],
|
||||||
|
'NL': ['.nl', '.gov.nl'],
|
||||||
|
'BE': ['.be', '.gov.be'],
|
||||||
|
'DE': ['.de', '.gov.de'],
|
||||||
|
'AT': ['.at', '.gv.at'],
|
||||||
|
'CH': ['.ch', '.admin.ch'],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_custodian_name(entry: dict) -> str | None:
|
||||||
|
"""Extract institution name from entry."""
|
||||||
|
# Priority 1: Emic name (native language official name)
|
||||||
|
if entry.get('custodian_name', {}).get('emic_name'):
|
||||||
|
return entry['custodian_name']['emic_name']
|
||||||
|
|
||||||
|
# Priority 2: Wikidata native language label (ja, zh, ko, etc.)
|
||||||
|
wikidata = entry.get('wikidata_enrichment', {})
|
||||||
|
country = get_country_from_entry(entry)
|
||||||
|
|
||||||
|
# Map country to preferred label language
|
||||||
|
country_lang_map = {
|
||||||
|
'JP': 'ja',
|
||||||
|
'CN': 'zh',
|
||||||
|
'KR': 'ko',
|
||||||
|
'TW': 'zh',
|
||||||
|
'TH': 'th',
|
||||||
|
'VN': 'vi',
|
||||||
|
'RU': 'ru',
|
||||||
|
'GR': 'el',
|
||||||
|
'IL': 'he',
|
||||||
|
'SA': 'ar',
|
||||||
|
'IR': 'fa',
|
||||||
|
}
|
||||||
|
|
||||||
|
if country in country_lang_map:
|
||||||
|
lang = country_lang_map[country]
|
||||||
|
native_label = wikidata.get(f'wikidata_label_{lang}') or wikidata.get('wikidata_labels', {}).get(lang)
|
||||||
|
if native_label:
|
||||||
|
return native_label
|
||||||
|
|
||||||
|
# Priority 3: Claim value
|
||||||
|
if entry.get('custodian_name', {}).get('claim_value'):
|
||||||
|
return entry['custodian_name']['claim_value']
|
||||||
|
|
||||||
|
# Priority 4: Original entry name
|
||||||
|
if entry.get('original_entry', {}).get('name'):
|
||||||
|
return entry['original_entry']['name']
|
||||||
|
|
||||||
|
# Priority 5: Organisatie (Dutch)
|
||||||
|
if entry.get('original_entry', {}).get('organisatie'):
|
||||||
|
return entry['original_entry']['organisatie']
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_country_from_entry(entry: dict) -> str | None:
|
||||||
|
"""Extract country code from entry."""
|
||||||
|
# Check location.country
|
||||||
|
if entry.get('location', {}).get('country'):
|
||||||
|
return entry['location']['country']
|
||||||
|
|
||||||
|
# Check original_entry.locations
|
||||||
|
if entry.get('original_entry', {}).get('locations'):
|
||||||
|
loc = entry['original_entry']['locations'][0]
|
||||||
|
if loc.get('country'):
|
||||||
|
return loc['country']
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_location_info(entry: dict) -> dict:
|
||||||
|
"""Extract location information from entry."""
|
||||||
|
location = {}
|
||||||
|
|
||||||
|
# Check original_entry.locations
|
||||||
|
if entry.get('original_entry', {}).get('locations'):
|
||||||
|
loc = entry['original_entry']['locations'][0]
|
||||||
|
location['city'] = loc.get('city')
|
||||||
|
location['region'] = loc.get('region')
|
||||||
|
location['country'] = loc.get('country')
|
||||||
|
location['street_address'] = loc.get('street_address')
|
||||||
|
|
||||||
|
# Check original_entry directly
|
||||||
|
if not location.get('city'):
|
||||||
|
orig = entry.get('original_entry', {})
|
||||||
|
location['city'] = orig.get('city') or orig.get('plaats')
|
||||||
|
location['country'] = orig.get('country')
|
||||||
|
|
||||||
|
return location
|
||||||
|
|
||||||
|
|
||||||
|
def get_institution_type(entry: dict) -> str | None:
|
||||||
|
"""Get institution type for search refinement."""
|
||||||
|
inst_type = entry.get('original_entry', {}).get('institution_type')
|
||||||
|
if inst_type:
|
||||||
|
type_map = {
|
||||||
|
'LIBRARY': 'library',
|
||||||
|
'MUSEUM': 'museum',
|
||||||
|
'ARCHIVE': 'archive',
|
||||||
|
'GALLERY': 'gallery',
|
||||||
|
'RESEARCH_CENTER': 'research center',
|
||||||
|
'EDUCATION_PROVIDER': 'university',
|
||||||
|
}
|
||||||
|
return type_map.get(inst_type)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def has_website(entry: dict) -> bool:
|
||||||
|
"""Check if entry already has a website."""
|
||||||
|
# Check various website fields
|
||||||
|
if entry.get('original_entry', {}).get('webadres_organisatie'):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check identifiers
|
||||||
|
for ident in entry.get('original_entry', {}).get('identifiers', []):
|
||||||
|
if ident.get('identifier_scheme') == 'Website':
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check enrichment fields
|
||||||
|
if entry.get('website_discovery', {}).get('website_url'):
|
||||||
|
return True
|
||||||
|
if entry.get('wikidata_enrichment', {}).get('wikidata_official_website'):
|
||||||
|
return True
|
||||||
|
if entry.get('google_maps_enrichment', {}).get('website'):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_website(url: str, country: str | None = None) -> bool:
|
||||||
|
"""Check if URL is a valid institutional website."""
|
||||||
|
if not url:
|
||||||
|
return False
|
||||||
|
|
||||||
|
try:
|
||||||
|
parsed = urlparse(url)
|
||||||
|
domain = parsed.netloc.lower()
|
||||||
|
|
||||||
|
# Remove www prefix
|
||||||
|
if domain.startswith('www.'):
|
||||||
|
domain = domain[4:]
|
||||||
|
|
||||||
|
# Check blacklist
|
||||||
|
for blacklisted in DOMAIN_BLACKLIST:
|
||||||
|
if blacklisted in domain:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def score_website(url: str, country: str, name: str) -> int:
|
||||||
|
"""Score a website URL based on likelihood of being official site."""
|
||||||
|
score = 0
|
||||||
|
|
||||||
|
try:
|
||||||
|
parsed = urlparse(url)
|
||||||
|
domain = parsed.netloc.lower()
|
||||||
|
|
||||||
|
# Prefer country-specific TLDs
|
||||||
|
preferred = PREFERRED_TLDS.get(country, [])
|
||||||
|
for i, tld in enumerate(preferred):
|
||||||
|
if domain.endswith(tld):
|
||||||
|
score += (len(preferred) - i) * 10
|
||||||
|
break
|
||||||
|
|
||||||
|
# Prefer HTTPS
|
||||||
|
if parsed.scheme == 'https':
|
||||||
|
score += 5
|
||||||
|
|
||||||
|
# Prefer shorter paths (homepage vs deep link)
|
||||||
|
path_depth = len([p for p in parsed.path.split('/') if p])
|
||||||
|
score -= path_depth * 2
|
||||||
|
|
||||||
|
# Check if institution name words appear in domain
|
||||||
|
name_words = set(re.findall(r'\w+', name.lower()))
|
||||||
|
domain_words = set(re.findall(r'\w+', domain))
|
||||||
|
common_words = name_words & domain_words
|
||||||
|
score += len(common_words) * 5
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return score
|
||||||
|
|
||||||
|
|
||||||
|
def search_for_website(name: str, location: dict, inst_type: str | None = None) -> list[dict]:
    """Search DuckDuckGo for an institution's website.

    Builds up to three query variants (name+city, name+country+type,
    name+"official website"), runs the two most specific ones, validates
    and scores each hit, then returns at most three results deduplicated
    by domain and ordered best-score first.
    """
    city = location.get('city', '')
    country = location.get('country', '')

    # Query variants, most specific first.
    queries: list[str] = []
    if city:
        queries.append(f'"{name}" {city}')
    if inst_type:
        queries.append(f'"{name}" {country} {inst_type} official')
    queries.append(f'"{name}" official website')

    ddgs = DDGS()
    candidates: list[dict] = []

    # Only the first two queries are run, to limit request volume.
    for query in queries[:2]:
        try:
            for hit in list(ddgs.text(query, max_results=5)):
                url = hit.get('href') or hit.get('url')
                if not (url and is_valid_website(url, country)):
                    continue
                candidates.append({
                    'url': url,
                    'title': hit.get('title', ''),
                    'snippet': hit.get('body', ''),
                    'query': query,
                    'score': score_website(url, country, name),
                })

            time.sleep(1)  # Rate limit between queries

        except Exception as exc:
            logger.warning(f"Search error for '{query}': {exc}")
            time.sleep(2)

    # Best score first, one result per domain, at most three overall.
    seen_domains: set[str] = set()
    deduped: list[dict] = []
    for candidate in sorted(candidates, key=lambda c: -c['score']):
        domain = urlparse(candidate['url']).netloc.lower()
        if domain in seen_domains:
            continue
        seen_domains.add(domain)
        deduped.append(candidate)

    return deduped[:3]
|
||||||
|
|
||||||
|
|
||||||
|
async def verify_website(url: str) -> dict:
    """Fetch *url* (following redirects) and report basic accessibility.

    Returns a dict with keys ``accessible`` (True only on HTTP 200),
    ``final_url`` (after redirects), ``status_code``, and ``title``
    (extracted from the HTML <title> tag when the page loads).
    Network failures leave the defaults in place and are logged at
    debug level.
    """
    info = {
        'accessible': False,
        'final_url': url,
        'status_code': None,
        'title': None,
    }

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=15.0) as client:
            response = await client.get(url)

            info['status_code'] = response.status_code
            info['final_url'] = str(response.url)
            info['accessible'] = response.status_code == 200

            # Pull the page title only from pages that actually loaded.
            if info['accessible']:
                title_match = re.search(r'<title[^>]*>([^<]+)</title>', response.text, re.I)
                if title_match:
                    info['title'] = title_match.group(1).strip()

    except Exception as exc:
        logger.debug(f"Failed to verify {url}: {exc}")

    return info
|
||||||
|
|
||||||
|
|
||||||
|
def load_checkpoint() -> dict:
    """Read the progress checkpoint from disk, or return a fresh one."""
    if not CHECKPOINT_FILE.exists():
        return {'processed_files': [], 'found_count': 0, 'not_found_count': 0}
    with open(CHECKPOINT_FILE, 'r') as f:
        return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def save_checkpoint(checkpoint: dict) -> None:
    """Persist the progress checkpoint to disk as indented JSON."""
    CHECKPOINT_FILE.write_text(json.dumps(checkpoint, indent=2))
|
||||||
|
|
||||||
|
|
||||||
|
def update_custodian_file(filepath: Path, website_url: str, discovery_info: dict) -> bool:
    """Record a discovered website in a custodian YAML file.

    Loads the YAML entry at *filepath*, attaches a ``website_discovery``
    section describing the find (URL, UTC timestamp, method, search
    query, normalized confidence, and verification details taken from
    *discovery_info*), and writes the file back in place.

    Returns True on success, False when the file is empty or any I/O,
    parse, or dump error occurs (the error is logged).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            return False

        verification = discovery_info.get('verification', {})

        # Raw scores from score_website() can be negative (deep-path
        # penalty), so clamp the normalized confidence into [0, 1]
        # instead of letting it go below zero.
        confidence = max(0.0, min(discovery_info.get('score', 0) / 50, 1.0))

        # Add website discovery section
        entry['website_discovery'] = {
            'website_url': website_url,
            'discovery_date': datetime.now(timezone.utc).isoformat(),
            'discovery_method': 'duckduckgo_search',
            'search_query': discovery_info.get('query', ''),
            'confidence_score': confidence,
            'verification': {
                'accessible': verification.get('accessible', False),
                'page_title': verification.get('title'),
                'final_url': verification.get('final_url'),
            },
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        return True

    except Exception as e:
        logger.error(f"Failed to update {filepath}: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
async def process_file(filepath: Path, dry_run: bool = False) -> dict:
    """Process a single custodian file.

    Skips files that already have a website, searches DuckDuckGo for the
    institution, verifies the best candidate (falling back to the second
    hit when the first is unreachable), and — unless *dry_run* — writes
    the discovery back into the YAML file.

    Returns a dict with 'filename', 'status' (one of 'skipped', 'empty',
    'has_website', 'no_name', 'not_found', 'found', 'inaccessible',
    'error'), 'website' (verified URL or None), and, when found,
    'discovery_info' (the winning search result incl. verification).
    """
    result = {
        'filename': filepath.name,
        'status': 'skipped',
        'website': None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            result['status'] = 'empty'
            return result

        # Skip if already has website
        if has_website(entry):
            result['status'] = 'has_website'
            return result

        # Get institution info
        name = get_custodian_name(entry)
        if not name:
            result['status'] = 'no_name'
            return result

        location = get_location_info(entry)
        inst_type = get_institution_type(entry)
        # Fall back to the filename's 2-letter country prefix when the
        # entry carries no explicit country.
        country = location.get('country', filepath.name[:2])

        logger.info(f"Searching for: {name} ({location.get('city', 'unknown city')}, {country})")

        # Search for website
        search_results = search_for_website(name, location, inst_type)

        if not search_results:
            result['status'] = 'not_found'
            return result

        # Verify top result
        best = search_results[0]
        verification = await verify_website(best['url'])
        best['verification'] = verification

        if verification['accessible']:
            result['website'] = verification['final_url']
            result['status'] = 'found'
            result['discovery_info'] = best

            if not dry_run:
                update_custodian_file(filepath, verification['final_url'], best)
            logger.info(f" → Found: {verification['final_url']}")
        else:
            # Try second result if first is inaccessible
            if len(search_results) > 1:
                second = search_results[1]
                verification2 = await verify_website(second['url'])
                if verification2['accessible']:
                    second['verification'] = verification2
                    result['website'] = verification2['final_url']
                    result['status'] = 'found'
                    result['discovery_info'] = second

                    if not dry_run:
                        update_custodian_file(filepath, verification2['final_url'], second)
                    logger.info(f" → Found (2nd): {verification2['final_url']}")
                else:
                    result['status'] = 'inaccessible'
            else:
                result['status'] = 'inaccessible'

    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
        logger.error(f"Error processing {filepath}: {e}")

    return result
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """CLI entry point: discover websites for custodian YAML files.

    Flags: --dry-run (no file writes), --limit N, --file PATH (single
    file), --country CODE (filename prefix filter), --resume (skip
    files already recorded in the checkpoint).  Progress and counts are
    checkpointed every 10 files so interrupted runs can resume.
    """
    parser = argparse.ArgumentParser(description='Discover websites for custodian files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be discovered')
    parser.add_argument('--limit', type=int, help='Process only first N files')
    parser.add_argument('--file', type=str, help='Process a single specific file')
    parser.add_argument('--country', type=str, help='Filter by country code (e.g., JP, CZ)')
    parser.add_argument('--resume', action='store_true', help='Resume from checkpoint')

    args = parser.parse_args()

    # Get files to process
    if args.file:
        files = [Path(args.file)]
    else:
        pattern = f"{args.country}-*.yaml" if args.country else "*.yaml"
        files = sorted(CUSTODIAN_DIR.glob(pattern))

    # Filter out non-custodian files: custodian filenames start with an
    # uppercase country code and contain a dash.
    files = [f for f in files if f.name[0].isupper() and '-' in f.name]

    # Load checkpoint
    checkpoint = load_checkpoint() if args.resume else {'processed_files': [], 'found_count': 0, 'not_found_count': 0}
    processed_set = set(checkpoint['processed_files'])

    if args.resume:
        files = [f for f in files if f.name not in processed_set]
        logger.info(f"Resuming: {len(processed_set)} files already processed, {len(files)} remaining")

    # Apply limit
    if args.limit:
        files = files[:args.limit]

    logger.info(f"Processing {len(files)} custodian files...")

    # Process files; counts carry over from the checkpoint when resuming.
    found_count = checkpoint.get('found_count', 0)
    not_found_count = checkpoint.get('not_found_count', 0)

    for i, filepath in enumerate(files):
        result = await process_file(filepath, args.dry_run)

        # Update counts
        if result['status'] == 'found':
            found_count += 1
        elif result['status'] in ('not_found', 'inaccessible'):
            not_found_count += 1

        # Update checkpoint (skipped entirely in dry-run mode)
        if not args.dry_run:
            checkpoint['processed_files'].append(filepath.name)
            checkpoint['found_count'] = found_count
            checkpoint['not_found_count'] = not_found_count

            # Persist every 10 files so an interrupted run can resume.
            if (i + 1) % 10 == 0:
                save_checkpoint(checkpoint)

        # Progress update
        if (i + 1) % 10 == 0:
            logger.info(f"Progress: {i + 1}/{len(files)} - Found: {found_count}, Not found: {not_found_count}")

        # Rate limiting
        time.sleep(REQUEST_DELAY)

    # Final checkpoint save
    if not args.dry_run:
        save_checkpoint(checkpoint)

    # Summary
    logger.info(f"\n{'='*50}")
    logger.info(f"Discovery complete!")
    logger.info(f" Files processed: {len(files)}")
    logger.info(f" Websites found: {found_count}")
    logger.info(f" Not found: {not_found_count}")
    logger.info(f"{'='*50}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: run the async driver to completion.
    asyncio.run(main())
|
||||||
150
scripts/discover_websites_crawl4ai.py
Normal file
150
scripts/discover_websites_crawl4ai.py
Normal file
|
|
@ -0,0 +1,150 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Simplified Website Discovery for Custodians using crawl4ai.
|
||||||
|
Discovers websites by:
|
||||||
|
1. Searching DuckDuckGo
|
||||||
|
2. Verifying with crawl4ai
|
||||||
|
3. Updating YAML files with discovered URLs
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import httpx
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
|
||||||
|
CHECKPOINT_FILE = CUSTODIAN_DIR / ".website_discovery_crawl4ai_checkpoint.json"
|
||||||
|
REQUEST_DELAY = 3.0 # seconds between requests
|
||||||
|
DUCKDUCKGO_SEARCH = "https://duckduckgo.com/html/?q="
|
||||||
|
|
||||||
|
async def discover_websites(name, city, country):
    """Search DuckDuckGo's HTML endpoint for *name* and verify candidates.

    Builds a simple "name city" query, scrapes result links from the
    HTML response, probes candidates with an HTTP GET, and returns a
    status dict ({'status': 'found'|'not_found', ...}) or None when the
    search itself fails.  *country* is accepted for interface
    compatibility but not used in the query.
    """
    logger.info(f"Searching for: {name}")

    # Build the query: institution name, optionally followed by the city.
    city_part = f" {city}" if city else ""
    query = f"{name}{city_part}"

    # Search DuckDuckGo's HTML (non-JS) endpoint.
    search_url = f"{DUCKDUCKGO_SEARCH}{query.replace(' ', '+')}"

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client:
            response = await client.get(search_url)
            if response.status_code not in [200, 202]:
                logger.warning(f"Search failed: {response.status_code}")
                return None

            html = response.text
            links = []
            # Extract anchor href/text pairs.  (The previous pattern
            # demanded a stray quoted attribute after href and captured
            # garbage, so it matched almost nothing.)
            for match in re.finditer(r'<a[^>]+href="([^"]+)"[^>]*>([^<]+)</a>', html, re.I):
                href = match.group(1).replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')
                # Keep only absolute external links; DuckDuckGo's own
                # navigation/redirect links are relative or internal.
                if href.startswith('http') and 'duckduckgo.com' not in href:
                    links.append({'url': href, 'title': match.group(2)})

            if not links:
                logger.info(f"No results found")
                return None

            logger.info(f"Found {len(links)} candidates, verifying...")

            verified = []
            # Shorter titles first — crude heuristic favouring homepages
            # over deep article pages.
            for link in sorted(links, key=lambda x: len(x['title'])):
                try:
                    async with httpx.AsyncClient(timeout=15.0) as verify_client:
                        verify_response = await verify_client.get(link['url'])
                        if verify_response.status_code == 200:
                            logger.info(f"Verified: {link['url']}")
                            verified.append({
                                'url': link['url'],
                                'title': link['title'],
                                'status': 'found'
                            })
                            # Only the first verified candidate is ever
                            # used below, so stop probing once we have one.
                            break
                        else:
                            logger.debug(f"Verification failed for {link['url']}")
                except Exception:
                    logger.debug(f"Verification error for {link['url']}")

            if verified:
                best = verified[0]
                logger.info(f"Best candidate: {best['url']}")
                return {
                    'status': 'found',
                    'message': f"Discovered and verified: {best['url']}",
                    'website_url': best['url'],
                    'title': best.get('title'),
                }
            else:
                logger.info(f"No valid websites found")
                return {
                    'status': 'not_found',
                    'message': 'No valid results found'
                }

    except Exception as e:
        logger.error(f"Search error: {e}")
        return None
|
||||||
|
|
||||||
|
def update_custodian_file(filepath, website_url, title):
    """Attach a ``website_discovery`` section to a custodian YAML file.

    Reads the entry at *filepath*, records the discovered *website_url*
    and page *title* with a UTC timestamp, and writes the file back.
    Returns True on success, False on an empty file or any error
    (which is logged).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            record = yaml.safe_load(handle)

        if not record:
            logger.error(f"Invalid file: {filepath}")
            return False

        # Add website discovery section
        discovery = {
            'website_url': website_url,
            'discovery_date': datetime.now(timezone.utc).isoformat(),
            'discovery_method': 'crawl4ai_search_and_verify',
            'title': title,
            'confidence_score': 0.0,  # Will be updated if verification succeeds
        }
        record['website_discovery'] = discovery

        with open(filepath, 'w', encoding='utf-8') as handle:
            yaml.dump(record, handle, allow_unicode=True, default_flow_style=False, sort_keys=False)

        logger.info(f"Updated: {filepath}")
        return True

    except Exception as err:
        logger.error(f"Failed to update {filepath}: {err}")
        return False
|
||||||
|
|
||||||
|
async def main():
    """Smoke-test driver: discover a website for one Japanese custodian file."""
    files = sorted(CUSTODIAN_DIR.glob("JP-*.yaml"))[:1]  # Test with 1 file

    logger.info(f"Processing {len(files)} custodian files...")

    for filepath in files:
        # Derive a human-readable institution name from the filename.
        name = Path(filepath).stem.replace('_', ' ')
        logger.info(f"Processing: {name}")

        discovery = await discover_websites(name, None, 'JP')

        if not discovery:
            logger.info(f"No website found")
            continue

        website_url = discovery.get('website_url') or discovery.get('url')
        title = discovery.get('title')
        if update_custodian_file(filepath, website_url, title):
            logger.info(f" → Discovered: {website_url}")

    logger.info("Done!")
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: run the async driver to completion.
    asyncio.run(main())
|
||||||
|
|
@ -75,19 +75,26 @@ REQUEST_DELAY = 2.0 # seconds between requests
|
||||||
|
|
||||||
def get_website_url(entry: dict) -> str | None:
|
def get_website_url(entry: dict) -> str | None:
|
||||||
"""Extract website URL from custodian entry."""
|
"""Extract website URL from custodian entry."""
|
||||||
# Priority 1: Original entry webadres
|
# Priority 1: Original entry webadres (Dutch ISIL format)
|
||||||
if entry.get('original_entry', {}).get('webadres_organisatie'):
|
if entry.get('original_entry', {}).get('webadres_organisatie'):
|
||||||
url = entry['original_entry']['webadres_organisatie']
|
url = entry['original_entry']['webadres_organisatie']
|
||||||
if url and url.strip() and url.strip().lower() not in ('null', 'none', ''):
|
if url and url.strip() and url.strip().lower() not in ('null', 'none', ''):
|
||||||
return normalize_url(url.strip())
|
return normalize_url(url.strip())
|
||||||
|
|
||||||
# Priority 2: Museum register website
|
# Priority 2: Website in identifiers array (Czech ISIL and ARON format)
|
||||||
|
for ident in entry.get('original_entry', {}).get('identifiers', []):
|
||||||
|
if ident.get('identifier_scheme') == 'Website':
|
||||||
|
url = ident.get('identifier_value') or ident.get('identifier_url')
|
||||||
|
if url and url.strip():
|
||||||
|
return normalize_url(url.strip())
|
||||||
|
|
||||||
|
# Priority 3: Museum register website
|
||||||
if entry.get('museum_register_enrichment', {}).get('website_url'):
|
if entry.get('museum_register_enrichment', {}).get('website_url'):
|
||||||
url = entry['museum_register_enrichment']['website_url']
|
url = entry['museum_register_enrichment']['website_url']
|
||||||
if url and url.strip():
|
if url and url.strip():
|
||||||
return normalize_url(url.strip())
|
return normalize_url(url.strip())
|
||||||
|
|
||||||
# Priority 3: Wikidata official website
|
# Priority 4: Wikidata official website
|
||||||
if entry.get('wikidata_enrichment', {}).get('wikidata_official_website'):
|
if entry.get('wikidata_enrichment', {}).get('wikidata_official_website'):
|
||||||
url = entry['wikidata_enrichment']['wikidata_official_website']
|
url = entry['wikidata_enrichment']['wikidata_official_website']
|
||||||
# Handle list of URLs (take first one)
|
# Handle list of URLs (take first one)
|
||||||
|
|
@ -96,13 +103,13 @@ def get_website_url(entry: dict) -> str | None:
|
||||||
if url and isinstance(url, str) and url.strip():
|
if url and isinstance(url, str) and url.strip():
|
||||||
return normalize_url(url.strip())
|
return normalize_url(url.strip())
|
||||||
|
|
||||||
# Priority 4: Google Maps website
|
# Priority 5: Google Maps website
|
||||||
if entry.get('google_maps_enrichment', {}).get('website'):
|
if entry.get('google_maps_enrichment', {}).get('website'):
|
||||||
url = entry['google_maps_enrichment']['website']
|
url = entry['google_maps_enrichment']['website']
|
||||||
if url and url.strip():
|
if url and url.strip():
|
||||||
return normalize_url(url.strip())
|
return normalize_url(url.strip())
|
||||||
|
|
||||||
# Priority 5: Web enrichment source URL
|
# Priority 6: Web enrichment source URL
|
||||||
if entry.get('web_enrichment', {}).get('source_url'):
|
if entry.get('web_enrichment', {}).get('source_url'):
|
||||||
url = entry['web_enrichment']['source_url']
|
url = entry['web_enrichment']['source_url']
|
||||||
if url and url.strip():
|
if url and url.strip():
|
||||||
|
|
|
||||||
|
|
@ -54,9 +54,22 @@ def extract_person_text(data: dict[str, Any]) -> str:
|
||||||
parts = []
|
parts = []
|
||||||
|
|
||||||
profile = data.get("profile_data", {})
|
profile = data.get("profile_data", {})
|
||||||
|
person = data.get("person", {})
|
||||||
|
source_staff = data.get("source_staff_info", {})
|
||||||
|
extraction = data.get("extraction_metadata", {})
|
||||||
|
|
||||||
# Full name (primary identifier)
|
# Full name - check ALL possible locations in order of priority
|
||||||
name = profile.get("full_name", "")
|
name = (
|
||||||
|
profile.get("full_name") or
|
||||||
|
profile.get("name") or
|
||||||
|
person.get("full_name") or
|
||||||
|
person.get("name") or
|
||||||
|
source_staff.get("name") or
|
||||||
|
source_staff.get("person_name") or
|
||||||
|
extraction.get("person_name") or
|
||||||
|
data.get("name") or
|
||||||
|
""
|
||||||
|
)
|
||||||
if name:
|
if name:
|
||||||
parts.append(f"Name: {name}")
|
parts.append(f"Name: {name}")
|
||||||
|
|
||||||
|
|
@ -259,13 +272,21 @@ def extract_metadata(data: dict[str, Any], filepath: Path) -> dict[str, Any]:
|
||||||
}
|
}
|
||||||
|
|
||||||
profile = data.get("profile_data", {})
|
profile = data.get("profile_data", {})
|
||||||
|
person = data.get("person", {})
|
||||||
|
source_staff = data.get("source_staff_info", {})
|
||||||
extraction = data.get("extraction_metadata", {})
|
extraction = data.get("extraction_metadata", {})
|
||||||
|
|
||||||
# Full name - check multiple possible field names
|
# Full name - check ALL possible field names (same as extract_person_text)
|
||||||
name = (
|
name = (
|
||||||
profile.get("name", "") or
|
profile.get("full_name") or
|
||||||
profile.get("full_name", "") or
|
profile.get("name") or
|
||||||
data.get("name", "")
|
person.get("full_name") or
|
||||||
|
person.get("name") or
|
||||||
|
source_staff.get("name") or
|
||||||
|
source_staff.get("person_name") or
|
||||||
|
extraction.get("person_name") or
|
||||||
|
data.get("name") or
|
||||||
|
""
|
||||||
)
|
)
|
||||||
if name:
|
if name:
|
||||||
metadata["name"] = name
|
metadata["name"] = name
|
||||||
|
|
@ -414,16 +435,19 @@ def find_person_files(data_dir: Path) -> list[Path]:
|
||||||
|
|
||||||
|
|
||||||
class PersonRetriever:
|
class PersonRetriever:
|
||||||
"""Qdrant retriever specifically for person entities."""
|
"""Qdrant retriever specifically for person entities.
|
||||||
|
|
||||||
|
Uses MiniLM (384-dim) embeddings by default for consistency with
|
||||||
|
the hybrid_retriever.py query-time embedding model.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
host: str = "localhost",
|
host: str = "localhost",
|
||||||
port: int = 6333,
|
port: int = 6333,
|
||||||
collection_name: str = "heritage_persons",
|
collection_name: str = "heritage_persons",
|
||||||
embedding_model: str = "text-embedding-3-small",
|
embedding_model: str = "all-MiniLM-L6-v2", # MiniLM for local embeddings
|
||||||
embedding_dim: int = 1536,
|
embedding_dim: int = 384, # MiniLM output dimension
|
||||||
api_key: str | None = None,
|
|
||||||
url: str | None = None,
|
url: str | None = None,
|
||||||
https: bool = False,
|
https: bool = False,
|
||||||
prefix: str | None = None,
|
prefix: str | None = None,
|
||||||
|
|
@ -434,7 +458,7 @@ class PersonRetriever:
|
||||||
self.collection_name = collection_name
|
self.collection_name = collection_name
|
||||||
self.embedding_model = embedding_model
|
self.embedding_model = embedding_model
|
||||||
self.embedding_dim = embedding_dim
|
self.embedding_dim = embedding_dim
|
||||||
self.api_key = api_key or os.getenv("OPENAI_API_KEY")
|
# MiniLM model runs locally, no API key needed
|
||||||
|
|
||||||
# Initialize Qdrant client
|
# Initialize Qdrant client
|
||||||
if url:
|
if url:
|
||||||
|
|
@ -451,25 +475,23 @@ class PersonRetriever:
|
||||||
else:
|
else:
|
||||||
self.client = QdrantClient(host=host, port=port, timeout=60)
|
self.client = QdrantClient(host=host, port=port, timeout=60)
|
||||||
|
|
||||||
self._openai_client = None
|
self._sentence_model = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def openai_client(self):
|
def sentence_model(self):
|
||||||
"""Lazy-load OpenAI client."""
|
"""Lazy-load SentenceTransformer model."""
|
||||||
if self._openai_client is None:
|
if self._sentence_model is None:
|
||||||
import openai
|
from sentence_transformers import SentenceTransformer
|
||||||
self._openai_client = openai.OpenAI(api_key=self.api_key)
|
logger.info(f"Loading embedding model: {self.embedding_model}")
|
||||||
return self._openai_client
|
self._sentence_model = SentenceTransformer(self.embedding_model)
|
||||||
|
return self._sentence_model
|
||||||
|
|
||||||
def _get_embeddings_batch(self, texts: list[str]) -> list[list[float]]:
|
def _get_embeddings_batch(self, texts: list[str]) -> list[list[float]]:
|
||||||
"""Get embedding vectors for multiple texts."""
|
"""Get embedding vectors for multiple texts using MiniLM."""
|
||||||
if not texts:
|
if not texts:
|
||||||
return []
|
return []
|
||||||
response = self.openai_client.embeddings.create(
|
embeddings = self.sentence_model.encode(texts, show_progress_bar=False)
|
||||||
input=texts,
|
return embeddings.tolist()
|
||||||
model=self.embedding_model
|
|
||||||
)
|
|
||||||
return [item.embedding for item in sorted(response.data, key=lambda x: x.index)]
|
|
||||||
|
|
||||||
def ensure_collection(self) -> None:
|
def ensure_collection(self) -> None:
|
||||||
"""Ensure the collection exists, create if not."""
|
"""Ensure the collection exists, create if not."""
|
||||||
|
|
@ -655,10 +677,7 @@ def main():
|
||||||
logger.info(f" Metadata: {list(doc['metadata'].keys())}")
|
logger.info(f" Metadata: {list(doc['metadata'].keys())}")
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
# Check for OpenAI API key
|
# Note: MiniLM model runs locally, no API key needed
|
||||||
if not os.getenv("OPENAI_API_KEY"):
|
|
||||||
logger.error("OPENAI_API_KEY environment variable is required for embeddings")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# Create retriever
|
# Create retriever
|
||||||
if args.url:
|
if args.url:
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue