chore: add YAML history fix scripts and update ducklake/deploy tooling
- Add fix_yaml_history.py and fix_yaml_history_v2.py for cleaning up malformed ghcid_history entries with duplicate/redundant data - Update load_custodians_to_ducklake.py for DuckDB lakehouse loading - Update migrate_web_archives.py for web archive management - Update deploy.sh with improvements - Ignore entire data/ducklake/ directory (generated databases)
This commit is contained in:
parent
7f85238f67
commit
90a1f20271
6 changed files with 456 additions and 79 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -109,3 +109,4 @@ infrastructure/terraform/*.tfstate*
|
|||
# DuckDB lakehouse databases
|
||||
data/ducklake/*.duckdb
|
||||
data/ducklake/*.duckdb.wal
|
||||
data/ducklake/
|
||||
|
|
|
|||
|
|
@ -5,13 +5,13 @@
|
|||
# Usage: ./deploy.sh [options]
|
||||
# Options:
|
||||
# --infra Deploy infrastructure changes (Terraform)
|
||||
# --data Deploy ontology/schema data
|
||||
# --data Deploy ontologies directly to Oxigraph (no intermediate storage)
|
||||
# --frontend Build and deploy frontend
|
||||
# --api Deploy FastAPI backend (DSPy SPARQL generation)
|
||||
# --ducklake Deploy DuckLake API backend
|
||||
# --reload Reload data into Oxigraph
|
||||
# --all Deploy everything
|
||||
# --status Check server status only
|
||||
# --clear Clear Oxigraph store before loading (use with --data)
|
||||
|
||||
set -e
|
||||
|
||||
|
|
@ -34,7 +34,7 @@ fi
|
|||
HCLOUD_TOKEN="${HETZNER_HC_API_TOKEN:-}"
|
||||
SERVER_NAME="glam-sparql"
|
||||
SERVER_USER="root"
|
||||
REMOTE_DATA_DIR="/mnt/data"
|
||||
REMOTE_DATA_DIR="/mnt/data" # Used for LinkML/UML static files (if needed in future)
|
||||
|
||||
# Check for required token
|
||||
if [ -z "$HCLOUD_TOKEN" ]; then
|
||||
|
|
@ -50,11 +50,11 @@ DEPLOY_DATA=false
|
|||
DEPLOY_FRONTEND=false
|
||||
DEPLOY_API=false
|
||||
DEPLOY_DUCKLAKE=false
|
||||
RELOAD_OXIGRAPH=false
|
||||
CLEAR_OXIGRAPH=false
|
||||
STATUS_ONLY=false
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 [--infra] [--data] [--frontend] [--api] [--ducklake] [--reload] [--all] [--status]"
|
||||
echo "Usage: $0 [--infra] [--data] [--frontend] [--api] [--ducklake] [--all] [--status] [--clear]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
|
@ -75,8 +75,8 @@ for arg in "$@"; do
|
|||
--ducklake)
|
||||
DEPLOY_DUCKLAKE=true
|
||||
;;
|
||||
--reload)
|
||||
RELOAD_OXIGRAPH=true
|
||||
--clear)
|
||||
CLEAR_OXIGRAPH=true
|
||||
;;
|
||||
--all)
|
||||
DEPLOY_INFRA=true
|
||||
|
|
@ -84,7 +84,6 @@ for arg in "$@"; do
|
|||
DEPLOY_FRONTEND=true
|
||||
DEPLOY_API=true
|
||||
DEPLOY_DUCKLAKE=true
|
||||
RELOAD_OXIGRAPH=true
|
||||
;;
|
||||
--status)
|
||||
STATUS_ONLY=true
|
||||
|
|
@ -236,69 +235,168 @@ if [ -z "$SERVER_IP" ]; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
# Wait for SSH
|
||||
if [ "$DEPLOY_DATA" = true ] || [ "$DEPLOY_FRONTEND" = true ] || [ "$DEPLOY_API" = true ] || [ "$DEPLOY_DUCKLAKE" = true ] || [ "$RELOAD_OXIGRAPH" = true ]; then
|
||||
# Wait for SSH (only needed for frontend, API, DuckLake - not for --data which uses HTTP directly)
|
||||
if [ "$DEPLOY_FRONTEND" = true ] || [ "$DEPLOY_API" = true ] || [ "$DEPLOY_DUCKLAKE" = true ]; then
|
||||
wait_for_ssh "$SERVER_IP"
|
||||
fi
|
||||
|
||||
# Deploy data
|
||||
# Deploy data directly to Oxigraph
|
||||
if [ "$DEPLOY_DATA" = true ]; then
|
||||
echo ""
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE} Deploying Ontology & Schema Data${NC}"
|
||||
echo -e "${BLUE} Deploying Ontologies Directly to Oxigraph${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
|
||||
# Ensure remote directories exist
|
||||
ssh -o StrictHostKeyChecking=no "$SERVER_USER@$SERVER_IP" \
|
||||
"mkdir -p $REMOTE_DATA_DIR/{ontologies,rdf,linkml,uml}"
|
||||
OXIGRAPH_ENDPOINT="http://$SERVER_IP:7878"
|
||||
|
||||
# Sync ontologies
|
||||
echo -e "${YELLOW}Syncing ontology files...${NC}"
|
||||
rsync -avz --progress \
|
||||
-e "ssh -o StrictHostKeyChecking=no" \
|
||||
--include="*.ttl" --include="*.rdf" --include="*.owl" --include="*.jsonld" \
|
||||
--exclude="*" \
|
||||
"$PROJECT_ROOT/data/ontology/" \
|
||||
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/ontologies/"
|
||||
# Check Oxigraph is running
|
||||
echo -e "${YELLOW}Checking Oxigraph status...${NC}"
|
||||
if ! curl -s --connect-timeout 5 "$OXIGRAPH_ENDPOINT/" > /dev/null 2>&1; then
|
||||
echo -e "${RED}Error: Oxigraph not responding at $OXIGRAPH_ENDPOINT${NC}"
|
||||
echo "Ensure Oxigraph is running on the server."
|
||||
exit 1
|
||||
fi
|
||||
echo -e "${GREEN}Oxigraph is running${NC}"
|
||||
|
||||
# Sync RDF schemas
|
||||
echo -e "${YELLOW}Syncing RDF schema files...${NC}"
|
||||
rsync -avz --progress \
|
||||
-e "ssh -o StrictHostKeyChecking=no" \
|
||||
--exclude="archive_*" \
|
||||
"$PROJECT_ROOT/schemas/20251121/rdf/" \
|
||||
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/rdf/"
|
||||
# Get initial triple count
|
||||
INITIAL_COUNT=$(curl -s -X POST \
|
||||
-H "Content-Type: application/sparql-query" \
|
||||
-H "Accept: application/sparql-results+json" \
|
||||
--data "SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }" \
|
||||
"$OXIGRAPH_ENDPOINT/query" | jq -r ".results.bindings[0].count.value // \"0\"")
|
||||
echo -e "${BLUE}Initial triple count: $INITIAL_COUNT${NC}"
|
||||
|
||||
# Sync LinkML schemas
|
||||
echo -e "${YELLOW}Syncing LinkML schemas...${NC}"
|
||||
rsync -avz --progress \
|
||||
-e "ssh -o StrictHostKeyChecking=no" \
|
||||
--include="*.yaml" --include="*/" --exclude="*" \
|
||||
"$PROJECT_ROOT/schemas/20251121/linkml/" \
|
||||
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/linkml/"
|
||||
|
||||
# Sync UML diagrams
|
||||
echo -e "${YELLOW}Syncing UML diagrams...${NC}"
|
||||
rsync -avz --progress \
|
||||
-e "ssh -o StrictHostKeyChecking=no" \
|
||||
--include="*.mmd" --exclude="*" \
|
||||
"$PROJECT_ROOT/schemas/20251121/uml/mermaid/" \
|
||||
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/uml/"
|
||||
|
||||
# Sync NDE Heritage Custodian RDF files
|
||||
if [ -d "$PROJECT_ROOT/data/nde/rdf" ]; then
|
||||
echo -e "${YELLOW}Syncing NDE Heritage Custodian RDF files...${NC}"
|
||||
ssh -o StrictHostKeyChecking=no "$SERVER_USER@$SERVER_IP" \
|
||||
"mkdir -p $REMOTE_DATA_DIR/nde/rdf"
|
||||
rsync -avz --progress \
|
||||
-e "ssh -o StrictHostKeyChecking=no" \
|
||||
--include="*.ttl" --exclude="*" \
|
||||
"$PROJECT_ROOT/data/nde/rdf/" \
|
||||
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/nde/rdf/"
|
||||
echo -e "${GREEN}NDE RDF: $(ls -1 $PROJECT_ROOT/data/nde/rdf/*.ttl 2>/dev/null | wc -l) files synced${NC}"
|
||||
# Clear store if requested
|
||||
if [ "$CLEAR_OXIGRAPH" = true ]; then
|
||||
echo -e "${YELLOW}Clearing Oxigraph store...${NC}"
|
||||
curl -s -X POST \
|
||||
-H "Content-Type: application/sparql-update" \
|
||||
--data "CLEAR ALL" \
|
||||
"$OXIGRAPH_ENDPOINT/update" > /dev/null
|
||||
echo -e "${GREEN}Store cleared${NC}"
|
||||
fi
|
||||
|
||||
# Function to load a file to Oxigraph
|
||||
load_file() {
|
||||
local file="$1"
|
||||
local content_type="$2"
|
||||
local filename=$(basename "$file")
|
||||
|
||||
local http_code=$(curl -s -X POST \
|
||||
-H "Content-Type: $content_type" \
|
||||
--data-binary "@$file" \
|
||||
-w "%{http_code}" \
|
||||
-o /dev/null \
|
||||
"$OXIGRAPH_ENDPOINT/store?default")
|
||||
|
||||
if [ "$http_code" = "204" ] || [ "$http_code" = "200" ]; then
|
||||
echo -e " ${GREEN}✓${NC} $filename"
|
||||
return 0
|
||||
else
|
||||
echo -e " ${RED}✗${NC} $filename (HTTP $http_code)"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Track statistics
|
||||
LOADED=0
|
||||
FAILED=0
|
||||
|
||||
# Load base ontologies from data/ontology/
|
||||
echo ""
|
||||
echo -e "${YELLOW}Loading base ontologies...${NC}"
|
||||
|
||||
# Turtle files
|
||||
for file in "$PROJECT_ROOT/data/ontology"/*.ttl; do
|
||||
if [ -f "$file" ]; then
|
||||
if load_file "$file" "text/turtle"; then
|
||||
LOADED=$((LOADED + 1))
|
||||
else
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# RDF/XML files (.rdf and .owl)
|
||||
for file in "$PROJECT_ROOT/data/ontology"/*.rdf "$PROJECT_ROOT/data/ontology"/*.owl; do
|
||||
if [ -f "$file" ]; then
|
||||
if load_file "$file" "application/rdf+xml"; then
|
||||
LOADED=$((LOADED + 1))
|
||||
else
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# JSON-LD files
|
||||
for file in "$PROJECT_ROOT/data/ontology"/*.jsonld; do
|
||||
if [ -f "$file" ]; then
|
||||
if load_file "$file" "application/ld+json"; then
|
||||
LOADED=$((LOADED + 1))
|
||||
else
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# Load generated RDF schemas
|
||||
echo ""
|
||||
echo -e "${YELLOW}Loading generated RDF schemas...${NC}"
|
||||
|
||||
for file in "$PROJECT_ROOT/schemas/20251121/rdf"/*.ttl; do
|
||||
if [ -f "$file" ]; then
|
||||
# Skip archived files
|
||||
if [[ "$file" == *"archive_"* ]]; then
|
||||
continue
|
||||
fi
|
||||
if load_file "$file" "text/turtle"; then
|
||||
LOADED=$((LOADED + 1))
|
||||
else
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# Load NDE Heritage Custodian RDF files
|
||||
if [ -d "$PROJECT_ROOT/data/nde/rdf" ]; then
|
||||
echo ""
|
||||
echo -e "${YELLOW}Loading NDE Heritage Custodian data...${NC}"
|
||||
NDE_COUNT=0
|
||||
NDE_TOTAL=$(ls -1 "$PROJECT_ROOT/data/nde/rdf"/*.ttl 2>/dev/null | wc -l | tr -d ' ')
|
||||
|
||||
for file in "$PROJECT_ROOT/data/nde/rdf"/*.ttl; do
|
||||
if [ -f "$file" ]; then
|
||||
NDE_COUNT=$((NDE_COUNT + 1))
|
||||
# Show progress every 100 files
|
||||
if [ $((NDE_COUNT % 100)) -eq 0 ]; then
|
||||
echo -e " Loading NDE files: $NDE_COUNT / $NDE_TOTAL"
|
||||
fi
|
||||
if load_file "$file" "text/turtle" 2>/dev/null; then
|
||||
LOADED=$((LOADED + 1))
|
||||
else
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
fi
|
||||
done
|
||||
echo -e " ${GREEN}Loaded $NDE_COUNT NDE heritage custodian files${NC}"
|
||||
fi
|
||||
|
||||
# Get final triple count
|
||||
echo ""
|
||||
FINAL_COUNT=$(curl -s -X POST \
|
||||
-H "Content-Type: application/sparql-query" \
|
||||
-H "Accept: application/sparql-results+json" \
|
||||
--data "SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }" \
|
||||
"$OXIGRAPH_ENDPOINT/query" | jq -r ".results.bindings[0].count.value // \"0\"")
|
||||
|
||||
ADDED=$((FINAL_COUNT - INITIAL_COUNT))
|
||||
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${GREEN}Data deployment complete${NC}"
|
||||
echo -e " Files loaded: ${GREEN}$LOADED${NC}"
|
||||
echo -e " Files failed: ${RED}$FAILED${NC}"
|
||||
echo -e " Triples added: ${BLUE}$ADDED${NC}"
|
||||
echo -e " Total triples: ${BLUE}$FINAL_COUNT${NC}"
|
||||
fi
|
||||
|
||||
# Deploy frontend
|
||||
|
|
@ -487,19 +585,6 @@ ENDSSH
|
|||
echo -e "${GREEN}DuckLake API deployment complete${NC}"
|
||||
fi
|
||||
|
||||
# Reload Oxigraph
|
||||
if [ "$RELOAD_OXIGRAPH" = true ]; then
|
||||
echo ""
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE} Reloading Oxigraph Data${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
|
||||
ssh -o StrictHostKeyChecking=no "$SERVER_USER@$SERVER_IP" \
|
||||
"/var/lib/glam/scripts/load-ontologies.sh"
|
||||
|
||||
echo -e "${GREEN}Oxigraph reload complete${NC}"
|
||||
fi
|
||||
|
||||
# Final status
|
||||
echo ""
|
||||
echo -e "${BLUE}════════════════════════════════════════════════════════════════${NC}"
|
||||
|
|
|
|||
90
scripts/fix_yaml_history.py
Executable file
90
scripts/fix_yaml_history.py
Executable file
|
|
@ -0,0 +1,90 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fix malformed ghcid_history entries in YAML files.
|
||||
|
||||
The issue: Some files have multiple history entries concatenated on a single line:
|
||||
reason: Migrated from CH to EG namespace - Assiut - ghcid: XX-XX-XXX-L-AUL
|
||||
|
||||
This should be split into separate list items.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
def fix_yaml_content(content: str) -> str:
    """Split malformed ghcid_history lines into separate list items.

    Some ``reason:`` lines have the next history entry concatenated onto
    the same line, e.g.::

        reason: Migrated from CH to EG namespace - Assiut - ghcid: XX-XX-XXX-L-AUL

    Each such line is split at the first embedded marker into the
    truncated reason line plus a new ``- ghcid:`` / ``- valid_from:``
    list item.  All other lines pass through unchanged.

    :param content: full text of a YAML file
    :return: the text with embedded entries split onto their own lines
    """
    # Markers are checked in this order; ' - ghcid:' wins if both occur.
    markers = (' - ghcid:', ' - valid_from:')
    fixed_lines = []

    for line in content.split('\n'):
        marker = next(
            (m for m in markers if 'reason:' in line and m in line),
            None,
        )
        if marker is None:
            fixed_lines.append(line)
            continue

        head, tail = line.split(marker, 1)
        fixed_lines.append(head.rstrip())
        # NOTE(review): the new "- " item reuses the reason line's own
        # indentation; if the list marker belongs one level up, adjust
        # here -- preserving the original behaviour for now.
        indent = re.match(r'^(\s*)', line).group(1)
        # marker[1:] drops the leading space, keeping "- ghcid:" etc.
        fixed_lines.append(f"{indent}{marker[1:]}{tail}")

    return '\n'.join(fixed_lines)
|
||||
|
||||
|
||||
def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """Apply the history fix to one YAML file.

    Returns True when the file's content would change.  Unless *dry_run*
    is set, the fixed content is written back in place.
    """
    original = filepath.read_text(encoding='utf-8')
    fixed = fix_yaml_content(original)

    if fixed == original:
        return False

    if dry_run:
        print(f"Would fix: {filepath.name}")
    else:
        filepath.write_text(fixed, encoding='utf-8')
        print(f"Fixed: {filepath.name}")
    return True
|
||||
|
||||
|
||||
def main():
    """CLI entry point: fix BE-/EG- custodian YAML files in place.

    Pass ``--dry-run`` to report which files would change without
    rewriting them.
    """
    dry_run = '--dry-run' in sys.argv

    custodian_dir = Path('data/custodian')
    if not custodian_dir.exists():
        print("Error: data/custodian directory not found")
        sys.exit(1)

    # Only BE- and EG- prefixed files are known to contain the defect.
    targets = (
        yaml_file
        for prefix in ('BE-', 'EG-')
        for yaml_file in sorted(custodian_dir.glob(f'{prefix}*.yaml'))
    )
    fixed_count = sum(1 for yaml_file in targets if process_file(yaml_file, dry_run))

    verb = 'Would fix' if dry_run else 'Fixed'
    print(f"\n{verb}: {fixed_count} files")


if __name__ == '__main__':
    main()
|
||||
154
scripts/fix_yaml_history_v2.py
Normal file
154
scripts/fix_yaml_history_v2.py
Normal file
|
|
@ -0,0 +1,154 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fix malformed ghcid_history entries in YAML files.
|
||||
Version 2: More robust parsing and reconstruction.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def fix_ghcid_history_section(content: str) -> str:
    """Rebuild the ``ghcid_history`` section of a custodian YAML file.

    Entries that were concatenated onto one line or otherwise mangled are
    re-parsed key by key, deduplicated on (ghcid, valid_from), and emitted
    as a clean, consistently indented YAML list.  Content before and after
    the section is left untouched.

    Fixes vs. the first draft of this function:
    - the section regex is anchored to a line start so the captured
      indent is pure indentation (a bare ``\\s*`` also swallowed the
      preceding newline, corrupting every rendered line), and
    - the ``ghcid_history:`` header is emitted exactly once (it was
      previously duplicated: once in ``before_section`` and again at the
      start of the rebuilt section).
    """
    history_match = re.search(r'^([ \t]*)ghcid_history:[ \t]*\n', content, re.MULTILINE)
    if not history_match:
        return content  # No ghcid_history section

    base_indent = history_match.group(1)
    list_indent = base_indent + "  "
    item_indent = list_indent + "  "

    # The section ends at the next key at the same indentation level.
    remaining = content[history_match.end():]
    end_match = re.search(rf'^{base_indent}[a-z_]+:', remaining, re.MULTILINE)
    if end_match:
        history_section = remaining[:end_match.start()]
        after_section = remaining[end_match.start():]
    else:
        history_section = remaining
        after_section = ""

    entries = _parse_history_entries(history_section)

    # Deduplicate on (ghcid, valid_from); drop entries with no ghcid at all.
    seen = set()
    unique_entries = []
    for entry in entries:
        key = (entry.get('ghcid', ''), entry.get('valid_from', ''))
        if key not in seen and entry.get('ghcid'):
            seen.add(key)
            unique_entries.append(entry)

    new_history = _render_history_entries(unique_entries, list_indent, item_indent)

    # Everything up to and including a single normalized header line.
    before_section = (
        content[:history_match.start()] + history_match.group(0).rstrip() + '\n'
    )
    return before_section + new_history + after_section


def _parse_history_entries(history_section: str) -> list:
    """Extract key/value dicts from a (possibly mangled) section body."""
    entries = []
    current_entry = {}

    for line in history_section.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue

        if stripped.startswith('- ghcid:') or (stripped.startswith('ghcid:') and not current_entry):
            # A ghcid key starts a new entry.
            if current_entry:
                entries.append(current_entry)
            value = stripped.replace('- ghcid:', '').replace('ghcid:', '').strip()
            current_entry = {'ghcid': value}
        elif stripped.startswith('- ') and ':' in stripped[2:]:
            # A list marker with a different key also starts a new entry.
            if current_entry:
                entries.append(current_entry)
            key, val = stripped[2:].split(':', 1)
            current_entry = {key.strip(): val.strip().strip("'\"")}
        elif ':' in stripped:
            # Plain key/value pair belonging to the current entry.
            key, val = stripped.split(':', 1)
            key = key.strip().replace('- ', '')
            val = val.strip().strip("'\"")
            if key and val:
                current_entry[key] = val

    if current_entry:
        entries.append(current_entry)
    return entries


def _render_history_entries(entries: list, list_indent: str, item_indent: str) -> str:
    """Emit entries as YAML list items (the section header is NOT included)."""
    out = []
    for entry in entries:
        out.append(f"{list_indent}- ghcid: {entry.get('ghcid', '')}\n")
        if 'valid_from' in entry:
            out.append(f"{item_indent}valid_from: '{entry['valid_from']}'\n")
        if 'ghcid_numeric' in entry:
            out.append(f"{item_indent}ghcid_numeric: {entry['ghcid_numeric']}\n")
        if 'reason' in entry:
            reason = entry['reason']
            # Quote reasons containing colons so the YAML stays parseable.
            if ':' in reason and not reason.startswith(('"', "'")):
                reason = f'"{reason}"'
            out.append(f"{item_indent}reason: {reason}\n")
    return ''.join(out)
|
||||
|
||||
|
||||
def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """Fix the ghcid_history section of one YAML file.

    Returns True when the file's content would change.  Unless *dry_run*
    is set, the fixed content is written back in place.
    """
    original = filepath.read_text(encoding='utf-8')
    fixed = fix_ghcid_history_section(original)

    if fixed == original:
        return False

    if dry_run:
        print(f"Would fix: {filepath.name}")
    else:
        filepath.write_text(fixed, encoding='utf-8')
        print(f"Fixed: {filepath.name}")
    return True
|
||||
|
||||
|
||||
def main():
    """CLI entry point: fix BE-/EG- custodian YAML files in place.

    Pass ``--dry-run`` to report which files would change without
    rewriting them.  Exits with status 1 if ``data/custodian`` does not
    exist (previously this silently reported "Fixed: 0 files" when the
    script was run from the wrong directory; v1 of this script already
    guards against that).
    """
    dry_run = '--dry-run' in sys.argv

    custodian_dir = Path('data/custodian')
    if not custodian_dir.exists():
        print("Error: data/custodian directory not found")
        sys.exit(1)

    fixed_count = 0

    # Only BE- and EG- prefixed files are known to contain the defect.
    for prefix in ['BE-', 'EG-']:
        for yaml_file in sorted(custodian_dir.glob(f'{prefix}*.yaml')):
            if process_file(yaml_file, dry_run):
                fixed_count += 1

    print(f"\n{'Would fix' if dry_run else 'Fixed'}: {fixed_count} files")


if __name__ == '__main__':
    main()
|
||||
|
|
@ -104,12 +104,47 @@ def extract_top_level_fields(data: dict) -> dict:
|
|||
# Extract original entry
|
||||
original = data.get("original_entry", {})
|
||||
if original:
|
||||
record["org_name"] = original.get("organisatie", "")
|
||||
org_types = original.get("type", [])
|
||||
record["org_type"] = ",".join(org_types) if isinstance(org_types, list) else str(org_types)
|
||||
# Try multiple field names for organization name
|
||||
record["org_name"] = original.get("name", "") or original.get("organisatie", "")
|
||||
|
||||
# Try multiple field names for institution type
|
||||
# First try 'institution_type' (CH-Annotator format), then 'type' (older format)
|
||||
inst_type = original.get("institution_type", "") or original.get("type", "")
|
||||
if isinstance(inst_type, list):
|
||||
inst_type = ",".join(inst_type)
|
||||
else:
|
||||
inst_type = str(inst_type) if inst_type else ""
|
||||
|
||||
# Normalize institution type names to standard codes
|
||||
type_normalize = {
|
||||
"OFFICIAL_INSTITUTION": "OFFICIAL",
|
||||
"RESEARCH_CENTER": "RESEARCH",
|
||||
"BOTANICAL_ZOO": "BOTANICAL",
|
||||
"EDUCATION_PROVIDER": "EDUCATION",
|
||||
"COLLECTING_SOCIETY": "SOCIETY",
|
||||
"INTANGIBLE_HERITAGE_GROUP": "INTANGIBLE",
|
||||
}
|
||||
record["org_type"] = type_normalize.get(inst_type.upper(), inst_type.upper()) if inst_type else ""
|
||||
|
||||
record["wikidata_id"] = original.get("wikidata_id", "")
|
||||
record["original_entry_json"] = json.dumps(original, ensure_ascii=False, default=str)
|
||||
|
||||
# Fallback: Extract org_type from GHCID code (4th component)
|
||||
# GHCID format: CC-RR-CCC-T-ABBREV where T is the type code
|
||||
if not record["org_type"] and record.get("ghcid_current"):
|
||||
ghcid_parts = record["ghcid_current"].split("-")
|
||||
if len(ghcid_parts) >= 4:
|
||||
type_code = ghcid_parts[3]
|
||||
# Map single-letter codes to full type names
|
||||
type_map = {
|
||||
"G": "GALLERY", "L": "LIBRARY", "A": "ARCHIVE", "M": "MUSEUM",
|
||||
"O": "OFFICIAL", "R": "RESEARCH", "C": "CORPORATION", "U": "UNKNOWN",
|
||||
"B": "BOTANICAL", "E": "EDUCATION", "S": "SOCIETY", "F": "FEATURES",
|
||||
"I": "INTANGIBLE", "X": "MIXED", "P": "PERSONAL", "H": "HOLY_SITES",
|
||||
"D": "DIGITAL", "N": "NGO", "T": "TASTE_SMELL"
|
||||
}
|
||||
record["org_type"] = type_map.get(type_code, type_code)
|
||||
|
||||
# Extract Google Maps data
|
||||
gm = data.get("google_maps_enrichment", {})
|
||||
if gm:
|
||||
|
|
|
|||
|
|
@ -211,6 +211,7 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
recognition_confidence FLOAT,
|
||||
linking_confidence FLOAT,
|
||||
wikidata_id VARCHAR,
|
||||
source_page VARCHAR,
|
||||
FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid)
|
||||
)
|
||||
""")
|
||||
|
|
@ -224,9 +225,12 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
claim_id_counter = 0
|
||||
|
||||
web_folders = get_web_archive_folders()
|
||||
logger.info(f"Processing {len(web_folders)} web archive folders for DuckLake...")
|
||||
total_folders = len(web_folders)
|
||||
logger.info(f"Processing {total_folders} web archive folders for DuckLake...")
|
||||
|
||||
for folder in web_folders:
|
||||
for idx, folder in enumerate(web_folders):
|
||||
if idx % 100 == 0:
|
||||
logger.info(f"Progress: {idx}/{total_folders} folders processed ({idx*100//total_folders}%)")
|
||||
entry_index = int(folder.name)
|
||||
ghcid = mapping.get(entry_index)
|
||||
|
||||
|
|
@ -321,6 +325,11 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
with open(annotations_path, 'r', encoding='utf-8') as f:
|
||||
annotations = yaml.safe_load(f)
|
||||
|
||||
# Get source page from html_file
|
||||
html_file = annotations.get('html_file', '')
|
||||
# Extract just the filename part (e.g., "pages/index.html" -> "index.html")
|
||||
source_page = html_file.split('/')[-1] if html_file else 'index.html'
|
||||
|
||||
session = annotations.get('session', {})
|
||||
claims = session.get('claims', {})
|
||||
|
||||
|
|
@ -329,7 +338,7 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
claim_id_counter += 1
|
||||
provenance = claim.get('provenance', {})
|
||||
con.execute("""
|
||||
INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", [
|
||||
claim_id_counter,
|
||||
ghcid,
|
||||
|
|
@ -342,7 +351,8 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
provenance.get('path'),
|
||||
claim.get('recognition_confidence', 0),
|
||||
claim.get('linking_confidence', 0),
|
||||
claim.get('wikidata_id')
|
||||
claim.get('wikidata_id'),
|
||||
source_page
|
||||
])
|
||||
|
||||
# Process aggregate claims
|
||||
|
|
@ -350,7 +360,7 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
claim_id_counter += 1
|
||||
provenance = claim.get('provenance', {})
|
||||
con.execute("""
|
||||
INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", [
|
||||
claim_id_counter,
|
||||
ghcid,
|
||||
|
|
@ -363,7 +373,8 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
provenance.get('path'),
|
||||
provenance.get('confidence', 0),
|
||||
0,
|
||||
None
|
||||
None,
|
||||
source_page
|
||||
])
|
||||
except Exception as e:
|
||||
logger.debug(f"Error processing annotations for {ghcid}: {e}")
|
||||
|
|
@ -373,6 +384,7 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
con.execute("CREATE INDEX IF NOT EXISTS idx_claims_ghcid ON web_claims(ghcid)")
|
||||
con.execute("CREATE INDEX IF NOT EXISTS idx_claims_type ON web_claims(claim_type)")
|
||||
con.execute("CREATE INDEX IF NOT EXISTS idx_claims_hypernym ON web_claims(hypernym)")
|
||||
con.execute("CREATE INDEX IF NOT EXISTS idx_claims_source_page ON web_claims(source_page)")
|
||||
|
||||
# Get stats
|
||||
archive_count = con.execute("SELECT COUNT(*) FROM web_archives").fetchone()[0]
|
||||
|
|
|
|||
Loading…
Reference in a new issue