chore: add YAML history fix scripts and update ducklake/deploy tooling

- Add fix_yaml_history.py and fix_yaml_history_v2.py for cleaning up
  malformed ghcid_history entries with duplicate/redundant data
- Update load_custodians_to_ducklake.py for DuckDB lakehouse loading
- Update migrate_web_archives.py for web archive management
- Update deploy.sh with improvements
- Ignore entire data/ducklake/ directory (generated databases)
This commit is contained in:
kempersc 2025-12-07 18:45:52 +01:00
parent 7f85238f67
commit 90a1f20271
6 changed files with 456 additions and 79 deletions

1
.gitignore vendored
View file

@ -109,3 +109,4 @@ infrastructure/terraform/*.tfstate*
# DuckDB lakehouse databases
data/ducklake/*.duckdb
data/ducklake/*.duckdb.wal
data/ducklake/

View file

@ -5,13 +5,13 @@
# Usage: ./deploy.sh [options]
# Options:
# --infra Deploy infrastructure changes (Terraform)
# --data Deploy ontology/schema data
# --data Deploy ontologies directly to Oxigraph (no intermediate storage)
# --frontend Build and deploy frontend
# --api Deploy FastAPI backend (DSPy SPARQL generation)
# --ducklake Deploy DuckLake API backend
# --reload Reload data into Oxigraph
# --all Deploy everything
# --status Check server status only
# --clear Clear Oxigraph store before loading (use with --data)
set -e
@ -34,7 +34,7 @@ fi
HCLOUD_TOKEN="${HETZNER_HC_API_TOKEN:-}"
SERVER_NAME="glam-sparql"
SERVER_USER="root"
REMOTE_DATA_DIR="/mnt/data"
REMOTE_DATA_DIR="/mnt/data" # Used for LinkML/UML static files (if needed in future)
# Check for required token
if [ -z "$HCLOUD_TOKEN" ]; then
@ -50,11 +50,11 @@ DEPLOY_DATA=false
DEPLOY_FRONTEND=false
DEPLOY_API=false
DEPLOY_DUCKLAKE=false
RELOAD_OXIGRAPH=false
CLEAR_OXIGRAPH=false
STATUS_ONLY=false
if [ $# -eq 0 ]; then
echo "Usage: $0 [--infra] [--data] [--frontend] [--api] [--ducklake] [--reload] [--all] [--status]"
echo "Usage: $0 [--infra] [--data] [--frontend] [--api] [--ducklake] [--all] [--status] [--clear]"
exit 1
fi
@ -75,8 +75,8 @@ for arg in "$@"; do
--ducklake)
DEPLOY_DUCKLAKE=true
;;
--reload)
RELOAD_OXIGRAPH=true
--clear)
CLEAR_OXIGRAPH=true
;;
--all)
DEPLOY_INFRA=true
@ -84,7 +84,6 @@ for arg in "$@"; do
DEPLOY_FRONTEND=true
DEPLOY_API=true
DEPLOY_DUCKLAKE=true
RELOAD_OXIGRAPH=true
;;
--status)
STATUS_ONLY=true
@ -236,69 +235,168 @@ if [ -z "$SERVER_IP" ]; then
exit 1
fi
# Wait for SSH
if [ "$DEPLOY_DATA" = true ] || [ "$DEPLOY_FRONTEND" = true ] || [ "$DEPLOY_API" = true ] || [ "$DEPLOY_DUCKLAKE" = true ] || [ "$RELOAD_OXIGRAPH" = true ]; then
# Wait for SSH (only needed for frontend, API, DuckLake - not for --data which uses HTTP directly)
if [ "$DEPLOY_FRONTEND" = true ] || [ "$DEPLOY_API" = true ] || [ "$DEPLOY_DUCKLAKE" = true ]; then
wait_for_ssh "$SERVER_IP"
fi
# Deploy data
# Deploy data directly to Oxigraph
if [ "$DEPLOY_DATA" = true ]; then
echo ""
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${BLUE} Deploying Ontology & Schema Data${NC}"
echo -e "${BLUE} Deploying Ontologies Directly to Oxigraph${NC}"
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
# Ensure remote directories exist
ssh -o StrictHostKeyChecking=no "$SERVER_USER@$SERVER_IP" \
"mkdir -p $REMOTE_DATA_DIR/{ontologies,rdf,linkml,uml}"
OXIGRAPH_ENDPOINT="http://$SERVER_IP:7878"
# Sync ontologies
echo -e "${YELLOW}Syncing ontology files...${NC}"
rsync -avz --progress \
-e "ssh -o StrictHostKeyChecking=no" \
--include="*.ttl" --include="*.rdf" --include="*.owl" --include="*.jsonld" \
--exclude="*" \
"$PROJECT_ROOT/data/ontology/" \
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/ontologies/"
# Check Oxigraph is running
echo -e "${YELLOW}Checking Oxigraph status...${NC}"
if ! curl -s --connect-timeout 5 "$OXIGRAPH_ENDPOINT/" > /dev/null 2>&1; then
echo -e "${RED}Error: Oxigraph not responding at $OXIGRAPH_ENDPOINT${NC}"
echo "Ensure Oxigraph is running on the server."
exit 1
fi
echo -e "${GREEN}Oxigraph is running${NC}"
# Sync RDF schemas
echo -e "${YELLOW}Syncing RDF schema files...${NC}"
rsync -avz --progress \
-e "ssh -o StrictHostKeyChecking=no" \
--exclude="archive_*" \
"$PROJECT_ROOT/schemas/20251121/rdf/" \
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/rdf/"
# Get initial triple count
INITIAL_COUNT=$(curl -s -X POST \
-H "Content-Type: application/sparql-query" \
-H "Accept: application/sparql-results+json" \
--data "SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }" \
"$OXIGRAPH_ENDPOINT/query" | jq -r ".results.bindings[0].count.value // \"0\"")
echo -e "${BLUE}Initial triple count: $INITIAL_COUNT${NC}"
# Sync LinkML schemas
echo -e "${YELLOW}Syncing LinkML schemas...${NC}"
rsync -avz --progress \
-e "ssh -o StrictHostKeyChecking=no" \
--include="*.yaml" --include="*/" --exclude="*" \
"$PROJECT_ROOT/schemas/20251121/linkml/" \
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/linkml/"
# Sync UML diagrams
echo -e "${YELLOW}Syncing UML diagrams...${NC}"
rsync -avz --progress \
-e "ssh -o StrictHostKeyChecking=no" \
--include="*.mmd" --exclude="*" \
"$PROJECT_ROOT/schemas/20251121/uml/mermaid/" \
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/uml/"
# Sync NDE Heritage Custodian RDF files
if [ -d "$PROJECT_ROOT/data/nde/rdf" ]; then
echo -e "${YELLOW}Syncing NDE Heritage Custodian RDF files...${NC}"
ssh -o StrictHostKeyChecking=no "$SERVER_USER@$SERVER_IP" \
"mkdir -p $REMOTE_DATA_DIR/nde/rdf"
rsync -avz --progress \
-e "ssh -o StrictHostKeyChecking=no" \
--include="*.ttl" --exclude="*" \
"$PROJECT_ROOT/data/nde/rdf/" \
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/nde/rdf/"
echo -e "${GREEN}NDE RDF: $(ls -1 $PROJECT_ROOT/data/nde/rdf/*.ttl 2>/dev/null | wc -l) files synced${NC}"
# Clear store if requested
if [ "$CLEAR_OXIGRAPH" = true ]; then
echo -e "${YELLOW}Clearing Oxigraph store...${NC}"
curl -s -X POST \
-H "Content-Type: application/sparql-update" \
--data "CLEAR ALL" \
"$OXIGRAPH_ENDPOINT/update" > /dev/null
echo -e "${GREEN}Store cleared${NC}"
fi
# Function to load a file to Oxigraph
load_file() {
    # Upload one RDF file to the Oxigraph Graph Store endpoint (default graph).
    # $1 = path to the file, $2 = its MIME content type.
    # Prints a per-file success/failure line; returns 0 on HTTP 200/204.
    local path="$1"
    local mime="$2"
    local name
    name=$(basename "$path")
    local status
    status=$(curl -s -X POST \
        -H "Content-Type: $mime" \
        --data-binary "@$path" \
        -w "%{http_code}" \
        -o /dev/null \
        "$OXIGRAPH_ENDPOINT/store?default")
    case "$status" in
        200|204)
            echo -e " ${GREEN}${NC} $name"
            return 0
            ;;
        *)
            echo -e " ${RED}${NC} $name (HTTP $status)"
            return 1
            ;;
    esac
}
# Track statistics
LOADED=0
FAILED=0
# Load base ontologies from data/ontology/
echo ""
echo -e "${YELLOW}Loading base ontologies...${NC}"
# Turtle files
for file in "$PROJECT_ROOT/data/ontology"/*.ttl; do
if [ -f "$file" ]; then
if load_file "$file" "text/turtle"; then
LOADED=$((LOADED + 1))
else
FAILED=$((FAILED + 1))
fi
fi
done
# RDF/XML files (.rdf and .owl)
for file in "$PROJECT_ROOT/data/ontology"/*.rdf "$PROJECT_ROOT/data/ontology"/*.owl; do
if [ -f "$file" ]; then
if load_file "$file" "application/rdf+xml"; then
LOADED=$((LOADED + 1))
else
FAILED=$((FAILED + 1))
fi
fi
done
# JSON-LD files
for file in "$PROJECT_ROOT/data/ontology"/*.jsonld; do
if [ -f "$file" ]; then
if load_file "$file" "application/ld+json"; then
LOADED=$((LOADED + 1))
else
FAILED=$((FAILED + 1))
fi
fi
done
# Load generated RDF schemas
echo ""
echo -e "${YELLOW}Loading generated RDF schemas...${NC}"
for file in "$PROJECT_ROOT/schemas/20251121/rdf"/*.ttl; do
if [ -f "$file" ]; then
# Skip archived files
if [[ "$file" == *"archive_"* ]]; then
continue
fi
if load_file "$file" "text/turtle"; then
LOADED=$((LOADED + 1))
else
FAILED=$((FAILED + 1))
fi
fi
done
# Load NDE Heritage Custodian RDF files
if [ -d "$PROJECT_ROOT/data/nde/rdf" ]; then
echo ""
echo -e "${YELLOW}Loading NDE Heritage Custodian data...${NC}"
NDE_COUNT=0
NDE_TOTAL=$(ls -1 "$PROJECT_ROOT/data/nde/rdf"/*.ttl 2>/dev/null | wc -l | tr -d ' ')
for file in "$PROJECT_ROOT/data/nde/rdf"/*.ttl; do
if [ -f "$file" ]; then
NDE_COUNT=$((NDE_COUNT + 1))
# Show progress every 100 files
if [ $((NDE_COUNT % 100)) -eq 0 ]; then
echo -e " Loading NDE files: $NDE_COUNT / $NDE_TOTAL"
fi
if load_file "$file" "text/turtle" 2>/dev/null; then
LOADED=$((LOADED + 1))
else
FAILED=$((FAILED + 1))
fi
fi
done
echo -e " ${GREEN}Loaded $NDE_COUNT NDE heritage custodian files${NC}"
fi
# Get final triple count
echo ""
FINAL_COUNT=$(curl -s -X POST \
-H "Content-Type: application/sparql-query" \
-H "Accept: application/sparql-results+json" \
--data "SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }" \
"$OXIGRAPH_ENDPOINT/query" | jq -r ".results.bindings[0].count.value // \"0\"")
ADDED=$((FINAL_COUNT - INITIAL_COUNT))
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${GREEN}Data deployment complete${NC}"
echo -e " Files loaded: ${GREEN}$LOADED${NC}"
echo -e " Files failed: ${RED}$FAILED${NC}"
echo -e " Triples added: ${BLUE}$ADDED${NC}"
echo -e " Total triples: ${BLUE}$FINAL_COUNT${NC}"
fi
# Deploy frontend
@ -487,19 +585,6 @@ ENDSSH
echo -e "${GREEN}DuckLake API deployment complete${NC}"
fi
# Reload Oxigraph
if [ "$RELOAD_OXIGRAPH" = true ]; then
echo ""
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${BLUE} Reloading Oxigraph Data${NC}"
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
ssh -o StrictHostKeyChecking=no "$SERVER_USER@$SERVER_IP" \
"/var/lib/glam/scripts/load-ontologies.sh"
echo -e "${GREEN}Oxigraph reload complete${NC}"
fi
# Final status
echo ""
echo -e "${BLUE}════════════════════════════════════════════════════════════════${NC}"

90
scripts/fix_yaml_history.py Executable file
View file

@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""
Fix malformed ghcid_history entries in YAML files.
The issue: Some files have multiple history entries concatenated on a single line:
reason: Migrated from CH to EG namespace - Assiut - ghcid: XX-XX-XXX-L-AUL
This should be split into separate list items.
"""
import os
import re
import sys
from pathlib import Path
# List-item markers that show up fused onto the end of a ``reason:`` line.
_EMBEDDED_MARKERS = (' - ghcid:', ' - valid_from:')


def fix_yaml_content(content: str) -> str:
    """Fix malformed ghcid_history entries in a YAML document.

    Splits lines where a new history list item (``- ghcid:`` or
    ``- valid_from:``) was concatenated onto the end of a ``reason:`` line,
    e.g.::

        reason: Migrated from CH to EG namespace - ghcid: XX-XX-XXX-L-AUL

    becomes two lines, the second indented like the first.

    Unlike a single ``str.split``, each produced tail line is re-examined,
    so several entries fused onto one physical line are all separated.
    Markers are only searched *after* the ``reason:`` key, so a line that
    legitimately starts with ``- ghcid:`` is never split at its own head.

    Args:
        content: Full YAML document text.

    Returns:
        The repaired document text (unchanged if nothing was malformed).
    """
    fixed_lines: list[str] = []
    for line in content.split('\n'):
        # Repeatedly peel fused list items off the end of a reason: line.
        while True:
            reason_pos = line.find('reason:')
            if reason_pos == -1:
                break
            hits = [(pos, marker)
                    for marker in _EMBEDDED_MARKERS
                    if (pos := line.find(marker, reason_pos)) != -1]
            if not hits:
                break
            pos, marker = min(hits)
            indent = re.match(r'^(\s*)', line).group(1)
            fixed_lines.append(line[:pos].rstrip())
            # marker[1:] drops the leading space: " - ghcid:" -> "- ghcid:".
            line = f"{indent}{marker[1:]}{line[pos + len(marker):]}"
        fixed_lines.append(line)
    return '\n'.join(fixed_lines)
def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """Repair one YAML file in place; return True when its content changed.

    In dry-run mode the file is left untouched and only reported.
    """
    original = filepath.read_text(encoding='utf-8')
    fixed = fix_yaml_content(original)
    if fixed == original:
        return False
    if dry_run:
        print(f"Would fix: {filepath.name}")
    else:
        filepath.write_text(fixed, encoding='utf-8')
        print(f"Fixed: {filepath.name}")
    return True
def main():
    """Run the fixer over BE-/EG-prefixed custodian YAML files."""
    dry_run = '--dry-run' in sys.argv
    custodian_dir = Path('data/custodian')
    if not custodian_dir.exists():
        print("Error: data/custodian directory not found")
        sys.exit(1)
    # Only the BE- and EG- files are known to carry the corruption.
    fixed_count = 0
    for prefix in ('BE-', 'EG-'):
        for yaml_file in sorted(custodian_dir.glob(f'{prefix}*.yaml')):
            fixed_count += int(process_file(yaml_file, dry_run))
    print(f"\n{'Would fix' if dry_run else 'Fixed'}: {fixed_count} files")


if __name__ == '__main__':
    main()

View file

@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""
Fix malformed ghcid_history entries in YAML files.
Version 2: More robust parsing and reconstruction.
"""
import re
import sys
from pathlib import Path
def fix_ghcid_history_section(content: str) -> str:
    """Rebuild the ``ghcid_history`` section of a YAML document.

    Parses the (possibly mangled) history entries line by line, deduplicates
    them by ``(ghcid, valid_from)``, and re-emits the section with canonical
    two-space list indentation. Content before and after the section is left
    untouched.

    Args:
        content: Full YAML document text.

    Returns:
        The document with a normalised ghcid_history section, or the input
        unchanged when no such section exists.
    """
    # Anchor at line start so the captured group is the key's indentation
    # only — a bare `(\s*)` would greedily swallow the preceding newline
    # into base_indent and corrupt every rebuilt line.
    history_match = re.search(r'^([ \t]*)ghcid_history:[ \t]*\n', content, re.MULTILINE)
    if not history_match:
        return content  # No ghcid_history section

    base_indent = history_match.group(1)
    list_indent = base_indent + "  "
    item_indent = list_indent + "  "

    # The section ends at the next key at the same indent level.
    remaining = content[history_match.end():]
    end_match = re.compile(rf'^{base_indent}[a-z_]+:', re.MULTILINE).search(remaining)
    if end_match:
        history_section = remaining[:end_match.start()]
        after_section = remaining[end_match.start():]
    else:
        history_section = remaining
        after_section = ""

    # Parse entries line by line; entries may be concatenated or split
    # incorrectly, so boundaries are detected from "- ghcid:" / "- key:".
    entries = []
    current_entry = {}
    for line in history_section.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith('- ghcid:') or (stripped.startswith('ghcid:') and not current_entry):
            # Start of a new entry.
            if current_entry:
                entries.append(current_entry)
            value = stripped.replace('- ghcid:', '').replace('ghcid:', '').strip()
            current_entry = {'ghcid': value}
        elif stripped.startswith('- ') and ':' in stripped[2:]:
            # A new entry that starts with some key other than ghcid.
            if current_entry:
                entries.append(current_entry)
            key, val = stripped[2:].split(':', 1)
            current_entry = {key.strip(): val.strip().strip("'\"")}
        elif ':' in stripped:
            # Key/value pair belonging to the current entry.
            key, val = stripped.split(':', 1)
            key = key.strip().replace('- ', '')
            val = val.strip().strip("'\"")
            if key and val:
                current_entry[key] = val
    if current_entry:
        entries.append(current_entry)

    # Deduplicate by (ghcid, valid_from); drop entries with no ghcid at all.
    seen = set()
    unique_entries = []
    for entry in entries:
        key = (entry.get('ghcid', ''), entry.get('valid_from', ''))
        if key not in seen and entry.get('ghcid'):
            seen.add(key)
            unique_entries.append(entry)

    # Re-emit the section with canonical formatting.
    new_history = f"{base_indent}ghcid_history:\n"
    for entry in unique_entries:
        new_history += f"{list_indent}- ghcid: {entry.get('ghcid', '')}\n"
        if 'valid_from' in entry:
            new_history += f"{item_indent}valid_from: '{entry['valid_from']}'\n"
        if 'ghcid_numeric' in entry:
            new_history += f"{item_indent}ghcid_numeric: {entry['ghcid_numeric']}\n"
        if 'reason' in entry:
            reason = entry['reason']
            # Quote reasons containing colons so the YAML stays parseable.
            if ':' in reason and not reason.startswith(('"', "'")):
                reason = f'"{reason}"'
            new_history += f"{item_indent}reason: {reason}\n"

    # new_history already carries the "ghcid_history:" header line, so the
    # matched header must NOT be re-appended here — doing so duplicated the
    # key in the rebuilt file.
    return content[:history_match.start()] + new_history + after_section
def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """Repair one YAML file in place; return True when its content changed.

    In dry-run mode the file is left untouched and only reported.
    """
    before = filepath.read_text(encoding='utf-8')
    after = fix_ghcid_history_section(before)
    if after == before:
        return False
    if dry_run:
        print(f"Would fix: {filepath.name}")
    else:
        filepath.write_text(after, encoding='utf-8')
        print(f"Fixed: {filepath.name}")
    return True
def main():
    """Run the v2 fixer over BE-/EG-prefixed custodian YAML files."""
    dry_run = '--dry-run' in sys.argv
    custodian_dir = Path('data/custodian')
    # Only the BE- and EG- files are known to carry the corruption.
    targets = [yaml_file
               for prefix in ('BE-', 'EG-')
               for yaml_file in sorted(custodian_dir.glob(f'{prefix}*.yaml'))]
    fixed_count = sum(1 for yaml_file in targets if process_file(yaml_file, dry_run))
    print(f"\n{'Would fix' if dry_run else 'Fixed'}: {fixed_count} files")


if __name__ == '__main__':
    main()

View file

@ -104,12 +104,47 @@ def extract_top_level_fields(data: dict) -> dict:
# Extract original entry
original = data.get("original_entry", {})
if original:
record["org_name"] = original.get("organisatie", "")
org_types = original.get("type", [])
record["org_type"] = ",".join(org_types) if isinstance(org_types, list) else str(org_types)
# Try multiple field names for organization name
record["org_name"] = original.get("name", "") or original.get("organisatie", "")
# Try multiple field names for institution type
# First try 'institution_type' (CH-Annotator format), then 'type' (older format)
inst_type = original.get("institution_type", "") or original.get("type", "")
if isinstance(inst_type, list):
inst_type = ",".join(inst_type)
else:
inst_type = str(inst_type) if inst_type else ""
# Normalize institution type names to standard codes
type_normalize = {
"OFFICIAL_INSTITUTION": "OFFICIAL",
"RESEARCH_CENTER": "RESEARCH",
"BOTANICAL_ZOO": "BOTANICAL",
"EDUCATION_PROVIDER": "EDUCATION",
"COLLECTING_SOCIETY": "SOCIETY",
"INTANGIBLE_HERITAGE_GROUP": "INTANGIBLE",
}
record["org_type"] = type_normalize.get(inst_type.upper(), inst_type.upper()) if inst_type else ""
record["wikidata_id"] = original.get("wikidata_id", "")
record["original_entry_json"] = json.dumps(original, ensure_ascii=False, default=str)
# Fallback: Extract org_type from GHCID code (4th component)
# GHCID format: CC-RR-CCC-T-ABBREV where T is the type code
if not record["org_type"] and record.get("ghcid_current"):
ghcid_parts = record["ghcid_current"].split("-")
if len(ghcid_parts) >= 4:
type_code = ghcid_parts[3]
# Map single-letter codes to full type names
type_map = {
"G": "GALLERY", "L": "LIBRARY", "A": "ARCHIVE", "M": "MUSEUM",
"O": "OFFICIAL", "R": "RESEARCH", "C": "CORPORATION", "U": "UNKNOWN",
"B": "BOTANICAL", "E": "EDUCATION", "S": "SOCIETY", "F": "FEATURES",
"I": "INTANGIBLE", "X": "MIXED", "P": "PERSONAL", "H": "HOLY_SITES",
"D": "DIGITAL", "N": "NGO", "T": "TASTE_SMELL"
}
record["org_type"] = type_map.get(type_code, type_code)
# Extract Google Maps data
gm = data.get("google_maps_enrichment", {})
if gm:

View file

@ -211,6 +211,7 @@ def build_ducklake_database(mapping: Dict[int, str]):
recognition_confidence FLOAT,
linking_confidence FLOAT,
wikidata_id VARCHAR,
source_page VARCHAR,
FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid)
)
""")
@ -224,9 +225,12 @@ def build_ducklake_database(mapping: Dict[int, str]):
claim_id_counter = 0
web_folders = get_web_archive_folders()
logger.info(f"Processing {len(web_folders)} web archive folders for DuckLake...")
total_folders = len(web_folders)
logger.info(f"Processing {total_folders} web archive folders for DuckLake...")
for folder in web_folders:
for idx, folder in enumerate(web_folders):
if idx % 100 == 0:
logger.info(f"Progress: {idx}/{total_folders} folders processed ({idx*100//total_folders}%)")
entry_index = int(folder.name)
ghcid = mapping.get(entry_index)
@ -321,6 +325,11 @@ def build_ducklake_database(mapping: Dict[int, str]):
with open(annotations_path, 'r', encoding='utf-8') as f:
annotations = yaml.safe_load(f)
# Get source page from html_file
html_file = annotations.get('html_file', '')
# Extract just the filename part (e.g., "pages/index.html" -> "index.html")
source_page = html_file.split('/')[-1] if html_file else 'index.html'
session = annotations.get('session', {})
claims = session.get('claims', {})
@ -329,7 +338,7 @@ def build_ducklake_database(mapping: Dict[int, str]):
claim_id_counter += 1
provenance = claim.get('provenance', {})
con.execute("""
INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", [
claim_id_counter,
ghcid,
@ -342,7 +351,8 @@ def build_ducklake_database(mapping: Dict[int, str]):
provenance.get('path'),
claim.get('recognition_confidence', 0),
claim.get('linking_confidence', 0),
claim.get('wikidata_id')
claim.get('wikidata_id'),
source_page
])
# Process aggregate claims
@ -350,7 +360,7 @@ def build_ducklake_database(mapping: Dict[int, str]):
claim_id_counter += 1
provenance = claim.get('provenance', {})
con.execute("""
INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", [
claim_id_counter,
ghcid,
@ -363,7 +373,8 @@ def build_ducklake_database(mapping: Dict[int, str]):
provenance.get('path'),
provenance.get('confidence', 0),
0,
None
None,
source_page
])
except Exception as e:
logger.debug(f"Error processing annotations for {ghcid}: {e}")
@ -373,6 +384,7 @@ def build_ducklake_database(mapping: Dict[int, str]):
con.execute("CREATE INDEX IF NOT EXISTS idx_claims_ghcid ON web_claims(ghcid)")
con.execute("CREATE INDEX IF NOT EXISTS idx_claims_type ON web_claims(claim_type)")
con.execute("CREATE INDEX IF NOT EXISTS idx_claims_hypernym ON web_claims(hypernym)")
con.execute("CREATE INDEX IF NOT EXISTS idx_claims_source_page ON web_claims(source_page)")
# Get stats
archive_count = con.execute("SELECT COUNT(*) FROM web_archives").fetchone()[0]