chore: add YAML history fix scripts and update ducklake/deploy tooling
- Add fix_yaml_history.py and fix_yaml_history_v2.py for cleaning up malformed ghcid_history entries with duplicate/redundant data - Update load_custodians_to_ducklake.py for DuckDB lakehouse loading - Update migrate_web_archives.py for web archive management - Update deploy.sh with improvements - Ignore entire data/ducklake/ directory (generated databases)
This commit is contained in:
parent
7f85238f67
commit
90a1f20271
6 changed files with 456 additions and 79 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -109,3 +109,4 @@ infrastructure/terraform/*.tfstate*
|
|||
# DuckDB lakehouse databases
|
||||
data/ducklake/*.duckdb
|
||||
data/ducklake/*.duckdb.wal
|
||||
data/ducklake/
|
||||
|
|
|
|||
|
|
@ -5,13 +5,13 @@
|
|||
# Usage: ./deploy.sh [options]
|
||||
# Options:
|
||||
# --infra Deploy infrastructure changes (Terraform)
|
||||
# --data Deploy ontology/schema data
|
||||
# --data Deploy ontologies directly to Oxigraph (no intermediate storage)
|
||||
# --frontend Build and deploy frontend
|
||||
# --api Deploy FastAPI backend (DSPy SPARQL generation)
|
||||
# --ducklake Deploy DuckLake API backend
|
||||
# --reload Reload data into Oxigraph
|
||||
# --all Deploy everything
|
||||
# --status Check server status only
|
||||
# --clear Clear Oxigraph store before loading (use with --data)
|
||||
|
||||
set -e
|
||||
|
||||
|
|
@ -34,7 +34,7 @@ fi
|
|||
HCLOUD_TOKEN="${HETZNER_HC_API_TOKEN:-}"
|
||||
SERVER_NAME="glam-sparql"
|
||||
SERVER_USER="root"
|
||||
REMOTE_DATA_DIR="/mnt/data"
|
||||
REMOTE_DATA_DIR="/mnt/data" # Used for LinkML/UML static files (if needed in future)
|
||||
|
||||
# Check for required token
|
||||
if [ -z "$HCLOUD_TOKEN" ]; then
|
||||
|
|
@ -50,11 +50,11 @@ DEPLOY_DATA=false
|
|||
DEPLOY_FRONTEND=false
|
||||
DEPLOY_API=false
|
||||
DEPLOY_DUCKLAKE=false
|
||||
RELOAD_OXIGRAPH=false
|
||||
CLEAR_OXIGRAPH=false
|
||||
STATUS_ONLY=false
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 [--infra] [--data] [--frontend] [--api] [--ducklake] [--reload] [--all] [--status]"
|
||||
echo "Usage: $0 [--infra] [--data] [--frontend] [--api] [--ducklake] [--all] [--status] [--clear]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
|
@ -75,8 +75,8 @@ for arg in "$@"; do
|
|||
--ducklake)
|
||||
DEPLOY_DUCKLAKE=true
|
||||
;;
|
||||
--reload)
|
||||
RELOAD_OXIGRAPH=true
|
||||
--clear)
|
||||
CLEAR_OXIGRAPH=true
|
||||
;;
|
||||
--all)
|
||||
DEPLOY_INFRA=true
|
||||
|
|
@ -84,7 +84,6 @@ for arg in "$@"; do
|
|||
DEPLOY_FRONTEND=true
|
||||
DEPLOY_API=true
|
||||
DEPLOY_DUCKLAKE=true
|
||||
RELOAD_OXIGRAPH=true
|
||||
;;
|
||||
--status)
|
||||
STATUS_ONLY=true
|
||||
|
|
@ -236,69 +235,168 @@ if [ -z "$SERVER_IP" ]; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
# Wait for SSH
|
||||
if [ "$DEPLOY_DATA" = true ] || [ "$DEPLOY_FRONTEND" = true ] || [ "$DEPLOY_API" = true ] || [ "$DEPLOY_DUCKLAKE" = true ] || [ "$RELOAD_OXIGRAPH" = true ]; then
|
||||
# Wait for SSH (only needed for frontend, API, DuckLake - not for --data which uses HTTP directly)
|
||||
if [ "$DEPLOY_FRONTEND" = true ] || [ "$DEPLOY_API" = true ] || [ "$DEPLOY_DUCKLAKE" = true ]; then
|
||||
wait_for_ssh "$SERVER_IP"
|
||||
fi
|
||||
|
||||
# Deploy data
|
||||
# Deploy data directly to Oxigraph
|
||||
if [ "$DEPLOY_DATA" = true ]; then
|
||||
echo ""
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE} Deploying Ontology & Schema Data${NC}"
|
||||
echo -e "${BLUE} Deploying Ontologies Directly to Oxigraph${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
|
||||
# Ensure remote directories exist
|
||||
ssh -o StrictHostKeyChecking=no "$SERVER_USER@$SERVER_IP" \
|
||||
"mkdir -p $REMOTE_DATA_DIR/{ontologies,rdf,linkml,uml}"
|
||||
OXIGRAPH_ENDPOINT="http://$SERVER_IP:7878"
|
||||
|
||||
# Sync ontologies
|
||||
echo -e "${YELLOW}Syncing ontology files...${NC}"
|
||||
rsync -avz --progress \
|
||||
-e "ssh -o StrictHostKeyChecking=no" \
|
||||
--include="*.ttl" --include="*.rdf" --include="*.owl" --include="*.jsonld" \
|
||||
--exclude="*" \
|
||||
"$PROJECT_ROOT/data/ontology/" \
|
||||
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/ontologies/"
|
||||
# Check Oxigraph is running
|
||||
echo -e "${YELLOW}Checking Oxigraph status...${NC}"
|
||||
if ! curl -s --connect-timeout 5 "$OXIGRAPH_ENDPOINT/" > /dev/null 2>&1; then
|
||||
echo -e "${RED}Error: Oxigraph not responding at $OXIGRAPH_ENDPOINT${NC}"
|
||||
echo "Ensure Oxigraph is running on the server."
|
||||
exit 1
|
||||
fi
|
||||
echo -e "${GREEN}Oxigraph is running${NC}"
|
||||
|
||||
# Sync RDF schemas
|
||||
echo -e "${YELLOW}Syncing RDF schema files...${NC}"
|
||||
rsync -avz --progress \
|
||||
-e "ssh -o StrictHostKeyChecking=no" \
|
||||
--exclude="archive_*" \
|
||||
"$PROJECT_ROOT/schemas/20251121/rdf/" \
|
||||
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/rdf/"
|
||||
# Get initial triple count
|
||||
INITIAL_COUNT=$(curl -s -X POST \
|
||||
-H "Content-Type: application/sparql-query" \
|
||||
-H "Accept: application/sparql-results+json" \
|
||||
--data "SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }" \
|
||||
"$OXIGRAPH_ENDPOINT/query" | jq -r ".results.bindings[0].count.value // \"0\"")
|
||||
echo -e "${BLUE}Initial triple count: $INITIAL_COUNT${NC}"
|
||||
|
||||
# Sync LinkML schemas
|
||||
echo -e "${YELLOW}Syncing LinkML schemas...${NC}"
|
||||
rsync -avz --progress \
|
||||
-e "ssh -o StrictHostKeyChecking=no" \
|
||||
--include="*.yaml" --include="*/" --exclude="*" \
|
||||
"$PROJECT_ROOT/schemas/20251121/linkml/" \
|
||||
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/linkml/"
|
||||
|
||||
# Sync UML diagrams
|
||||
echo -e "${YELLOW}Syncing UML diagrams...${NC}"
|
||||
rsync -avz --progress \
|
||||
-e "ssh -o StrictHostKeyChecking=no" \
|
||||
--include="*.mmd" --exclude="*" \
|
||||
"$PROJECT_ROOT/schemas/20251121/uml/mermaid/" \
|
||||
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/uml/"
|
||||
|
||||
# Sync NDE Heritage Custodian RDF files
|
||||
if [ -d "$PROJECT_ROOT/data/nde/rdf" ]; then
|
||||
echo -e "${YELLOW}Syncing NDE Heritage Custodian RDF files...${NC}"
|
||||
ssh -o StrictHostKeyChecking=no "$SERVER_USER@$SERVER_IP" \
|
||||
"mkdir -p $REMOTE_DATA_DIR/nde/rdf"
|
||||
rsync -avz --progress \
|
||||
-e "ssh -o StrictHostKeyChecking=no" \
|
||||
--include="*.ttl" --exclude="*" \
|
||||
"$PROJECT_ROOT/data/nde/rdf/" \
|
||||
"$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/nde/rdf/"
|
||||
echo -e "${GREEN}NDE RDF: $(ls -1 $PROJECT_ROOT/data/nde/rdf/*.ttl 2>/dev/null | wc -l) files synced${NC}"
|
||||
# Clear store if requested
|
||||
if [ "$CLEAR_OXIGRAPH" = true ]; then
|
||||
echo -e "${YELLOW}Clearing Oxigraph store...${NC}"
|
||||
curl -s -X POST \
|
||||
-H "Content-Type: application/sparql-update" \
|
||||
--data "CLEAR ALL" \
|
||||
"$OXIGRAPH_ENDPOINT/update" > /dev/null
|
||||
echo -e "${GREEN}Store cleared${NC}"
|
||||
fi
|
||||
|
||||
# Function to load a file to Oxigraph
|
||||
load_file() {
|
||||
local file="$1"
|
||||
local content_type="$2"
|
||||
local filename=$(basename "$file")
|
||||
|
||||
local http_code=$(curl -s -X POST \
|
||||
-H "Content-Type: $content_type" \
|
||||
--data-binary "@$file" \
|
||||
-w "%{http_code}" \
|
||||
-o /dev/null \
|
||||
"$OXIGRAPH_ENDPOINT/store?default")
|
||||
|
||||
if [ "$http_code" = "204" ] || [ "$http_code" = "200" ]; then
|
||||
echo -e " ${GREEN}✓${NC} $filename"
|
||||
return 0
|
||||
else
|
||||
echo -e " ${RED}✗${NC} $filename (HTTP $http_code)"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Track statistics
|
||||
LOADED=0
|
||||
FAILED=0
|
||||
|
||||
# Load base ontologies from data/ontology/
|
||||
echo ""
|
||||
echo -e "${YELLOW}Loading base ontologies...${NC}"
|
||||
|
||||
# Turtle files
|
||||
for file in "$PROJECT_ROOT/data/ontology"/*.ttl; do
|
||||
if [ -f "$file" ]; then
|
||||
if load_file "$file" "text/turtle"; then
|
||||
LOADED=$((LOADED + 1))
|
||||
else
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# RDF/XML files (.rdf and .owl)
|
||||
for file in "$PROJECT_ROOT/data/ontology"/*.rdf "$PROJECT_ROOT/data/ontology"/*.owl; do
|
||||
if [ -f "$file" ]; then
|
||||
if load_file "$file" "application/rdf+xml"; then
|
||||
LOADED=$((LOADED + 1))
|
||||
else
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# JSON-LD files
|
||||
for file in "$PROJECT_ROOT/data/ontology"/*.jsonld; do
|
||||
if [ -f "$file" ]; then
|
||||
if load_file "$file" "application/ld+json"; then
|
||||
LOADED=$((LOADED + 1))
|
||||
else
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# Load generated RDF schemas
|
||||
echo ""
|
||||
echo -e "${YELLOW}Loading generated RDF schemas...${NC}"
|
||||
|
||||
for file in "$PROJECT_ROOT/schemas/20251121/rdf"/*.ttl; do
|
||||
if [ -f "$file" ]; then
|
||||
# Skip archived files
|
||||
if [[ "$file" == *"archive_"* ]]; then
|
||||
continue
|
||||
fi
|
||||
if load_file "$file" "text/turtle"; then
|
||||
LOADED=$((LOADED + 1))
|
||||
else
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# Load NDE Heritage Custodian RDF files
|
||||
if [ -d "$PROJECT_ROOT/data/nde/rdf" ]; then
|
||||
echo ""
|
||||
echo -e "${YELLOW}Loading NDE Heritage Custodian data...${NC}"
|
||||
NDE_COUNT=0
|
||||
NDE_TOTAL=$(ls -1 "$PROJECT_ROOT/data/nde/rdf"/*.ttl 2>/dev/null | wc -l | tr -d ' ')
|
||||
|
||||
for file in "$PROJECT_ROOT/data/nde/rdf"/*.ttl; do
|
||||
if [ -f "$file" ]; then
|
||||
NDE_COUNT=$((NDE_COUNT + 1))
|
||||
# Show progress every 100 files
|
||||
if [ $((NDE_COUNT % 100)) -eq 0 ]; then
|
||||
echo -e " Loading NDE files: $NDE_COUNT / $NDE_TOTAL"
|
||||
fi
|
||||
if load_file "$file" "text/turtle" 2>/dev/null; then
|
||||
LOADED=$((LOADED + 1))
|
||||
else
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
fi
|
||||
done
|
||||
echo -e " ${GREEN}Loaded $NDE_COUNT NDE heritage custodian files${NC}"
|
||||
fi
|
||||
|
||||
# Get final triple count
|
||||
echo ""
|
||||
FINAL_COUNT=$(curl -s -X POST \
|
||||
-H "Content-Type: application/sparql-query" \
|
||||
-H "Accept: application/sparql-results+json" \
|
||||
--data "SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }" \
|
||||
"$OXIGRAPH_ENDPOINT/query" | jq -r ".results.bindings[0].count.value // \"0\"")
|
||||
|
||||
ADDED=$((FINAL_COUNT - INITIAL_COUNT))
|
||||
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${GREEN}Data deployment complete${NC}"
|
||||
echo -e " Files loaded: ${GREEN}$LOADED${NC}"
|
||||
echo -e " Files failed: ${RED}$FAILED${NC}"
|
||||
echo -e " Triples added: ${BLUE}$ADDED${NC}"
|
||||
echo -e " Total triples: ${BLUE}$FINAL_COUNT${NC}"
|
||||
fi
|
||||
|
||||
# Deploy frontend
|
||||
|
|
@ -487,19 +585,6 @@ ENDSSH
|
|||
echo -e "${GREEN}DuckLake API deployment complete${NC}"
|
||||
fi
|
||||
|
||||
# Reload Oxigraph
|
||||
if [ "$RELOAD_OXIGRAPH" = true ]; then
|
||||
echo ""
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE} Reloading Oxigraph Data${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
|
||||
ssh -o StrictHostKeyChecking=no "$SERVER_USER@$SERVER_IP" \
|
||||
"/var/lib/glam/scripts/load-ontologies.sh"
|
||||
|
||||
echo -e "${GREEN}Oxigraph reload complete${NC}"
|
||||
fi
|
||||
|
||||
# Final status
|
||||
echo ""
|
||||
echo -e "${BLUE}════════════════════════════════════════════════════════════════${NC}"
|
||||
|
|
|
|||
90
scripts/fix_yaml_history.py
Executable file
90
scripts/fix_yaml_history.py
Executable file
|
|
@ -0,0 +1,90 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fix malformed ghcid_history entries in YAML files.
|
||||
|
||||
The issue: Some files have multiple history entries concatenated on a single line:
|
||||
reason: Migrated from CH to EG namespace - Assiut - ghcid: XX-XX-XXX-L-AUL
|
||||
|
||||
This should be split into separate list items.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
def fix_yaml_content(content: str) -> str:
    """Split malformed ghcid_history lines into separate list items.

    Some ``reason:`` lines have the next history entry concatenated onto
    the same line, e.g.::

        reason: Migrated from CH to EG namespace - Assiut - ghcid: XX-XX-XXX-L-AUL

    Each such line is split at the first embedded marker into the
    truncated reason line plus a new ``- ghcid:`` / ``- valid_from:``
    list item.  All other lines pass through unchanged.

    :param content: full text of a YAML file
    :return: the text with embedded entries split onto their own lines
    """
    # Markers are checked in this order; ' - ghcid:' wins if both occur.
    markers = (' - ghcid:', ' - valid_from:')
    fixed_lines = []

    for line in content.split('\n'):
        marker = next(
            (m for m in markers if 'reason:' in line and m in line),
            None,
        )
        if marker is None:
            fixed_lines.append(line)
            continue

        head, tail = line.split(marker, 1)
        fixed_lines.append(head.rstrip())
        # NOTE(review): the new "- " item reuses the reason line's own
        # indentation; if the list marker belongs one level up, adjust
        # here -- preserving the original behaviour for now.
        indent = re.match(r'^(\s*)', line).group(1)
        # marker[1:] drops the leading space, keeping "- ghcid:" etc.
        fixed_lines.append(f"{indent}{marker[1:]}{tail}")

    return '\n'.join(fixed_lines)
|
||||
|
||||
|
||||
def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """Apply the history fix to one YAML file.

    Returns True when the file's content would change.  Unless *dry_run*
    is set, the fixed content is written back in place.
    """
    original = filepath.read_text(encoding='utf-8')
    fixed = fix_yaml_content(original)

    if fixed == original:
        return False

    if dry_run:
        print(f"Would fix: {filepath.name}")
    else:
        filepath.write_text(fixed, encoding='utf-8')
        print(f"Fixed: {filepath.name}")
    return True
|
||||
|
||||
|
||||
def main():
    """CLI entry point: fix BE-/EG- custodian YAML files in place.

    Pass ``--dry-run`` to report which files would change without
    rewriting them.
    """
    dry_run = '--dry-run' in sys.argv

    custodian_dir = Path('data/custodian')
    if not custodian_dir.exists():
        print("Error: data/custodian directory not found")
        sys.exit(1)

    # Only BE- and EG- prefixed files are known to contain the defect.
    targets = (
        yaml_file
        for prefix in ('BE-', 'EG-')
        for yaml_file in sorted(custodian_dir.glob(f'{prefix}*.yaml'))
    )
    fixed_count = sum(1 for yaml_file in targets if process_file(yaml_file, dry_run))

    verb = 'Would fix' if dry_run else 'Fixed'
    print(f"\n{verb}: {fixed_count} files")


if __name__ == '__main__':
    main()
|
||||
154
scripts/fix_yaml_history_v2.py
Normal file
154
scripts/fix_yaml_history_v2.py
Normal file
|
|
@ -0,0 +1,154 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fix malformed ghcid_history entries in YAML files.
|
||||
Version 2: More robust parsing and reconstruction.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def fix_ghcid_history_section(content: str) -> str:
    """Rebuild the ``ghcid_history`` section of a custodian YAML file.

    Entries that were concatenated onto one line or otherwise mangled are
    re-parsed key by key, deduplicated on (ghcid, valid_from), and emitted
    as a clean, consistently indented YAML list.  Content before and after
    the section is left untouched.

    Fixes vs. the first draft of this function:
    - the section regex is anchored to a line start so the captured
      indent is pure indentation (a bare ``\\s*`` also swallowed the
      preceding newline, corrupting every rendered line), and
    - the ``ghcid_history:`` header is emitted exactly once (it was
      previously duplicated: once in ``before_section`` and again at the
      start of the rebuilt section).
    """
    history_match = re.search(r'^([ \t]*)ghcid_history:[ \t]*\n', content, re.MULTILINE)
    if not history_match:
        return content  # No ghcid_history section

    base_indent = history_match.group(1)
    list_indent = base_indent + "  "
    item_indent = list_indent + "  "

    # The section ends at the next key at the same indentation level.
    remaining = content[history_match.end():]
    end_match = re.search(rf'^{base_indent}[a-z_]+:', remaining, re.MULTILINE)
    if end_match:
        history_section = remaining[:end_match.start()]
        after_section = remaining[end_match.start():]
    else:
        history_section = remaining
        after_section = ""

    entries = _parse_history_entries(history_section)

    # Deduplicate on (ghcid, valid_from); drop entries with no ghcid at all.
    seen = set()
    unique_entries = []
    for entry in entries:
        key = (entry.get('ghcid', ''), entry.get('valid_from', ''))
        if key not in seen and entry.get('ghcid'):
            seen.add(key)
            unique_entries.append(entry)

    new_history = _render_history_entries(unique_entries, list_indent, item_indent)

    # Everything up to and including a single normalized header line.
    before_section = (
        content[:history_match.start()] + history_match.group(0).rstrip() + '\n'
    )
    return before_section + new_history + after_section


def _parse_history_entries(history_section: str) -> list:
    """Extract key/value dicts from a (possibly mangled) section body."""
    entries = []
    current_entry = {}

    for line in history_section.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue

        if stripped.startswith('- ghcid:') or (stripped.startswith('ghcid:') and not current_entry):
            # A ghcid key starts a new entry.
            if current_entry:
                entries.append(current_entry)
            value = stripped.replace('- ghcid:', '').replace('ghcid:', '').strip()
            current_entry = {'ghcid': value}
        elif stripped.startswith('- ') and ':' in stripped[2:]:
            # A list marker with a different key also starts a new entry.
            if current_entry:
                entries.append(current_entry)
            key, val = stripped[2:].split(':', 1)
            current_entry = {key.strip(): val.strip().strip("'\"")}
        elif ':' in stripped:
            # Plain key/value pair belonging to the current entry.
            key, val = stripped.split(':', 1)
            key = key.strip().replace('- ', '')
            val = val.strip().strip("'\"")
            if key and val:
                current_entry[key] = val

    if current_entry:
        entries.append(current_entry)
    return entries


def _render_history_entries(entries: list, list_indent: str, item_indent: str) -> str:
    """Emit entries as YAML list items (the section header is NOT included)."""
    out = []
    for entry in entries:
        out.append(f"{list_indent}- ghcid: {entry.get('ghcid', '')}\n")
        if 'valid_from' in entry:
            out.append(f"{item_indent}valid_from: '{entry['valid_from']}'\n")
        if 'ghcid_numeric' in entry:
            out.append(f"{item_indent}ghcid_numeric: {entry['ghcid_numeric']}\n")
        if 'reason' in entry:
            reason = entry['reason']
            # Quote reasons containing colons so the YAML stays parseable.
            if ':' in reason and not reason.startswith(('"', "'")):
                reason = f'"{reason}"'
            out.append(f"{item_indent}reason: {reason}\n")
    return ''.join(out)
|
||||
|
||||
|
||||
def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """Fix the ghcid_history section of one YAML file.

    Returns True when the file's content would change.  Unless *dry_run*
    is set, the fixed content is written back in place.
    """
    original = filepath.read_text(encoding='utf-8')
    fixed = fix_ghcid_history_section(original)

    if fixed == original:
        return False

    if dry_run:
        print(f"Would fix: {filepath.name}")
    else:
        filepath.write_text(fixed, encoding='utf-8')
        print(f"Fixed: {filepath.name}")
    return True
|
||||
|
||||
|
||||
def main():
    """CLI entry point: fix BE-/EG- custodian YAML files in place.

    Pass ``--dry-run`` to report which files would change without
    rewriting them.  Exits with status 1 if ``data/custodian`` does not
    exist (previously this silently reported "Fixed: 0 files" when the
    script was run from the wrong directory; v1 of this script already
    guards against that).
    """
    dry_run = '--dry-run' in sys.argv

    custodian_dir = Path('data/custodian')
    if not custodian_dir.exists():
        print("Error: data/custodian directory not found")
        sys.exit(1)

    fixed_count = 0

    # Only BE- and EG- prefixed files are known to contain the defect.
    for prefix in ['BE-', 'EG-']:
        for yaml_file in sorted(custodian_dir.glob(f'{prefix}*.yaml')):
            if process_file(yaml_file, dry_run):
                fixed_count += 1

    print(f"\n{'Would fix' if dry_run else 'Fixed'}: {fixed_count} files")


if __name__ == '__main__':
    main()
|
||||
|
|
@ -104,12 +104,47 @@ def extract_top_level_fields(data: dict) -> dict:
|
|||
# Extract original entry
|
||||
original = data.get("original_entry", {})
|
||||
if original:
|
||||
record["org_name"] = original.get("organisatie", "")
|
||||
org_types = original.get("type", [])
|
||||
record["org_type"] = ",".join(org_types) if isinstance(org_types, list) else str(org_types)
|
||||
# Try multiple field names for organization name
|
||||
record["org_name"] = original.get("name", "") or original.get("organisatie", "")
|
||||
|
||||
# Try multiple field names for institution type
|
||||
# First try 'institution_type' (CH-Annotator format), then 'type' (older format)
|
||||
inst_type = original.get("institution_type", "") or original.get("type", "")
|
||||
if isinstance(inst_type, list):
|
||||
inst_type = ",".join(inst_type)
|
||||
else:
|
||||
inst_type = str(inst_type) if inst_type else ""
|
||||
|
||||
# Normalize institution type names to standard codes
|
||||
type_normalize = {
|
||||
"OFFICIAL_INSTITUTION": "OFFICIAL",
|
||||
"RESEARCH_CENTER": "RESEARCH",
|
||||
"BOTANICAL_ZOO": "BOTANICAL",
|
||||
"EDUCATION_PROVIDER": "EDUCATION",
|
||||
"COLLECTING_SOCIETY": "SOCIETY",
|
||||
"INTANGIBLE_HERITAGE_GROUP": "INTANGIBLE",
|
||||
}
|
||||
record["org_type"] = type_normalize.get(inst_type.upper(), inst_type.upper()) if inst_type else ""
|
||||
|
||||
record["wikidata_id"] = original.get("wikidata_id", "")
|
||||
record["original_entry_json"] = json.dumps(original, ensure_ascii=False, default=str)
|
||||
|
||||
# Fallback: Extract org_type from GHCID code (4th component)
|
||||
# GHCID format: CC-RR-CCC-T-ABBREV where T is the type code
|
||||
if not record["org_type"] and record.get("ghcid_current"):
|
||||
ghcid_parts = record["ghcid_current"].split("-")
|
||||
if len(ghcid_parts) >= 4:
|
||||
type_code = ghcid_parts[3]
|
||||
# Map single-letter codes to full type names
|
||||
type_map = {
|
||||
"G": "GALLERY", "L": "LIBRARY", "A": "ARCHIVE", "M": "MUSEUM",
|
||||
"O": "OFFICIAL", "R": "RESEARCH", "C": "CORPORATION", "U": "UNKNOWN",
|
||||
"B": "BOTANICAL", "E": "EDUCATION", "S": "SOCIETY", "F": "FEATURES",
|
||||
"I": "INTANGIBLE", "X": "MIXED", "P": "PERSONAL", "H": "HOLY_SITES",
|
||||
"D": "DIGITAL", "N": "NGO", "T": "TASTE_SMELL"
|
||||
}
|
||||
record["org_type"] = type_map.get(type_code, type_code)
|
||||
|
||||
# Extract Google Maps data
|
||||
gm = data.get("google_maps_enrichment", {})
|
||||
if gm:
|
||||
|
|
|
|||
|
|
@ -211,6 +211,7 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
recognition_confidence FLOAT,
|
||||
linking_confidence FLOAT,
|
||||
wikidata_id VARCHAR,
|
||||
source_page VARCHAR,
|
||||
FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid)
|
||||
)
|
||||
""")
|
||||
|
|
@ -224,9 +225,12 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
claim_id_counter = 0
|
||||
|
||||
web_folders = get_web_archive_folders()
|
||||
logger.info(f"Processing {len(web_folders)} web archive folders for DuckLake...")
|
||||
total_folders = len(web_folders)
|
||||
logger.info(f"Processing {total_folders} web archive folders for DuckLake...")
|
||||
|
||||
for folder in web_folders:
|
||||
for idx, folder in enumerate(web_folders):
|
||||
if idx % 100 == 0:
|
||||
logger.info(f"Progress: {idx}/{total_folders} folders processed ({idx*100//total_folders}%)")
|
||||
entry_index = int(folder.name)
|
||||
ghcid = mapping.get(entry_index)
|
||||
|
||||
|
|
@ -321,6 +325,11 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
with open(annotations_path, 'r', encoding='utf-8') as f:
|
||||
annotations = yaml.safe_load(f)
|
||||
|
||||
# Get source page from html_file
|
||||
html_file = annotations.get('html_file', '')
|
||||
# Extract just the filename part (e.g., "pages/index.html" -> "index.html")
|
||||
source_page = html_file.split('/')[-1] if html_file else 'index.html'
|
||||
|
||||
session = annotations.get('session', {})
|
||||
claims = session.get('claims', {})
|
||||
|
||||
|
|
@ -329,7 +338,7 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
claim_id_counter += 1
|
||||
provenance = claim.get('provenance', {})
|
||||
con.execute("""
|
||||
INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", [
|
||||
claim_id_counter,
|
||||
ghcid,
|
||||
|
|
@ -342,7 +351,8 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
provenance.get('path'),
|
||||
claim.get('recognition_confidence', 0),
|
||||
claim.get('linking_confidence', 0),
|
||||
claim.get('wikidata_id')
|
||||
claim.get('wikidata_id'),
|
||||
source_page
|
||||
])
|
||||
|
||||
# Process aggregate claims
|
||||
|
|
@ -350,7 +360,7 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
claim_id_counter += 1
|
||||
provenance = claim.get('provenance', {})
|
||||
con.execute("""
|
||||
INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", [
|
||||
claim_id_counter,
|
||||
ghcid,
|
||||
|
|
@ -363,7 +373,8 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
provenance.get('path'),
|
||||
provenance.get('confidence', 0),
|
||||
0,
|
||||
None
|
||||
None,
|
||||
source_page
|
||||
])
|
||||
except Exception as e:
|
||||
logger.debug(f"Error processing annotations for {ghcid}: {e}")
|
||||
|
|
@ -373,6 +384,7 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
con.execute("CREATE INDEX IF NOT EXISTS idx_claims_ghcid ON web_claims(ghcid)")
|
||||
con.execute("CREATE INDEX IF NOT EXISTS idx_claims_type ON web_claims(claim_type)")
|
||||
con.execute("CREATE INDEX IF NOT EXISTS idx_claims_hypernym ON web_claims(hypernym)")
|
||||
con.execute("CREATE INDEX IF NOT EXISTS idx_claims_source_page ON web_claims(source_page)")
|
||||
|
||||
# Get stats
|
||||
archive_count = con.execute("SELECT COUNT(*) FROM web_archives").fetchone()[0]
|
||||
|
|
|
|||
Loading…
Reference in a new issue