diff --git a/.gitignore b/.gitignore index 1b5bfbd9c8..98554d7237 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,4 @@ infrastructure/terraform/*.tfstate* # DuckDB lakehouse databases data/ducklake/*.duckdb data/ducklake/*.duckdb.wal +data/ducklake/ diff --git a/infrastructure/deploy.sh b/infrastructure/deploy.sh index b5b7d7c299..e2765b2f55 100755 --- a/infrastructure/deploy.sh +++ b/infrastructure/deploy.sh @@ -5,13 +5,13 @@ # Usage: ./deploy.sh [options] # Options: # --infra Deploy infrastructure changes (Terraform) -# --data Deploy ontology/schema data +# --data Deploy ontologies directly to Oxigraph (no intermediate storage) # --frontend Build and deploy frontend # --api Deploy FastAPI backend (DSPy SPARQL generation) # --ducklake Deploy DuckLake API backend -# --reload Reload data into Oxigraph # --all Deploy everything # --status Check server status only +# --clear Clear Oxigraph store before loading (use with --data) set -e @@ -34,7 +34,7 @@ fi HCLOUD_TOKEN="${HETZNER_HC_API_TOKEN:-}" SERVER_NAME="glam-sparql" SERVER_USER="root" -REMOTE_DATA_DIR="/mnt/data" +REMOTE_DATA_DIR="/mnt/data" # Used for LinkML/UML static files (if needed in future) # Check for required token if [ -z "$HCLOUD_TOKEN" ]; then @@ -50,11 +50,11 @@ DEPLOY_DATA=false DEPLOY_FRONTEND=false DEPLOY_API=false DEPLOY_DUCKLAKE=false -RELOAD_OXIGRAPH=false +CLEAR_OXIGRAPH=false STATUS_ONLY=false if [ $# -eq 0 ]; then - echo "Usage: $0 [--infra] [--data] [--frontend] [--api] [--ducklake] [--reload] [--all] [--status]" + echo "Usage: $0 [--infra] [--data] [--frontend] [--api] [--ducklake] [--all] [--status] [--clear]" exit 1 fi @@ -75,8 +75,8 @@ for arg in "$@"; do --ducklake) DEPLOY_DUCKLAKE=true ;; - --reload) - RELOAD_OXIGRAPH=true + --clear) + CLEAR_OXIGRAPH=true ;; --all) DEPLOY_INFRA=true @@ -84,7 +84,6 @@ for arg in "$@"; do DEPLOY_FRONTEND=true DEPLOY_API=true DEPLOY_DUCKLAKE=true - RELOAD_OXIGRAPH=true ;; --status) STATUS_ONLY=true @@ -236,69 +235,168 @@ if [ -z "$SERVER_IP" ]; then exit 1 fi -# Wait for SSH -if [ "$DEPLOY_DATA" = true ] || [ "$DEPLOY_FRONTEND" = true ] || [ "$DEPLOY_API" = true ] || [ "$DEPLOY_DUCKLAKE" = true ] || [ "$RELOAD_OXIGRAPH" = true ]; then +# Wait for SSH (only needed for frontend, API, DuckLake - not for --data which uses HTTP directly) +if [ "$DEPLOY_FRONTEND" = true ] || [ "$DEPLOY_API" = true ] || [ "$DEPLOY_DUCKLAKE" = true ]; then wait_for_ssh "$SERVER_IP" fi -# Deploy data +# Deploy data directly to Oxigraph if [ "$DEPLOY_DATA" = true ]; then echo "" echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" - echo -e "${BLUE} Deploying Ontology & Schema Data${NC}" + echo -e "${BLUE} Deploying Ontologies Directly to Oxigraph${NC}" echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" - # Ensure remote directories exist - ssh -o StrictHostKeyChecking=no "$SERVER_USER@$SERVER_IP" \ - "mkdir -p $REMOTE_DATA_DIR/{ontologies,rdf,linkml,uml}" + OXIGRAPH_ENDPOINT="http://$SERVER_IP:7878" - # Sync ontologies - echo -e "${YELLOW}Syncing ontology files...${NC}" - rsync -avz --progress \ - -e "ssh -o StrictHostKeyChecking=no" \ - --include="*.ttl" --include="*.rdf" --include="*.owl" --include="*.jsonld" \ - --exclude="*" \ - "$PROJECT_ROOT/data/ontology/" \ - "$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/ontologies/" + # Check Oxigraph is running + echo -e "${YELLOW}Checking Oxigraph status...${NC}" + if ! curl -s --connect-timeout 5 "$OXIGRAPH_ENDPOINT/" > /dev/null 2>&1; then + echo -e "${RED}Error: Oxigraph not responding at $OXIGRAPH_ENDPOINT${NC}" + echo "Ensure Oxigraph is running on the server." + exit 1 + fi + echo -e "${GREEN}Oxigraph is running${NC}" - # Sync RDF schemas - echo -e "${YELLOW}Syncing RDF schema files...${NC}" - rsync -avz --progress \ - -e "ssh -o StrictHostKeyChecking=no" \ - --exclude="archive_*" \ - "$PROJECT_ROOT/schemas/20251121/rdf/" \ - "$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/rdf/" + # Get initial triple count + INITIAL_COUNT=$(curl -s -X POST \ + -H "Content-Type: application/sparql-query" \ + -H "Accept: application/sparql-results+json" \ + --data "SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }" \ + "$OXIGRAPH_ENDPOINT/query" | jq -r ".results.bindings[0].count.value // \"0\"") + echo -e "${BLUE}Initial triple count: $INITIAL_COUNT${NC}" - # Sync LinkML schemas - echo -e "${YELLOW}Syncing LinkML schemas...${NC}" - rsync -avz --progress \ - -e "ssh -o StrictHostKeyChecking=no" \ - --include="*.yaml" --include="*/" --exclude="*" \ - "$PROJECT_ROOT/schemas/20251121/linkml/" \ - "$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/linkml/" - - # Sync UML diagrams - echo -e "${YELLOW}Syncing UML diagrams...${NC}" - rsync -avz --progress \ - -e "ssh -o StrictHostKeyChecking=no" \ - --include="*.mmd" --exclude="*" \ - "$PROJECT_ROOT/schemas/20251121/uml/mermaid/" \ - "$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/uml/" - - # Sync NDE Heritage Custodian RDF files - if [ -d "$PROJECT_ROOT/data/nde/rdf" ]; then - echo -e "${YELLOW}Syncing NDE Heritage Custodian RDF files...${NC}" - ssh -o StrictHostKeyChecking=no "$SERVER_USER@$SERVER_IP" \ - "mkdir -p $REMOTE_DATA_DIR/nde/rdf" - rsync -avz --progress \ - -e "ssh -o StrictHostKeyChecking=no" \ - --include="*.ttl" --exclude="*" \ - "$PROJECT_ROOT/data/nde/rdf/" \ - "$SERVER_USER@$SERVER_IP:$REMOTE_DATA_DIR/nde/rdf/" - echo -e "${GREEN}NDE RDF: $(ls -1 $PROJECT_ROOT/data/nde/rdf/*.ttl 2>/dev/null | wc -l) files synced${NC}" + # Clear store if requested + if [ "$CLEAR_OXIGRAPH" = true ]; then + echo -e "${YELLOW}Clearing Oxigraph store...${NC}" + curl -s -X POST \ + -H "Content-Type: application/sparql-update" \ + --data "CLEAR ALL" \ + "$OXIGRAPH_ENDPOINT/update" > /dev/null + echo -e "${GREEN}Store cleared${NC}" fi + # Function to load a file to Oxigraph + load_file() { + local file="$1" + local content_type="$2" + local filename=$(basename "$file") + + local http_code=$(curl -s -X POST \ + -H "Content-Type: $content_type" \ + --data-binary "@$file" \ + -w "%{http_code}" \ + -o /dev/null \ + "$OXIGRAPH_ENDPOINT/store?default") + + if [ "$http_code" = "204" ] || [ "$http_code" = "200" ]; then + echo -e " ${GREEN}✓${NC} $filename" + return 0 + else + echo -e " ${RED}✗${NC} $filename (HTTP $http_code)" + return 1 + fi + } + + # Track statistics + LOADED=0 + FAILED=0 + + # Load base ontologies from data/ontology/ + echo "" + echo -e "${YELLOW}Loading base ontologies...${NC}" + + # Turtle files + for file in "$PROJECT_ROOT/data/ontology"/*.ttl; do + if [ -f "$file" ]; then + if load_file "$file" "text/turtle"; then + LOADED=$((LOADED + 1)) + else + FAILED=$((FAILED + 1)) + fi + fi + done + + # RDF/XML files (.rdf and .owl) + for file in "$PROJECT_ROOT/data/ontology"/*.rdf "$PROJECT_ROOT/data/ontology"/*.owl; do + if [ -f "$file" ]; then + if load_file "$file" "application/rdf+xml"; then + LOADED=$((LOADED + 1)) + else + FAILED=$((FAILED + 1)) + fi + fi + done + + # JSON-LD files + for file in "$PROJECT_ROOT/data/ontology"/*.jsonld; do + if [ -f "$file" ]; then + if load_file "$file" "application/ld+json"; then + LOADED=$((LOADED + 1)) + else + FAILED=$((FAILED + 1)) + fi + fi + done + + # Load generated RDF schemas + echo "" + echo -e "${YELLOW}Loading generated RDF schemas...${NC}" + + for file in "$PROJECT_ROOT/schemas/20251121/rdf"/*.ttl; do + if [ -f "$file" ]; then + # Skip archived files + if [[ "$file" == *"archive_"* ]]; then + continue + fi + if load_file "$file" "text/turtle"; then + LOADED=$((LOADED + 1)) + else + FAILED=$((FAILED + 1)) + fi + fi + done + + # Load NDE Heritage Custodian RDF files + if [ -d "$PROJECT_ROOT/data/nde/rdf" ]; then + echo "" + echo -e "${YELLOW}Loading NDE Heritage Custodian data...${NC}" + NDE_COUNT=0 + NDE_TOTAL=$(ls -1 "$PROJECT_ROOT/data/nde/rdf"/*.ttl 2>/dev/null | wc -l | tr -d ' ') + + for file in "$PROJECT_ROOT/data/nde/rdf"/*.ttl; do + if [ -f "$file" ]; then + NDE_COUNT=$((NDE_COUNT + 1)) + # Show progress every 100 files + if [ $((NDE_COUNT % 100)) -eq 0 ]; then + echo -e " Loading NDE files: $NDE_COUNT / $NDE_TOTAL" + fi + if load_file "$file" "text/turtle" 2>/dev/null; then + LOADED=$((LOADED + 1)) + else + FAILED=$((FAILED + 1)) + fi + fi + done + echo -e " ${GREEN}Loaded $NDE_COUNT NDE heritage custodian files${NC}" + fi + + # Get final triple count + echo "" + FINAL_COUNT=$(curl -s -X POST \ + -H "Content-Type: application/sparql-query" \ + -H "Accept: application/sparql-results+json" \ + --data "SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }" \ + "$OXIGRAPH_ENDPOINT/query" | jq -r ".results.bindings[0].count.value // \"0\"") + + ADDED=$((FINAL_COUNT - INITIAL_COUNT)) + + echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" echo -e "${GREEN}Data deployment complete${NC}" + echo -e " Files loaded: ${GREEN}$LOADED${NC}" + echo -e " Files failed: ${RED}$FAILED${NC}" + echo -e " Triples added: ${BLUE}$ADDED${NC}" + echo -e " Total triples: ${BLUE}$FINAL_COUNT${NC}" fi # Deploy frontend @@ -487,19 +585,6 @@ ENDSSH echo -e "${GREEN}DuckLake API deployment complete${NC}" fi -# Reload Oxigraph -if [ "$RELOAD_OXIGRAPH" = true ]; then - echo "" - echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" - echo -e "${BLUE} Reloading Oxigraph Data${NC}" - echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" - - ssh -o StrictHostKeyChecking=no "$SERVER_USER@$SERVER_IP" \ - "/var/lib/glam/scripts/load-ontologies.sh" - - echo -e "${GREEN}Oxigraph reload complete${NC}" -fi - # Final status echo "" echo -e "${BLUE}════════════════════════════════════════════════════════════════${NC}" diff --git a/scripts/fix_yaml_history.py b/scripts/fix_yaml_history.py new file mode 100755 index 0000000000..c6528ef9b3 --- /dev/null +++ b/scripts/fix_yaml_history.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +""" +Fix malformed ghcid_history entries in YAML files. + +The issue: Some files have multiple history entries concatenated on a single line: + reason: Migrated from CH to EG namespace - Assiut - ghcid: XX-XX-XXX-L-AUL + +This should be split into separate list items. +""" + +import os +import re +import sys +from pathlib import Path + +def fix_yaml_content(content: str) -> str: + """Fix malformed ghcid_history entries.""" + lines = content.split('\n') + fixed_lines = [] + i = 0 + + while i < len(lines): + line = lines[i] + + # Check if this is a reason line that has an embedded " - ghcid:" or similar + # Pattern: reason: ... - ghcid: or reason: ... - valid_from: + if 'reason:' in line and ' - ghcid:' in line: + # Split the line at the embedded list marker + parts = line.split(' - ghcid:', 1) + # Add the first part (the reason line, truncated) + fixed_lines.append(parts[0].rstrip()) + # Add the second part as a new list item + # Find the proper indentation (should be same level as the - that started this entry) + indent_match = re.match(r'^(\s*)', line) + base_indent = indent_match.group(1) if indent_match else ' ' + # The new entry should be at the list item level + fixed_lines.append(f"{base_indent}- ghcid:{parts[1]}") + elif 'reason:' in line and ' - valid_from:' in line: + parts = line.split(' - valid_from:', 1) + fixed_lines.append(parts[0].rstrip()) + indent_match = re.match(r'^(\s*)', line) + base_indent = indent_match.group(1) if indent_match else ' ' + fixed_lines.append(f"{base_indent}- valid_from:{parts[1]}") + else: + fixed_lines.append(line) + + i += 1 + + return '\n'.join(fixed_lines) + + +def process_file(filepath: Path, dry_run: bool = False) -> bool: + """Process a single file. Returns True if changes were made.""" + with open(filepath, 'r', encoding='utf-8') as f: + original = f.read() + + fixed = fix_yaml_content(original) + + if fixed != original: + if dry_run: + print(f"Would fix: {filepath.name}") + else: + with open(filepath, 'w', encoding='utf-8') as f: + f.write(fixed) + print(f"Fixed: {filepath.name}") + return True + return False + + +def main(): + dry_run = '--dry-run' in sys.argv + + custodian_dir = Path('data/custodian') + if not custodian_dir.exists(): + print("Error: data/custodian directory not found") + sys.exit(1) + + fixed_count = 0 + + # Process files known to have issues (BE and EG prefixes) + for prefix in ['BE-', 'EG-']: + for yaml_file in sorted(custodian_dir.glob(f'{prefix}*.yaml')): + if process_file(yaml_file, dry_run): + fixed_count += 1 + + print(f"\n{'Would fix' if dry_run else 'Fixed'}: {fixed_count} files") + + +if __name__ == '__main__': + main() diff --git a/scripts/fix_yaml_history_v2.py b/scripts/fix_yaml_history_v2.py new file mode 100644 index 0000000000..e70b253e1d --- /dev/null +++ b/scripts/fix_yaml_history_v2.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +""" +Fix malformed ghcid_history entries in YAML files. +Version 2: More robust parsing and reconstruction. +""" + +import re +import sys +from pathlib import Path + + +def fix_ghcid_history_section(content: str) -> str: + """Fix the ghcid_history section of a YAML file.""" + + # Find the ghcid_history section + history_match = re.search(r'(\s*)ghcid_history:\s*\n', content) + if not history_match: + return content # No ghcid_history section + + base_indent = history_match.group(1) + list_indent = base_indent + " " + item_indent = list_indent + " " + + # Find the end of ghcid_history section (next top-level key at same or less indent) + start_pos = history_match.end() + + # Find where ghcid_history ends by looking for next key at same level + remaining = content[start_pos:] + + # Match pattern for next section at base_indent level or less + end_pattern = re.compile(rf'^{base_indent}[a-z_]+:', re.MULTILINE) + end_match = end_pattern.search(remaining) + + if end_match: + history_section = remaining[:end_match.start()] + after_section = remaining[end_match.start():] + else: + history_section = remaining + after_section = "" + + # Parse all history entries from the section + # They might be concatenated on one line or split incorrectly + + # Extract all ghcid entries - they have pattern: ghcid: + entry_pattern = re.compile( + r'(?:^|\s*-\s*)ghcid:\s*(\S+).*?' + r'(?:valid_from:\s*[\'"]?([^\'"]+)[\'"]?)?\s*' + r'(?:ghcid_numeric:\s*(\d+))?\s*' + r'(?:reason:\s*([^\n]+))?', + re.DOTALL + ) + + # Simpler approach: Just extract key-value pairs + entries = [] + current_entry = {} + + # Split by potential entry boundaries and reconstruct + lines = history_section.split('\n') + + for line in lines: + stripped = line.strip() + if not stripped: + continue + + # Start of new entry + if stripped.startswith('- ghcid:') or (stripped.startswith('ghcid:') and not current_entry): + if current_entry: + entries.append(current_entry) + value = stripped.replace('- ghcid:', '').replace('ghcid:', '').strip() + current_entry = {'ghcid': value} + elif stripped.startswith('- ') and ':' in stripped[2:]: + # This might be a new entry starting with a different key + if current_entry: + entries.append(current_entry) + # Parse the key-value + key_val = stripped[2:] + if ':' in key_val: + key, val = key_val.split(':', 1) + current_entry = {key.strip(): val.strip().strip("'\"")} + elif ':' in stripped: + # It's a key-value pair for current entry + key, val = stripped.split(':', 1) + key = key.strip().replace('- ', '') + val = val.strip().strip("'\"") + if key and val: + current_entry[key] = val + + if current_entry: + entries.append(current_entry) + + # Deduplicate entries by ghcid + valid_from + seen = set() + unique_entries = [] + for entry in entries: + key = (entry.get('ghcid', ''), entry.get('valid_from', '')) + if key not in seen and entry.get('ghcid'): + seen.add(key) + unique_entries.append(entry) + + # Reconstruct the section properly + new_history = f"{base_indent}ghcid_history:\n" + for entry in unique_entries: + new_history += f"{list_indent}- ghcid: {entry.get('ghcid', '')}\n" + if 'valid_from' in entry: + new_history += f"{item_indent}valid_from: '{entry['valid_from']}'\n" + if 'ghcid_numeric' in entry: + new_history += f"{item_indent}ghcid_numeric: {entry['ghcid_numeric']}\n" + if 'reason' in entry: + # Escape colons in reason text by quoting + reason = entry['reason'] + if ':' in reason and not reason.startswith('"') and not reason.startswith("'"): + reason = f'"{reason}"' + new_history += f"{item_indent}reason: {reason}\n" + + # Rebuild the content + before_section = content[:history_match.start()] + history_match.group(0).rstrip() + '\n' + + return before_section + new_history + after_section + + +def process_file(filepath: Path, dry_run: bool = False) -> bool: + """Process a single file.""" + with open(filepath, 'r', encoding='utf-8') as f: + original = f.read() + + fixed = fix_ghcid_history_section(original) + + if fixed != original: + if dry_run: + print(f"Would fix: {filepath.name}") + else: + with open(filepath, 'w', encoding='utf-8') as f: + f.write(fixed) + print(f"Fixed: {filepath.name}") + return True + return False + + +def main(): + dry_run = '--dry-run' in sys.argv + + custodian_dir = Path('data/custodian') + fixed_count = 0 + + for prefix in ['BE-', 'EG-']: + for yaml_file in sorted(custodian_dir.glob(f'{prefix}*.yaml')): + if process_file(yaml_file, dry_run): + fixed_count += 1 + + print(f"\n{'Would fix' if dry_run else 'Fixed'}: {fixed_count} files") + + +if __name__ == '__main__': + main() diff --git a/scripts/load_custodians_to_ducklake.py b/scripts/load_custodians_to_ducklake.py index b871fde011..bde5663e25 100644 --- a/scripts/load_custodians_to_ducklake.py +++ b/scripts/load_custodians_to_ducklake.py @@ -104,12 +104,47 @@ def extract_top_level_fields(data: dict) -> dict: # Extract original entry original = data.get("original_entry", {}) if original: - record["org_name"] = original.get("organisatie", "") - org_types = original.get("type", []) - record["org_type"] = ",".join(org_types) if isinstance(org_types, list) else str(org_types) + # Try multiple field names for organization name + record["org_name"] = original.get("name", "") or original.get("organisatie", "") + + # Try multiple field names for institution type + # First try 'institution_type' (CH-Annotator format), then 'type' (older format) + inst_type = original.get("institution_type", "") or original.get("type", "") + if isinstance(inst_type, list): + inst_type = ",".join(inst_type) + else: + inst_type = str(inst_type) if inst_type else "" + + # Normalize institution type names to standard codes + type_normalize = { + "OFFICIAL_INSTITUTION": "OFFICIAL", + "RESEARCH_CENTER": "RESEARCH", + "BOTANICAL_ZOO": "BOTANICAL", + "EDUCATION_PROVIDER": "EDUCATION", + "COLLECTING_SOCIETY": "SOCIETY", + "INTANGIBLE_HERITAGE_GROUP": "INTANGIBLE", + } + record["org_type"] = type_normalize.get(inst_type.upper(), inst_type.upper()) if inst_type else "" + record["wikidata_id"] = original.get("wikidata_id", "") record["original_entry_json"] = json.dumps(original, ensure_ascii=False, default=str) + # Fallback: Extract org_type from GHCID code (4th component) + # GHCID format: CC-RR-CCC-T-ABBREV where T is the type code + if not record["org_type"] and record.get("ghcid_current"): + ghcid_parts = record["ghcid_current"].split("-") + if len(ghcid_parts) >= 4: + type_code = ghcid_parts[3] + # Map single-letter codes to full type names + type_map = { + "G": "GALLERY", "L": "LIBRARY", "A": "ARCHIVE", "M": "MUSEUM", + "O": "OFFICIAL", "R": "RESEARCH", "C": "CORPORATION", "U": "UNKNOWN", + "B": "BOTANICAL", "E": "EDUCATION", "S": "SOCIETY", "F": "FEATURES", + "I": "INTANGIBLE", "X": "MIXED", "P": "PERSONAL", "H": "HOLY_SITES", + "D": "DIGITAL", "N": "NGO", "T": "TASTE_SMELL" + } + record["org_type"] = type_map.get(type_code, type_code) + # Extract Google Maps data gm = data.get("google_maps_enrichment", {}) if gm: diff --git a/scripts/migrate_web_archives.py b/scripts/migrate_web_archives.py index 50702ee126..264f2ef923 100644 --- a/scripts/migrate_web_archives.py +++ b/scripts/migrate_web_archives.py @@ -211,6 +211,7 @@ def build_ducklake_database(mapping: Dict[int, str]): recognition_confidence FLOAT, linking_confidence FLOAT, wikidata_id VARCHAR, + source_page VARCHAR, FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid) ) """) @@ -224,9 +225,12 @@ def build_ducklake_database(mapping: Dict[int, str]): claim_id_counter = 0 web_folders = get_web_archive_folders() - logger.info(f"Processing {len(web_folders)} web archive folders for DuckLake...") + total_folders = len(web_folders) + logger.info(f"Processing {total_folders} web archive folders for DuckLake...") - for folder in web_folders: + for idx, folder in enumerate(web_folders): + if idx % 100 == 0: + logger.info(f"Progress: {idx}/{total_folders} folders processed ({idx*100//total_folders}%)") entry_index = int(folder.name) ghcid = mapping.get(entry_index) @@ -321,6 +325,11 @@ def build_ducklake_database(mapping: Dict[int, str]): with open(annotations_path, 'r', encoding='utf-8') as f: annotations = yaml.safe_load(f) + # Get source page from html_file + html_file = annotations.get('html_file', '') + # Extract just the filename part (e.g., "pages/index.html" -> "index.html") + source_page = html_file.split('/')[-1] if html_file else 'index.html' + session = annotations.get('session', {}) claims = session.get('claims', {}) @@ -329,7 +338,7 @@ def build_ducklake_database(mapping: Dict[int, str]): claim_id_counter += 1 provenance = claim.get('provenance', {}) con.execute(""" - INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, [ claim_id_counter, ghcid, @@ -342,7 +351,8 @@ def build_ducklake_database(mapping: Dict[int, str]): provenance.get('path'), claim.get('recognition_confidence', 0), claim.get('linking_confidence', 0), - claim.get('wikidata_id') + claim.get('wikidata_id'), + source_page ]) # Process aggregate claims @@ -350,7 +360,7 @@ def build_ducklake_database(mapping: Dict[int, str]): claim_id_counter += 1 provenance = claim.get('provenance', {}) con.execute(""" - INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, [ claim_id_counter, ghcid, @@ -363,7 +373,8 @@ def build_ducklake_database(mapping: Dict[int, str]): provenance.get('path'), provenance.get('confidence', 0), 0, - None + None, + source_page ]) except Exception as e: logger.debug(f"Error processing annotations for {ghcid}: {e}") @@ -373,6 +384,7 @@ def build_ducklake_database(mapping: Dict[int, str]): con.execute("CREATE INDEX IF NOT EXISTS idx_claims_ghcid ON web_claims(ghcid)") con.execute("CREATE INDEX IF NOT EXISTS idx_claims_type ON web_claims(claim_type)") con.execute("CREATE INDEX IF NOT EXISTS idx_claims_hypernym ON web_claims(hypernym)") + con.execute("CREATE INDEX IF NOT EXISTS idx_claims_source_page ON web_claims(source_page)") # Get stats archive_count = con.execute("SELECT COUNT(*) FROM web_archives").fetchone()[0]