Backend: - Attach web_archives.duckdb as read-only database in DuckLake - Create views for web_archives, web_pages, web_claims in heritage schema Scripts: - enrich_cities_google.py: Add batch processing and retry logic - migrate_web_archives.py: Improve schema handling and error recovery Frontend: - DuckLakePanel: Add web archives query support - Database.css: Improve layout for query results display
446 lines
16 KiB
Python
446 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Migrate web archives from /data/nde/enriched/entries/web/ to /data/custodian/{GHCID}/web/
|
|
|
|
This script:
|
|
1. Builds a mapping from entry_index -> GHCID by scanning custodian files
|
|
2. Moves (or symlinks) web archive folders to the appropriate custodian folder
|
|
3. Creates a DuckDB database with web archive metadata for DuckLake ingestion
|
|
|
|
Usage:
|
|
python scripts/migrate_web_archives.py --dry-run # Preview changes
|
|
python scripts/migrate_web_archives.py --execute # Actually migrate
|
|
python scripts/migrate_web_archives.py --build-ducklake # Create DuckDB tables
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import re
|
|
import yaml
|
|
import shutil
|
|
import argparse
|
|
import logging
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Dict, Optional, List, Any
|
|
import json
|
|
|
|
# Try to import duckdb for DuckLake ingestion
|
|
try:
|
|
import duckdb
|
|
HAS_DUCKDB = True
|
|
except ImportError:
|
|
HAS_DUCKDB = False
|
|
print("Warning: duckdb not installed. DuckLake ingestion disabled.")
|
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Paths
|
|
BASE_DIR = Path("/Users/kempersc/apps/glam")
|
|
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
|
|
WEB_ARCHIVE_SOURCE = BASE_DIR / "data" / "nde" / "enriched" / "entries" / "web"
|
|
DUCKLAKE_DB = BASE_DIR / "data" / "ducklake" / "web_archives.duckdb"
|
|
MAPPING_FILE = WEB_ARCHIVE_SOURCE / "_entry_to_ghcid.txt"
|
|
|
|
|
|
def build_entry_index_to_ghcid_mapping() -> Dict[int, str]:
    """
    Load mapping from pre-built file (created via ripgrep for speed).
    Falls back to scanning YAML files if file doesn't exist.

    Returns:
        Dict mapping entry_index (int) to GHCID (str, e.g., "NL-GE-GEN-S-HKG")
    """
    mapping: Dict[int, str] = {}

    # Fast path: pre-built text file with one "<entry_index> <GHCID>" per line.
    if MAPPING_FILE.exists():
        logger.info(f"Loading mapping from {MAPPING_FILE}")
        # Explicit UTF-8: the default locale encoding is platform-dependent,
        # and every other text read in this script already uses UTF-8.
        with open(MAPPING_FILE, 'r', encoding='utf-8') as f:
            for line in f:
                # Split on the first space only: GHCIDs never contain spaces,
                # but be defensive anyway.
                parts = line.strip().split(' ', 1)
                # Silently skip malformed lines rather than failing the load.
                if len(parts) == 2 and parts[0].isdigit():
                    entry_index = int(parts[0])
                    ghcid = parts[1]
                    mapping[entry_index] = ghcid
        logger.info(f"Loaded {len(mapping)} entries from mapping file")
        return mapping

    # Fallback: scan YAML files (slow)
    logger.info("Mapping file not found, scanning custodian files...")
    custodian_files = list(CUSTODIAN_DIR.glob("*.yaml"))
    logger.info(f"Scanning {len(custodian_files)} custodian files...")

    for filepath in custodian_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            if data and 'entry_index' in data:
                entry_index = data['entry_index']
                if isinstance(entry_index, int):
                    # The GHCID is encoded in the filename itself.
                    ghcid = filepath.stem  # e.g., "NL-GE-GEN-S-HKG"
                    mapping[entry_index] = ghcid
        except Exception as e:
            # Unreadable/invalid YAML is expected for a few files; keep going.
            logger.debug(f"Error reading {filepath}: {e}")
            continue

    logger.info(f"Built mapping for {len(mapping)} entries with entry_index")
    return mapping
|
|
|
|
|
|
def get_web_archive_folders() -> List[Path]:
    """Get list of web archive folders (entry numbers)."""
    # Entry folders are named with zero-padded numbers; ignore anything else.
    numbered = [
        entry
        for entry in WEB_ARCHIVE_SOURCE.iterdir()
        if entry.is_dir() and entry.name.isdigit()
    ]
    # Sort numerically, not lexicographically.
    numbered.sort(key=lambda p: int(p.name))
    return numbered
|
|
|
|
|
|
def parse_metadata(metadata_path: Path) -> Optional[Dict[str, Any]]:
    """Parse web archive metadata.yaml file."""
    try:
        with metadata_path.open('r', encoding='utf-8') as fh:
            return yaml.safe_load(fh)
    except Exception as e:
        # A broken metadata file should not abort the whole run.
        logger.error(f"Failed to parse {metadata_path}: {e}")
        return None
|
|
|
|
|
|
def migrate_web_archive(source_folder: Path, ghcid: str, dry_run: bool = True) -> bool:
    """
    Migrate a web archive folder to the custodian's web/ folder.

    Args:
        source_folder: Path to source web archive (e.g., .../web/0183/historischekringgente.nl/)
        ghcid: Target GHCID (e.g., "NL-GE-GEN-S-HKG")
        dry_run: If True, only preview changes

    Returns:
        True if successful
    """
    destination_root = CUSTODIAN_DIR / ghcid / "web"

    # Each archive folder holds one subfolder per archived domain.
    subdirs = [entry for entry in source_folder.iterdir() if entry.is_dir()]
    if not subdirs:
        logger.warning(f"No domain folders in {source_folder}")
        return False

    for subdir in subdirs:
        destination = destination_root / subdir.name

        if dry_run:
            logger.info(f"[DRY-RUN] Would migrate: {subdir} -> {destination}")
            continue

        try:
            destination_root.mkdir(parents=True, exist_ok=True)
            # Never overwrite a previously migrated copy.
            if destination.exists():
                logger.warning(f"Target already exists: {destination}")
                continue
            shutil.copytree(subdir, destination)
            logger.info(f"Migrated: {subdir} -> {destination}")
        except Exception as e:
            logger.error(f"Failed to migrate {subdir}: {e}")
            return False

    return True
|
|
|
|
|
|
def build_ducklake_database(mapping: Dict[int, str]):
    """
    Create DuckDB database with web archive metadata for DuckLake.

    Tables:
    - web_archives: Archive metadata (ghcid, url, timestamp, stats)
    - web_pages: Individual pages with extraction counts
    - web_claims: Extracted claims/entities from annotations

    Args:
        mapping: entry_index -> GHCID, as produced by
            build_entry_index_to_ghcid_mapping().

    The database is rebuilt from scratch on every call (tables are
    emptied first), so the function is idempotent.
    """
    if not HAS_DUCKDB:
        logger.error("DuckDB not installed. Cannot build DuckLake database.")
        return

    DUCKLAKE_DB.parent.mkdir(parents=True, exist_ok=True)

    con = duckdb.connect(str(DUCKLAKE_DB))
    try:
        # Create tables
        con.execute("""
            CREATE TABLE IF NOT EXISTS web_archives (
                ghcid VARCHAR PRIMARY KEY,
                entry_index INTEGER,
                domain VARCHAR,
                url VARCHAR,
                archive_timestamp TIMESTAMP,
                archive_method VARCHAR,
                total_pages INTEGER,
                processed_pages INTEGER,
                warc_file VARCHAR,
                warc_size_bytes BIGINT,
                has_annotations BOOLEAN DEFAULT FALSE
            )
        """)

        con.execute("""
            CREATE TABLE IF NOT EXISTS web_pages (
                id INTEGER PRIMARY KEY,
                ghcid VARCHAR,
                page_title VARCHAR,
                source_path VARCHAR,
                archived_file VARCHAR,
                extractions_count INTEGER,
                FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid)
            )
        """)

        con.execute("""
            CREATE TABLE IF NOT EXISTS web_claims (
                id INTEGER PRIMARY KEY,
                ghcid VARCHAR,
                claim_id VARCHAR,
                claim_type VARCHAR,
                text_content VARCHAR,
                hypernym VARCHAR,
                hyponym VARCHAR,
                class_uri VARCHAR,
                xpath VARCHAR,
                recognition_confidence FLOAT,
                linking_confidence FLOAT,
                wikidata_id VARCHAR,
                FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid)
            )
        """)

        # Clear existing data — children first, so the FK constraints on
        # web_pages/web_claims are never violated mid-delete.
        con.execute("DELETE FROM web_claims")
        con.execute("DELETE FROM web_pages")
        con.execute("DELETE FROM web_archives")

        # Synthetic surrogate keys for the child tables.
        page_id = 0
        claim_id_counter = 0

        web_folders = get_web_archive_folders()
        logger.info(f"Processing {len(web_folders)} web archive folders for DuckLake...")

        for folder in web_folders:
            entry_index = int(folder.name)
            ghcid = mapping.get(entry_index)

            if not ghcid:
                logger.debug(f"No GHCID mapping for entry {entry_index}")
                continue

            # Find domain folder
            domain_folders = [d for d in folder.iterdir() if d.is_dir()]

            for domain_folder in domain_folders:
                metadata_path = domain_folder / "metadata.yaml"
                if not metadata_path.exists():
                    continue

                metadata = parse_metadata(metadata_path)
                if not metadata:
                    continue

                # Check for annotations
                annotations_path = domain_folder / "annotations_v1.7.0.yaml"
                has_annotations = annotations_path.exists()

                # Parse warc info
                warc_info = metadata.get('warc', {})

                # Insert archive record
                try:
                    archive_ts = metadata.get('archive_timestamp')
                    # yaml.safe_load already converts ISO-8601 timestamps to
                    # datetime objects (which DuckDB accepts directly); only
                    # parse when the value survived as a plain string.
                    if isinstance(archive_ts, str):
                        # fromisoformat rejects a trailing 'Z' before
                        # Python 3.11, so normalize it to '+00:00'.
                        archive_ts = datetime.fromisoformat(archive_ts.replace('Z', '+00:00'))

                    con.execute("""
                        INSERT INTO web_archives VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """, [
                        ghcid,
                        entry_index,
                        domain_folder.name,
                        metadata.get('url'),
                        archive_ts,
                        metadata.get('archive_method'),
                        metadata.get('total_pages', 0),
                        metadata.get('processed_pages', 0),
                        warc_info.get('warc_file'),
                        warc_info.get('warc_size_bytes', 0),
                        has_annotations
                    ])
                except Exception as e:
                    # A duplicate GHCID (PRIMARY KEY) or bad record skips the
                    # whole domain, since child rows would dangle without it.
                    logger.debug(f"Error inserting archive {ghcid}: {e}")
                    continue

                # Insert pages
                pages = metadata.get('pages', [])

                # Handle single-page archives (older format with 'files' key)
                if not pages and 'files' in metadata:
                    # Create a synthetic page entry from the single-page fetch
                    files = metadata.get('files', {})
                    rendered_html = files.get('rendered_html')
                    if rendered_html:
                        pages = [{
                            'title': domain_folder.name,  # Use domain as title
                            'source_path': 'index.html',
                            'archived_file': rendered_html,
                            'extractions_count': 0
                        }]
                        # Update the archive's total_pages count
                        con.execute("""
                            UPDATE web_archives SET total_pages = 1, processed_pages = 1
                            WHERE ghcid = ?
                        """, [ghcid])

                for page in pages:
                    page_id += 1
                    try:
                        con.execute("""
                            INSERT INTO web_pages VALUES (?, ?, ?, ?, ?, ?)
                        """, [
                            page_id,
                            ghcid,
                            page.get('title'),
                            page.get('source_path'),
                            page.get('archived_file'),
                            page.get('extractions_count', 0)
                        ])
                    except Exception as e:
                        logger.debug(f"Error inserting page: {e}")

                # Insert claims from annotations
                if has_annotations:
                    try:
                        with open(annotations_path, 'r', encoding='utf-8') as f:
                            # 'or {}' guards against an empty annotations file,
                            # for which yaml.safe_load returns None.
                            annotations = yaml.safe_load(f) or {}

                        # NOTE(review): assumes schema session -> claims ->
                        # {entity, aggregate}; unknown keys are ignored.
                        session = annotations.get('session', {})
                        claims = session.get('claims', {})

                        # Process entity claims (carry taxonomy + linking info)
                        for claim in claims.get('entity', []):
                            claim_id_counter += 1
                            provenance = claim.get('provenance', {})
                            con.execute("""
                                INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                            """, [
                                claim_id_counter,
                                ghcid,
                                claim.get('claim_id'),
                                claim.get('claim_type'),
                                claim.get('text_content'),
                                claim.get('hypernym'),
                                claim.get('hyponym'),
                                claim.get('class_uri'),
                                provenance.get('path'),
                                claim.get('recognition_confidence', 0),
                                claim.get('linking_confidence', 0),
                                claim.get('wikidata_id')
                            ])

                        # Process aggregate claims (no taxonomy/linking fields)
                        for claim in claims.get('aggregate', []):
                            claim_id_counter += 1
                            provenance = claim.get('provenance', {})
                            con.execute("""
                                INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                            """, [
                                claim_id_counter,
                                ghcid,
                                claim.get('claim_id'),
                                claim.get('claim_type'),
                                claim.get('text_content'),
                                None,
                                None,
                                None,
                                provenance.get('path'),
                                provenance.get('confidence', 0),
                                0,
                                None
                            ])
                    except Exception as e:
                        logger.debug(f"Error processing annotations for {ghcid}: {e}")

        # Create indices
        con.execute("CREATE INDEX IF NOT EXISTS idx_pages_ghcid ON web_pages(ghcid)")
        con.execute("CREATE INDEX IF NOT EXISTS idx_claims_ghcid ON web_claims(ghcid)")
        con.execute("CREATE INDEX IF NOT EXISTS idx_claims_type ON web_claims(claim_type)")
        con.execute("CREATE INDEX IF NOT EXISTS idx_claims_hypernym ON web_claims(hypernym)")

        # Get stats
        archive_count = con.execute("SELECT COUNT(*) FROM web_archives").fetchone()[0]
        page_count = con.execute("SELECT COUNT(*) FROM web_pages").fetchone()[0]
        claim_count = con.execute("SELECT COUNT(*) FROM web_claims").fetchone()[0]
    finally:
        # Always release the DuckDB file handle, even if ingestion blows up.
        con.close()

    logger.info(f"DuckLake database created at: {DUCKLAKE_DB}")
    logger.info(f" - Archives: {archive_count}")
    logger.info(f" - Pages: {page_count}")
    logger.info(f" - Claims: {claim_count}")
|
|
|
|
|
|
def main():
    """CLI entry point: parse flags and run the requested mode."""
    parser = argparse.ArgumentParser(description="Migrate web archives to custodian folders")
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without executing')
    parser.add_argument('--execute', action='store_true', help='Actually migrate files')
    parser.add_argument('--build-ducklake', action='store_true', help='Build DuckDB database only')
    parser.add_argument('--build-mapping', action='store_true', help='Just build and show mapping')
    args = parser.parse_args()

    # Refuse to run without an explicit mode flag.
    requested_any = args.dry_run or args.execute or args.build_ducklake or args.build_mapping
    if not requested_any:
        parser.print_help()
        sys.exit(1)

    # Every mode needs the entry_index -> GHCID mapping.
    mapping = build_entry_index_to_ghcid_mapping()

    if args.build_mapping:
        print(f"\nMapping has {len(mapping)} entries")
        print("\nSample entries:")
        for entry_idx, ghcid in sorted(mapping.items())[:20]:
            print(f" {entry_idx:04d} -> {ghcid}")
        return

    if args.build_ducklake:
        build_ducklake_database(mapping)
        return

    # Migration mode (dry-run unless --execute was given)
    web_folders = get_web_archive_folders()
    logger.info(f"Found {len(web_folders)} web archive folders")

    migrated, skipped, no_mapping = 0, 0, 0

    for folder in web_folders:
        ghcid = mapping.get(int(folder.name))

        if not ghcid:
            logger.debug(f"No GHCID for entry {int(folder.name)}")
            no_mapping += 1
            continue

        if migrate_web_archive(folder, ghcid, dry_run=not args.execute):
            migrated += 1
        else:
            skipped += 1

    print(f"\n{'[DRY-RUN] ' if args.dry_run else ''}Migration summary:")
    print(f" - Migrated: {migrated}")
    print(f" - Skipped: {skipped}")
    print(f" - No mapping: {no_mapping}")
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|