#!/usr/bin/env python3
"""
Migrate web archives from /data/nde/enriched/entries/web/ to /data/custodian/{GHCID}/web/

This script:
1. Builds a mapping from entry_index -> GHCID (from a pre-built mapping file,
   or by scanning custodian YAML files when that file is missing)
2. Copies web archive folders to the appropriate custodian folder
   (the source folders are left in place)
3. Creates a DuckDB database with web archive metadata for DuckLake ingestion

Usage:
    python scripts/migrate_web_archives.py --dry-run         # Preview changes
    python scripts/migrate_web_archives.py --execute         # Actually migrate
    python scripts/migrate_web_archives.py --build-ducklake  # Create DuckDB tables
"""

import os
import sys
import re
import yaml
import shutil
import argparse
import logging
from pathlib import Path
from datetime import datetime
from typing import Dict, Optional, List, Any
import json

# Try to import duckdb for DuckLake ingestion
try:
    import duckdb
    HAS_DUCKDB = True
except ImportError:
    HAS_DUCKDB = False
    print("Warning: duckdb not installed. DuckLake ingestion disabled.")

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Paths
BASE_DIR = Path("/Users/kempersc/apps/glam")
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
WEB_ARCHIVE_SOURCE = BASE_DIR / "data" / "nde" / "enriched" / "entries" / "web"
DUCKLAKE_DB = BASE_DIR / "data" / "ducklake" / "web_archives.duckdb"
MAPPING_FILE = WEB_ARCHIVE_SOURCE / "_entry_to_ghcid.txt"

# Column order matches the web_claims table definition below.
_INSERT_CLAIM_SQL = """
    INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""


def build_entry_index_to_ghcid_mapping() -> Dict[int, str]:
    """
    Load the entry_index -> GHCID mapping.

    The pre-built mapping file (one "<entry_index> <ghcid>" pair per line) is
    used when it exists. Otherwise every custodian YAML file is scanned for an
    integer ``entry_index`` key and the file stem is taken as the GHCID.

    Returns:
        Dict mapping entry_index (int) to GHCID (str, e.g., "NL-GE-GEN-S-HKG")
    """
    mapping: Dict[int, str] = {}

    # Try to load from pre-built mapping file
    if MAPPING_FILE.exists():
        logger.info(f"Loading mapping from {MAPPING_FILE}")
        with open(MAPPING_FILE, 'r', encoding='utf-8') as f:
            for line in f:
                # split(None, 1) tolerates tabs / repeated spaces between the
                # index and the GHCID instead of leaking them into the GHCID.
                parts = line.strip().split(None, 1)
                # isdecimal() (unlike isdigit()) only accepts what int() parses.
                if len(parts) == 2 and parts[0].isdecimal():
                    mapping[int(parts[0])] = parts[1]
        logger.info(f"Loaded {len(mapping)} entries from mapping file")
        return mapping

    # Fallback: scan YAML files (slow)
    logger.info("Mapping file not found, scanning custodian files...")
    custodian_files = list(CUSTODIAN_DIR.glob("*.yaml"))
    logger.info(f"Scanning {len(custodian_files)} custodian files...")

    for filepath in custodian_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
            # Best effort: one unreadable file must not abort the whole scan,
            # but it should be visible at the default log level.
            logger.warning(f"Error reading {filepath}: {e}")
            continue

        if not isinstance(data, dict):
            continue
        entry_index = data.get('entry_index')
        # bool is a subclass of int; a YAML `true` is not a valid index.
        if isinstance(entry_index, int) and not isinstance(entry_index, bool):
            mapping[entry_index] = filepath.stem  # e.g., "NL-GE-GEN-S-HKG"

    logger.info(f"Built mapping for {len(mapping)} entries with entry_index")
    return mapping


def get_web_archive_folders() -> List[Path]:
    """
    Get the web archive folders (directories named by entry number).

    Returns:
        Numerically named sub-directories of WEB_ARCHIVE_SOURCE, sorted by
        their integer value. Empty when the source directory does not exist.
    """
    if not WEB_ARCHIVE_SOURCE.is_dir():
        logger.warning(f"Web archive source not found: {WEB_ARCHIVE_SOURCE}")
        return []

    folders = [
        item for item in WEB_ARCHIVE_SOURCE.iterdir()
        if item.is_dir() and item.name.isdecimal()
    ]
    return sorted(folders, key=lambda p: int(p.name))


def parse_metadata(metadata_path: Path) -> Optional[Dict[str, Any]]:
    """
    Parse a web archive metadata.yaml file.

    Returns:
        The parsed mapping, or None when the file cannot be read/parsed or
        does not contain a mapping at the top level.
    """
    try:
        with open(metadata_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        logger.error(f"Failed to parse {metadata_path}: {e}")
        return None

    if data is None:
        return None
    if not isinstance(data, dict):
        # Callers use .get() on the result, so anything else is unusable.
        logger.error(f"Failed to parse {metadata_path}: top-level value is not a mapping")
        return None
    return data


def migrate_web_archive(source_folder: Path, ghcid: str, dry_run: bool = True) -> bool:
    """
    Copy a web archive folder into the custodian's web/ folder.

    Each domain sub-directory of ``source_folder`` is copied to
    ``CUSTODIAN_DIR / ghcid / "web" / <domain>``. Domains whose target already
    exists are skipped with a warning and do not count as a failure.

    Args:
        source_folder: Path to source web archive (e.g., .../web/0183/)
        ghcid: Target GHCID (e.g., "NL-GE-GEN-S-HKG")
        dry_run: If True, only preview changes

    Returns:
        True if successful; False when there are no domain folders or a copy
        fails (remaining domains are then left unprocessed).
    """
    target_dir = CUSTODIAN_DIR / ghcid / "web"

    # Find domain subfolder(s)
    domain_folders = [d for d in source_folder.iterdir() if d.is_dir()]
    if not domain_folders:
        logger.warning(f"No domain folders in {source_folder}")
        return False

    for domain_folder in domain_folders:
        target_path = target_dir / domain_folder.name

        if dry_run:
            logger.info(f"[DRY-RUN] Would migrate: {domain_folder} -> {target_path}")
            continue

        try:
            target_dir.mkdir(parents=True, exist_ok=True)
            if target_path.exists():
                logger.warning(f"Target already exists: {target_path}")
                continue
            shutil.copytree(domain_folder, target_path)
            logger.info(f"Migrated: {domain_folder} -> {target_path}")
        except Exception as e:
            logger.error(f"Failed to migrate {domain_folder}: {e}")
            return False

    return True


def _create_ducklake_tables(con) -> None:
    """Create the web_archives, web_pages and web_claims tables if missing."""
    con.execute("""
        CREATE TABLE IF NOT EXISTS web_archives (
            ghcid VARCHAR PRIMARY KEY,
            entry_index INTEGER,
            domain VARCHAR,
            url VARCHAR,
            archive_timestamp TIMESTAMP,
            archive_method VARCHAR,
            total_pages INTEGER,
            processed_pages INTEGER,
            warc_file VARCHAR,
            warc_size_bytes BIGINT,
            has_annotations BOOLEAN DEFAULT FALSE
        )
    """)
    con.execute("""
        CREATE TABLE IF NOT EXISTS web_pages (
            id INTEGER PRIMARY KEY,
            ghcid VARCHAR,
            page_title VARCHAR,
            source_path VARCHAR,
            archived_file VARCHAR,
            extractions_count INTEGER,
            FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid)
        )
    """)
    con.execute("""
        CREATE TABLE IF NOT EXISTS web_claims (
            id INTEGER PRIMARY KEY,
            ghcid VARCHAR,
            claim_id VARCHAR,
            claim_type VARCHAR,
            text_content VARCHAR,
            hypernym VARCHAR,
            hyponym VARCHAR,
            class_uri VARCHAR,
            xpath VARCHAR,
            recognition_confidence FLOAT,
            linking_confidence FLOAT,
            wikidata_id VARCHAR,
            FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid)
        )
    """)


def _parse_archive_timestamp(value: Any) -> Any:
    """
    Normalise the ``archive_timestamp`` metadata value for insertion.

    Strings are parsed as ISO-8601 (a trailing "Z" is rewritten to "+00:00").
    Non-string values are passed through unchanged: the YAML loader already
    converts unquoted timestamps to datetime/date objects, and calling
    ``str.replace`` on those would raise.

    Raises:
        ValueError: if a string value is not valid ISO-8601.
    """
    if not value:
        return None
    if isinstance(value, str):
        return datetime.fromisoformat(value.replace('Z', '+00:00'))
    return value


def _insert_claims(con, ghcid: str, annotations_path: Path, claim_id_counter: int) -> int:
    """
    Insert entity and aggregate claims from an annotations file.

    Processing is best effort: the first error stops the remaining claims of
    this file and is logged, but never propagates.

    Returns:
        The updated claim id counter (ids consumed by failed inserts are not
        reused).
    """
    try:
        with open(annotations_path, 'r', encoding='utf-8') as f:
            annotations = yaml.safe_load(f) or {}

        # `or {}` / `or []` also cover keys that are present but null.
        session = annotations.get('session') or {}
        claims = session.get('claims') or {}

        # Process entity claims
        for claim in claims.get('entity') or []:
            claim_id_counter += 1
            provenance = claim.get('provenance') or {}
            con.execute(_INSERT_CLAIM_SQL, [
                claim_id_counter,
                ghcid,
                claim.get('claim_id'),
                claim.get('claim_type'),
                claim.get('text_content'),
                claim.get('hypernym'),
                claim.get('hyponym'),
                claim.get('class_uri'),
                provenance.get('path'),
                claim.get('recognition_confidence', 0),
                claim.get('linking_confidence', 0),
                claim.get('wikidata_id'),
            ])

        # Process aggregate claims (no entity typing / linking columns)
        for claim in claims.get('aggregate') or []:
            claim_id_counter += 1
            provenance = claim.get('provenance') or {}
            con.execute(_INSERT_CLAIM_SQL, [
                claim_id_counter,
                ghcid,
                claim.get('claim_id'),
                claim.get('claim_type'),
                claim.get('text_content'),
                None,
                None,
                None,
                provenance.get('path'),
                provenance.get('confidence', 0),
                0,
                None,
            ])
    except Exception as e:
        logger.warning(f"Error processing annotations for {ghcid}: {e}")

    return claim_id_counter


def build_ducklake_database(mapping: Dict[int, str]) -> None:
    """
    Create DuckDB database with web archive metadata for DuckLake.

    Existing rows are deleted and the tables are rebuilt from the archive
    folders on disk. Folders without a GHCID mapping or without a
    metadata.yaml are skipped.

    Tables:
    - web_archives: Archive metadata (ghcid, url, timestamp, stats)
    - web_pages: Individual pages with extraction counts
    - web_claims: Extracted claims/entities from annotations

    Note: web_archives is keyed on ghcid, so a second domain folder (or a
    second entry) resolving to the same GHCID is rejected by the database;
    this is logged as a warning and that archive is skipped.

    Args:
        mapping: entry_index -> GHCID mapping.
    """
    if not HAS_DUCKDB:
        logger.error("DuckDB not installed. Cannot build DuckLake database.")
        return

    DUCKLAKE_DB.parent.mkdir(parents=True, exist_ok=True)
    con = duckdb.connect(str(DUCKLAKE_DB))

    # try/finally so the database file is released even on unexpected errors.
    try:
        _create_ducklake_tables(con)

        # Clear existing data (child tables first because of the foreign keys)
        con.execute("DELETE FROM web_claims")
        con.execute("DELETE FROM web_pages")
        con.execute("DELETE FROM web_archives")

        page_id = 0
        claim_id_counter = 0

        web_folders = get_web_archive_folders()
        logger.info(f"Processing {len(web_folders)} web archive folders for DuckLake...")

        for folder in web_folders:
            entry_index = int(folder.name)
            ghcid = mapping.get(entry_index)
            if not ghcid:
                logger.debug(f"No GHCID mapping for entry {entry_index}")
                continue

            # Find domain folder(s)
            domain_folders = [d for d in folder.iterdir() if d.is_dir()]
            for domain_folder in domain_folders:
                metadata_path = domain_folder / "metadata.yaml"
                if not metadata_path.exists():
                    continue

                metadata = parse_metadata(metadata_path)
                if not metadata:
                    continue

                # Check for annotations
                annotations_path = domain_folder / "annotations_v1.7.0.yaml"
                has_annotations = annotations_path.exists()

                # Parse warc info (`or {}` covers an explicit null value)
                warc_info = metadata.get('warc') or {}

                # Insert archive record
                try:
                    archive_ts = _parse_archive_timestamp(metadata.get('archive_timestamp'))
                    con.execute("""
                        INSERT INTO web_archives VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """, [
                        ghcid,
                        entry_index,
                        domain_folder.name,
                        metadata.get('url'),
                        archive_ts,
                        metadata.get('archive_method'),
                        metadata.get('total_pages', 0),
                        metadata.get('processed_pages', 0),
                        warc_info.get('warc_file'),
                        warc_info.get('warc_size_bytes', 0),
                        has_annotations,
                    ])
                except Exception as e:
                    # Without the parent row the pages/claims would violate
                    # their foreign key, so skip the whole archive.
                    logger.warning(f"Error inserting archive {ghcid}: {e}")
                    continue

                # Insert pages
                for page in metadata.get('pages') or []:
                    page_id += 1
                    try:
                        con.execute("""
                            INSERT INTO web_pages VALUES (?, ?, ?, ?, ?, ?)
                        """, [
                            page_id,
                            ghcid,
                            page.get('title'),
                            page.get('source_path'),
                            page.get('archived_file'),
                            page.get('extractions_count', 0),
                        ])
                    except Exception as e:
                        logger.warning(f"Error inserting page: {e}")

                # Insert claims from annotations
                if has_annotations:
                    claim_id_counter = _insert_claims(
                        con, ghcid, annotations_path, claim_id_counter
                    )

        # Create indices
        con.execute("CREATE INDEX IF NOT EXISTS idx_pages_ghcid ON web_pages(ghcid)")
        con.execute("CREATE INDEX IF NOT EXISTS idx_claims_ghcid ON web_claims(ghcid)")
        con.execute("CREATE INDEX IF NOT EXISTS idx_claims_type ON web_claims(claim_type)")
        con.execute("CREATE INDEX IF NOT EXISTS idx_claims_hypernym ON web_claims(hypernym)")

        # Get stats
        archive_count = con.execute("SELECT COUNT(*) FROM web_archives").fetchone()[0]
        page_count = con.execute("SELECT COUNT(*) FROM web_pages").fetchone()[0]
        claim_count = con.execute("SELECT COUNT(*) FROM web_claims").fetchone()[0]
    finally:
        con.close()

    logger.info(f"DuckLake database created at: {DUCKLAKE_DB}")
    logger.info(f"  - Archives: {archive_count}")
    logger.info(f"  - Pages: {page_count}")
    logger.info(f"  - Claims: {claim_count}")


def main() -> None:
    """Parse command-line flags and run the requested mode."""
    parser = argparse.ArgumentParser(description="Migrate web archives to custodian folders")
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without executing')
    parser.add_argument('--execute', action='store_true', help='Actually migrate files')
    parser.add_argument('--build-ducklake', action='store_true', help='Build DuckDB database only')
    parser.add_argument('--build-mapping', action='store_true', help='Just build and show mapping')
    args = parser.parse_args()

    if not any([args.dry_run, args.execute, args.build_ducklake, args.build_mapping]):
        parser.print_help()
        sys.exit(1)

    # Build the mapping
    mapping = build_entry_index_to_ghcid_mapping()

    if args.build_mapping:
        print(f"\nMapping has {len(mapping)} entries")
        print("\nSample entries:")
        for entry_idx, ghcid in sorted(mapping.items())[:20]:
            print(f"  {entry_idx:04d} -> {ghcid}")
        return

    if args.build_ducklake:
        build_ducklake_database(mapping)
        return

    # Migration mode. Files are only copied when --execute is given; derive
    # the flag once so the log lines and the summary label cannot disagree.
    dry_run = not args.execute
    if args.dry_run and args.execute:
        logger.warning("Both --dry-run and --execute given; --execute takes effect")

    web_folders = get_web_archive_folders()
    logger.info(f"Found {len(web_folders)} web archive folders")

    migrated = 0
    skipped = 0
    no_mapping = 0

    for folder in web_folders:
        entry_index = int(folder.name)
        ghcid = mapping.get(entry_index)
        if not ghcid:
            logger.debug(f"No GHCID for entry {entry_index}")
            no_mapping += 1
            continue

        if migrate_web_archive(folder, ghcid, dry_run=dry_run):
            migrated += 1
        else:
            skipped += 1

    print(f"\n{'[DRY-RUN] ' if dry_run else ''}Migration summary:")
    print(f"  - Migrated: {migrated}")
    print(f"  - Skipped: {skipped}")
    print(f"  - No mapping: {no_mapping}")


if __name__ == '__main__':
    main()