glam/scripts/migrate_web_archives.py
kempersc e45c1a3c85 feat(scripts): add city enrichment and location resolution utilities
Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
2025-12-07 14:26:59 +01:00

426 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Migrate web archives from /data/nde/enriched/entries/web/ to /data/custodian/{GHCID}/web/
This script:
1. Builds a mapping from entry_index -> GHCID by scanning custodian files
2. Moves (or symlinks) web archive folders to the appropriate custodian folder
3. Creates a DuckDB database with web archive metadata for DuckLake ingestion
Usage:
python scripts/migrate_web_archives.py --dry-run # Preview changes
python scripts/migrate_web_archives.py --execute # Actually migrate
python scripts/migrate_web_archives.py --build-ducklake # Create DuckDB tables
"""
import os
import sys
import re
import yaml
import shutil
import argparse
import logging
from pathlib import Path
from datetime import datetime
from typing import Dict, Optional, List, Any
import json
# Try to import duckdb for DuckLake ingestion.  The move/symlink migration
# works without it; only the --build-ducklake mode needs the package.
try:
    import duckdb
    HAS_DUCKDB = True
except ImportError:
    HAS_DUCKDB = False
    print("Warning: duckdb not installed. DuckLake ingestion disabled.")

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Paths
# NOTE(review): BASE_DIR is hard-coded to the author's machine — adjust (or
# make configurable) before running elsewhere.
BASE_DIR = Path("/Users/kempersc/apps/glam")
# One YAML file (and target folder) per custodian, named by GHCID.
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
# Source archives: one zero-padded numeric folder per entry index.
WEB_ARCHIVE_SOURCE = BASE_DIR / "data" / "nde" / "enriched" / "entries" / "web"
# Output DuckDB database file for DuckLake ingestion.
DUCKLAKE_DB = BASE_DIR / "data" / "ducklake" / "web_archives.duckdb"
# Pre-built "entry_index GHCID" mapping, one pair per line.
MAPPING_FILE = WEB_ARCHIVE_SOURCE / "_entry_to_ghcid.txt"
def build_entry_index_to_ghcid_mapping() -> Dict[int, str]:
    """
    Load the entry_index -> GHCID mapping.

    Prefers the pre-built mapping file (created via ripgrep for speed) and
    falls back to scanning every custodian YAML file if it doesn't exist.

    Returns:
        Dict mapping entry_index (int) to GHCID (str, e.g., "NL-GE-GEN-S-HKG")
    """
    mapping: Dict[int, str] = {}
    # Fast path: pre-built file with "NNNN GHCID" lines.
    if MAPPING_FILE.exists():
        logger.info(f"Loading mapping from {MAPPING_FILE}")
        # FIX: explicit utf-8, consistent with every other text open in this
        # script (previously relied on the locale's default encoding).
        with open(MAPPING_FILE, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split(' ', 1)
                # Skip malformed lines and non-numeric indices.
                if len(parts) == 2 and parts[0].isdigit():
                    mapping[int(parts[0])] = parts[1]
        logger.info(f"Loaded {len(mapping)} entries from mapping file")
        return mapping
    # Slow path: scan each custodian YAML for an 'entry_index' field.
    logger.info("Mapping file not found, scanning custodian files...")
    custodian_files = list(CUSTODIAN_DIR.glob("*.yaml"))
    logger.info(f"Scanning {len(custodian_files)} custodian files...")
    for filepath in custodian_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if data and 'entry_index' in data:
                entry_index = data['entry_index']
                if isinstance(entry_index, int):
                    # The filename stem is the GHCID, e.g. "NL-GE-GEN-S-HKG".
                    mapping[entry_index] = filepath.stem
        except Exception as e:
            # Unreadable/invalid files are skipped; logged at DEBUG only.
            logger.debug(f"Error reading {filepath}: {e}")
            continue
    logger.info(f"Built mapping for {len(mapping)} entries with entry_index")
    return mapping
def get_web_archive_folders() -> List[Path]:
    """Return the numeric entry folders under WEB_ARCHIVE_SOURCE, ordered by entry number."""
    numeric_dirs = [
        entry for entry in WEB_ARCHIVE_SOURCE.iterdir()
        if entry.is_dir() and entry.name.isdigit()
    ]
    numeric_dirs.sort(key=lambda d: int(d.name))
    return numeric_dirs
def parse_metadata(metadata_path: Path) -> Optional[Dict[str, Any]]:
    """Load a web-archive metadata.yaml; log and return None on any read/parse error."""
    try:
        with open(metadata_path, 'r', encoding='utf-8') as fh:
            parsed = yaml.safe_load(fh)
    except Exception as exc:
        logger.error(f"Failed to parse {metadata_path}: {exc}")
        return None
    return parsed
def migrate_web_archive(source_folder: Path, ghcid: str, dry_run: bool = True) -> bool:
    """
    Copy every domain subfolder of one web-archive entry into the
    custodian's web/ directory.

    Args:
        source_folder: Path to source web archive (e.g., .../web/0183/historischekringgente.nl/)
        ghcid: Target GHCID (e.g., "NL-GE-GEN-S-HKG")
        dry_run: If True, only preview changes

    Returns:
        True if successful
    """
    destination_root = CUSTODIAN_DIR / ghcid / "web"
    # Each entry folder holds one or more per-domain subfolders.
    subdirs = [entry for entry in source_folder.iterdir() if entry.is_dir()]
    if not subdirs:
        logger.warning(f"No domain folders in {source_folder}")
        return False
    for domain_dir in subdirs:
        destination = destination_root / domain_dir.name
        if dry_run:
            logger.info(f"[DRY-RUN] Would migrate: {domain_dir} -> {destination}")
            continue
        try:
            destination_root.mkdir(parents=True, exist_ok=True)
            if destination.exists():
                # Never clobber a previous migration; skip this domain.
                logger.warning(f"Target already exists: {destination}")
                continue
            shutil.copytree(domain_dir, destination)
            logger.info(f"Migrated: {domain_dir} -> {destination}")
        except Exception as exc:
            # Abort this entry on the first failed copy.
            logger.error(f"Failed to migrate {domain_dir}: {exc}")
            return False
    return True
def build_ducklake_database(mapping: Dict[int, str]):
    """
    Create DuckDB database with web archive metadata for DuckLake.

    Rebuilds the database from scratch on every run: existing rows are
    deleted before re-insertion (schema is kept via CREATE TABLE IF NOT
    EXISTS).

    Tables:
    - web_archives: Archive metadata (ghcid, url, timestamp, stats)
    - web_pages: Individual pages with extraction counts
    - web_claims: Extracted claims/entities from annotations

    Args:
        mapping: entry_index -> GHCID, as produced by
            build_entry_index_to_ghcid_mapping().
    """
    if not HAS_DUCKDB:
        logger.error("DuckDB not installed. Cannot build DuckLake database.")
        return
    DUCKLAKE_DB.parent.mkdir(parents=True, exist_ok=True)
    con = duckdb.connect(str(DUCKLAKE_DB))
    # Create tables
    con.execute("""
        CREATE TABLE IF NOT EXISTS web_archives (
            ghcid VARCHAR PRIMARY KEY,
            entry_index INTEGER,
            domain VARCHAR,
            url VARCHAR,
            archive_timestamp TIMESTAMP,
            archive_method VARCHAR,
            total_pages INTEGER,
            processed_pages INTEGER,
            warc_file VARCHAR,
            warc_size_bytes BIGINT,
            has_annotations BOOLEAN DEFAULT FALSE
        )
    """)
    con.execute("""
        CREATE TABLE IF NOT EXISTS web_pages (
            id INTEGER PRIMARY KEY,
            ghcid VARCHAR,
            page_title VARCHAR,
            source_path VARCHAR,
            archived_file VARCHAR,
            extractions_count INTEGER,
            FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid)
        )
    """)
    con.execute("""
        CREATE TABLE IF NOT EXISTS web_claims (
            id INTEGER PRIMARY KEY,
            ghcid VARCHAR,
            claim_id VARCHAR,
            claim_type VARCHAR,
            text_content VARCHAR,
            hypernym VARCHAR,
            hyponym VARCHAR,
            class_uri VARCHAR,
            xpath VARCHAR,
            recognition_confidence FLOAT,
            linking_confidence FLOAT,
            wikidata_id VARCHAR,
            FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid)
        )
    """)
    # Clear existing data — children first so the FK references to
    # web_archives are never dangling during the deletes.
    con.execute("DELETE FROM web_claims")
    con.execute("DELETE FROM web_pages")
    con.execute("DELETE FROM web_archives")
    # Hand-rolled surrogate keys for web_pages.id and web_claims.id.
    page_id = 0
    claim_id_counter = 0
    web_folders = get_web_archive_folders()
    logger.info(f"Processing {len(web_folders)} web archive folders for DuckLake...")
    for folder in web_folders:
        entry_index = int(folder.name)
        ghcid = mapping.get(entry_index)
        if not ghcid:
            # Entries without a GHCID mapping are skipped entirely.
            logger.debug(f"No GHCID mapping for entry {entry_index}")
            continue
        # Find domain folder (one or more per entry).
        domain_folders = [d for d in folder.iterdir() if d.is_dir()]
        for domain_folder in domain_folders:
            metadata_path = domain_folder / "metadata.yaml"
            if not metadata_path.exists():
                continue
            metadata = parse_metadata(metadata_path)
            if not metadata:
                continue
            # Check for annotations (pinned to annotator schema v1.7.0).
            annotations_path = domain_folder / "annotations_v1.7.0.yaml"
            has_annotations = annotations_path.exists()
            # Parse warc info
            warc_info = metadata.get('warc', {})
            # Insert archive record
            try:
                archive_ts = metadata.get('archive_timestamp')
                if archive_ts:
                    # Assumes archive_timestamp is an ISO-8601 *string*
                    # ("Z" normalized to "+00:00").  NOTE(review): if the
                    # YAML loader already parsed it into a datetime,
                    # .replace() raises here and the whole archive row is
                    # skipped by the except below — confirm against the
                    # metadata files.
                    archive_ts = datetime.fromisoformat(archive_ts.replace('Z', '+00:00'))
                con.execute("""
                    INSERT INTO web_archives VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, [
                    ghcid,
                    entry_index,
                    domain_folder.name,
                    metadata.get('url'),
                    archive_ts,
                    metadata.get('archive_method'),
                    metadata.get('total_pages', 0),
                    metadata.get('processed_pages', 0),
                    warc_info.get('warc_file'),
                    warc_info.get('warc_size_bytes', 0),
                    has_annotations
                ])
            except Exception as e:
                # NOTE: logged at DEBUG only, so invisible at the INFO level
                # configured at module import — failed rows vanish silently.
                logger.debug(f"Error inserting archive {ghcid}: {e}")
                continue
            # Insert pages
            for page in metadata.get('pages', []):
                page_id += 1
                try:
                    con.execute("""
                        INSERT INTO web_pages VALUES (?, ?, ?, ?, ?, ?)
                    """, [
                        page_id,
                        ghcid,
                        page.get('title'),
                        page.get('source_path'),
                        page.get('archived_file'),
                        page.get('extractions_count', 0)
                    ])
                except Exception as e:
                    logger.debug(f"Error inserting page: {e}")
            # Insert claims from annotations
            if has_annotations:
                try:
                    with open(annotations_path, 'r', encoding='utf-8') as f:
                        annotations = yaml.safe_load(f)
                    session = annotations.get('session', {})
                    claims = session.get('claims', {})
                    # Process entity claims
                    for claim in claims.get('entity', []):
                        claim_id_counter += 1
                        provenance = claim.get('provenance', {})
                        con.execute("""
                            INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        """, [
                            claim_id_counter,
                            ghcid,
                            claim.get('claim_id'),
                            claim.get('claim_type'),
                            claim.get('text_content'),
                            claim.get('hypernym'),
                            claim.get('hyponym'),
                            claim.get('class_uri'),
                            provenance.get('path'),
                            claim.get('recognition_confidence', 0),
                            claim.get('linking_confidence', 0),
                            claim.get('wikidata_id')
                        ])
                    # Process aggregate claims.  Aggregates have no
                    # hypernym/hyponym/class_uri/wikidata_id (stored NULL);
                    # the recognition_confidence column carries the
                    # provenance confidence instead.
                    for claim in claims.get('aggregate', []):
                        claim_id_counter += 1
                        provenance = claim.get('provenance', {})
                        con.execute("""
                            INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        """, [
                            claim_id_counter,
                            ghcid,
                            claim.get('claim_id'),
                            claim.get('claim_type'),
                            claim.get('text_content'),
                            None,
                            None,
                            None,
                            provenance.get('path'),
                            provenance.get('confidence', 0),
                            0,
                            None
                        ])
                except Exception as e:
                    logger.debug(f"Error processing annotations for {ghcid}: {e}")
    # Create indices
    con.execute("CREATE INDEX IF NOT EXISTS idx_pages_ghcid ON web_pages(ghcid)")
    con.execute("CREATE INDEX IF NOT EXISTS idx_claims_ghcid ON web_claims(ghcid)")
    con.execute("CREATE INDEX IF NOT EXISTS idx_claims_type ON web_claims(claim_type)")
    con.execute("CREATE INDEX IF NOT EXISTS idx_claims_hypernym ON web_claims(hypernym)")
    # Get stats
    archive_count = con.execute("SELECT COUNT(*) FROM web_archives").fetchone()[0]
    page_count = con.execute("SELECT COUNT(*) FROM web_pages").fetchone()[0]
    claim_count = con.execute("SELECT COUNT(*) FROM web_claims").fetchone()[0]
    con.close()
    logger.info(f"DuckLake database created at: {DUCKLAKE_DB}")
    logger.info(f" - Archives: {archive_count}")
    logger.info(f" - Pages: {page_count}")
    logger.info(f" - Claims: {claim_count}")
def main():
    """CLI entry point: build mapping, build DuckLake DB, or migrate archives."""
    parser = argparse.ArgumentParser(description="Migrate web archives to custodian folders")
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without executing')
    parser.add_argument('--execute', action='store_true', help='Actually migrate files')
    parser.add_argument('--build-ducklake', action='store_true', help='Build DuckDB database only')
    parser.add_argument('--build-mapping', action='store_true', help='Just build and show mapping')
    args = parser.parse_args()
    if not any([args.dry_run, args.execute, args.build_ducklake, args.build_mapping]):
        parser.print_help()
        sys.exit(1)
    # Build the mapping (needed by every mode).
    mapping = build_entry_index_to_ghcid_mapping()
    if args.build_mapping:
        print(f"\nMapping has {len(mapping)} entries")
        print("\nSample entries:")
        # Show the first 20 entries ordered by entry_index.
        # (FIX: dropped the unused enumerate() index.)
        for entry_idx, ghcid in sorted(mapping.items())[:20]:
            print(f" {entry_idx:04d} -> {ghcid}")
        return
    if args.build_ducklake:
        build_ducklake_database(mapping)
        return
    # Migration mode: anything other than --execute is a dry run.
    dry_run = not args.execute
    web_folders = get_web_archive_folders()
    logger.info(f"Found {len(web_folders)} web archive folders")
    migrated = 0
    skipped = 0
    no_mapping = 0
    for folder in web_folders:
        entry_index = int(folder.name)
        ghcid = mapping.get(entry_index)
        if not ghcid:
            logger.debug(f"No GHCID for entry {entry_index}")
            no_mapping += 1
            continue
        if migrate_web_archive(folder, ghcid, dry_run=dry_run):
            migrated += 1
        else:
            skipped += 1
    # FIX: the summary label previously keyed off args.dry_run while the
    # migration itself ran with dry_run=not args.execute, so passing both
    # flags (or just --execute) could mislabel the summary.  Use the same
    # dry_run value the migration used.
    print(f"\n{'[DRY-RUN] ' if dry_run else ''}Migration summary:")
    print(f" - Migrated: {migrated}")
    print(f" - Skipped: {skipped}")
    print(f" - No mapping: {no_mapping}")
# Standard script entry guard: run the CLI only when executed directly.
if __name__ == '__main__':
    main()