#!/usr/bin/env python3
"""
Migrate web directories from index-based naming to GHCID-based naming.

This script reads the _entry_to_ghcid.txt mapping file and renames directories
from numeric indices (e.g., 0002, 0681) to their corresponding GHCID names
(e.g., NL-DR-ASS-A-DA, NL-NH-HEE-S-HKH).

Usage:
    python scripts/migrate_web_dirs_to_ghcid.py [--dry-run] [--verbose]

Options:
    --dry-run    Show what would be done without making changes
    --verbose    Show detailed progress information
"""

import os
import sys
import argparse
from pathlib import Path
from collections import defaultdict


def parse_mapping_file(mapping_path: Path) -> dict[str, str]:
    """Parse the entry-to-GHCID mapping file.

    Each non-blank line is expected to hold an index and a GHCID separated by
    whitespace. Malformed lines and lines with a non-numeric or negative index
    are reported on stdout and skipped.

    Args:
        mapping_path: Path of the mapping file to read.

    Returns:
        Dict mapping each index to its GHCID. Every index is stored under two
        keys: the unpadded form ("2") and the 4-digit zero-padded form
        ("0002"). The two forms coincide for indices >= 1000. For duplicate
        indices, the first mapping encountered wins.
    """
    mapping: dict[str, str] = {}

    # utf-8-sig reads plain UTF-8 too, but strips a BOM that would otherwise
    # make the first index fail to parse.
    with open(mapping_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                print(f"Warning: Skipping malformed line: {line}")
                continue

            index_str, ghcid = parts

            # Handle both "2" and "0002" style indices.
            try:
                index_int = int(index_str)
            except ValueError:
                index_int = -1
            if index_int < 0:
                print(f"Warning: Invalid index '{index_str}' in line: {line}")
                continue

            index_normalized = str(index_int)

            # Skip duplicates - use first mapping. The unpadded key is always
            # stored, so its presence marks an index as already seen.
            if index_normalized in mapping:
                continue

            mapping[index_normalized] = ghcid
            mapping[f"{index_int:04d}"] = ghcid  # Also store padded version

    return mapping


def find_index_directories(web_dir: Path) -> list[Path]:
    """Find all directories directly under *web_dir* with numeric names.

    Only names made of ASCII digits qualify (zero-padded names such as "0002"
    included). Characters like "²" satisfy ``str.isdigit()`` but cannot be
    converted by ``int()``, so they are excluded explicitly.

    Args:
        web_dir: Directory whose immediate children are inspected.

    Returns:
        Matching directories sorted by numeric value; names with the same
        value (e.g. "2" and "0002") are ordered by name so the result is
        deterministic.
    """
    index_dirs = [
        item
        for item in web_dir.iterdir()
        if item.name.isascii() and item.name.isdigit() and item.is_dir()
    ]
    return sorted(index_dirs, key=lambda p: (int(p.name), p.name))


def _is_safe_dir_name(name: str) -> bool:
    """Return True if *name* is a single path component that stays in place."""
    if name in ("", ".", "..") or "\0" in name:
        return False
    # A name containing a separator (or an absolute path) has a final
    # component that differs from the whole string.
    return Path(name).name == name


def migrate_directories(
    web_dir: Path,
    mapping: dict[str, str],
    dry_run: bool = False,
    verbose: bool = False
) -> tuple[int, int, int]:
    """Migrate directories from index to GHCID naming.

    Args:
        web_dir: Directory containing the index-named directories.
        mapping: Index-to-GHCID mapping as returned by parse_mapping_file.
        dry_run: If True, only report what would be renamed.
        verbose: If True, also report skipped and renamed directories.

    Returns:
        (success_count, skip_count, error_count). A directory is skipped when
        it has no mapping or its target already exists (or, in a dry run, was
        already claimed by an earlier directory). It counts as an error when
        the rename fails or the mapped name is not a plain directory name.
    """
    success_count = 0
    skip_count = 0
    error_count = 0

    # Targets "taken" during a dry run, so the dry-run report matches what a
    # real run would do when two directories map to the same GHCID.
    planned_targets: set[Path] = set()

    for dir_path in find_index_directories(web_dir):
        dir_name = dir_path.name

        # Try both unpadded and as-named (padded) versions.
        index_key = str(int(dir_name))
        ghcid = mapping.get(index_key) or mapping.get(dir_name)
        if ghcid is None:
            if verbose:
                print(f"Skip: No mapping for index {dir_name}")
            skip_count += 1
            continue

        # Never let a mapping entry move a directory outside web_dir.
        if not _is_safe_dir_name(ghcid):
            print(f"Error renaming {dir_name}: unsafe target name {ghcid!r}")
            error_count += 1
            continue

        new_path = web_dir / ghcid

        # Check if target already exists
        if new_path.exists() or new_path in planned_targets:
            if verbose:
                print(f"Skip: Target already exists: {ghcid}")
            skip_count += 1
            continue

        # Perform the rename
        if dry_run:
            print(f"Would rename: {dir_name} -> {ghcid}")
            planned_targets.add(new_path)
            success_count += 1
            continue

        try:
            dir_path.rename(new_path)
        except OSError as e:
            print(f"Error renaming {dir_name}: {e}")
            error_count += 1
        else:
            if verbose:
                print(f"Renamed: {dir_name} -> {ghcid}")
            success_count += 1

    return success_count, skip_count, error_count


def main():
    """Run the migration from the command line.

    Returns:
        Process exit code: 0 if no rename errors occurred, 1 otherwise.
        Exits with status 1 directly if the web directory or mapping file
        is missing.
    """
    parser = argparse.ArgumentParser(
        description="Migrate web directories from index to GHCID naming"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be done without making changes"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Show detailed progress information"
    )
    args = parser.parse_args()

    # Paths
    project_root = Path(__file__).parent.parent
    web_dir = project_root / "data" / "custodian" / "web"
    mapping_file = web_dir / "_entry_to_ghcid.txt"

    # Validate paths
    if not web_dir.exists():
        print(f"Error: Web directory not found: {web_dir}")
        sys.exit(1)

    if not mapping_file.exists():
        print(f"Error: Mapping file not found: {mapping_file}")
        sys.exit(1)

    # Parse mapping
    print(f"Reading mapping from: {mapping_file}")
    mapping = parse_mapping_file(mapping_file)
    # Each index is stored under a padded and an unpadded key, which coincide
    # for indices >= 1000, so count distinct numeric values rather than
    # halving the number of keys.
    unique_count = len({int(key) for key in mapping})
    print(f"Found {unique_count} unique index-to-GHCID mappings")

    # Find directories to migrate
    index_dirs = find_index_directories(web_dir)
    print(f"Found {len(index_dirs)} index-based directories")

    if args.dry_run:
        print("\n=== DRY RUN MODE ===\n")

    # Perform migration
    success, skip, error = migrate_directories(
        web_dir,
        mapping,
        dry_run=args.dry_run,
        verbose=args.verbose
    )

    # Summary
    print("\n=== Summary ===")
    print(f"Successfully {'would rename' if args.dry_run else 'renamed'}: {success}")
    print(f"Skipped (no mapping or target exists): {skip}")
    print(f"Errors: {error}")

    if args.dry_run and success > 0:
        print("\nRun without --dry-run to perform actual migration")

    return 0 if error == 0 else 1


if __name__ == "__main__":
    sys.exit(main())