#!/usr/bin/env python3
"""
Migrate GHCID files from Q-number suffix convention to snake_case name convention.

This script:
1. Finds files with Q-number suffixes (e.g., LB-BA-BEI-A-AIF-Q4783187.yaml)
2. Extracts the institution name
3. Generates snake_case suffix from the native name
4. Updates GHCID, regenerates UUIDs
5. Renames files
"""

import argparse
import os
import re
import uuid
import hashlib
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# NOTE: PyYAML is imported lazily inside process_file() so that the pure
# identifier helpers below stay importable without the third-party package.

# UUID v5 namespace for GHCID (same as in other scripts)
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")  # DNS namespace

# Fallback 'valid_from' for the history entry when the file carries no
# provenance.generated_at timestamp.
DEFAULT_VALID_FROM = '2025-12-06T00:00:00Z'

# <base GHCID>-Q<digits>.yaml; group 1 is the base GHCID, group 2 the digits.
Q_SUFFIX_PATTERN = re.compile(r'^(.+)-Q(\d+)\.yaml$')


def generate_name_suffix(native_name: str) -> str:
    """Convert native language institution name to snake_case suffix.

    Diacritics are stripped, the text is lowercased, whitespace and hyphens
    become underscores, and every character outside ``[a-z0-9_]`` is dropped.

    Args:
        native_name: Institution name as stored in the source file.

    Returns:
        The snake_case suffix. This is the empty string when the name has no
        ASCII letters or digits left after normalization (for example a name
        written entirely in a non-Latin script); callers must handle that.
    """
    # Normalize unicode (NFD decomposition) and remove diacritics
    normalized = unicodedata.normalize('NFD', native_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')

    # Convert to lowercase
    lowercase = ascii_name.lower()

    # Remove apostrophes, commas, and other punctuation
    no_punct = re.sub(r"[''`\",.:;!?()[\]{}]", '', lowercase)

    # Replace spaces and hyphens with underscores
    underscored = re.sub(r'[\s\-]+', '_', no_punct)

    # Remove any remaining non-alphanumeric characters (except underscores)
    clean = re.sub(r'[^a-z0-9_]', '', underscored)

    # Collapse multiple underscores and trim them from both ends
    return re.sub(r'_+', '_', clean).strip('_')


def generate_uuid_v5(ghcid_string: str) -> str:
    """Generate UUID v5 from GHCID string (namespace: GHCID_NAMESPACE)."""
    return str(uuid.uuid5(GHCID_NAMESPACE, ghcid_string))


def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Generate UUID v8 (SHA-256 based) from GHCID string.

    The first 16 bytes of the SHA-256 digest of the UTF-8 encoded string are
    used, with the version nibble forced to 8 and the variant bits to ``10``.
    """
    hash_bytes = hashlib.sha256(ghcid_string.encode('utf-8')).digest()

    # Take first 16 bytes and set version 8 and variant bits
    uuid_bytes = bytearray(hash_bytes[:16])
    uuid_bytes[6] = (uuid_bytes[6] & 0x0F) | 0x80  # Version 8
    uuid_bytes[8] = (uuid_bytes[8] & 0x3F) | 0x80  # Variant

    return str(uuid.UUID(bytes=bytes(uuid_bytes)))


def generate_numeric_id(ghcid_string: str) -> int:
    """Generate 64-bit numeric ID from GHCID string.

    Uses the first 8 bytes (big-endian) of the SHA-256 digest with the top
    bit cleared, so the result is always in ``[0, 2**63)``.
    """
    hash_bytes = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(hash_bytes[:8], 'big') & ((1 << 63) - 1)  # 63-bit positive


def process_file(filepath: Path, dry_run: bool = True) -> Optional[Path]:
    """Process a single Q-number file.

    Args:
        filepath: Path of a ``<base>-Q<digits>.yaml`` file.
        dry_run: When true (the default) only report what would happen;
            nothing is written, renamed or deleted.

    Returns:
        The path of the newly written file after a real migration, or
        ``None`` for a dry run and for every file that is skipped.
    """
    filename = filepath.name

    # Extract base GHCID and Q-number
    match = Q_SUFFIX_PATTERN.match(filename)
    if not match:
        print(f" SKIP: {filename} (doesn't match Q-number pattern)")
        return None

    base_ghcid = match.group(1)
    q_number = match.group(2)

    # Imported here rather than at module level; see the note near the imports.
    import yaml

    # Load YAML
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load() returns None for an empty document and may return a list or
    # scalar; only a mapping can hold the fields read below.
    if not isinstance(data, dict):
        print(f" ERROR: {filename} does not contain a YAML mapping")
        return None

    # Get institution name (the key may be missing, null, or not a mapping)
    entry = data.get('original_entry')
    name = entry.get('name', '') if isinstance(entry, dict) else ''
    if not name or not isinstance(name, str):
        print(f" ERROR: {filename} has no name field")
        return None

    # Generate snake_case suffix
    name_suffix = generate_name_suffix(name)
    if not name_suffix:
        # e.g. a name written entirely in a non-Latin script: migrating would
        # produce the GHCID "<base>-" and the file "<base>-.yaml".
        print(f" ERROR: {filename} name {name!r} produces an empty suffix")
        return None

    # New GHCID
    new_ghcid = f"{base_ghcid}-{name_suffix}"
    old_ghcid = f"{base_ghcid}-Q{q_number}"

    print(f"\n File: {filename}")
    print(f" Name: {name}")
    print(f" Old GHCID: {old_ghcid}")
    print(f" New GHCID: {new_ghcid}")

    # Generate new UUIDs
    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)

    print(f" New UUID v5: {new_uuid_v5}")
    print(f" New UUID v8: {new_uuid_v8}")
    print(f" New Numeric: {new_numeric}")

    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    # Never clobber an existing record. This also covers case-insensitive
    # filesystems, where the "new" path could resolve to the source file and
    # the unlink below would then delete the only copy.
    if new_filepath.exists():
        print(f" ERROR: {new_filename} already exists; refusing to overwrite")
        return None

    if dry_run:
        print(f" New filename would be: {new_ghcid}.yaml")
        return None

    # Update GHCID section
    ghcid = data.get('ghcid')
    if isinstance(ghcid, dict):
        # Add to history (the key may be absent or explicitly null)
        history = ghcid.get('ghcid_history')
        if history is None:
            history = ghcid['ghcid_history'] = []

        provenance = data.get('provenance')
        valid_from = (
            provenance.get('generated_at', DEFAULT_VALID_FROM)
            if isinstance(provenance, dict)
            else DEFAULT_VALID_FROM
        )

        # Record old GHCID in history
        now = datetime.now(timezone.utc).isoformat()
        history.append({
            'ghcid': old_ghcid,
            'ghcid_uuid': ghcid.get('ghcid_uuid'),
            'ghcid_uuid_sha256': ghcid.get('ghcid_uuid_sha256'),
            'ghcid_numeric': ghcid.get('ghcid_numeric'),
            'valid_from': valid_from,
            'valid_to': now,
            'reason': 'Migrated from Q-number suffix to snake_case name suffix convention',
        })

        # Update current GHCID
        ghcid['ghcid_current'] = new_ghcid
        ghcid['ghcid_original'] = base_ghcid  # Store base without suffix
        ghcid['ghcid_uuid'] = new_uuid_v5
        ghcid['ghcid_uuid_sha256'] = new_uuid_v8
        ghcid['ghcid_numeric'] = new_numeric
    else:
        # As before, the file is still renamed; only the metadata update is
        # skipped because there is no 'ghcid' mapping to update.
        print(f" WARNING: {filename} has no ghcid section; renaming only")

    # Write updated data
    with open(new_filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Delete old file only after the new one has been written
    if new_filepath != filepath:
        filepath.unlink()

    print(f" MIGRATED: {filename} → {new_filename}")
    return new_filepath


def main():
    """Command-line entry point: find Q-number files and migrate them.

    Runs as a dry run unless ``--execute`` is given.
    """
    parser = argparse.ArgumentParser(description='Migrate Q-number GHCID files to snake_case')
    parser.add_argument('--execute', action='store_true',
                        help='Actually perform migration (default is dry run)')
    parser.add_argument('--path', default='/Users/kempersc/apps/glam/data/custodian',
                        help='Path to custodian files')
    args = parser.parse_args()

    custodian_dir = Path(args.path)

    # Find Q-number files
    q_files = list(custodian_dir.glob('*-Q[0-9]*.yaml'))

    # Filter to only those with Q-number suffix (not Q as part of abbreviation)
    q_suffix_files = [f for f in q_files if Q_SUFFIX_PATTERN.match(f.name)]

    print(f"Found {len(q_suffix_files)} files with Q-number suffixes")

    if args.execute:
        print("\n=== EXECUTING MIGRATION ===")
    else:
        print("\n=== DRY RUN (use --execute to actually migrate) ===")

    migrated = 0
    for filepath in sorted(q_suffix_files):
        result = process_file(filepath, dry_run=not args.execute)
        if result:
            migrated += 1

    if args.execute:
        print(f"\n=== MIGRATION COMPLETE: {migrated} files migrated ===")
    else:
        print(f"\n=== DRY RUN COMPLETE: {len(q_suffix_files)} files would be migrated ===")


if __name__ == '__main__':
    main()