glam/scripts/migrate_qnumber_to_snakecase.py
2025-12-07 00:26:01 +01:00

188 lines
6.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Migrate GHCID files from Q-number suffix convention to snake_case name convention.
This script:
1. Finds files with Q-number suffixes (e.g., LB-BA-BEI-A-AIF-Q4783187.yaml)
2. Extracts the institution name
3. Generates snake_case suffix from the native name
4. Updates GHCID, regenerates UUIDs
5. Renames files
"""
import os
import re
import uuid
import hashlib
import unicodedata
from pathlib import Path
import yaml
# Namespace under which all name-based (UUID v5) GHCID identifiers are derived.
# Per the original note, this is the same namespace used by the other GHCID
# scripts; the value is the standard DNS namespace UUID (uuid.NAMESPACE_DNS).
# Changing it would change every UUID v5 produced by generate_uuid_v5().
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")  # DNS namespace
def generate_name_suffix(native_name: str) -> str:
    """Build the snake_case GHCID suffix for an institution's native name.

    The name is NFD-decomposed and stripped of combining marks, lowercased,
    and then passed through a fixed sequence of regex substitutions:
    punctuation is dropped, whitespace/hyphen runs become one underscore,
    anything outside ``[a-z0-9_]`` is discarded and repeated underscores are
    collapsed. Leading and trailing underscores are trimmed at the end.

    Characters that have no ``a-z``/``0-9`` form after decomposition are
    discarded, so the result can be an empty string.
    """
    decomposed = unicodedata.normalize('NFD', native_name)
    # Category 'Mn' (nonspacing mark) is what NFD splits accents off into.
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    suffix = ''.join(base_chars).lower()

    # Applied strictly in this order; each step feeds the next.
    substitutions = (
        (r"[''`\",.:;!?()[\]{}]", ''),  # drop apostrophes, commas, brackets, ...
        (r'[\s\-]+', '_'),              # spaces and hyphens -> underscore
        (r'[^a-z0-9_]', ''),            # drop whatever else is left
        (r'_+', '_'),                   # collapse underscore runs
    )
    for pattern, replacement in substitutions:
        suffix = re.sub(pattern, replacement, suffix)

    return suffix.strip('_')
def generate_uuid_v5(ghcid_string: str) -> str:
    """Return the name-based (version 5) UUID of a GHCID, as a string.

    The UUID is derived from ``ghcid_string`` within ``GHCID_NAMESPACE``, so
    the same GHCID always maps to the same UUID.
    """
    ghcid_uuid = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(ghcid_uuid)
def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Return a version-8 style UUID string built from the GHCID's SHA-256.

    The first 16 bytes of the SHA-256 digest of the UTF-8 encoded GHCID are
    used as the UUID payload; the version nibble is forced to 8 and the two
    variant bits are forced to ``10``.
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    value = int.from_bytes(digest[:16], 'big')

    # Version: high nibble of byte 6, i.e. bits 76-79 of the 128-bit value.
    value &= ~(0xF << 76)
    value |= 0x8 << 76

    # Variant: top two bits of byte 8, i.e. bits 62-63 of the 128-bit value.
    value &= ~(0x3 << 62)
    value |= 0x2 << 62

    return str(uuid.UUID(int=value))
def generate_numeric_id(ghcid_string: str) -> int:
    """Return a non-negative numeric ID derived from the GHCID's SHA-256.

    The first 8 digest bytes are read as a big-endian integer and the top bit
    is cleared, so the result always fits in 63 bits (``0 <= id < 2**63``).
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    leading = int.from_bytes(digest[:8], byteorder='big')
    low_63_bits = (1 << 63) - 1
    return leading & low_63_bits
def process_file(filepath: Path, dry_run: bool = True):
    """Migrate one Q-number GHCID file to the snake_case name convention.

    The new GHCID is ``<base>-<snake_case name>``, where ``<base>`` is the
    filename without its ``-Q<digits>.yaml`` tail and the name comes from
    ``original_entry.name`` in the YAML document. Progress and errors are
    reported with ``print``.

    Args:
        filepath: YAML file named ``<base-ghcid>-Q<digits>.yaml``.
        dry_run: When true (the default) only report what would change;
            nothing is written, renamed or deleted.

    Returns:
        The path of the newly written file when a migration was performed.
        ``None`` for dry runs and for files that were skipped or rejected
        (pattern mismatch, unusable document, missing name, empty suffix,
        or a target filename that already exists).
    """
    from datetime import datetime, timezone

    filename = filepath.name

    # Split "<base>-Q<digits>.yaml" into the base GHCID and the Q-number.
    match = re.match(r'^(.+)-Q(\d+)\.yaml$', filename)
    if not match:
        print(f" SKIP: {filename} (doesn't match Q-number pattern)")
        return None
    base_ghcid, q_number = match.groups()

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty file and may return a list/scalar;
    # only a mapping can carry the fields read below.
    if not isinstance(data, dict):
        print(f" ERROR: {filename} is empty or not a YAML mapping")
        return None

    original_entry = data.get('original_entry')
    name = original_entry.get('name', '') if isinstance(original_entry, dict) else ''
    if not name or not isinstance(name, str):
        print(f" ERROR: {filename} has no name field")
        return None

    # A name without any a-z/0-9 characters reduces to '', which would give
    # the GHCID "<base>-" and the file "<base>-.yaml".
    name_suffix = generate_name_suffix(name)
    if not name_suffix:
        print(f" ERROR: {filename} name {name!r} yields an empty snake_case suffix")
        return None

    new_ghcid = f"{base_ghcid}-{name_suffix}"
    old_ghcid = f"{base_ghcid}-Q{q_number}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    print(f"\n File: {filename}")
    print(f" Name: {name}")
    print(f" Old GHCID: {old_ghcid}")
    print(f" New GHCID: {new_ghcid}")

    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)
    print(f" New UUID v5: {new_uuid_v5}")
    print(f" New UUID v8: {new_uuid_v8}")
    print(f" New Numeric: {new_numeric}")

    # Never clobber another record. On a case-insensitive filesystem the
    # target may even resolve to the source file itself, in which case
    # writing and then unlinking the source would destroy the data.
    if new_filepath.exists():
        print(f" ERROR: {new_filename} already exists; refusing to overwrite")
        return None

    if dry_run:
        print(f" New filename would be: {new_ghcid}.yaml")
        return None

    ghcid_section = data.get('ghcid')
    if isinstance(ghcid_section, dict):
        # Tolerate both a missing and an explicitly null history.
        if ghcid_section.get('ghcid_history') is None:
            ghcid_section['ghcid_history'] = []

        # Preserve the outgoing identifiers before they are replaced.
        provenance = data.get('provenance')
        if not isinstance(provenance, dict):
            provenance = {}
        now = datetime.now(timezone.utc).isoformat()
        ghcid_section['ghcid_history'].append({
            'ghcid': old_ghcid,
            'ghcid_uuid': ghcid_section.get('ghcid_uuid'),
            'ghcid_uuid_sha256': ghcid_section.get('ghcid_uuid_sha256'),
            'ghcid_numeric': ghcid_section.get('ghcid_numeric'),
            'valid_from': provenance.get('generated_at', '2025-12-06T00:00:00Z'),
            'valid_to': now,
            'reason': 'Migrated from Q-number suffix to snake_case name suffix convention'
        })

        ghcid_section['ghcid_current'] = new_ghcid
        ghcid_section['ghcid_original'] = base_ghcid  # Store base without suffix
        ghcid_section['ghcid_uuid'] = new_uuid_v5
        ghcid_section['ghcid_uuid_sha256'] = new_uuid_v8
        ghcid_section['ghcid_numeric'] = new_numeric
    else:
        # As before, the file is still renamed; there is just nothing to update.
        print(f" WARNING: {filename} has no ghcid section; only renaming the file")

    with open(new_filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # The old file is removed only after the new one was written successfully.
    filepath.unlink()

    print(f" MIGRATED: {filename} -> {new_filename}")
    return new_filepath
def main():
    """Command-line entry point: migrate every Q-number file in a directory.

    Options:
        --execute  Perform the migration; without it the run is a dry run.
        --path     Directory containing the custodian YAML files.

    Exits through ``parser.error`` (status 2) when ``--path`` is not an
    existing directory, instead of reporting a successful run over zero files.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Migrate Q-number GHCID files to snake_case')
    parser.add_argument('--execute', action='store_true', help='Actually perform migration (default is dry run)')
    parser.add_argument('--path', default='/Users/kempersc/apps/glam/data/custodian', help='Path to custodian files')
    args = parser.parse_args()

    custodian_dir = Path(args.path)
    # glob() on a missing directory silently yields nothing, which would make
    # a mistyped path look like a clean, empty migration.
    if not custodian_dir.is_dir():
        parser.error(f"custodian path is not a directory: {custodian_dir}")

    # The glob is only a coarse pre-filter; the regex keeps just the files
    # whose name ends in a real "-Q<digits>.yaml" suffix (not a Q that is
    # part of an abbreviation).
    q_suffix_files = sorted(
        f for f in custodian_dir.glob('*-Q[0-9]*.yaml')
        if re.match(r'.+-Q\d+\.yaml$', f.name)
    )
    print(f"Found {len(q_suffix_files)} files with Q-number suffixes")

    if args.execute:
        print("\n=== EXECUTING MIGRATION ===")
    else:
        print("\n=== DRY RUN (use --execute to actually migrate) ===")

    dry_run = not args.execute
    migrated = 0
    for filepath in q_suffix_files:
        # process_file returns a path only when a file was actually migrated.
        if process_file(filepath, dry_run=dry_run):
            migrated += 1

    if args.execute:
        print(f"\n=== MIGRATION COMPLETE: {migrated} files migrated ===")
    else:
        print(f"\n=== DRY RUN COMPLETE: {len(q_suffix_files)} files would be migrated ===")
# Run the migration only when this file is executed as a script, so that
# importing the module (e.g. to reuse the ID helpers) has no side effects.
if __name__ == '__main__':
    main()