188 lines
6.8 KiB
Python
Executable file
188 lines
6.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Migrate GHCID files from the Q-number suffix convention to the snake_case name convention.

This script:
1. Finds files with Q-number suffixes (e.g., LB-BA-BEI-A-AIF-Q4783187.yaml)
2. Extracts the institution name
3. Generates a snake_case suffix from the native name
4. Updates the GHCID and regenerates its UUIDs
5. Renames the files
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import uuid
|
|
import hashlib
|
|
import unicodedata
|
|
from pathlib import Path
|
|
import yaml
|
|
|
|
# Namespace UUID under which GHCID strings are hashed into UUID v5 values.
# The value is the standard DNS namespace UUID; per the original note, other
# scripts in this project use the same namespace.
GHCID_NAMESPACE = uuid.UUID(hex="6ba7b810-9dad-11d1-80b4-00c04fd430c8")
|
|
|
|
def generate_name_suffix(native_name: str) -> str:
    """Build the snake_case GHCID suffix for an institution name.

    The name is NFD-decomposed so that combining marks (Unicode category
    ``Mn``) can be dropped, then lowercased.  Punctuation is removed, every
    run of whitespace/hyphens becomes one underscore, and any character
    still outside ``[a-z0-9_]`` is discarded.  Leading and trailing
    underscores are stripped.

    Because only ``a-z``, ``0-9`` and ``_`` survive, a name containing no
    such characters (for example one written entirely in a non-Latin
    script) produces an empty string.
    """
    decomposed = unicodedata.normalize('NFD', native_name)
    text = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    ).lower()

    # Ordered (pattern, replacement) passes.  Punctuation is removed before
    # separators are converted so that e.g. "d'Orsay" becomes "dorsay".
    passes = (
        (r"[''`\",.:;!?()[\]{}]", ''),  # apostrophes, commas, brackets, ...
        (r'[\s\-]+', '_'),              # whitespace / hyphen runs
        (r'[^a-z0-9_]', ''),            # anything else that is not allowed
        (r'_+', '_'),                   # collapse repeated underscores
    )
    for pattern, replacement in passes:
        text = re.sub(pattern, replacement, text)

    return text.strip('_')
|
|
|
|
def generate_uuid_v5(ghcid_string: str) -> str:
    """Return the name-based UUID v5 of a GHCID, formatted as a string.

    The UUID is derived from ``ghcid_string`` under the module-level
    ``GHCID_NAMESPACE``, so equal GHCID strings always give the same UUID.
    """
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
|
|
|
|
def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Return a UUID v8 string built from the SHA-256 of a GHCID.

    The first 16 bytes of the SHA-256 digest of the UTF-8 encoded GHCID
    form the UUID.  The high nibble of byte 6 is overwritten with the
    version number 8 and the two top bits of byte 8 with the ``10``
    variant marker; all other bits come straight from the digest.
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    raw = bytearray(digest[:16])

    # Keep the low nibble of byte 6, force the high nibble to 0x8 (version).
    raw[6] = 0x80 | (raw[6] & 0x0F)
    # Keep the low six bits of byte 8, force the top two bits to 0b10.
    raw[8] = 0x80 | (raw[8] & 0x3F)

    return str(uuid.UUID(bytes=bytes(raw)))
|
|
|
|
def generate_numeric_id(ghcid_string: str) -> int:
    """Return a non-negative 63-bit integer ID derived from a GHCID.

    The first eight bytes of the SHA-256 digest of the UTF-8 encoded GHCID
    are read as a big-endian integer, and the top bit is cleared so the
    result always fits in a signed 64-bit field as a positive value.
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    leading = int.from_bytes(digest[:8], 'big')
    return leading & 0x7FFF_FFFF_FFFF_FFFF  # drop the sign bit
|
|
|
|
def process_file(filepath: Path, dry_run: bool = True):
    """Migrate one ``<base>-Q<digits>.yaml`` record to a name-based GHCID.

    The institution name is read from ``original_entry.name`` in the YAML
    file, converted to a snake_case suffix, and appended to the base GHCID
    taken from the filename.  New UUID v5, UUID v8 and numeric identifiers
    are computed from the new GHCID and reported on stdout.

    When ``dry_run`` is false and the record has a ``ghcid`` mapping, the
    previous identifiers are appended to ``ghcid.ghcid_history`` and the
    current ones are replaced.  The record is then written to
    ``<new_ghcid>.yaml`` next to the original, and the original is deleted.

    The file is left untouched (and ``None`` is returned) when:

    * the filename does not match the Q-number pattern,
    * the YAML document is not a mapping or has no usable name,
    * the name yields an empty suffix (no ``a-z``/``0-9`` characters),
    * the target filename already exists (never overwritten).

    Args:
        filepath: Path of the Q-number YAML file to migrate.
        dry_run: If true (default), only report what would be done.

    Returns:
        The path of the newly written file after a real migration,
        otherwise ``None``.
    """
    filename = filepath.name

    # Split "<base>-Q<digits>.yaml" into the base GHCID and the Q-number.
    match = re.match(r'^(.+)-Q(\d+)\.yaml$', filename)
    if not match:
        print(f" SKIP: {filename} (doesn't match Q-number pattern)")
        return None
    base_ghcid, q_number = match.groups()

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty file loads as None and a scalar/list document has no keys;
    # neither can be migrated, so report instead of raising AttributeError.
    if not isinstance(data, dict):
        print(f" ERROR: {filename} does not contain a YAML mapping")
        return None

    # 'original_entry' may be present but null, so do not chain .get() on it.
    original_entry = data.get('original_entry')
    name = original_entry.get('name') if isinstance(original_entry, dict) else None
    if not name:
        print(f" ERROR: {filename} has no name field")
        return None

    # str() guards against YAML scalars that are not strings (e.g. numbers).
    name_suffix = generate_name_suffix(str(name))
    if not name_suffix:
        # Without this check the new GHCID would be "<base>-" and every such
        # record sharing a base would map to the same "<base>-.yaml" file.
        print(f" ERROR: {filename} name {name!r} yields an empty suffix")
        return None

    new_ghcid = f"{base_ghcid}-{name_suffix}"
    old_ghcid = f"{base_ghcid}-Q{q_number}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    print(f"\n File: {filename}")
    print(f" Name: {name}")
    print(f" Old GHCID: {old_ghcid}")
    print(f" New GHCID: {new_ghcid}")

    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)

    print(f" New UUID v5: {new_uuid_v5}")
    print(f" New UUID v8: {new_uuid_v8}")
    print(f" New Numeric: {new_numeric}")

    # Never overwrite another record.  This also covers case-insensitive
    # filesystems, where the target can resolve to the source file itself
    # and writing-then-unlinking would destroy the record.
    if new_filepath.exists():
        print(f" ERROR: {new_filename} already exists; refusing to overwrite")
        return None

    if dry_run:
        print(f" New filename would be: {new_ghcid}.yaml")
        return None

    ghcid_section = data.get('ghcid')
    if isinstance(ghcid_section, dict):
        from datetime import datetime, timezone

        now = datetime.now(timezone.utc).isoformat()

        # A missing or null history starts as an empty list.
        if ghcid_section.get('ghcid_history') is None:
            ghcid_section['ghcid_history'] = []

        provenance = data.get('provenance')
        if not isinstance(provenance, dict):
            provenance = {}

        # Preserve the identifiers that are about to be replaced.
        ghcid_section['ghcid_history'].append({
            'ghcid': old_ghcid,
            'ghcid_uuid': ghcid_section.get('ghcid_uuid'),
            'ghcid_uuid_sha256': ghcid_section.get('ghcid_uuid_sha256'),
            'ghcid_numeric': ghcid_section.get('ghcid_numeric'),
            'valid_from': provenance.get('generated_at', '2025-12-06T00:00:00Z'),
            'valid_to': now,
            'reason': 'Migrated from Q-number suffix to snake_case name suffix convention'
        })

        ghcid_section['ghcid_current'] = new_ghcid
        ghcid_section['ghcid_original'] = base_ghcid  # Store base without suffix
        ghcid_section['ghcid_uuid'] = new_uuid_v5
        ghcid_section['ghcid_uuid_sha256'] = new_uuid_v8
        ghcid_section['ghcid_numeric'] = new_numeric
    else:
        # The file is still renamed, as before, but make the gap visible.
        print(f" WARNING: {filename} has no ghcid mapping; identifiers not updated")

    # Write the new file first; the old one is removed only after the write
    # has completed without raising.
    with open(new_filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    filepath.unlink()
    print(f" MIGRATED: {filename} → {new_filename}")

    return new_filepath
|
|
|
|
def main():
    """Command-line entry point: find Q-number files and migrate them.

    Options:
        --execute  Perform the migration; without it the run is a dry run.
        --path     Directory containing the custodian YAML files.

    Exits through ``parser.error`` (status 2) when ``--path`` is not an
    existing directory, so a mistyped path is not reported as a successful
    run over zero files.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Migrate Q-number GHCID files to snake_case')
    parser.add_argument('--execute', action='store_true', help='Actually perform migration (default is dry run)')
    parser.add_argument('--path', default='/Users/kempersc/apps/glam/data/custodian', help='Path to custodian files')
    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.is_dir():
        parser.error(f"--path is not a directory: {custodian_dir}")

    # The glob is only a coarse pre-filter ("-Q" followed by a digit); the
    # regex keeps just the names that end in "-Q<digits>.yaml", excluding a
    # Q that is merely part of an abbreviation.
    q_suffix_files = sorted(
        candidate
        for candidate in custodian_dir.glob('*-Q[0-9]*.yaml')
        if re.match(r'.+-Q\d+\.yaml$', candidate.name)
    )

    print(f"Found {len(q_suffix_files)} files with Q-number suffixes")

    if args.execute:
        print("\n=== EXECUTING MIGRATION ===")
    else:
        print("\n=== DRY RUN (use --execute to actually migrate) ===")

    migrated = 0
    for filepath in q_suffix_files:
        # process_file returns the new path only after a real migration.
        if process_file(filepath, dry_run=not args.execute):
            migrated += 1

    if args.execute:
        print(f"\n=== MIGRATION COMPLETE: {migrated} files migrated ===")
    else:
        print(f"\n=== DRY RUN COMPLETE: {len(q_suffix_files)} files would be migrated ===")
|
|
|
|
# Run the migration only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
|