#!/usr/bin/env python3
"""
Fix GHCID collision victim files.

These files have a trailing dash in their filename (e.g., NL-DR-ASS-L-BD-.yaml)
indicating they were collision victims whose internal GHCID was incorrectly set
to their collision partner's GHCID instead of getting their own unique GHCID.

This script:
1. Reads the institution's real name from original_entry.organisatie
2. Generates a proper name suffix from that name
3. Creates a new unique GHCID with the proper suffix
4. Regenerates all GHCID-derived identifiers (UUID, numeric)
5. Updates the file with correct identifiers
6. Renames the file to match the new GHCID
"""

import hashlib
import re
import shutil
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# PyYAML is imported inside fix_collision_victim(), the only place that needs
# it, so the pure identifier helpers below stay importable without it.

# GHCID namespace for UUID generation
GHCID_NAMESPACE = uuid.NAMESPACE_URL
GHCID_URL_PREFIX = "https://glam.registry/"

# Skip words for abbreviation generation (Dutch and common)
SKIP_WORDS = {
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des', 's',
    'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder', 'door', 'en', 'of',
    'stichting', 'vereniging', 'foundation',
    'the', 'a', 'an', 'at', 'on', 'to', 'for', 'with', 'from', 'by', 'as',
    'museum', 'bibliotheek', 'archief', 'collectie',
}

# Patterns used by generate_name_suffix(), compiled once at import time.
_PUNCTUATION_RE = re.compile(r"[''`\",.:;!?()[\]{}]")
_SEPARATOR_RE = re.compile(r'[\s\-]+')
_NON_SUFFIX_CHAR_RE = re.compile(r'[^a-z0-9_]')
_UNDERSCORE_RUN_RE = re.compile(r'_+')


def normalize_diacritics(text: str) -> str:
    """Normalize diacritics to ASCII equivalents.

    The text is decomposed (NFD) and every combining mark (category ``Mn``)
    is dropped, so "é" becomes "e". Characters without a decomposition are
    returned unchanged.
    """
    normalized = unicodedata.normalize('NFD', text)
    return ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')


def generate_name_suffix(native_name: str) -> str:
    """Convert native language institution name to snake_case suffix.

    Examples:
        "Biblionet Drenthe POI" → "biblionet_drenthe_poi"
        "Fries Verzetsmuseum" → "fries_verzetsmuseum"
        "Musée d'Orsay" → "musee_dorsay"

    Returns an empty string when the name contains no ASCII letters or digits
    after normalization.
    """
    # Normalize unicode, remove diacritics and lowercase
    lowercase = normalize_diacritics(native_name).lower()

    # Remove apostrophes, commas, and other punctuation
    no_punct = _PUNCTUATION_RE.sub('', lowercase)

    # Replace spaces and hyphens with underscores
    underscored = _SEPARATOR_RE.sub('_', no_punct)

    # Remove any remaining non-alphanumeric characters (except underscores).
    # NOTE(review): letters with no canonical decomposition (e.g. "ø", "ß")
    # are dropped here rather than transliterated.
    clean = _NON_SUFFIX_CHAR_RE.sub('', underscored)

    # Collapse multiple underscores and trim them from both ends
    return _UNDERSCORE_RUN_RE.sub('_', clean).strip('_')


def generate_ghcid_uuid(ghcid: str) -> str:
    """Generate UUID v5 from GHCID (namespace URL, name = prefix + GHCID)."""
    return str(uuid.uuid5(GHCID_NAMESPACE, f"{GHCID_URL_PREFIX}{ghcid}"))


def generate_ghcid_uuid_sha256(ghcid: str) -> str:
    """Generate the secondary "sha256" UUID from GHCID.

    NOTE(review): despite the name, this is a UUID v5 (``uuid.uuid5``, which
    is SHA-1 based) over the GHCID URL with a ``sha256/`` path segment. It is
    not a UUID v8 built from a SHA-256 digest. The output is intentionally
    left unchanged so existing identifiers stay stable; confirm the intended
    scheme before changing it.
    """
    return str(uuid.uuid5(GHCID_NAMESPACE, f"{GHCID_URL_PREFIX}sha256/{ghcid}"))


def generate_ghcid_numeric(ghcid: str) -> int:
    """Generate 64-bit numeric ID from GHCID.

    The value is the first 16 hex digits (64 bits) of the SHA-256 digest of
    the UTF-8 encoded GHCID, read as an unsigned integer.
    """
    sha256_hash = hashlib.sha256(ghcid.encode()).hexdigest()
    return int(sha256_hash[:16], 16)


def _as_dict(value: object) -> dict:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _apply_new_ghcid(data: dict, new_ghcid: str, org_name: str, timestamp: str) -> None:
    """Rewrite the ``ghcid`` section and ``identifiers`` list of *data* in place."""
    ghcid_data = _as_dict(data.get('ghcid'))
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)

    # Preserve old GHCID in history; a null/missing history starts a new list.
    ghcid_history = list(ghcid_data.get('ghcid_history') or [])
    ghcid_history.append({
        'ghcid': ghcid_data.get('ghcid_current', ''),
        'ghcid_uuid': ghcid_data.get('ghcid_uuid', ''),
        'ghcid_numeric': ghcid_data.get('ghcid_numeric', 0),
        'valid_from': ghcid_data.get('generated_at', ''),
        'valid_to': timestamp,
        'reason': f"Collision fix: had partner's GHCID, corrected to institution's own GHCID based on name '{org_name}'"
    })

    # NOTE(review): the section is replaced wholesale, so any other keys that
    # were stored under 'ghcid' are dropped — confirm this is intended.
    data['ghcid'] = {
        'ghcid_current': new_ghcid,
        'ghcid_uuid': new_uuid,
        'ghcid_uuid_sha256': generate_ghcid_uuid_sha256(new_ghcid),
        'ghcid_numeric': new_numeric,
        'generated_at': timestamp,
        'ghcid_history': ghcid_history
    }

    # Update identifiers list; entries with other schemes are kept untouched.
    updated_identifiers = []
    for ident in data.get('identifiers') or []:
        if isinstance(ident, dict):
            scheme = ident.get('identifier_scheme', '')
            if scheme == 'GHCID':
                ident['identifier_value'] = new_ghcid
                ident['identifier_url'] = f"https://w3id.org/heritage/custodian/{new_ghcid}"
            elif scheme == 'GHCID_UUID':
                ident['identifier_value'] = new_uuid
            elif scheme == 'GHCID_NUMERIC':
                ident['identifier_value'] = str(new_numeric)
        updated_identifiers.append(ident)
    data['identifiers'] = updated_identifiers


def fix_collision_victim(file_path: Path, dry_run: bool = False) -> Optional[Path]:
    """Fix a single collision victim file.

    Args:
        file_path: Path to the collision victim YAML file
        dry_run: If True, only print what would be done

    Returns:
        New file path after renaming, or None if skipped/failed

    Raises:
        ImportError: If PyYAML is not installed.
    """
    import yaml  # local import: only this function needs PyYAML

    print(f"\n{'='*80}")
    print(f"Processing: {file_path.name}")
    print(f"{'='*80}")

    # Read file (explicit UTF-8 so non-ASCII names do not depend on the locale)
    try:
        with open(file_path, encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
        print(f" ERROR: Could not read file: {e}")
        return None

    # An empty document loads as None; anything but a mapping cannot be fixed.
    if not isinstance(data, dict):
        print(" SKIP: File is empty or invalid")
        return None

    # Get institution name
    org_name = _as_dict(data.get('original_entry')).get('organisatie')
    if not org_name:
        print(" ERROR: No organisatie found in original_entry")
        return None

    print(f" Institution: {org_name}")

    # Get current GHCID info
    old_ghcid = _as_dict(data.get('ghcid')).get('ghcid_current', '')
    print(f" Old GHCID: {old_ghcid}")

    # Extract base GHCID from filename (remove trailing dash)
    base_ghcid = file_path.stem.rstrip('-')
    print(f" Base GHCID: {base_ghcid}")

    # Generate new name suffix from institution name
    name_suffix = generate_name_suffix(str(org_name))
    print(f" Name suffix: {name_suffix}")

    # Without a suffix the "new" GHCID would again end in a dash, i.e. it
    # would be the very file being fixed.
    if not name_suffix:
        print(f" ERROR: Could not derive a name suffix from '{org_name}'")
        return None

    # Create new GHCID
    new_ghcid = f"{base_ghcid}-{name_suffix}"
    print(f" New GHCID: {new_ghcid}")

    new_file_path = file_path.parent / f"{new_ghcid}.yaml"

    # Check if this would be the same as old (only filename is wrong)
    if new_ghcid == old_ghcid:
        if file_path.name == new_file_path.name:
            print(" SKIP: GHCID and filename both correct")
            return None

        print(" GHCID correct, but filename wrong - needs rename only")
        if dry_run:
            print(f" DRY RUN: Would rename to {new_file_path.name}")
            return None

        if new_file_path.exists():
            print(f" ERROR: Target file already exists: {new_file_path.name}")
            return None

        shutil.move(str(file_path), str(new_file_path))
        print(f" Renamed: {file_path.name} → {new_file_path.name}")
        return new_file_path

    # Preview the new identifiers
    print(f" New UUID: {generate_ghcid_uuid(new_ghcid)}")
    print(f" New numeric: {generate_ghcid_numeric(new_ghcid)}")

    # Refuse before touching the file: rewriting it first would leave a
    # half-fixed victim (new GHCID inside, old filename) when the rename fails.
    if new_file_path.exists():
        print(f" ERROR: Target file already exists: {new_file_path.name}")
        return None

    if dry_run:
        print(f" DRY RUN: Would update file and rename to {new_ghcid}.yaml")
        return None

    # Update GHCID section and identifiers
    timestamp = datetime.now(timezone.utc).isoformat()
    _apply_new_ghcid(data, new_ghcid, org_name, timestamp)

    # Write updated data back to file
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(" Updated file content")

    # Rename file to match new GHCID
    shutil.move(str(file_path), str(new_file_path))
    print(f" Renamed: {file_path.name} → {new_file_path.name}")

    return new_file_path


def main() -> None:
    """Command-line entry point: fix one file or every trailing-dash file."""
    import argparse

    parser = argparse.ArgumentParser(description='Fix GHCID collision victim files')
    parser.add_argument('--dry-run', action='store_true', help='Only show what would be done')
    parser.add_argument('--file', type=str, help='Process only this specific file')
    args = parser.parse_args()

    custodian_dir = Path('data/custodian')

    if args.file:
        files = [Path(args.file)]
    else:
        # Find all collision victim files (trailing dash pattern)
        files = sorted(custodian_dir.glob('NL-*-.yaml'))

    print(f"Found {len(files)} collision victim file(s)")

    fixed = 0
    skipped = 0
    errors = 0

    for f in files:
        result = fix_collision_victim(f, dry_run=args.dry_run)
        if result:
            fixed += 1
        elif not f.exists():
            # The read error was already reported; stat() would raise here.
            errors += 1
        elif f.stat().st_size == 0:
            print(f"\n EMPTY FILE: {f.name} - should be deleted")
            errors += 1
        else:
            skipped += 1

    print(f"\n{'='*80}")
    print("SUMMARY")
    print(f"{'='*80}")
    print(f" Fixed: {fixed}")
    print(f" Skipped: {skipped}")
    print(f" Errors/Empty: {errors}")


if __name__ == '__main__':
    main()