#!/usr/bin/env python3
"""
Fix 12 Netherlands custodian files missing ghcid_uuid.

These files were created from LinkedIn matching (Session 12) and have
ghcid.ghcid_current but are missing ghcid_uuid and ghcid_numeric.

Usage:
    python scripts/fix_nl_missing_uuids.py
"""

import sys
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional

import yaml

# Add src to Python path so the project package can be imported when this
# script is run directly from the repository checkout.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.identifiers.ghcid import GHCIDComponents

# Files missing ghcid_uuid (file stem == GHCID string).
MISSING_FILES = [
    "NL-FR-LEE-A-FFAA",
    "NL-GE-ARN-A-SIFA",
    "NL-GE-ARN-I-KIEN",
    "NL-GE-NIJ-M-MKN",
    "NL-GE-XXX-M-MVV",
    "NL-NB-HIL-M-DD",
    "NL-NH-AAL-M-FAM",
    "NL-NH-AMS-G-W",
    "NL-NH-BER-M-HS",
    "NL-OV-RAA-I-VVV",
    "NL-ZH-HOE-M-NMKM",
    "NL-ZH-ROT-G-KM",
]


def parse_ghcid_string(ghcid_str: str) -> GHCIDComponents:
    """Parse a GHCID string into components.

    The string is split on "-" and the parts are mapped, in order, to
    country code, region code, city locode, institution type and
    abbreviation. A sixth part, when present, is passed as the Wikidata QID;
    any parts beyond the sixth are ignored.

    Args:
        ghcid_str: Hyphen-separated GHCID, e.g. "NL-NH-AMS-G-W".

    Returns:
        A GHCIDComponents instance built from the parts.

    Raises:
        ValueError: If the string has fewer than five hyphen-separated parts.
    """
    parts = ghcid_str.split("-")
    if len(parts) < 5:
        raise ValueError(f"Invalid GHCID format: {ghcid_str}")

    return GHCIDComponents(
        country_code=parts[0],
        region_code=parts[1],
        city_locode=parts[2],
        institution_type=parts[3],
        abbreviation=parts[4],
        wikidata_qid=parts[5] if len(parts) > 5 else None,
    )


def fix_file(filepath: Path) -> Optional[bool]:
    """Add missing UUID fields to a custodian file.

    Reads the YAML file, derives ghcid_uuid, ghcid_numeric and
    ghcid_uuid_sha256 from ghcid.ghcid_current, stores them in the ghcid
    block and writes the file back.

    Args:
        filepath: Path to the custodian YAML file.

    Returns:
        True if the file was updated, False if it was skipped because it
        already has a ghcid_uuid, and None if an error occurred. Both
        non-success results are falsy, so ``if fix_file(p):`` still means
        "was fixed".
    """
    print(f"Processing: {filepath.name}")

    # Read the file. Failures are reported per file so that one unreadable
    # or malformed file does not abort the whole batch.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, yaml.YAMLError) as e:
        print(f" ERROR: {e}")
        return None

    if not data:
        print(" ERROR: Empty file")
        return None

    if not isinstance(data, dict):
        print(" ERROR: Top-level YAML is not a mapping")
        return None

    # Get the GHCID. A missing, null or non-mapping ghcid block cannot
    # contain ghcid_current, so it is reported the same way.
    ghcid_block = data.get('ghcid')
    ghcid_str = ghcid_block.get('ghcid_current') if isinstance(ghcid_block, dict) else None

    if not ghcid_str:
        print(" ERROR: No ghcid_current found")
        return None

    # Check if already has UUID
    if 'ghcid_uuid' in ghcid_block:
        print(" SKIP: Already has ghcid_uuid")
        return False

    try:
        # Parse GHCID and generate UUIDs
        components = parse_ghcid_string(ghcid_str)
        ghcid_uuid = str(components.to_uuid())
        ghcid_numeric = components.to_numeric()
        ghcid_uuid_sha256 = str(components.to_uuid_sha256())

        # Add UUIDs to the ghcid block
        ghcid_block['ghcid_uuid'] = ghcid_uuid
        ghcid_block['ghcid_numeric'] = ghcid_numeric
        ghcid_block['ghcid_uuid_sha256'] = ghcid_uuid_sha256

        # Serialize fully before touching the file: opening with 'w'
        # truncates immediately, so a failure inside yaml.dump would
        # otherwise leave a partially written data file behind.
        serialized = yaml.dump(
            data,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
        )

        # Write back
        filepath.write_text(serialized, encoding='utf-8')
    except Exception as e:
        # Per-file boundary: report and let the batch continue.
        print(f" ERROR: {e}")
        return None

    print(f" FIXED: Added ghcid_uuid={ghcid_uuid[:8]}...")
    return True


def main():
    """Process every file in MISSING_FILES and print a summary of outcomes."""
    data_dir = Path(__file__).parent.parent / "data" / "custodian"

    fixed = 0
    errors = 0
    skipped = 0

    for ghcid in MISSING_FILES:
        filepath = data_dir / f"{ghcid}.yaml"

        if not filepath.exists():
            print(f"NOT FOUND: {filepath}")
            errors += 1
            continue

        result = fix_file(filepath)
        # Test for None first: it is falsy, and must be counted as an error
        # rather than falling through to the "skipped" bucket.
        if result is None:
            errors += 1
        elif result:
            fixed += 1
        else:
            skipped += 1

    print("\n=== Summary ===")
    print(f"Fixed: {fixed}")
    print(f"Skipped: {skipped}")
    print(f"Errors: {errors}")
    print(f"Total: {len(MISSING_FILES)}")


if __name__ == "__main__":
    main()