glam/scripts/fix_nl_missing_uuids.py
2025-12-21 00:01:54 +01:00

134 lines
3.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Fix 12 Netherlands custodian files missing ghcid_uuid.
These files were created from LinkedIn matching (Session 12) and have
ghcid.ghcid_current but are missing ghcid_uuid and ghcid_numeric.
Usage:
python scripts/fix_nl_missing_uuids.py
"""
import sys
from pathlib import Path
from datetime import datetime, timezone
import yaml
# Add src to Python path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.identifiers.ghcid import GHCIDComponents
# GHCID identifiers (which double as YAML file stems under data/custodian)
# of the custodian files that lack a ghcid_uuid entry.
MISSING_FILES = [
    "NL-FR-LEE-A-FFAA",
    "NL-GE-ARN-A-SIFA",
    "NL-GE-ARN-I-KIEN",
    "NL-GE-NIJ-M-MKN",
    "NL-GE-XXX-M-MVV",
    "NL-NB-HIL-M-DD",
    "NL-NH-AAL-M-FAM",
    "NL-NH-AMS-G-W",
    "NL-NH-BER-M-HS",
    "NL-OV-RAA-I-VVV",
    "NL-ZH-HOE-M-NMKM",
    "NL-ZH-ROT-G-KM",
]
def parse_ghcid_string(ghcid_str: str) -> GHCIDComponents:
    """Split a hyphen-separated GHCID string into a ``GHCIDComponents``.

    The first five segments are passed, in order, as ``country_code``,
    ``region_code``, ``city_locode``, ``institution_type`` and
    ``abbreviation``. A sixth segment, when present, is passed as
    ``wikidata_qid``; otherwise ``None`` is passed. Any segments beyond
    the sixth are ignored.

    Args:
        ghcid_str: GHCID text such as ``"NL-NH-AMS-G-W"``.

    Returns:
        The ``GHCIDComponents`` built from the segments.

    Raises:
        ValueError: If the string has fewer than five segments.
    """
    segments = ghcid_str.split("-")
    if len(segments) < 5:
        raise ValueError(f"Invalid GHCID format: {ghcid_str}")

    country, region, city, inst_type, abbrev = segments[:5]
    # The optional sixth segment is forwarded as the Wikidata QID.
    qid = segments[5] if len(segments) > 5 else None

    return GHCIDComponents(
        country_code=country,
        region_code=region,
        city_locode=city,
        institution_type=inst_type,
        abbreviation=abbrev,
        wikidata_qid=qid,
    )
def fix_file(filepath: Path) -> "bool | None":
    """Add the missing UUID fields to one custodian YAML file.

    Loads the YAML document, derives ``ghcid_uuid``, ``ghcid_numeric`` and
    ``ghcid_uuid_sha256`` from ``ghcid.ghcid_current`` and rewrites the
    file in place with those three keys added to the ``ghcid`` mapping.

    Args:
        filepath: Path of the custodian YAML file to update.

    Returns:
        ``True`` if the identifiers were added and the file was rewritten,
        ``False`` if the file already contains ``ghcid_uuid`` (skipped),
        ``None`` if the file could not be processed (unreadable, empty,
        malformed, no ``ghcid_current``, or identifier generation failed).
        ``None`` is distinct from ``False`` so callers can tell errors
        apart from skips; both remain falsy.
    """
    print(f"Processing: {filepath.name}")

    # Read the file; report unreadable or unparsable files as errors
    # instead of aborting the whole run.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
        print(f" ERROR: {e}")
        return None

    if not data:
        print(" ERROR: Empty file")
        return None
    if not isinstance(data, dict):
        print(" ERROR: Top-level YAML content is not a mapping")
        return None

    # A missing, null or non-mapping ``ghcid`` entry is treated like an
    # absent ghcid_current rather than raising AttributeError.
    ghcid_block = data.get('ghcid')
    if not isinstance(ghcid_block, dict):
        ghcid_block = {}

    ghcid_str = ghcid_block.get('ghcid_current')
    if not ghcid_str:
        print(" ERROR: No ghcid_current found")
        return None

    # Nothing to do when the UUID key is already present.
    if 'ghcid_uuid' in ghcid_block:
        print(" SKIP: Already has ghcid_uuid")
        return False

    try:
        # Parse GHCID and generate the identifiers.
        components = parse_ghcid_string(ghcid_str)
        ghcid_uuid = str(components.to_uuid())
        ghcid_numeric = components.to_numeric()
        ghcid_uuid_sha256 = str(components.to_uuid_sha256())

        # ghcid_block is the same object as data['ghcid'], so these
        # assignments update the document that gets serialized below.
        ghcid_block['ghcid_uuid'] = ghcid_uuid
        ghcid_block['ghcid_numeric'] = ghcid_numeric
        ghcid_block['ghcid_uuid_sha256'] = ghcid_uuid_sha256

        # Serialize completely before opening the file for writing, so a
        # failure inside yaml.dump cannot leave a truncated file behind.
        serialized = yaml.dump(
            data,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
        )
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)

        print(f" FIXED: Added ghcid_uuid={ghcid_uuid[:8]}...")
        return True
    except Exception as e:
        # Broad on purpose: the exceptions raised by the GHCID helpers are
        # not visible here, and one bad file should not stop the batch.
        print(f" ERROR: {e}")
        return None
def main():
    """Process every file named in ``MISSING_FILES`` and print a summary.

    Each entry is resolved to ``<repo>/data/custodian/<ghcid>.yaml``,
    relative to this script's parent directory. A file that does not exist
    counts as an error. Otherwise the result of ``fix_file`` decides the
    tally: a truthy result counts as fixed, ``None`` counts as an error,
    and any other falsy result counts as skipped.
    """
    custodian_dir = Path(__file__).parent.parent / "data" / "custodian"
    tally = {"fixed": 0, "skipped": 0, "errors": 0}

    for stem in MISSING_FILES:
        target = custodian_dir / f"{stem}.yaml"
        if not target.exists():
            print(f"NOT FOUND: {target}")
            tally["errors"] += 1
            continue

        outcome = fix_file(target)
        if outcome is None:
            tally["errors"] += 1
        elif outcome:
            tally["fixed"] += 1
        else:
            tally["skipped"] += 1

    print("\n=== Summary ===")
    print(f"Fixed: {tally['fixed']}")
    print(f"Skipped: {tally['skipped']}")
    print(f"Errors: {tally['errors']}")
    print(f"Total: {len(MISSING_FILES)}")
# Run the fixer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()