134 lines
3.5 KiB
Python
Executable file
134 lines
3.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Fix 12 Netherlands custodian files missing ghcid_uuid.
|
|
|
|
These files were created from LinkedIn matching (Session 12) and have
|
|
ghcid.ghcid_current but are missing ghcid_uuid and ghcid_numeric.
|
|
|
|
Usage:
|
|
python scripts/fix_nl_missing_uuids.py
|
|
"""
|
|
|
|
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import yaml
|
|
|
|
# Add src to Python path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from glam_extractor.identifiers.ghcid import GHCIDComponents
|
|
|
|
# GHCID identifiers of the custodian files that lack ghcid_uuid.
# Each entry doubles as the file stem: main() looks for
# "<entry>.yaml" inside the data/custodian directory.
MISSING_FILES = [
    "NL-FR-LEE-A-FFAA",
    "NL-GE-ARN-A-SIFA",
    "NL-GE-ARN-I-KIEN",
    "NL-GE-NIJ-M-MKN",
    "NL-GE-XXX-M-MVV",
    "NL-NB-HIL-M-DD",
    "NL-NH-AAL-M-FAM",
    "NL-NH-AMS-G-W",
    "NL-NH-BER-M-HS",
    "NL-OV-RAA-I-VVV",
    "NL-ZH-HOE-M-NMKM",
    "NL-ZH-ROT-G-KM",
]
|
|
|
|
|
|
def parse_ghcid_string(ghcid_str: str) -> GHCIDComponents:
    """Build a ``GHCIDComponents`` from a hyphen-separated GHCID string.

    The first five hyphen-delimited segments are passed, in order, as
    ``country_code``, ``region_code``, ``city_locode``, ``institution_type``
    and ``abbreviation``.  A sixth segment, when present, is passed as
    ``wikidata_qid``; otherwise ``None`` is passed.  Segments after the
    sixth are not used.

    Args:
        ghcid_str: Identifier such as ``"NL-NH-AMS-G-W"``.

    Returns:
        The populated ``GHCIDComponents`` instance.

    Raises:
        ValueError: If the string has fewer than five segments.
    """
    segments = ghcid_str.split("-")
    if len(segments) < 5:
        raise ValueError(f"Invalid GHCID format: {ghcid_str}")

    country, region, city, inst_type, abbrev = segments[:5]
    # Optional sixth segment; absent for the plain five-part form.
    qid = segments[5] if len(segments) > 5 else None

    return GHCIDComponents(
        country_code=country,
        region_code=region,
        city_locode=city,
        institution_type=inst_type,
        abbreviation=abbrev,
        wikidata_qid=qid,
    )
|
|
|
|
|
|
def fix_file(filepath: Path) -> Optional[bool]:
    """Add the missing GHCID identifier fields to one custodian YAML file.

    Reads ``ghcid.ghcid_current`` from the file, derives ``ghcid_uuid``,
    ``ghcid_numeric`` and ``ghcid_uuid_sha256`` from it, stores them in the
    ``ghcid`` mapping and rewrites the file in place.  Progress is reported
    on stdout.

    Args:
        filepath: Path of the custodian YAML file to update.

    Returns:
        ``True`` when the fields were added and the file was rewritten.
        ``False`` when the file already has ``ghcid_uuid`` and was left
        untouched.
        ``None`` when the file could not be processed (unreadable or
        malformed YAML, empty document, no ``ghcid_current``, or a failure
        while generating or writing the identifiers).  ``None`` is falsy,
        so callers that only test truthiness still see a non-success,
        while callers can tell an error apart from a skip with ``is None``.
    """
    print(f"Processing: {filepath.name}")

    # A single unreadable or malformed file must not abort the whole batch.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, yaml.YAMLError) as e:
        print(f" ERROR: {e}")
        return None

    if not data:
        print(" ERROR: Empty file")
        return None

    # Tolerate a non-mapping document root and a null / non-mapping
    # ``ghcid`` entry: both are treated as "no ghcid_current".
    ghcid_block = data.get('ghcid') if isinstance(data, dict) else None
    if not isinstance(ghcid_block, dict):
        ghcid_block = {}
    ghcid_str = ghcid_block.get('ghcid_current')

    if not ghcid_str:
        print(" ERROR: No ghcid_current found")
        return None

    if 'ghcid_uuid' in ghcid_block:
        print(" SKIP: Already has ghcid_uuid")
        return False

    try:
        # Parse the GHCID and derive every identifier before mutating
        # anything, so a failure leaves ``data`` and the file untouched.
        components = parse_ghcid_string(ghcid_str)
        ghcid_uuid = str(components.to_uuid())
        ghcid_numeric = components.to_numeric()
        ghcid_uuid_sha256 = str(components.to_uuid_sha256())

        ghcid_block['ghcid_uuid'] = ghcid_uuid
        ghcid_block['ghcid_numeric'] = ghcid_numeric
        ghcid_block['ghcid_uuid_sha256'] = ghcid_uuid_sha256

        # Serialise first: opening the file for writing truncates it, so a
        # dump error after that point would destroy the existing content.
        serialized = yaml.dump(
            data,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
        )
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)
    except Exception as e:
        # Per-file boundary: report the problem and let the caller continue.
        print(f" ERROR: {e}")
        return None

    print(f" FIXED: Added ghcid_uuid={ghcid_uuid[:8]}...")
    return True
|
|
|
|
|
|
def main():
    """Run ``fix_file`` over every entry of ``MISSING_FILES`` and print totals.

    Each entry is resolved to ``<repo>/data/custodian/<entry>.yaml``, where
    ``<repo>`` is the parent of this script's directory.  A missing file is
    counted as an error.  Otherwise the ``fix_file`` result decides the
    bucket: a truthy result counts as fixed, ``None`` as an error, and any
    other falsy result as skipped.
    """
    custodian_dir = Path(__file__).parent.parent / "data" / "custodian"
    tally = {"fixed": 0, "skipped": 0, "errors": 0}

    for identifier in MISSING_FILES:
        target = custodian_dir / f"{identifier}.yaml"
        if not target.exists():
            print(f"NOT FOUND: {target}")
            tally["errors"] += 1
            continue

        outcome = fix_file(target)
        if outcome:
            bucket = "fixed"
        elif outcome is None:
            bucket = "errors"
        else:
            bucket = "skipped"
        tally[bucket] += 1

    fixed_count = tally["fixed"]
    skipped_count = tally["skipped"]
    error_count = tally["errors"]

    print("\n=== Summary ===")
    print(f"Fixed: {fixed_count}")
    print(f"Skipped: {skipped_count}")
    print(f"Errors: {error_count}")
    print(f"Total: {len(MISSING_FILES)}")


if __name__ == "__main__":
    main()
|