glam/scripts/regenerate_ghcids_emic_name.py
2025-12-09 07:56:35 +01:00

268 lines
9 KiB
Python

#!/usr/bin/env python3
"""
Regenerate GHCIDs for custodians where emic_name differs from claim_value.

This script finds custodians that have been enriched with emic_name (native language name)
and regenerates their GHCID abbreviation component using the emic_name instead of the
English claim_value.

Per AGENTS.md grandfathering policy: Existing GHCIDs from UNESCO MoW custodians are
grandfathered for PID stability. This script generates a REPORT of which files WOULD
be updated, but does NOT automatically apply changes without explicit confirmation
(files are only modified when --apply is passed).

Usage:
    # Dry run (default) - show what would change
    python scripts/regenerate_ghcids_emic_name.py

    # Apply changes
    python scripts/regenerate_ghcids_emic_name.py --apply

    # Process specific country
    python scripts/regenerate_ghcids_emic_name.py --country DE

    # Limit number of files
    python scripts/regenerate_ghcids_emic_name.py --limit 10

    # Show every affected file instead of the first 10 per country
    python scripts/regenerate_ghcids_emic_name.py --verbose
"""
import argparse
import os
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path
import yaml
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.identifiers.ghcid import extract_abbreviation_from_name
def get_current_abbreviation(ghcid: str) -> str:
    """Return the portion of a GHCID that follows the type code.

    GHCID format: CC-RR-CCC-T-ABBREV or CC-RR-CCC-T-ABBREV-collision_suffix

    Everything after the fourth hyphen is returned verbatim, so a collision
    suffix (when present) is part of the result. A GHCID with fewer than
    five hyphen-separated parts yields an empty string.
    """
    # Split at most four times: the fifth field then holds the whole remainder,
    # including any further hyphens.
    fields = ghcid.split('-', 4)
    return fields[4] if len(fields) == 5 else ''
def build_new_ghcid(old_ghcid: str, new_abbrev: str) -> str:
    """Return *old_ghcid* with its abbreviation replaced by *new_abbrev*.

    The first four hyphen-separated components (country, region, city and
    type code) are kept; everything after them - including any collision
    suffix - is replaced by the new abbreviation. A GHCID with fewer than
    five components is returned unchanged.
    """
    segments = old_ghcid.split('-')
    if len(segments) < 5:
        # Not a complete GHCID: nothing to replace.
        return old_ghcid
    country, region, city, type_code = segments[:4]
    return f"{country}-{region}-{city}-{type_code}-{new_abbrev}"
def generate_uuid_v5(ghcid_string: str) -> str:
    """Generate a deterministic UUID v5 for a GHCID string.

    The UUID is derived from the custodian URL
    ``https://w3id.org/heritage/custodian/<ghcid_string>``.

    Args:
        ghcid_string: The GHCID to derive the UUID from.

    Returns:
        The UUID in its canonical hyphenated string form.
    """
    # NOTE: this namespace value (6ba7b810-9dad-11d1-80b4-00c04fd430c8) is the
    # RFC 4122 *DNS* namespace, not the URL namespace (6ba7b811-...), even
    # though the hashed name is a URL. Do not "correct" it to
    # uuid.NAMESPACE_URL: that would change every UUID this function produces.
    ghcid_namespace = uuid.NAMESPACE_DNS
    return str(uuid.uuid5(ghcid_namespace, f"https://w3id.org/heritage/custodian/{ghcid_string}"))
def process_custodian_file(filepath: Path, apply: bool = False) -> dict | None:
    """Process a single custodian YAML file.

    Compares the abbreviation in the file's current GHCID with the
    abbreviation derived from ``custodian_name.emic_name``. When they differ
    (case-insensitively), a report dict describing the change is returned.
    With ``apply=True`` the file is also rewritten with the new GHCID and
    UUID, a history entry and a provenance note, and renamed to
    ``<new_ghcid>.yaml``.

    Args:
        filepath: Path of the custodian YAML file.
        apply: When True, modify (and possibly rename) the file; otherwise
            only report what would change.

    Returns:
        ``None`` when no update is needed (empty file, no emic_name, no
        current GHCID, no usable new abbreviation, or abbreviation already
        matching). Otherwise a dict with the keys ``file``, ``current_ghcid``,
        ``new_ghcid``, ``current_abbrev``, ``new_abbrev``, ``english_abbrev``,
        ``emic_name``, ``claim_value`` and ``new_uuid``; in apply mode it
        additionally carries either ``renamed_to`` or ``rename_error``.
        When ``rename_error`` is set the file has NOT been modified.

    Raises:
        ValueError: If the YAML document is not a mapping at top level.
        Other I/O and YAML parsing errors propagate to the caller.
    """
    # Read as UTF-8 explicitly: emic names are native-language strings and the
    # platform default encoding is not guaranteed to decode them.
    with open(filepath, encoding='utf-8') as f:
        data = yaml.safe_load(f)
    if not data:
        return None
    if not isinstance(data, dict):
        raise ValueError(
            f"expected a YAML mapping at top level, got {type(data).__name__}"
        )

    # ``or`` fallbacks also cover keys that are present but null in the YAML.
    cn = data.get('custodian_name') or {}
    emic_name = cn.get('emic_name') or ''
    claim_value = cn.get('claim_value') or ''
    if not emic_name:
        return None

    ghcid_data = data.get('ghcid') or {}
    current_ghcid = ghcid_data.get('ghcid_current') or ''
    if not current_ghcid:
        return None

    # NOTE(review): current_abbrev includes any collision suffix, so a GHCID
    # that carries one always compares as "different" below, and the rebuilt
    # GHCID drops the suffix. Confirm this is intended.
    current_abbrev = get_current_abbreviation(current_ghcid)
    new_abbrev = extract_abbreviation_from_name(emic_name)
    # English abbreviation is reported for comparison only.
    english_abbrev = extract_abbreviation_from_name(claim_value) if claim_value else ''

    # An empty new abbreviation would yield a GHCID ending in a bare hyphen.
    if not new_abbrev or current_abbrev.upper() == new_abbrev.upper():
        return None

    new_ghcid = build_new_ghcid(current_ghcid, new_abbrev)
    if new_ghcid == current_ghcid:
        # build_new_ghcid returns its input unchanged for GHCIDs with fewer
        # than five parts; there is nothing to update in that case.
        return None
    new_uuid = generate_uuid_v5(new_ghcid)

    result = {
        'file': filepath.name,
        'current_ghcid': current_ghcid,
        'new_ghcid': new_ghcid,
        'current_abbrev': current_abbrev,
        'new_abbrev': new_abbrev,
        'english_abbrev': english_abbrev,
        'emic_name': emic_name,
        'claim_value': claim_value,
        'new_uuid': new_uuid,
    }
    if not apply:
        return result

    # Detect a filename collision BEFORE touching the file. Writing first and
    # failing the rename afterwards would leave this file claiming a GHCID
    # whose filename belongs to a different record.
    new_filename = f"{new_ghcid}.yaml"
    new_path = filepath.parent / new_filename
    needs_rename = filepath.name != new_filename
    if needs_rename and new_path.exists():
        result['rename_error'] = f"Target file {new_filename} already exists"
        return result

    timestamp = datetime.now(timezone.utc).isoformat()

    # Update GHCID fields.
    ghcid_data['ghcid_current'] = new_ghcid
    ghcid_data['ghcid_uuid'] = new_uuid
    # Note: ghcid_numeric is not regenerated here; that needs the full algorithm.
    ghcid_data['generation_timestamp'] = timestamp

    # Add to history.
    history = ghcid_data.get('ghcid_history') or []
    history.append({
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': (
            f'GHCID abbreviation regenerated from emic_name "{emic_name}" '
            f'(was "{claim_value}", abbrev {current_abbrev} → {new_abbrev})'
        ),
    })
    ghcid_data['ghcid_history'] = history
    data['ghcid'] = ghcid_data

    # Add provenance note; a single string note is promoted to a list first.
    prov = data.get('provenance') or {}
    notes = prov.get('notes') or []
    if isinstance(notes, str):
        notes = [notes]
    notes.append(
        f'GHCID regenerated {timestamp}: abbreviation '
        f'{current_abbrev} → {new_abbrev} from emic_name'
    )
    prov['notes'] = notes
    data['provenance'] = prov

    # Serialize fully before opening the file for writing, so a serialization
    # failure cannot leave a truncated file behind.
    serialized = yaml.dump(
        data, allow_unicode=True, default_flow_style=False, sort_keys=False
    )
    filepath.write_text(serialized, encoding='utf-8')

    if needs_rename:
        filepath.rename(new_path)
        result['renamed_to'] = new_filename
    return result
def main() -> None:
    """Command-line entry point: scan custodian files and report/apply GHCID changes.

    Scans ``data/custodian/*.yaml`` (relative to the repository root derived
    from this script's location), optionally filtered by ``--country`` and
    capped by ``--limit``, calls ``process_custodian_file`` on each file and
    prints a per-country report. Files are modified only with ``--apply``.

    Exits with a non-zero status if the custodian directory does not exist
    or ``--limit`` is negative.
    """
    parser = argparse.ArgumentParser(description="Regenerate GHCIDs using emic_name")
    parser.add_argument('--apply', action='store_true', help='Actually apply changes (default: dry run)')
    parser.add_argument('--country', type=str, help='Only process files for specific country code (e.g., DE)')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show detailed output')
    args = parser.parse_args()

    # A negative limit would silently slice from the end of the file list.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer")
    # Filenames start with an upper-case country code, so accept "de" as "DE".
    country_filter = args.country.upper() if args.country else None

    def preview(text, width=50):
        """Return *text* cut to *width* characters, marking real truncation with '...'."""
        text = str(text or '')
        return text if len(text) <= width else text[:width] + "..."

    custodian_dir = Path(__file__).parent.parent / "data" / "custodian"

    print("=" * 70)
    print("GHCID Regeneration from Emic Names")
    print("=" * 70)
    print(f"Directory: {custodian_dir}")
    print(f"Mode: {'APPLY CHANGES' if args.apply else 'DRY RUN (use --apply to modify files)'}")
    if country_filter:
        print(f"Country filter: {country_filter}")
    if args.limit is not None:
        print(f"Limit: {args.limit} files")
    print("=" * 70)
    print()

    # Without this check a wrong path would be reported as "0 files".
    if not custodian_dir.is_dir():
        sys.exit(f"Error: custodian directory not found: {custodian_dir}")

    # Find candidate files.
    yaml_files = sorted(custodian_dir.glob("*.yaml"))
    if country_filter:
        yaml_files = [f for f in yaml_files if f.name.startswith(f"{country_filter}-")]
    if args.limit is not None:
        # "is not None" so that --limit 0 really means zero files.
        yaml_files = yaml_files[:args.limit]

    results = []
    error_count = 0
    for processed, filepath in enumerate(yaml_files, start=1):
        if processed % 500 == 0:
            print(f" Processed {processed}/{len(yaml_files)} files...")
        try:
            result = process_custodian_file(filepath, apply=args.apply)
        except Exception as e:
            # Best effort: one bad file must not abort the whole run, but it
            # is counted so the summary can surface it.
            error_count += 1
            print(f" Error processing {filepath.name}: {e}")
        else:
            if result:
                results.append(result)

    # Report
    print()
    print("=" * 70)
    print(f"RESULTS: {len(results)} files {'updated' if args.apply else 'would be updated'}")
    print("=" * 70)
    print()

    if results:
        # Group by country (first two characters of the current GHCID).
        by_country = {}
        for r in results:
            by_country.setdefault(r['current_ghcid'][:2], []).append(r)

        for country in sorted(by_country):
            items = by_country[country]
            print(f"\n{country}: {len(items)} files")
            print("-" * 50)
            # Show first 10 per country unless verbose.
            shown = items if args.verbose else items[:10]
            for r in shown:
                print(f" {r['current_ghcid']} → {r['new_ghcid']}")
                print(f" claim: {preview(r['claim_value'])}")
                print(f" emic: {preview(r['emic_name'])}")
                print(f" abbrev: {r['current_abbrev']} → {r['new_abbrev']}")
                if args.apply:
                    if r.get('renamed_to'):
                        print(f" RENAMED TO: {r['renamed_to']}")
                    if r.get('rename_error'):
                        print(f" RENAME ERROR: {r['rename_error']}")
                print()
            if len(items) > 10 and not args.verbose:
                print(f" ... and {len(items) - 10} more (use -v to see all)")

    print()
    print("=" * 70)
    if not args.apply:
        print("This was a DRY RUN. Use --apply to actually modify files.")
    else:
        print(f"Done! {len(results)} files updated.")
        rename_conflicts = sum(1 for r in results if r.get('rename_error'))
        if rename_conflicts:
            print(f"WARNING: {rename_conflicts} of these hit a rename conflict (use -v to list all).")
    if error_count:
        print(f"WARNING: {error_count} files could not be processed (see errors above).")
    print("=" * 70)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()