#!/usr/bin/env python3
"""
Regenerate GHCIDs for custodians where emic_name differs from claim_value.

This script finds custodians that have been enriched with emic_name (native
language name) and regenerates their GHCID abbreviation component using the
emic_name instead of the English claim_value.

Per AGENTS.md grandfathering policy: Existing GHCIDs from UNESCO MoW custodians
are grandfathered for PID stability. This script generates a REPORT of which
files WOULD be updated, but does NOT automatically apply changes without
explicit confirmation.

Usage:
    # Dry run (default) - show what would change
    python scripts/regenerate_ghcids_emic_name.py

    # Apply changes
    python scripts/regenerate_ghcids_emic_name.py --apply

    # Process specific country
    python scripts/regenerate_ghcids_emic_name.py --country DE

    # Limit number of files
    python scripts/regenerate_ghcids_emic_name.py --limit 10
"""

import argparse
import os
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.identifiers.ghcid import extract_abbreviation_from_name  # noqa: E402

# Namespace used for the deterministic UUID v5 of a GHCID.
# NOTE: this literal equals ``uuid.NAMESPACE_DNS`` (RFC 4122), *not* the URL
# namespace (which ends in ...b811). It must not be changed: doing so would
# alter every generated UUID and break identifier stability.
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')

# How many entries are listed per country in the report unless --verbose.
REPORT_PREVIEW_COUNT = 10


def _split_abbreviation(ghcid: str) -> tuple[str, str]:
    """Split a GHCID into ``(abbreviation, collision_suffix)``.

    GHCID format: CC-RR-CCC-T-ABBREV or CC-RR-CCC-T-ABBREV-collision_suffix.
    Both elements are empty strings when the GHCID has fewer than five
    hyphen-separated components; the suffix is empty when there is none.
    """
    parts = ghcid.split('-')
    if len(parts) < 5:
        return '', ''
    return parts[4], '-'.join(parts[5:])


def get_current_abbreviation(ghcid: str) -> str:
    """Extract the abbreviation component from a GHCID string.

    GHCID format: CC-RR-CCC-T-ABBREV or CC-RR-CCC-T-ABBREV-collision_suffix

    Returns everything after the type code, i.e. the abbreviation *including*
    any collision suffix, or an empty string if the GHCID has fewer than five
    components.
    """
    parts = ghcid.split('-')
    if len(parts) >= 5:
        # Everything after the type code is the abbreviation (may include collision suffix)
        return '-'.join(parts[4:])
    return ''


def build_new_ghcid(old_ghcid: str, new_abbrev: str, collision_suffix: str = '') -> str:
    """Build a new GHCID with updated abbreviation.

    Preserves country, region, city, and type codes (the first four
    hyphen-separated components of ``old_ghcid``).

    Args:
        old_ghcid: Existing GHCID whose leading components are reused.
        new_abbrev: Abbreviation to place after the type code.
        collision_suffix: Optional suffix appended after the abbreviation.
            Defaults to no suffix.

    Returns:
        The rebuilt GHCID, or ``old_ghcid`` unchanged when it has fewer than
        five components.
    """
    parts = old_ghcid.split('-')
    if len(parts) < 5:
        return old_ghcid
    tail = [new_abbrev]
    if collision_suffix:
        tail.append(collision_suffix)
    # Take first 4 parts (CC-RR-CCC-T) and append the new tail
    return '-'.join(parts[:4] + tail)


def generate_uuid_v5(ghcid_string: str) -> str:
    """Generate deterministic UUID v5 from GHCID string.

    The name hashed is ``https://w3id.org/heritage/custodian/<ghcid>`` under
    ``GHCID_NAMESPACE``.
    """
    return str(uuid.uuid5(GHCID_NAMESPACE, f"https://w3id.org/heritage/custodian/{ghcid_string}"))


def process_custodian_file(filepath: Path, apply: bool = False) -> dict | None:
    """Process a single custodian YAML file.

    Args:
        filepath: Path to the custodian YAML file.
        apply: When True, rewrite the file with the new GHCID and rename it to
            ``<new_ghcid>.yaml``. When False, only report.

    Returns:
        An info dict if an update is needed, ``None`` otherwise. The dict has a
        ``rename_error`` key when the target filename already exists (in which
        case the file is left untouched, in both modes), and a ``renamed_to``
        key after a successful rename.
    """
    # Explicit UTF-8: emic names are native-language strings and the platform
    # default encoding is not guaranteed to be able to represent them.
    with open(filepath, encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data:
        return None

    # Get custodian_name block. ``or`` guards against keys present with a
    # null value, which ``dict.get(key, default)`` does not cover.
    cn = data.get('custodian_name') or {}
    emic_name = cn.get('emic_name') or ''
    claim_value = cn.get('claim_value') or ''

    if not emic_name:
        return None

    # Get current GHCID
    ghcid_data = data.get('ghcid') or {}
    current_ghcid = ghcid_data.get('ghcid_current') or ''

    if not current_ghcid:
        return None

    # Compare only the abbreviation proper; the collision suffix is not part
    # of what extract_abbreviation_from_name produces.
    current_abbrev, collision_suffix = _split_abbreviation(current_ghcid)

    # Generate new abbreviation from emic_name
    new_abbrev = extract_abbreviation_from_name(emic_name)
    if not new_abbrev:
        # Nothing usable could be derived; a GHCID ending in '-' would be invalid.
        return None

    # Also generate what the English abbreviation would be
    english_abbrev = extract_abbreviation_from_name(claim_value) if claim_value else ''

    # Skip if abbreviations are the same
    if current_abbrev.upper() == new_abbrev.upper():
        return None

    # Build new GHCID.
    # NOTE(review): an existing collision suffix is carried over so the new
    # GHCID stays unique - confirm this matches the collision policy.
    new_ghcid = build_new_ghcid(current_ghcid, new_abbrev, collision_suffix)
    if new_ghcid == current_ghcid:
        # Malformed GHCID (fewer than five components): nothing can be rebuilt.
        return None
    new_uuid = generate_uuid_v5(new_ghcid)

    result = {
        'file': filepath.name,
        'current_ghcid': current_ghcid,
        'new_ghcid': new_ghcid,
        'current_abbrev': current_abbrev,
        'new_abbrev': new_abbrev,
        'english_abbrev': english_abbrev,
        'emic_name': emic_name,
        'claim_value': claim_value,
        'new_uuid': new_uuid,
    }

    # Check for a filename collision BEFORE touching the file, so a conflict
    # never leaves a file whose content carries another record's GHCID.
    new_filename = f"{new_ghcid}.yaml"
    new_path = filepath.parent / new_filename
    needs_rename = filepath.name != new_filename
    if needs_rename and new_path.exists():
        result['rename_error'] = f"Target file {new_filename} already exists"
        return result

    if not apply:
        return result

    # Update the YAML data
    timestamp = datetime.now(timezone.utc).isoformat()

    # Update GHCID fields
    ghcid_data['ghcid_current'] = new_ghcid
    ghcid_data['ghcid_uuid'] = new_uuid
    # Note: We don't regenerate ghcid_numeric here, would need the full algorithm
    ghcid_data['generation_timestamp'] = timestamp

    # Add to history
    history = ghcid_data.get('ghcid_history') or []
    history.append({
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f'GHCID abbreviation regenerated from emic_name "{emic_name}" (was "{claim_value}", abbrev {current_abbrev}→{new_abbrev})',
    })
    ghcid_data['ghcid_history'] = history
    data['ghcid'] = ghcid_data

    # Add provenance note
    prov = data.get('provenance') or {}
    notes = prov.get('notes') or []
    if isinstance(notes, str):
        notes = [notes]
    notes.append(f'GHCID regenerated {timestamp}: abbreviation {current_abbrev}→{new_abbrev} from emic_name')
    prov['notes'] = notes
    data['provenance'] = prov

    # Write updated YAML
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file if needed
    if needs_rename:
        filepath.rename(new_path)
        result['renamed_to'] = new_filename

    return result


def main():
    """Parse CLI arguments, process custodian files, and print a report."""
    parser = argparse.ArgumentParser(description="Regenerate GHCIDs using emic_name")
    parser.add_argument('--apply', action='store_true', help='Actually apply changes (default: dry run)')
    parser.add_argument('--country', type=str, help='Only process files for specific country code (e.g., DE)')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show detailed output')
    args = parser.parse_args()

    custodian_dir = Path(__file__).parent.parent / "data" / "custodian"

    print("=" * 70)
    print("GHCID Regeneration from Emic Names")
    print("=" * 70)
    print(f"Directory: {custodian_dir}")
    print(f"Mode: {'APPLY CHANGES' if args.apply else 'DRY RUN (use --apply to modify files)'}")
    if args.country:
        print(f"Country filter: {args.country}")
    if args.limit is not None:
        print(f"Limit: {args.limit} files")
    print("=" * 70)
    print()

    # Find files with emic_name
    yaml_files = sorted(custodian_dir.glob("*.yaml"))

    if args.country:
        yaml_files = [f for f in yaml_files if f.name.startswith(f"{args.country}-")]

    # ``is not None`` so that an explicit ``--limit 0`` is honoured.
    if args.limit is not None:
        yaml_files = yaml_files[:args.limit]

    results = []
    error_count = 0

    for processed, filepath in enumerate(yaml_files, start=1):
        if processed % 500 == 0:
            print(f"  Processed {processed}/{len(yaml_files)} files...")

        try:
            result = process_custodian_file(filepath, apply=args.apply)
        except Exception as e:  # top-level boundary: report and keep going
            error_count += 1
            print(f"  Error processing {filepath.name}: {e}")
        else:
            if result:
                results.append(result)

    # Files whose target filename already exists are reported but never modified.
    conflict_count = sum(1 for r in results if r.get('rename_error'))
    changed_count = len(results) - conflict_count

    # Report
    print()
    print("=" * 70)
    print(f"RESULTS: {changed_count} files {'updated' if args.apply else 'would be updated'}")
    if conflict_count:
        print(f"SKIPPED: {conflict_count} files (target filename already exists)")
    if error_count:
        print(f"ERRORS: {error_count} files could not be processed")
    print("=" * 70)
    print()

    if results:
        # Group by country
        by_country = {}
        for r in results:
            by_country.setdefault(r['current_ghcid'][:2], []).append(r)

        for country in sorted(by_country):
            items = by_country[country]
            print(f"\n{country}: {len(items)} files")
            print("-" * 50)

            # Show first entries per country unless verbose
            shown = items if args.verbose else items[:REPORT_PREVIEW_COUNT]
            for r in shown:
                print(f"  {r['current_ghcid']} → {r['new_ghcid']}")
                print(f"    claim: {r['claim_value'][:50]}...")
                print(f"    emic:  {r['emic_name'][:50]}...")
                print(f"    abbrev: {r['current_abbrev']} → {r['new_abbrev']}")
                if r.get('renamed_to'):
                    print(f"    RENAMED TO: {r['renamed_to']}")
                if r.get('rename_error'):
                    print(f"    RENAME ERROR: {r['rename_error']}")
                print()

            if len(items) > REPORT_PREVIEW_COUNT and not args.verbose:
                print(f"  ... and {len(items) - REPORT_PREVIEW_COUNT} more (use -v to see all)")

    print()
    print("=" * 70)
    if not args.apply:
        print("This was a DRY RUN. Use --apply to actually modify files.")
    else:
        print(f"Done! {changed_count} files updated.")
    print("=" * 70)


if __name__ == "__main__":
    main()