268 lines
9 KiB
Python
268 lines
9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Regenerate GHCIDs for custodians where emic_name differs from claim_value.
|
|
|
|
This script finds custodians that have been enriched with emic_name (native language name)
|
|
and regenerates their GHCID abbreviation component using the emic_name instead of the
|
|
English claim_value.
|
|
|
|
Per AGENTS.md grandfathering policy: Existing GHCIDs from UNESCO MoW custodians are
|
|
grandfathered for PID stability. This script generates a REPORT of which files WOULD
|
|
be updated, but does NOT automatically apply changes without explicit confirmation.
|
|
|
|
Usage:
|
|
# Dry run (default) - show what would change
|
|
python scripts/regenerate_ghcids_emic_name.py
|
|
|
|
# Apply changes
|
|
python scripts/regenerate_ghcids_emic_name.py --apply
|
|
|
|
# Process specific country
|
|
python scripts/regenerate_ghcids_emic_name.py --country DE
|
|
|
|
# Limit number of files
|
|
python scripts/regenerate_ghcids_emic_name.py --limit 10
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import sys
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
# Add src to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from glam_extractor.identifiers.ghcid import extract_abbreviation_from_name
|
|
|
|
|
|
def get_current_abbreviation(ghcid: str) -> str:
    """Return the trailing abbreviation portion of a GHCID.

    A GHCID looks like ``CC-RR-CCC-T-ABBREV`` or
    ``CC-RR-CCC-T-ABBREV-collision_suffix``. Everything that follows the
    fourth hyphen is returned verbatim, so a collision suffix (if any) is
    part of the result. A string with fewer than five hyphen-separated
    components yields the empty string.
    """
    # Splitting at most four times leaves the whole tail (hyphens included)
    # in the fifth element.
    segments = ghcid.split('-', 4)
    return segments[4] if len(segments) == 5 else ''
|
|
|
|
|
|
def build_new_ghcid(old_ghcid: str, new_abbrev: str) -> str:
    """Swap the abbreviation of a GHCID for ``new_abbrev``.

    The first four hyphen-separated components (country, region, city and
    type codes) are kept; everything after them -- the old abbreviation and
    any collision suffix -- is replaced by ``new_abbrev``. A GHCID with fewer
    than five components is returned unchanged.
    """
    pieces = old_ghcid.split('-', 4)
    if len(pieces) < 5:
        # Not enough components to locate an abbreviation; leave it alone.
        return old_ghcid

    *prefix, _old_tail = pieces
    return '-'.join([*prefix, new_abbrev])
|
|
|
|
|
|
def generate_uuid_v5(ghcid_string: str) -> str:
    """Generate a deterministic UUID v5 for a GHCID string.

    The UUID is derived from the name
    ``https://w3id.org/heritage/custodian/<ghcid_string>``, so the same GHCID
    always maps to the same UUID.

    Args:
        ghcid_string: The full GHCID, e.g. ``NL-NH-AMS-M-RM``.

    Returns:
        The UUID in canonical 36-character hyphenated form.
    """
    # NOTE: this value is the RFC 4122 *DNS* namespace (uuid.NAMESPACE_DNS),
    # not the URL namespace (which ends in ...811), even though the hashed
    # name is a URL. It must stay as-is: changing it would change every
    # UUID produced by this function.
    GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    return str(uuid.uuid5(GHCID_NAMESPACE, f"https://w3id.org/heritage/custodian/{ghcid_string}"))
|
|
|
|
|
|
def process_custodian_file(filepath: Path, apply: bool = False) -> dict | None:
    """Check one custodian YAML file and optionally regenerate its GHCID.

    The abbreviation component of ``ghcid.ghcid_current`` is compared
    (case-insensitively) with the abbreviation derived from
    ``custodian_name.emic_name``. When they differ, a replacement GHCID with
    the same country/region/city/type prefix and its UUID v5 are computed.

    Args:
        filepath: Path of the custodian YAML file.
        apply: When true, write the new GHCID, UUID, history entry and
            provenance note back to the file and rename it to
            ``<new_ghcid>.yaml``. When false, nothing on disk is touched.

    Returns:
        ``None`` when there is nothing to change: empty document, no
        ``emic_name``, no ``ghcid_current``, no abbreviation could be derived,
        or the GHCID would stay the same. Otherwise a dict with the keys
        ``file``, ``current_ghcid``, ``new_ghcid``, ``current_abbrev``,
        ``new_abbrev``, ``english_abbrev``, ``emic_name``, ``claim_value`` and
        ``new_uuid``. In apply mode it additionally carries ``renamed_to``
        (file was renamed) or ``rename_error`` (target filename already
        exists; the file was left completely untouched).

    Raises:
        ValueError: If the YAML document is not a mapping at top level.
    """
    # Custodian files hold native-language names, so the platform default
    # encoding must not be relied on.
    with open(filepath, encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data:
        return None
    if not isinstance(data, dict):
        raise ValueError(
            f"expected a YAML mapping at top level, got {type(data).__name__}"
        )

    # A key written without a value (``custodian_name:``) loads as None, so
    # ``or`` is used instead of a .get() default throughout.
    cn = data.get('custodian_name') or {}
    emic_name = cn.get('emic_name') or ''
    claim_value = cn.get('claim_value') or ''

    if not emic_name:
        return None

    ghcid_data = data.get('ghcid') or {}
    current_ghcid = ghcid_data.get('ghcid_current') or ''

    if not current_ghcid:
        return None

    current_abbrev = get_current_abbreviation(current_ghcid)
    # current_abbrev may carry a collision suffix ("ABBREV-collision_suffix").
    # Assumes the abbreviation itself never contains a hyphen -- TODO confirm
    # against the GHCID specification.
    base_abbrev = current_abbrev.split('-', 1)[0]

    new_abbrev = extract_abbreviation_from_name(emic_name)
    if not new_abbrev:
        # Without an abbreviation the new GHCID would end in a bare hyphen.
        return None

    # The English abbreviation is informational only (reported, never written).
    english_abbrev = extract_abbreviation_from_name(claim_value) if claim_value else ''

    # Skip when the abbreviation is already correct. Comparing against the
    # base abbreviation as well keeps collision-suffixed GHCIDs from being
    # "regenerated" into a suffix-less (and therefore colliding) identifier.
    if new_abbrev.upper() in (current_abbrev.upper(), base_abbrev.upper()):
        return None

    new_ghcid = build_new_ghcid(current_ghcid, new_abbrev)
    if new_ghcid == current_ghcid:
        # build_new_ghcid returns its input unchanged for GHCIDs with fewer
        # than five components; there is nothing meaningful to update.
        return None
    new_uuid = generate_uuid_v5(new_ghcid)

    result = {
        'file': filepath.name,
        'current_ghcid': current_ghcid,
        'new_ghcid': new_ghcid,
        'current_abbrev': current_abbrev,
        'new_abbrev': new_abbrev,
        'english_abbrev': english_abbrev,
        'emic_name': emic_name,
        'claim_value': claim_value,
        'new_uuid': new_uuid,
    }

    if not apply:
        return result

    # Detect a filename clash BEFORE touching the file: writing the new GHCID
    # and then failing to rename would leave two files claiming the same
    # identifier.
    new_filename = f"{new_ghcid}.yaml"
    new_path = filepath.parent / new_filename
    needs_rename = filepath.name != new_filename
    if needs_rename and new_path.exists():
        result['rename_error'] = f"Target file {new_filename} already exists"
        return result

    timestamp = datetime.now(timezone.utc).isoformat()

    # Update GHCID fields.
    ghcid_data['ghcid_current'] = new_ghcid
    ghcid_data['ghcid_uuid'] = new_uuid
    # Note: ghcid_numeric is not regenerated here; that needs the full algorithm.
    ghcid_data['generation_timestamp'] = timestamp

    # Append to the GHCID history.
    history = ghcid_data.get('ghcid_history') or []
    history.append({
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f'GHCID abbreviation regenerated from emic_name "{emic_name}" (was "{claim_value}", abbrev {current_abbrev}→{new_abbrev})',
    })
    ghcid_data['ghcid_history'] = history
    data['ghcid'] = ghcid_data

    # Add a provenance note; a single string note is promoted to a list.
    prov = data.get('provenance') or {}
    notes = prov.get('notes') or []
    if isinstance(notes, str):
        notes = [notes]
    notes.append(f'GHCID regenerated {timestamp}: abbreviation {current_abbrev}→{new_abbrev} from emic_name')
    prov['notes'] = notes
    data['provenance'] = prov

    # Serialise first so a dump failure cannot leave a truncated file behind.
    serialized = yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    if needs_rename:
        filepath.rename(new_path)
        result['renamed_to'] = new_filename

    return result
|
|
|
|
|
|
def _preview(text, width: int = 50) -> str:
    """Return ``text`` cut to ``width`` characters, adding "..." only if cut."""
    text = str(text or '')
    return text if len(text) <= width else f"{text[:width]}..."


def main():
    """Command-line entry point: report (or apply) GHCID regeneration.

    Scans ``data/custodian/*.yaml`` next to the scripts directory, runs
    ``process_custodian_file`` on each file (optionally filtered by
    ``--country`` and capped by ``--limit``) and prints a per-country report.
    Files are only modified when ``--apply`` is given.
    """
    parser = argparse.ArgumentParser(description="Regenerate GHCIDs using emic_name")
    parser.add_argument('--apply', action='store_true', help='Actually apply changes (default: dry run)')
    parser.add_argument('--country', type=str, help='Only process files for specific country code (e.g., DE)')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show detailed output')
    args = parser.parse_args()

    # A negative limit would silently slice from the end of the file list.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer")

    # GHCID filenames start with an upper-case country code, so "--country de"
    # is normalised rather than silently matching nothing.
    country_filter = args.country.upper() if args.country else None

    custodian_dir = Path(__file__).parent.parent / "data" / "custodian"
    if not custodian_dir.is_dir():
        # Otherwise glob() yields nothing and the run looks like a clean pass.
        sys.exit(f"Error: custodian directory not found: {custodian_dir}")

    print("=" * 70)
    print("GHCID Regeneration from Emic Names")
    print("=" * 70)
    print(f"Directory: {custodian_dir}")
    print(f"Mode: {'APPLY CHANGES' if args.apply else 'DRY RUN (use --apply to modify files)'}")
    if country_filter:
        print(f"Country filter: {country_filter}")
    if args.limit is not None:
        print(f"Limit: {args.limit} files")
    print("=" * 70)
    print()

    # Collect candidate files (sorted for a reproducible order).
    yaml_files = sorted(custodian_dir.glob("*.yaml"))

    if country_filter:
        yaml_files = [f for f in yaml_files if f.name.startswith(f"{country_filter}-")]

    # "is not None" so that "--limit 0" really means zero files.
    if args.limit is not None:
        yaml_files = yaml_files[:args.limit]

    results = []
    error_count = 0

    for processed, filepath in enumerate(yaml_files, start=1):
        if processed % 500 == 0:
            print(f" Processed {processed}/{len(yaml_files)} files...")

        # Top-level boundary: one bad file must not abort the whole run, but
        # every failure is printed and counted.
        try:
            result = process_custodian_file(filepath, apply=args.apply)
        except Exception as e:
            error_count += 1
            print(f" Error processing {filepath.name}: {e}")
        else:
            if result:
                results.append(result)

    # Report
    print()
    print("=" * 70)
    print(f"RESULTS: {len(results)} files {'updated' if args.apply else 'would be updated'}")
    print("=" * 70)
    print()

    if results:
        # Group by the two-letter country prefix of the current GHCID.
        by_country = {}
        for r in results:
            by_country.setdefault(r['current_ghcid'][:2], []).append(r)

        for country in sorted(by_country):
            items = by_country[country]
            print(f"\n{country}: {len(items)} files")
            print("-" * 50)

            # Show the first 10 per country unless verbose.
            shown = items if args.verbose else items[:10]
            for r in shown:
                print(f" {r['current_ghcid']} → {r['new_ghcid']}")
                print(f" claim: {_preview(r.get('claim_value'))}")
                print(f" emic: {_preview(r.get('emic_name'))}")
                print(f" abbrev: {r['current_abbrev']} → {r['new_abbrev']}")
                if args.apply:
                    if r.get('renamed_to'):
                        print(f" RENAMED TO: {r['renamed_to']}")
                    if r.get('rename_error'):
                        print(f" RENAME ERROR: {r['rename_error']}")
                print()

            if len(items) > 10 and not args.verbose:
                print(f" ... and {len(items) - 10} more (use -v to see all)")

    # Rename conflicts can be hidden by the 10-per-country cap above, so they
    # are always listed explicitly.
    conflicts = [r for r in results if r.get('rename_error')]
    if conflicts:
        print()
        print(f"WARNING: {len(conflicts)} files hit a rename conflict:")
        for r in conflicts:
            print(f" {r['file']}: {r['rename_error']}")

    if error_count:
        print()
        print(f"WARNING: {error_count} files could not be processed (see errors above)")

    print()
    print("=" * 70)
    if not args.apply:
        print("This was a DRY RUN. Use --apply to actually modify files.")
    else:
        print(f"Done! {len(results)} files updated.")
    print("=" * 70)
|
|
|
|
|
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    sys.exit(main())
|