glam/scripts/fix_czech_collisions.py
2025-12-10 13:01:13 +01:00

277 lines
8.9 KiB
Python

#!/usr/bin/env python3
"""
Handle GHCID collisions by adding name suffixes.
When fixing region codes, some files may collide with existing files.
This script resolves those by adding snake_case name suffixes.
"""
import os
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Same mappings as the main fix script

# Two-letter region abbreviations -> two-digit region code.
# NOTE(review): 'JI' and 'JM' both map to '64' -- confirm against the main
# fix script that this is intended.
LETTER_TO_ISO = {
    'HL': '10',
    'PR': '10',
    'ST': '20',
    'JC': '31',
    'PL': '32',
    'KA': '41',
    'US': '42',
    'LI': '51',
    'KR': '52',
    'PA': '53',
    'VY': '63',
    'JI': '64',
    'JM': '64',
    'OL': '71',
    'ZL': '72',
    'MO': '80',
}

# Numeric district codes -> region code.  '80' is deliberately absent from the
# keys because it is itself one of the valid region codes below.
# NOTE(review): every district code resolves to '80' -- verify against the
# main fix script.
DISTRICT_TO_REGION = dict.fromkeys(
    (
        '78', '79', '81', '82', '83', '84',
        '85', '86', '87', '88', '89', '90',
    ),
    '80',
)

# Region codes that are accepted unchanged.
VALID_ISO_CODES = {
    '10', '20',
    '31', '32',
    '41', '42',
    '51', '52', '53',
    '63', '64',
    '71', '72',
    '80',
}

# Display name for each valid region code (used in history "reason" text).
REGION_NAMES = {
    '10': 'Prague',
    '20': 'Central Bohemian',
    '31': 'South Bohemian',
    '32': 'Plzeň',
    '41': 'Karlovy Vary',
    '42': 'Ústí nad Labem',
    '51': 'Liberec',
    '52': 'Hradec Králové',
    '53': 'Pardubice',
    '63': 'Vysočina',
    '64': 'South Moravian',
    '71': 'Olomouc',
    '72': 'Zlín',
    '80': 'Moravian-Silesian',
}
def get_correct_region_code(old_code: str) -> Optional[str]:
    """Return the corrected region code for ``old_code``.

    Args:
        old_code: Region segment of a GHCID. It may already be a valid code,
            a two-letter abbreviation, or a numeric district code.

    Returns:
        The two-digit region code, or ``None`` when ``old_code`` is not found
        in any of the lookup tables.
    """
    # Already-valid codes pass through untouched.
    if old_code in VALID_ISO_CODES:
        return old_code
    # Letter abbreviations take precedence over district codes.
    for mapping in (LETTER_TO_ISO, DISTRICT_TO_REGION):
        if old_code in mapping:
            return mapping[old_code]
    return None
def generate_name_suffix(name: str) -> str:
    """Turn an institution name into a lowercase snake_case suffix.

    Diacritics are stripped, punctuation is dropped, runs of whitespace and
    hyphens become a single underscore, and anything outside ``[a-z0-9_]``
    is removed.  The result never starts or ends with an underscore and may
    be empty.
    """
    # NFD splits accented letters into base letter + combining mark ('Mn');
    # dropping the marks leaves the plain base letters.
    decomposed = unicodedata.normalize('NFD', name)
    slug = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    ).lower()

    # Applied strictly in this order.
    substitutions = (
        (r"[''`\",.:;!?()[\]{}]", ''),  # apostrophes, commas, brackets, ...
        (r'[\s\-]+', '_'),              # whitespace / hyphen runs -> "_"
        (r'[^a-z0-9_]', ''),            # whatever non-alphanumerics remain
        (r'_+', '_'),                   # collapse repeated underscores
    )
    for pattern, replacement in substitutions:
        slug = re.sub(pattern, replacement, slug)

    return slug.strip('_')
def _unquote_yaml_scalar(value: str) -> str:
    """Strip one pair of matching YAML quotes from ``value``, if present."""
    if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
        inner = value[1:-1]
        if value[0] == "'":
            # Single-quoted YAML escapes a quote by doubling it.
            return inner.replace("''", "'")
        return inner.replace('\\"', '"')
    return value


def extract_name_from_yaml(filepath: Path) -> Optional[str]:
    """Extract the institution name from a custodian YAML file.

    The file is scanned with regular expressions (it is not parsed as YAML).
    The ``name`` key directly under ``original_entry`` is preferred; the first
    ``claim_value`` in the file is the fallback.

    Args:
        filepath: Path of the UTF-8 encoded YAML file to read.

    Returns:
        The name with surrounding whitespace and YAML quotes removed, or
        ``None`` when no non-empty name is found.
    """
    content = filepath.read_text(encoding='utf-8')
    patterns = (
        r'original_entry:\s*\n\s*name:\s*(.+)',  # name in original_entry
        r'claim_value:\s*(.+)',                  # custodian_name.claim_value
    )
    for pattern in patterns:
        match = re.search(pattern, content)
        if not match:
            continue
        name = _unquote_yaml_scalar(match.group(1).strip()).strip()
        # An empty quoted scalar ("" or '') is not a usable name; try the
        # next source instead of returning the bare quote characters.
        if name:
            return name
    return None
def _swap_scalar(content: str, key_pattern: str, old_value: str,
                 new_value: str, flags: int = 0) -> str:
    """Replace ``old_value`` with ``new_value`` where it follows ``key_pattern``.

    An optional opening quote before the value is tolerated, and the old value
    must end at a token boundary so that a longer value which merely starts
    with ``old_value`` is left alone.
    """
    pattern = re.compile(
        r'(' + key_pattern + r'\s*["\']?)' + re.escape(old_value) + r'(?![\w-])',
        flags,
    )
    # A callable replacement inserts new_value literally; a template string
    # would interpret backslashes and group references inside it.
    return pattern.sub(lambda m: m.group(1) + new_value, content)


def _history_block(indent: str, new_ghcid: str, old_ghcid: str,
                   timestamp: str, reason: str, old_reason: str) -> str:
    """Render the two ghcid_history items (new first) at ``indent``."""
    field = indent + '  '
    return '\n'.join((
        f'{indent}- ghcid: {new_ghcid}',
        f'{field}valid_from: "{timestamp}"',
        f'{field}valid_to: null',
        f'{field}reason: "{reason}"',
        f'{indent}- ghcid: {old_ghcid}',
        f'{field}valid_from: null',
        f'{field}valid_to: "{timestamp}"',
        f'{field}reason: "{old_reason}"',
    ))


def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
                     old_region: str, new_region: str) -> str:
    """Rewrite YAML text so it carries the new GHCID and region code.

    The text is edited with regular expressions, not parsed as YAML:

    * ``ghcid_current`` and every ``identifier_value`` equal to ``old_ghcid``
      are set to ``new_ghcid``;
    * ``region_code`` values equal to ``old_region`` that follow a
      ``location_resolution:`` or ``location:`` key are set to ``new_region``;
    * two ``ghcid_history`` items (new GHCID, then old GHCID) are inserted at
      the top of an existing history list, or a new ``ghcid_history`` key is
      added right after the ``ghcid_current`` line.

    History entries are indented to match the surrounding document instead of
    using a fixed layout, so the result stays valid YAML whatever indentation
    the file uses.

    Args:
        content: Full text of the YAML file.
        old_ghcid: GHCID currently stored in the file.
        new_ghcid: Replacement GHCID (with corrected region and name suffix).
        old_region: Region code currently stored in the file.
        new_region: Corrected region code.

    Returns:
        The updated YAML text.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    region_name = REGION_NAMES.get(new_region, 'Unknown')
    reason = f"Corrected region code from CZ-{old_region} to CZ-{new_region} ({region_name}) with name suffix for collision resolution per ISO 3166-2:CZ"

    # GHCID in ghcid_current and in the identifiers list.
    content = _swap_scalar(content, r'ghcid_current:', old_ghcid, new_ghcid)
    content = _swap_scalar(content, r'identifier_value:', old_ghcid, new_ghcid)

    # region_code under location_resolution, then under location.  DOTALL lets
    # the lazy ".*?" run across lines from the section key to region_code.
    content = _swap_scalar(
        content, r'location_resolution:.*?region_code:',
        old_region, new_region, re.DOTALL,
    )
    content = _swap_scalar(
        content, r'location:.*?region_code:',
        old_region, new_region, re.DOTALL,
    )

    if 'ghcid_history:' in content:
        def prepend_items(m):
            # group(2) is the indentation of the first existing "- " item;
            # the new items reuse it and are placed ahead of that item.
            indent = m.group(2)
            items = _history_block(
                indent, new_ghcid, old_ghcid, timestamp, reason,
                "Previous GHCID before collision resolution",
            )
            return f'{m.group(1)}{items}\n{indent}'

        content = re.sub(
            r'(ghcid_history:[ \t]*\n(?:[ \t]*\n)*)([ \t]*)(?=-\s)',
            prepend_items,
            content,
        )
    else:
        def append_history(m):
            # The new key is a sibling of ghcid_current, so it shares that
            # line's indentation; list items sit at the same column.
            indent = m.group(1)
            items = _history_block(
                indent, new_ghcid, old_ghcid, timestamp, reason,
                "Original GHCID before collision resolution",
            )
            return f'{m.group(0)}\n{indent}ghcid_history:\n{items}'

        content = re.sub(
            r'^([ \t]*)ghcid_current:[ \t]*["\']?'
            + re.escape(new_ghcid) + r'(?![\w-]).*$',
            append_history,
            content,
            flags=re.MULTILINE,
        )
    return content
def process_collision_file(filepath: Path, dry_run: bool = False) -> dict:
    """Rename one custodian file to a corrected, name-suffixed GHCID.

    The GHCID is taken from the filename
    (``CZ-<region>-<city>-<type>-<abbrev>.yaml``).  The region is corrected,
    a snake_case suffix derived from the institution name is appended, the
    YAML content is rewritten to match, and the file is moved to the new name.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When true, nothing is written; the planned change is
            reported with status ``'would_fix'``.

    Returns:
        A dict whose ``'status'`` is one of ``'fixed'``, ``'would_fix'``,
        ``'collision'``, ``'error'`` or ``'skipped'``.  ``'error'`` and
        ``'skipped'`` carry a ``'reason'``; the others carry ``'old_ghcid'``
        and ``'new_ghcid'``.
    """
    # Extract current GHCID components from the filename.
    match = re.match(r'CZ-([A-Z0-9]+)-([A-Z]+)-([A-Z])-(.+)\.yaml', filepath.name)
    if not match:
        return {'status': 'skipped', 'reason': 'filename pattern mismatch'}
    old_region, city, inst_type, abbrev = match.groups()

    new_region = get_correct_region_code(old_region)
    if not new_region:
        return {'status': 'error', 'reason': f'unknown region code: {old_region}'}
    if old_region == new_region:
        return {'status': 'skipped', 'reason': 'already correct'}

    inst_name = extract_name_from_yaml(filepath)
    if not inst_name:
        return {'status': 'error', 'reason': 'could not extract institution name'}

    name_suffix = generate_name_suffix(inst_name)
    if not name_suffix:
        # Without this guard the new GHCID (and filename) would end in "-".
        return {
            'status': 'error',
            'reason': f'institution name yields an empty suffix: {inst_name}',
        }

    old_ghcid = f"CZ-{old_region}-{city}-{inst_type}-{abbrev}"
    new_ghcid = f"CZ-{new_region}-{city}-{inst_type}-{abbrev}-{name_suffix}"
    new_filepath = filepath.parent / f"{new_ghcid}.yaml"

    # Checked before the dry-run report so a dry run predicts the same
    # outcome the real run would produce.
    if new_filepath.exists():
        return {
            'status': 'collision',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'reason': 'Target still exists even with name suffix'
        }

    if dry_run:
        return {
            'status': 'would_fix',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'name': inst_name,
            'name_suffix': name_suffix,
        }

    # Write the new file first and delete the old one only afterwards, so a
    # failed write never loses the record.
    content = filepath.read_text(encoding='utf-8')
    new_content = fix_yaml_content(content, old_ghcid, new_ghcid, old_region, new_region)
    new_filepath.write_text(new_content, encoding='utf-8')
    filepath.unlink()
    return {
        'status': 'fixed',
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'name': inst_name,
    }
def main():
    """Command-line entry point.

    Collects every ``CZ-<code>-*.yaml`` file in the target directory whose
    region code is one of the known non-ISO codes, runs
    ``process_collision_file`` on each in sorted order, and prints a
    per-file line plus a summary.
    """
    import argparse

    cli = argparse.ArgumentParser(description='Handle Czech region code collisions')
    cli.add_argument('--dry-run', action='store_true', help='Show what would be changed')
    cli.add_argument('--dir', default='/Users/kempersc/apps/glam/data/custodian', help='Directory')
    args = cli.parse_args()
    custodian_dir = Path(args.dir)

    # Find all CZ-*.yaml files with non-ISO region codes
    legacy_codes = set(LETTER_TO_ISO) | set(DISTRICT_TO_REGION)
    collision_files = []
    for code in legacy_codes:
        collision_files += custodian_dir.glob(f'CZ-{code}-*.yaml')
    print(f"Found {len(collision_files)} files with non-ISO region codes (collision victims)")

    results = {'fixed': [], 'would_fix': [], 'errors': [], 'collisions': []}
    action = 'Would fix' if args.dry_run else 'Fixed'
    for filepath in sorted(collision_files):
        outcome = process_collision_file(filepath, dry_run=args.dry_run)
        status = outcome['status']
        if status == 'collision':
            results['collisions'].append(outcome)
            print(f" COLLISION: {outcome}")
        elif status == 'error':
            results['errors'].append(outcome)
            print(f" ERROR: {filepath.name} - {outcome['reason']}")
        elif status in ('fixed', 'would_fix'):
            results[status].append(outcome)
            print(f" {action}: {outcome['old_ghcid']} -> {outcome['new_ghcid']}")
        # Any other status (e.g. 'skipped') is neither printed nor counted.

    banner = 'DRY RUN - ' if args.dry_run else ''
    changed = len(results['fixed']) + len(results['would_fix'])
    print(f"\n{banner}Summary:")
    print(f" Fixed/Would fix: {changed}")
    print(f" Collisions: {len(results['collisions'])}")
    print(f" Errors: {len(results['errors'])}")


if __name__ == '__main__':
    main()