#!/usr/bin/env python3
"""
Handle GHCID collisions by adding name suffixes.

When fixing region codes, some files may collide with existing files.
This script resolves those by adding snake_case name suffixes.
"""
|
|
|
|
import os
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
|
|
|
|
# Same mappings as the main fix script

# Human-readable names keyed by the numeric region codes this script treats
# as valid (reported as "ISO 3166-2:CZ" in the history reason text).
REGION_NAMES = {
    '10': 'Prague',
    '20': 'Central Bohemian',
    '31': 'South Bohemian',
    '32': 'Plzeň',
    '41': 'Karlovy Vary',
    '42': 'Ústí nad Labem',
    '51': 'Liberec',
    '52': 'Hradec Králové',
    '53': 'Pardubice',
    '63': 'Vysočina',
    '64': 'South Moravian',
    '71': 'Olomouc',
    '72': 'Zlín',
    '80': 'Moravian-Silesian',
}

# The set of accepted numeric region codes: exactly the keys of REGION_NAMES.
VALID_ISO_CODES = set(REGION_NAMES)

# Two-letter region codes found in file names -> numeric region code.
# Note that two pairs of letter codes share a target ('10' and '64').
LETTER_TO_ISO = {
    'HL': '10',
    'PR': '10',
    'ST': '20',
    'JC': '31',
    'PL': '32',
    'KA': '41',
    'US': '42',
    'LI': '51',
    'KR': '52',
    'PA': '53',
    'VY': '63',
    'JI': '64',
    'JM': '64',
    'OL': '71',
    'ZL': '72',
    'MO': '80',
}

# Numeric codes 78..90 (except 80 itself, which is already valid) all
# collapse into region '80'.
DISTRICT_TO_REGION = {
    str(code): '80' for code in range(78, 91) if code != 80
}
|
|
|
|
|
|
def get_correct_region_code(old_code: str) -> Optional[str]:
    """Return the corrected numeric region code for ``old_code``.

    Lookup order:
      1. a code already present in ``VALID_ISO_CODES`` is returned unchanged;
      2. otherwise it is translated through ``LETTER_TO_ISO``;
      3. otherwise through ``DISTRICT_TO_REGION``.

    Returns:
        The numeric region code, or ``None`` when ``old_code`` appears in
        none of the tables (callers test the result for falsiness).
    """
    if old_code in VALID_ISO_CODES:
        return old_code

    # Letter codes are consulted before district codes, preserving the
    # precedence of the original if-chain.
    for table in (LETTER_TO_ISO, DISTRICT_TO_REGION):
        if old_code in table:
            return table[old_code]

    return None
|
|
|
|
|
|
def generate_name_suffix(name: str) -> str:
    """Turn an institution name into a lowercase snake_case suffix.

    The name is NFD-decomposed and its combining marks (category ``Mn``)
    are dropped, then it is lowercased and passed through a fixed sequence
    of regex substitutions. Leading and trailing underscores are trimmed.
    The result contains only ``a-z``, ``0-9`` and ``_`` and may be empty.
    """
    decomposed = unicodedata.normalize('NFD', name)
    suffix = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    ).lower()

    # Order matters: punctuation is deleted before whitespace/hyphen runs
    # become separators, and underscore runs are collapsed last.
    substitutions = (
        (r"[''`\",.:;!?()[\]{}]", ''),   # apostrophes, commas, brackets, ...
        (r'[\s\-]+', '_'),               # spaces and hyphens -> underscore
        (r'[^a-z0-9_]', ''),             # anything else that is not alnum/_
        (r'_+', '_'),                    # collapse repeated underscores
    )
    for pattern, replacement in substitutions:
        suffix = re.sub(pattern, replacement, suffix)

    return suffix.strip('_')
|
|
|
|
|
|
def extract_name_from_yaml(filepath: Path) -> Optional[str]:
    """Extract the institution name from a custodian YAML file.

    The file is scanned as text (it is not parsed as YAML). Two places are
    tried in order:

      1. a ``name:`` key on the line directly following ``original_entry:``;
      2. the first ``claim_value:`` key anywhere in the file.

    The value must sit on the same line as its key; an empty ``name:`` no
    longer swallows the following line but falls through to the next
    candidate. One pair of matching surrounding quotes is removed.

    Returns:
        The name, or ``None`` when neither key yields a non-empty value.
    """
    content = filepath.read_text(encoding='utf-8')

    # ``[ \t]*(\S.*)`` keeps the captured value on the key's own line and
    # requires at least one non-blank character.
    candidates = (
        r'original_entry:\s*\n\s*name:[ \t]*(\S.*)',
        r'claim_value:[ \t]*(\S.*)',
    )

    for pattern in candidates:
        match = re.search(pattern, content)
        if not match:
            continue
        value = match.group(1).strip()
        # Drop one pair of matching YAML quotes: "..." or '...'.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            value = value[1:-1].strip()
        if value:
            return value

    return None
|
|
|
|
|
|
def _column_indent(content: str, pos: int) -> str:
    """Return a run of spaces as wide as the column of ``pos`` in its line."""
    line_start = content.rfind('\n', 0, pos) + 1
    return ' ' * (pos - line_start)


def _replace_after_key(content: str, key_pattern: str, old_value: str,
                       new_value: str, flags: int = 0) -> str:
    """Replace ``old_value`` with ``new_value`` where it follows ``key_pattern``.

    ``old_value`` must be a complete token: it may not be followed by a word
    character or hyphen, so a longer value sharing the prefix is left alone.
    A function replacement is used so ``new_value`` is inserted literally.
    """
    pattern = re.compile(
        '(' + key_pattern + ')' + re.escape(old_value) + r'(?![\w-])',
        flags,
    )
    return pattern.sub(lambda m: m.group(1) + new_value, content)


def _render_history_items(indent: str, new_ghcid: str, old_ghcid: str,
                          timestamp: str, reason: str, old_reason: str) -> str:
    """Render the new and the retired ghcid_history items at ``indent``."""
    field_indent = indent + '  '
    return '\n'.join([
        f'{indent}- ghcid: {new_ghcid}',
        f'{field_indent}valid_from: "{timestamp}"',
        f'{field_indent}valid_to: null',
        f'{field_indent}reason: "{reason}"',
        f'{indent}- ghcid: {old_ghcid}',
        f'{field_indent}valid_from: null',
        f'{field_indent}valid_to: "{timestamp}"',
        f'{field_indent}reason: "{old_reason}"',
    ])


def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
                     old_region: str, new_region: str) -> str:
    """Fix the YAML content with new GHCID and region codes.

    Works on the raw text (the document is not parsed as YAML):

      * ``ghcid_current:`` and ``identifier_value:`` entries equal to
        ``old_ghcid`` are rewritten to ``new_ghcid``;
      * the ``region_code:`` following ``location_resolution:`` and
        ``location:`` is rewritten from ``old_region`` to ``new_region``;
      * two ``ghcid_history`` items are added: the new GHCID (valid from the
        current UTC time) and the old one (valid until that time).

    History items are prepended to an existing ``ghcid_history:`` list using
    the indentation of its first item. When no history key exists, one is
    added on the line after ``ghcid_current:``, aligned with that key. A
    ``ghcid_history:`` key with no newline after it (for example
    ``ghcid_history: []``) is left untouched.

    Returns:
        The updated document text.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    region_name = REGION_NAMES.get(new_region, 'Unknown')
    reason = (
        f"Corrected region code from CZ-{old_region} to CZ-{new_region} "
        f"({region_name}) with name suffix for collision resolution "
        f"per ISO 3166-2:CZ"
    )

    # Replace GHCID in ghcid_current and in identifiers
    content = _replace_after_key(content, r'ghcid_current:\s*', old_ghcid, new_ghcid)
    content = _replace_after_key(content, r'identifier_value:\s*', old_ghcid, new_ghcid)

    # Replace region_code in location_resolution, then in the location section.
    # DOTALL lets the lazy span run across lines from the section key to the
    # first following ``region_code:`` that carries the old value.
    content = _replace_after_key(
        content, r'location_resolution:.*?region_code:\s*',
        old_region, new_region, flags=re.DOTALL,
    )
    content = _replace_after_key(
        content, r'location:.*?region_code:\s*',
        old_region, new_region, flags=re.DOTALL,
    )

    if 'ghcid_history:' in content:
        # Prepend to the first existing history list only.
        anchor = re.search(r'ghcid_history:\s*\n([ \t]*)', content)
        if anchor is not None:
            if content.startswith('-', anchor.end()):
                # Reuse the indentation of the existing first list item.
                item_indent = anchor.group(1)
            else:
                # No list item follows: nest the items under the key itself.
                item_indent = _column_indent(content, anchor.start()) + '  '
            items = _render_history_items(
                item_indent, new_ghcid, old_ghcid, timestamp, reason,
                'Previous GHCID before collision resolution',
            )
            insert_at = anchor.start(1)
            content = content[:insert_at] + items + '\n' + content[insert_at:]
    else:
        current = re.search(
            r'ghcid_current:\s*' + re.escape(new_ghcid) + r'(?![\w-])',
            content,
        )
        if current is not None:
            key_indent = _column_indent(content, current.start())
            items = _render_history_items(
                key_indent + '  ', new_ghcid, old_ghcid, timestamp, reason,
                'Original GHCID before collision resolution',
            )
            # Insert at the end of the ghcid_current line so anything else on
            # that line (e.g. a trailing comment) stays attached to it.
            eol = content.find('\n', current.end())
            if eol == -1:
                eol = len(content)
            content = (
                content[:eol]
                + '\n' + key_indent + 'ghcid_history:\n' + items
                + content[eol:]
            )

    return content
|
|
|
|
|
|
def process_collision_file(filepath: Path, dry_run: bool = False) -> dict:
    """Process a file that couldn't be fixed due to collision.

    The GHCID is taken from the file name
    (``CZ-<region>-<city>-<type>-<abbrev>.yaml``). The region is corrected,
    a snake_case suffix derived from the institution name is appended, and
    the file is rewritten under the new name; the old file is then removed.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When true, nothing is written or deleted.

    Returns:
        A dict whose ``status`` is one of:
          * ``'skipped'``   – file name does not match, or region already correct;
          * ``'error'``     – unknown region, no institution name, or the name
            produces an empty suffix;
          * ``'collision'`` – the suffixed target file already exists
            (also reported in dry-run mode);
          * ``'would_fix'`` – dry run, the change would succeed;
          * ``'fixed'``     – the file was rewritten and renamed.
    """
    # Extract current GHCID components from filename
    match = re.match(r'CZ-([A-Z0-9]+)-([A-Z]+)-([A-Z])-(.+)\.yaml', filepath.name)
    if not match:
        return {'status': 'skipped', 'reason': 'filename pattern mismatch'}
    old_region, city, inst_type, abbrev = match.groups()

    # Get correct region code
    new_region = get_correct_region_code(old_region)
    if not new_region:
        return {'status': 'error', 'reason': f'unknown region code: {old_region}'}
    if old_region == new_region:
        return {'status': 'skipped', 'reason': 'already correct'}

    # Get institution name
    inst_name = extract_name_from_yaml(filepath)
    if not inst_name:
        return {'status': 'error', 'reason': 'could not extract institution name'}

    # Generate name suffix; without one the new GHCID would end in a bare '-'.
    name_suffix = generate_name_suffix(inst_name)
    if not name_suffix:
        return {
            'status': 'error',
            'reason': f'institution name yields an empty suffix: {inst_name!r}',
        }

    old_ghcid = f"CZ-{old_region}-{city}-{inst_type}-{abbrev}"
    new_ghcid = f"CZ-{new_region}-{city}-{inst_type}-{abbrev}-{name_suffix}"
    new_filepath = filepath.parent / f"{new_ghcid}.yaml"

    # Check for an existing target before the dry-run return, so a dry run
    # reports the same outcome a real run would.
    if new_filepath.exists():
        return {
            'status': 'collision',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'reason': 'Target still exists even with name suffix'
        }

    if dry_run:
        return {
            'status': 'would_fix',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'name': inst_name,
            'name_suffix': name_suffix,
        }

    # Read, fix, and write content; the old file is removed only after the
    # new one has been written.
    content = filepath.read_text(encoding='utf-8')
    new_content = fix_yaml_content(content, old_ghcid, new_ghcid, old_region, new_region)
    new_filepath.write_text(new_content, encoding='utf-8')
    filepath.unlink()

    return {
        'status': 'fixed',
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'name': inst_name,
    }
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Collects every ``CZ-<code>-*.yaml`` file in ``--dir`` whose region code
    is a key of ``LETTER_TO_ISO`` or ``DISTRICT_TO_REGION``, runs
    ``process_collision_file`` on each in sorted order, and prints one line
    per file followed by a summary. With ``--dry-run`` no file is modified.

    Exits through ``parser.error`` when ``--dir`` is not a directory.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Handle Czech region code collisions')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be changed')
    parser.add_argument('--dir', default='/Users/kempersc/apps/glam/data/custodian', help='Directory')
    args = parser.parse_args()

    custodian_dir = Path(args.dir)
    if not custodian_dir.is_dir():
        # Otherwise a mistyped path is indistinguishable from "nothing to do".
        parser.error(f'not a directory: {custodian_dir}')

    # Find all CZ-*.yaml files with non-ISO region codes
    non_iso_codes = LETTER_TO_ISO.keys() | DISTRICT_TO_REGION.keys()
    collision_files = sorted(
        path
        for code in non_iso_codes
        for path in custodian_dir.glob(f'CZ-{code}-*.yaml')
    )

    print(f"Found {len(collision_files)} files with non-ISO region codes (collision victims)")

    results = {'fixed': [], 'would_fix': [], 'errors': [], 'collisions': [], 'skipped': []}

    for filepath in collision_files:
        result = process_collision_file(filepath, dry_run=args.dry_run)
        status = result['status']

        if status in ('fixed', 'would_fix'):
            results[status].append(result)
            action = 'Would fix' if args.dry_run else 'Fixed'
            print(f" {action}: {result['old_ghcid']} -> {result['new_ghcid']}")
        elif status == 'collision':
            results['collisions'].append(result)
            print(f" COLLISION: {result}")
        elif status == 'error':
            results['errors'].append(result)
            print(f" ERROR: {filepath.name} - {result['reason']}")
        elif status == 'skipped':
            # Previously dropped silently; report so no input file goes unaccounted.
            results['skipped'].append(result)
            print(f" SKIPPED: {filepath.name} - {result['reason']}")

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f" Fixed/Would fix: {len(results['fixed']) + len(results['would_fix'])}")
    print(f" Collisions: {len(results['collisions'])}")
    print(f" Errors: {len(results['errors'])}")
    print(f" Skipped: {len(results['skipped'])}")


if __name__ == '__main__':
    main()
|