#!/usr/bin/env python3
"""
Migrate Egyptian institutions incorrectly placed under CH (Switzerland) to EG (Egypt).

The script scans ``data/custodian/CH-XX-XXX-*.yaml``, picks the records whose
first ``claim_value`` looks Egyptian, rewrites their GHCID / country / location
codes to the EG namespace, adds a ``ghcid_history`` entry, and renames the file.

Usage: run with ``--dry-run`` to list the planned changes without writing.
"""

import re
import sqlite3
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path

# Placeholder GHCID / file-name prefix carried by the misfiled records.
OLD_PREFIX = 'CH-XX-XXX-'

# Egyptian city mapping
EGYPTIAN_CITIES = {
    'Cairo': {'region': 'C', 'city_code': 'CAI'},
    'Alexandria': {'region': 'ALX', 'city_code': 'ALX'},
    'Giza': {'region': 'GZ', 'city_code': 'GIZ'},
    'Assiut': {'region': 'AST', 'city_code': 'ASS'},
    'Helwan': {'region': 'C', 'city_code': 'HEL'},
    '6th of October City': {'region': 'GZ', 'city_code': 'OCT'},
    'Ain Shams': {'region': 'C', 'city_code': 'ASH'},
    'Maadi': {'region': 'C', 'city_code': 'MAA'},
    'New Cairo': {'region': 'C', 'city_code': 'NCA'},
}

# Codes used when the city is not in EGYPTIAN_CITIES.
DEFAULT_CITY_INFO = {'region': 'C', 'city_code': 'CAI'}

# Keywords that mark an institution name as Egyptian.
EGYPTIAN_KEYWORDS = (
    'egypt', 'cairo', 'alexandria', 'ain shams', 'helwan', 'assiut',
    'giza', 'nile', 'al-azhar', 'dar al-kutub', 'guc', 'auc', 'bue',
)

# Whole-word matching, so short keywords cannot fire inside unrelated words
# ("nile" in "juvenile", "auc" in "Lauchernalp").  "egypt" is a stem and may
# carry a suffix ("Egyptian", "Egyptology").
_EGYPTIAN_NAME_RE = re.compile(
    r'\b(?:'
    + '|'.join(
        re.escape(keyword) + (r'\w*' if keyword == 'egypt' else '')
        for keyword in EGYPTIAN_KEYWORDS
    )
    + r')\b',
    re.IGNORECASE,
)

# Ordered (city, substrings) rules; the first rule with a hit wins.
_CITY_RULES = (
    ('Cairo', ('cairo', 'ain shams', 'helwan')),
    ('Alexandria', ('alexandria',)),
    ('Assiut', ('assiut', 'asyut')),
    ('Giza', ('giza', 'october')),
)


def is_egyptian_name(name):
    """Return True if *name* contains one of EGYPTIAN_KEYWORDS as a whole word.

    Matching is case-insensitive.
    """
    return _EGYPTIAN_NAME_RE.search(name) is not None


def extract_city_from_name(name):
    """Extract Egyptian city from institution name.

    Returns 'Cairo', 'Alexandria', 'Assiut' or 'Giza'.  Rules are tried in
    order with case-insensitive substring matching; names that match no rule
    (including national institutions that name no city) default to 'Cairo'.
    """
    name_lower = name.lower()
    for city, needles in _CITY_RULES:
        if any(needle in name_lower for needle in needles):
            return city
    return 'Cairo'  # Default


def _insert_history_entry(content, new_ghcid, city_name, timestamp):
    """Return *content* with a new first entry under ``ghcid_history:``.

    The entry is written as complete lines and indented like the list item
    that follows the header, so the existing entries stay on their own lines.
    If there is no ``ghcid_history:`` header, *content* is returned unchanged.
    """
    header = re.search(r'(ghcid_history:\s*\n)', content)
    if not header:
        return content
    insert_pos = header.end()

    following_item = re.match(r'([ \t]*)- ', content[insert_pos:])
    if following_item:
        indent = following_item.group(1)
    else:
        # No existing entry to copy from: nest two spaces under the key.
        line_start = content.rfind('\n', 0, header.start()) + 1
        key_line = content[line_start:header.start()]
        indent = key_line[:len(key_line) - len(key_line.lstrip())] + '  '

    entry = (
        f"{indent}- ghcid: {new_ghcid}\n"
        f"{indent}  valid_from: '{timestamp}'\n"
        f"{indent}  reason: Migrated from CH to EG namespace - {city_name}\n"
    )
    return content[:insert_pos] + entry + content[insert_pos:]


def update_file(file_path, city_name, dry_run=False):
    """Update file from CH to EG namespace.

    Args:
        file_path: Path of the custodian YAML file (``Path`` or ``str``).
        city_name: Key into EGYPTIAN_CITIES; unknown cities use Cairo's codes.
        dry_run: If true, compute the GHCID change but do not touch the file.

    Returns:
        ``(True, (old_ghcid, new_ghcid))`` on success, or ``(False, None)``
        when the file has no ``ghcid_current`` or its GHCID does not start
        with the ``CH-XX-XXX-`` placeholder prefix (nothing to migrate).

    Raises:
        FileExistsError: If the renamed file would overwrite an existing
            file.  Raised before anything is written.
    """
    file_path = Path(file_path)
    content = file_path.read_text(encoding='utf-8')

    city_info = EGYPTIAN_CITIES.get(city_name, DEFAULT_CITY_INFO)
    region_code = city_info['region']
    city_code = city_info['city_code']
    new_prefix = f'EG-{region_code}-{city_code}-'

    # Get current GHCID
    current = re.search(r'ghcid_current:\s*([^\n]+)', content)
    if not current:
        return False, None
    old_ghcid = current.group(1).strip()

    # Without the placeholder prefix the GHCID would stay as it is while the
    # country codes were rewritten, leaving an inconsistent record.
    if not old_ghcid.startswith(OLD_PREFIX):
        return False, None
    new_ghcid = new_prefix + old_ghcid[len(OLD_PREFIX):]

    if dry_run:
        return True, (old_ghcid, new_ghcid)

    # Path.rename() silently replaces an existing target on POSIX; refuse
    # before modifying the file so no other record is lost.
    new_path = file_path.with_name(file_path.name.replace(OLD_PREFIX, new_prefix))
    if new_path != file_path and new_path.exists():
        raise FileExistsError(f'Refusing to overwrite existing file: {new_path}')

    # Update all GHCID references.
    # NOTE(review): the plain 'ghcid' key also rewrites older ghcid_history
    # entries that carry the old GHCID - kept as in the original; confirm.
    for key in ('ghcid_current', 'ghcid_original', 'identifier_value', 'ghcid'):
        content = content.replace(f'{key}: {old_ghcid}', f'{key}: {new_ghcid}')

    # Update country code and location_resolution.  The trailing \b keeps
    # longer codes (e.g. 'CHN') from being partially rewritten.
    content = re.sub(r'country:\s*CH\b', 'country: EG', content)
    content = re.sub(r'country_code:\s*CH\b', 'country_code: EG', content)
    content = re.sub(r'region_code:\s*XX\b', f'region_code: {region_code}', content)
    content = re.sub(r'city_code:\s*XXX\b', f'city_code: {city_code}', content)

    # Add history entry
    timestamp = datetime.now(timezone.utc).isoformat()
    content = _insert_history_entry(content, new_ghcid, city_name, timestamp)

    file_path.write_text(content, encoding='utf-8')

    # Rename file
    if new_path != file_path:
        file_path.rename(new_path)

    return True, (old_ghcid, new_ghcid)


def main():
    """Migrate every Egyptian-looking CH-XX-XXX custodian file.

    Pass ``--dry-run`` on the command line to only print the planned changes.
    """
    dry_run = '--dry-run' in sys.argv

    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'

    print("Egyptian Institution Migration (CH → EG)")
    print("=" * 50)
    if dry_run:
        print("DRY RUN MODE\n")

    # Find CH-XX-XXX files that are actually Egyptian.  The listing is
    # materialised up front because files are renamed while iterating.
    xxx_files = sorted(custodian_dir.glob('CH-XX-XXX-*.yaml'))
    print(f"Found {len(xxx_files)} CH-XX-XXX files\n")

    migrated = 0
    for file_path in xxx_files:
        content = file_path.read_text(encoding='utf-8')

        # Check if this is an Egyptian institution (first claim_value only)
        name_match = re.search(r'claim_value:\s*(.+)', content)
        if not name_match:
            continue
        inst_name = name_match.group(1).strip().lower()
        if not is_egyptian_name(inst_name):
            continue

        city = extract_city_from_name(inst_name)
        try:
            success, ghcid_change = update_file(file_path, city, dry_run)
        except FileExistsError as exc:
            print(f"✗ Skipped: {file_path.name} ({exc})")
            continue
        if not success:
            continue

        if dry_run:
            print(f" {file_path.name}")
            print(f" → {ghcid_change[0]} → {ghcid_change[1]}")
        else:
            print(f"✓ Migrated: {file_path.name} → {city}")
        migrated += 1

    print(f"\n{'=' * 50}")
    print(f"Migrated: {migrated}")


if __name__ == '__main__':
    main()