glam/scripts/migrate_egyptian_from_ch.py
kempersc e45c1a3c85 feat(scripts): add city enrichment and location resolution utilities
Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
2025-12-07 14:26:59 +01:00

154 lines
5.4 KiB
Python

#!/usr/bin/env python3
"""
Migrate Egyptian institutions incorrectly placed under CH (Switzerland) to EG (Egypt).
"""
import re
import sqlite3
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
# Lookup table: Egyptian city name -> the region code and city code that
# are substituted into the "EG-<region>-<city_code>-" GHCID prefix.
# Each row below is (city name, region code, city code).
EGYPTIAN_CITIES = {
    city: {'region': region, 'city_code': code}
    for city, region, code in (
        ('Cairo', 'C', 'CAI'),
        ('Alexandria', 'ALX', 'ALX'),
        ('Giza', 'GZ', 'GIZ'),
        ('Assiut', 'AST', 'ASS'),
        ('Helwan', 'C', 'HEL'),
        ('6th of October City', 'GZ', 'OCT'),
        ('Ain Shams', 'C', 'ASH'),
        ('Maadi', 'C', 'MAA'),
        ('New Cairo', 'C', 'NCA'),
    )
}
def extract_city_from_name(name):
    """Guess the Egyptian city an institution belongs to from its name.

    The name is lower-cased and checked against an ordered list of
    substring markers; the first rule with a matching marker decides the
    city. Names that match no rule (including ones that only mention
    the Nile, Maadi or Egypt) resolve to 'Cairo'.

    Args:
        name: Institution name (any casing).

    Returns:
        One of 'Cairo', 'Alexandria', 'Assiut' or 'Giza'.
    """
    lowered = name.lower()

    # Order matters: a name mentioning both Cairo and Giza resolves to Cairo.
    ordered_rules = (
        ('Cairo', ('cairo', 'ain shams', 'helwan')),
        ('Alexandria', ('alexandria',)),
        ('Assiut', ('assiut', 'asyut')),
        ('Giza', ('giza', 'october')),
    )
    for city, markers in ordered_rules:
        if any(marker in lowered for marker in markers):
            return city

    # Everything else is treated as a Cairo-based (or national) institution.
    return 'Cairo'
def update_file(file_path, city_name, dry_run=False):
    """Rewrite one custodian YAML file from the CH to the EG namespace.

    The file's ``ghcid_current`` value must start with ``CH-XX-XXX-``; that
    prefix is replaced by ``EG-<region>-<city_code>-`` using the codes
    registered for ``city_name`` in ``EGYPTIAN_CITIES`` (Cairo codes are
    used for unknown cities). Country, region and city code fields are
    updated, a history entry is added under ``ghcid_history:`` when that
    key is present, and the file is renamed to carry the new prefix.

    Args:
        file_path: Path (or path string) of the YAML file to migrate.
        city_name: City name used to look up region and city codes.
        dry_run: When true, nothing is written or renamed; only the
            planned GHCID change is returned.

    Returns:
        ``(True, (old_ghcid, new_ghcid))`` on success (or on a dry run),
        ``(False, None)`` when the file cannot be migrated: no
        ``ghcid_current`` value, a GHCID that does not start with
        ``CH-XX-XXX-``, or a rename target that already exists.
    """
    file_path = Path(file_path)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    city_info = EGYPTIAN_CITIES.get(city_name, {'region': 'C', 'city_code': 'CAI'})
    region_code = city_info['region']
    city_code = city_info['city_code']
    old_prefix = 'CH-XX-XXX-'
    new_prefix = f'EG-{region_code}-{city_code}-'

    # Get current GHCID
    old_ghcid_match = re.search(r'ghcid_current:\s*([^\n]+)', content)
    if not old_ghcid_match:
        return False, None
    old_ghcid = old_ghcid_match.group(1).strip()

    # Refuse to touch the file when the GHCID is not in the placeholder CH
    # namespace; otherwise country codes and history would change while the
    # identifier itself stayed the same.
    if not old_ghcid.startswith(old_prefix):
        return False, None
    new_ghcid = new_prefix + old_ghcid[len(old_prefix):]

    # Path.rename() can silently replace an existing file, so never migrate
    # onto a file that is already present under the EG name.
    new_path = file_path.with_name(file_path.name.replace(old_prefix, new_prefix))
    if new_path != file_path and new_path.exists():
        return False, None

    if dry_run:
        return True, (old_ghcid, new_ghcid)

    # Update all GHCID references.
    # NOTE(review): the plain 'ghcid' key also rewrites matching values in
    # existing ghcid_history entries - confirm that this is intended.
    for key in ('ghcid_current', 'ghcid_original', 'identifier_value', 'ghcid'):
        content = content.replace(f'{key}: {old_ghcid}', f'{key}: {new_ghcid}')

    # Update country and location codes. The trailing \b keeps longer values
    # (e.g. 'country: CHL') from being partially rewritten.
    content = re.sub(r'country:\s*CH\b', 'country: EG', content)
    content = re.sub(r'country_code:\s*CH\b', 'country_code: EG', content)
    content = re.sub(r'region_code:\s*XX\b', f'region_code: {region_code}', content)
    content = re.sub(r'city_code:\s*XXX\b', f'city_code: {city_code}', content)

    # Add history entry directly below the 'ghcid_history:' key.
    history_match = re.search(
        r'^([ \t]*)ghcid_history:(?:[ \t]*\n)+', content, re.MULTILINE
    )
    if history_match:
        insert_pos = history_match.end()
        # Reuse the indentation of the first existing list item so the new
        # entry lines up with it; fall back to the key's own indentation,
        # which is also valid YAML for a block sequence.
        first_item = re.compile(r'([ \t]*)-[ \t]').match(content, insert_pos)
        item_indent = first_item.group(1) if first_item else history_match.group(1)
        field_indent = item_indent + '  '
        timestamp = datetime.now(timezone.utc).isoformat()
        # The entry ends with a newline so it cannot run into the next item.
        history_entry = (
            f"{item_indent}- ghcid: {new_ghcid}\n"
            f"{field_indent}valid_from: '{timestamp}'\n"
            f"{field_indent}reason: Migrated from CH to EG namespace - {city_name}\n"
        )
        content = content[:insert_pos] + history_entry + content[insert_pos:]

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

    # Rename file so its name carries the new GHCID prefix.
    if new_path != file_path:
        file_path.rename(new_path)

    return True, (old_ghcid, new_ghcid)
def main():
    """Find CH-XX-XXX custodian files that are actually Egyptian and migrate them.

    Scans ``<repo>/data/custodian`` (two directories above this script) for
    ``CH-XX-XXX-*.yaml`` files, takes the first ``claim_value:`` line as the
    institution name, and migrates every file whose name contains an
    Egyptian keyword. Pass ``--dry-run`` on the command line to print the
    planned GHCID changes without modifying any file.
    """
    import sys

    dry_run = '--dry-run' in sys.argv
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'

    print("Egyptian Institution Migration (CH → EG)")
    print("=" * 50)
    if dry_run:
        print("DRY RUN MODE\n")

    # Find CH-XX-XXX files that are actually Egyptian (sorted so that runs
    # are reproducible regardless of directory listing order).
    xxx_files = sorted(custodian_dir.glob('CH-XX-XXX-*.yaml'))
    print(f"Found {len(xxx_files)} CH-XX-XXX files\n")

    # Keywords are matched as whole words: plain substring tests let short
    # tokens such as 'auc', 'bue' or 'nile' hit unrelated names
    # ("Beauchamp", "juvenile") and would migrate genuine Swiss records.
    # 'egypt' alone is a word prefix so that "egyptian" still matches.
    egyptian_keywords = ['cairo', 'alexandria', 'ain shams', 'helwan', 'assiut',
                         'giza', 'nile', 'al-azhar', 'dar al-kutub', 'guc', 'auc', 'bue']
    alternatives = [r'egypt\w*'] + [re.escape(word) for word in egyptian_keywords]
    egyptian_pattern = re.compile(r'\b(?:' + '|'.join(alternatives) + r')\b')

    migrated = 0
    skipped = 0
    for file_path in xxx_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Check if this is an Egyptian institution
        name_match = re.search(r'claim_value:\s*(.+)', content)
        if not name_match:
            continue
        inst_name = name_match.group(1).strip().lower()
        if not egyptian_pattern.search(inst_name):
            continue

        city = extract_city_from_name(inst_name)
        success, ghcid_change = update_file(file_path, city, dry_run)
        if not success:
            # Report instead of silently dropping files that look Egyptian
            # but could not be migrated.
            print(f"✗ Skipped: {file_path.name}")
            skipped += 1
            continue

        if dry_run:
            print(f"  {file_path.name}")
            print(f"    {ghcid_change[0]} → {ghcid_change[1]}")
        else:
            print(f"✓ Migrated: {file_path.name} → {city}")
        migrated += 1

    print(f"\n{'=' * 50}")
    print(f"Migrated: {migrated}")
    if skipped:
        print(f"Skipped: {skipped}")
# Script entry point: run the migration only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()