Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
154 lines
5.4 KiB
Python
154 lines
5.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Migrate Egyptian institutions incorrectly placed under CH (Switzerland) to EG (Egypt).
|
|
"""
|
|
|
|
import re
|
|
import sqlite3
|
|
import unicodedata
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Lookup table: Egyptian city name -> the region code and city code that are
# substituted into the GHCID (EG-<region>-<city_code>-...) and into the
# region_code / city_code fields of a migrated record.
EGYPTIAN_CITIES = {
    city: {'region': region, 'city_code': city_code}
    for city, region, city_code in (
        ('Cairo', 'C', 'CAI'),
        ('Alexandria', 'ALX', 'ALX'),
        ('Giza', 'GZ', 'GIZ'),
        ('Assiut', 'AST', 'ASS'),
        ('Helwan', 'C', 'HEL'),
        ('6th of October City', 'GZ', 'OCT'),
        ('Ain Shams', 'C', 'ASH'),
        ('Maadi', 'C', 'MAA'),
        ('New Cairo', 'C', 'NCA'),
    )
}
|
|
|
|
def extract_city_from_name(name):
    """Guess the Egyptian city an institution belongs to from its name.

    The name is lower-cased and searched for city keywords in a fixed
    priority order (Cairo, Alexandria, Assiut, Giza); the first group with
    a matching keyword wins.

    Args:
        name: Institution name. ``None`` or an empty string is accepted.

    Returns:
        One of ``'Cairo'``, ``'Alexandria'``, ``'Assiut'`` or ``'Giza'``.
        ``'Cairo'`` is the default when nothing matches, since institutions
        without a recognisable city (national bodies, "Nile ...", "Maadi ...")
        are treated as Cairo-based.
    """
    default_city = 'Cairo'
    if not name:
        return default_city

    name_lower = name.lower()

    # Order matters: a name mentioning both Cairo and another city resolves
    # to Cairo, exactly as the original if-chain did.
    city_keywords = (
        ('Cairo', ('cairo', 'ain shams', 'helwan')),
        ('Alexandria', ('alexandria',)),
        ('Assiut', ('assiut', 'asyut')),
        ('Giza', ('giza', 'october')),
    )
    for city, keywords in city_keywords:
        if any(keyword in name_lower for keyword in keywords):
            return city

    # The former explicit 'nile' / 'maadi' / 'egypt' checks all returned the
    # default as well, so they are covered by this single fall-through.
    return default_city
|
|
|
|
def update_file(file_path, city_name, dry_run=False):
    """Rewrite one custodian YAML file from the CH namespace to the EG namespace.

    The ``CH-XX-XXX-`` prefix of the record's GHCID is replaced by
    ``EG-<region>-<city_code>-``, the country / region / city codes are
    switched, a new entry is inserted at the top of ``ghcid_history`` and
    the file is renamed to match the new GHCID prefix.

    Args:
        file_path: Location of the YAML file (``str`` or ``Path``).
        city_name: Key into ``EGYPTIAN_CITIES``; unknown names fall back to
            the Cairo codes (region ``C``, city ``CAI``).
        dry_run: When true, only compute the GHCID change; the file is
            neither rewritten nor renamed.

    Returns:
        ``(False, None)`` if the file has no ``ghcid_current`` line,
        otherwise ``(True, (old_ghcid, new_ghcid))``.
    """
    file_path = Path(file_path)

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    city_info = EGYPTIAN_CITIES.get(city_name, {'region': 'C', 'city_code': 'CAI'})
    region_code = city_info['region']
    city_code = city_info['city_code']

    # Get current GHCID
    old_ghcid_match = re.search(r'ghcid_current:\s*([^\n]+)', content)
    if not old_ghcid_match:
        return False, None

    old_ghcid = old_ghcid_match.group(1).strip()

    # Create new GHCID with EG namespace
    new_ghcid = re.sub(r'^CH-XX-XXX-', f'EG-{region_code}-{city_code}-', old_ghcid)

    if dry_run:
        return True, (old_ghcid, new_ghcid)

    # Update all GHCID references
    for key in ('ghcid_current', 'ghcid_original', 'identifier_value', 'ghcid'):
        content = content.replace(f'{key}: {old_ghcid}', f'{key}: {new_ghcid}')

    # Update country code. The trailing \b keeps the placeholder match from
    # firing on longer values that merely start with the same letters.
    content = re.sub(r'country:\s*CH\b', 'country: EG', content)
    content = re.sub(r'country_code:\s*CH\b', 'country_code: EG', content)

    # Update location_resolution
    content = re.sub(r'region_code:\s*XX\b', f'region_code: {region_code}', content)
    content = re.sub(r'city_code:\s*XXX\b', f'city_code: {city_code}', content)

    # Add history entry
    timestamp = datetime.now(timezone.utc).isoformat()
    history_match = re.search(r'(ghcid_history:\s*\n)', content)
    if history_match:
        # The match consumes the newline (and any blank lines) after the key,
        # so insert_pos is the start of the first line below it. The entry
        # must therefore end with its own newline; otherwise its last line
        # would be glued onto the following line and corrupt the YAML.
        insert_pos = history_match.end()

        first_item = re.compile(r'([ \t]*)-[ \t]').match(content, insert_pos)
        if first_item:
            # Line up with the existing history entries.
            indent = first_item.group(1)
        else:
            # No existing entry to copy from: nest two spaces under the key.
            line_start = content.rfind('\n', 0, history_match.start()) + 1
            key_prefix = content[line_start:history_match.start()]
            indent = re.match(r'[ \t]*', key_prefix).group(0) + '  '

        history_entry = (
            f"{indent}- ghcid: {new_ghcid}\n"
            f"{indent}  valid_from: '{timestamp}'\n"
            f"{indent}  reason: Migrated from CH to EG namespace - {city_name}\n"
        )
        content = content[:insert_pos] + history_entry + content[insert_pos:]

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

    # Rename file
    old_filename = file_path.name
    new_filename = old_filename.replace('CH-XX-XXX-', f'EG-{region_code}-{city_code}-')
    if new_filename != old_filename:
        # NOTE(review): Path.rename replaces an existing target silently on
        # POSIX - confirm no EG-... file with the same name can already exist.
        file_path.rename(file_path.with_name(new_filename))

    return True, (old_ghcid, new_ghcid)
|
|
|
|
def main():
    """Find Egyptian institutions filed under CH-XX-XXX and migrate them to EG.

    Scans ``<repo>/data/custodian`` for ``CH-XX-XXX-*.yaml`` files, takes the
    first ``claim_value:`` line of each file as the institution name, and
    migrates the file with ``update_file`` when the name contains an
    Egyptian keyword. Pass ``--dry-run`` on the command line to list the
    planned GHCID changes without modifying anything.
    """
    import sys

    dry_run = '--dry-run' in sys.argv

    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'

    print("Egyptian Institution Migration (CH → EG)")
    print("=" * 50)
    if dry_run:
        print("DRY RUN MODE\n")

    # Find CH-XX-XXX files that are actually Egyptian.
    # Sorted so that the report order is the same on every run.
    xxx_files = sorted(custodian_dir.glob('CH-XX-XXX-*.yaml'))
    print(f"Found {len(xxx_files)} CH-XX-XXX files\n")

    migrated = 0

    # 'egypt' stays a plain substring match so that "Egyptian", "Egyptology"
    # etc. are caught. Every other keyword must match as a whole word:
    # a bare substring test would also hit unrelated names ('nile' in
    # "juvenile", 'auc' in "sauce", 'bue' in "bueren") and move genuinely
    # Swiss records into the EG namespace.
    whole_word_keywords = ['cairo', 'alexandria', 'ain shams', 'helwan', 'assiut',
                           'giza', 'nile', 'al-azhar', 'dar al-kutub', 'guc', 'auc', 'bue']
    egyptian_pattern = re.compile(
        r'egypt|\b(?:' + '|'.join(map(re.escape, whole_word_keywords)) + r')\b'
    )

    for file_path in xxx_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Check if this is an Egyptian institution
        name_match = re.search(r'claim_value:\s*(.+)', content)
        if not name_match:
            continue

        inst_name = name_match.group(1).strip().lower()

        if egyptian_pattern.search(inst_name) is None:
            continue

        city = extract_city_from_name(inst_name)
        success, ghcid_change = update_file(file_path, city, dry_run)

        if success:
            if dry_run:
                print(f" {file_path.name}")
                print(f" → {ghcid_change[0]} → {ghcid_change[1]}")
            else:
                # file_path still holds the pre-rename name here.
                print(f"✓ Migrated: {file_path.name} → {city}")
            migrated += 1

    print(f"\n{'=' * 50}")
    print(f"Migrated: {migrated}")
|
|
|
|
# Run the migration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|