Enrichment scripts for country-specific city data: - enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py - enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py - enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py Location resolution utilities: - resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames - resolve_cities_wikidata.py - Use Wikidata P131 for city resolution - resolve_country_codes.py - Standardize country codes - resolve_cz_xx_regions.py - Fix Czech XX region codes - resolve_locations_by_name.py - Name-based location lookup - resolve_regions_from_city.py - Derive regions from city data - update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data CH-Annotator integration: - create_custodian_from_ch_annotator.py - Create custodians from annotations - add_ch_annotator_location_claims.py - Add location claims - extract_locations_ch_annotator.py - Extract locations from annotations Migration and fixes: - migrate_egyptian_from_ch.py - Migrate Egyptian data - migrate_web_archives.py - Migrate web archive data - fix_belgian_cities.py - Fix Belgian city data
353 lines
11 KiB
Python
Executable file
353 lines
11 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
"""
|
||
Resolve XX region codes using city names extracted from institution names.
|
||
|
||
This script handles files without coordinates or Wikidata IDs by:
|
||
1. Extracting city names from institution names
|
||
2. Looking up cities in GeoNames database
|
||
3. Mapping to ISO 3166-2 region codes
|
||
|
||
Following AGENTS.md Rules:
|
||
- Rule 5: Additive only - never delete existing data
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import yaml
|
||
import sqlite3
|
||
import re
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Optional, Dict, Any, List, Tuple
|
||
|
||
# Belgian city name patterns
#
# Maps lowercase city-name variants (Dutch / French / English spellings) to
# ISO 3166-2:BE second-level subdivision codes:
#   BRU=Brussels-Capital, VAN=Antwerp, VOV=East Flanders, VWV=West Flanders,
#   VBR=Flemish Brabant, VLI=Limburg, WLG=Liege, WHT=Hainaut, WNA=Namur,
#   WLX=Luxembourg
#
# NOTE: extract_city_from_name() scans these keys as substrings in insertion
# order and returns the first hit, so entry order is significant — do not
# reorder. Short keys ('boom', 'peer', 'niel', 'lint', 'bree') can in
# principle false-positive inside longer words.
BELGIAN_CITIES = {
    'brussel': 'BRU', 'bruxelles': 'BRU', 'brussels': 'BRU',
    'antwerpen': 'VAN', 'anvers': 'VAN', 'antwerp': 'VAN',
    'gent': 'VOV', 'ghent': 'VOV', 'gand': 'VOV',
    'brugge': 'VWV', 'bruges': 'VWV',
    'leuven': 'VBR', 'louvain': 'VBR',
    'mechelen': 'VAN', 'malines': 'VAN',
    'hasselt': 'VLI',
    'luik': 'WLG', 'liège': 'WLG', 'liege': 'WLG',
    'charleroi': 'WHT',
    'namur': 'WNA', 'namen': 'WNA',
    # 'bergen' is the Dutch name for Mons (Hainaut).
    'mons': 'WHT', 'bergen': 'WHT',
    'tournai': 'WHT', 'doornik': 'WHT',
    'kortrijk': 'VWV', 'courtrai': 'VWV',
    'oostende': 'VWV', 'ostende': 'VWV',
    'aalst': 'VOV', 'alost': 'VOV',
    'sint-niklaas': 'VOV',
    'dendermonde': 'VOV',
    'genk': 'VLI',
    'roeselare': 'VWV',
    'mouscron': 'WHT', 'moeskroen': 'WHT',
    'tienen': 'VBR', 'tirlemont': 'VBR',
    'ieper': 'VWV', 'ypres': 'VWV',
    'turnhout': 'VAN',
    'waregem': 'VWV',
    'lokeren': 'VOV',
    'beveren': 'VOV',
    'vilvoorde': 'VBR',
    'dilbeek': 'VBR',
    'schoten': 'VAN',
    'brasschaat': 'VAN',
    'boom': 'VAN',
    'mortsel': 'VAN',
    'temse': 'VOV',
    'herzele': 'VOV',
    'brecht': 'VAN',
    'oudenaarde': 'VOV',
    'rotselaar': 'VBR',
    'niel': 'VAN',
    'lint': 'VAN',
    'ravels': 'VAN',
    'bree': 'VLI',
    'peer': 'VLI',
    'meeuwen': 'VLI',
    'gruitrode': 'VLI',
    'arlon': 'WLX', 'aarlen': 'WLX',
    'bastogne': 'WLX', 'bastenaken': 'WLX',
}
|
||
|
||
# Austrian state codes
#
# Maps lowercase state and capital-city names (German / English spellings) to
# ISO 3166-2:AT numeric state codes ('1'..'9'). Capital cities map to their
# state (e.g. 'graz' -> Styria).
#
# NOTE: matched as substrings in insertion order by extract_city_from_name();
# first hit wins, so entry order is significant.
AUSTRIAN_STATES = {
    'wien': '9', 'vienna': '9',
    'salzburg': '5',
    'tirol': '7', 'tyrol': '7', 'innsbruck': '7',
    'vorarlberg': '8', 'bregenz': '8',
    'kärnten': '2', 'carinthia': '2', 'klagenfurt': '2',
    'steiermark': '6', 'styria': '6', 'graz': '6',
    'oberösterreich': '4', 'upper austria': '4', 'linz': '4',
    'niederösterreich': '3', 'lower austria': '3', 'st. pölten': '3',
    'burgenland': '1', 'eisenstadt': '1',
}
|
||
|
||
# Bulgarian province codes
#
# Maps lowercase city names (Latin and Bulgarian Cyrillic spellings) to
# ISO 3166-2:BG province codes. Matched as substrings in insertion order
# by extract_city_from_name(); first hit wins.
BULGARIAN_PROVINCES = {
    # FIX: the original Cyrillic key 'софія' is the Ukrainian spelling
    # (letter 'і'); Bulgarian spells Sofia 'софия', so the old key could
    # never match Bulgarian text. The old key is kept (additive only,
    # AGENTS.md Rule 5) and the correct spelling added.
    'sofia': '22', 'софия': '22', 'софія': '22',
    'plovdiv': '16', 'пловдив': '16',
    'varna': '03', 'варна': '03',
    'burgas': '02', 'бургас': '02',
    'ruse': '18', 'русе': '18',
    # Cyrillic variant added for parity with the other entries.
    'stara zagora': '24', 'стара загора': '24',
    'pleven': '15', 'плевен': '15',
}
|
||
|
||
# Swiss canton codes (abbreviated)
#
# Maps lowercase city names (German / French / English spellings) to two-letter
# ISO 3166-2:CH canton codes. Cities map to the canton they belong to
# (e.g. 'winterthur' -> ZH, 'lausanne' -> VD).
#
# NOTE: matched as substrings in insertion order by extract_city_from_name();
# first hit wins, so entry order is significant.
# NOTE(review): 'basel' is mapped to BS (Basel-Stadt) only — institutions in
# Basel-Landschaft would be mislabeled; confirm against the data.
SWISS_CANTONS = {
    'zürich': 'ZH', 'zurich': 'ZH',
    'bern': 'BE', 'berne': 'BE',
    'luzern': 'LU', 'lucerne': 'LU',
    'genève': 'GE', 'geneva': 'GE', 'genf': 'GE',
    'basel': 'BS',
    'lausanne': 'VD',
    'winterthur': 'ZH',
    'st. gallen': 'SG', 'st gallen': 'SG',
    'lugano': 'TI',
    'biel': 'BE', 'bienne': 'BE',
    'thun': 'BE',
    'fribourg': 'FR', 'freiburg': 'FR',
    'schaffhausen': 'SH',
    'chur': 'GR',
    'neuchâtel': 'NE', 'neuchatel': 'NE',
    'sion': 'VS',
    'aarau': 'AG',
    'baden': 'AG',
}
|
||
|
||
|
||
def extract_city_from_name(name: str, country: str) -> Optional[Tuple[str, str]]:
    """
    Extract a city name from an institution name via substring lookup.

    Args:
        name: Institution name to scan (any case).
        country: ISO 3166-1 alpha-2 country code; 'BE', 'AT', 'BG' and 'CH'
            are supported.

    Returns:
        (title_cased_city_name, region_code) for the first matching city,
        or None when the country is unsupported or no city substring matches.
    """
    # Dispatch table replaces the four duplicated per-country if/elif
    # branches that each ran the identical lookup loop.
    tables: Dict[str, Dict[str, str]] = {
        'BE': BELGIAN_CITIES,
        'AT': AUSTRIAN_STATES,
        'BG': BULGARIAN_PROVINCES,
        'CH': SWISS_CANTONS,
    }
    table = tables.get(country)
    if table is None:
        return None

    name_lower = name.lower()
    # Insertion order matters: the first matching substring wins.
    # NOTE(review): plain substring matching can false-positive on short
    # keys (e.g. 'boom', 'peer') embedded inside longer words.
    for city, region in table.items():
        if city in name_lower:
            return (city.title(), region)

    return None
|
||
|
||
|
||
def update_file_with_region(filepath: Path, region_code: str, city_name: str,
                            dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """Update a custodian file with a resolved region code.

    Rewrites the XX placeholder in the GHCID string and filename, appends a
    GHCID history entry and a provenance note (additive only, AGENTS.md
    Rule 5). Files whose region code is not the XX placeholder are skipped.

    Args:
        filepath: Path to the custodian YAML file.
        region_code: Resolved ISO 3166-2 region code (e.g. 'VAN').
        city_name: City used for the lookup; stored as region_name.
        dry_run: When True, nothing is written or renamed on disk.

    Returns:
        (success, new_path) where new_path is the renamed file path when the
        filename changed (in dry-run mode: the prospective path), else None.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False, None

    # FIX: yaml.safe_load() returns None for an empty document, which made
    # the original `'ghcid' not in data` raise TypeError. Guard the type.
    if not isinstance(data, dict) or 'ghcid' not in data:
        return False, None

    ghcid = data['ghcid']
    if 'location_resolution' not in ghcid:
        ghcid['location_resolution'] = {}

    loc_res = ghcid['location_resolution']
    country_code = loc_res.get('country_code', '')

    if not country_code:
        return False, None

    old_region = loc_res.get('region_code', 'XX')

    # Only resolve the XX placeholder; never overwrite a real region code.
    if old_region != 'XX':
        return False, None

    # Update location resolution
    loc_res['region_code'] = region_code
    loc_res['region_name'] = city_name
    loc_res['method'] = 'NAME_LOOKUP'
    loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat()

    # Update GHCID string (e.g. 'BE-XX-...' -> 'BE-VAN-...')
    old_ghcid = ghcid.get('ghcid_current', '')
    new_ghcid = old_ghcid.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-')

    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid

        # History entries are only ever appended, never rewritten (Rule 5).
        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []

        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'reason': f"Region resolved via name lookup: XX->{region_code} (city: {city_name})"
        })

    # Add provenance note
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    elif isinstance(data['provenance']['notes'], str):
        # Normalize a single string note into a list before appending.
        data['provenance']['notes'] = [data['provenance']['notes']]

    data['provenance']['notes'].append(
        f"Region resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX->{region_code} via name lookup (city: {city_name})"
    )

    # Determine new filename
    new_filename = filepath.name.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-')
    new_filepath = filepath.parent / new_filename

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        if new_filepath != filepath:
            if not new_filepath.exists():
                filepath.rename(new_filepath)
            else:
                # FIX: on a name collision the original skipped the rename
                # but still reported the target path as the new location.
                # Keep reporting the file's actual (unchanged) path.
                new_filepath = filepath

    return True, new_filepath if new_filepath != filepath else None
|
||
|
||
|
||
def main():
    """CLI entry point.

    Scans the custodian directory for YAML files whose names contain the XX
    region placeholder, extracts a city from each institution name, and
    resolves the region code via the built-in country tables. Dry run by
    default; pass --apply to actually write and rename files.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using city names from institution names'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply
    # Accept lowercase codes on the CLI; stored country codes are uppercase.
    country_filter = args.country.upper() if args.country else None

    print("=" * 70)
    print("REGION RESOLUTION VIA NAME LOOKUP")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Find files with XX region codes.
    # FIX: glob order is filesystem-dependent; sort so --limit selects a
    # reproducible subset across runs.
    files_to_process = sorted(custodian_dir.glob('*-XX-*.yaml'))

    print(f"Found {len(files_to_process)} files with XX region codes")

    # Load files and extract institution names
    file_data = []
    for filepath in files_to_process[:args.limit]:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            # safe_load() returns None for empty documents; skip those.
            if not isinstance(data, dict):
                continue

            # Get country code
            country = None
            if 'ghcid' in data and 'location_resolution' in data['ghcid']:
                country = data['ghcid']['location_resolution'].get('country_code')

            if not country:
                continue

            if country_filter and country != country_filter:
                continue

            # Get institution name: prefer the claimed name, fall back to
            # the original registry entry.
            name = None
            if 'custodian_name' in data:
                name = data['custodian_name'].get('claim_value')
            if not name and 'original_entry' in data:
                name = data['original_entry'].get('name')

            if not name:
                continue

            # FIX: the parsed document is intentionally not stored here —
            # update_file_with_region() re-reads the file itself, so
            # keeping `data` alive only wasted memory.
            file_data.append({
                'filepath': filepath,
                'country': country,
                'name': name
            })
        except Exception as e:
            print(f"Error loading {filepath}: {e}")

    print(f"Processing {len(file_data)} files with institution names")
    print()

    # Process each file
    resolved = 0
    renamed = 0
    no_match = 0

    for entry in file_data:
        filepath = entry['filepath']
        name = entry['name']
        country = entry['country']

        # Try to extract city from name
        result = extract_city_from_name(name, country)

        if not result:
            no_match += 1
            continue

        city_name, region_code = result

        print(f"Processing {filepath.name}...")
        print(f" Name: {name}")
        print(f" City: {city_name} -> Region: {region_code}")

        # Update file
        success, new_path = update_file_with_region(filepath, region_code, city_name, dry_run=dry_run)

        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f" {filepath.name} -> {new_path.name}")

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"No city match: {no_match}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")


if __name__ == '__main__':
    main()
|