glam/scripts/resolve_locations_by_name.py
kempersc e45c1a3c85 feat(scripts): add city enrichment and location resolution utilities
Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
2025-12-07 14:26:59 +01:00

353 lines
11 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Resolve XX region codes using city names extracted from institution names.
This script handles files without coordinates or Wikidata IDs by:
1. Extracting city names from institution names
2. Looking up cities in the built-in per-country name tables (BE, AT, BG, CH)
3. Mapping to ISO 3166-2 region codes
Following AGENTS.md Rules:
- Rule 5: Additive only - never delete existing data
"""
import os
import sys
import yaml
import sqlite3
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple
# Belgian place names (Dutch / French / English spellings, lower-case) mapped
# to the region code used in the GHCID: BRU = Brussels, VAN = Antwerpen,
# VOV = Oost-Vlaanderen, VWV = West-Vlaanderen, VBR = Vlaams-Brabant,
# VLI = Limburg, WLG = Liège, WHT = Hainaut, WNA = Namur, WLX = Luxembourg.
# Entry order is significant: lookups walk the table front to back and stop
# at the first key found in the institution name.
BELGIAN_CITIES = {
    'brussel': 'BRU',
    'bruxelles': 'BRU',
    'brussels': 'BRU',
    'antwerpen': 'VAN',
    'anvers': 'VAN',
    'antwerp': 'VAN',
    'gent': 'VOV',
    'ghent': 'VOV',
    'gand': 'VOV',
    'brugge': 'VWV',
    'bruges': 'VWV',
    'leuven': 'VBR',
    'louvain': 'VBR',
    'mechelen': 'VAN',
    'malines': 'VAN',
    'hasselt': 'VLI',
    'luik': 'WLG',
    'liège': 'WLG',
    'liege': 'WLG',
    'charleroi': 'WHT',
    'namur': 'WNA',
    'namen': 'WNA',
    'mons': 'WHT',
    'bergen': 'WHT',
    'tournai': 'WHT',
    'doornik': 'WHT',
    'kortrijk': 'VWV',
    'courtrai': 'VWV',
    'oostende': 'VWV',
    'ostende': 'VWV',
    'aalst': 'VOV',
    'alost': 'VOV',
    'sint-niklaas': 'VOV',
    'dendermonde': 'VOV',
    'genk': 'VLI',
    'roeselare': 'VWV',
    'mouscron': 'WHT',
    'moeskroen': 'WHT',
    'tienen': 'VBR',
    'tirlemont': 'VBR',
    'ieper': 'VWV',
    'ypres': 'VWV',
    'turnhout': 'VAN',
    'waregem': 'VWV',
    'lokeren': 'VOV',
    'beveren': 'VOV',
    'vilvoorde': 'VBR',
    'dilbeek': 'VBR',
    'schoten': 'VAN',
    'brasschaat': 'VAN',
    'boom': 'VAN',
    'mortsel': 'VAN',
    'temse': 'VOV',
    'herzele': 'VOV',
    'brecht': 'VAN',
    'oudenaarde': 'VOV',
    'rotselaar': 'VBR',
    'niel': 'VAN',
    'lint': 'VAN',
    'ravels': 'VAN',
    'bree': 'VLI',
    'peer': 'VLI',
    'meeuwen': 'VLI',
    'gruitrode': 'VLI',
    'arlon': 'WLX',
    'aarlen': 'WLX',
    'bastogne': 'WLX',
    'bastenaken': 'WLX',
}
# Austrian federal states and their capitals (German / English spellings,
# lower-case) mapped to the numeric state code: 1 Burgenland, 2 Kärnten,
# 3 Niederösterreich, 4 Oberösterreich, 5 Salzburg, 6 Steiermark, 7 Tirol,
# 8 Vorarlberg, 9 Wien. Entry order is the lookup order.
AUSTRIAN_STATES = {
    'wien': '9',
    'vienna': '9',
    'salzburg': '5',
    'tirol': '7',
    'tyrol': '7',
    'innsbruck': '7',
    'vorarlberg': '8',
    'bregenz': '8',
    'kärnten': '2',
    'carinthia': '2',
    'klagenfurt': '2',
    'steiermark': '6',
    'styria': '6',
    'graz': '6',
    'oberösterreich': '4',
    'upper austria': '4',
    'linz': '4',
    'niederösterreich': '3',
    'lower austria': '3',
    'st. pölten': '3',
    'burgenland': '1',
    'eisenstadt': '1',
}
# Bulgarian cities (Latin transliteration and Cyrillic, lower-case) mapped to
# the two-digit province code. Entry order is the lookup order.
BULGARIAN_PROVINCES = {
    'sofia': '22',
    'софия': '22',  # Bulgarian spelling (и), which the table previously lacked
    'софія': '22',  # Ukrainian spelling (і); kept so existing matches still work
    'plovdiv': '16',
    'пловдив': '16',
    'varna': '03',
    'варна': '03',
    'burgas': '02',
    'бургас': '02',
    'ruse': '18',
    'русе': '18',
    'stara zagora': '24',
    'стара загора': '24',
    'pleven': '15',
    'плевен': '15',
}
# Swiss cities (German / French / English spellings, lower-case) mapped to
# the two-letter abbreviation of the canton they belong to. Entry order is
# the lookup order.
SWISS_CANTONS = {
    'zürich': 'ZH',
    'zurich': 'ZH',
    'bern': 'BE',
    'berne': 'BE',
    'luzern': 'LU',
    'lucerne': 'LU',
    'genève': 'GE',
    'geneva': 'GE',
    'genf': 'GE',
    'basel': 'BS',
    'lausanne': 'VD',
    'winterthur': 'ZH',
    'st. gallen': 'SG',
    'st gallen': 'SG',
    'lugano': 'TI',
    'biel': 'BE',
    'bienne': 'BE',
    'thun': 'BE',
    'fribourg': 'FR',
    'freiburg': 'FR',
    'schaffhausen': 'SH',
    'chur': 'GR',
    'neuchâtel': 'NE',
    'neuchatel': 'NE',
    'sion': 'VS',
    'aarau': 'AG',
    'baden': 'AG',
}
def extract_city_from_name(name: str, country: str) -> Optional[Tuple[str, str]]:
    """
    Extract a known city (or region) name from an institution name.

    The lookup table is selected by ``country`` ('BE', 'AT', 'BG' or 'CH');
    any other country yields None. Table keys are matched case-insensitively
    as whole words, so 'gent' matches "Universiteit Gent" and "Gent-Brugge"
    but not "Agentschap", and 'sion' does not match "Commission". Inflected
    forms such as "Wiener" are deliberately not matched: a wrong region code
    written to a file is worse than leaving it unresolved. Keys are tried in
    table order and the first hit wins.

    Returns (city_name, region_code), where city_name is the title-cased
    table key, or None when nothing matches.
    """
    tables = {
        'BE': BELGIAN_CITIES,
        'AT': AUSTRIAN_STATES,
        'BG': BULGARIAN_PROVINCES,
        'CH': SWISS_CANTONS,
    }
    table = tables.get(country)
    # A YAML value may be a number or a list rather than text; nothing to match then.
    if table is None or not isinstance(name, str):
        return None

    name_lower = name.lower()
    for city, region in table.items():
        # Lookarounds require that the key is not embedded in a longer word;
        # re.escape keeps the '.' in keys like 'st. gallen' literal.
        pattern = rf'(?<!\w){re.escape(city)}(?!\w)'
        if re.search(pattern, name_lower):
            return (city.title(), region)
    return None
def update_file_with_region(filepath: Path, region_code: str, city_name: str,
                            dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """Update a custodian file with resolved region code.

    Loads the YAML file, and if its ``ghcid.location_resolution`` has a
    country code and an unresolved ('XX' or missing) region code, fills in
    the region, rewrites ``ghcid.ghcid_current``, appends a
    ``ghcid_history`` entry and a provenance note, and renames the file so
    its name carries the new region code.

    Args:
        filepath: Custodian YAML file to update.
        region_code: Region code that replaces 'XX'.
        city_name: City the region was derived from (recorded in the file).
        dry_run: When True (default) nothing is written or renamed.

    Returns:
        (updated, new_path). ``updated`` is False when the file could not be
        read, is not an unresolved custodian record, or the renamed file
        would collide with an existing one. ``new_path`` is the path the
        file is (or, in a dry run, would be) renamed to, or None when the
        name does not change.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Best effort per file: report and let the caller continue with the next one.
        print(f"  Error reading {filepath}: {e}")
        return False, None

    # An empty file loads as None, and null sub-sections load as None too;
    # neither is a record this function can update.
    if not isinstance(data, dict):
        return False, None
    ghcid = data.get('ghcid')
    if not isinstance(ghcid, dict):
        return False, None
    loc_res = ghcid.get('location_resolution')
    if not isinstance(loc_res, dict):
        return False, None

    country_code = loc_res.get('country_code', '')
    if not country_code:
        return False, None
    if loc_res.get('region_code', 'XX') != 'XX':
        # Already resolved; existing data is never overwritten.
        return False, None

    old_marker = f'{country_code}-XX-'
    new_marker = f'{country_code}-{region_code}-'

    # Check the rename target before changing anything, so a collision cannot
    # leave a file whose content says "resolved" while its name still says XX.
    new_filepath = filepath.parent / filepath.name.replace(old_marker, new_marker)
    needs_rename = new_filepath != filepath
    if needs_rename and new_filepath.exists():
        print(f"  Skipping {filepath.name}: target {new_filepath.name} already exists")
        return False, None

    # One timestamp for every record written by this update.
    now = datetime.now(timezone.utc)

    # Update location resolution
    loc_res['region_code'] = region_code
    # NOTE(review): region_name receives the matched city name, not the
    # region's own name - confirm this is what consumers of the file expect.
    loc_res['region_name'] = city_name
    loc_res['method'] = 'NAME_LOOKUP'
    loc_res['resolution_timestamp'] = now.isoformat()

    # Update GHCID string
    old_ghcid = ghcid.get('ghcid_current') or ''
    new_ghcid = old_ghcid.replace(old_marker, new_marker)
    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid
        if ghcid.get('ghcid_history') is None:
            ghcid['ghcid_history'] = []
        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': now.isoformat(),
            'reason': f"Region resolved via name lookup: XX->{region_code} (city: {city_name})"
        })

    # Add provenance note
    if data.get('provenance') is None:
        data['provenance'] = {}
    provenance = data['provenance']
    notes = provenance.get('notes')
    if notes is None:
        provenance['notes'] = []
    elif isinstance(notes, str):
        # A single free-text note becomes the first element of the list.
        provenance['notes'] = [notes]
    provenance['notes'].append(
        f"Region resolved {now.strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX->{region_code} via name lookup (city: {city_name})"
    )

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        if needs_rename:
            filepath.rename(new_filepath)

    return True, (new_filepath if needs_rename else None)
def main():
    """Main entry point.

    Scans ``--path`` for ``*-XX-*.yaml`` custodian files, picks those with a
    country code (optionally restricted by ``--country``) and an institution
    name, and tries to resolve each one's region from the name. ``--limit``
    caps the number of files that pass those filters, so it still selects
    files when combined with ``--country``. Without ``--apply`` nothing is
    written. Exits with status 1 when the directory does not exist.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using city names from institution names'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')
    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("REGION RESOLUTION VIA NAME LOOKUP")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Find files with XX region codes. Sorted because glob order is arbitrary
    # and --limit should pick the same files on every run.
    files_to_process = sorted(custodian_dir.glob('*-XX-*.yaml'))
    print(f"Found {len(files_to_process)} files with XX region codes")

    # Load files and extract institution names
    file_data = []
    for filepath in files_to_process:
        # The limit counts files that passed the filters below, not files scanned.
        if len(file_data) >= args.limit:
            break
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as e:
            print(f"Error loading {filepath}: {e}")
            continue
        if not isinstance(data, dict):
            print(f"Error loading {filepath}: top-level YAML value is not a mapping")
            continue

        # Get country code
        ghcid = data.get('ghcid')
        loc_res = ghcid.get('location_resolution') if isinstance(ghcid, dict) else None
        country = loc_res.get('country_code') if isinstance(loc_res, dict) else None
        if not country:
            continue
        if args.country and country != args.country:
            continue

        # Get institution name: the custodian_name claim first, then the original entry
        name = None
        custodian_name = data.get('custodian_name')
        if isinstance(custodian_name, dict):
            name = custodian_name.get('claim_value')
        original_entry = data.get('original_entry')
        if not name and isinstance(original_entry, dict):
            name = original_entry.get('name')
        if not name or not isinstance(name, str):
            continue

        # Only what the next phase needs is kept; the parsed document is
        # re-read by update_file_with_region.
        file_data.append({
            'filepath': filepath,
            'country': country,
            'name': name
        })

    print(f"Processing {len(file_data)} files with institution names")
    print()

    # Process each file
    resolved = 0
    renamed = 0
    no_match = 0
    skipped = 0
    for entry in file_data:
        filepath = entry['filepath']
        name = entry['name']
        country = entry['country']

        # Try to extract city from name
        result = extract_city_from_name(name, country)
        if not result:
            no_match += 1
            continue
        city_name, region_code = result

        print(f"Processing {filepath.name}...")
        print(f"  Name: {name}")
        print(f"  City: {city_name} -> Region: {region_code}")

        # Update file
        success, new_path = update_file_with_region(filepath, region_code, city_name, dry_run=dry_run)
        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f"  {filepath.name} -> {new_path.name}")
        else:
            # A city was found but the file was not updated.
            skipped += 1

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"No city match: {no_match}")
    print(f"Skipped (not updated): {skipped}")
    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")


if __name__ == '__main__':
    main()