Enrichment scripts for country-specific city data: - enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py - enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py - enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py Location resolution utilities: - resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames - resolve_cities_wikidata.py - Use Wikidata P131 for city resolution - resolve_country_codes.py - Standardize country codes - resolve_cz_xx_regions.py - Fix Czech XX region codes - resolve_locations_by_name.py - Name-based location lookup - resolve_regions_from_city.py - Derive regions from city data - update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data CH-Annotator integration: - create_custodian_from_ch_annotator.py - Create custodians from annotations - add_ch_annotator_location_claims.py - Add location claims - extract_locations_ch_annotator.py - Extract locations from annotations Migration and fixes: - migrate_egyptian_from_ch.py - Migrate Egyptian data - migrate_web_archives.py - Migrate web archive data - fix_belgian_cities.py - Fix Belgian city data
353 lines
11 KiB
Python
Executable file
353 lines
11 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
"""
|
||
Resolve XX region codes using city names extracted from institution names.
|
||
|
||
This script handles files without coordinates or Wikidata IDs by:
|
||
1. Extracting city names from institution names
|
||
2. Looking up cities in GeoNames database
|
||
3. Mapping to ISO 3166-2 region codes
|
||
|
||
Following AGENTS.md Rules:
|
||
- Rule 5: Additive only - never delete existing data
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import yaml
|
||
import sqlite3
|
||
import re
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Optional, Dict, Any, List, Tuple
|
||
|
||
# Belgian city name patterns
#
# Maps lowercase city-name variants (Dutch / French / English spellings) to
# ISO 3166-2:BE second-level subdivision codes:
#   BRU=Brussels-Capital, VAN=Antwerp, VOV=East Flanders, VWV=West Flanders,
#   VBR=Flemish Brabant, VLI=Limburg, WLG=Liege, WHT=Hainaut, WNA=Namur,
#   WLX=Luxembourg
#
# NOTE: extract_city_from_name() scans these keys as substrings in insertion
# order and returns the first hit, so entry order is significant — do not
# reorder. Short keys ('boom', 'peer', 'niel', 'lint', 'bree') can in
# principle false-positive inside longer words.
BELGIAN_CITIES = {
    'brussel': 'BRU', 'bruxelles': 'BRU', 'brussels': 'BRU',
    'antwerpen': 'VAN', 'anvers': 'VAN', 'antwerp': 'VAN',
    'gent': 'VOV', 'ghent': 'VOV', 'gand': 'VOV',
    'brugge': 'VWV', 'bruges': 'VWV',
    'leuven': 'VBR', 'louvain': 'VBR',
    'mechelen': 'VAN', 'malines': 'VAN',
    'hasselt': 'VLI',
    'luik': 'WLG', 'liège': 'WLG', 'liege': 'WLG',
    'charleroi': 'WHT',
    'namur': 'WNA', 'namen': 'WNA',
    # 'bergen' is the Dutch name for Mons (Hainaut).
    'mons': 'WHT', 'bergen': 'WHT',
    'tournai': 'WHT', 'doornik': 'WHT',
    'kortrijk': 'VWV', 'courtrai': 'VWV',
    'oostende': 'VWV', 'ostende': 'VWV',
    'aalst': 'VOV', 'alost': 'VOV',
    'sint-niklaas': 'VOV',
    'dendermonde': 'VOV',
    'genk': 'VLI',
    'roeselare': 'VWV',
    'mouscron': 'WHT', 'moeskroen': 'WHT',
    'tienen': 'VBR', 'tirlemont': 'VBR',
    'ieper': 'VWV', 'ypres': 'VWV',
    'turnhout': 'VAN',
    'waregem': 'VWV',
    'lokeren': 'VOV',
    'beveren': 'VOV',
    'vilvoorde': 'VBR',
    'dilbeek': 'VBR',
    'schoten': 'VAN',
    'brasschaat': 'VAN',
    'boom': 'VAN',
    'mortsel': 'VAN',
    'temse': 'VOV',
    'herzele': 'VOV',
    'brecht': 'VAN',
    'oudenaarde': 'VOV',
    'rotselaar': 'VBR',
    'niel': 'VAN',
    'lint': 'VAN',
    'ravels': 'VAN',
    'bree': 'VLI',
    'peer': 'VLI',
    'meeuwen': 'VLI',
    'gruitrode': 'VLI',
    'arlon': 'WLX', 'aarlen': 'WLX',
    'bastogne': 'WLX', 'bastenaken': 'WLX',
}
|
||
|
||
# Austrian state codes
#
# Maps lowercase state and capital-city names (German / English spellings) to
# ISO 3166-2:AT numeric state codes ('1'..'9'). Capital cities map to their
# state (e.g. 'graz' -> Styria).
#
# NOTE: matched as substrings in insertion order by extract_city_from_name();
# first hit wins, so entry order is significant.
AUSTRIAN_STATES = {
    'wien': '9', 'vienna': '9',
    'salzburg': '5',
    'tirol': '7', 'tyrol': '7', 'innsbruck': '7',
    'vorarlberg': '8', 'bregenz': '8',
    'kärnten': '2', 'carinthia': '2', 'klagenfurt': '2',
    'steiermark': '6', 'styria': '6', 'graz': '6',
    'oberösterreich': '4', 'upper austria': '4', 'linz': '4',
    'niederösterreich': '3', 'lower austria': '3', 'st. pölten': '3',
    'burgenland': '1', 'eisenstadt': '1',
}
|
||
|
||
# Bulgarian province codes
#
# Maps lowercase city names (Latin and Bulgarian Cyrillic spellings) to
# ISO 3166-2:BG province codes. Matched as substrings in insertion order
# by extract_city_from_name(); first hit wins.
BULGARIAN_PROVINCES = {
    # FIX: the original Cyrillic key 'софія' is the Ukrainian spelling
    # (letter 'і'); Bulgarian spells Sofia 'софия', so the old key could
    # never match Bulgarian text. The old key is kept (additive only,
    # AGENTS.md Rule 5) and the correct spelling added.
    'sofia': '22', 'софия': '22', 'софія': '22',
    'plovdiv': '16', 'пловдив': '16',
    'varna': '03', 'варна': '03',
    'burgas': '02', 'бургас': '02',
    'ruse': '18', 'русе': '18',
    # Cyrillic variant added for parity with the other entries.
    'stara zagora': '24', 'стара загора': '24',
    'pleven': '15', 'плевен': '15',
}
|
||
|
||
# Swiss canton codes (abbreviated)
#
# Maps lowercase city names (German / French / English spellings) to two-letter
# ISO 3166-2:CH canton codes. Cities map to the canton they belong to
# (e.g. 'winterthur' -> ZH, 'lausanne' -> VD).
#
# NOTE: matched as substrings in insertion order by extract_city_from_name();
# first hit wins, so entry order is significant.
# NOTE(review): 'basel' is mapped to BS (Basel-Stadt) only — institutions in
# Basel-Landschaft would be mislabeled; confirm against the data.
SWISS_CANTONS = {
    'zürich': 'ZH', 'zurich': 'ZH',
    'bern': 'BE', 'berne': 'BE',
    'luzern': 'LU', 'lucerne': 'LU',
    'genève': 'GE', 'geneva': 'GE', 'genf': 'GE',
    'basel': 'BS',
    'lausanne': 'VD',
    'winterthur': 'ZH',
    'st. gallen': 'SG', 'st gallen': 'SG',
    'lugano': 'TI',
    'biel': 'BE', 'bienne': 'BE',
    'thun': 'BE',
    'fribourg': 'FR', 'freiburg': 'FR',
    'schaffhausen': 'SH',
    'chur': 'GR',
    'neuchâtel': 'NE', 'neuchatel': 'NE',
    'sion': 'VS',
    'aarau': 'AG',
    'baden': 'AG',
}
|
||
|
||
|
||
def extract_city_from_name(name: str, country: str) -> Optional[Tuple[str, str]]:
    """
    Extract a city name from an institution name via substring lookup.

    Args:
        name: Institution name to scan (any case).
        country: ISO 3166-1 alpha-2 country code; 'BE', 'AT', 'BG' and 'CH'
            are supported.

    Returns:
        (title_cased_city_name, region_code) for the first matching city,
        or None when the country is unsupported or no city substring matches.
    """
    # Dispatch table replaces the four duplicated per-country if/elif
    # branches that each ran the identical lookup loop.
    tables: Dict[str, Dict[str, str]] = {
        'BE': BELGIAN_CITIES,
        'AT': AUSTRIAN_STATES,
        'BG': BULGARIAN_PROVINCES,
        'CH': SWISS_CANTONS,
    }
    table = tables.get(country)
    if table is None:
        return None

    name_lower = name.lower()
    # Insertion order matters: the first matching substring wins.
    # NOTE(review): plain substring matching can false-positive on short
    # keys (e.g. 'boom', 'peer') embedded inside longer words.
    for city, region in table.items():
        if city in name_lower:
            return (city.title(), region)

    return None
|
||
|
||
|
||
def update_file_with_region(filepath: Path, region_code: str, city_name: str,
                            dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """Update a custodian file with a resolved region code.

    Rewrites the XX placeholder in the GHCID string and filename, appends a
    GHCID history entry and a provenance note (additive only, AGENTS.md
    Rule 5). Files whose region code is not the XX placeholder are skipped.

    Args:
        filepath: Path to the custodian YAML file.
        region_code: Resolved ISO 3166-2 region code (e.g. 'VAN').
        city_name: City used for the lookup; stored as region_name.
        dry_run: When True, nothing is written or renamed on disk.

    Returns:
        (success, new_path) where new_path is the renamed file path when the
        filename changed (in dry-run mode: the prospective path), else None.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False, None

    # FIX: yaml.safe_load() returns None for an empty document, which made
    # the original `'ghcid' not in data` raise TypeError. Guard the type.
    if not isinstance(data, dict) or 'ghcid' not in data:
        return False, None

    ghcid = data['ghcid']
    if 'location_resolution' not in ghcid:
        ghcid['location_resolution'] = {}

    loc_res = ghcid['location_resolution']
    country_code = loc_res.get('country_code', '')

    if not country_code:
        return False, None

    old_region = loc_res.get('region_code', 'XX')

    # Only resolve the XX placeholder; never overwrite a real region code.
    if old_region != 'XX':
        return False, None

    # Update location resolution
    loc_res['region_code'] = region_code
    loc_res['region_name'] = city_name
    loc_res['method'] = 'NAME_LOOKUP'
    loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat()

    # Update GHCID string (e.g. 'BE-XX-...' -> 'BE-VAN-...')
    old_ghcid = ghcid.get('ghcid_current', '')
    new_ghcid = old_ghcid.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-')

    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid

        # History entries are only ever appended, never rewritten (Rule 5).
        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []

        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'reason': f"Region resolved via name lookup: XX->{region_code} (city: {city_name})"
        })

    # Add provenance note
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    elif isinstance(data['provenance']['notes'], str):
        # Normalize a single string note into a list before appending.
        data['provenance']['notes'] = [data['provenance']['notes']]

    data['provenance']['notes'].append(
        f"Region resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX->{region_code} via name lookup (city: {city_name})"
    )

    # Determine new filename
    new_filename = filepath.name.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-')
    new_filepath = filepath.parent / new_filename

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        if new_filepath != filepath:
            if not new_filepath.exists():
                filepath.rename(new_filepath)
            else:
                # FIX: on a name collision the original skipped the rename
                # but still reported the target path as the new location.
                # Keep reporting the file's actual (unchanged) path.
                new_filepath = filepath

    return True, new_filepath if new_filepath != filepath else None
|
||
|
||
|
||
def main():
    """CLI entry point.

    Scans the custodian directory for YAML files whose names contain the XX
    region placeholder, extracts a city from each institution name, and
    resolves the region code via the built-in country tables. Dry run by
    default; pass --apply to actually write and rename files.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using city names from institution names'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply
    # Accept lowercase codes on the CLI; stored country codes are uppercase.
    country_filter = args.country.upper() if args.country else None

    print("=" * 70)
    print("REGION RESOLUTION VIA NAME LOOKUP")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Find files with XX region codes.
    # FIX: glob order is filesystem-dependent; sort so --limit selects a
    # reproducible subset across runs.
    files_to_process = sorted(custodian_dir.glob('*-XX-*.yaml'))

    print(f"Found {len(files_to_process)} files with XX region codes")

    # Load files and extract institution names
    file_data = []
    for filepath in files_to_process[:args.limit]:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            # safe_load() returns None for empty documents; skip those.
            if not isinstance(data, dict):
                continue

            # Get country code
            country = None
            if 'ghcid' in data and 'location_resolution' in data['ghcid']:
                country = data['ghcid']['location_resolution'].get('country_code')

            if not country:
                continue

            if country_filter and country != country_filter:
                continue

            # Get institution name: prefer the claimed name, fall back to
            # the original registry entry.
            name = None
            if 'custodian_name' in data:
                name = data['custodian_name'].get('claim_value')
            if not name and 'original_entry' in data:
                name = data['original_entry'].get('name')

            if not name:
                continue

            # FIX: the parsed document is intentionally not stored here —
            # update_file_with_region() re-reads the file itself, so
            # keeping `data` alive only wasted memory.
            file_data.append({
                'filepath': filepath,
                'country': country,
                'name': name
            })
        except Exception as e:
            print(f"Error loading {filepath}: {e}")

    print(f"Processing {len(file_data)} files with institution names")
    print()

    # Process each file
    resolved = 0
    renamed = 0
    no_match = 0

    for entry in file_data:
        filepath = entry['filepath']
        name = entry['name']
        country = entry['country']

        # Try to extract city from name
        result = extract_city_from_name(name, country)

        if not result:
            no_match += 1
            continue

        city_name, region_code = result

        print(f"Processing {filepath.name}...")
        print(f" Name: {name}")
        print(f" City: {city_name} -> Region: {region_code}")

        # Update file
        success, new_path = update_file_with_region(filepath, region_code, city_name, dry_run=dry_run)

        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f" {filepath.name} -> {new_path.name}")

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"No city match: {no_match}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")


if __name__ == '__main__':
    main()
|