glam/scripts/resolve_xx_regions_wikidata.py
2025-12-07 00:26:01 +01:00

365 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Resolve XX region codes using Wikidata P131 (located in administrative territorial entity).
This script:
1. Reads custodian files with XX region codes
2. Queries Wikidata for P131 administrative hierarchy
3. Extracts ISO 3166-2 region codes (P300)
4. Updates the files with resolved region codes
Following AGENTS.md Rule 5: Additive only - never delete existing data.
"""
import os
import sys
import yaml
import json
import time
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple
import re
def query_wikidata_sparql(sparql_query: str) -> Optional[List[Dict]]:
    """POST a SPARQL query to the Wikidata endpoint and return its bindings.

    Returns the list of result-binding dicts from the JSON response, or
    None when the request/parse fails (the error is printed, not raised,
    so callers can treat a failed batch as empty).
    """
    import urllib.request
    import urllib.parse

    endpoint = "https://query.wikidata.org/sparql"
    # Wikidata's usage policy requires a descriptive User-Agent for bots.
    request_headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
    }
    body = urllib.parse.urlencode({'query': sparql_query}).encode('utf-8')
    try:
        req = urllib.request.Request(endpoint, data=body, headers=request_headers)
        with urllib.request.urlopen(req, timeout=60) as resp:
            payload = json.loads(resp.read().decode('utf-8'))
            return payload.get('results', {}).get('bindings', [])
    except Exception as e:  # best-effort: report and signal failure with None
        print(f" SPARQL error: {e}")
        return None
def batch_query_admin_regions(wikidata_ids: List[str]) -> Dict[str, Dict[str, Any]]:
    """Fetch administrative-region data for a batch of Wikidata entities.

    Walks the P131 (located-in) chain for each QID and collects every
    ISO 3166-2 code (P300) found along it, plus the entity's coordinates
    (P625) when present.

    Returns a mapping of QID -> {'label', 'admin_codes' (list of ISO codes),
    'admin_labels' (code -> label), 'coords' ({'longitude','latitude'} or None)}.
    Returns {} when the query fails or yields nothing.
    """
    # VALUES clause binding all requested QIDs in one query.
    values = ' '.join(f'wd:{qid}' for qid in wikidata_ids)
    # P131+ walks the whole admin hierarchy; P300 yields ISO 3166-2 codes.
    query = f"""
    SELECT ?item ?itemLabel ?admin1 ?admin1Label ?admin1Code ?coords WHERE {{
      VALUES ?item {{ {values} }}
      # Get first-level admin division with ISO 3166-2 code
      OPTIONAL {{
        ?item wdt:P131+ ?admin1.
        ?admin1 wdt:P300 ?admin1Code.
      }}
      # Get coordinates
      OPTIONAL {{
        ?item wdt:P625 ?coords.
      }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    rows = query_wikidata_sparql(query)
    if not rows:
        return {}

    # Pattern for WKT "Point(lon lat)" literals; hoisted out of the loop.
    point_re = re.compile(r'Point\(([^\s]+)\s+([^\)]+)\)')

    entities: Dict[str, Dict[str, Any]] = {}
    for binding in rows:
        uri = binding.get('item', {}).get('value', '')
        if not uri:
            continue
        qid = uri.rsplit('/', 1)[-1]
        # First row for a QID establishes its label and empty accumulators.
        entry = entities.setdefault(qid, {
            'label': binding.get('itemLabel', {}).get('value', ''),
            'admin_codes': set(),
            'admin_labels': {},
            'coords': None
        })
        # Collect any ISO 3166-2 code present on this row.
        if 'admin1Code' in binding:
            iso = binding['admin1Code'].get('value', '')
            if iso:
                entry['admin_codes'].add(iso)
                entry['admin_labels'][iso] = binding.get('admin1Label', {}).get('value', '')
        # First coordinates win; later rows don't overwrite them.
        if 'coords' in binding and not entry['coords']:
            m = point_re.search(binding['coords'].get('value', ''))
            if m:
                entry['coords'] = {
                    'longitude': float(m.group(1)),
                    'latitude': float(m.group(2))
                }

    # Sets are not YAML/JSON friendly; expose plain lists to callers.
    for entry in entities.values():
        entry['admin_codes'] = list(entry['admin_codes'])
    return entities
def extract_region_from_iso_code(iso_code: str) -> Tuple[str, str]:
    """Split an ISO 3166-2 code such as 'FR-IDF' into ('FR', 'IDF').

    Only the first hyphen splits; a code with no hyphen yields
    ('', iso_code) so the caller's country comparison simply fails.
    """
    country, sep, region = iso_code.partition('-')
    if sep:
        return country, region
    return '', iso_code
def update_custodian_file(filepath: Path, region_code: str, region_label: str,
                          wikidata_source: str, dry_run: bool = True) -> bool:
    """Write a resolved region code into a custodian YAML file.

    Updates ghcid.location_resolution, replaces '-XX-' in the GHCID string
    (recording the change in ghcid_history), appends a provenance note, and
    — unless dry_run — writes the file back and renames it to match the new
    GHCID.  Additive only (AGENTS.md Rule 5): nothing is deleted.

    Returns True when the file was (or would be) updated, False when it is
    unreadable, lacks a ghcid section, or is already resolved.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False

    if 'ghcid' not in data:
        print(f" No ghcid section in {filepath}")
        return False
    ghcid = data['ghcid']
    loc_res = ghcid.setdefault('location_resolution', {})

    # Only files still marked XX are eligible; anything else is done already.
    if loc_res.get('region_code') != 'XX':
        print(f" Already resolved: {filepath}")
        return False

    now_iso = datetime.now(timezone.utc).isoformat()
    loc_res['region_code'] = region_code
    loc_res['region_label'] = region_label
    loc_res['region_source'] = 'wikidata_p131'
    loc_res['region_resolved_at'] = now_iso
    loc_res['region_wikidata_source'] = wikidata_source

    # Default rename target: unchanged.  FIX: previously these were assigned
    # only inside the '-XX-' branch below, so the rename step raised
    # NameError for files whose GHCID string did not contain '-XX-'.
    new_filename = filepath.name
    new_filepath = filepath

    # Update GHCID string (replace XX with the new region code).
    old_ghcid = ghcid.get('ghcid_current', '')
    if old_ghcid and '-XX-' in old_ghcid:
        new_ghcid = old_ghcid.replace('-XX-', f'-{region_code}-')
        ghcid['ghcid_current'] = new_ghcid
        # Record the identifier change in the GHCID history.
        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []
        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': now_iso,
            'reason': f"Region code resolved from XX to {region_code} via Wikidata P131"
        })
        # Rename file to match the new GHCID.
        new_filename = filepath.name.replace('-XX-', f'-{region_code}-')
        new_filepath = filepath.parent / new_filename

    # Append a provenance note, normalising a scalar 'notes' to a list first.
    provenance = data.setdefault('provenance', {})
    if 'notes' not in provenance:
        provenance['notes'] = []
    elif isinstance(provenance['notes'], str):
        provenance['notes'] = [provenance['notes']]
    provenance['notes'].append(
        f"Region code resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX → {region_code} ({region_label}) via Wikidata P131 chain"
    )

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        # Rename only when the GHCID changed and the target name is free.
        if new_filepath != filepath and not new_filepath.exists():
            filepath.rename(new_filepath)
            print(f" Renamed: {filepath.name}{new_filename}")
    return True
def _collect_xx_files(custodian_dir: Path, country_filter: Optional[str]) -> List[Dict[str, Any]]:
    """Scan '*-XX-*.yaml' custodian files and collect those usable for resolution.

    A file qualifies when it carries both a Wikidata entity ID (from
    wikidata_enrichment or original_entry) and a country code; unreadable
    files are reported and skipped.  When country_filter is given, only
    files for that country are returned.
    """
    collected: List[Dict[str, Any]] = []
    for filepath in custodian_dir.glob('*-XX-*.yaml'):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            # The Wikidata ID may live in either section; enrichment wins.
            wd_id = None
            if 'wikidata_enrichment' in data:
                wd_id = data['wikidata_enrichment'].get('wikidata_entity_id')
            if not wd_id and 'original_entry' in data:
                wd_id = data['original_entry'].get('wikidata_id')
            country = None
            if 'ghcid' in data and 'location_resolution' in data['ghcid']:
                country = data['ghcid']['location_resolution'].get('country_code')
            if wd_id and country:
                if country_filter is None or country == country_filter:
                    collected.append({
                        'filepath': filepath,
                        'wikidata_id': wd_id,
                        'country': country
                    })
        except Exception as e:
            print(f"Error reading {filepath}: {e}")
            continue
    return collected


def main():
    """Main entry point: find XX-region custodian files, resolve their region
    codes via Wikidata P131, and update/rename the files (dry run by default)."""
    import argparse
    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using Wikidata P131'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=50,
                        help='Limit number of files to process per batch')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country (e.g., FR, KR)')
    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)
    dry_run = not args.apply

    print("=" * 70)
    print("XX REGION CODE RESOLUTION VIA WIKIDATA P131")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Find all files with XX region code (optionally for one country).
    xx_files = _collect_xx_files(custodian_dir, args.country)
    print(f"Found {len(xx_files)} files with XX region codes")
    if args.country:
        print(f"Filtering to country: {args.country}")

    # Limit for batch processing.
    files_to_process = xx_files[:args.limit]
    print(f"Processing {len(files_to_process)} files (limit: {args.limit})")
    print()

    # Group by Wikidata ID so each entity is queried only once.
    by_wikidata: Dict[str, List[Dict[str, Any]]] = {}
    for item in files_to_process:
        by_wikidata.setdefault(item['wikidata_id'], []).append(item)
    print(f"Unique Wikidata IDs to query: {len(by_wikidata)}")

    # Query Wikidata in batches of 50 with a 1s pause between batches.
    batch_size = 50
    wikidata_ids = list(by_wikidata.keys())
    all_results: Dict[str, Dict[str, Any]] = {}
    for i in range(0, len(wikidata_ids), batch_size):
        batch = wikidata_ids[i:i + batch_size]
        print(f"\nQuerying Wikidata batch {i//batch_size + 1} ({len(batch)} IDs)...")
        all_results.update(batch_query_admin_regions(batch))
        if i + batch_size < len(wikidata_ids):
            time.sleep(1)
    print(f"\nReceived data for {len(all_results)} entities")

    # Apply the resolved codes file by file.
    resolved = 0
    not_resolved = 0
    for item in files_to_process:
        wd_id = item['wikidata_id']
        filepath = item['filepath']
        expected_country = item['country']
        wd_data = all_results.get(wd_id)
        if not wd_data or not wd_data.get('admin_codes'):
            print(f" No admin data: {filepath.name} ({wd_id})")
            not_resolved += 1
            continue
        # Pick the ISO 3166-2 code whose country part matches this file.
        region_code = None
        region_label = None
        for iso_code in wd_data['admin_codes']:
            country, region = extract_region_from_iso_code(iso_code)
            if country == expected_country:
                region_code = region
                region_label = wd_data['admin_labels'].get(iso_code, '')
                break
        if not region_code:
            print(f" No matching region for {expected_country}: {filepath.name} ({wd_id})")
            print(f" Found: {wd_data['admin_codes']}")
            not_resolved += 1
            continue
        # Ensure region_label is not None before it reaches the file.
        if region_label is None:
            region_label = ""
        print(f" {filepath.name}: XX → {region_code} ({region_label})")
        if update_custodian_file(filepath, region_code, region_label, wd_id, dry_run=dry_run):
            resolved += 1

    print()
    print("=" * 70)
    print("SUMMARY")  # was an f-string with no placeholders
    print("=" * 70)
    print(f"Files processed: {len(files_to_process)}")
    print(f"Resolved: {resolved}")
    print(f"Not resolved: {not_resolved}")
    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")
# Script entry point: run the resolver when executed directly (not on import).
if __name__ == '__main__':
    main()