#!/usr/bin/env python3
"""
Resolve XX region codes using Wikidata P131 (located in administrative
territorial entity).

This script:
1. Reads custodian files with XX region codes
2. Queries Wikidata for P131 administrative hierarchy
3. Extracts ISO 3166-2 region codes (P300)
4. Updates the files with resolved region codes

Following AGENTS.md Rule 5: Additive only - never delete existing data.
"""

import os
import sys
import yaml
import json
import time
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple
import re

# Well-formed Wikidata entity ID, e.g. "Q12345". Used to guard against
# malformed IDs (read from local YAML files) being interpolated into the
# SPARQL query text, which would corrupt the query or allow injection.
_QID_RE = re.compile(r'^Q\d+$')


def query_wikidata_sparql(sparql_query: str) -> Optional[List[Dict]]:
    """Execute SPARQL query using HTTP POST to the Wikidata endpoint.

    Args:
        sparql_query: Full SPARQL query text.

    Returns:
        The list of result bindings on success, or None on any network,
        HTTP or JSON error (errors are printed, not raised - this script
        is best-effort).
    """
    import urllib.request
    import urllib.parse

    url = "https://query.wikidata.org/sparql"
    headers = {
        'Accept': 'application/sparql-results+json',
        # Wikidata policy requires a descriptive User-Agent.
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
    }
    data = urllib.parse.urlencode({'query': sparql_query}).encode('utf-8')

    try:
        request = urllib.request.Request(url, data=data, headers=headers)
        with urllib.request.urlopen(request, timeout=60) as response:
            result = json.loads(response.read().decode('utf-8'))
            return result.get('results', {}).get('bindings', [])
    except Exception as e:
        print(f" SPARQL error: {e}")
        return None


def batch_query_admin_regions(wikidata_ids: List[str]) -> Dict[str, Dict[str, Any]]:
    """Query Wikidata for administrative regions of multiple entities.

    Walks the P131 (located-in) chain of each entity and collects every
    ancestor that carries a P300 (ISO 3166-2) code, plus P625 coordinates.

    Args:
        wikidata_ids: QIDs (e.g. ["Q90", "Q142"]) to look up in one batch.

    Returns:
        Mapping of QID -> {'label', 'admin_codes' (list of ISO 3166-2
        codes), 'admin_labels' (code -> label), 'coords' (dict or None)}.
        Empty dict when the query fails or no valid IDs were supplied.
    """
    # Only interpolate syntactically valid QIDs; anything else would break
    # the generated SPARQL (IDs come from files, so validate defensively).
    safe_ids = [qid for qid in wikidata_ids if _QID_RE.match(qid)]
    if not safe_ids:
        # An empty VALUES clause is a SPARQL syntax error - bail out early.
        return {}

    # Build VALUES clause with QIDs
    values = ' '.join([f'wd:{qid}' for qid in safe_ids])

    # Query for P131 chain and P300 (ISO 3166-2 code)
    query = f"""
    SELECT ?item ?itemLabel ?admin1 ?admin1Label ?admin1Code ?coords WHERE {{
      VALUES ?item {{ {values} }}

      # Get first-level admin division with ISO 3166-2 code
      OPTIONAL {{
        ?item wdt:P131+ ?admin1.
        ?admin1 wdt:P300 ?admin1Code.
      }}

      # Get coordinates
      OPTIONAL {{ ?item wdt:P625 ?coords. }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """

    results = query_wikidata_sparql(query)
    if not results:
        return {}

    # Process results - group by item (one row per P131 ancestor with a code)
    processed = {}
    for row in results:
        item_uri = row.get('item', {}).get('value', '')
        if not item_uri:
            continue
        qid = item_uri.split('/')[-1]

        if qid not in processed:
            processed[qid] = {
                'label': row.get('itemLabel', {}).get('value', ''),
                'admin_codes': set(),
                'admin_labels': {},
                'coords': None
            }

        # Extract ISO 3166-2 code (binding absent when OPTIONAL unmatched)
        if 'admin1Code' in row:
            code = row['admin1Code'].get('value', '')
            if code:
                processed[qid]['admin_codes'].add(code)
                label = row.get('admin1Label', {}).get('value', '')
                processed[qid]['admin_labels'][code] = label

        # Extract coordinates (first row with coords wins)
        if 'coords' in row and not processed[qid]['coords']:
            coords_str = row['coords'].get('value', '')
            # Parse WKT "Point(lon lat)" format
            match = re.search(r'Point\(([^\s]+)\s+([^\)]+)\)', coords_str)
            if match:
                processed[qid]['coords'] = {
                    'longitude': float(match.group(1)),
                    'latitude': float(match.group(2))
                }

    # Convert sets to lists (JSON/YAML friendliness for callers)
    for qid in processed:
        processed[qid]['admin_codes'] = list(processed[qid]['admin_codes'])

    return processed


def extract_region_from_iso_code(iso_code: str) -> Tuple[str, str]:
    """Extract country and region code from an ISO 3166-2 code like 'FR-IDF'.

    Returns:
        (country, region) tuple; country is '' when the code has no dash.
    """
    if '-' in iso_code:
        parts = iso_code.split('-', 1)
        return parts[0], parts[1]
    return '', iso_code


def update_custodian_file(filepath: Path, region_code: str, region_label: str,
                          wikidata_source: str, dry_run: bool = True) -> bool:
    """Update a custodian file with a resolved region code.

    Rewrites ghcid.location_resolution, the GHCID string and its history,
    appends a provenance note, and (when not a dry run) writes the file and
    renames it to match the new GHCID. Additive only - no data is deleted.

    Args:
        filepath: Path to the custodian YAML file.
        region_code: Resolved region code (e.g. 'IDF').
        region_label: Human-readable region label.
        wikidata_source: QID the resolution came from.
        dry_run: When True, compute everything but do not touch the disk.

    Returns:
        True when the file was (or would be) updated, False otherwise.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False

    # yaml.safe_load returns None for an empty file - treat as unreadable.
    if not isinstance(data, dict):
        print(f" Empty or invalid YAML: {filepath}")
        return False

    # Update ghcid.location_resolution
    if 'ghcid' not in data:
        print(f" No ghcid section in {filepath}")
        return False

    ghcid = data['ghcid']
    if 'location_resolution' not in ghcid:
        ghcid['location_resolution'] = {}
    loc_res = ghcid['location_resolution']

    # Only update if currently XX
    if loc_res.get('region_code') != 'XX':
        print(f" Already resolved: {filepath}")
        return False

    # Update region code
    old_region = loc_res.get('region_code', 'XX')
    loc_res['region_code'] = region_code
    loc_res['region_label'] = region_label
    loc_res['region_source'] = 'wikidata_p131'
    loc_res['region_resolved_at'] = datetime.now(timezone.utc).isoformat()
    loc_res['region_wikidata_source'] = wikidata_source

    # Update GHCID string (replace XX with new region code)
    old_ghcid = ghcid.get('ghcid_current', '')
    if old_ghcid and '-XX-' in old_ghcid:
        new_ghcid = old_ghcid.replace('-XX-', f'-{region_code}-')
        ghcid['ghcid_current'] = new_ghcid

        # Add to history
        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []
        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'reason': f"Region code resolved from XX to {region_code} via Wikidata P131"
        })

    # Rename file to match new GHCID
    new_filename = filepath.name.replace('-XX-', f'-{region_code}-')
    new_filepath = filepath.parent / new_filename

    # Add provenance note
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    elif isinstance(data['provenance']['notes'], str):
        # Normalize a scalar note into a list so we can append additively.
        data['provenance']['notes'] = [data['provenance']['notes']]
    data['provenance']['notes'].append(
        f"Region code resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX → {region_code} ({region_label}) via Wikidata P131 chain"
    )

    if not dry_run:
        # Write updated file
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False,
                      allow_unicode=True, sort_keys=False)

        # Rename file if GHCID changed
        if new_filepath != filepath:
            if new_filepath.exists():
                # Don't clobber an existing file - but warn, since the old
                # XX filename now contradicts the updated contents.
                print(f" WARNING: {new_filename} already exists; "
                      f"{filepath.name} not renamed")
            else:
                filepath.rename(new_filepath)
                print(f" Renamed: {filepath.name} → {new_filename}")

    return True


def main():
    """Main entry point."""
    import argparse
    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using Wikidata P131'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=50,
                        help='Limit number of files to process per batch')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country (e.g., FR, KR)')
    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("XX REGION CODE RESOLUTION VIA WIKIDATA P131")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Find all files with XX region code
    xx_files = []
    for filepath in custodian_dir.glob('*-XX-*.yaml'):
        # Read to get Wikidata ID and country
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            # Skip empty/invalid YAML (safe_load returns None for it).
            if not isinstance(data, dict):
                continue

            # Get Wikidata ID (enrichment section wins over original entry)
            wd_id = None
            if 'wikidata_enrichment' in data:
                wd_id = data['wikidata_enrichment'].get('wikidata_entity_id')
            if not wd_id and 'original_entry' in data:
                wd_id = data['original_entry'].get('wikidata_id')

            # Get country code
            country = None
            if 'ghcid' in data and 'location_resolution' in data['ghcid']:
                country = data['ghcid']['location_resolution'].get('country_code')

            if wd_id and country:
                if args.country is None or country == args.country:
                    xx_files.append({
                        'filepath': filepath,
                        'wikidata_id': wd_id,
                        'country': country
                    })
        except Exception as e:
            print(f"Error reading {filepath}: {e}")
            continue

    print(f"Found {len(xx_files)} files with XX region codes")
    if args.country:
        print(f"Filtering to country: {args.country}")

    # Limit for batch processing
    files_to_process = xx_files[:args.limit]
    print(f"Processing {len(files_to_process)} files (limit: {args.limit})")
    print()

    # Group by Wikidata ID to avoid duplicate queries
    by_wikidata = {}
    for item in files_to_process:
        wd_id = item['wikidata_id']
        if wd_id not in by_wikidata:
            by_wikidata[wd_id] = []
        by_wikidata[wd_id].append(item)

    print(f"Unique Wikidata IDs to query: {len(by_wikidata)}")

    # Query Wikidata in batches of 50
    batch_size = 50
    wikidata_ids = list(by_wikidata.keys())
    all_results = {}

    for i in range(0, len(wikidata_ids), batch_size):
        batch = wikidata_ids[i:i+batch_size]
        print(f"\nQuerying Wikidata batch {i//batch_size + 1} ({len(batch)} IDs)...")
        results = batch_query_admin_regions(batch)
        all_results.update(results)

        # Rate limiting (skip the sleep after the final batch)
        if i + batch_size < len(wikidata_ids):
            time.sleep(1)

    print(f"\nReceived data for {len(all_results)} entities")

    # Process files
    resolved = 0
    not_resolved = 0

    for item in files_to_process:
        wd_id = item['wikidata_id']
        filepath = item['filepath']
        expected_country = item['country']

        wd_data = all_results.get(wd_id)
        if not wd_data or not wd_data.get('admin_codes'):
            print(f" No admin data: {filepath.name} ({wd_id})")
            not_resolved += 1
            continue

        # Find matching ISO 3166-2 code for the expected country
        region_code = None
        region_label = None
        for iso_code in wd_data['admin_codes']:
            country, region = extract_region_from_iso_code(iso_code)
            if country == expected_country:
                region_code = region
                region_label = wd_data['admin_labels'].get(iso_code, '')
                break

        if not region_code:
            print(f" No matching region for {expected_country}: {filepath.name} ({wd_id})")
            print(f" Found: {wd_data['admin_codes']}")
            not_resolved += 1
            continue

        # Ensure region_label is not None
        if region_label is None:
            region_label = ""

        print(f" {filepath.name}: XX → {region_code} ({region_label})")
        if update_custodian_file(filepath, region_code, region_label,
                                 wd_id, dry_run=dry_run):
            resolved += 1

    print()
    print("=" * 70)
    print(f"SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(files_to_process)}")
    print(f"Resolved: {resolved}")
    print(f"Not resolved: {not_resolved}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")


if __name__ == '__main__':
    main()