glam/scripts/resolve_xx_regions_wikidata.py
2025-12-07 00:26:01 +01:00

365 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Resolve XX region codes using Wikidata P131 (located in administrative territorial entity).
This script:
1. Reads custodian files with XX region codes
2. Queries Wikidata for P131 administrative hierarchy
3. Extracts ISO 3166-2 region codes (P300)
4. Updates the files with resolved region codes
Following AGENTS.md Rule 5: Additive only - never delete existing data.
"""
import os
import sys
import yaml
import json
import time
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple
import re
def query_wikidata_sparql(sparql_query: str) -> Optional[List[Dict]]:
    """POST a SPARQL query to the Wikidata endpoint and return its bindings.

    Returns the list of result-binding dicts from the JSON response, or
    None when the request/parse fails (the error is printed, not raised,
    so callers can treat a failed batch as empty).
    """
    import urllib.request
    import urllib.parse

    endpoint = "https://query.wikidata.org/sparql"
    # Wikidata's usage policy requires a descriptive User-Agent for bots.
    request_headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
    }
    body = urllib.parse.urlencode({'query': sparql_query}).encode('utf-8')
    try:
        req = urllib.request.Request(endpoint, data=body, headers=request_headers)
        with urllib.request.urlopen(req, timeout=60) as resp:
            payload = json.loads(resp.read().decode('utf-8'))
            return payload.get('results', {}).get('bindings', [])
    except Exception as e:  # best-effort: report and signal failure with None
        print(f" SPARQL error: {e}")
        return None
def batch_query_admin_regions(wikidata_ids: List[str]) -> Dict[str, Dict[str, Any]]:
    """Fetch administrative-region data for a batch of Wikidata entities.

    Walks the P131 (located-in) chain for each QID and collects every
    ISO 3166-2 code (P300) found along it, plus the entity's coordinates
    (P625) when present.

    Returns a mapping of QID -> {'label', 'admin_codes' (list of ISO codes),
    'admin_labels' (code -> label), 'coords' ({'longitude','latitude'} or None)}.
    Returns {} when the query fails or yields nothing.
    """
    # VALUES clause binding all requested QIDs in one query.
    values = ' '.join(f'wd:{qid}' for qid in wikidata_ids)
    # P131+ walks the whole admin hierarchy; P300 yields ISO 3166-2 codes.
    query = f"""
    SELECT ?item ?itemLabel ?admin1 ?admin1Label ?admin1Code ?coords WHERE {{
      VALUES ?item {{ {values} }}
      # Get first-level admin division with ISO 3166-2 code
      OPTIONAL {{
        ?item wdt:P131+ ?admin1.
        ?admin1 wdt:P300 ?admin1Code.
      }}
      # Get coordinates
      OPTIONAL {{
        ?item wdt:P625 ?coords.
      }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    rows = query_wikidata_sparql(query)
    if not rows:
        return {}

    # Pattern for WKT "Point(lon lat)" literals; hoisted out of the loop.
    point_re = re.compile(r'Point\(([^\s]+)\s+([^\)]+)\)')

    entities: Dict[str, Dict[str, Any]] = {}
    for binding in rows:
        uri = binding.get('item', {}).get('value', '')
        if not uri:
            continue
        qid = uri.rsplit('/', 1)[-1]
        # First row for a QID establishes its label and empty accumulators.
        entry = entities.setdefault(qid, {
            'label': binding.get('itemLabel', {}).get('value', ''),
            'admin_codes': set(),
            'admin_labels': {},
            'coords': None
        })
        # Collect any ISO 3166-2 code present on this row.
        if 'admin1Code' in binding:
            iso = binding['admin1Code'].get('value', '')
            if iso:
                entry['admin_codes'].add(iso)
                entry['admin_labels'][iso] = binding.get('admin1Label', {}).get('value', '')
        # First coordinates win; later rows don't overwrite them.
        if 'coords' in binding and not entry['coords']:
            m = point_re.search(binding['coords'].get('value', ''))
            if m:
                entry['coords'] = {
                    'longitude': float(m.group(1)),
                    'latitude': float(m.group(2))
                }

    # Sets are not YAML/JSON friendly; expose plain lists to callers.
    for entry in entities.values():
        entry['admin_codes'] = list(entry['admin_codes'])
    return entities
def extract_region_from_iso_code(iso_code: str) -> Tuple[str, str]:
    """Split an ISO 3166-2 code such as 'FR-IDF' into ('FR', 'IDF').

    Only the first hyphen splits; a code with no hyphen yields
    ('', iso_code) so the caller's country comparison simply fails.
    """
    country, sep, region = iso_code.partition('-')
    if sep:
        return country, region
    return '', iso_code
def update_custodian_file(filepath: Path, region_code: str, region_label: str,
                          wikidata_source: str, dry_run: bool = True) -> bool:
    """Write a resolved region code into a custodian YAML file.

    Updates ghcid.location_resolution, replaces '-XX-' in the GHCID string
    (recording the change in ghcid_history), appends a provenance note, and
    — unless dry_run — writes the file back and renames it to match the new
    GHCID.  Additive only (AGENTS.md Rule 5): nothing is deleted.

    Returns True when the file was (or would be) updated, False when it is
    unreadable, lacks a ghcid section, or is already resolved.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False

    if 'ghcid' not in data:
        print(f" No ghcid section in {filepath}")
        return False
    ghcid = data['ghcid']
    loc_res = ghcid.setdefault('location_resolution', {})

    # Only files still marked XX are eligible; anything else is done already.
    if loc_res.get('region_code') != 'XX':
        print(f" Already resolved: {filepath}")
        return False

    now_iso = datetime.now(timezone.utc).isoformat()
    loc_res['region_code'] = region_code
    loc_res['region_label'] = region_label
    loc_res['region_source'] = 'wikidata_p131'
    loc_res['region_resolved_at'] = now_iso
    loc_res['region_wikidata_source'] = wikidata_source

    # Default rename target: unchanged.  FIX: previously these were assigned
    # only inside the '-XX-' branch below, so the rename step raised
    # NameError for files whose GHCID string did not contain '-XX-'.
    new_filename = filepath.name
    new_filepath = filepath

    # Update GHCID string (replace XX with the new region code).
    old_ghcid = ghcid.get('ghcid_current', '')
    if old_ghcid and '-XX-' in old_ghcid:
        new_ghcid = old_ghcid.replace('-XX-', f'-{region_code}-')
        ghcid['ghcid_current'] = new_ghcid
        # Record the identifier change in the GHCID history.
        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []
        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': now_iso,
            'reason': f"Region code resolved from XX to {region_code} via Wikidata P131"
        })
        # Rename file to match the new GHCID.
        new_filename = filepath.name.replace('-XX-', f'-{region_code}-')
        new_filepath = filepath.parent / new_filename

    # Append a provenance note, normalising a scalar 'notes' to a list first.
    provenance = data.setdefault('provenance', {})
    if 'notes' not in provenance:
        provenance['notes'] = []
    elif isinstance(provenance['notes'], str):
        provenance['notes'] = [provenance['notes']]
    provenance['notes'].append(
        f"Region code resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX → {region_code} ({region_label}) via Wikidata P131 chain"
    )

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        # Rename only when the GHCID changed and the target name is free.
        if new_filepath != filepath and not new_filepath.exists():
            filepath.rename(new_filepath)
            print(f" Renamed: {filepath.name}{new_filename}")
    return True
def _collect_xx_files(custodian_dir: Path, country_filter: Optional[str]) -> List[Dict[str, Any]]:
    """Scan '*-XX-*.yaml' custodian files and collect those usable for resolution.

    A file qualifies when it carries both a Wikidata entity ID (from
    wikidata_enrichment or original_entry) and a country code; unreadable
    files are reported and skipped.  When country_filter is given, only
    files for that country are returned.
    """
    collected: List[Dict[str, Any]] = []
    for filepath in custodian_dir.glob('*-XX-*.yaml'):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            # The Wikidata ID may live in either section; enrichment wins.
            wd_id = None
            if 'wikidata_enrichment' in data:
                wd_id = data['wikidata_enrichment'].get('wikidata_entity_id')
            if not wd_id and 'original_entry' in data:
                wd_id = data['original_entry'].get('wikidata_id')
            country = None
            if 'ghcid' in data and 'location_resolution' in data['ghcid']:
                country = data['ghcid']['location_resolution'].get('country_code')
            if wd_id and country:
                if country_filter is None or country == country_filter:
                    collected.append({
                        'filepath': filepath,
                        'wikidata_id': wd_id,
                        'country': country
                    })
        except Exception as e:
            print(f"Error reading {filepath}: {e}")
            continue
    return collected


def main():
    """Main entry point: find XX-region custodian files, resolve their region
    codes via Wikidata P131, and update/rename the files (dry run by default)."""
    import argparse
    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using Wikidata P131'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=50,
                        help='Limit number of files to process per batch')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country (e.g., FR, KR)')
    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)
    dry_run = not args.apply

    print("=" * 70)
    print("XX REGION CODE RESOLUTION VIA WIKIDATA P131")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Find all files with XX region code (optionally for one country).
    xx_files = _collect_xx_files(custodian_dir, args.country)
    print(f"Found {len(xx_files)} files with XX region codes")
    if args.country:
        print(f"Filtering to country: {args.country}")

    # Limit for batch processing.
    files_to_process = xx_files[:args.limit]
    print(f"Processing {len(files_to_process)} files (limit: {args.limit})")
    print()

    # Group by Wikidata ID so each entity is queried only once.
    by_wikidata: Dict[str, List[Dict[str, Any]]] = {}
    for item in files_to_process:
        by_wikidata.setdefault(item['wikidata_id'], []).append(item)
    print(f"Unique Wikidata IDs to query: {len(by_wikidata)}")

    # Query Wikidata in batches of 50 with a 1s pause between batches.
    batch_size = 50
    wikidata_ids = list(by_wikidata.keys())
    all_results: Dict[str, Dict[str, Any]] = {}
    for i in range(0, len(wikidata_ids), batch_size):
        batch = wikidata_ids[i:i + batch_size]
        print(f"\nQuerying Wikidata batch {i//batch_size + 1} ({len(batch)} IDs)...")
        all_results.update(batch_query_admin_regions(batch))
        if i + batch_size < len(wikidata_ids):
            time.sleep(1)
    print(f"\nReceived data for {len(all_results)} entities")

    # Apply the resolved codes file by file.
    resolved = 0
    not_resolved = 0
    for item in files_to_process:
        wd_id = item['wikidata_id']
        filepath = item['filepath']
        expected_country = item['country']
        wd_data = all_results.get(wd_id)
        if not wd_data or not wd_data.get('admin_codes'):
            print(f" No admin data: {filepath.name} ({wd_id})")
            not_resolved += 1
            continue
        # Pick the ISO 3166-2 code whose country part matches this file.
        region_code = None
        region_label = None
        for iso_code in wd_data['admin_codes']:
            country, region = extract_region_from_iso_code(iso_code)
            if country == expected_country:
                region_code = region
                region_label = wd_data['admin_labels'].get(iso_code, '')
                break
        if not region_code:
            print(f" No matching region for {expected_country}: {filepath.name} ({wd_id})")
            print(f" Found: {wd_data['admin_codes']}")
            not_resolved += 1
            continue
        # Ensure region_label is not None before it reaches the file.
        if region_label is None:
            region_label = ""
        print(f" {filepath.name}: XX → {region_code} ({region_label})")
        if update_custodian_file(filepath, region_code, region_label, wd_id, dry_run=dry_run):
            resolved += 1

    print()
    print("=" * 70)
    print("SUMMARY")  # was an f-string with no placeholders
    print("=" * 70)
    print(f"Files processed: {len(files_to_process)}")
    print(f"Resolved: {resolved}")
    print(f"Not resolved: {not_resolved}")
    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")
# Script entry point: run the resolver when executed directly (not on import).
if __name__ == '__main__':
    main()