365 lines
12 KiB
Python
365 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Resolve XX region codes using Wikidata P131 (located in administrative territorial entity).
|
|
|
|
This script:
|
|
1. Reads custodian files with XX region codes
|
|
2. Queries Wikidata for P131 administrative hierarchy
|
|
3. Extracts ISO 3166-2 region codes (P300)
|
|
4. Updates the files with resolved region codes
|
|
|
|
Following AGENTS.md Rule 5: Additive only - never delete existing data.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import yaml
|
|
import json
|
|
import time
|
|
import subprocess
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any, List, Tuple
|
|
import re
|
|
|
|
|
|
def query_wikidata_sparql(sparql_query: str) -> Optional[List[Dict]]:
    """Execute a SPARQL query against the public Wikidata endpoint.

    Args:
        sparql_query: SPARQL query text to POST (form-encoded).

    Returns:
        The list of result bindings from the JSON response, or None if the
        request failed or the response could not be parsed.
    """
    import urllib.request
    import urllib.parse
    import urllib.error

    url = "https://query.wikidata.org/sparql"
    headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
    }

    # POST the query so long VALUES clauses are not limited by URL length.
    data = urllib.parse.urlencode({'query': sparql_query}).encode('utf-8')

    try:
        request = urllib.request.Request(url, data=data, headers=headers)
        with urllib.request.urlopen(request, timeout=60) as response:
            result = json.loads(response.read().decode('utf-8'))
        return result.get('results', {}).get('bindings', [])
    except (urllib.error.URLError, TimeoutError,
            json.JSONDecodeError, UnicodeDecodeError) as e:
        # Narrowed from a blanket `except Exception` so genuine programming
        # errors are no longer silently swallowed as "SPARQL error".
        print(f" SPARQL error: {e}")
        return None
|
|
|
|
|
|
def batch_query_admin_regions(wikidata_ids: List[str]) -> Dict[str, Dict[str, Any]]:
    """Query Wikidata for the administrative regions of multiple entities.

    Follows each entity's P131 (located-in) chain transitively and collects
    every ancestor carrying a P300 (ISO 3166-2) code, plus the entity's
    P625 coordinates when present.

    Args:
        wikidata_ids: Wikidata QIDs (e.g. "Q90") to look up.

    Returns:
        Mapping of QID -> {'label', 'admin_codes', 'admin_labels', 'coords'}:
        'admin_codes' is a list of ISO 3166-2 codes, 'admin_labels' maps each
        code to its English label, 'coords' is None or
        {'longitude': float, 'latitude': float}. Empty dict when the query
        fails or no IDs were supplied.
    """
    # Guard: an empty VALUES clause is a SPARQL syntax error at the endpoint,
    # so short-circuit instead of issuing a request guaranteed to fail.
    if not wikidata_ids:
        return {}

    # Build VALUES clause with QIDs
    values = ' '.join(f'wd:{qid}' for qid in wikidata_ids)

    # Query for P131 chain and P300 (ISO 3166-2 code)
    query = f"""
    SELECT ?item ?itemLabel ?admin1 ?admin1Label ?admin1Code ?coords WHERE {{
      VALUES ?item {{ {values} }}

      # Get first-level admin division with ISO 3166-2 code
      OPTIONAL {{
        ?item wdt:P131+ ?admin1.
        ?admin1 wdt:P300 ?admin1Code.
      }}

      # Get coordinates
      OPTIONAL {{
        ?item wdt:P625 ?coords.
      }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """

    results = query_wikidata_sparql(query)
    if not results:
        return {}

    # Process results - group by item (one row per admin ancestor / coord combo)
    processed: Dict[str, Dict[str, Any]] = {}
    for row in results:
        item_uri = row.get('item', {}).get('value', '')
        if not item_uri:
            continue

        # Item URIs look like http://www.wikidata.org/entity/Q90
        qid = item_uri.split('/')[-1]

        if qid not in processed:
            processed[qid] = {
                'label': row.get('itemLabel', {}).get('value', ''),
                'admin_codes': set(),  # set: the P131+ chain yields duplicate rows
                'admin_labels': {},
                'coords': None
            }

        # Extract ISO 3166-2 code
        if 'admin1Code' in row:
            code = row['admin1Code'].get('value', '')
            if code:
                processed[qid]['admin_codes'].add(code)
                label = row.get('admin1Label', {}).get('value', '')
                processed[qid]['admin_labels'][code] = label

        # Extract coordinates (first row with coordinates wins)
        if 'coords' in row and not processed[qid]['coords']:
            coords_str = row['coords'].get('value', '')
            # Parse WKT "Point(lon lat)" format
            match = re.search(r'Point\(([^\s]+)\s+([^\)]+)\)', coords_str)
            if match:
                processed[qid]['coords'] = {
                    'longitude': float(match.group(1)),
                    'latitude': float(match.group(2))
                }

    # Convert sets to lists (stable, serialisable output for callers)
    for info in processed.values():
        info['admin_codes'] = list(info['admin_codes'])

    return processed
|
|
|
|
|
|
def extract_region_from_iso_code(iso_code: str) -> Tuple[str, str]:
    """Split an ISO 3166-2 code such as 'FR-IDF' into (country, region).

    Only the first hyphen is significant; a code without any hyphen yields
    an empty country part, e.g. 'IDF' -> ('', 'IDF').
    """
    country, sep, region = iso_code.partition('-')
    if sep:
        return country, region
    return '', iso_code
|
|
|
|
|
|
def update_custodian_file(filepath: Path, region_code: str, region_label: str,
                          wikidata_source: str, dry_run: bool = True) -> bool:
    """Update a custodian YAML file with a resolved region code.

    Rewrites ghcid.location_resolution, patches the GHCID string (recording
    the change in ghcid_history), appends a provenance note, and renames the
    file to match the new GHCID. Changes are additive only (AGENTS.md Rule 5).

    Args:
        filepath: Path to the custodian YAML file.
        region_code: Resolved ISO 3166-2 region part (e.g. 'IDF').
        region_label: Human-readable region label (may be '').
        wikidata_source: QID the resolution was derived from.
        dry_run: When True (default), compute everything but write nothing.

    Returns:
        True if the file was (or, in dry-run, would be) updated.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False

    # safe_load returns None for an empty file; any non-dict top level would
    # make the membership tests below raise TypeError.
    if not isinstance(data, dict):
        print(f" Invalid YAML structure in {filepath}")
        return False

    # Update ghcid.location_resolution
    if 'ghcid' not in data:
        print(f" No ghcid section in {filepath}")
        return False

    ghcid = data['ghcid']
    if 'location_resolution' not in ghcid:
        ghcid['location_resolution'] = {}

    loc_res = ghcid['location_resolution']

    # Only update if currently XX (idempotence: never overwrite a resolution)
    if loc_res.get('region_code') != 'XX':
        print(f" Already resolved: {filepath}")
        return False

    # Update region code
    loc_res['region_code'] = region_code
    loc_res['region_label'] = region_label
    loc_res['region_source'] = 'wikidata_p131'
    loc_res['region_resolved_at'] = datetime.now(timezone.utc).isoformat()
    loc_res['region_wikidata_source'] = wikidata_source

    # Update GHCID string (replace XX with new region code)
    old_ghcid = ghcid.get('ghcid_current', '')
    if old_ghcid and '-XX-' in old_ghcid:
        new_ghcid = old_ghcid.replace('-XX-', f'-{region_code}-')
        ghcid['ghcid_current'] = new_ghcid

        # Add to history
        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []

        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'reason': f"Region code resolved from XX to {region_code} via Wikidata P131"
        })

    # Rename file to match new GHCID
    new_filename = filepath.name.replace('-XX-', f'-{region_code}-')
    new_filepath = filepath.parent / new_filename

    # Add provenance note
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    elif isinstance(data['provenance']['notes'], str):
        # Normalise a scalar note into a list so we can append.
        data['provenance']['notes'] = [data['provenance']['notes']]

    data['provenance']['notes'].append(
        f"Region code resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX → {region_code} ({region_label}) via Wikidata P131 chain"
    )

    if not dry_run:
        # Write updated file (under the old name; rename follows)
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Rename file if GHCID changed
        if new_filepath != filepath and not new_filepath.exists():
            filepath.rename(new_filepath)
            print(f" Renamed: {filepath.name} → {new_filename}")
        elif new_filepath != filepath:
            # Target name already taken: keep the old name rather than
            # clobbering, but flag the collision for manual review.
            print(f" WARNING: {new_filename} already exists; kept {filepath.name}")

    return True
|
|
|
|
|
|
def main():
    """Main entry point.

    Scans the custodian directory for *-XX-*.yaml files, batch-queries
    Wikidata for their P131 admin hierarchy, and updates each file whose
    region can be resolved. Dry run by default; pass --apply to write.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using Wikidata P131'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=50,
                        help='Limit number of files to process per batch')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country (e.g., FR, KR)')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("XX REGION CODE RESOLUTION VIA WIKIDATA P131")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Find all files with XX region code
    xx_files = []
    for filepath in custodian_dir.glob('*-XX-*.yaml'):
        # Read to get Wikidata ID and country
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            # safe_load yields None for empty files; skip quietly instead of
            # tripping the broad except below with a TypeError.
            if not isinstance(data, dict):
                continue

            # Get Wikidata ID (enrichment section preferred, original entry
            # as fallback)
            wd_id = None
            if 'wikidata_enrichment' in data:
                wd_id = data['wikidata_enrichment'].get('wikidata_entity_id')
            if not wd_id and 'original_entry' in data:
                wd_id = data['original_entry'].get('wikidata_id')

            # Get country code
            country = None
            if 'ghcid' in data and 'location_resolution' in data['ghcid']:
                country = data['ghcid']['location_resolution'].get('country_code')

            # Only files with both a QID and a country are resolvable
            if wd_id and country:
                if args.country is None or country == args.country:
                    xx_files.append({
                        'filepath': filepath,
                        'wikidata_id': wd_id,
                        'country': country
                    })
        except Exception as e:
            print(f"Error reading {filepath}: {e}")
            continue

    print(f"Found {len(xx_files)} files with XX region codes")

    if args.country:
        print(f"Filtering to country: {args.country}")

    # Limit for batch processing
    files_to_process = xx_files[:args.limit]
    print(f"Processing {len(files_to_process)} files (limit: {args.limit})")
    print()

    # Group by Wikidata ID to avoid duplicate queries (several files can
    # share one entity)
    by_wikidata = {}
    for item in files_to_process:
        wd_id = item['wikidata_id']
        if wd_id not in by_wikidata:
            by_wikidata[wd_id] = []
        by_wikidata[wd_id].append(item)

    print(f"Unique Wikidata IDs to query: {len(by_wikidata)}")

    # Query Wikidata in batches of 50
    batch_size = 50
    wikidata_ids = list(by_wikidata.keys())
    all_results = {}

    for i in range(0, len(wikidata_ids), batch_size):
        batch = wikidata_ids[i:i+batch_size]
        print(f"\nQuerying Wikidata batch {i//batch_size + 1} ({len(batch)} IDs)...")

        results = batch_query_admin_regions(batch)
        all_results.update(results)

        # Rate limiting: be polite to the public SPARQL endpoint
        if i + batch_size < len(wikidata_ids):
            time.sleep(1)

    print(f"\nReceived data for {len(all_results)} entities")

    # Process files
    resolved = 0
    not_resolved = 0

    for item in files_to_process:
        wd_id = item['wikidata_id']
        filepath = item['filepath']
        expected_country = item['country']

        wd_data = all_results.get(wd_id)
        if not wd_data or not wd_data.get('admin_codes'):
            print(f" No admin data: {filepath.name} ({wd_id})")
            not_resolved += 1
            continue

        # Find matching ISO 3166-2 code for the expected country (the P131+
        # chain may cross borders, so filter by country prefix)
        region_code = None
        region_label = None

        for iso_code in wd_data['admin_codes']:
            country, region = extract_region_from_iso_code(iso_code)
            if country == expected_country:
                region_code = region
                region_label = wd_data['admin_labels'].get(iso_code, '')
                break

        if not region_code:
            print(f" No matching region for {expected_country}: {filepath.name} ({wd_id})")
            print(f" Found: {wd_data['admin_codes']}")
            not_resolved += 1
            continue

        # Ensure region_label is not None
        if region_label is None:
            region_label = ""

        print(f" {filepath.name}: XX → {region_code} ({region_label})")

        if update_custodian_file(filepath, region_code, region_label, wd_id, dry_run=dry_run):
            resolved += 1

    print()
    print("=" * 70)
    # Fixed: was print(f"SUMMARY") — an f-string with no placeholders
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(files_to_process)}")
    print(f"Resolved: {resolved}")
    print(f"Not resolved: {not_resolved}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")
|
|
|
|
|
|
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()
|