#!/usr/bin/env python3
"""
Resolve XX region codes and XXX settlement codes using GeoNames reverse geocoding.

This script:
1. Finds files with XX region or XXX settlement codes
2. Extracts coordinates from the file or queries Wikidata P625
3. Uses GeoNames database for reverse geocoding
4. Updates files with resolved region and settlement codes
5. Renames files to match new GHCID

Following AGENTS.md Rules:
- Rule 5: Additive only - never delete existing data
- GHCID GeoNames rules: Filter by feature_code (exclude PPLX neighborhoods)
"""
|
|
|
|
# Standard library
import json
import math
import os
import re
import sqlite3
import sys
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Third-party
import yaml
|
|
|
|
|
|
# GeoNames admin1 code to ISO 3166-2 mapping
# Format: country_code: {geonames_admin1: iso_region_code}
# NOTE: CN entries for Henan/Hebei/Hubei corrected to ISO 3166-2:CN
# (Henan=HA, Hebei=HE, Hubei=HB, Hunan=HN); the previous table mapped
# both Henan and Hunan to 'HN', making the mapping ambiguous.
ADMIN1_TO_ISO = {
    'FR': {
        '11': 'IDF',  # Île-de-France
        '24': 'CVL',  # Centre-Val de Loire
        '27': 'BFC',  # Bourgogne-Franche-Comté
        '28': 'NOR',  # Normandy
        '32': 'HDF',  # Hauts-de-France
        '44': 'GES',  # Grand Est
        '52': 'PDL',  # Pays de la Loire
        '53': 'BRE',  # Brittany
        '75': 'NAQ',  # Nouvelle-Aquitaine
        '76': 'OCC',  # Occitanie
        '84': 'ARA',  # Auvergne-Rhône-Alpes
        '93': 'PAC',  # Provence-Alpes-Côte d'Azur
        '94': 'COR',  # Corsica
    },
    'DE': {
        '01': 'BW',  # Baden-Württemberg
        '02': 'BY',  # Bavaria
        '03': 'BE',  # Berlin
        '04': 'BB',  # Brandenburg
        '05': 'HB',  # Bremen
        '06': 'HH',  # Hamburg
        '07': 'HE',  # Hesse
        '08': 'MV',  # Mecklenburg-Vorpommern
        '09': 'NI',  # Lower Saxony
        '10': 'NW',  # North Rhine-Westphalia
        '11': 'RP',  # Rhineland-Palatinate
        '12': 'SL',  # Saarland
        '13': 'SN',  # Saxony
        '14': 'ST',  # Saxony-Anhalt
        '15': 'SH',  # Schleswig-Holstein
        '16': 'TH',  # Thuringia
    },
    'US': {
        'AL': 'AL', 'AK': 'AK', 'AZ': 'AZ', 'AR': 'AR', 'CA': 'CA',
        'CO': 'CO', 'CT': 'CT', 'DE': 'DE', 'FL': 'FL', 'GA': 'GA',
        'HI': 'HI', 'ID': 'ID', 'IL': 'IL', 'IN': 'IN', 'IA': 'IA',
        'KS': 'KS', 'KY': 'KY', 'LA': 'LA', 'ME': 'ME', 'MD': 'MD',
        'MA': 'MA', 'MI': 'MI', 'MN': 'MN', 'MS': 'MS', 'MO': 'MO',
        'MT': 'MT', 'NE': 'NE', 'NV': 'NV', 'NH': 'NH', 'NJ': 'NJ',
        'NM': 'NM', 'NY': 'NY', 'NC': 'NC', 'ND': 'ND', 'OH': 'OH',
        'OK': 'OK', 'OR': 'OR', 'PA': 'PA', 'RI': 'RI', 'SC': 'SC',
        'SD': 'SD', 'TN': 'TN', 'TX': 'TX', 'UT': 'UT', 'VT': 'VT',
        'VA': 'VA', 'WA': 'WA', 'WV': 'WV', 'WI': 'WI', 'WY': 'WY',
        'DC': 'DC',
    },
    'GB': {
        'ENG': 'ENG', 'NIR': 'NIR', 'SCT': 'SCT', 'WLS': 'WLS',
    },
    'AU': {
        '01': 'ACT',  # Australian Capital Territory
        '02': 'NSW',  # New South Wales
        '03': 'NT',   # Northern Territory
        '04': 'QLD',  # Queensland
        '05': 'SA',   # South Australia
        '06': 'TAS',  # Tasmania
        '07': 'VIC',  # Victoria
        '08': 'WA',   # Western Australia
    },
    'CA': {
        '01': 'AB',  # Alberta
        '02': 'BC',  # British Columbia
        '03': 'MB',  # Manitoba
        '04': 'NB',  # New Brunswick
        '05': 'NL',  # Newfoundland and Labrador
        '07': 'NS',  # Nova Scotia
        '08': 'ON',  # Ontario
        '09': 'PE',  # Prince Edward Island
        '10': 'QC',  # Quebec
        '11': 'SK',  # Saskatchewan
        '12': 'YT',  # Yukon
        '13': 'NT',  # Northwest Territories
        '14': 'NU',  # Nunavut
    },
    'JP': {
        '01': 'HKD',  # Hokkaido
        '02': 'AOM',  # Aomori
        '03': 'IWT',  # Iwate
        '04': 'MYG',  # Miyagi
        '05': 'AKT',  # Akita
        '06': 'YGT',  # Yamagata
        '07': 'FKS',  # Fukushima
        '08': 'IBR',  # Ibaraki
        '09': 'TCG',  # Tochigi
        '10': 'GNM',  # Gunma
        '11': 'SIT',  # Saitama
        '12': 'CHB',  # Chiba
        '13': 'TKY',  # Tokyo
        '14': 'KGW',  # Kanagawa
        '15': 'NGT',  # Niigata
        '16': 'TYM',  # Toyama
        '17': 'ISK',  # Ishikawa
        '18': 'FKI',  # Fukui
        '19': 'YMN',  # Yamanashi
        '20': 'NGN',  # Nagano
        '21': 'GFU',  # Gifu
        '22': 'SZO',  # Shizuoka
        '23': 'AIC',  # Aichi
        '24': 'MIE',  # Mie
        '25': 'SIG',  # Shiga
        '26': 'KYO',  # Kyoto
        '27': 'OSK',  # Osaka
        '28': 'HYG',  # Hyogo
        '29': 'NAR',  # Nara
        '30': 'WKY',  # Wakayama
        '31': 'TTR',  # Tottori
        '32': 'SMN',  # Shimane
        '33': 'OKY',  # Okayama
        '34': 'HIR',  # Hiroshima
        '35': 'YGC',  # Yamaguchi
        '36': 'TKS',  # Tokushima
        '37': 'KGW',  # Kagawa
        '38': 'EHM',  # Ehime
        '39': 'KOC',  # Kochi
        '40': 'FKO',  # Fukuoka
        '41': 'SAG',  # Saga
        '42': 'NGS',  # Nagasaki
        '43': 'KMM',  # Kumamoto
        '44': 'OIT',  # Oita
        '45': 'MYZ',  # Miyazaki
        '46': 'KGS',  # Kagoshima
        '47': 'OKN',  # Okinawa
    },
    'CN': {
        '01': 'AH',  # Anhui
        '02': 'ZJ',  # Zhejiang
        '03': 'JX',  # Jiangxi
        '04': 'JS',  # Jiangsu
        '05': 'JL',  # Jilin
        '06': 'QH',  # Qinghai
        '07': 'FJ',  # Fujian
        '08': 'HL',  # Heilongjiang
        '09': 'HA',  # Henan (ISO 3166-2: CN-HA)
        '10': 'HE',  # Hebei (ISO 3166-2: CN-HE)
        '11': 'HN',  # Hunan (ISO 3166-2: CN-HN)
        '12': 'HB',  # Hubei (ISO 3166-2: CN-HB)
        '13': 'XZ',  # Tibet
        '14': 'XJ',  # Xinjiang
        '15': 'NX',  # Ningxia
        '16': 'NM',  # Inner Mongolia
        '18': 'SD',  # Shandong
        '19': 'SX',  # Shanxi
        '20': 'SN',  # Shaanxi
        '21': 'TJ',  # Tianjin
        '22': 'BJ',  # Beijing
        '23': 'SH',  # Shanghai
        '24': 'HI',  # Hainan
        '25': 'CQ',  # Chongqing
        '26': 'GS',  # Gansu
        '28': 'GX',  # Guangxi
        '29': 'SC',  # Sichuan
        '30': 'GD',  # Guangdong
        '31': 'YN',  # Yunnan
        '32': 'GZ',  # Guizhou
        '33': 'LN',  # Liaoning
    },
    'KR': {
        '01': 'SO',  # Seoul
        '02': 'BS',  # Busan
        '03': 'TG',  # Daegu
        '04': 'IN',  # Incheon
        '05': 'GJ',  # Gwangju
        '06': 'DJ',  # Daejeon
        '07': 'US',  # Ulsan
        '08': 'SJ',  # Sejong
        '10': 'KG',  # Gyeonggi
        '11': 'KW',  # Gangwon
        '12': 'CB',  # North Chungcheong
        '13': 'CN',  # South Chungcheong
        '14': 'JB',  # North Jeolla
        '15': 'JN',  # South Jeolla
        '16': 'KB',  # North Gyeongsang
        '17': 'KN',  # South Gyeongsang
        '18': 'JJ',  # Jeju
    },
    'BR': {
        '01': 'AC',  # Acre
        '02': 'AL',  # Alagoas
        '03': 'AP',  # Amapá
        '04': 'AM',  # Amazonas
        '05': 'BA',  # Bahia
        '06': 'CE',  # Ceará
        '07': 'DF',  # Distrito Federal
        '08': 'ES',  # Espírito Santo
        '11': 'MS',  # Mato Grosso do Sul
        '14': 'RN',  # Rio Grande do Norte
        '16': 'RS',  # Rio Grande do Sul
        '17': 'RJ',  # Rio de Janeiro
        '18': 'RO',  # Rondônia
        '19': 'RR',  # Roraima
        '20': 'SC',  # Santa Catarina
        '21': 'GO',  # Goiás
        '22': 'MA',  # Maranhão
        '23': 'MT',  # Mato Grosso
        '24': 'MG',  # Minas Gerais
        '25': 'PA',  # Pará
        '26': 'PB',  # Paraíba
        '27': 'SP',  # São Paulo
        '28': 'SE',  # Sergipe
        '29': 'TO',  # Tocantins
        '30': 'PE',  # Pernambuco
        '31': 'PI',  # Piauí
        '32': 'PR',  # Paraná
    },
    # Add more countries as needed
}
|
|
|
|
# Valid feature codes for settlements (exclude PPLX neighborhoods)
# GeoNames "P" class codes for populated places; PPLX (section of a
# populated place, i.e. a neighborhood) is deliberately absent.
# NOTE(review): reverse_geocode() repeats this list inline in its SQL —
# keep the two in sync.
VALID_FEATURE_CODES = {'PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG'}
|
|
|
|
|
|
def get_geonames_connection(db_path: str = 'data/reference/geonames.db') -> sqlite3.Connection:
    """Open and return a SQLite connection to the GeoNames reference database."""
    connection = sqlite3.connect(db_path)
    return connection
|
|
|
|
|
|
def reverse_geocode(lat: float, lon: float, country_code: str, conn: sqlite3.Connection) -> Optional[Dict[str, Any]]:
    """Reverse geocode coordinates to the nearest settlement using GeoNames.

    Following AGENTS.md: Filter by feature_code to exclude PPLX (neighborhoods).

    Args:
        lat, lon: target coordinates in decimal degrees.
        country_code: country to restrict the search to (matches cities.country_code).
        conn: open connection to the GeoNames SQLite database (table `cities`).

    Returns:
        Dict describing the nearest valid settlement (including 'distance_km'),
        or None when the country has no candidate rows.
    """
    # Longitude degrees shrink by cos(latitude), so weight the longitude
    # difference accordingly (equirectangular approximation). Without this
    # the nearest-neighbour ranking and the km estimate are biased east-west
    # at non-equatorial latitudes.
    cos_lat = math.cos(math.radians(lat))

    # Feature-code list mirrors VALID_FEATURE_CODES (PPLX excluded).
    query = """
        SELECT
            geonames_id, name, ascii_name, admin1_code, admin1_name,
            latitude, longitude, population, feature_code,
            ((latitude - ?) * (latitude - ?)
             + (longitude - ?) * (longitude - ?) * ? * ?) as distance_sq
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY distance_sq
        LIMIT 1
    """

    cursor = conn.execute(query, (lat, lat, lon, lon, cos_lat, cos_lat, country_code))
    row = cursor.fetchone()

    if not row:
        return None

    # distance_sq already carries the cos(lat) correction, so sqrt gives an
    # angular distance in degrees; 1 degree ≈ 111 km converts to kilometres.
    distance_km = math.sqrt(row[9]) * 111

    return {
        'geonames_id': row[0],
        'name': row[1],
        'ascii_name': row[2],
        'admin1_code': row[3],
        'admin1_name': row[4],
        'latitude': row[5],
        'longitude': row[6],
        'population': row[7],
        'feature_code': row[8],
        'distance_km': distance_km
    }
|
|
|
|
|
|
def generate_city_code(name: str) -> str:
    """Generate a 3-letter city code from a settlement name.

    Rules from AGENTS.md:
    - Single word: First 3 letters
    - Dutch article: Article initial + 2 from main word
    - Multi-word: Initials (up to 3)
    """
    cleaned = name.strip()
    parts = cleaned.split()

    # One word: just take its first three characters.
    if len(parts) == 1:
        return cleaned[:3].upper()

    # Dutch/French articles to check for at the start of the name.
    leading_articles = ["'s", "de", "den", "het", "la", "le", "les", "l'", "the"]
    if parts[0].lower() in leading_articles and len(parts) >= 2:
        # Article initial plus the first two letters of the main word.
        return (parts[0][0] + parts[1][:2]).upper()

    # Multi-word name: up to three initials from words starting with a letter.
    return ''.join(word[0] for word in parts if word[0].isalpha())[:3].upper()
|
|
|
|
|
|
def get_iso_region_code(country_code: str, admin1_code: str) -> str:
    """Map a GeoNames admin1_code to an ISO 3166-2 region code.

    Falls back to the raw admin1_code (which already matches ISO for many
    countries), or 'XX' when no admin1 code is available.
    """
    country_map = ADMIN1_TO_ISO.get(country_code, {})
    if admin1_code in country_map:
        return country_map[admin1_code]

    # Fallback: the GeoNames code itself, or the XX placeholder when empty.
    return admin1_code or 'XX'
|
|
|
|
|
|
def extract_coordinates(data: Dict[str, Any]) -> Optional[Tuple[float, float]]:
    """Pull a (latitude, longitude) pair out of a custodian record.

    Sources are checked in priority order: the first locations[] entry,
    then ghcid.location_resolution.source_coordinates, then
    wikidata_enrichment.coordinates. Returns None when none of them
    carries both keys.
    """

    def _pair(mapping: Dict[str, Any]) -> Optional[Tuple[float, float]]:
        # Both keys must be present; values are coerced to float.
        if 'latitude' in mapping and 'longitude' in mapping:
            return (float(mapping['latitude']), float(mapping['longitude']))
        return None

    # 1) First entry of the locations array.
    if 'locations' in data and data['locations']:
        found = _pair(data['locations'][0])
        if found:
            return found

    # 2) Coordinates recorded during GHCID location resolution.
    if 'ghcid' in data:
        source = data['ghcid'].get('location_resolution', {}).get('source_coordinates', {})
        found = _pair(source)
        if found:
            return found

    # 3) Coordinates from Wikidata enrichment.
    if 'wikidata_enrichment' in data and 'coordinates' in data['wikidata_enrichment']:
        found = _pair(data['wikidata_enrichment']['coordinates'])
        if found:
            return found

    return None
|
|
|
|
|
|
def get_wikidata_id(data: Dict[str, Any]) -> Optional[str]:
    """Extract a Wikidata entity ID from a custodian record.

    Prefers wikidata_enrichment.wikidata_entity_id, then falls back to
    original_entry.wikidata_id. Returns None when neither is present.
    """
    for section, key in (('wikidata_enrichment', 'wikidata_entity_id'),
                         ('original_entry', 'wikidata_id')):
        if section in data:
            qid = data[section].get(key)
            if qid:
                return qid

    return None
|
|
|
|
|
|
def query_wikidata_coordinates(wikidata_ids: List[str]) -> Dict[str, Tuple[float, float]]:
    """Batch-fetch P625 coordinates for Wikidata entities via SPARQL.

    Returns {qid: (lat, lon)}; entities without a P625 value are omitted.
    Network/parse failures are reported and yield an empty dict.
    """
    if not wikidata_ids:
        return {}

    value_clause = ' '.join(f'wd:{qid}' for qid in wikidata_ids)

    query = f"""
    SELECT ?item ?coords WHERE {{
      VALUES ?item {{ {value_clause} }}
      ?item wdt:P625 ?coords.
    }}
    """

    url = "https://query.wikidata.org/sparql"
    headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
    }
    payload = urllib.parse.urlencode({'query': query}).encode('utf-8')

    try:
        request = urllib.request.Request(url, data=payload, headers=headers)
        with urllib.request.urlopen(request, timeout=60) as response:
            result = json.loads(response.read().decode('utf-8'))
        bindings = result.get('results', {}).get('bindings', [])
    except Exception as e:
        print(f" Wikidata SPARQL error: {e}")
        return {}

    # WKT literals come back as "Point(lon lat)" — note lon first.
    point_pattern = re.compile(r'Point\(([^\s]+)\s+([^\)]+)\)')

    coords_map: Dict[str, Tuple[float, float]] = {}
    for binding in bindings:
        item_uri = binding.get('item', {}).get('value', '')
        coords_str = binding.get('coords', {}).get('value', '')
        if not (item_uri and coords_str):
            continue

        match = point_pattern.search(coords_str)
        if match:
            qid = item_uri.split('/')[-1]
            # Swap to (lat, lon) order for the caller.
            coords_map[qid] = (float(match.group(2)), float(match.group(1)))

    return coords_map
|
|
|
|
|
|
def update_custodian_file(filepath: Path, geo_result: Dict[str, Any],
                          country_code: str, dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """Update a custodian file with resolved location data.

    Fills XX/XXX placeholder codes in ghcid.location_resolution from a
    GeoNames reverse-geocode result, rewrites the GHCID string, appends
    history and provenance entries (additive only, per AGENTS.md Rule 5),
    and — unless dry_run — writes the file and renames it to the new GHCID.

    Returns:
        (updated, new_path): updated is True when any code was changed;
        new_path is the renamed path, or None when the filename is unchanged.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False, None

    # Files without a ghcid block cannot be updated.
    if 'ghcid' not in data:
        return False, None

    ghcid = data['ghcid']
    if 'location_resolution' not in ghcid:
        ghcid['location_resolution'] = {}

    loc_res = ghcid['location_resolution']

    # Map the GeoNames result onto project region/city codes.
    region_code = get_iso_region_code(country_code, geo_result.get('admin1_code', ''))
    city_code = generate_city_code(geo_result['ascii_name'])

    # Only placeholder codes are replaced; already-resolved codes are kept.
    old_region = loc_res.get('region_code', 'XX')
    old_city = loc_res.get('city_code', 'XXX')

    changes = []

    if old_region == 'XX' and region_code != 'XX':
        loc_res['region_code'] = region_code
        loc_res['region_name'] = geo_result.get('admin1_name', '')
        changes.append(f"region XX→{region_code}")

    if old_city == 'XXX':
        loc_res['city_code'] = city_code
        loc_res['city_name'] = geo_result['name']
        changes.append(f"city XXX→{city_code}")

    # Nothing to resolve — leave the file untouched.
    if not changes:
        return False, None

    # Update GeoNames metadata recording how/when the resolution happened.
    loc_res['method'] = 'REVERSE_GEOCODE'
    loc_res['geonames_id'] = geo_result['geonames_id']
    loc_res['geonames_name'] = geo_result['name']
    loc_res['feature_code'] = geo_result['feature_code']
    loc_res['population'] = geo_result.get('population')
    loc_res['distance_km'] = round(geo_result['distance_km'], 2)
    loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat()

    # Update GHCID string by substituting the placeholder segments.
    old_ghcid = ghcid.get('ghcid_current', '')
    new_ghcid = old_ghcid

    if old_region == 'XX' and region_code != 'XX':
        new_ghcid = new_ghcid.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-')

    if old_city == 'XXX':
        # Find and replace XXX in GHCID
        new_ghcid = new_ghcid.replace('-XXX-', f'-{city_code}-')

    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid

        # Add to history (additive — prior GHCIDs are preserved).
        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []

        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'reason': f"Location resolved via GeoNames reverse geocoding: {', '.join(changes)}"
        })

    # Add provenance note; normalize a legacy string-valued notes field
    # into a list before appending.
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    elif isinstance(data['provenance']['notes'], str):
        data['provenance']['notes'] = [data['provenance']['notes']]

    data['provenance']['notes'].append(
        f"Location resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"{', '.join(changes)} via GeoNames ({geo_result['name']}, {geo_result.get('admin1_name', '')})"
    )

    # Determine new filename using the same placeholder substitutions
    # applied to the GHCID string.
    new_filename = filepath.name
    if old_region == 'XX' and region_code != 'XX':
        new_filename = new_filename.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-')
    if old_city == 'XXX':
        new_filename = new_filename.replace('-XXX-', f'-{city_code}-')

    new_filepath = filepath.parent / new_filename

    if not dry_run:
        # Write updated file
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Rename if needed — never clobber an existing file; if the target
        # exists, the updated content stays under the old name.
        if new_filepath != filepath and not new_filepath.exists():
            filepath.rename(new_filepath)

    return True, new_filepath if new_filepath != filepath else None
|
|
|
|
|
|
def main():
    """CLI entry point: resolve XX/XXX placeholder codes in custodian files.

    Dry-run by default; pass --apply to write changes and rename files.
    """
    import argparse  # local import: only needed when run as a script

    parser = argparse.ArgumentParser(
        description='Resolve XX/XXX location codes using GeoNames reverse geocoding'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--db', type=str, default='data/reference/geonames.db',
                        help='Path to GeoNames database')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("LOCATION RESOLUTION VIA GEONAMES REVERSE GEOCODING")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Connect to GeoNames DB
    conn = get_geonames_connection(args.db)

    # Find files with XX region or XXX city codes.
    # NOTE(review): glob is non-recursive — only the top level of the
    # custodian directory is scanned.
    files_to_process = []

    for filepath in custodian_dir.glob('*-XX-*.yaml'):
        files_to_process.append(filepath)
    for filepath in custodian_dir.glob('*-XXX-*.yaml'):
        if filepath not in files_to_process:
            files_to_process.append(filepath)

    print(f"Found {len(files_to_process)} files with XX or XXX codes")

    # Load files and extract country/coordinates/Wikidata ID.
    # NOTE(review): --limit is applied before the --country filter, so with
    # --country set, fewer than `limit` matching files may be processed.
    file_data = []
    for filepath in files_to_process[:args.limit]:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            # Get country code
            country = None
            if 'ghcid' in data and 'location_resolution' in data['ghcid']:
                country = data['ghcid']['location_resolution'].get('country_code')

            # Without a country code the GeoNames query cannot be scoped.
            if not country:
                continue

            if args.country and country != args.country:
                continue

            # Get coordinates
            coords = extract_coordinates(data)
            wikidata_id = get_wikidata_id(data)

            file_data.append({
                'filepath': filepath,
                'data': data,
                'country': country,
                'coords': coords,
                'wikidata_id': wikidata_id
            })
        except Exception as e:
            print(f"Error loading {filepath}: {e}")

    print(f"Processing {len(file_data)} files")

    # Separate files with and without coordinates
    with_coords = [f for f in file_data if f['coords']]
    without_coords = [f for f in file_data if not f['coords'] and f['wikidata_id']]
    no_location = [f for f in file_data if not f['coords'] and not f['wikidata_id']]

    print(f" With coordinates: {len(with_coords)}")
    print(f" Need Wikidata lookup: {len(without_coords)}")
    print(f" No location data: {len(no_location)}")
    print()

    # Query Wikidata for missing coordinates
    if without_coords:
        print("Querying Wikidata for coordinates...")
        wikidata_ids = [f['wikidata_id'] for f in without_coords]

        # Batch in groups of 50 with a 1s pause between batches
        all_coords = {}
        for i in range(0, len(wikidata_ids), 50):
            batch = wikidata_ids[i:i+50]
            coords = query_wikidata_coordinates(batch)
            all_coords.update(coords)
            if i + 50 < len(wikidata_ids):
                import time
                time.sleep(1)  # Rate limiting

        print(f" Retrieved coordinates for {len(all_coords)} entities")

        # Update file_data with Wikidata coordinates
        for f in without_coords:
            if f['wikidata_id'] in all_coords:
                f['coords'] = all_coords[f['wikidata_id']]
                with_coords.append(f)

    print()
    print(f"Files with resolvable coordinates: {len(with_coords)}")
    print()

    # Process files with coordinates
    resolved = 0
    renamed = 0

    for f in with_coords:
        filepath = f['filepath']
        country = f['country']
        lat, lon = f['coords']

        # Reverse geocode
        geo_result = reverse_geocode(lat, lon, country, conn)

        if not geo_result:
            print(f" No GeoNames match: {filepath.name}")
            continue

        # Update file (and rename, unless dry_run)
        success, new_path = update_custodian_file(filepath, geo_result, country, dry_run=dry_run)

        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f" {filepath.name} → {new_path.name}")
            else:
                print(f" Updated: {filepath.name}")

    conn.close()

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"No location data: {len(no_location)}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")
|
|
|
|
|
|
# Script entry point.
if __name__ == '__main__':
    main()
|