Enrichment scripts for country-specific city data: - enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py - enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py - enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py Location resolution utilities: - resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames - resolve_cities_wikidata.py - Use Wikidata P131 for city resolution - resolve_country_codes.py - Standardize country codes - resolve_cz_xx_regions.py - Fix Czech XX region codes - resolve_locations_by_name.py - Name-based location lookup - resolve_regions_from_city.py - Derive regions from city data - update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data CH-Annotator integration: - create_custodian_from_ch_annotator.py - Create custodians from annotations - add_ch_annotator_location_claims.py - Add location claims - extract_locations_ch_annotator.py - Extract locations from annotations Migration and fixes: - migrate_egyptian_from_ch.py - Migrate Egyptian data - migrate_web_archives.py - Migrate web archive data - fix_belgian_cities.py - Fix Belgian city data
568 lines · 19 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Resolve XX region codes using city names already in the file.
|
|
|
|
This script handles files that have city data but unknown region codes.
|
|
It looks up the city in GeoNames to get the admin1 (region) code.
|
|
|
|
Following AGENTS.md Rules:
|
|
- Rule 5: Additive only - never delete existing data
|
|
- GHCID settlement standardization: GeoNames is authoritative
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import yaml
|
|
import sqlite3
|
|
import re
|
|
import unicodedata
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any, List, Tuple
|
|
|
|
# GeoNames SQLite database and the custodian YAML directory, resolved
# relative to this script's location (script dir -> repo root -> data/).
GEONAMES_DB = Path(__file__).parent.parent / "data/reference/geonames.db"
CUSTODIAN_DIR = Path(__file__).parent.parent / "data/custodian"

# GeoNames feature codes accepted as proper settlements.
# PPLX (section/neighbourhood of a populated place) is deliberately
# excluded so lookups resolve to the city itself, not a district of it.
SETTLEMENT_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
|
|
# Country-specific region code mappings (GeoNames admin1 → ISO 3166-2).
# Keys are GeoNames admin1 codes (strings); values are the ISO 3166-2
# region part used in GHCIDs. Countries without an entry here fall
# through to the raw admin1 code in get_region_code().
COUNTRY_ADMIN_MAPS = {
    # Netherlands. NOTE(review): no entry for admin1 '08' — confirm that
    # code is unused in the GeoNames data.
    'NL': {
        '01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR', '05': 'LI',
        '06': 'NB', '07': 'NH', '09': 'UT', '10': 'ZE', '11': 'ZH',
        '15': 'OV', '16': 'FL'
    },
    # Belgium: identity map over province codes (get_region_code() looks
    # these up via admin2 for BE).
    'BE': {
        'VAN': 'VAN', 'VBR': 'VBR', 'VLI': 'VLI', 'VOV': 'VOV', 'VWV': 'VWV',
        'WBR': 'WBR', 'WHT': 'WHT', 'WLG': 'WLG', 'WLX': 'WLX', 'WNA': 'WNA',
        'BRU': 'BRU'
    },
    # Georgia: GeoNames admin1 → ISO 3166-2:GE
    'GE': {
        '51': 'TB',  # Tbilisi
        '04': 'AJ',  # Adjara
        '67': 'KA',  # Kakheti
        '66': 'IM',  # Imereti
        '68': 'KK',  # Kvemo Kartli
        '69': 'MM',  # Mtskheta-Mtianeti
        '70': 'RL',  # Racha-Lechkhumi and Kvemo Svaneti
        '71': 'SZ',  # Samegrelo and Zemo Svaneti
        '72': 'SJ',  # Samtskhe-Javakheti
        '73': 'SK',  # Shida Kartli
        '65': 'GU',  # Guria
    },
    # Czech Republic: GeoNames admin1 → ISO 3166-2:CZ (2-digit NUTS codes)
    # Source: https://en.wikipedia.org/wiki/ISO_3166-2:CZ
    'CZ': {
        '52': '10',  # Prague (Praha)
        '88': '20',  # Central Bohemian (Středočeský kraj)
        '79': '31',  # South Bohemian (Jihočeský kraj)
        '87': '32',  # Plzeň Region (Plzeňský kraj)
        '81': '41',  # Karlovy Vary Region (Karlovarský kraj)
        '89': '42',  # Ústí nad Labem Region (Ústecký kraj)
        '83': '51',  # Liberec Region (Liberecký kraj)
        '82': '52',  # Hradec Králové Region (Královéhradecký kraj)
        '86': '53',  # Pardubice Region (Pardubický kraj)
        '80': '63',  # Vysočina Region
        '78': '64',  # South Moravian (Jihomoravský kraj)
        '84': '71',  # Olomouc Region (Olomoucký kraj)
        '90': '72',  # Zlín Region (Zlínský kraj)
        '85': '80',  # Moravian-Silesian (Moravskoslezský kraj)
    },
    # Austria: GeoNames admin1 → ISO 3166-2:AT (single-digit state codes)
    'AT': {
        '01': '1',  # Burgenland
        '02': '2',  # Kärnten (Carinthia)
        '03': '3',  # Niederösterreich (Lower Austria)
        '04': '4',  # Oberösterreich (Upper Austria)
        '05': '5',  # Salzburg
        '06': '6',  # Steiermark (Styria)
        '07': '7',  # Tirol (Tyrol)
        '08': '8',  # Vorarlberg
        '09': '9',  # Wien (Vienna)
    },
    # Bulgaria: GeoNames admin1 → ISO 3166-2:BG (2-digit province codes)
    'BG': {
        '38': '01',  # Blagoevgrad
        '39': '02',  # Burgas
        '40': '08',  # Dobrich
        '41': '07',  # Gabrovo
        '42': '26',  # Haskovo
        '43': '09',  # Kardzhali (Kurdzhali)
        '44': '10',  # Kyustendil
        '45': '11',  # Lovech
        '46': '12',  # Montana
        '47': '13',  # Pazardzhik
        '48': '14',  # Pernik
        '49': '15',  # Pleven
        '50': '16',  # Plovdiv
        '51': '17',  # Razgrad
        '52': '18',  # Ruse
        '53': '27',  # Shumen
        '54': '19',  # Silistra
        '55': '20',  # Sliven
        '56': '21',  # Smolyan
        '57': '23',  # Sofia (Sofiya-Grad)
        '58': '22',  # Sofia Province (Sofiya)
        '59': '24',  # Stara Zagora
        '60': '25',  # Targovishte
        '61': '03',  # Varna
        '62': '04',  # Veliko Tarnovo
        '63': '05',  # Vidin
        '64': '06',  # Vratsa
        '65': '28',  # Yambol
    },
    # Switzerland: GeoNames already uses ISO 3166-2:CH canton codes, so
    # this is an identity map over all 26 cantons.
    'CH': {
        'AG': 'AG', 'AI': 'AI', 'AR': 'AR', 'BE': 'BE', 'BL': 'BL',
        'BS': 'BS', 'FR': 'FR', 'GE': 'GE', 'GL': 'GL', 'GR': 'GR',
        'JU': 'JU', 'LU': 'LU', 'NE': 'NE', 'NW': 'NW', 'OW': 'OW',
        'SG': 'SG', 'SH': 'SH', 'SO': 'SO', 'SZ': 'SZ', 'TG': 'TG',
        'TI': 'TI', 'UR': 'UR', 'VD': 'VD', 'VS': 'VS', 'ZG': 'ZG',
        'ZH': 'ZH',
    },
    # Vietnam: GeoNames 2-digit admin1 codes → ISO 3166-2:VN province codes
    'VN': {
        '01': 'HN',   # Hanoi (Ha Noi)
        '31': 'HP',   # Hai Phong
        '48': 'DN',   # Da Nang (Đà Nẵng)
        '79': 'SG',   # Ho Chi Minh City (Saigon)
        '92': 'CT',   # Can Tho
        '75': 'DNa',  # Dong Nai
        '24': 'BN',   # Bac Ninh
        '22': 'QN',   # Quang Ninh (Quảng Ninh)
        '38': 'TH',   # Thanh Hoa (Thanh Hóa)
        '46': 'TTH',  # Thua Thien-Hue (Thừa Thiên Huế)
        '40': 'NA',   # Nghe An (Nghệ An)
        '04': 'CB',   # Cao Bang
        '37': 'NB',   # Ninh Binh
        '56': 'KH',   # Khanh Hoa
        '66': 'DLK',  # Dak Lak
        '68': 'LDG',  # Lam Dong
        '91': 'AG',   # An Giang
        '86': 'VL',   # Vinh Long
        '82': 'DTP',  # Dong Thap
        '80': 'TNi',  # Tay Ninh
        '96': 'CMa',  # Ca Mau
        '51': 'QNg',  # Quang Ngai
        '52': 'GL',   # Gia Lai
        '19': 'TN',   # Thai Nguyen
        '25': 'PT',   # Phu Tho
    },
    # Japan: GeoNames admin1 → ISO 3166-2:JP (2-digit prefecture codes)
    # See: https://en.wikipedia.org/wiki/ISO_3166-2:JP
    'JP': {
        '01': '23',  # Aichi
        '02': '05',  # Akita
        '03': '02',  # Aomori
        '04': '12',  # Chiba
        '05': '38',  # Ehime
        '06': '18',  # Fukui
        '07': '40',  # Fukuoka
        '08': '07',  # Fukushima
        '09': '21',  # Gifu
        '10': '10',  # Gunma
        '11': '34',  # Hiroshima
        '12': '01',  # Hokkaido
        '13': '28',  # Hyogo
        '14': '08',  # Ibaraki
        '15': '17',  # Ishikawa
        '16': '03',  # Iwate
        '17': '37',  # Kagawa
        '18': '46',  # Kagoshima
        '19': '14',  # Kanagawa
        '20': '39',  # Kochi
        '21': '43',  # Kumamoto
        '22': '26',  # Kyoto
        '23': '24',  # Mie
        '24': '04',  # Miyagi
        '25': '45',  # Miyazaki
        '26': '20',  # Nagano
        '27': '42',  # Nagasaki
        '28': '29',  # Nara
        '29': '15',  # Niigata
        '30': '44',  # Oita
        '31': '33',  # Okayama
        '32': '27',  # Osaka
        '33': '41',  # Saga
        '34': '11',  # Saitama
        '35': '25',  # Shiga
        '36': '32',  # Shimane
        '37': '22',  # Shizuoka
        '38': '09',  # Tochigi
        '39': '36',  # Tokushima
        '40': '13',  # Tokyo
        '41': '31',  # Tottori
        '42': '16',  # Toyama
        '43': '30',  # Wakayama
        '44': '06',  # Yamagata
        '45': '35',  # Yamaguchi
        '46': '19',  # Yamanashi
        '47': '47',  # Okinawa
    },
    # Egypt: GeoNames admin1 → ISO 3166-2:EG
    # See: https://en.wikipedia.org/wiki/ISO_3166-2:EG
    # NOTE(review): no entry for admin1 '25' — confirm it is unused.
    'EG': {
        '01': 'DK',   # Dakahlia
        '02': 'BA',   # Red Sea (Al Bahr al Ahmar)
        '03': 'BH',   # Beheira
        '04': 'FYM',  # Faiyum
        '05': 'GH',   # Gharbia
        '06': 'ALX',  # Alexandria
        '07': 'IS',   # Ismailia
        '08': 'GZ',   # Giza
        '09': 'MNF',  # Monufia
        '10': 'MN',   # Minya
        '11': 'C',    # Cairo
        '12': 'KB',   # Qalyubia
        '13': 'WAD',  # New Valley (Al Wadi al Jadid)
        '14': 'SHR',  # Sharqia
        '15': 'SUZ',  # Suez
        '16': 'ASN',  # Aswan
        '17': 'AST',  # Asyut
        '18': 'BNS',  # Beni Suweif
        '19': 'PTS',  # Port Said
        '20': 'DT',   # Damietta
        '21': 'KFS',  # Kafr el-Sheikh
        '22': 'MT',   # Matruh
        '23': 'KN',   # Qena
        '24': 'SHG',  # Sohag
        '26': 'JS',   # South Sinai
        '27': 'SIN',  # North Sinai
        '28': 'LX',   # Luxor
    },
}
|
|
|
|
# City name translations (normalized native name → GeoNames ASCII name).
# Keys must already be lowercase with diacritics stripped, because the
# lookup in lookup_city_region() happens after normalize_city_name().
# Many cities in GeoNames use English/anglicized names.
CITY_NAME_TRANSLATIONS = {
    # German → English
    'wien': 'vienna',
    'munchen': 'munich',
    'koln': 'cologne',
    'nurnberg': 'nuremberg',
    'braunschweig': 'brunswick',
    # Czech → GeoNames (identity entries keep the exact-match path explicit)
    'praha': 'prague',
    'plzen': 'pilsen',  # Plzeň → plzen after normalization
    'brno': 'brno',
    'ostrava': 'ostrava',
    # Swiss cities
    'geneve': 'geneva',
    'zurich': 'zurich',
    'bern': 'berne',    # NOTE(review): confirm GeoNames ascii_name is 'Berne'
    'basel': 'basle',   # NOTE(review): confirm GeoNames ascii_name is 'Basle'
    # Italian cities
    'roma': 'rome',
    'milano': 'milan',
    'napoli': 'naples',
    'firenze': 'florence',
    'venezia': 'venice',
    'torino': 'turin',
    # Austrian special cases (keys are post-normalization forms);
    # GeoNames transliterates ö as 'oe', hence 'Sankt Poelten'.
    'st. polten': 'sankt poelten',
    'st polten': 'sankt poelten',
    'sankt polten': 'sankt poelten',
    # Japanese cities — complex administrative strings
    # ("District Gun City Machi/Cho") mapped to the bare city name.
    'haga gun motegi machi': 'motegi',
    'motegi machi': 'motegi',
    # Egyptian landmarks → Cairo
    'nile corniche': 'cairo',
}
|
|
|
|
|
|
def normalize_city_name(name: str) -> str:
    """Lowercase *name* and strip diacritics for GeoNames matching.

    Decomposes the string (NFD) so accents become combining marks,
    drops those marks, then lowercases and trims outer whitespace.
    """
    decomposed = unicodedata.normalize('NFD', name)
    without_marks = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )
    return without_marks.lower().strip()
|
|
|
|
|
|
def clean_city_name(city: str) -> str:
    """Extract the base city name from strings like 'Praha 1' or 'Zlín - Louky'.

    Strips numbered-district suffixes ("Praha 1", "Praha 9 - Běchovice"),
    spaced " - District" suffixes ("Zlín - Louky") and postal-code tails.
    Hyphenated names with no space before the dash ("Kafr el-Sheikh")
    are kept intact.
    """
    # Remove district numbers like "Praha 1", "Praha 9 - Běchovice"
    city = re.sub(r'\s+\d+.*$', '', city)
    # Remove a " - District" suffix. A space before the dash is required:
    # the previous pattern (r'\s*-\s*.*$') matched bare hyphens and
    # truncated genuinely hyphenated names ("Kafr el-Sheikh" → "Kafr el").
    city = re.sub(r'\s+-\s*.*$', '', city)
    # Remove postal code patterns ("123 45 ..."). NOTE: the district rule
    # above already strips any "<space><digits>" tail, so this is only a
    # safety net kept for clarity.
    city = re.sub(r'\s+\d{3}\s*\d{2}.*$', '', city)
    return city.strip()
|
|
|
|
|
|
def lookup_city_region(city_name: str, country: str, conn: sqlite3.Connection) -> Optional[Dict]:
    """Find the best GeoNames settlement match for *city_name* in *country*.

    Search order: exact ascii_name match on the translated name, exact
    match on the normalized name, then a prefix LIKE fallback. Ties are
    broken by population (largest wins). Returns a dict of GeoNames
    fields, or None when nothing matched.
    """
    base_city = clean_city_name(city_name)
    normalized = normalize_city_name(base_city)
    # Map native spellings (already normalized) to GeoNames' anglicized form.
    translated = CITY_NAME_TRANSLATIONS.get(normalized, normalized)

    # SETTLEMENT_FEATURE_CODES is a trusted module constant, so embedding
    # its tuple repr in the SQL is safe; user-derived values stay bound (?).
    exact_sql = f'''
        SELECT geonames_id, name, ascii_name, admin1_code, admin2_code,
               latitude, longitude, feature_code, population
        FROM cities
        WHERE country_code = ?
          AND feature_code IN {SETTLEMENT_FEATURE_CODES}
          AND LOWER(ascii_name) = ?
        ORDER BY population DESC
        LIMIT 1
    '''
    prefix_sql = f'''
        SELECT geonames_id, name, ascii_name, admin1_code, admin2_code,
               latitude, longitude, feature_code, population
        FROM cities
        WHERE country_code = ?
          AND feature_code IN {SETTLEMENT_FEATURE_CODES}
          AND LOWER(ascii_name) LIKE ?
        ORDER BY population DESC
        LIMIT 1
    '''

    cursor = conn.cursor()
    attempts = [
        (exact_sql, translated),
        (exact_sql, normalized),
        (prefix_sql, f'{normalized}%'),
    ]
    row = None
    for sql, needle in attempts:
        cursor.execute(sql, (country, needle))
        row = cursor.fetchone()
        if row:
            break

    if row is None:
        return None

    keys = ('geonames_id', 'name', 'ascii_name', 'admin1_code', 'admin2_code',
            'latitude', 'longitude', 'feature_code', 'population')
    return dict(zip(keys, row))
|
|
|
|
|
|
def get_region_code(country: str, admin1_code: Optional[str], admin2_code: Optional[str] = None) -> str:
    """Convert GeoNames admin codes to an ISO 3166-2 region code.

    Countries present in COUNTRY_ADMIN_MAPS are translated through their
    table (Belgium is keyed on admin2 province codes); unmapped codes
    pass through unchanged. 'XX' marks an unresolvable region.
    """
    mapping = COUNTRY_ADMIN_MAPS.get(country)
    if mapping is None:
        # No translation table for this country: use the raw admin1 code.
        return admin1_code if admin1_code else 'XX'
    if country == 'BE' and admin2_code:
        # Belgian provinces live in admin2; fall back to admin1, then 'XX'.
        return mapping.get(admin2_code, admin1_code or 'XX')
    if not admin1_code:
        return 'XX'
    return mapping.get(admin1_code, admin1_code)
|
|
|
|
|
|
def find_city_in_file(data: Dict) -> Optional[Tuple[str, str]]:
    """Pull a (city, country) pair out of a custodian record dict.

    The country comes from ghcid.location_resolution when present; the
    city comes from the first location entry carrying a non-empty 'city'
    (original_entry.locations is scanned before top-level locations),
    and that entry may also supply the country when still unknown.
    Returns None unless both values were found.
    """
    loc_res = data.get('ghcid', {}).get('location_resolution', {})
    country = loc_res.get('country_code')

    def first_city(locations) -> Optional[str]:
        # Return the first non-empty city, opportunistically filling in
        # the country from the same entry.
        nonlocal country
        for loc in locations:
            if loc.get('city'):
                if not country and 'country' in loc:
                    country = loc['country']
                return loc['city']
        return None

    city = None
    if 'original_entry' in data:
        city = first_city(data['original_entry'].get('locations', []))
    if not city:
        city = first_city(data.get('locations', []))

    if city and country:
        return (city, country)
    return None
|
|
|
|
|
|
def process_file(filepath: Path, conn: sqlite3.Connection, apply: bool) -> bool:
    """Resolve the XX region code of one custodian YAML file.

    Looks up the file's city in GeoNames, derives the ISO region code,
    and — when *apply* is set — rewrites the GHCID, location_resolution,
    history and filename. Returns True when a region was resolved
    (dry run) or applied; False on skip or error.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False

    if not data:
        return False

    # Skip files whose region is already resolved.
    ghcid = data.get('ghcid', {})
    loc_res = ghcid.get('location_resolution', {})
    if loc_res.get('region_code', 'XX') != 'XX':
        return False

    # Find city name
    city_info = find_city_in_file(data)
    if not city_info:
        return False

    city_name, country = city_info
    print(f" City: {city_name} ({country})")

    # Look up in GeoNames
    city_data = lookup_city_region(city_name, country, conn)
    if not city_data:
        print(f" No GeoNames match for '{city_name}'")
        return False

    region_code = get_region_code(country, city_data['admin1_code'], city_data.get('admin2_code'))
    if region_code == 'XX':
        print(f" Could not determine region for admin1={city_data['admin1_code']}")
        return False

    print(f" Found: {city_data['name']} -> Region {region_code}")

    if not apply:
        # Dry run: only report what would change.
        return True

    # Patch the region segment (parts[1]) of the GHCID.
    current = ghcid.get('ghcid_current', '')
    parts = current.split('-')
    if len(parts) < 5:
        print(f" Invalid GHCID format: {current}")
        return False

    old_region = parts[1]
    if old_region != 'XX':
        print(f" Region already set: {old_region}")
        return False

    parts[1] = region_code
    new_ghcid = '-'.join(parts)

    # Update data (additive: existing keys updated, history appended).
    ghcid['ghcid_current'] = new_ghcid
    loc_res['region_code'] = region_code
    loc_res['region_name'] = f"{country}-{region_code}"
    loc_res['geonames_id'] = city_data['geonames_id']
    loc_res['method'] = 'GEONAMES_CITY_LOOKUP'
    loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat()
    ghcid['location_resolution'] = loc_res

    # Add to history
    history = ghcid.get('ghcid_history', [])
    history.append({
        'ghcid': new_ghcid,
        'valid_from': datetime.now(timezone.utc).isoformat(),
        'reason': f'Region resolved via GeoNames city lookup: XX->{region_code} ({city_data["name"]})'
    })
    ghcid['ghcid_history'] = history
    data['ghcid'] = ghcid

    # Calculate new filename
    old_name = filepath.name
    new_name = old_name.replace(f'{country}-XX-', f'{country}-{region_code}-')
    new_path = filepath.parent / new_name

    # Write the updated YAML in place, then rename to the resolved region.
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if new_path != filepath:
        if new_path.exists():
            # Fix: Path.rename() can silently replace an existing file on
            # POSIX, which would destroy another record (violates Rule 5:
            # additive only). Keep the old name and flag for manual review.
            print(f" WARNING: {new_name} already exists; kept {old_name}")
        else:
            filepath.rename(new_path)
            print(f" Renamed: {old_name} -> {new_name}")

    return True
|
|
|
|
|
|
def main():
    """CLI entry point: scan custodian YAMLs with XX regions and resolve them."""
    import argparse
    parser = argparse.ArgumentParser(description='Resolve XX region codes using city names in files')
    parser.add_argument('--limit', type=int, default=100, help='Max files to process')
    parser.add_argument('--apply', action='store_true', help='Apply changes (default: dry run)')
    parser.add_argument('--country', help='Filter by country code')
    args = parser.parse_args()

    print("=" * 70)
    print("REGION RESOLUTION FROM FILE CITY NAMES")
    print("=" * 70)
    print(f"Mode: {'APPLYING CHANGES' if args.apply else 'DRY RUN'}")
    print()

    # Connect to GeoNames
    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    conn = sqlite3.connect(str(GEONAMES_DB))

    # Find XX-region files, optionally restricted to one country prefix.
    xx_files = []
    for f in CUSTODIAN_DIR.glob('*.yaml'):
        if '-XX-' in f.name:
            if args.country and not f.name.startswith(f'{args.country}-'):
                continue
            xx_files.append(f)

    print(f"Found {len(xx_files)} files with XX region codes")

    # Cheap pre-filter: only files that mention a 'city:' key at all,
    # so we avoid a full YAML parse on hopeless files.
    files_with_cities = []
    for f in xx_files:
        try:
            with open(f, 'r', encoding='utf-8') as fp:
                content = fp.read()
            if 'city:' in content:
                files_with_cities.append(f)
        except (OSError, UnicodeDecodeError) as e:
            # Fix: was a bare `except: pass`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid real errors. Stay
            # best-effort for unreadable files, but say so.
            print(f"  Skipping unreadable file {f.name}: {e}")

    print(f"Processing {min(len(files_with_cities), args.limit)} files with city names")
    print()

    resolved = 0
    renamed = 0

    for f in files_with_cities[:args.limit]:
        print(f"Processing {f.name}...")
        if process_file(f, conn, args.apply):
            resolved += 1
            if args.apply:
                renamed += 1

    conn.close()

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {min(len(files_with_cities), args.limit)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
|
|
if __name__ == '__main__':
|
|
main()
|