Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
515 lines
19 KiB
Python
515 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Austrian custodian files with city data.
|
|
|
|
Strategy:
|
|
1. Use coordinates for reverse geocoding when available
|
|
2. Extract city names from institution names (Wien, Salzburg, Graz, etc.)
|
|
3. Validate against GeoNames database
|
|
|
|
Usage:
|
|
python scripts/enrich_austrian_cities.py [--dry-run]
|
|
"""
|
|
|
|
import re
|
|
import sqlite3
|
|
import sys
|
|
import unicodedata
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# GeoNames admin1 code -> region abbreviation used as the region segment of
# Austrian GHCIDs (see update_custodian_file).
# NOTE: the official ISO 3166-2:AT subdivision codes are the digits 1-9; the
# values below are the customary letter abbreviations of the federal states.
AUSTRIAN_ADMIN1_MAP = {
    "01": "B",   # Burgenland
    "02": "K",   # Kärnten (Carinthia)
    "03": "NO",  # Niederösterreich (Lower Austria)
    "04": "OO",  # Oberösterreich (Upper Austria)
    "05": "S",   # Salzburg
    "06": "ST",  # Steiermark (Styria)
    "07": "T",   # Tirol (Tyrol)
    "08": "V",   # Vorarlberg
    "09": "W",   # Wien (Vienna)
}
|
|
|
|
# Ordered (regex, city) pairs used to guess a city from an institution name.
# extract_city_from_name() applies them with re.IGNORECASE and returns the
# city of the FIRST pattern that matches, so order matters: explicit city
# names come first, broader regional / institutional hints later.
AUSTRIAN_CITY_PATTERNS = [
    # Major cities
    (r'\bWien\b', 'Wien'),
    (r'\bVienna\b', 'Wien'),
    (r'\bGraz\b', 'Graz'),
    (r'\bLinz\b', 'Linz'),
    (r'\bSalzburg\b', 'Salzburg'),
    (r'\bInnsbruck\b', 'Innsbruck'),
    (r'\bKlagenfurt\b', 'Klagenfurt'),
    (r'\bVillach\b', 'Villach'),
    (r'\bWels\b', 'Wels'),
    (r'\bSt\.?\s*Pölten\b', 'Sankt Pölten'),
    (r'\bSankt\s+Pölten\b', 'Sankt Pölten'),
    (r'\bDornbirn\b', 'Dornbirn'),
    (r'\bWiener\s+Neustadt\b', 'Wiener Neustadt'),
    (r'\bSteyr\b', 'Steyr'),
    (r'\bFeldkirch\b', 'Feldkirch'),
    (r'\bBregenz\b', 'Bregenz'),
    (r'\bLeonding\b', 'Leonding'),
    (r'\bKlosterneuburg\b', 'Klosterneuburg'),
    (r'\bBaden\b', 'Baden'),
    (r'\bLeoben\b', 'Leoben'),
    (r'\bKrems\b', 'Krems an der Donau'),
    (r'\bAmstetten\b', 'Amstetten'),
    (r'\bMödling\b', 'Mödling'),
    (r'\bKapfenberg\b', 'Kapfenberg'),
    (r'\bLustenau\b', 'Lustenau'),
    (r'\bHallein\b', 'Hallein'),
    (r'\bKufstein\b', 'Kufstein'),
    (r'\bTraun\b', 'Traun'),
    (r'\bAnsfelden\b', 'Ansfelden'),
    (r'\bHohenems\b', 'Hohenems'),
    (r'\bSchwechat\b', 'Schwechat'),
    (r'\bBraunau\b', 'Braunau am Inn'),
    (r'\bStockerau\b', 'Stockerau'),
    (r'\bSaalfelden\b', 'Saalfelden am Steinernen Meer'),
    (r'\bTernitz\b', 'Ternitz'),
    (r'\bPerchtoldsdorf\b', 'Perchtoldsdorf'),
    (r'\bEisenstädter?\b', 'Eisenstadt'),
    (r'\bEisenstadt\b', 'Eisenstadt'),
    (r'\bTelfs\b', 'Telfs'),
    (r'\bWolfsberg\b', 'Wolfsberg'),
    (r'\bHard\b', 'Hard'),
    (r'\bKorneuburg\b', 'Korneuburg'),
    (r'\bNeunkirchen\b', 'Neunkirchen'),
    (r'\bRied\b', 'Ried im Innkreis'),
    (r'\bBad\s+Ischl\b', 'Bad Ischl'),
    (r'\bGmunden\b', 'Gmunden'),
    (r'\bWörgl\b', 'Wörgl'),
    (r'\bMelk\b', 'Melk'),
    (r'\bZell\s+am\s+See\b', 'Zell am See'),
    (r'\bMistelbach\b', 'Mistelbach'),
    (r'\bVöcklabruck\b', 'Vöcklabruck'),
    (r'\bMarchtrenk\b', 'Marchtrenk'),
    (r'\bEnns\b', 'Enns'),
    (r'\bBruck\s+an\s+der\s+Mur\b', 'Bruck an der Mur'),
    (r'\bSpittal\b', 'Spittal an der Drau'),
    (r'\bSchwaz\b', 'Schwaz'),
    (r'\bVoitsberg\b', 'Voitsberg'),
    (r'\bRankweil\b', 'Rankweil'),
    (r'\bBad\s+Vöslau\b', 'Bad Vöslau'),
    (r'\bTulln\b', 'Tulln an der Donau'),
    (r'\bGänserndorf\b', 'Gänserndorf'),
    (r'\bHollabrunn\b', 'Hollabrunn'),
    (r'\bLienz\b', 'Lienz'),
    (r'\bHall\s+in\s+Tirol\b', 'Hall in Tirol'),
    (r'\bFeldkirchen\b', 'Feldkirchen in Kärnten'),
    (r'\bZwettl\b', 'Zwettl'),
    (r'\bWaidhofen\b', 'Waidhofen an der Ybbs'),
    (r'\bMattersburg\b', 'Mattersburg'),
    (r'\bOberwart\b', 'Oberwart'),
    (r'\bJudenburg\b', 'Judenburg'),
    (r'\bPöchlarn\b', 'Pöchlarn'),
    (r'\bFranziskanerplatz\b', 'Wien'),  # Common Vienna address
    (r'\bJosefsplatz\b', 'Wien'),  # Hofburg, Vienna

    # Regional references → capital cities
    (r'\bTiroler\b', 'Innsbruck'),  # Amt der Tiroler Landesregierung
    (r'\bBurgenländische\b', 'Eisenstadt'),  # Burgenländische Landesbibliothek
    (r'\bKärnt(?:en|ner)\b', 'Klagenfurt'),  # Kärnten/Kärntner → Klagenfurt
    # NOTE(review): the capital of Vorarlberg is Bregenz; Feldkirch is kept
    # as in the original mapping - confirm this is intentional.
    (r'\bVorarlberg(?:er)?\b', 'Feldkirch'),  # Vorarlberg
    (r'\bSteiermark\b', 'Graz'),  # Steiermark
    # No trailing \b after the stem: "Steiermärk" is always followed by more
    # word characters ("Steiermärkische ..."), so a boundary there never matched.
    (r'\bSteiermärk\w*', 'Graz'),  # Steiermärkisch(e/er/es)
    (r'\bOÖ\b', 'Linz'),  # OÖ = Oberösterreich
    (r'\bOberösterreich\b', 'Linz'),  # Oberösterreich
    (r'\bNiederösterreich\b', 'Sankt Pölten'),  # Niederösterreich
    (r'\bNÖ\b', 'Sankt Pölten'),  # NÖ = Niederösterreich
    (r'\bSalzburg(?:er)?\b', 'Salzburg'),  # Salzburger Festspiele

    # Small towns mentioned in institution names
    (r'\bKaltenleutgeben\b', 'Kaltenleutgeben'),
    (r'\bLambach\b', 'Lambach'),
    (r'\bSeitenstetten\b', 'Seitenstetten'),
    (r'\bMattsee\b', 'Mattsee'),
    (r'\bPöggstall\b', 'Pöggstall'),
    (r'\bLaxenburg\b', 'Laxenburg'),
    (r'\bEggenburg\b', 'Eggenburg'),
    (r'\bPressbaum\b', 'Pressbaum'),
    (r'\bSeeburg\b', 'Seekirchen am Wallersee'),  # Schloss Seeburg
    (r'\bSchotten(?:stift)?\b', 'Wien'),  # Schottenstift is in Vienna
    (r'\bAlbertina\b', 'Wien'),  # Albertina is in Vienna
    (r'\bMozarteum\b', 'Salzburg'),  # Mozarteum is in Salzburg
    (r'\bParacelsus\b', 'Salzburg'),  # Paracelsus Medizinische Privatuniversität
    (r'\bJoanneum\b', 'Graz'),  # FH Joanneum is in Graz
    (r'\bParlament\b', 'Wien'),  # Parlamentsbibliothek
    (r'\bBundeskanzleramt\b', 'Wien'),  # Federal Chancellery
    (r'\bBundesministerium\b', 'Wien'),  # Federal Ministries
    (r'\bBundesdenkmalamt\b', 'Wien'),  # Federal Monument Office
    (r'\bÖsterreich(?:ische[rn]?)?\b', 'Wien'),  # Austrian national institutions
    (r'\bIST\s*Austria\b', 'Klosterneuburg'),  # Institute of Science and Technology Austria
    (r'\bInstitute\s+of\s+Science\s+and\s+Technology\s+Austria\b', 'Klosterneuburg'),  # Full name
    (r'\bRapid(?:eum)?\b', 'Wien'),  # SK Rapid Vienna
    (r'\bMetalab\b', 'Wien'),  # Metalab hackerspace Vienna
    (r'\bSigmund\s+Freud\b', 'Wien'),  # Sigmund Freud museum Vienna
    (r'\bMax\s+Perutz\b', 'Wien'),  # Max Perutz Library (Vienna Biocenter)

    # Additional specific institutions
    (r'\bAnton\s+Bruckner\b', 'Linz'),  # Anton Bruckner Private University
    (r'\bbifeb\b', 'Strobl'),  # Bundesinstitut für Erwachsenenbildung
    (r'\bBundesinstitut\s+für\s+Erwachsenenbildung\b', 'Strobl'),
    (r'\bZeitgenossen\b', 'Krems an der Donau'),  # Archiv der Zeitgenossen
    (r'\bCompass[-\s]Verlag\b', 'Wien'),  # Compass-Verlag
    (r'\bErnst\s+Krenek\b', 'Krems an der Donau'),  # Ernst Krenek Institut
    (r'\bFrauensolidarität\b', 'Wien'),  # Frauensolidarität
    (r'\bGeoSphere\b', 'Wien'),  # GeoSphere Austria
    (r'\bHochschule\s+Burgenland\b', 'Eisenstadt'),  # FH Burgenland
    # "[-\s]+" (one or more) so the usual spelling "Agrar- und" (hyphen
    # followed by a space) matches as well as "Agrar-und" / "Agrar und".
    (r'\bAgrar[-\s]+und\s+Umweltpädagogik\b', 'Wien'),  # Hochschule für Agrar
    (r'\bHochschule\s+für\s+Agrar\b', 'Wien'),  # Hochschule für Agrar (full)
    (r'\bHöhere\s+Studien\b', 'Wien'),  # IHS
    (r'\bInterdisciplinary\s+Transformation\b', 'Wien'),  # ITU
    (r'\bJAM\s+Music\s+Lab\b', 'Wien'),  # JAM Music Lab
    (r'\bKDZ\b', 'Wien'),  # KDZ Zentrum
    (r'\bNew\s+Design\s+University\b', 'Sankt Pölten'),  # NDU
    (r'\bPädagogische\s+Hochschule\s+Tirol\b', 'Innsbruck'),  # PH Tirol
    (r'\bPädagogische\s+Hochschule\s+Burgenland\b', 'Eisenstadt'),  # PPH Burgenland
    (r'\bShared\s+Archiving\b', 'Wien'),  # SAA
    (r'\bVerbund\s+für\s+Bildung\b', 'Wien'),  # VBKV
    (r'\bVilla\s+North\b', 'Wien'),  # Villa North
    (r'\bInformationswissenschaft\b', 'Graz'),  # VFI
    (r'\bErinnerungskultur\b', 'Villach'),  # ZEG is in Villach, not Graz
    (r'\bParlament(?:s)?(?:direktion|bibliothek)?\b', 'Wien'),  # Parlamentsbibliothek
]
|
|
|
|
|
|
def load_source_data(source_file: str) -> dict:
    """Load Austrian source data and index it by ISIL code.

    Args:
        source_file: Path of a YAML file whose top-level ``institutions`` list
            holds entries with ``name``, ``identifiers`` and ``locations``.

    Returns:
        A dict mapping each institution's ISIL code to
        ``{'name': str, 'coords': (latitude, longitude) | None}``.
        Institutions without an ISIL identifier are skipped. An empty YAML
        document yields an empty dict.
    """
    # PyYAML is third-party; imported lazily so the module can be imported
    # (e.g. for its pure helpers) without it.
    import yaml

    with open(source_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    lookup = {}
    # "or []" also covers keys that are present but null in the YAML.
    for inst in data.get('institutions') or []:
        # Value of the first identifier whose scheme is ISIL, if any.
        isil = next(
            (
                ident.get('identifier_value')
                for ident in inst.get('identifiers') or []
                if ident.get('identifier_scheme') == 'ISIL'
            ),
            None,
        )
        if not isil:
            continue

        # Only the first location is considered.
        locs = inst.get('locations') or []
        coords = None
        # Truthiness (not "is not None") is deliberate: it also rejects 0 /
        # empty placeholders, which cannot be valid Austrian coordinates.
        if locs and locs[0].get('latitude') and locs[0].get('longitude'):
            coords = (locs[0]['latitude'], locs[0]['longitude'])

        lookup[isil] = {
            'name': inst.get('name', ''),
            'coords': coords,
        }

    return lookup
|
|
|
|
|
|
def extract_city_from_name(name: str) -> str | None:
    """Guess the city of an Austrian institution from its name.

    The entries of AUSTRIAN_CITY_PATTERNS are tried in list order with a
    case-insensitive search; the city of the first matching pattern is
    returned, or None when no pattern matches.
    """
    candidates = (
        city
        for regex, city in AUSTRIAN_CITY_PATTERNS
        if re.search(regex, name, re.IGNORECASE)
    )
    return next(candidates, None)
|
|
|
|
|
|
def generate_city_code(city_name: str) -> str:
    """Generate a 3-letter city code from a city name.

    Diacritics are stripped (NFD decomposition, combining marks dropped) and
    everything except ASCII letters and whitespace is removed. The code is
    then built from the remaining words:

    * one word    -> its first three letters          ("Wien" -> "WIE")
    * two words   -> first letter + first two letters ("Sankt Pölten" -> "SPO")
    * three or more -> initials of the first three words
      ("Krems an der Donau" -> "KAD")

    Returns:
        The upper-cased code; it may be shorter than three letters for very
        short names and is ``''`` when the name contains no ASCII letters.
    """
    decomposed = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Hyphens are removed together with other punctuation: the code is
    # embedded in a hyphen-delimited GHCID, so a "-" inside it (e.g. "ST-"
    # for "St-Veit", or "M-E" for "Maria - Enzersdorf") would corrupt the
    # identifier. Codes that previously contained no hyphen are unchanged.
    words = re.sub(r'[^a-zA-Z\s]', '', ascii_name).split()

    if not words:
        return ''
    if len(words) == 1:
        code = words[0][:3]
    elif len(words) == 2:
        code = words[0][0] + words[1][:2]
    else:
        code = ''.join(word[0] for word in words[:3])
    return code.upper()
|
|
|
|
|
|
def reverse_geocode(lat: float, lon: float, conn: sqlite3.Connection) -> dict | None:
|
|
"""Reverse geocode coordinates to find nearest Austrian city."""
|
|
cursor = conn.cursor()
|
|
|
|
cursor.execute('''
|
|
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code,
|
|
((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq
|
|
FROM cities
|
|
WHERE country_code = 'AT'
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
ORDER BY distance_sq
|
|
LIMIT 1
|
|
''', (lat, lat, lon, lon))
|
|
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return {
|
|
'name': row[0],
|
|
'ascii_name': row[1],
|
|
'admin1_code': row[2],
|
|
'admin1_name': row[3],
|
|
'latitude': row[4],
|
|
'longitude': row[5],
|
|
'geonames_id': row[6],
|
|
'population': row[7],
|
|
'feature_code': row[8],
|
|
}
|
|
return None
|
|
|
|
|
|
def lookup_city_in_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None:
|
|
"""Look up city in GeoNames database."""
|
|
cursor = conn.cursor()
|
|
|
|
# Try exact match
|
|
cursor.execute('''
|
|
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
|
|
FROM cities
|
|
WHERE country_code = 'AT'
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
|
|
ORDER BY population DESC
|
|
LIMIT 1
|
|
''', (city_name, city_name))
|
|
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return {
|
|
'name': row[0],
|
|
'ascii_name': row[1],
|
|
'admin1_code': row[2],
|
|
'admin1_name': row[3],
|
|
'latitude': row[4],
|
|
'longitude': row[5],
|
|
'geonames_id': row[6],
|
|
'population': row[7],
|
|
'feature_code': row[8],
|
|
}
|
|
|
|
# Try fuzzy match
|
|
cursor.execute('''
|
|
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
|
|
FROM cities
|
|
WHERE country_code = 'AT'
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
AND (LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?))
|
|
ORDER BY population DESC
|
|
LIMIT 1
|
|
''', (f'{city_name}%', f'{city_name}%'))
|
|
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return {
|
|
'name': row[0],
|
|
'ascii_name': row[1],
|
|
'admin1_code': row[2],
|
|
'admin1_name': row[3],
|
|
'latitude': row[4],
|
|
'longitude': row[5],
|
|
'geonames_id': row[6],
|
|
'population': row[7],
|
|
'feature_code': row[8],
|
|
}
|
|
|
|
return None
|
|
|
|
|
|
def _key_column(content: str, key_start: int) -> int:
    """Return the 0-based column of the character at offset ``key_start``."""
    return key_start - (content.rfind('\n', 0, key_start) + 1)


def _scan_child_lines(content: str, start: int, key_column: int) -> tuple[int, str | None]:
    """Find the lines after ``start`` that are indented deeper than ``key_column``.

    Blank lines inside the run are skipped over; scanning stops at the first
    non-blank line indented at or left of ``key_column``.

    Returns:
        ``(end, indent)``: the offset just past the last child line (equal to
        ``start`` if there is none) and the leading whitespace of the first
        child line (None if there is none).
    """
    block_end = start
    child_indent = None
    pos = start
    while pos < len(content):
        newline = content.find('\n', pos)
        line_end = len(content) if newline == -1 else newline + 1
        line = content[pos:line_end]
        if line.strip():
            indent_width = len(line) - len(line.lstrip(' '))
            if indent_width <= key_column:
                break
            if child_indent is None:
                child_indent = line[:indent_width]
            block_end = line_end
        pos = line_end
    return block_end, child_indent


def update_custodian_file(file_path: Path, city_name: str, geo_data: dict, method: str, dry_run: bool = False) -> bool:
    """Update a custodian YAML file with resolved city data.

    The file is edited as text. The GHCID read from ``ghcid_current`` has its
    region and city segments replaced, every occurrence of the old GHCID in
    the file (and in the file name) is swapped for the new one, the children
    of ``location_resolution:`` are rewritten, and a new entry is put at the
    top of ``ghcid_history:``.

    Args:
        file_path: Custodian file to update.
        city_name: City name as resolved by the caller (used in the history
            ``reason`` text).
        geo_data: City record with the keys produced by ``reverse_geocode`` /
            ``lookup_city_in_geonames``.
        method: Name of the resolution method, recorded in the file.
        dry_run: When True, only print the planned rename; nothing is written.

    Returns:
        True if the file was updated (or would be, in a dry run). False if no
        ``ghcid_current`` was found, the GHCID has fewer than five
        ``-``-separated parts, no region or city code could be derived, or the
        GHCID would not change.

    Raises:
        FileExistsError: If the renamed file would overwrite another file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    ghcid_match = re.search(r'ghcid_current:\s*(\S+)', content)
    if not ghcid_match:
        return False
    old_ghcid = ghcid_match.group(1)

    # Unknown admin1 codes fall back to the raw GeoNames code.
    region_code = AUSTRIAN_ADMIN1_MAP.get(geo_data['admin1_code'], geo_data['admin1_code'])
    city_code = generate_city_code(geo_data['ascii_name'] or geo_data['name'])
    # Refuse to build identifiers such as "AT-None-..." or "AT-W--...".
    if not region_code or not city_code:
        return False

    # GHCID layout: country-region-city-type-abbreviation[-suffix...]
    parts = old_ghcid.split('-')
    if len(parts) < 5:
        return False
    type_code = parts[3]
    abbrev_and_suffix = '-'.join(parts[4:])
    new_ghcid = f"AT-{region_code}-{city_code}-{type_code}-{abbrev_and_suffix}"

    if old_ghcid == new_ghcid:
        return False

    old_filename = file_path.name
    new_filename = old_filename.replace(old_ghcid, new_ghcid)
    new_file_path = file_path.parent / new_filename

    # Path.rename() silently replaces an existing target on POSIX.
    if new_file_path != file_path and new_file_path.exists():
        raise FileExistsError(f"target file already exists: {new_file_path}")

    # NOTE(review): this also rewrites the old GHCID inside existing
    # ghcid_history entries - confirm that is intended.
    new_content = content.replace(old_ghcid, new_ghcid)

    # One timestamp so resolution_date and the history entry agree.
    timestamp = datetime.now(timezone.utc).isoformat()

    resolution_match = re.search(r'location_resolution:[ \t]*\n', new_content)
    if resolution_match:
        key_column = _key_column(new_content, resolution_match.start())
        # Replace only the children of location_resolution (lines indented
        # deeper than the key), never the sibling keys that follow it.
        block_end, child_indent = _scan_child_lines(
            new_content, resolution_match.end(), key_column
        )
        if child_indent is None:
            child_indent = ' ' * (key_column + 2)
        fields = [
            ('country_code', 'AT'),
            ('region_code', region_code),
            ('region_name', geo_data['admin1_name']),
            ('city_code', city_code),
            ('city_name', geo_data['name']),
            ('geonames_id', geo_data['geonames_id']),
            ('feature_code', geo_data['feature_code']),
            ('latitude', geo_data['latitude']),
            ('longitude', geo_data['longitude']),
            ('method', method),
            ('resolution_date', f"'{timestamp}'"),
        ]
        new_resolution = 'location_resolution:\n' + ''.join(
            f"{child_indent}{key}: {value}\n" for key, value in fields
        )
        new_content = (
            new_content[:resolution_match.start()]
            + new_resolution
            + new_content[block_end:]
        )

    history_match = re.search(r'ghcid_history:[ \t]*\n', new_content)
    if history_match:
        insert_pos = history_match.end()
        # Reuse the indentation of the existing first list item; with no
        # item to copy from, put the dash at the key's own column.
        first_item = re.compile(r'( *)- ').match(new_content, insert_pos)
        if first_item:
            dash_indent = first_item.group(1)
        else:
            dash_indent = ' ' * _key_column(new_content, history_match.start())
        history_entry = (
            f"{dash_indent}- ghcid: {new_ghcid}\n"
            f"{dash_indent}  valid_from: '{timestamp}'\n"
            f"{dash_indent}  reason: City enrichment from {method} - {city_name} "
            f"resolved to {geo_data['name']} ({region_code})\n"
        )
        new_content = new_content[:insert_pos] + history_entry + new_content[insert_pos:]

    if dry_run:
        print(f" DRY RUN: {old_filename} -> {new_filename}")
        return True

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(new_content)

    if new_file_path != file_path:
        file_path.rename(new_file_path)

    return True
|
|
|
|
|
|
def main():
    """Resolve cities for Austrian custodian files that still carry ``XXX``.

    For every ``AT-*-XXX-*.yaml`` file in ``data/custodian`` the city is
    resolved by reverse geocoding the source coordinates (looked up by ISIL
    code) or, failing that, by matching the institution name against
    AUSTRIAN_CITY_PATTERNS and validating the hit in the GeoNames database.
    Matching files are rewritten unless ``--dry-run`` is on the command line.
    A summary is printed and written as a Markdown report under ``reports/``.

    Exits with status 1 if the source YAML or the GeoNames database is missing.
    """
    dry_run = '--dry-run' in sys.argv

    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    source_file = base_dir / 'data' / 'instances' / 'austria_complete_ch_annotator.yaml'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Austrian City Enrichment Script")
    print("=" * 50)

    if dry_run:
        print("DRY RUN MODE")

    # Fail fast: sqlite3.connect() would otherwise silently create an empty
    # database file and every lookup would fail with "no such table".
    missing = [path for path in (source_file, geonames_db) if not path.exists()]
    if missing:
        for path in missing:
            print(f"ERROR: required input not found: {path}", file=sys.stderr)
        sys.exit(1)

    # Load source data
    print(f"\nLoading source data from {source_file.name}...")
    source_lookup = load_source_data(str(source_file))
    print(f" Found {len(source_lookup)} ISIL entries")

    coords_count = sum(1 for v in source_lookup.values() if v['coords'])
    print(f" {coords_count} entries have coordinates")

    updated = 0
    by_coords = 0
    by_name = 0
    no_city = 0
    errors = 0

    conn = sqlite3.connect(str(geonames_db))
    try:
        print("\nFinding Austrian XXX files...")
        # Sorted so runs are reproducible regardless of directory order.
        xxx_files = sorted(custodian_dir.glob('AT-*-XXX-*.yaml'))
        print(f" Found {len(xxx_files)} files")

        for file_path in xxx_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Find ISIL code
                isil_match = re.search(r'identifier_value:\s*(AT-\w+)', content)
                isil_code = isil_match.group(1) if isil_match else None

                # Get institution name
                name_match = re.search(r'claim_value:\s*(.+)', content)
                inst_name = name_match.group(1).strip() if name_match else ''

                geo_data = None
                method = None
                city_name = None

                # Strategy 1: Use coordinates for reverse geocoding
                if isil_code and isil_code in source_lookup:
                    source_data = source_lookup[isil_code]
                    if source_data['coords']:
                        lat, lon = source_data['coords']
                        geo_data = reverse_geocode(lat, lon, conn)
                        if geo_data:
                            method = 'REVERSE_GEOCODE'
                            city_name = geo_data['name']

                # Strategy 2: Extract city from institution name
                if not geo_data:
                    city_name = extract_city_from_name(inst_name)
                    if city_name:
                        geo_data = lookup_city_in_geonames(city_name, conn)
                        if geo_data:
                            method = 'NAME_EXTRACTION'

                if not geo_data:
                    no_city += 1
                    continue

                if update_custodian_file(file_path, city_name, geo_data, method, dry_run):
                    updated += 1
                    # Count per method only for files actually updated so
                    # the breakdown adds up to "Updated".
                    if method == 'REVERSE_GEOCODE':
                        by_coords += 1
                    else:
                        by_name += 1
                    if not dry_run:
                        print(f" Updated: {file_path.name} -> {city_name} ({method})")

            except Exception as e:
                # Per-file boundary: report the failure and continue.
                errors += 1
                print(f" ERROR: {file_path.name}: {e}")
    finally:
        conn.close()

    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Total XXX files: {len(xxx_files)}")
    print(f"Updated: {updated}")
    print(f" By coordinates: {by_coords}")
    print(f" By name extraction: {by_name}")
    print(f"No city found: {no_city}")
    print(f"Errors: {errors}")
    print(f"Remaining XXX: {len(xxx_files) - updated}")

    # Generate report (timezone-aware, like the timestamps written to files)
    now = datetime.now(timezone.utc)
    timestamp = now.strftime('%Y%m%d_%H%M%S')
    report_path = base_dir / 'reports' / f'AUSTRIAN_CITY_ENRICHMENT_{timestamp}.md'
    # The reports directory may not exist yet on a fresh checkout.
    report_path.parent.mkdir(parents=True, exist_ok=True)

    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# Austrian City Enrichment Report\n\n")
        f.write(f"**Date**: {now.isoformat()}\n")
        f.write(f"**Dry Run**: {dry_run}\n\n")
        f.write("## Summary\n\n")
        f.write("| Metric | Count |\n")
        f.write("|--------|-------|\n")
        f.write(f"| Total XXX files | {len(xxx_files)} |\n")
        f.write(f"| Updated | {updated} |\n")
        f.write(f"| By coordinates | {by_coords} |\n")
        f.write(f"| By name extraction | {by_name} |\n")
        f.write(f"| No city found | {no_city} |\n")
        f.write(f"| Errors | {errors} |\n")
        f.write(f"| Remaining XXX | {len(xxx_files) - updated} |\n")

    print(f"\nReport: {report_path}")
|
|
|
|
|
|
# Run the enrichment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|