glam/scripts/enrich_austrian_cities.py
kempersc e45c1a3c85 feat(scripts): add city enrichment and location resolution utilities
Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
2025-12-07 14:26:59 +01:00

515 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Enrich Austrian custodian files with city data.
Strategy:
1. Use coordinates for reverse geocoding when available
2. Extract city names from institution names (Wien, Salzburg, Graz, etc.)
3. Validate against GeoNames database
Usage:
python scripts/enrich_austrian_cities.py [--dry-run]
"""
import math
import re
import sqlite3
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
# GeoNames admin1 code -> region abbreviation used as the region segment of an
# Austrian GHCID (see update_custodian_file).
# NOTE(review): the previous comment labelled these "ISO 3166-2:AT", but the
# official ISO subdivision codes for Austria are numeric (AT-1 ... AT-9) --
# confirm that this letter scheme is the intended one.
AUSTRIAN_ADMIN1_MAP = {
    "01": "B",   # Burgenland
    "02": "K",   # Kärnten (Carinthia)
    "03": "NO",  # Niederösterreich (Lower Austria)
    "04": "OO",  # Oberösterreich (Upper Austria)
    "05": "S",   # Salzburg
    "06": "ST",  # Steiermark (Styria)
    "07": "T",   # Tirol (Tyrol)
    "08": "V",   # Vorarlberg
    "09": "W",   # Wien (Vienna)
}
# Ordered (regex, city) pairs used to guess a city from an institution name.
# extract_city_from_name() returns the city of the FIRST pattern that matches,
# so order is significant: explicit city names come first, then regional
# adjectives, then institution-specific keywords. Do not reorder casually.
AUSTRIAN_CITY_PATTERNS = [
    # Major cities
    (r'\bWien\b', 'Wien'),
    (r'\bVienna\b', 'Wien'),
    (r'\bGraz\b', 'Graz'),
    (r'\bLinz\b', 'Linz'),
    (r'\bSalzburg\b', 'Salzburg'),
    (r'\bInnsbruck\b', 'Innsbruck'),
    (r'\bKlagenfurt\b', 'Klagenfurt'),
    (r'\bVillach\b', 'Villach'),
    (r'\bWels\b', 'Wels'),
    (r'\bSt\.?\s*Pölten\b', 'Sankt Pölten'),
    (r'\bSankt\s+Pölten\b', 'Sankt Pölten'),
    (r'\bDornbirn\b', 'Dornbirn'),
    (r'\bWiener\s+Neustadt\b', 'Wiener Neustadt'),
    (r'\bSteyr\b', 'Steyr'),
    (r'\bFeldkirch\b', 'Feldkirch'),
    (r'\bBregenz\b', 'Bregenz'),
    (r'\bLeonding\b', 'Leonding'),
    (r'\bKlosterneuburg\b', 'Klosterneuburg'),
    (r'\bBaden\b', 'Baden'),
    (r'\bLeoben\b', 'Leoben'),
    (r'\bKrems\b', 'Krems an der Donau'),
    (r'\bAmstetten\b', 'Amstetten'),
    (r'\bMödling\b', 'Mödling'),
    (r'\bKapfenberg\b', 'Kapfenberg'),
    (r'\bLustenau\b', 'Lustenau'),
    (r'\bHallein\b', 'Hallein'),
    (r'\bKufstein\b', 'Kufstein'),
    (r'\bTraun\b', 'Traun'),
    (r'\bAnsfelden\b', 'Ansfelden'),
    (r'\bHohenems\b', 'Hohenems'),
    (r'\bSchwechat\b', 'Schwechat'),
    (r'\bBraunau\b', 'Braunau am Inn'),
    (r'\bStockerau\b', 'Stockerau'),
    (r'\bSaalfelden\b', 'Saalfelden am Steinernen Meer'),
    (r'\bTernitz\b', 'Ternitz'),
    (r'\bPerchtoldsdorf\b', 'Perchtoldsdorf'),
    (r'\bEisenstädter?\b', 'Eisenstadt'),
    (r'\bEisenstadt\b', 'Eisenstadt'),
    (r'\bTelfs\b', 'Telfs'),
    (r'\bWolfsberg\b', 'Wolfsberg'),
    (r'\bHard\b', 'Hard'),
    (r'\bKorneuburg\b', 'Korneuburg'),
    (r'\bNeunkirchen\b', 'Neunkirchen'),
    (r'\bRied\b', 'Ried im Innkreis'),
    (r'\bBad\s+Ischl\b', 'Bad Ischl'),
    (r'\bGmunden\b', 'Gmunden'),
    (r'\bWörgl\b', 'Wörgl'),
    (r'\bMelk\b', 'Melk'),
    (r'\bZell\s+am\s+See\b', 'Zell am See'),
    (r'\bMistelbach\b', 'Mistelbach'),
    (r'\bVöcklabruck\b', 'Vöcklabruck'),
    (r'\bMarchtrenk\b', 'Marchtrenk'),
    (r'\bEnns\b', 'Enns'),
    (r'\bBruck\s+an\s+der\s+Mur\b', 'Bruck an der Mur'),
    (r'\bSpittal\b', 'Spittal an der Drau'),
    (r'\bSchwaz\b', 'Schwaz'),
    (r'\bVoitsberg\b', 'Voitsberg'),
    (r'\bRankweil\b', 'Rankweil'),
    (r'\bBad\s+Vöslau\b', 'Bad Vöslau'),
    (r'\bTulln\b', 'Tulln an der Donau'),
    (r'\bGänserndorf\b', 'Gänserndorf'),
    (r'\bHollabrunn\b', 'Hollabrunn'),
    (r'\bLienz\b', 'Lienz'),
    (r'\bHall\s+in\s+Tirol\b', 'Hall in Tirol'),
    (r'\bFeldkirchen\b', 'Feldkirchen in Kärnten'),
    (r'\bZwettl\b', 'Zwettl'),
    (r'\bWaidhofen\b', 'Waidhofen an der Ybbs'),
    (r'\bMattersburg\b', 'Mattersburg'),
    (r'\bOberwart\b', 'Oberwart'),
    (r'\bJudenburg\b', 'Judenburg'),
    (r'\bPöchlarn\b', 'Pöchlarn'),
    (r'\bFranziskanerplatz\b', 'Wien'),  # Common Vienna address
    (r'\bJosefsplatz\b', 'Wien'),  # Hofburg, Vienna
    # Regional references → capital cities
    (r'\bTiroler\b', 'Innsbruck'),  # Amt der Tiroler Landesregierung
    (r'\bBurgenländische\b', 'Eisenstadt'),  # Burgenländische Landesbibliothek
    (r'\bKärnt(?:en|ner)\b', 'Klagenfurt'),  # Kärnten/Kärntner → Klagenfurt
    # NOTE(review): the state capital of Vorarlberg is Bregenz; confirm that
    # Feldkirch is the intended default here.
    (r'\bVorarlberg(?:er)?\b', 'Feldkirch'),  # Vorarlberg
    (r'\bSteiermark\b', 'Graz'),  # Steiermark
    # No trailing \b directly after the stem: in "Steiermärkische" the "k" is
    # followed by a word character, so r'\bSteiermärk\b' could never match.
    (r'\bSteiermärk\w*', 'Graz'),  # Steiermärkisch(e/er/es/...)
    (r'\bOÖ\b', 'Linz'),  # OÖ = Oberösterreich
    (r'\bOberösterreich\b', 'Linz'),  # Oberösterreich
    (r'\bNiederösterreich\b', 'Sankt Pölten'),  # Niederösterreich
    (r'\bNÖ\b', 'Sankt Pölten'),  # NÖ = Niederösterreich
    (r'\bSalzburg(?:er)?\b', 'Salzburg'),  # Salzburger Festspiele
    # Small towns mentioned in institution names
    (r'\bKaltenleutgeben\b', 'Kaltenleutgeben'),
    (r'\bLambach\b', 'Lambach'),
    (r'\bSeitenstetten\b', 'Seitenstetten'),
    (r'\bMattsee\b', 'Mattsee'),
    (r'\bPöggstall\b', 'Pöggstall'),
    (r'\bLaxenburg\b', 'Laxenburg'),
    (r'\bEggenburg\b', 'Eggenburg'),
    (r'\bPressbaum\b', 'Pressbaum'),
    (r'\bSeeburg\b', 'Seekirchen am Wallersee'),  # Schloss Seeburg
    (r'\bSchotten(?:stift)?\b', 'Wien'),  # Schottenstift is in Vienna
    (r'\bAlbertina\b', 'Wien'),  # Albertina is in Vienna
    (r'\bMozarteum\b', 'Salzburg'),  # Mozarteum is in Salzburg
    (r'\bParacelsus\b', 'Salzburg'),  # Paracelsus Medizinische Privatuniversität
    (r'\bJoanneum\b', 'Graz'),  # FH Joanneum is in Graz
    (r'\bParlament\b', 'Wien'),  # Parlament as a standalone word
    (r'\bBundeskanzleramt\b', 'Wien'),  # Federal Chancellery
    (r'\bBundesministerium\b', 'Wien'),  # Federal Ministries
    (r'\bBundesdenkmalamt\b', 'Wien'),  # Federal Monument Office
    (r'\bÖsterreich(?:ische[rn]?)?\b', 'Wien'),  # Austrian national institutions
    (r'\bIST\s*Austria\b', 'Klosterneuburg'),  # Institute of Science and Technology Austria
    (r'\bInstitute\s+of\s+Science\s+and\s+Technology\s+Austria\b', 'Klosterneuburg'),  # Full name
    (r'\bRapid(?:eum)?\b', 'Wien'),  # SK Rapid Vienna
    (r'\bMetalab\b', 'Wien'),  # Metalab hackerspace Vienna
    (r'\bSigmund\s+Freud\b', 'Wien'),  # Sigmund Freud museum Vienna
    (r'\bMax\s+Perutz\b', 'Wien'),  # Max Perutz Library (Vienna Biocenter)
    # Additional specific institutions
    (r'\bAnton\s+Bruckner\b', 'Linz'),  # Anton Bruckner Private University
    (r'\bbifeb\b', 'Strobl'),  # Bundesinstitut für Erwachsenenbildung
    (r'\bBundesinstitut\s+für\s+Erwachsenenbildung\b', 'Strobl'),
    (r'\bZeitgenossen\b', 'Krems an der Donau'),  # Archiv der Zeitgenossen
    (r'\bCompass[-\s]Verlag\b', 'Wien'),  # Compass-Verlag
    (r'\bErnst\s+Krenek\b', 'Krems an der Donau'),  # Ernst Krenek Institut
    (r'\bFrauensolidarität\b', 'Wien'),  # Frauensolidarität
    (r'\bGeoSphere\b', 'Wien'),  # GeoSphere Austria
    (r'\bHochschule\s+Burgenland\b', 'Eisenstadt'),  # FH Burgenland
    (r'\bAgrar[-\s]und\s+Umweltpädagogik\b', 'Wien'),  # Hochschule für Agrar
    (r'\bHochschule\s+für\s+Agrar\b', 'Wien'),  # Hochschule für Agrar (full)
    (r'\bHöhere\s+Studien\b', 'Wien'),  # IHS
    (r'\bInterdisciplinary\s+Transformation\b', 'Wien'),  # ITU
    (r'\bJAM\s+Music\s+Lab\b', 'Wien'),  # JAM Music Lab
    (r'\bKDZ\b', 'Wien'),  # KDZ Zentrum
    (r'\bNew\s+Design\s+University\b', 'Sankt Pölten'),  # NDU
    (r'\bPädagogische\s+Hochschule\s+Tirol\b', 'Innsbruck'),  # PH Tirol
    (r'\bPädagogische\s+Hochschule\s+Burgenland\b', 'Eisenstadt'),  # PPH Burgenland
    (r'\bShared\s+Archiving\b', 'Wien'),  # SAA
    (r'\bVerbund\s+für\s+Bildung\b', 'Wien'),  # VBKV
    (r'\bVilla\s+North\b', 'Wien'),  # Villa North
    (r'\bInformationswissenschaft\b', 'Graz'),  # VFI
    (r'\bErinnerungskultur\b', 'Villach'),  # ZEG is in Villach, not Graz
    (r'\bParlament(?:s)?(?:direktion|bibliothek)?\b', 'Wien'),  # Parlamentsbibliothek / Parlamentsdirektion
]
def load_source_data(source_file: str) -> dict:
    """Load Austrian source data and index it by ISIL code.

    Args:
        source_file: Path to a YAML file whose top-level mapping has an
            ``institutions`` list.

    Returns:
        A dict mapping each ISIL code to ``{'name': ..., 'coords': ...}``.
        ``coords`` is the ``(latitude, longitude)`` of the institution's first
        location when both values are present and truthy, otherwise ``None``.
        Institutions without an ISIL identifier value are skipped. An empty
        file yields an empty dict.
    """
    import yaml  # third-party; kept as a function-local import

    with open(source_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; treat that as "no data"
        # instead of crashing on None.get below.
        data = yaml.safe_load(f) or {}

    lookup = {}
    # "or []" also covers keys that are present but explicitly null in the YAML.
    for inst in data.get('institutions') or []:
        # Value of the first identifier whose scheme is ISIL (may be None).
        isil = next(
            (
                ident.get('identifier_value')
                for ident in inst.get('identifiers') or []
                if ident.get('identifier_scheme') == 'ISIL'
            ),
            None,
        )
        if not isil:
            continue

        locs = inst.get('locations') or []
        coords = None
        # Only the first location is consulted.
        if locs and locs[0].get('latitude') and locs[0].get('longitude'):
            coords = (locs[0]['latitude'], locs[0]['longitude'])

        lookup[isil] = {
            'name': inst.get('name', ''),
            'coords': coords,
        }
    return lookup
def extract_city_from_name(name: str) -> str | None:
    """Guess the city of an Austrian institution from its name.

    Walks AUSTRIAN_CITY_PATTERNS in order and returns the city paired with the
    first regex that matches *name* (case-insensitively), or ``None`` when no
    pattern matches.
    """
    candidates = (
        mapped_city
        for regex, mapped_city in AUSTRIAN_CITY_PATTERNS
        if re.search(regex, name, re.IGNORECASE)
    )
    # The generator is lazy, so scanning stops at the first hit.
    return next(candidates, None)
def generate_city_code(city_name: str) -> str:
    """Derive an upper-case city code (normally three letters) from a city name.

    Diacritics are stripped and every character other than ASCII letters,
    whitespace and hyphens is dropped. The code is then built from the
    whitespace-separated words:

    * one word    -> its first three letters
    * two words   -> first letter of the first + first two letters of the second
    * otherwise   -> the initials of the first three words (empty input gives '')
    """
    # NFD splits accented letters into base letter + combining mark ('Mn'),
    # which lets us drop the mark and keep the plain letter.
    decomposed = unicodedata.normalize('NFD', city_name)
    without_marks = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    tokens = re.sub(r'[^a-zA-Z\s-]', '', without_marks).split()

    if len(tokens) == 1:
        code = tokens[0][:3]
    elif len(tokens) == 2:
        code = tokens[0][0] + tokens[1][:2]
    else:
        code = ''.join(token[0] for token in tokens[:3])
    return code.upper()
def reverse_geocode(lat: float, lon: float, conn: sqlite3.Connection) -> dict | None:
"""Reverse geocode coordinates to find nearest Austrian city."""
cursor = conn.cursor()
cursor.execute('''
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code,
((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq
FROM cities
WHERE country_code = 'AT'
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
ORDER BY distance_sq
LIMIT 1
''', (lat, lat, lon, lon))
row = cursor.fetchone()
if row:
return {
'name': row[0],
'ascii_name': row[1],
'admin1_code': row[2],
'admin1_name': row[3],
'latitude': row[4],
'longitude': row[5],
'geonames_id': row[6],
'population': row[7],
'feature_code': row[8],
}
return None
def lookup_city_in_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None:
"""Look up city in GeoNames database."""
cursor = conn.cursor()
# Try exact match
cursor.execute('''
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
FROM cities
WHERE country_code = 'AT'
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
ORDER BY population DESC
LIMIT 1
''', (city_name, city_name))
row = cursor.fetchone()
if row:
return {
'name': row[0],
'ascii_name': row[1],
'admin1_code': row[2],
'admin1_name': row[3],
'latitude': row[4],
'longitude': row[5],
'geonames_id': row[6],
'population': row[7],
'feature_code': row[8],
}
# Try fuzzy match
cursor.execute('''
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
FROM cities
WHERE country_code = 'AT'
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
AND (LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?))
ORDER BY population DESC
LIMIT 1
''', (f'{city_name}%', f'{city_name}%'))
row = cursor.fetchone()
if row:
return {
'name': row[0],
'ascii_name': row[1],
'admin1_code': row[2],
'admin1_name': row[3],
'latitude': row[4],
'longitude': row[5],
'geonames_id': row[6],
'population': row[7],
'feature_code': row[8],
}
return None
def update_custodian_file(file_path: Path, city_name: str, geo_data: dict, method: str, dry_run: bool = False) -> bool:
    """Rewrite a custodian file's GHCID and location data for a resolved city.

    The new GHCID is ``AT-<region>-<city code>-<type>-<rest>``, where type and
    rest are taken from the fourth and later dash-separated parts of the
    current GHCID. Every occurrence of the old GHCID in the file is replaced,
    an existing ``location_resolution:`` block is rewritten, a new entry is
    inserted directly below ``ghcid_history:``, and the file is renamed when
    its name contains the old GHCID.

    Args:
        file_path: Custodian YAML file to update.
        city_name: City name as originally resolved (used in the history reason).
        geo_data: Place record with the keys ``name``, ``ascii_name``,
            ``admin1_code``, ``admin1_name``, ``latitude``, ``longitude``,
            ``geonames_id`` and ``feature_code``.
        method: Label of the resolution method, written into the file.
        dry_run: When true, only print the planned rename; nothing is written.

    Returns:
        ``True`` if the file was updated (or would be, in a dry run). ``False``
        if there is no ``ghcid_current``, the GHCID has fewer than five parts,
        no region/city code can be derived, or the GHCID would not change.

    Raises:
        FileExistsError: If the renamed file would replace a different,
            already existing file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    ghcid_match = re.search(r'ghcid_current:\s*(\S+)', content)
    if not ghcid_match:
        return False
    old_ghcid = ghcid_match.group(1)

    # Unknown admin1 codes are passed through unchanged.
    region_code = AUSTRIAN_ADMIN1_MAP.get(geo_data['admin1_code'], geo_data['admin1_code'])
    city_code = generate_city_code(geo_data['ascii_name'] or geo_data['name'] or '')
    if not region_code or not city_code:
        # Never build an identifier with an empty or "None" component.
        return False

    parts = old_ghcid.split('-')
    if len(parts) < 5:
        return False
    type_code = parts[3]
    abbrev_and_suffix = '-'.join(parts[4:])
    new_ghcid = f"AT-{region_code}-{city_code}-{type_code}-{abbrev_and_suffix}"
    if old_ghcid == new_ghcid:
        return False

    old_filename = file_path.name
    new_filename = old_filename.replace(old_ghcid, new_ghcid)
    new_file_path = file_path.parent / new_filename
    # Path.rename silently replaces an existing file on POSIX; refuse instead
    # of destroying another custodian record that already owns this GHCID.
    if new_file_path != file_path and new_file_path.exists():
        raise FileExistsError(f"target file already exists: {new_filename}")

    # One timestamp for the whole update so both records agree.
    timestamp = datetime.now(timezone.utc).isoformat()

    new_content = content.replace(old_ghcid, new_ghcid)

    # Replace the existing location_resolution block (the key line plus every
    # following line that starts with whitespace), if there is one.
    # NOTE(review): the nested keys in the two templates below are reproduced
    # exactly as this copy of the file shows them (no leading indentation) --
    # confirm against the custodian YAML layout.
    old_resolution = re.search(r'location_resolution:\s*\n((?:\s+\S.*\n)*)', new_content)
    if old_resolution:
        new_resolution = f"""location_resolution:
country_code: AT
region_code: {region_code}
region_name: {geo_data['admin1_name']}
city_code: {city_code}
city_name: {geo_data['name']}
geonames_id: {geo_data['geonames_id']}
feature_code: {geo_data['feature_code']}
latitude: {geo_data['latitude']}
longitude: {geo_data['longitude']}
method: {method}
resolution_date: '{timestamp}'
"""
        new_content = new_content[:old_resolution.start()] + new_resolution + new_content[old_resolution.end():]

    history_entry = f""" - ghcid: {new_ghcid}
valid_from: '{timestamp}'
reason: City enrichment from {method} - {city_name} resolved to {geo_data['name']} ({region_code})
"""
    # The newest entry goes first, directly under the ghcid_history key.
    history_match = re.search(r'ghcid_history:\s*\n', new_content)
    if history_match:
        insert_pos = history_match.end()
        new_content = new_content[:insert_pos] + history_entry + new_content[insert_pos:]

    if dry_run:
        print(f" DRY RUN: {old_filename} -> {new_filename}")
        return True

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(new_content)
    if new_file_path != file_path:
        file_path.rename(new_file_path)
    return True
def main():
    """Resolve cities for all Austrian ``XXX`` custodian files and write a report.

    For every ``AT-*-XXX-*.yaml`` file in ``data/custodian`` the city is
    resolved by reverse geocoding the source coordinates (matched via ISIL
    code) or, failing that, by extracting a city from the institution name and
    validating it against GeoNames. Resolved files are updated through
    update_custodian_file(). Pass ``--dry-run`` on the command line to preview
    without writing custodian files; the Markdown report is written either way.

    Exits with an error message if the GeoNames database file is missing.
    """
    dry_run = '--dry-run' in sys.argv
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    source_file = base_dir / 'data' / 'instances' / 'austria_complete_ch_annotator.yaml'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Austrian City Enrichment Script")
    print("=" * 50)
    if dry_run:
        print("DRY RUN MODE")

    # sqlite3.connect() creates an empty database when the file is missing,
    # after which every lookup fails with "no such table"; fail early instead.
    if not geonames_db.is_file():
        sys.exit(f"ERROR: GeoNames database not found: {geonames_db}")

    # Load source data
    print(f"\nLoading source data from {source_file.name}...")
    source_lookup = load_source_data(str(source_file))
    print(f" Found {len(source_lookup)} ISIL entries")
    coords_count = sum(1 for v in source_lookup.values() if v['coords'])
    print(f" {coords_count} entries have coordinates")

    print("\nFinding Austrian XXX files...")
    xxx_files = list(custodian_dir.glob('AT-*-XXX-*.yaml'))
    print(f" Found {len(xxx_files)} files")

    updated = 0
    by_coords = 0
    by_name = 0
    no_city = 0
    errors = 0

    conn = sqlite3.connect(str(geonames_db))
    try:
        for file_path in xxx_files:
            # Per-file boundary: one broken file must not abort the whole run.
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Find ISIL code
                isil_match = re.search(r'identifier_value:\s*(AT-\w+)', content)
                isil_code = isil_match.group(1) if isil_match else None

                # Get institution name (first claim_value in the file)
                name_match = re.search(r'claim_value:\s*(.+)', content)
                inst_name = name_match.group(1).strip() if name_match else ''

                geo_data = None
                method = None
                city_name = None

                # Strategy 1: Use coordinates for reverse geocoding
                if isil_code and isil_code in source_lookup:
                    source_data = source_lookup[isil_code]
                    if source_data['coords']:
                        lat, lon = source_data['coords']
                        geo_data = reverse_geocode(lat, lon, conn)
                        if geo_data:
                            method = 'REVERSE_GEOCODE'
                            city_name = geo_data['name']

                # Strategy 2: Extract city from institution name
                if not geo_data:
                    city_name = extract_city_from_name(inst_name)
                    if city_name:
                        geo_data = lookup_city_in_geonames(city_name, conn)
                        if geo_data:
                            method = 'NAME_EXTRACTION'

                if not geo_data:
                    no_city += 1
                    continue

                if update_custodian_file(file_path, city_name, geo_data, method, dry_run):
                    updated += 1
                    # Count per method only for files that were really updated,
                    # so the breakdown always adds up to "Updated".
                    if method == 'REVERSE_GEOCODE':
                        by_coords += 1
                    else:
                        by_name += 1
                    if not dry_run:
                        print(f" Updated: {file_path.name} -> {city_name} ({method})")
            except Exception as e:
                errors += 1
                print(f" ERROR: {file_path.name}: {e}")
    finally:
        # Release the database handle even if something unexpected escapes.
        conn.close()

    remaining = len(xxx_files) - updated

    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Total XXX files: {len(xxx_files)}")
    print(f"Updated: {updated}")
    print(f" By coordinates: {by_coords}")
    print(f" By name extraction: {by_name}")
    print(f"No city found: {no_city}")
    print(f"Errors: {errors}")
    print(f"Remaining XXX: {remaining}")

    # Generate report
    now = datetime.now()
    timestamp = now.strftime('%Y%m%d_%H%M%S')
    report_path = base_dir / 'reports' / f'AUSTRIAN_CITY_ENRICHMENT_{timestamp}.md'
    # Do not lose the summary after all the work just because reports/ is missing.
    report_path.parent.mkdir(parents=True, exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# Austrian City Enrichment Report\n\n")
        f.write(f"**Date**: {now.isoformat()}\n")
        f.write(f"**Dry Run**: {dry_run}\n\n")
        f.write("## Summary\n\n")
        f.write("| Metric | Count |\n")
        f.write("|--------|-------|\n")
        f.write(f"| Total XXX files | {len(xxx_files)} |\n")
        f.write(f"| Updated | {updated} |\n")
        f.write(f"| By coordinates | {by_coords} |\n")
        f.write(f"| By name extraction | {by_name} |\n")
        f.write(f"| No city found | {no_city} |\n")
        f.write(f"| Errors | {errors} |\n")
        f.write(f"| Remaining XXX | {remaining} |\n")
    print(f"\nReport: {report_path}")
# Run the enrichment only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()