#!/usr/bin/env python3 """ Enrich Austrian custodian files with city data. Strategy: 1. Use coordinates for reverse geocoding when available 2. Extract city names from institution names (Wien, Salzburg, Graz, etc.) 3. Validate against GeoNames database Usage: python scripts/enrich_austrian_cities.py [--dry-run] """ import re import sqlite3 import sys import unicodedata from datetime import datetime, timezone from pathlib import Path # Austrian admin1 codes (GeoNames → ISO 3166-2:AT) AUSTRIAN_ADMIN1_MAP = { '01': 'B', # Burgenland '02': 'K', # Carinthia (Kärnten) '03': 'NO', # Lower Austria (Niederösterreich) '04': 'OO', # Upper Austria (Oberösterreich) '05': 'S', # Salzburg '06': 'ST', # Styria (Steiermark) '07': 'T', # Tyrol (Tirol) '08': 'V', # Vorarlberg '09': 'W', # Vienna (Wien) } # Known Austrian cities in institution names AUSTRIAN_CITY_PATTERNS = [ # Major cities (r'\bWien\b', 'Wien'), (r'\bVienna\b', 'Wien'), (r'\bGraz\b', 'Graz'), (r'\bLinz\b', 'Linz'), (r'\bSalzburg\b', 'Salzburg'), (r'\bInnsbruck\b', 'Innsbruck'), (r'\bKlagenfurt\b', 'Klagenfurt'), (r'\bVillach\b', 'Villach'), (r'\bWels\b', 'Wels'), (r'\bSt\.?\s*Pölten\b', 'Sankt Pölten'), (r'\bSankt\s+Pölten\b', 'Sankt Pölten'), (r'\bDornbirn\b', 'Dornbirn'), (r'\bWiener\s+Neustadt\b', 'Wiener Neustadt'), (r'\bSteyr\b', 'Steyr'), (r'\bFeldkirch\b', 'Feldkirch'), (r'\bBregenz\b', 'Bregenz'), (r'\bLeonding\b', 'Leonding'), (r'\bKlosterneuburg\b', 'Klosterneuburg'), (r'\bBaden\b', 'Baden'), (r'\bLeoben\b', 'Leoben'), (r'\bKrems\b', 'Krems an der Donau'), (r'\bAmstetten\b', 'Amstetten'), (r'\bMödling\b', 'Mödling'), (r'\bKapfenberg\b', 'Kapfenberg'), (r'\bLustenau\b', 'Lustenau'), (r'\bHallein\b', 'Hallein'), (r'\bKufstein\b', 'Kufstein'), (r'\bTraun\b', 'Traun'), (r'\bAnsfelden\b', 'Ansfelden'), (r'\bHohenems\b', 'Hohenems'), (r'\bSchwechat\b', 'Schwechat'), (r'\bBraunau\b', 'Braunau am Inn'), (r'\bStockerau\b', 'Stockerau'), (r'\bSaalfelden\b', 'Saalfelden am Steinernen Meer'), (r'\bTernitz\b', 'Ternitz'), (r'\bPerchtoldsdorf\b', 'Perchtoldsdorf'), (r'\bEisenstädter?\b', 'Eisenstadt'), (r'\bEisenstadt\b', 'Eisenstadt'), (r'\bTelfs\b', 'Telfs'), (r'\bWolfsberg\b', 'Wolfsberg'), (r'\bHard\b', 'Hard'), (r'\bKorneuburg\b', 'Korneuburg'), (r'\bNeunkirchen\b', 'Neunkirchen'), (r'\bRied\b', 'Ried im Innkreis'), (r'\bBad\s+Ischl\b', 'Bad Ischl'), (r'\bGmunden\b', 'Gmunden'), (r'\bWörgl\b', 'Wörgl'), (r'\bMelk\b', 'Melk'), (r'\bZell\s+am\s+See\b', 'Zell am See'), (r'\bMistelbach\b', 'Mistelbach'), (r'\bVöcklabruck\b', 'Vöcklabruck'), (r'\bMarchtrenk\b', 'Marchtrenk'), (r'\bEnns\b', 'Enns'), (r'\bBruck\s+an\s+der\s+Mur\b', 'Bruck an der Mur'), (r'\bSpittal\b', 'Spittal an der Drau'), (r'\bSchwaz\b', 'Schwaz'), (r'\bVoitsberg\b', 'Voitsberg'), (r'\bRankweil\b', 'Rankweil'), (r'\bBad\s+Vöslau\b', 'Bad Vöslau'), (r'\bTulln\b', 'Tulln an der Donau'), (r'\bGänserndorf\b', 'Gänserndorf'), (r'\bHollabrunn\b', 'Hollabrunn'), (r'\bLienz\b', 'Lienz'), (r'\bHall\s+in\s+Tirol\b', 'Hall in Tirol'), (r'\bFeldkirchen\b', 'Feldkirchen in Kärnten'), (r'\bZwettl\b', 'Zwettl'), (r'\bWaidhofen\b', 'Waidhofen an der Ybbs'), (r'\bMattersburg\b', 'Mattersburg'), (r'\bOberwart\b', 'Oberwart'), (r'\bJudenburg\b', 'Judenburg'), (r'\bPöchlarn\b', 'Pöchlarn'), (r'\bFranziskanerplatz\b', 'Wien'), # Common Vienna address (r'\bJosefsplatz\b', 'Wien'), # Hofburg, Vienna # Regional references → capital cities (r'\bTiroler\b', 'Innsbruck'), # Amt der Tiroler Landesregierung (r'\bBurgenländische\b', 'Eisenstadt'), # Burgenländische Landesbibliothek (r'\bKärnt(?:en|ner)\b', 'Klagenfurt'), # Kärnten/Kärntner → Klagenfurt (r'\bVorarlberg(?:er)?\b', 'Feldkirch'), # Vorarlberg (r'\bSteiermark\b', 'Graz'), # Steiermark (r'\bSteiermärk\b', 'Graz'), # Steiermärkisch (r'\bOÖ\b', 'Linz'), # OÖ = Oberösterreich (r'\bOberösterreich\b', 'Linz'), # Oberösterreich (r'\bNiederösterreich\b', 'Sankt Pölten'), # Niederösterreich (r'\bNÖ\b', 'Sankt Pölten'), # NÖ = Niederösterreich (r'\bSalzburg(?:er)?\b', 'Salzburg'), # Salzburger Festspiele # Small towns mentioned in institution names (r'\bKaltenleutgeben\b', 'Kaltenleutgeben'), (r'\bLambach\b', 'Lambach'), (r'\bSeitenstetten\b', 'Seitenstetten'), (r'\bMattsee\b', 'Mattsee'), (r'\bPöggstall\b', 'Pöggstall'), (r'\bLaxenburg\b', 'Laxenburg'), (r'\bEggenburg\b', 'Eggenburg'), (r'\bPressbaum\b', 'Pressbaum'), (r'\bSeeburg\b', 'Seekirchen am Wallersee'), # Schloss Seeburg (r'\bSchotten(?:stift)?\b', 'Wien'), # Schottenstift is in Vienna (r'\bAlbertina\b', 'Wien'), # Albertina is in Vienna (r'\bMozarteum\b', 'Salzburg'), # Mozarteum is in Salzburg (r'\bParacelsus\b', 'Salzburg'), # Paracelsus Medizinische Privatuniversität (r'\bJoanneum\b', 'Graz'), # FH Joanneum is in Graz (r'\bParlament\b', 'Wien'), # Parlamentsbibliothek (r'\bBundeskanzleramt\b', 'Wien'), # Federal Chancellery (r'\bBundesministerium\b', 'Wien'), # Federal Ministries (r'\bBundesdenkmalamt\b', 'Wien'), # Federal Monument Office (r'\bÖsterreich(?:ische[rn]?)?\b', 'Wien'), # Austrian national institutions (r'\bIST\s*Austria\b', 'Klosterneuburg'), # Institute of Science and Technology Austria (r'\bInstitute\s+of\s+Science\s+and\s+Technology\s+Austria\b', 'Klosterneuburg'), # Full name (r'\bRapid(?:eum)?\b', 'Wien'), # SK Rapid Vienna (r'\bMetalab\b', 'Wien'), # Metalab hackerspace Vienna (r'\bSigmund\s+Freud\b', 'Wien'), # Sigmund Freud museum Vienna (r'\bMax\s+Perutz\b', 'Wien'), # Max Perutz Library (Vienna Biocenter) # Additional specific institutions (r'\bAnton\s+Bruckner\b', 'Linz'), # Anton Bruckner Private University (r'\bbifeb\b', 'Strobl'), # Bundesinstitut für Erwachsenenbildung (r'\bBundesinstitut\s+für\s+Erwachsenenbildung\b', 'Strobl'), (r'\bZeitgenossen\b', 'Krems an der Donau'), # Archiv der Zeitgenossen (r'\bCompass[-\s]Verlag\b', 'Wien'), # Compass-Verlag (r'\bErnst\s+Krenek\b', 'Krems an der Donau'), # Ernst Krenek Institut (r'\bFrauensolidarität\b', 'Wien'), # Frauensolidarität (r'\bGeoSphere\b', 'Wien'), # GeoSphere Austria (r'\bHochschule\s+Burgenland\b', 'Eisenstadt'), # FH Burgenland (r'\bAgrar[-\s]und\s+Umweltpädagogik\b', 'Wien'), # Hochschule für Agrar (r'\bHochschule\s+für\s+Agrar\b', 'Wien'), # Hochschule für Agrar (full) (r'\bHöhere\s+Studien\b', 'Wien'), # IHS (r'\bInterdisciplinary\s+Transformation\b', 'Wien'), # ITU (r'\bJAM\s+Music\s+Lab\b', 'Wien'), # JAM Music Lab (r'\bKDZ\b', 'Wien'), # KDZ Zentrum (r'\bNew\s+Design\s+University\b', 'Sankt Pölten'), # NDU (r'\bPädagogische\s+Hochschule\s+Tirol\b', 'Innsbruck'), # PH Tirol (r'\bPädagogische\s+Hochschule\s+Burgenland\b', 'Eisenstadt'), # PPH Burgenland (r'\bShared\s+Archiving\b', 'Wien'), # SAA (r'\bVerbund\s+für\s+Bildung\b', 'Wien'), # VBKV (r'\bVilla\s+North\b', 'Wien'), # Villa North (r'\bInformationswissenschaft\b', 'Graz'), # VFI (r'\bErinnerungskultur\b', 'Villach'), # ZEG is in Villach, not Graz (r'\bParlament(?:s)?(?:direktion|bibliothek)?\b', 'Wien'), # Parlamentsbibliothek ] def load_source_data(source_file: str) -> dict: """Load Austrian source data with coordinates and ISIL codes.""" import yaml with open(source_file, 'r', encoding='utf-8') as f: data = yaml.safe_load(f) lookup = {} for inst in data.get('institutions', []): # Get ISIL code isil = None for ident in inst.get('identifiers', []): if ident.get('identifier_scheme') == 'ISIL': isil = ident.get('identifier_value') break if isil: locs = inst.get('locations', []) coords = None if locs and locs[0].get('latitude') and locs[0].get('longitude'): coords = (locs[0]['latitude'], locs[0]['longitude']) lookup[isil] = { 'name': inst.get('name', ''), 'coords': coords, } return lookup def extract_city_from_name(name: str) -> str | None: """Extract city name from Austrian institution name.""" for pattern, city in AUSTRIAN_CITY_PATTERNS: if re.search(pattern, name, re.IGNORECASE): return city return None def generate_city_code(city_name: str) -> str: """Generate 3-letter city code from city name.""" normalized = unicodedata.normalize('NFD', city_name) ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn') clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name) words = clean.split() if len(words) == 1: return words[0][:3].upper() else: if len(words) == 2: return (words[0][0] + words[1][:2]).upper() else: return ''.join(w[0] for w in words[:3]).upper() def reverse_geocode(lat: float, lon: float, conn: sqlite3.Connection) -> dict | None: """Reverse geocode coordinates to find nearest Austrian city.""" cursor = conn.cursor() cursor.execute(''' SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code, ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq FROM cities WHERE country_code = 'AT' AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') ORDER BY distance_sq LIMIT 1 ''', (lat, lat, lon, lon)) row = cursor.fetchone() if row: return { 'name': row[0], 'ascii_name': row[1], 'admin1_code': row[2], 'admin1_name': row[3], 'latitude': row[4], 'longitude': row[5], 'geonames_id': row[6], 'population': row[7], 'feature_code': row[8], } return None def lookup_city_in_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None: """Look up city in GeoNames database.""" cursor = conn.cursor() # Try exact match cursor.execute(''' SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code FROM cities WHERE country_code = 'AT' AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?)) ORDER BY population DESC LIMIT 1 ''', (city_name, city_name)) row = cursor.fetchone() if row: return { 'name': row[0], 'ascii_name': row[1], 'admin1_code': row[2], 'admin1_name': row[3], 'latitude': row[4], 'longitude': row[5], 'geonames_id': row[6], 'population': row[7], 'feature_code': row[8], } # Try fuzzy match cursor.execute(''' SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code FROM cities WHERE country_code = 'AT' AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') AND (LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?)) ORDER BY population DESC LIMIT 1 ''', (f'{city_name}%', f'{city_name}%')) row = cursor.fetchone() if row: return { 'name': row[0], 'ascii_name': row[1], 'admin1_code': row[2], 'admin1_name': row[3], 'latitude': row[4], 'longitude': row[5], 'geonames_id': row[6], 'population': row[7], 'feature_code': row[8], } return None def update_custodian_file(file_path: Path, city_name: str, geo_data: dict, method: str, dry_run: bool = False) -> bool: """Update a custodian file with city data.""" with open(file_path, 'r', encoding='utf-8') as f: content = f.read() ghcid_match = re.search(r'ghcid_current:\s*(\S+)', content) if not ghcid_match: return False old_ghcid = ghcid_match.group(1) region_code = AUSTRIAN_ADMIN1_MAP.get(geo_data['admin1_code'], geo_data['admin1_code']) city_code = generate_city_code(geo_data['ascii_name'] or geo_data['name']) parts = old_ghcid.split('-') if len(parts) >= 5: type_code = parts[3] abbrev_and_suffix = '-'.join(parts[4:]) new_ghcid = f"AT-{region_code}-{city_code}-{type_code}-{abbrev_and_suffix}" else: return False if old_ghcid == new_ghcid: return False old_filename = file_path.name new_filename = old_filename.replace(old_ghcid, new_ghcid) new_file_path = file_path.parent / new_filename new_content = content.replace(old_ghcid, new_ghcid) old_resolution = re.search(r'location_resolution:\s*\n((?:\s+\S.*\n)*)', new_content) if old_resolution: new_resolution = f"""location_resolution: country_code: AT region_code: {region_code} region_name: {geo_data['admin1_name']} city_code: {city_code} city_name: {geo_data['name']} geonames_id: {geo_data['geonames_id']} feature_code: {geo_data['feature_code']} latitude: {geo_data['latitude']} longitude: {geo_data['longitude']} method: {method} resolution_date: '{datetime.now(timezone.utc).isoformat()}' """ new_content = new_content[:old_resolution.start()] + new_resolution + new_content[old_resolution.end():] timestamp = datetime.now(timezone.utc).isoformat() history_entry = f""" - ghcid: {new_ghcid} valid_from: '{timestamp}' reason: City enrichment from {method} - {city_name} resolved to {geo_data['name']} ({region_code}) """ history_match = re.search(r'ghcid_history:\s*\n', new_content) if history_match: insert_pos = history_match.end() new_content = new_content[:insert_pos] + history_entry + new_content[insert_pos:] if dry_run: print(f" DRY RUN: {old_filename} -> {new_filename}") return True with open(file_path, 'w', encoding='utf-8') as f: f.write(new_content) if new_file_path != file_path: file_path.rename(new_file_path) return True def main(): dry_run = '--dry-run' in sys.argv base_dir = Path(__file__).parent.parent custodian_dir = base_dir / 'data' / 'custodian' source_file = base_dir / 'data' / 'instances' / 'austria_complete_ch_annotator.yaml' geonames_db = base_dir / 'data' / 'reference' / 'geonames.db' print("Austrian City Enrichment Script") print("=" * 50) if dry_run: print("DRY RUN MODE") # Load source data print(f"\nLoading source data from {source_file.name}...") source_lookup = load_source_data(str(source_file)) print(f" Found {len(source_lookup)} ISIL entries") coords_count = sum(1 for v in source_lookup.values() if v['coords']) print(f" {coords_count} entries have coordinates") conn = sqlite3.connect(str(geonames_db)) print(f"\nFinding Austrian XXX files...") xxx_files = list(custodian_dir.glob('AT-*-XXX-*.yaml')) print(f" Found {len(xxx_files)} files") updated = 0 by_coords = 0 by_name = 0 no_city = 0 no_geonames = 0 errors = 0 for file_path in xxx_files: try: with open(file_path, 'r', encoding='utf-8') as f: content = f.read() # Find ISIL code isil_match = re.search(r'identifier_value:\s*(AT-\w+)', content) isil_code = isil_match.group(1) if isil_match else None # Get institution name name_match = re.search(r'claim_value:\s*(.+)', content) inst_name = name_match.group(1).strip() if name_match else '' geo_data = None method = None city_name = None # Strategy 1: Use coordinates for reverse geocoding if isil_code and isil_code in source_lookup: source_data = source_lookup[isil_code] if source_data['coords']: lat, lon = source_data['coords'] geo_data = reverse_geocode(lat, lon, conn) if geo_data: method = 'REVERSE_GEOCODE' city_name = geo_data['name'] by_coords += 1 # Strategy 2: Extract city from institution name if not geo_data: city_name = extract_city_from_name(inst_name) if city_name: geo_data = lookup_city_in_geonames(city_name, conn) if geo_data: method = 'NAME_EXTRACTION' by_name += 1 if not geo_data: no_city += 1 continue if update_custodian_file(file_path, city_name, geo_data, method, dry_run): updated += 1 if not dry_run: print(f" Updated: {file_path.name} -> {city_name} ({method})") except Exception as e: errors += 1 print(f" ERROR: {file_path.name}: {e}") conn.close() print("\n" + "=" * 50) print("SUMMARY") print("=" * 50) print(f"Total XXX files: {len(xxx_files)}") print(f"Updated: {updated}") print(f" By coordinates: {by_coords}") print(f" By name extraction: {by_name}") print(f"No city found: {no_city}") print(f"Errors: {errors}") print(f"Remaining XXX: {len(xxx_files) - updated}") # Generate report timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') report_path = base_dir / 'reports' / f'AUSTRIAN_CITY_ENRICHMENT_{timestamp}.md' with open(report_path, 'w') as f: f.write(f"# Austrian City Enrichment Report\n\n") f.write(f"**Date**: {datetime.now().isoformat()}\n") f.write(f"**Dry Run**: {dry_run}\n\n") f.write(f"## Summary\n\n") f.write(f"| Metric | Count |\n") f.write(f"|--------|-------|\n") f.write(f"| Total XXX files | {len(xxx_files)} |\n") f.write(f"| Updated | {updated} |\n") f.write(f"| By coordinates | {by_coords} |\n") f.write(f"| By name extraction | {by_name} |\n") f.write(f"| No city found | {no_city} |\n") f.write(f"| Errors | {errors} |\n") f.write(f"| Remaining XXX | {len(xxx_files) - updated} |\n") print(f"\nReport: {report_path}") if __name__ == '__main__': main()