#!/usr/bin/env python3
"""
Build SQLite database from GeoNames data.

Downloads and processes GeoNames allCountries.txt file to create a fast,
queryable database for global city lookups. Optimized for GLAM institution
data across 60+ countries.

GeoNames Data Format (tab-separated):
     0: geonameid         - integer id of record
     1: name              - name of geographical point (utf8)
     2: asciiname         - name in plain ascii characters
     3: alternatenames    - comma-separated, ascii names automatically transliterated
     4: latitude          - in decimal degrees (wgs84)
     5: longitude         - in decimal degrees (wgs84)
     6: feature class     - see http://www.geonames.org/export/codes.html
     7: feature code      - see http://www.geonames.org/export/codes.html
     8: country code      - ISO-3166 2-letter country code
     9: cc2               - alternate country codes
    10: admin1 code       - fipscode (subject to change to iso code)
    11: admin2 code       - code for the second administrative division
    12: admin3 code       - code for third level administrative division
    13: admin4 code       - code for fourth level administrative division
    14: population        - bigint
    15: elevation         - in meters, integer
    16: dem               - digital elevation model, srtm3 or gtopo30
    17: timezone          - the timezone id
    18: modification date - date of last modification in yyyy-MM-dd format
"""

import argparse
import csv
import sqlite3
import sys
from pathlib import Path
from typing import Dict, Optional, Set

# Feature codes for cities and populated places
# See: http://www.geonames.org/export/codes.html
CITY_FEATURE_CODES = {
    'PPL',    # populated place
    'PPLA',   # seat of a first-order administrative division
    'PPLA2',  # seat of a second-order administrative division
    'PPLA3',  # seat of a third-order administrative division
    'PPLA4',  # seat of a fourth-order administrative division
    'PPLC',   # capital of a political entity
    'PPLG',   # seat of government of a political entity
    'PPLS',   # populated places
    'PPLX',   # section of populated place
}

# A complete GeoNames record has 19 tab-separated columns (indices 0-18).
_EXPECTED_COLUMNS = 19

# Rows are buffered and written with executemany() in chunks of this size.
_BATCH_SIZE = 10000

# A progress line is printed (and the transaction committed) each time the
# number of inserted rows reaches a multiple of this value.
_PROGRESS_INTERVAL = 100000

# One placeholder per column of the ``cities`` table, in table order.
_INSERT_CITY_SQL = (
    "INSERT INTO cities VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
)


def parse_admin1_codes(admin1_file: Path) -> Dict[str, str]:
    """
    Parse admin1CodesASCII.txt to get province/state names.

    Format: <country>.<admin1_code> TAB <name> TAB <ascii_name> TAB <geonameid>
    Example: NL.07    Zuid-Holland    Zuid-Holland    2743698

    Lines with fewer than two tab-separated fields (e.g. blank lines) are
    ignored.

    Args:
        admin1_file: Path to admin1CodesASCII.txt

    Returns:
        Dict mapping "CC.code" -> province name
    """
    admin1_names = {}
    with open(admin1_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                # parts[0] is e.g. "NL.07", parts[1] is e.g. "Zuid-Holland"
                admin1_names[parts[0]] = parts[1]
    return admin1_names


def _create_schema(cursor: sqlite3.Cursor, min_population: int) -> None:
    """Create the ``cities`` and ``metadata`` tables, indexes and build info."""
    cursor.execute("""
        CREATE TABLE cities (
            geonames_id INTEGER PRIMARY KEY,
            name TEXT NOT NULL,
            ascii_name TEXT NOT NULL,
            alternate_names TEXT,
            country_code TEXT NOT NULL,
            admin1_code TEXT,
            admin1_name TEXT,
            admin2_code TEXT,
            latitude REAL NOT NULL,
            longitude REAL NOT NULL,
            feature_code TEXT NOT NULL,
            population INTEGER,
            elevation INTEGER,
            timezone TEXT
        )
    """)
    cursor.execute("""
        CREATE INDEX idx_name_country ON cities(name, country_code)
    """)
    cursor.execute("""
        CREATE INDEX idx_ascii_country ON cities(ascii_name, country_code)
    """)
    cursor.execute("""
        CREATE INDEX idx_country ON cities(country_code)
    """)
    cursor.execute("""
        CREATE INDEX idx_population ON cities(population DESC)
    """)

    cursor.execute("""
        CREATE TABLE metadata (
            key TEXT PRIMARY KEY,
            value TEXT
        )
    """)
    # The feature codes live in a set, whose iteration order varies between
    # interpreter runs; sort them so the stored value is reproducible.
    cursor.execute("""
        INSERT INTO metadata (key, value) VALUES
            ('source', 'GeoNames allCountries.txt'),
            ('url', 'http://download.geonames.org/export/dump/'),
            ('feature_codes', ?),
            ('min_population', ?),
            ('build_date', datetime('now'))
    """, (','.join(sorted(CITY_FEATURE_CODES)), str(min_population)))


def build_database(
    input_file: Path,
    admin1_file: Path,
    output_db: Path,
    min_population: int = 0,
    countries: Optional[Set[str]] = None
) -> None:
    """
    Build SQLite database from GeoNames allCountries.txt.

    Any existing file at ``output_db`` is deleted first, and the parent
    directory is created if it does not exist. Rows with fewer than 19
    columns are skipped. Only rows whose feature code is in
    CITY_FEATURE_CODES, whose country passes the ``countries`` filter and
    whose population is at least ``min_population`` are stored. A summary is
    printed to stdout when the build finishes.

    Args:
        input_file: Path to allCountries.txt
        admin1_file: Path to admin1CodesASCII.txt
        output_db: Path to output SQLite database
        min_population: Minimum population to include (0 = all cities)
        countries: Set of country codes to include (None = all countries)

    Raises:
        ValueError: If a row that passes the feature-code and country filters
            has a non-numeric id, coordinate, population or elevation.
    """
    print(f"Loading admin1 codes from {admin1_file}...")
    admin1_names = parse_admin1_codes(admin1_file)
    print(f"Loaded {len(admin1_names)} admin1 codes")

    # Create database
    print(f"Creating database at {output_db}...")
    # sqlite3.connect() cannot create missing directories.
    output_db.parent.mkdir(parents=True, exist_ok=True)
    if output_db.exists():
        output_db.unlink()

    total_rows = 0
    inserted_rows = 0
    batch = []

    conn = sqlite3.connect(output_db)
    try:
        cursor = conn.cursor()
        _create_schema(cursor, min_population)

        # Process input file
        print(f"Processing {input_file}...")
        print(f"Filter: feature_codes={CITY_FEATURE_CODES}, min_population={min_population}")
        if countries:
            print(f"Filter: countries={sorted(countries)}")

        # newline='' is what the csv module expects of the files it reads;
        # QUOTE_NONE because GeoNames fields may contain literal quote marks.
        with open(input_file, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

            for row in reader:
                total_rows += 1

                if len(row) < _EXPECTED_COLUMNS:
                    continue

                # Apply the string-only filters before any numeric parsing:
                # most rows are not populated places, so this skips their
                # conversions and keeps malformed non-city rows from
                # aborting the build.
                feature_code = row[7]
                if feature_code not in CITY_FEATURE_CODES:
                    continue

                country_code = row[8]
                if countries and country_code not in countries:
                    continue

                population = int(row[14]) if row[14] else 0
                if population < min_population:
                    continue

                admin1_code = row[10]
                admin1_name = admin1_names.get(f"{country_code}.{admin1_code}")

                batch.append((
                    int(row[0]),                         # geonames_id
                    row[1],                              # name
                    row[2],                              # ascii_name
                    row[3],                              # alternate_names
                    country_code,
                    admin1_code,
                    admin1_name,
                    row[11],                             # admin2_code
                    float(row[4]),                       # latitude
                    float(row[5]),                       # longitude
                    feature_code,
                    population,
                    int(row[15]) if row[15] else None,   # elevation
                    row[17],                             # timezone
                ))

                if len(batch) >= _BATCH_SIZE:
                    cursor.executemany(_INSERT_CITY_SQL, batch)
                    inserted_rows += len(batch)
                    batch = []

                    if inserted_rows % _PROGRESS_INTERVAL == 0:
                        print(f" Processed {total_rows:,} rows, inserted {inserted_rows:,} cities...")
                        conn.commit()

        # Insert remaining batch
        if batch:
            cursor.executemany(_INSERT_CITY_SQL, batch)
            inserted_rows += len(batch)

        conn.commit()

        # Get database statistics
        cursor.execute("SELECT COUNT(*) FROM cities")
        total_cities = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(DISTINCT country_code) FROM cities")
        total_countries = cursor.fetchone()[0]

        cursor.execute(
            "SELECT country_code, COUNT(*) FROM cities "
            "GROUP BY country_code ORDER BY COUNT(*) DESC LIMIT 10"
        )
        top_countries = cursor.fetchall()

        # Store stats in metadata
        cursor.execute(
            "INSERT INTO metadata (key, value) VALUES ('total_cities', ?)",
            (str(total_cities),),
        )
        cursor.execute(
            "INSERT INTO metadata (key, value) VALUES ('total_countries', ?)",
            (str(total_countries),),
        )
        conn.commit()
    finally:
        # Release the database file even when parsing or inserting fails.
        conn.close()

    # Print summary
    print("\n" + "=" * 60)
    print("DATABASE BUILD COMPLETE")
    print("=" * 60)
    print(f"Input file: {input_file}")
    print(f"Output database: {output_db}")
    print(f"Database size: {output_db.stat().st_size / 1024 / 1024:.1f} MB")
    print(f"Total rows read: {total_rows:,}")
    print(f"Cities inserted: {total_cities:,}")
    print(f"Countries: {total_countries}")
    print("\nTop 10 countries by city count:")
    for country_code, count in top_countries:
        print(f" {country_code}: {count:,} cities")
    print("=" * 60)


def main():
    """Parse command-line arguments, validate the input files and build the database."""
    parser = argparse.ArgumentParser(
        description="Build SQLite database from GeoNames data for global GLAM institution lookups"
    )
    parser.add_argument(
        '--input',
        type=Path,
        default=Path('data/reference/allCountries.txt'),
        help='Path to GeoNames allCountries.txt file'
    )
    parser.add_argument(
        '--admin1',
        type=Path,
        default=Path('data/reference/admin1CodesASCII.txt'),
        help='Path to admin1CodesASCII.txt file'
    )
    parser.add_argument(
        '--output',
        type=Path,
        default=Path('data/reference/geonames.db'),
        help='Path to output SQLite database'
    )
    parser.add_argument(
        '--min-population',
        type=int,
        default=0,
        help='Minimum population (0 = include all cities/towns)'
    )
    parser.add_argument(
        '--countries',
        type=str,
        help='Comma-separated list of country codes (e.g., NL,US,BR). Default: all countries'
    )

    args = parser.parse_args()

    # Validate input files
    if not args.input.exists():
        print(f"Error: Input file not found: {args.input}", file=sys.stderr)
        print("Download from: http://download.geonames.org/export/dump/allCountries.zip", file=sys.stderr)
        sys.exit(1)

    if not args.admin1.exists():
        print(f"Error: Admin1 file not found: {args.admin1}", file=sys.stderr)
        print("Download from: http://download.geonames.org/export/dump/admin1CodesASCII.txt", file=sys.stderr)
        sys.exit(1)

    # Parse countries filter. Empty entries (e.g. from a trailing comma) are
    # dropped so they cannot end up in the filter set as '' and silently
    # exclude every row; a list with no codes at all means "no filter".
    countries = None
    if args.countries:
        codes = {c.strip().upper() for c in args.countries.split(',') if c.strip()}
        countries = codes or None

    # Build database
    build_database(
        args.input,
        args.admin1,
        args.output,
        args.min_population,
        countries
    )


if __name__ == '__main__':
    main()