glam/scripts/build_geonames_db.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

341 lines
11 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Build SQLite database from GeoNames data.
Downloads and processes GeoNames allCountries.txt file to create a fast,
queryable database for global city lookups. Optimized for GLAM institution
data across 60+ countries.
GeoNames Data Format (tab-separated):
0: geonameid - integer id of record
1: name - name of geographical point (utf8)
2: asciiname - name in plain ascii characters
3: alternatenames - comma-separated, ascii names automatically transliterated
4: latitude - in decimal degrees (wgs84)
5: longitude - in decimal degrees (wgs84)
6: feature class - see http://www.geonames.org/export/codes.html
7: feature code - see http://www.geonames.org/export/codes.html
8: country code - ISO-3166 2-letter country code
9: cc2 - alternate country codes
10: admin1 code - fipscode (subject to change to iso code)
11: admin2 code - code for the second administrative division
12: admin3 code - code for third level administrative division
13: admin4 code - code for fourth level administrative division
14: population - bigint
15: elevation - in meters, integer
16: dem - digital elevation model, srtm3 or gtopo30
17: timezone - the timezone id
18: modification date - date of last modification in yyyy-MM-dd format
"""
import argparse
import csv
import sqlite3
import sys
from pathlib import Path
from typing import Dict, Optional, Set
# Populated-place feature codes accepted into the database.
# Full catalogue of GeoNames codes: http://www.geonames.org/export/codes.html
CITY_FEATURE_CODES = {
    'PPL',    # populated place
    'PPLA',   # seat of a first-order administrative division
    'PPLA2',  # seat of a second-order administrative division
    'PPLA3',  # seat of a third-order administrative division
    'PPLA4',  # seat of a fourth-order administrative division
    'PPLC',   # capital of a political entity
    'PPLG',   # seat of government of a political entity
    'PPLS',   # populated places
    'PPLX',   # section of populated place
}
def parse_admin1_codes(admin1_file: Path) -> Dict[str, str]:
    """
    Parse admin1CodesASCII.txt into a province/state name lookup.

    Each line is tab-separated:
        <country_code>.<admin1_code> TAB <name> TAB <ascii_name> TAB <geonameid>
    Example: NL.07  Zuid-Holland  Zuid-Holland  2743698

    Returns:
        Dict mapping "CC.code" (e.g. "NL.07") -> province name.
    """
    names: Dict[str, str] = {}
    with open(admin1_file, 'r', encoding='utf-8') as fh:
        for raw_line in fh:
            fields = raw_line.strip().split('\t')
            # Skip blank or malformed lines that lack a name column.
            if len(fields) < 2:
                continue
            names[fields[0]] = fields[1]
    return names
def build_database(
    input_file: Path,
    admin1_file: Path,
    output_db: Path,
    min_population: int = 0,
    countries: Optional[Set[str]] = None
) -> None:
    """
    Build SQLite database from GeoNames allCountries.txt.

    Produces a ``cities`` table (indexed for name/country/population
    lookups) plus a ``metadata`` table recording how the build was
    filtered. Any pre-existing database at output_db is replaced.

    Args:
        input_file: Path to allCountries.txt
        admin1_file: Path to admin1CodesASCII.txt
        output_db: Path to output SQLite database
        min_population: Minimum population to include (0 = all cities)
        countries: Set of country codes to include (None = all countries)
    """
    print(f"Loading admin1 codes from {admin1_file}...")
    admin1_names = parse_admin1_codes(admin1_file)
    print(f"Loaded {len(admin1_names)} admin1 codes")

    # Rebuild from scratch so rows from a previous run never linger.
    print(f"Creating database at {output_db}...")
    if output_db.exists():
        output_db.unlink()

    conn = sqlite3.connect(output_db)
    try:
        cursor = conn.cursor()
        _create_schema(cursor)

        # Record build provenance. Feature codes are sorted so the stored
        # value is deterministic (set iteration order is not).
        cursor.execute("""
            INSERT INTO metadata (key, value) VALUES
                ('source', 'GeoNames allCountries.txt'),
                ('url', 'http://download.geonames.org/export/dump/'),
                ('feature_codes', ?),
                ('min_population', ?),
                ('build_date', datetime('now'))
        """, (','.join(sorted(CITY_FEATURE_CODES)), str(min_population)))

        print(f"Processing {input_file}...")
        print(f"Filter: feature_codes={CITY_FEATURE_CODES}, min_population={min_population}")
        if countries:
            print(f"Filter: countries={sorted(countries)}")

        total_rows = 0
        inserted_rows = 0
        batch_size = 10000
        batch = []

        with open(input_file, 'r', encoding='utf-8') as f:
            # QUOTE_NONE: GeoNames data is plain tab-separated text;
            # quote characters inside names carry no special meaning.
            reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                total_rows += 1
                if len(row) < 19:
                    continue

                # Apply the cheap string filters BEFORE numeric parsing:
                # the vast majority of GeoNames rows are not populated
                # places, so this skips most int()/float() conversions
                # and avoids crashing on malformed numeric fields in
                # rows we would discard anyway.
                feature_code = row[7]
                if feature_code not in CITY_FEATURE_CODES:
                    continue
                country_code = row[8]
                if countries and country_code not in countries:
                    continue
                population = int(row[14]) if row[14] else 0
                if population < min_population:
                    continue

                admin1_code = row[10]
                # Resolve e.g. "NL.07" -> "Zuid-Holland"; None if unknown.
                admin1_name = admin1_names.get(f"{country_code}.{admin1_code}")

                # Tuple order must match the cities column order.
                batch.append((
                    int(row[0]),                        # geonames_id
                    row[1],                             # name
                    row[2],                             # ascii_name
                    row[3],                             # alternate_names
                    country_code,
                    admin1_code,
                    admin1_name,
                    row[11],                            # admin2_code
                    float(row[4]),                      # latitude
                    float(row[5]),                      # longitude
                    feature_code,
                    population,
                    int(row[15]) if row[15] else None,  # elevation
                    row[17],                            # timezone
                ))

                if len(batch) >= batch_size:
                    inserted_rows += _flush_batch(cursor, batch)
                    batch = []
                    # Commit per batch so a crash mid-build loses at most
                    # one batch of work.
                    conn.commit()
                    if inserted_rows % 100000 == 0:
                        print(f" Processed {total_rows:,} rows, inserted {inserted_rows:,} cities...")

        # Insert whatever remains from the final partial batch.
        if batch:
            inserted_rows += _flush_batch(cursor, batch)
        conn.commit()

        # Gather statistics while the connection is still open.
        cursor.execute("SELECT COUNT(*) FROM cities")
        total_cities = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(DISTINCT country_code) FROM cities")
        total_countries = cursor.fetchone()[0]
        cursor.execute(
            "SELECT country_code, COUNT(*) FROM cities "
            "GROUP BY country_code ORDER BY COUNT(*) DESC LIMIT 10"
        )
        top_countries = cursor.fetchall()

        # Store stats in metadata
        cursor.execute("INSERT INTO metadata (key, value) VALUES ('total_cities', ?)",
                       (str(total_cities),))
        cursor.execute("INSERT INTO metadata (key, value) VALUES ('total_countries', ?)",
                       (str(total_countries),))
        conn.commit()
    finally:
        # Always release the connection, even if parsing/inserting failed.
        conn.close()

    # Print summary (connection is closed; only local variables used here).
    print("\n" + "=" * 60)
    print("DATABASE BUILD COMPLETE")
    print("=" * 60)
    print(f"Input file: {input_file}")
    print(f"Output database: {output_db}")
    print(f"Database size: {output_db.stat().st_size / 1024 / 1024:.1f} MB")
    print(f"Total rows read: {total_rows:,}")
    print(f"Cities inserted: {total_cities:,}")
    print(f"Countries: {total_countries}")
    print("\nTop 10 countries by city count:")
    for country_code, count in top_countries:
        print(f" {country_code}: {count:,} cities")
    print("=" * 60)


def _create_schema(cursor: sqlite3.Cursor) -> None:
    """Create the cities table, its lookup indexes, and the metadata table."""
    cursor.execute("""
        CREATE TABLE cities (
            geonames_id INTEGER PRIMARY KEY,
            name TEXT NOT NULL,
            ascii_name TEXT NOT NULL,
            alternate_names TEXT,
            country_code TEXT NOT NULL,
            admin1_code TEXT,
            admin1_name TEXT,
            admin2_code TEXT,
            latitude REAL NOT NULL,
            longitude REAL NOT NULL,
            feature_code TEXT NOT NULL,
            population INTEGER,
            elevation INTEGER,
            timezone TEXT
        )
    """)
    # Indexes cover the hot lookup paths: (ascii) name + country lookups,
    # per-country scans, and population-ranked queries.
    cursor.execute("CREATE INDEX idx_name_country ON cities(name, country_code)")
    cursor.execute("CREATE INDEX idx_ascii_country ON cities(ascii_name, country_code)")
    cursor.execute("CREATE INDEX idx_country ON cities(country_code)")
    cursor.execute("CREATE INDEX idx_population ON cities(population DESC)")
    cursor.execute("""
        CREATE TABLE metadata (
            key TEXT PRIMARY KEY,
            value TEXT
        )
    """)


def _flush_batch(cursor: sqlite3.Cursor, batch: list) -> int:
    """Bulk-insert one batch of city tuples; return the number inserted."""
    cursor.executemany(
        "INSERT INTO cities VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
        batch
    )
    return len(batch)
def _abort_missing(label: str, path: Path, url: str) -> None:
    """Report a missing required input file to stderr and exit non-zero."""
    print(f"Error: {label} not found: {path}", file=sys.stderr)
    print(f"Download from: {url}", file=sys.stderr)
    sys.exit(1)


def main():
    """Command-line entry point: parse arguments, validate inputs, build the DB."""
    parser = argparse.ArgumentParser(
        description="Build SQLite database from GeoNames data for global GLAM institution lookups"
    )
    parser.add_argument('--input', type=Path,
                        default=Path('data/reference/allCountries.txt'),
                        help='Path to GeoNames allCountries.txt file')
    parser.add_argument('--admin1', type=Path,
                        default=Path('data/reference/admin1CodesASCII.txt'),
                        help='Path to admin1CodesASCII.txt file')
    parser.add_argument('--output', type=Path,
                        default=Path('data/reference/geonames.db'),
                        help='Path to output SQLite database')
    parser.add_argument('--min-population', type=int, default=0,
                        help='Minimum population (0 = include all cities/towns)')
    parser.add_argument('--countries', type=str,
                        help='Comma-separated list of country codes (e.g., NL,US,BR). Default: all countries')
    args = parser.parse_args()

    # Fail fast with download hints if either input file is absent.
    if not args.input.exists():
        _abort_missing('Input file', args.input,
                       'http://download.geonames.org/export/dump/allCountries.zip')
    if not args.admin1.exists():
        _abort_missing('Admin1 file', args.admin1,
                       'http://download.geonames.org/export/dump/admin1CodesASCII.txt')

    # Normalize the optional country filter to an upper-case code set.
    country_filter = None
    if args.countries:
        country_filter = {code.strip().upper() for code in args.countries.split(',')}

    build_database(
        args.input,
        args.admin1,
        args.output,
        args.min_population,
        country_filter
    )
# Standard script entry guard: run the CLI only when executed directly.
if __name__ == '__main__':
    main()