glam/scripts/build_geonames_db.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

341 lines
11 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Build SQLite database from GeoNames data.
Downloads and processes GeoNames allCountries.txt file to create a fast,
queryable database for global city lookups. Optimized for GLAM institution
data across 60+ countries.
GeoNames Data Format (tab-separated):
0: geonameid - integer id of record
1: name - name of geographical point (utf8)
2: asciiname - name in plain ascii characters
3: alternatenames - comma-separated, ascii names automatically transliterated
4: latitude - in decimal degrees (wgs84)
5: longitude - in decimal degrees (wgs84)
6: feature class - see http://www.geonames.org/export/codes.html
7: feature code - see http://www.geonames.org/export/codes.html
8: country code - ISO-3166 2-letter country code
9: cc2 - alternate country codes
10: admin1 code - fipscode (subject to change to iso code)
11: admin2 code - code for the second administrative division
12: admin3 code - code for third level administrative division
13: admin4 code - code for fourth level administrative division
14: population - bigint
15: elevation - in meters, integer
16: dem - digital elevation model, srtm3 or gtopo30
17: timezone - the timezone id
18: modification date - date of last modification in yyyy-MM-dd format
"""
import argparse
import csv
import sqlite3
import sys
from pathlib import Path
from typing import Dict, Optional, Set
# Populated-place feature codes accepted into the database.
# Full catalogue of GeoNames codes: http://www.geonames.org/export/codes.html
CITY_FEATURE_CODES = {
    'PPL',    # populated place
    'PPLA',   # seat of a first-order administrative division
    'PPLA2',  # seat of a second-order administrative division
    'PPLA3',  # seat of a third-order administrative division
    'PPLA4',  # seat of a fourth-order administrative division
    'PPLC',   # capital of a political entity
    'PPLG',   # seat of government of a political entity
    'PPLS',   # populated places
    'PPLX',   # section of populated place
}
def parse_admin1_codes(admin1_file: Path) -> Dict[str, str]:
    """
    Parse admin1CodesASCII.txt into a province/state name lookup.

    Each line is tab-separated:
        <country_code>.<admin1_code> TAB <name> TAB <ascii_name> TAB <geonameid>
    Example: NL.07  Zuid-Holland  Zuid-Holland  2743698

    Returns:
        Dict mapping "CC.code" (e.g. "NL.07") -> province name.
    """
    names: Dict[str, str] = {}
    with open(admin1_file, 'r', encoding='utf-8') as fh:
        for raw_line in fh:
            fields = raw_line.strip().split('\t')
            # Skip blank or malformed lines that lack a name column.
            if len(fields) < 2:
                continue
            names[fields[0]] = fields[1]
    return names
def build_database(
    input_file: Path,
    admin1_file: Path,
    output_db: Path,
    min_population: int = 0,
    countries: Optional[Set[str]] = None
) -> None:
    """
    Build SQLite database from GeoNames allCountries.txt.

    Produces a ``cities`` table (indexed for name/country/population
    lookups) plus a ``metadata`` table recording how the build was
    filtered. Any pre-existing database at output_db is replaced.

    Args:
        input_file: Path to allCountries.txt
        admin1_file: Path to admin1CodesASCII.txt
        output_db: Path to output SQLite database
        min_population: Minimum population to include (0 = all cities)
        countries: Set of country codes to include (None = all countries)
    """
    print(f"Loading admin1 codes from {admin1_file}...")
    admin1_names = parse_admin1_codes(admin1_file)
    print(f"Loaded {len(admin1_names)} admin1 codes")

    # Rebuild from scratch so rows from a previous run never linger.
    print(f"Creating database at {output_db}...")
    if output_db.exists():
        output_db.unlink()

    conn = sqlite3.connect(output_db)
    try:
        cursor = conn.cursor()
        _create_schema(cursor)

        # Record build provenance. Feature codes are sorted so the stored
        # value is deterministic (set iteration order is not).
        cursor.execute("""
            INSERT INTO metadata (key, value) VALUES
                ('source', 'GeoNames allCountries.txt'),
                ('url', 'http://download.geonames.org/export/dump/'),
                ('feature_codes', ?),
                ('min_population', ?),
                ('build_date', datetime('now'))
        """, (','.join(sorted(CITY_FEATURE_CODES)), str(min_population)))

        print(f"Processing {input_file}...")
        print(f"Filter: feature_codes={CITY_FEATURE_CODES}, min_population={min_population}")
        if countries:
            print(f"Filter: countries={sorted(countries)}")

        total_rows = 0
        inserted_rows = 0
        batch_size = 10000
        batch = []

        with open(input_file, 'r', encoding='utf-8') as f:
            # QUOTE_NONE: GeoNames data is plain tab-separated text;
            # quote characters inside names carry no special meaning.
            reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                total_rows += 1
                if len(row) < 19:
                    continue

                # Apply the cheap string filters BEFORE numeric parsing:
                # the vast majority of GeoNames rows are not populated
                # places, so this skips most int()/float() conversions
                # and avoids crashing on malformed numeric fields in
                # rows we would discard anyway.
                feature_code = row[7]
                if feature_code not in CITY_FEATURE_CODES:
                    continue
                country_code = row[8]
                if countries and country_code not in countries:
                    continue
                population = int(row[14]) if row[14] else 0
                if population < min_population:
                    continue

                admin1_code = row[10]
                # Resolve e.g. "NL.07" -> "Zuid-Holland"; None if unknown.
                admin1_name = admin1_names.get(f"{country_code}.{admin1_code}")

                # Tuple order must match the cities column order.
                batch.append((
                    int(row[0]),                        # geonames_id
                    row[1],                             # name
                    row[2],                             # ascii_name
                    row[3],                             # alternate_names
                    country_code,
                    admin1_code,
                    admin1_name,
                    row[11],                            # admin2_code
                    float(row[4]),                      # latitude
                    float(row[5]),                      # longitude
                    feature_code,
                    population,
                    int(row[15]) if row[15] else None,  # elevation
                    row[17],                            # timezone
                ))

                if len(batch) >= batch_size:
                    inserted_rows += _flush_batch(cursor, batch)
                    batch = []
                    # Commit per batch so a crash mid-build loses at most
                    # one batch of work.
                    conn.commit()
                    if inserted_rows % 100000 == 0:
                        print(f" Processed {total_rows:,} rows, inserted {inserted_rows:,} cities...")

        # Insert whatever remains from the final partial batch.
        if batch:
            inserted_rows += _flush_batch(cursor, batch)
        conn.commit()

        # Gather statistics while the connection is still open.
        cursor.execute("SELECT COUNT(*) FROM cities")
        total_cities = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(DISTINCT country_code) FROM cities")
        total_countries = cursor.fetchone()[0]
        cursor.execute(
            "SELECT country_code, COUNT(*) FROM cities "
            "GROUP BY country_code ORDER BY COUNT(*) DESC LIMIT 10"
        )
        top_countries = cursor.fetchall()

        # Store stats in metadata
        cursor.execute("INSERT INTO metadata (key, value) VALUES ('total_cities', ?)",
                       (str(total_cities),))
        cursor.execute("INSERT INTO metadata (key, value) VALUES ('total_countries', ?)",
                       (str(total_countries),))
        conn.commit()
    finally:
        # Always release the connection, even if parsing/inserting failed.
        conn.close()

    # Print summary (connection is closed; only local variables used here).
    print("\n" + "=" * 60)
    print("DATABASE BUILD COMPLETE")
    print("=" * 60)
    print(f"Input file: {input_file}")
    print(f"Output database: {output_db}")
    print(f"Database size: {output_db.stat().st_size / 1024 / 1024:.1f} MB")
    print(f"Total rows read: {total_rows:,}")
    print(f"Cities inserted: {total_cities:,}")
    print(f"Countries: {total_countries}")
    print("\nTop 10 countries by city count:")
    for country_code, count in top_countries:
        print(f" {country_code}: {count:,} cities")
    print("=" * 60)


def _create_schema(cursor: sqlite3.Cursor) -> None:
    """Create the cities table, its lookup indexes, and the metadata table."""
    cursor.execute("""
        CREATE TABLE cities (
            geonames_id INTEGER PRIMARY KEY,
            name TEXT NOT NULL,
            ascii_name TEXT NOT NULL,
            alternate_names TEXT,
            country_code TEXT NOT NULL,
            admin1_code TEXT,
            admin1_name TEXT,
            admin2_code TEXT,
            latitude REAL NOT NULL,
            longitude REAL NOT NULL,
            feature_code TEXT NOT NULL,
            population INTEGER,
            elevation INTEGER,
            timezone TEXT
        )
    """)
    # Indexes cover the hot lookup paths: (ascii) name + country lookups,
    # per-country scans, and population-ranked queries.
    cursor.execute("CREATE INDEX idx_name_country ON cities(name, country_code)")
    cursor.execute("CREATE INDEX idx_ascii_country ON cities(ascii_name, country_code)")
    cursor.execute("CREATE INDEX idx_country ON cities(country_code)")
    cursor.execute("CREATE INDEX idx_population ON cities(population DESC)")
    cursor.execute("""
        CREATE TABLE metadata (
            key TEXT PRIMARY KEY,
            value TEXT
        )
    """)


def _flush_batch(cursor: sqlite3.Cursor, batch: list) -> int:
    """Bulk-insert one batch of city tuples; return the number inserted."""
    cursor.executemany(
        "INSERT INTO cities VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
        batch
    )
    return len(batch)
def _abort_missing(label: str, path: Path, url: str) -> None:
    """Report a missing required input file to stderr and exit non-zero."""
    print(f"Error: {label} not found: {path}", file=sys.stderr)
    print(f"Download from: {url}", file=sys.stderr)
    sys.exit(1)


def main():
    """Command-line entry point: parse arguments, validate inputs, build the DB."""
    parser = argparse.ArgumentParser(
        description="Build SQLite database from GeoNames data for global GLAM institution lookups"
    )
    parser.add_argument('--input', type=Path,
                        default=Path('data/reference/allCountries.txt'),
                        help='Path to GeoNames allCountries.txt file')
    parser.add_argument('--admin1', type=Path,
                        default=Path('data/reference/admin1CodesASCII.txt'),
                        help='Path to admin1CodesASCII.txt file')
    parser.add_argument('--output', type=Path,
                        default=Path('data/reference/geonames.db'),
                        help='Path to output SQLite database')
    parser.add_argument('--min-population', type=int, default=0,
                        help='Minimum population (0 = include all cities/towns)')
    parser.add_argument('--countries', type=str,
                        help='Comma-separated list of country codes (e.g., NL,US,BR). Default: all countries')
    args = parser.parse_args()

    # Fail fast with download hints if either input file is absent.
    if not args.input.exists():
        _abort_missing('Input file', args.input,
                       'http://download.geonames.org/export/dump/allCountries.zip')
    if not args.admin1.exists():
        _abort_missing('Admin1 file', args.admin1,
                       'http://download.geonames.org/export/dump/admin1CodesASCII.txt')

    # Normalize the optional country filter to an upper-case code set.
    country_filter = None
    if args.countries:
        country_filter = {code.strip().upper() for code in args.countries.split(',')}

    build_database(
        args.input,
        args.admin1,
        args.output,
        args.min_population,
        country_filter
    )
# Standard script entry guard: run the CLI only when executed directly.
if __name__ == '__main__':
    main()