- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
341 lines
11 KiB
Python
Executable file
#!/usr/bin/env python3
"""
Build SQLite database from GeoNames data.

Downloads and processes GeoNames allCountries.txt file to create a fast,
queryable database for global city lookups. Optimized for GLAM institution
data across 60+ countries.

GeoNames Data Format (tab-separated):
0:  geonameid         - integer id of record
1:  name              - name of geographical point (utf8)
2:  asciiname         - name in plain ascii characters
3:  alternatenames    - comma-separated, ascii names automatically transliterated
4:  latitude          - in decimal degrees (wgs84)
5:  longitude         - in decimal degrees (wgs84)
6:  feature class     - see http://www.geonames.org/export/codes.html
7:  feature code      - see http://www.geonames.org/export/codes.html
8:  country code      - ISO-3166 2-letter country code
9:  cc2               - alternate country codes
10: admin1 code       - fipscode (subject to change to iso code)
11: admin2 code       - code for the second administrative division
12: admin3 code       - code for third level administrative division
13: admin4 code       - code for fourth level administrative division
14: population        - bigint
15: elevation         - in meters, integer
16: dem               - digital elevation model, srtm3 or gtopo30
17: timezone          - the timezone id
18: modification date - date of last modification in yyyy-MM-dd format
"""
|
|
|
|
import argparse
import csv
import sqlite3
import sys
from pathlib import Path
from typing import Dict, Optional, Set
|
|
|
|
|
|
# Feature codes for cities and populated places.
# See: http://www.geonames.org/export/codes.html
#   PPL    populated place
#   PPLA   seat of a first-order administrative division
#   PPLA2  seat of a second-order administrative division
#   PPLA3  seat of a third-order administrative division
#   PPLA4  seat of a fourth-order administrative division
#   PPLC   capital of a political entity
#   PPLG   seat of government of a political entity
#   PPLS   populated places
#   PPLX   section of populated place
CITY_FEATURE_CODES = set(
    'PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLG PPLS PPLX'.split()
)
|
|
|
|
|
|
def parse_admin1_codes(admin1_file: Path) -> Dict[str, str]:
    """
    Parse admin1CodesASCII.txt to get province/state names.

    Format: <country_code>.<admin1_code> TAB <name> TAB <ascii_name> TAB <geonameid>
    Example: NL.07    Zuid-Holland    Zuid-Holland    2743698

    Returns:
        Dict mapping "CC.code" -> province name
    """
    names: Dict[str, str] = {}
    with open(admin1_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            # Lines without at least a code and a name are malformed; skip them.
            if len(fields) < 2:
                continue
            names[fields[0]] = fields[1]  # "NL.07" -> "Zuid-Holland"
    return names
|
|
|
|
|
|
def build_database(
    input_file: Path,
    admin1_file: Path,
    output_db: Path,
    min_population: int = 0,
    countries: Optional[Set[str]] = None
) -> None:
    """
    Build SQLite database from GeoNames allCountries.txt.

    Args:
        input_file: Path to allCountries.txt (tab-separated GeoNames dump)
        admin1_file: Path to admin1CodesASCII.txt (province/state names)
        output_db: Path to output SQLite database (overwritten if it exists)
        min_population: Minimum population to include (0 = all cities)
        countries: Set of country codes to include (None = all countries)
    """
    print(f"Loading admin1 codes from {admin1_file}...")
    admin1_names = parse_admin1_codes(admin1_file)
    print(f"Loaded {len(admin1_names)} admin1 codes")

    # Rebuild from scratch so stale rows from a previous run never survive.
    print(f"Creating database at {output_db}...")
    if output_db.exists():
        output_db.unlink()

    conn = sqlite3.connect(output_db)
    try:
        cursor = conn.cursor()
        _create_schema(cursor, min_population)

        print(f"Processing {input_file}...")
        print(f"Filter: feature_codes={CITY_FEATURE_CODES}, min_population={min_population}")
        if countries:
            print(f"Filter: countries={sorted(countries)}")

        total_rows = _load_cities(
            cursor, conn, input_file, admin1_names, min_population, countries
        )
        conn.commit()

        total_cities, total_countries, top_countries = _collect_and_store_stats(cursor)
        conn.commit()
    finally:
        # Close even on a mid-build failure so the db file isn't left locked.
        conn.close()

    _print_summary(
        input_file, output_db, total_rows, total_cities, total_countries, top_countries
    )


def _create_schema(cursor: sqlite3.Cursor, min_population: int) -> None:
    """Create the cities table, its lookup indexes, and the metadata table."""
    cursor.execute("""
        CREATE TABLE cities (
            geonames_id INTEGER PRIMARY KEY,
            name TEXT NOT NULL,
            ascii_name TEXT NOT NULL,
            alternate_names TEXT,
            country_code TEXT NOT NULL,
            admin1_code TEXT,
            admin1_name TEXT,
            admin2_code TEXT,
            latitude REAL NOT NULL,
            longitude REAL NOT NULL,
            feature_code TEXT NOT NULL,
            population INTEGER,
            elevation INTEGER,
            timezone TEXT
        )
    """)

    # Indexes chosen for the lookup patterns of the city resolver:
    # name+country (exact), ascii name+country (transliterated), per-country
    # scans, and population-ranked results.
    cursor.execute("CREATE INDEX idx_name_country ON cities(name, country_code)")
    cursor.execute("CREATE INDEX idx_ascii_country ON cities(ascii_name, country_code)")
    cursor.execute("CREATE INDEX idx_country ON cities(country_code)")
    cursor.execute("CREATE INDEX idx_population ON cities(population DESC)")

    cursor.execute("""
        CREATE TABLE metadata (
            key TEXT PRIMARY KEY,
            value TEXT
        )
    """)
    cursor.execute("""
        INSERT INTO metadata (key, value) VALUES
        ('source', 'GeoNames allCountries.txt'),
        ('url', 'http://download.geonames.org/export/dump/'),
        ('feature_codes', ?),
        ('min_population', ?),
        ('build_date', datetime('now'))
    """, (','.join(CITY_FEATURE_CODES), str(min_population)))


def _load_cities(
    cursor: sqlite3.Cursor,
    conn: sqlite3.Connection,
    input_file: Path,
    admin1_names: Dict[str, str],
    min_population: int,
    countries: Optional[Set[str]],
) -> int:
    """
    Stream input_file and batch-insert matching city rows into `cities`.

    Returns:
        Total number of rows read from the input file.
    """
    insert_sql = "INSERT INTO cities VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    total_rows = 0
    inserted_rows = 0
    batch_size = 10000
    batch = []

    with open(input_file, 'r', encoding='utf-8') as f:
        # QUOTE_NONE: GeoNames fields may contain quote characters that are
        # plain data, not CSV quoting.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            total_rows += 1

            if len(row) < 19:
                continue

            # Apply cheap string filters BEFORE any numeric parsing: the vast
            # majority of rows are not cities, so this avoids wasted int/float
            # conversions and avoids ValueError on malformed numeric fields in
            # rows we would skip anyway.
            feature_code = row[7]
            if feature_code not in CITY_FEATURE_CODES:
                continue

            country_code = row[8]
            if countries and country_code not in countries:
                continue

            population = int(row[14]) if row[14] else 0
            if population < min_population:
                continue

            admin1_code = row[10]
            admin1_name = admin1_names.get(f"{country_code}.{admin1_code}")

            batch.append((
                int(row[0]),                          # geonames_id
                row[1],                               # name
                row[2],                               # ascii_name
                row[3],                               # alternate_names
                country_code,
                admin1_code,
                admin1_name,
                row[11],                              # admin2_code
                float(row[4]),                        # latitude
                float(row[5]),                        # longitude
                feature_code,
                population,
                int(row[15]) if row[15] else None,    # elevation (may be empty)
                row[17],                              # timezone
            ))

            if len(batch) >= batch_size:
                cursor.executemany(insert_sql, batch)
                inserted_rows += len(batch)
                batch = []

                # Periodic progress report + commit (every 100k inserted rows).
                if inserted_rows % 100000 == 0:
                    print(f"  Processed {total_rows:,} rows, inserted {inserted_rows:,} cities...")
                    conn.commit()

    # Insert remaining partial batch.
    if batch:
        cursor.executemany(insert_sql, batch)
        inserted_rows += len(batch)

    return total_rows


def _collect_and_store_stats(cursor: sqlite3.Cursor):
    """
    Query summary statistics and persist the totals into the metadata table.

    Returns:
        Tuple (total_cities, total_countries, top_countries) where
        top_countries is a list of (country_code, count) for the top 10.
    """
    cursor.execute("SELECT COUNT(*) FROM cities")
    total_cities = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(DISTINCT country_code) FROM cities")
    total_countries = cursor.fetchone()[0]

    cursor.execute(
        "SELECT country_code, COUNT(*) FROM cities "
        "GROUP BY country_code ORDER BY COUNT(*) DESC LIMIT 10"
    )
    top_countries = cursor.fetchall()

    cursor.execute("INSERT INTO metadata (key, value) VALUES ('total_cities', ?)", (str(total_cities),))
    cursor.execute("INSERT INTO metadata (key, value) VALUES ('total_countries', ?)", (str(total_countries),))

    return total_cities, total_countries, top_countries


def _print_summary(
    input_file: Path,
    output_db: Path,
    total_rows: int,
    total_cities: int,
    total_countries: int,
    top_countries,
) -> None:
    """Print the human-readable build summary to stdout."""
    print("\n" + "=" * 60)
    print("DATABASE BUILD COMPLETE")
    print("=" * 60)
    print(f"Input file: {input_file}")
    print(f"Output database: {output_db}")
    print(f"Database size: {output_db.stat().st_size / 1024 / 1024:.1f} MB")
    print(f"Total rows read: {total_rows:,}")
    print(f"Cities inserted: {total_cities:,}")
    print(f"Countries: {total_countries}")
    print("\nTop 10 countries by city count:")
    for country_code, count in top_countries:
        print(f"  {country_code}: {count:,} cities")
    print("=" * 60)
|
|
|
|
|
|
def main():
    """Parse CLI arguments, validate the input files, and build the database."""
    parser = argparse.ArgumentParser(
        description="Build SQLite database from GeoNames data for global GLAM institution lookups"
    )
    parser.add_argument(
        '--input', type=Path,
        default=Path('data/reference/allCountries.txt'),
        help='Path to GeoNames allCountries.txt file',
    )
    parser.add_argument(
        '--admin1', type=Path,
        default=Path('data/reference/admin1CodesASCII.txt'),
        help='Path to admin1CodesASCII.txt file',
    )
    parser.add_argument(
        '--output', type=Path,
        default=Path('data/reference/geonames.db'),
        help='Path to output SQLite database',
    )
    parser.add_argument(
        '--min-population', type=int, default=0,
        help='Minimum population (0 = include all cities/towns)',
    )
    parser.add_argument(
        '--countries', type=str,
        help='Comma-separated list of country codes (e.g., NL,US,BR). Default: all countries',
    )
    args = parser.parse_args()

    # Both data files must exist before we start; fail fast with download hints.
    required_inputs = (
        (args.input, "Input file",
         "http://download.geonames.org/export/dump/allCountries.zip"),
        (args.admin1, "Admin1 file",
         "http://download.geonames.org/export/dump/admin1CodesASCII.txt"),
    )
    for path, label, url in required_inputs:
        if not path.exists():
            print(f"Error: {label} not found: {path}", file=sys.stderr)
            print(f"Download from: {url}", file=sys.stderr)
            sys.exit(1)

    # Normalize the optional country filter to an upper-case code set.
    countries = (
        {c.strip().upper() for c in args.countries.split(',')}
        if args.countries else None
    )

    build_database(
        args.input,
        args.admin1,
        args.output,
        args.min_population,
        countries,
    )


if __name__ == '__main__':
    main()
|