glam/src/glam_extractor/geocoding/geonames_lookup.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

500 lines
17 KiB
Python

"""
GeoNames database lookup for global city data.
Provides fast, offline lookups of city information from the GeoNames database.
Used for GHCID generation and geographic standardization across 247 countries.
Includes city code disambiguation to resolve collisions (e.g., Hardenberg, Harlingen, Haren → HAR).
"""
import sqlite3
import unicodedata
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import List, Optional, TYPE_CHECKING

if TYPE_CHECKING:
    from glam_extractor.geocoding.city_code_disambiguation import CityCodeDisambiguator
from glam_extractor.geocoding.city_code_disambiguation import CityCodeDisambiguator
@dataclass
class CityInfo:
    """Information about a city from GeoNames."""

    # Values are copied column-for-column from a row of the `cities` table.
    geonames_id: int
    name: str
    ascii_name: str
    country_code: str
    admin1_code: Optional[str]
    admin1_name: Optional[str]
    admin2_code: Optional[str]
    latitude: float
    longitude: float
    feature_code: str
    population: Optional[int]
    elevation: Optional[int]
    timezone: Optional[str]

    def get_abbreviation(self) -> str:
        """
        Get the city abbreviation used for GHCID (normally 3 letters).

        The code is derived from ``ascii_name`` after stripping combining
        accents, and is always uppercased. Characters that are not
        alphanumeric (apostrophes, hyphens, spaces, ...) never appear in
        the result.

        Rules, in order:
            1. Multi-word name starting with "The": the article itself
               is the code ("THE").
            2. Multi-word name starting with another article
               (la, el, los, las, le, o, a): first letter of the article
               + first 2 alphanumeric characters of the second word.
            3. Anything else: first 3 alphanumeric characters of the
               whole name.

        Examples:
            Amsterdam        → AMS
            Rotterdam        → ROT
            The Hague        → THE
            's-Hertogenbosch → SHE (skips ' and -)
            São Paulo        → SAO
            La Serena        → LSE (L from La + SE from Serena)
            El Paso          → EPA (E from El + PA from Paso)

        Returns:
            The uppercased code. It can be shorter than 3 characters
            when the name does not contain enough usable characters
            (an empty ``ascii_name`` gives an empty string).
        """
        # Decompose accented letters and drop the combining marks
        # (São → Sao) so only base letters remain.
        decomposed = unicodedata.normalize('NFD', self.ascii_name)
        normalized = ''.join(
            c for c in decomposed if unicodedata.category(c) != 'Mn'
        )

        words = normalized.split()
        articles = ('la', 'el', 'los', 'las', 'le', 'the', 'o', 'a')

        if len(words) > 1 and words[0].lower() in articles:
            if words[0].lower() == 'the':
                # "The Hague" → "THE" (not "THA"): the article itself
                # gives the more intuitive code.
                return words[0][:3].upper()

            # "La Paz" → "LPA": article initial + 2 characters of the
            # next word. Punctuation is filtered first so that a name
            # such as "El 'Ain" cannot leak an apostrophe into the code.
            second_word = ''.join(c for c in words[1] if c.isalnum())
            return (words[0][0] + second_word[:2]).upper()

        # Default: first 3 alphanumeric characters of the whole name,
        # ignoring spaces and punctuation.
        cleaned = ''.join(c for c in normalized if c.isalnum())
        return cleaned[:3].upper()
class GeoNamesDB:
    """
    Fast lookup interface to GeoNames SQLite database.

    Provides city lookups by name and country for global GLAM institutions.
    The instance owns one SQLite connection; release it with ``close()``
    or by using the instance as a context manager.

    Example:
        >>> db = GeoNamesDB()
        >>> city = db.lookup_city("Amsterdam", "NL")
        >>> print(city.get_abbreviation())
        AMS
        >>> print(city.geonames_id)
        2759794
    """

    # Dutch city name mappings (Dutch → GeoNames English)
    DUTCH_CITY_ALIASES = {
        "Den Haag": "The Hague",
        "Den Bosch": "'s-Hertogenbosch",
        "'s Hertogenbosch": "'s-Hertogenbosch",
        "s-Hertogenbosch": "'s-Hertogenbosch",
    }

    # Maximum number of lookup results remembered per instance.
    _LOOKUP_CACHE_SIZE = 10000

    # Sentinel that distinguishes "not cached yet" from a cached ``None``
    # (a city that was looked up and not found).
    _CACHE_MISS = object()

    # Name predicates tried in order by _lookup_city_raw: exact name,
    # then exact ASCII name, then case-insensitive name.
    _NAME_MATCH_CLAUSES = (
        "name = ?",
        "ascii_name = ?",
        "LOWER(name) = LOWER(?)",
    )

    def __init__(self, db_path: Optional[Path] = None, enable_disambiguation: bool = True):
        """
        Initialize GeoNames database connection.

        Args:
            db_path: Path to geonames.db SQLite file (a ``str`` is accepted
                and converted). Defaults to data/reference/geonames.db
            enable_disambiguation: If True, build city code disambiguation
                tables for collision resolution (default: True)

        Raises:
            FileNotFoundError: If the database file does not exist.
        """
        if db_path is None:
            # Default path relative to project root
            project_root = Path(__file__).parent.parent.parent.parent
            db_path = project_root / "data" / "reference" / "geonames.db"
        else:
            db_path = Path(db_path)

        if not db_path.exists():
            raise FileNotFoundError(
                f"GeoNames database not found at {db_path}. "
                "Run scripts/build_geonames_db.py to create it."
            )

        self.db_path = db_path
        self.conn = sqlite3.connect(str(db_path))
        self.conn.row_factory = sqlite3.Row

        # Per-instance lookup cache. A class-level lru_cache on the method
        # would key on `self` and keep every instance (and its connection)
        # alive for as long as the cache entry exists.
        self._lookup_cache: dict = {}

        # Initialize disambiguation
        self.enable_disambiguation = enable_disambiguation
        self._disambiguator: Optional['CityCodeDisambiguator'] = None
        self._disambiguation_built: set = set()  # Countries with built disambiguation

    def __enter__(self) -> "GeoNamesDB":
        """Return self so the database can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the connection on leaving the ``with`` block; never suppresses exceptions."""
        self.close()

    def __del__(self):
        """Close database connection."""
        # `conn` is missing when __init__ raised before connecting.
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()

    @staticmethod
    def _normalize_city_name(city_name: str, country_code: str) -> str:
        """
        Normalize city name for lookup.

        Handles:
            - Trailing/leading whitespace
            - Parenthetical clarifications like "(Ov.)" or "(Groningen)"
            - Dutch city name aliases (Den Haag → The Hague)

        Args:
            city_name: Raw city name from data
            country_code: ISO country code (case-insensitive)

        Returns:
            Normalized city name
        """
        normalized = city_name.strip()

        # Remove parenthetical clarifications: "Hengelo (Ov.)" → "Hengelo"
        if '(' in normalized:
            normalized = normalized.split('(')[0].strip()

        # Apply country-specific aliases. The country code is upper-cased
        # here because every SQL query in this class does the same, so
        # "nl" and "NL" must behave alike.
        if country_code.upper() == "NL":
            normalized = GeoNamesDB.DUTCH_CITY_ALIASES.get(normalized, normalized)

        return normalized

    def lookup_city(
        self,
        city_name: str,
        country_code: str,
        prefer_larger: bool = True
    ) -> Optional["CityInfo"]:
        """
        Look up a city by name and country.

        Args:
            city_name: Name of the city (e.g., "Amsterdam", "Rio de Janeiro")
            country_code: ISO 3166-1 alpha-2 country code (e.g., "NL", "BR")
            prefer_larger: If multiple matches, prefer city with larger
                population. NOTE(review): the queries always order by
                population descending, so this flag currently only takes
                part in the cache key.

        Returns:
            CityInfo if found, None otherwise

        Example:
            >>> db = GeoNamesDB()
            >>> city = db.lookup_city("Rio de Janeiro", "BR")
            >>> print(f"{city.name}: {city.latitude}, {city.longitude}")
            Rio de Janeiro: -22.90278, -43.2075
        """
        normalized_name = self._normalize_city_name(city_name, country_code)
        # Upper-case before the cached call so "nl" and "NL" share one entry.
        return self._lookup_city_raw(normalized_name, country_code.upper(), prefer_larger)

    def _lookup_city_raw(
        self,
        city_name: str,
        country_code: str,
        prefer_larger: bool = True
    ) -> Optional["CityInfo"]:
        """
        Internal cached lookup method (after normalization).

        Results, including ``None`` for "not found", are remembered per
        instance in a least-recently-used cache of at most
        ``_LOOKUP_CACHE_SIZE`` entries.
        """
        key = (city_name, country_code, prefer_larger)
        # setdefault keeps the method usable on instances created without
        # running __init__.
        cache = self.__dict__.setdefault('_lookup_cache', {})

        cached = cache.pop(key, self._CACHE_MISS)
        if cached is not self._CACHE_MISS:
            # Re-insert so the key becomes the most recently used one
            # (dicts preserve insertion order).
            cache[key] = cached
            return cached

        row = None
        for clause in self._NAME_MATCH_CLAUSES:
            # `clause` comes from a fixed class-level tuple, never from
            # caller input; the values themselves are bound parameters.
            row = self.conn.execute(
                f"""
                SELECT * FROM cities
                WHERE {clause} AND country_code = ?
                ORDER BY population DESC NULLS LAST
                LIMIT 1
                """,
                (city_name, country_code.upper()),
            ).fetchone()
            if row is not None:
                break

        result = self._row_to_cityinfo(row) if row is not None else None

        if len(cache) >= self._LOOKUP_CACHE_SIZE:
            # Evict the least recently used entry (first key in the dict).
            del cache[next(iter(cache))]
        cache[key] = result
        return result

    def get_city_abbreviation(
        self,
        city_name: str,
        country_code: str,
        use_disambiguation: bool = True
    ) -> Optional[str]:
        """
        Get city abbreviation (3-4 letters, potentially with numeric suffix).

        If disambiguation is enabled, uses collision-free codes.
        Otherwise, returns standard 3-letter codes (may have collisions).

        Args:
            city_name: Name of the city
            country_code: ISO 3166-1 alpha-2 country code
            use_disambiguation: If True, use disambiguation table (default: True)

        Returns:
            City abbreviation (e.g., "AMS", "HARD", "HARL") or None

        Example:
            >>> db = GeoNamesDB()
            >>> db.get_city_abbreviation("Tokyo", "JP")
            'TOK'
            >>> db.get_city_abbreviation("Hardenberg", "NL")
            'HARD'

            ("HARD" is disambiguated from Harlingen (HARL) and Haren (HARE).)
        """
        if use_disambiguation and self.enable_disambiguation:
            # Ensure disambiguation is built for this country
            self._ensure_disambiguation_built(country_code)

            if self._disambiguator is not None:
                # The table is filled with GeoNames names under the
                # upper-cased country code, so query it the same way.
                # The name as given is tried first; the normalized form
                # ("Den Haag" → "The Hague", "Hengelo (Ov.)" → "Hengelo")
                # is tried only when it differs.
                upper_code = country_code.upper()
                candidates = [city_name]
                normalized_name = self._normalize_city_name(city_name, country_code)
                if normalized_name != city_name:
                    candidates.append(normalized_name)

                for candidate in candidates:
                    disambiguated_code = self._disambiguator.get_code(candidate, upper_code)
                    if disambiguated_code:
                        return disambiguated_code

        # Fall back to standard 3-letter code
        city = self.lookup_city(city_name, country_code)
        return city.get_abbreviation() if city else None

    def _ensure_disambiguation_built(self, country_code: str):
        """
        Ensure disambiguation table is built for a country.

        Lazily builds disambiguation on first request for a country.
        Does nothing when disambiguation is disabled or the country has
        already been processed.

        Args:
            country_code: ISO 3166-1 alpha-2 country code
        """
        if not self.enable_disambiguation:
            return

        country_code = country_code.upper()

        # Already built for this country?
        if country_code in self._disambiguation_built:
            return

        # Initialize disambiguator if needed
        if self._disambiguator is None:
            self._disambiguator = CityCodeDisambiguator()

        # Get all cities in the country (capped at 100000 rows)
        cities = self.get_cities_by_country(country_code, min_population=0, limit=100000)

        for city in cities:
            self._disambiguator.add_city(
                city_name=city.name,
                ascii_name=city.ascii_name,
                geonames_id=city.geonames_id,
                country_code=country_code,
                population=city.population
            )

        # Build disambiguation table
        self._disambiguator.build()

        self._disambiguation_built.add(country_code)

    def get_disambiguation_report(self, country_code: str) -> str:
        """
        Get disambiguation report for a country.

        Shows collision groups and how they were resolved.

        Args:
            country_code: ISO 3166-1 alpha-2 country code

        Returns:
            Formatted report string, or a fixed message when
            disambiguation is not enabled

        Example:
            >>> db = GeoNamesDB()
            >>> print(db.get_disambiguation_report("NL"))
        """
        self._ensure_disambiguation_built(country_code)

        if self._disambiguator is None:
            return "Disambiguation is not enabled"
        return self._disambiguator.get_collision_report(country_code)

    def get_disambiguation_statistics(self, country_code: str) -> dict:
        """
        Get disambiguation statistics for a country.

        Args:
            country_code: ISO 3166-1 alpha-2 country code

        Returns:
            Dictionary with statistics, or ``{'error': ...}`` when
            disambiguation is not enabled

        Example:
            >>> db = GeoNamesDB()
            >>> stats = db.get_disambiguation_statistics("NL")
            >>> print(f"Collision rate: {stats['collision_rate']:.1%}")
        """
        self._ensure_disambiguation_built(country_code)

        if self._disambiguator is None:
            return {'error': 'Disambiguation is not enabled'}
        return self._disambiguator.get_statistics(country_code)

    def search_cities(
        self,
        name_pattern: str,
        country_code: Optional[str] = None,
        limit: int = 10
    ) -> List["CityInfo"]:
        """
        Search for cities matching a name pattern.

        Args:
            name_pattern: City name or pattern (supports % wildcards)
            country_code: Optional country filter
            limit: Maximum results to return

        Returns:
            List of matching cities, ordered by population

        Example:
            >>> db = GeoNamesDB()
            >>> cities = db.search_cities("Amsterdam%", "NL")
            >>> for city in cities:
            ...     print(f"{city.name} (pop: {city.population})")
            Amsterdam (pop: 741636)
            Amsterdam-Zuidoost (pop: 86916)
        """
        if country_code:
            rows = self.conn.execute("""
                SELECT * FROM cities
                WHERE name LIKE ? AND country_code = ?
                ORDER BY population DESC NULLS LAST
                LIMIT ?
            """, (name_pattern, country_code.upper(), limit)).fetchall()
        else:
            rows = self.conn.execute("""
                SELECT * FROM cities
                WHERE name LIKE ?
                ORDER BY population DESC NULLS LAST
                LIMIT ?
            """, (name_pattern, limit)).fetchall()

        return [self._row_to_cityinfo(row) for row in rows]

    def get_cities_by_country(
        self,
        country_code: str,
        min_population: int = 0,
        limit: int = 100
    ) -> List["CityInfo"]:
        """
        Get all cities in a country.

        Cities whose population is unknown (NULL) are always included,
        regardless of ``min_population``.

        Args:
            country_code: ISO 3166-1 alpha-2 country code
            min_population: Minimum population threshold
            limit: Maximum results to return

        Returns:
            List of cities, ordered by population descending

        Example:
            >>> db = GeoNamesDB()
            >>> cities = db.get_cities_by_country("NL", min_population=100000)
            >>> print(f"Found {len(cities)} major Dutch cities")
        """
        rows = self.conn.execute("""
            SELECT * FROM cities
            WHERE country_code = ? AND (population >= ? OR population IS NULL)
            ORDER BY population DESC NULLS LAST
            LIMIT ?
        """, (country_code.upper(), min_population, limit)).fetchall()

        return [self._row_to_cityinfo(row) for row in rows]

    def get_stats(self) -> dict:
        """
        Get database statistics.

        Returns:
            Dictionary with total_cities, total_countries, top_countries
            (up to 10 ``(country_code, count)`` pairs, largest first),
            db_path and db_size_mb.
        """
        total_cities = self.conn.execute(
            "SELECT COUNT(*) FROM cities"
        ).fetchone()[0]

        total_countries = self.conn.execute(
            "SELECT COUNT(DISTINCT country_code) FROM cities"
        ).fetchone()[0]

        top_rows = self.conn.execute(
            "SELECT country_code, COUNT(*) as count FROM cities "
            "GROUP BY country_code ORDER BY count DESC LIMIT 10"
        ).fetchall()
        top_countries = [(row[0], row[1]) for row in top_rows]

        return {
            'total_cities': total_cities,
            'total_countries': total_countries,
            'top_countries': top_countries,
            'db_path': str(self.db_path),
            'db_size_mb': self.db_path.stat().st_size / 1024 / 1024
        }

    def _row_to_cityinfo(self, row: sqlite3.Row) -> "CityInfo":
        """Convert database row to CityInfo object."""
        return CityInfo(
            geonames_id=row['geonames_id'],
            name=row['name'],
            ascii_name=row['ascii_name'],
            country_code=row['country_code'],
            admin1_code=row['admin1_code'],
            admin1_name=row['admin1_name'],
            admin2_code=row['admin2_code'],
            latitude=row['latitude'],
            longitude=row['longitude'],
            feature_code=row['feature_code'],
            population=row['population'],
            elevation=row['elevation'],
            timezone=row['timezone']
        )

    def close(self):
        """Explicitly close the database connection."""
        self.conn.close()