glam/tests/geocoding/test_geonames_lookup.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

494 lines
18 KiB
Python

"""Comprehensive tests for GeoNames lookup functionality."""
import pytest
from pathlib import Path
from glam_extractor.geocoding.geonames_lookup import (
GeoNamesDB,
CityInfo,
)
class TestCityInfo:
    """Checks for ``CityInfo.get_abbreviation`` on hand-built records."""

    def test_get_abbreviation_simple(self):
        """Amsterdam, a plain single-word name, must abbreviate to AMS."""
        fields = {
            "geonames_id": 2759794,
            "name": "Amsterdam",
            "ascii_name": "Amsterdam",
            "country_code": "NL",
            "admin1_code": "NH",
            "admin1_name": "North Holland",
            "admin2_code": None,
            "latitude": 52.37403,
            "longitude": 4.88969,
            "feature_code": "PPLC",
            "population": 741636,
            "elevation": 13,
            "timezone": "Europe/Amsterdam",
        }
        amsterdam = CityInfo(**fields)

        abbreviation = amsterdam.get_abbreviation()
        assert abbreviation == "AMS"

    def test_get_abbreviation_with_space(self):
        """The Hague, a name containing a space, must abbreviate to THE."""
        fields = {
            "geonames_id": 2747373,
            "name": "The Hague",
            "ascii_name": "The Hague",
            "country_code": "NL",
            "admin1_code": "ZH",
            "admin1_name": "South Holland",
            "admin2_code": None,
            "latitude": 52.07667,
            "longitude": 4.29861,
            "feature_code": "PPLA",
            "population": 474292,
            "elevation": 1,
            "timezone": "Europe/Amsterdam",
        }
        the_hague = CityInfo(**fields)

        abbreviation = the_hague.get_abbreviation()
        assert abbreviation == "THE"

    def test_get_abbreviation_with_apostrophe_and_hyphen(self):
        """'s-Hertogenbosch must abbreviate to SHE (punctuation ignored)."""
        fields = {
            "geonames_id": 2747351,
            "name": "'s-Hertogenbosch",
            "ascii_name": "'s-Hertogenbosch",
            "country_code": "NL",
            "admin1_code": "NB",
            "admin1_name": "North Brabant",
            "admin2_code": None,
            "latitude": 51.69917,
            "longitude": 5.30417,
            "feature_code": "PPLA",
            "population": 134520,
            "elevation": 7,
            "timezone": "Europe/Amsterdam",
        }
        den_bosch = CityInfo(**fields)

        # The leading apostrophe and the hyphen are expected to be skipped,
        # leaving the letters S, H, E as the first three.
        abbreviation = den_bosch.get_abbreviation()
        assert abbreviation == "SHE"

    def test_get_abbreviation_accented_chars(self):
        """São Paulo (ASCII name "Sao Paulo") must abbreviate to SAO."""
        fields = {
            "geonames_id": 3448439,
            "name": "São Paulo",
            "ascii_name": "Sao Paulo",
            "country_code": "BR",
            "admin1_code": "27",
            "admin1_name": "São Paulo",
            "admin2_code": None,
            "latitude": -23.5475,
            "longitude": -46.63611,
            "feature_code": "PPLA",
            "population": 10021295,
            "elevation": 760,
            "timezone": "America/Sao_Paulo",
        }
        sao_paulo = CityInfo(**fields)

        abbreviation = sao_paulo.get_abbreviation()
        assert abbreviation == "SAO"
class TestGeoNamesDB:
    """Test GeoNamesDB database interface."""

    @pytest.fixture
    def db(self):
        """Yield a GeoNamesDB instance and close it after the test.

        The instance is created with no arguments, so it uses the default
        path (data/reference/geonames.db).
        """
        database = GeoNamesDB()
        yield database
        # Release the connection explicitly rather than leaving it to
        # garbage collection; pytest runs this even when the test fails.
        database.close()

    def test_init_default_path(self, db):
        """Test database initialization with default path."""
        assert db.conn is not None
        assert db.db_path.exists()

    def test_init_invalid_path(self):
        """Test database initialization with invalid path raises error."""
        with pytest.raises(FileNotFoundError):
            GeoNamesDB(db_path=Path("/nonexistent/path.db"))

    def test_lookup_city_simple(self, db):
        """Test basic city lookup."""
        city = db.lookup_city("Amsterdam", "NL")
        assert city is not None
        assert city.name == "Amsterdam"
        assert city.country_code == "NL"
        # GeoNames numeric code for North Holland
        assert city.admin1_code == "07"
        assert city.admin1_name == "North Holland"
        assert city.geonames_id == 2759794

    def test_lookup_city_case_insensitive(self, db):
        """Test city lookup is case-insensitive."""
        city1 = db.lookup_city("Amsterdam", "NL")
        city2 = db.lookup_city("amsterdam", "NL")
        city3 = db.lookup_city("AMSTERDAM", "NL")
        assert city1 is not None
        assert city2 is not None
        assert city3 is not None
        assert city1.geonames_id == city2.geonames_id == city3.geonames_id

    def test_lookup_city_not_found(self, db):
        """Test lookup of non-existent city returns None."""
        city = db.lookup_city("NonExistentCity", "NL")
        assert city is None

    def test_lookup_city_wrong_country(self, db):
        """Test lookup with wrong country code returns None."""
        city = db.lookup_city("Amsterdam", "FR")
        assert city is None

    def test_admin1_name_from_city(self, db):
        """Test that admin1_name is populated from city lookup."""
        city = db.lookup_city("Amsterdam", "NL")
        assert city is not None
        assert city.admin1_name == "North Holland"

    def test_admin1_name_multiple_cities(self, db):
        """Test admin1_name for multiple cities in different provinces."""
        cities = [
            ("Amsterdam", "North Holland"),
            ("Rotterdam", "South Holland"),
            ("Utrecht", "Utrecht"),
        ]
        for city_name, expected_province in cities:
            city = db.lookup_city(city_name, "NL")
            # Name the failing city so a loop failure is diagnosable.
            assert city is not None, f"{city_name} not found"
            assert city.admin1_name == expected_province, \
                f"{city_name}: expected {expected_province}, got {city.admin1_name}"
class TestGeoNamesLookup:
    """Test GeoNamesDB high-level interface."""

    @pytest.fixture
    def lookup(self):
        """Yield a GeoNamesDB instance and close it after the test."""
        database = GeoNamesDB()
        yield database
        # Release the connection explicitly rather than leaving it to
        # garbage collection; pytest runs this even when the test fails.
        database.close()

    def test_dutch_cities(self, lookup):
        """Test lookups for major Dutch cities."""
        cities = {
            "Amsterdam": "AMS",
            "Rotterdam": "ROT",
            "The Hague": "THE",
            "Utrecht": "UTR",
            "Eindhoven": "EIN",
            "Groningen": "GRO",
            "Tilburg": "TIL",
            "Almere Stad": "ALM",  # Almere is "Almere Stad" in GeoNames
            "Breda": "BRE",
            "Nijmegen": "NIJ",
        }
        for city_name, expected_abbr in cities.items():
            city = lookup.lookup_city(city_name, "NL")
            assert city is not None, f"{city_name} not found"
            assert city.get_abbreviation() == expected_abbr, \
                f"{city_name}: expected {expected_abbr}, got {city.get_abbreviation()}"

    def test_dutch_aliases(self, lookup):
        """Test Dutch city name aliases."""
        # Den Haag → The Hague
        city = lookup.lookup_city("Den Haag", "NL")
        assert city is not None
        assert city.get_abbreviation() == "THE"
        assert city.name == "The Hague"

        # Den Bosch → 's-Hertogenbosch
        city = lookup.lookup_city("Den Bosch", "NL")
        assert city is not None
        assert city.get_abbreviation() == "SHE"
        assert city.name == "'s-Hertogenbosch"

    def test_global_cities(self, lookup):
        """Test lookups for major global cities."""
        cities = {
            ("Paris", "FR"): "PAR",
            ("London", "GB"): "LON",
            ("Tokyo", "JP"): "TOK",
            ("New York", "US"): "NEW",
            ("Berlin", "DE"): "BER",
            ("Madrid", "ES"): "MAD",
            ("Rome", "IT"): "ROM",
            ("Rio de Janeiro", "BR"): "RIO",
            ("Sydney", "AU"): "SYD",
            ("Toronto", "CA"): "TOR",
        }
        for (city_name, country), expected_abbr in cities.items():
            city = lookup.lookup_city(city_name, country)
            assert city is not None, f"{city_name}, {country} not found"
            assert city.get_abbreviation() == expected_abbr, \
                f"{city_name}: expected {expected_abbr}, got {city.get_abbreviation()}"

    def test_cities_with_special_characters(self, lookup):
        """Test cities with apostrophes, hyphens, and accents."""
        # 's-Hertogenbosch (NL)
        city = lookup.lookup_city("'s-Hertogenbosch", "NL")
        assert city is not None
        assert city.get_abbreviation() == "SHE"

        # São Paulo (BR), queried by its ASCII spelling
        city = lookup.lookup_city("Sao Paulo", "BR")
        assert city is not None
        assert city.get_abbreviation() == "SAO"

    def test_cities_with_parentheticals(self, lookup):
        """Test city names with parenthetical notes."""
        # Dutch dataset often has "(Ov.)" etc. - should be stripped
        city = lookup.lookup_city("Zwolle (Ov.)", "NL")
        assert city is not None
        assert city.name == "Zwolle"

        city = lookup.lookup_city("Groningen (Gr.)", "NL")
        assert city is not None
        assert city.name == "Groningen"

    def test_whitespace_normalization(self, lookup):
        """Test that leading/trailing whitespace is handled."""
        city1 = lookup.lookup_city(" Amsterdam ", "NL")
        city2 = lookup.lookup_city("Amsterdam", "NL")
        assert city1 is not None
        assert city2 is not None
        assert city1.geonames_id == city2.geonames_id

    def test_province_code_lookups(self, lookup):
        """Test admin1 code and name lookups (GeoNames numeric codes)."""
        provinces = {
            # GeoNames uses numeric codes, not ISO 3166-2
            ("Amsterdam", "NL"): ("07", "North Holland"),
            ("Rotterdam", "NL"): ("11", "South Holland"),
            ("The Hague", "NL"): ("11", "South Holland"),
            ("Utrecht", "NL"): ("09", "Utrecht"),
            ("Groningen", "NL"): ("04", "Groningen"),
            ("Maastricht", "NL"): ("05", "Limburg"),
        }
        for (city_name, country), (expected_code, expected_name) in provinces.items():
            city = lookup.lookup_city(city_name, country)
            assert city is not None, f"{city_name} not found"
            assert city.admin1_code == expected_code, \
                f"{city_name}: expected code {expected_code}, got {city.admin1_code}"
            assert city.admin1_name == expected_name, \
                f"{city_name}: expected name {expected_name}, got {city.admin1_name}"

    def test_caching(self, lookup):
        """Test that repeated lookups are cached."""
        # First lookup
        city1 = lookup.lookup_city("Amsterdam", "NL")
        # Second lookup (should be cached)
        city2 = lookup.lookup_city("Amsterdam", "NL")
        # Identity (not equality) check: the very same object must come back.
        assert city1 is city2
class TestEdgeCases:
    """Test edge cases and known issues."""

    @pytest.fixture
    def lookup(self):
        """Yield a GeoNamesDB instance and close it after the test."""
        database = GeoNamesDB()
        yield database
        # Release the connection explicitly rather than leaving it to
        # garbage collection; pytest runs this even when the test fails.
        database.close()

    def test_missing_dutch_cities(self, lookup):
        """Test the 6 known missing Dutch cities."""
        # These cities are not in GeoNames (1.6% of ISIL registry)
        missing_cities = [
            "Avereest",          # Typo or very small locality
            "IJsselsein",        # Typo: should be IJsselstein
            "Kralendijk",        # Bonaire (Caribbean, BQ not NL)
            "Selingen",          # Tiny village
            "s-Heerenberg",      # Should have apostrophe
            "St. Annaparochie",  # Missing from GeoNames
        ]
        for city_name in missing_cities:
            city = lookup.lookup_city(city_name, "NL")
            assert city is None, f"{city_name} should not be found"

    def test_alternative_spellings(self, lookup):
        """Test that we handle common alternative spellings."""
        # 's-Hertogenbosch variations
        variations = [
            "'s-Hertogenbosch",
            "s-Hertogenbosch",
            "'s Hertogenbosch",
            "Den Bosch",
        ]
        for variant in variations:
            city = lookup.lookup_city(variant, "NL")
            assert city is not None, f"{variant} should be found"
            assert city.get_abbreviation() == "SHE"

    def test_bonaire_caribbean_territory(self, lookup):
        """Test that Bonaire cities use BQ country code, not NL."""
        # Kralendijk is in Bonaire (BQ), not Netherlands (NL)
        city = lookup.lookup_city("Kralendijk", "BQ")
        # This might still be None if BQ cities aren't in GeoNames;
        # the test mainly documents the correct country code.
        # Compare against None explicitly so the check does not depend on
        # the truthiness of the returned object.
        if city is not None:
            assert city.country_code == "BQ"
class TestSearchAndQuery:
    """Test search and query methods."""

    @pytest.fixture
    def db(self):
        """Yield a GeoNamesDB instance and close it after the test."""
        database = GeoNamesDB()
        yield database
        # Release the connection explicitly rather than leaving it to
        # garbage collection; pytest runs this even when the test fails.
        database.close()

    def test_search_cities_with_pattern(self, db):
        """Test search_cities with wildcard pattern."""
        # Search for Amsterdam* ("%" is the wildcard passed to search_cities)
        results = db.search_cities("Amsterdam%", "NL", limit=5)
        assert len(results) > 0
        assert all(city.name.startswith("Amsterdam") for city in results)
        assert all(city.country_code == "NL" for city in results)

    def test_search_cities_without_country(self, db):
        """Test search_cities across all countries."""
        results = db.search_cities("Paris%", limit=10)
        assert len(results) > 0
        # Should find Paris in France and possibly other countries
        assert any(city.country_code == "FR" for city in results)

    def test_search_cities_ordered_by_population(self, db):
        """Test that search results are ordered by population."""
        results = db.search_cities("Amsterdam%", "NL", limit=5)
        # Check that populations are in descending order. Entries with a
        # missing or zero population are left out of the comparison. No
        # length guard is needed: a list of 0 or 1 items equals its sort.
        populations = [city.population for city in results if city.population]
        assert populations == sorted(populations, reverse=True)

    def test_get_cities_by_country(self, db):
        """Test get_cities_by_country for Netherlands."""
        cities = db.get_cities_by_country("NL", min_population=100000, limit=20)
        assert len(cities) > 0
        assert all(city.country_code == "NL" for city in cities)
        assert all(
            city.population is None or city.population >= 100000
            for city in cities
        )

    def test_get_cities_by_country_ordered(self, db):
        """Test that get_cities_by_country returns cities ordered by population."""
        cities = db.get_cities_by_country("NL", min_population=0, limit=10)
        assert len(cities) > 0
        # Check descending population order (ignoring None values)
        populations = [city.population for city in cities if city.population]
        assert populations == sorted(populations, reverse=True)

    def test_get_city_abbreviation_convenience(self, db):
        """Test get_city_abbreviation convenience method."""
        # Test standard 3-letter codes (without disambiguation)
        abbr = db.get_city_abbreviation("Amsterdam", "NL", use_disambiguation=False)
        assert abbr == "AMS"

        abbr = db.get_city_abbreviation("Tokyo", "JP", use_disambiguation=False)
        assert abbr == "TOK"

    def test_get_city_abbreviation_not_found(self, db):
        """Test get_city_abbreviation returns None for missing cities."""
        abbr = db.get_city_abbreviation("NonExistentCity", "XX")
        assert abbr is None

    def test_get_stats(self, db):
        """Test database statistics retrieval."""
        stats = db.get_stats()

        # Check expected keys
        assert 'total_cities' in stats
        assert 'total_countries' in stats
        assert 'top_countries' in stats
        assert 'db_path' in stats
        assert 'db_size_mb' in stats

        # Check reasonable values
        assert stats['total_cities'] > 1000000  # Should have millions of cities
        assert stats['total_countries'] > 200  # GeoNames covers 247 countries
        assert len(stats['top_countries']) == 10
        assert stats['db_size_mb'] > 0

        # Top countries should be tuples of (code, count)
        for country_code, count in stats['top_countries']:
            assert isinstance(country_code, str)
            assert isinstance(count, int)
            assert len(country_code) == 2  # ISO alpha-2
            assert count > 0
class TestPerformance:
    """Test performance characteristics."""

    @pytest.fixture
    def lookup(self):
        """Yield a fresh GeoNamesDB instance and close it after the test."""
        database = GeoNamesDB()
        yield database
        # Release the connection explicitly rather than leaving it to
        # garbage collection; pytest runs this even when the test fails.
        database.close()

    def test_batch_lookups_fast(self, lookup):
        """Test that batch lookups are fast (thanks to caching)."""
        import time

        cities = [
            ("Amsterdam", "NL"),
            ("Rotterdam", "NL"),
            ("The Hague", "NL"),
            ("Utrecht", "NL"),
            ("Eindhoven", "NL"),
        ] * 10  # 50 lookups total

        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring short intervals; time.time() is wall-clock time, which
        # can be adjusted mid-run and may be too coarse for a 100 ms budget.
        start = time.perf_counter()
        for city_name, country in cities:
            lookup.lookup_city(city_name, country)
        elapsed = time.perf_counter() - start

        # 50 lookups should take < 100ms (with caching, most are instant)
        assert elapsed < 0.1, f"Batch lookups took {elapsed:.3f}s (expected < 0.1s)"

    def test_unique_lookups_reasonable(self, lookup):
        """Test that even uncached lookups are reasonably fast."""
        import time

        # 20 unique Dutch city names, each looked up once on a fresh
        # instance. Return values are discarded: only timing is checked,
        # not whether each name resolves.
        cities = [
            "Amsterdam", "Rotterdam", "The Hague", "Utrecht", "Eindhoven",
            "Tilburg", "Groningen", "Almere", "Breda", "Nijmegen",
            "Enschede", "Haarlem", "Arnhem", "Zaanstad", "Amersfoort",
            "Apeldoorn", "Hoofddorp", "Maastricht", "Leiden", "Dordrecht",
        ]

        # See test_batch_lookups_fast for why perf_counter() is used.
        start = time.perf_counter()
        for city_name in cities:
            lookup.lookup_city(city_name, "NL")
        elapsed = time.perf_counter() - start

        # 20 unique lookups should take < 200ms from SQLite
        assert elapsed < 0.2, f"20 unique lookups took {elapsed:.3f}s (expected < 0.2s)"
class TestDatabaseLifecycle:
    """Exercise opening, using and releasing a GeoNamesDB connection."""

    def test_close_connection(self):
        """Open a database, confirm it has a connection, then close it."""
        database = GeoNamesDB()
        assert database.conn is not None

        # Explicit shutdown. Nothing is asserted afterwards: the closed
        # state is hard to observe without provoking an exception.
        database.close()

    def test_context_manager_style(self):
        """Use a database for one lookup, then drop the reference to it."""
        database = GeoNamesDB()

        amsterdam = database.lookup_city("Amsterdam", "NL")
        assert amsterdam is not None

        # Cleanup: removing the last reference is expected to run __del__,
        # which presumably closes the connection.
        del database