"""Comprehensive tests for GeoNames lookup functionality.""" import pytest from pathlib import Path from glam_extractor.geocoding.geonames_lookup import ( GeoNamesDB, CityInfo, ) class TestCityInfo: """Test CityInfo data class.""" def test_get_abbreviation_simple(self): """Test abbreviation generation for simple city names.""" city = CityInfo( geonames_id=2759794, name="Amsterdam", ascii_name="Amsterdam", country_code="NL", admin1_code="NH", admin1_name="North Holland", admin2_code=None, latitude=52.37403, longitude=4.88969, feature_code="PPLC", population=741636, elevation=13, timezone="Europe/Amsterdam" ) assert city.get_abbreviation() == "AMS" def test_get_abbreviation_with_space(self): """Test abbreviation for city with space (The Hague).""" city = CityInfo( geonames_id=2747373, name="The Hague", ascii_name="The Hague", country_code="NL", admin1_code="ZH", admin1_name="South Holland", admin2_code=None, latitude=52.07667, longitude=4.29861, feature_code="PPLA", population=474292, elevation=1, timezone="Europe/Amsterdam" ) assert city.get_abbreviation() == "THE" def test_get_abbreviation_with_apostrophe_and_hyphen(self): """Test abbreviation for 's-Hertogenbosch (strips special chars).""" city = CityInfo( geonames_id=2747351, name="'s-Hertogenbosch", ascii_name="'s-Hertogenbosch", country_code="NL", admin1_code="NB", admin1_name="North Brabant", admin2_code=None, latitude=51.69917, longitude=5.30417, feature_code="PPLA", population=134520, elevation=7, timezone="Europe/Amsterdam" ) # Should skip apostrophe and hyphen, take first 3 letters: S-H-E assert city.get_abbreviation() == "SHE" def test_get_abbreviation_accented_chars(self): """Test abbreviation for cities with accents (São Paulo).""" city = CityInfo( geonames_id=3448439, name="São Paulo", ascii_name="Sao Paulo", country_code="BR", admin1_code="27", admin1_name="São Paulo", admin2_code=None, latitude=-23.5475, longitude=-46.63611, feature_code="PPLA", population=10021295, elevation=760, timezone="America/Sao_Paulo" ) assert city.get_abbreviation() == "SAO" class TestGeoNamesDB: """Test GeoNamesDB database interface.""" @pytest.fixture def db(self): """Create GeoNamesDB instance.""" # Uses default path (data/reference/geonames.db) return GeoNamesDB() def test_init_default_path(self, db): """Test database initialization with default path.""" assert db.conn is not None assert db.db_path.exists() def test_init_invalid_path(self): """Test database initialization with invalid path raises error.""" with pytest.raises(FileNotFoundError): GeoNamesDB(db_path=Path("/nonexistent/path.db")) def test_lookup_city_simple(self, db): """Test basic city lookup.""" city = db.lookup_city("Amsterdam", "NL") assert city is not None assert city.name == "Amsterdam" assert city.country_code == "NL" assert city.admin1_code == "07" # GeoNames numeric code for North Holland assert city.admin1_name == "North Holland" assert city.geonames_id == 2759794 def test_lookup_city_case_insensitive(self, db): """Test city lookup is case-insensitive.""" city1 = db.lookup_city("Amsterdam", "NL") city2 = db.lookup_city("amsterdam", "NL") city3 = db.lookup_city("AMSTERDAM", "NL") assert city1 is not None assert city2 is not None assert city3 is not None assert city1.geonames_id == city2.geonames_id == city3.geonames_id def test_lookup_city_not_found(self, db): """Test lookup of non-existent city returns None.""" city = db.lookup_city("NonExistentCity", "NL") assert city is None def test_lookup_city_wrong_country(self, db): """Test lookup with wrong country code returns None.""" city = db.lookup_city("Amsterdam", "FR") assert city is None def test_admin1_name_from_city(self, db): """Test that admin1_name is populated from city lookup.""" city = db.lookup_city("Amsterdam", "NL") assert city is not None assert city.admin1_name == "North Holland" def test_admin1_name_multiple_cities(self, db): """Test admin1_name for multiple cities in different provinces.""" cities = [ ("Amsterdam", "North Holland"), ("Rotterdam", "South Holland"), ("Utrecht", "Utrecht"), ] for city_name, expected_province in cities: city = db.lookup_city(city_name, "NL") assert city is not None assert city.admin1_name == expected_province class TestGeoNamesLookup: """Test GeoNamesDB high-level interface.""" @pytest.fixture def lookup(self): """Create GeoNamesDB instance.""" return GeoNamesDB() def test_dutch_cities(self, lookup): """Test lookups for major Dutch cities.""" cities = { "Amsterdam": "AMS", "Rotterdam": "ROT", "The Hague": "THE", "Utrecht": "UTR", "Eindhoven": "EIN", "Groningen": "GRO", "Tilburg": "TIL", "Almere Stad": "ALM", # Almere is "Almere Stad" in GeoNames "Breda": "BRE", "Nijmegen": "NIJ", } for city_name, expected_abbr in cities.items(): city = lookup.lookup_city(city_name, "NL") assert city is not None, f"{city_name} not found" assert city.get_abbreviation() == expected_abbr, \ f"{city_name}: expected {expected_abbr}, got {city.get_abbreviation()}" def test_dutch_aliases(self, lookup): """Test Dutch city name aliases.""" # Den Haag → The Hague city = lookup.lookup_city("Den Haag", "NL") assert city is not None assert city.get_abbreviation() == "THE" assert city.name == "The Hague" # Den Bosch → 's-Hertogenbosch city = lookup.lookup_city("Den Bosch", "NL") assert city is not None assert city.get_abbreviation() == "SHE" assert city.name == "'s-Hertogenbosch" def test_global_cities(self, lookup): """Test lookups for major global cities.""" cities = { ("Paris", "FR"): "PAR", ("London", "GB"): "LON", ("Tokyo", "JP"): "TOK", ("New York", "US"): "NEW", ("Berlin", "DE"): "BER", ("Madrid", "ES"): "MAD", ("Rome", "IT"): "ROM", ("Rio de Janeiro", "BR"): "RIO", ("Sydney", "AU"): "SYD", ("Toronto", "CA"): "TOR", } for (city_name, country), expected_abbr in cities.items(): city = lookup.lookup_city(city_name, country) assert city is not None, f"{city_name}, {country} not found" assert city.get_abbreviation() == expected_abbr, \ f"{city_name}: expected {expected_abbr}, got {city.get_abbreviation()}" def test_cities_with_special_characters(self, lookup): """Test cities with apostrophes, hyphens, and accents.""" # 's-Hertogenbosch (NL) city = lookup.lookup_city("'s-Hertogenbosch", "NL") assert city is not None assert city.get_abbreviation() == "SHE" # São Paulo (BR) city = lookup.lookup_city("Sao Paulo", "BR") assert city is not None assert city.get_abbreviation() == "SAO" def test_cities_with_parentheticals(self, lookup): """Test city names with parenthetical notes.""" # Dutch dataset often has "(Ov.)" etc. - should be stripped city = lookup.lookup_city("Zwolle (Ov.)", "NL") assert city is not None assert city.name == "Zwolle" city = lookup.lookup_city("Groningen (Gr.)", "NL") assert city is not None assert city.name == "Groningen" def test_whitespace_normalization(self, lookup): """Test that leading/trailing whitespace is handled.""" city1 = lookup.lookup_city(" Amsterdam ", "NL") city2 = lookup.lookup_city("Amsterdam", "NL") assert city1 is not None assert city2 is not None assert city1.geonames_id == city2.geonames_id def test_province_code_lookups(self, lookup): """Test admin1 code and name lookups (GeoNames numeric codes).""" provinces = { # GeoNames uses numeric codes, not ISO 3166-2 ("Amsterdam", "NL"): ("07", "North Holland"), ("Rotterdam", "NL"): ("11", "South Holland"), ("The Hague", "NL"): ("11", "South Holland"), ("Utrecht", "NL"): ("09", "Utrecht"), ("Groningen", "NL"): ("04", "Groningen"), ("Maastricht", "NL"): ("05", "Limburg"), } for (city_name, country), (expected_code, expected_name) in provinces.items(): city = lookup.lookup_city(city_name, country) assert city is not None, f"{city_name} not found" assert city.admin1_code == expected_code, \ f"{city_name}: expected code {expected_code}, got {city.admin1_code}" assert city.admin1_name == expected_name, \ f"{city_name}: expected name {expected_name}, got {city.admin1_name}" def test_caching(self, lookup): """Test that repeated lookups are cached.""" # First lookup city1 = lookup.lookup_city("Amsterdam", "NL") # Second lookup (should be cached) city2 = lookup.lookup_city("Amsterdam", "NL") # Should return same object (identity check) assert city1 is city2 class TestEdgeCases: """Test edge cases and known issues.""" @pytest.fixture def lookup(self): """Create GeoNamesDB instance.""" return GeoNamesDB() def test_missing_dutch_cities(self, lookup): """Test the 6 known missing Dutch cities.""" # These cities are not in GeoNames (1.6% of ISIL registry) missing_cities = [ "Avereest", # Typo or very small locality "IJsselsein", # Typo: should be IJsselstein "Kralendijk", # Bonaire (Caribbean, BQ not NL) "Selingen", # Tiny village "s-Heerenberg", # Should have apostrophe "St. Annaparochie", # Missing from GeoNames ] for city_name in missing_cities: city = lookup.lookup_city(city_name, "NL") assert city is None, f"{city_name} should not be found" def test_alternative_spellings(self, lookup): """Test that we handle common alternative spellings.""" # 's-Hertogenbosch variations variations = [ "'s-Hertogenbosch", "s-Hertogenbosch", "'s Hertogenbosch", "Den Bosch", ] for variant in variations: city = lookup.lookup_city(variant, "NL") assert city is not None, f"{variant} should be found" assert city.get_abbreviation() == "SHE" def test_bonaire_caribbean_territory(self, lookup): """Test that Bonaire cities use BQ country code, not NL.""" # Kralendijk is in Bonaire (BQ), not Netherlands (NL) city = lookup.lookup_city("Kralendijk", "BQ") # This might still be None if BQ cities aren't in GeoNames # Just documenting the correct country code if city: assert city.country_code == "BQ" class TestSearchAndQuery: """Test search and query methods.""" @pytest.fixture def db(self): """Create GeoNamesDB instance.""" return GeoNamesDB() def test_search_cities_with_pattern(self, db): """Test search_cities with wildcard pattern.""" # Search for Amsterdam* results = db.search_cities("Amsterdam%", "NL", limit=5) assert len(results) > 0 assert all(city.name.startswith("Amsterdam") for city in results) assert all(city.country_code == "NL" for city in results) def test_search_cities_without_country(self, db): """Test search_cities across all countries.""" results = db.search_cities("Paris%", limit=10) assert len(results) > 0 # Should find Paris in France and possibly other countries assert any(city.country_code == "FR" for city in results) def test_search_cities_ordered_by_population(self, db): """Test that search results are ordered by population.""" results = db.search_cities("Amsterdam%", "NL", limit=5) if len(results) > 1: # Check that populations are in descending order populations = [city.population for city in results if city.population] assert populations == sorted(populations, reverse=True) def test_get_cities_by_country(self, db): """Test get_cities_by_country for Netherlands.""" cities = db.get_cities_by_country("NL", min_population=100000, limit=20) assert len(cities) > 0 assert all(city.country_code == "NL" for city in cities) assert all(city.population is None or city.population >= 100000 for city in cities) def test_get_cities_by_country_ordered(self, db): """Test that get_cities_by_country returns cities ordered by population.""" cities = db.get_cities_by_country("NL", min_population=0, limit=10) assert len(cities) > 0 # Check descending population order (ignoring None values) populations = [city.population for city in cities if city.population] assert populations == sorted(populations, reverse=True) def test_get_city_abbreviation_convenience(self, db): """Test get_city_abbreviation convenience method.""" # Test standard 3-letter codes (without disambiguation) abbr = db.get_city_abbreviation("Amsterdam", "NL", use_disambiguation=False) assert abbr == "AMS" abbr = db.get_city_abbreviation("Tokyo", "JP", use_disambiguation=False) assert abbr == "TOK" def test_get_city_abbreviation_not_found(self, db): """Test get_city_abbreviation returns None for missing cities.""" abbr = db.get_city_abbreviation("NonExistentCity", "XX") assert abbr is None def test_get_stats(self, db): """Test database statistics retrieval.""" stats = db.get_stats() # Check expected keys assert 'total_cities' in stats assert 'total_countries' in stats assert 'top_countries' in stats assert 'db_path' in stats assert 'db_size_mb' in stats # Check reasonable values assert stats['total_cities'] > 1000000 # Should have millions of cities assert stats['total_countries'] > 200 # GeoNames covers 247 countries assert len(stats['top_countries']) == 10 assert stats['db_size_mb'] > 0 # Top countries should be tuples of (code, count) for country_code, count in stats['top_countries']: assert isinstance(country_code, str) assert isinstance(count, int) assert len(country_code) == 2 # ISO alpha-2 assert count > 0 class TestPerformance: """Test performance characteristics.""" @pytest.fixture def lookup(self): """Create GeoNamesDB instance.""" return GeoNamesDB() def test_batch_lookups_fast(self, lookup): """Test that batch lookups are fast (thanks to caching).""" import time cities = [ ("Amsterdam", "NL"), ("Rotterdam", "NL"), ("The Hague", "NL"), ("Utrecht", "NL"), ("Eindhoven", "NL"), ] * 10 # 50 lookups total start = time.time() for city_name, country in cities: lookup.lookup_city(city_name, country) elapsed = time.time() - start # 50 lookups should take < 100ms (with caching, most are instant) assert elapsed < 0.1, f"Batch lookups took {elapsed:.3f}s (expected < 0.1s)" def test_unique_lookups_reasonable(self, lookup): """Test that even uncached lookups are reasonably fast.""" import time # 20 unique Dutch cities (uncached) cities = [ "Amsterdam", "Rotterdam", "The Hague", "Utrecht", "Eindhoven", "Tilburg", "Groningen", "Almere", "Breda", "Nijmegen", "Enschede", "Haarlem", "Arnhem", "Zaanstad", "Amersfoort", "Apeldoorn", "Hoofddorp", "Maastricht", "Leiden", "Dordrecht", ] start = time.time() for city_name in cities: lookup.lookup_city(city_name, "NL") elapsed = time.time() - start # 20 unique lookups should take < 200ms from SQLite assert elapsed < 0.2, f"20 unique lookups took {elapsed:.3f}s (expected < 0.2s)" class TestDatabaseLifecycle: """Test database connection lifecycle.""" def test_close_connection(self): """Test explicit connection closing.""" db = GeoNamesDB() assert db.conn is not None # Close connection db.close() # Connection should be closed # (can't easily test this without triggering an exception) def test_context_manager_style(self): """Test that database can be used and cleaned up properly.""" db = GeoNamesDB() city = db.lookup_city("Amsterdam", "NL") assert city is not None # Cleanup del db # Should trigger __del__ and close connection