"""
Tests for city code disambiguation module.

Tests the CityCodeDisambiguator class which resolves conflicts when multiple
cities generate identical 3-letter codes.
"""

import pytest

from glam_extractor.geocoding.city_code_disambiguation import (
    CityCodeDisambiguator,
    CityCodeEntry,
)


class TestCityCodeDisambiguator:
    """Test the CityCodeDisambiguator class."""

    def test_basic_3_letter_code_generation(self):
        """Test basic 3-letter code extraction."""
        disambiguator = CityCodeDisambiguator()

        # Test internal method
        assert disambiguator._get_3_letter_code("Amsterdam") == "AMS"
        assert disambiguator._get_3_letter_code("Rotterdam") == "ROT"
        assert disambiguator._get_3_letter_code("Den Haag") == "DEN"

    def test_basic_4_letter_code_generation(self):
        """Test 4-letter code extraction."""
        disambiguator = CityCodeDisambiguator()

        assert disambiguator._get_4_letter_code("Hardenberg") == "HARD"
        assert disambiguator._get_4_letter_code("Harlingen") == "HARL"
        assert disambiguator._get_4_letter_code("Haren") == "HARE"

    def test_no_collision_uses_3_letter_code(self):
        """Cities without collisions should use 3-letter codes."""
        disambiguator = CityCodeDisambiguator()

        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.build()

        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"
        assert disambiguator.get_code("Rotterdam", "NL") == "ROT"

    def test_hardenberg_harlingen_haren_collision(self):
        """
        Test the real-world collision: Hardenberg, Harlingen, Haren.

        All three generate 'HAR' with 3 letters.
        Should be disambiguated to HARD, HARL, HARE (4 letters).
        """
        disambiguator = CityCodeDisambiguator()

        # Add the three colliding cities (real GeoNames data)
        disambiguator.add_city("Hardenberg", "Hardenberg", 2755251, "NL", 57913)
        disambiguator.add_city("Harlingen", "Harlingen", 2755249, "NL", 16234)
        disambiguator.add_city("Haren", "Haren", 2755267, "NL", 19132)
        disambiguator.build()

        # All should get 4-letter codes (no numeric suffix needed)
        assert disambiguator.get_code("Hardenberg", "NL") == "HARD"
        assert disambiguator.get_code("Harlingen", "NL") == "HARL"
        assert disambiguator.get_code("Haren", "NL") == "HARE"

    def test_collision_with_numeric_suffix(self):
        """
        Test collision that requires numeric suffix.

        If two cities have identical 4-letter codes, add numeric suffix
        based on population ranking.
        """
        disambiguator = CityCodeDisambiguator()

        # Create synthetic collision: both start with "TEST"
        disambiguator.add_city("Testa", "Testa", 1001, "XX", 100000)  # Larger
        disambiguator.add_city("Testb", "Testb", 1002, "XX", 50000)  # Smaller
        disambiguator.build()

        # Larger city gets base code, smaller gets numeric suffix
        assert disambiguator.get_code("Testa", "XX") == "TEST"
        assert disambiguator.get_code("Testb", "XX") == "TEST1"

    def test_population_ranking_for_numeric_suffix(self):
        """Test that numeric suffixes are assigned by population (descending)."""
        disambiguator = CityCodeDisambiguator()

        # Three cities with identical 4-letter codes
        disambiguator.add_city("TestCity1", "TestCity1", 1001, "XX", 200000)
        disambiguator.add_city("TestCity2", "TestCity2", 1002, "XX", 150000)
        disambiguator.add_city("TestCity3", "TestCity3", 1003, "XX", 100000)
        disambiguator.build()

        # Largest should get base code
        assert disambiguator.get_code("TestCity1", "XX") == "TEST"
        assert disambiguator.get_code("TestCity2", "XX") == "TEST1"
        assert disambiguator.get_code("TestCity3", "XX") == "TEST2"

    def test_none_population_goes_last(self):
        """Cities with None population should be ranked last."""
        disambiguator = CityCodeDisambiguator()

        # The city with unknown population is added first on purpose, so the
        # expected ranking cannot be explained by insertion order.
        disambiguator.add_city("TestCity1", "TestCity1", 1001, "XX", None)
        disambiguator.add_city("TestCity2", "TestCity2", 1002, "XX", 100000)
        disambiguator.build()

        # City with population gets base code
        assert disambiguator.get_code("TestCity2", "XX") == "TEST"
        assert disambiguator.get_code("TestCity1", "XX") == "TEST1"

    def test_get_code_before_build_raises_error(self):
        """Calling get_code() before build() should raise ValueError."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)

        # `match` is a regular expression, hence the escaped parentheses.
        with pytest.raises(
            ValueError, match=r"Must call build\(\) before get_code"
        ):
            disambiguator.get_code("Amsterdam", "NL")

    def test_get_code_for_unknown_city_returns_none(self):
        """get_code() should return None for cities not in table."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.build()

        assert disambiguator.get_code("Unknown", "NL") is None

    def test_get_all_codes_for_country(self):
        """Test get_all_codes() returns all city codes for a country."""
        disambiguator = CityCodeDisambiguator()

        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.add_city("Paris", "Paris", 2988507, "FR", 2165423)
        disambiguator.build()

        nl_codes = disambiguator.get_all_codes("NL")

        assert nl_codes == {
            "Amsterdam": "AMS",
            "Rotterdam": "ROT",
        }

        # French city should not appear
        assert "Paris" not in nl_codes

    def test_collision_report_generation(self):
        """Test get_collision_report() generates readable output."""
        disambiguator = CityCodeDisambiguator()

        disambiguator.add_city("Hardenberg", "Hardenberg", 2755251, "NL", 57913)
        disambiguator.add_city("Harlingen", "Harlingen", 2755249, "NL", 16234)
        disambiguator.add_city("Haren", "Haren", 2755267, "NL", 19132)
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.build()

        report = disambiguator.get_collision_report("NL")

        # Report should mention HAR collision
        assert "HAR" in report
        assert "Hardenberg" in report
        assert "Harlingen" in report
        assert "Haren" in report

        # Amsterdam should NOT appear (no collision)
        assert "Amsterdam" not in report

        # Should show disambiguated codes
        assert "HARD" in report
        assert "HARL" in report
        assert "HARE" in report

    def test_collision_report_no_collisions(self):
        """Test collision report when there are no collisions."""
        disambiguator = CityCodeDisambiguator()

        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.build()

        report = disambiguator.get_collision_report("NL")

        assert "No collisions found" in report

    def test_statistics_calculation(self):
        """Test get_statistics() returns correct metrics."""
        disambiguator = CityCodeDisambiguator()

        # Add cities with HAR collision
        disambiguator.add_city("Hardenberg", "Hardenberg", 2755251, "NL", 57913)
        disambiguator.add_city("Harlingen", "Harlingen", 2755249, "NL", 16234)
        disambiguator.add_city("Haren", "Haren", 2755267, "NL", 19132)

        # Add non-colliding cities
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.build()

        stats = disambiguator.get_statistics("NL")

        assert stats['country_code'] == "NL"
        assert stats['total_cities'] == 5
        assert stats['collision_groups'] == 1  # HAR group
        assert stats['cities_in_collisions'] == 3  # Hardenberg, Harlingen, Haren
        assert stats['code_3_final'] == 2  # Amsterdam, Rotterdam
        assert stats['code_4_final'] == 3  # Hardenberg, Harlingen, Haren
        assert stats['code_numeric_final'] == 0
        # The rate is a computed float ratio; compare with a tolerance so the
        # test does not depend on the exact arithmetic used to derive it.
        assert stats['collision_rate'] == pytest.approx(0.6)  # 3/5

    def test_case_insensitive_country_codes(self):
        """Country codes should be case-insensitive."""
        disambiguator = CityCodeDisambiguator()

        # Registered in lower case, looked up in every casing below.
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "nl", 872680)
        disambiguator.build()

        # All variants should work
        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"
        assert disambiguator.get_code("Amsterdam", "nl") == "AMS"
        assert disambiguator.get_code("Amsterdam", "Nl") == "AMS"

    def test_multi_country_disambiguation(self):
        """Test disambiguation across multiple countries."""
        disambiguator = CityCodeDisambiguator()

        # Same city name in different countries
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Amsterdam", "Amsterdam", 5110266, "US", 18355)
        disambiguator.build()

        # Each country should have independent codes
        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"
        assert disambiguator.get_code("Amsterdam", "US") == "AMS"

    def test_accent_handling_in_city_names(self):
        """Test that accents are properly normalized in code generation."""
        disambiguator = CityCodeDisambiguator()

        # City names with accents
        disambiguator.add_city("São Paulo", "Sao Paulo", 3448439, "BR", 12252023)
        disambiguator.add_city("Zürich", "Zurich", 2657896, "CH", 415367)
        disambiguator.build()

        assert disambiguator.get_code("São Paulo", "BR") == "SAO"
        assert disambiguator.get_code("Zürich", "CH") == "ZUR"

    def test_rebuild_clears_previous_state(self):
        """Calling build() multiple times should work correctly."""
        disambiguator = CityCodeDisambiguator()

        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.build()

        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"

        # Add more cities and rebuild
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.build()

        # Both should be accessible
        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"
        assert disambiguator.get_code("Rotterdam", "NL") == "ROT"

    def test_real_world_dutch_collision_examples(self):
        """
        Test with more real Dutch city collisions from the dataset.

        Based on the collision report showing cities like:
        - Nieuwerkerk (multiple instances)
        - Veldhoven/Velden
        """
        disambiguator = CityCodeDisambiguator()

        # VEL collision group
        disambiguator.add_city("Veldhoven", "Veldhoven", 2745340, "NL", 45531)
        disambiguator.add_city("Velden", "Velden", 2745349, "NL", 3852)
        disambiguator.build()

        # Both names also share the 4-letter prefix "VELD", so the larger city
        # keeps the base code and the smaller one gets a numeric suffix.
        assert disambiguator.get_code("Veldhoven", "NL") == "VELD"
        assert disambiguator.get_code("Velden", "NL") == "VELD1"  # Smaller population

    def test_empty_disambiguator(self):
        """Test behavior with no cities added."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.build()

        stats = disambiguator.get_statistics("NL")

        assert stats['total_cities'] == 0
        assert stats['collision_groups'] == 0
        assert stats['collision_rate'] == 0.0


class TestCityCodeEntry:
    """Test the CityCodeEntry dataclass."""

    def test_entry_creation(self):
        """Test creating a CityCodeEntry instance."""
        entry = CityCodeEntry(
            city_name="Amsterdam",
            ascii_name="Amsterdam",
            geonames_id=2759794,
            country_code="NL",
            code_3="AMS",
            code_4="AMST",
            final_code="AMS",
            population=872680,
        )

        assert entry.city_name == "Amsterdam"
        assert entry.ascii_name == "Amsterdam"
        assert entry.geonames_id == 2759794
        assert entry.country_code == "NL"
        assert entry.code_3 == "AMS"
        assert entry.code_4 == "AMST"
        assert entry.final_code == "AMS"
        assert entry.population == 872680

    def test_entry_optional_population(self):
        """Test that population is optional (defaults to None)."""
        entry = CityCodeEntry(
            city_name="Test",
            ascii_name="Test",
            geonames_id=123,
            country_code="XX",
            code_3="TES",
            code_4="TEST",
            final_code="TES",
        )

        assert entry.population is None