glam/tests/geocoding/test_city_code_disambiguation.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

342 lines
13 KiB
Python

"""
Tests for the city code disambiguation module.

Covers the CityCodeDisambiguator class, which resolves conflicts when multiple
cities generate identical 3-letter codes, and the CityCodeEntry type imported
from the same module.
"""
import pytest
from glam_extractor.geocoding.city_code_disambiguation import (
CityCodeDisambiguator,
CityCodeEntry
)
class TestCityCodeDisambiguator:
    """Test the CityCodeDisambiguator class.

    Every test creates its own disambiguator, registers cities through
    ``add_city(...)``, calls ``build()`` and then checks the assigned codes,
    the collision report or the statistics.

    NOTE(review): the positional arguments passed to ``add_city`` appear to be
    (city name, ASCII name, GeoNames id, country code, population) -- confirm
    against the implementation.
    """

    def test_basic_3_letter_code_generation(self):
        """The 3-letter code is the upper-cased first three letters."""
        disambiguator = CityCodeDisambiguator()

        # Test internal method
        assert disambiguator._get_3_letter_code("Amsterdam") == "AMS"
        assert disambiguator._get_3_letter_code("Rotterdam") == "ROT"
        assert disambiguator._get_3_letter_code("Den Haag") == "DEN"

    def test_basic_4_letter_code_generation(self):
        """The 4-letter code is the upper-cased first four letters."""
        disambiguator = CityCodeDisambiguator()

        assert disambiguator._get_4_letter_code("Hardenberg") == "HARD"
        assert disambiguator._get_4_letter_code("Harlingen") == "HARL"
        assert disambiguator._get_4_letter_code("Haren") == "HARE"

    def test_no_collision_uses_3_letter_code(self):
        """Cities without collisions should use 3-letter codes."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.build()

        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"
        assert disambiguator.get_code("Rotterdam", "NL") == "ROT"

    def test_hardenberg_harlingen_haren_collision(self):
        """
        Test the real-world collision: Hardenberg, Harlingen, Haren.

        All three generate 'HAR' with 3 letters.
        Should be disambiguated to HARD, HARL, HARE (4 letters).
        """
        disambiguator = CityCodeDisambiguator()

        # Add the three colliding cities (real GeoNames data)
        disambiguator.add_city("Hardenberg", "Hardenberg", 2755251, "NL", 57913)
        disambiguator.add_city("Harlingen", "Harlingen", 2755249, "NL", 16234)
        disambiguator.add_city("Haren", "Haren", 2755267, "NL", 19132)
        disambiguator.build()

        # All should get 4-letter codes (no numeric suffix needed)
        assert disambiguator.get_code("Hardenberg", "NL") == "HARD"
        assert disambiguator.get_code("Harlingen", "NL") == "HARL"
        assert disambiguator.get_code("Haren", "NL") == "HARE"

    def test_collision_with_numeric_suffix(self):
        """
        Test collision that requires numeric suffix.

        If two cities have identical 4-letter codes, add numeric suffix
        based on population ranking.
        """
        disambiguator = CityCodeDisambiguator()

        # Create synthetic collision: both start with "TEST"
        disambiguator.add_city("Testa", "Testa", 1001, "XX", 100000)  # Larger
        disambiguator.add_city("Testb", "Testb", 1002, "XX", 50000)  # Smaller
        disambiguator.build()

        # Larger city gets base code, smaller gets numeric suffix
        assert disambiguator.get_code("Testa", "XX") == "TEST"
        assert disambiguator.get_code("Testb", "XX") == "TEST1"

    def test_population_ranking_for_numeric_suffix(self):
        """Test that numeric suffixes are assigned by population (descending)."""
        disambiguator = CityCodeDisambiguator()

        # Three cities with identical 4-letter codes
        disambiguator.add_city("TestCity1", "TestCity1", 1001, "XX", 200000)
        disambiguator.add_city("TestCity2", "TestCity2", 1002, "XX", 150000)
        disambiguator.add_city("TestCity3", "TestCity3", 1003, "XX", 100000)
        disambiguator.build()

        # Largest should get base code, the rest are numbered in rank order
        assert disambiguator.get_code("TestCity1", "XX") == "TEST"
        assert disambiguator.get_code("TestCity2", "XX") == "TEST1"
        assert disambiguator.get_code("TestCity3", "XX") == "TEST2"

    def test_none_population_goes_last(self):
        """Cities with None population should be ranked last."""
        disambiguator = CityCodeDisambiguator()

        # The population-less city is added first on purpose: insertion order
        # must not win over the population ranking.
        disambiguator.add_city("TestCity1", "TestCity1", 1001, "XX", None)
        disambiguator.add_city("TestCity2", "TestCity2", 1002, "XX", 100000)
        disambiguator.build()

        # City with population gets base code
        assert disambiguator.get_code("TestCity2", "XX") == "TEST"
        assert disambiguator.get_code("TestCity1", "XX") == "TEST1"

    def test_get_code_before_build_raises_error(self):
        """Calling get_code() before build() should raise ValueError."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)

        with pytest.raises(ValueError, match="Must call build\\(\\) before get_code"):
            disambiguator.get_code("Amsterdam", "NL")

    def test_get_code_for_unknown_city_returns_none(self):
        """get_code() should return None for cities not in table."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.build()

        assert disambiguator.get_code("Unknown", "NL") is None

    def test_get_all_codes_for_country(self):
        """Test get_all_codes() returns all city codes for a country."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.add_city("Paris", "Paris", 2988507, "FR", 2165423)
        disambiguator.build()

        nl_codes = disambiguator.get_all_codes("NL")

        assert nl_codes == {
            "Amsterdam": "AMS",
            "Rotterdam": "ROT"
        }
        # French city should not appear
        assert "Paris" not in nl_codes

    def test_collision_report_generation(self):
        """Test get_collision_report() generates readable output."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.add_city("Hardenberg", "Hardenberg", 2755251, "NL", 57913)
        disambiguator.add_city("Harlingen", "Harlingen", 2755249, "NL", 16234)
        disambiguator.add_city("Haren", "Haren", 2755267, "NL", 19132)
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.build()

        report = disambiguator.get_collision_report("NL")

        # Report should mention HAR collision
        assert "HAR" in report
        assert "Hardenberg" in report
        assert "Harlingen" in report
        assert "Haren" in report

        # Amsterdam should NOT appear (no collision)
        assert "Amsterdam" not in report

        # Should show disambiguated codes
        assert "HARD" in report
        assert "HARL" in report
        assert "HARE" in report

    def test_collision_report_no_collisions(self):
        """Test collision report when there are no collisions."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.build()

        report = disambiguator.get_collision_report("NL")

        assert "No collisions found" in report

    def test_statistics_calculation(self):
        """Test get_statistics() returns correct metrics."""
        disambiguator = CityCodeDisambiguator()

        # Add cities with HAR collision
        disambiguator.add_city("Hardenberg", "Hardenberg", 2755251, "NL", 57913)
        disambiguator.add_city("Harlingen", "Harlingen", 2755249, "NL", 16234)
        disambiguator.add_city("Haren", "Haren", 2755267, "NL", 19132)

        # Add non-colliding cities
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.build()

        stats = disambiguator.get_statistics("NL")

        assert stats['country_code'] == "NL"
        assert stats['total_cities'] == 5
        assert stats['collision_groups'] == 1  # HAR group
        assert stats['cities_in_collisions'] == 3  # Hardenberg, Harlingen, Haren
        assert stats['code_3_final'] == 2  # Amsterdam, Rotterdam
        assert stats['code_4_final'] == 3  # Hardenberg, Harlingen, Haren
        assert stats['code_numeric_final'] == 0
        # 3/5 -- compared with a tolerance rather than exact float equality so
        # the check does not depend on how the ratio is computed internally.
        assert stats['collision_rate'] == pytest.approx(0.6)

    def test_case_insensitive_country_codes(self):
        """Country codes should be case-insensitive."""
        disambiguator = CityCodeDisambiguator()

        # Registered in lower case, looked up in every casing below.
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "nl", 872680)
        disambiguator.build()

        # All variants should work
        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"
        assert disambiguator.get_code("Amsterdam", "nl") == "AMS"
        assert disambiguator.get_code("Amsterdam", "Nl") == "AMS"

    def test_multi_country_disambiguation(self):
        """Test disambiguation across multiple countries."""
        disambiguator = CityCodeDisambiguator()

        # Same city name in different countries
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Amsterdam", "Amsterdam", 5110266, "US", 18355)
        disambiguator.build()

        # Each country should have independent codes
        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"
        assert disambiguator.get_code("Amsterdam", "US") == "AMS"

    def test_accent_handling_in_city_names(self):
        """Test that accents are properly normalized in code generation."""
        disambiguator = CityCodeDisambiguator()

        # City names with accents (looked up below by the accented name)
        disambiguator.add_city("São Paulo", "Sao Paulo", 3448439, "BR", 12252023)
        disambiguator.add_city("Zürich", "Zurich", 2657896, "CH", 415367)
        disambiguator.build()

        assert disambiguator.get_code("São Paulo", "BR") == "SAO"
        assert disambiguator.get_code("Zürich", "CH") == "ZUR"

    def test_rebuild_clears_previous_state(self):
        """Calling build() multiple times should work correctly."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.build()

        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"

        # Add more cities and rebuild
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.build()

        # Both should be accessible
        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"
        assert disambiguator.get_code("Rotterdam", "NL") == "ROT"

    def test_real_world_dutch_collision_examples(self):
        """
        Test a real Dutch collision where 4 letters are not enough.

        Veldhoven and Velden share both the 3-letter code (VEL) and the
        4-letter code (VELD), so a numeric suffix is required. Only this
        pair is exercised here.
        """
        disambiguator = CityCodeDisambiguator()

        # VEL collision group
        disambiguator.add_city("Veldhoven", "Veldhoven", 2745340, "NL", 45531)
        disambiguator.add_city("Velden", "Velden", 2745349, "NL", 3852)
        disambiguator.build()

        # Larger city keeps the bare 4-letter code; the smaller one is numbered
        assert disambiguator.get_code("Veldhoven", "NL") == "VELD"
        assert disambiguator.get_code("Velden", "NL") == "VELD1"  # Smaller population

    def test_empty_disambiguator(self):
        """Test behavior with no cities added."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.build()

        stats = disambiguator.get_statistics("NL")

        assert stats['total_cities'] == 0
        assert stats['collision_groups'] == 0
        assert stats['collision_rate'] == 0.0
class TestCityCodeEntry:
    """Checks on constructing CityCodeEntry instances."""

    def test_entry_creation(self):
        """Every keyword passed to the constructor reads back unchanged."""
        expected_fields = {
            "city_name": "Amsterdam",
            "ascii_name": "Amsterdam",
            "geonames_id": 2759794,
            "country_code": "NL",
            "code_3": "AMS",
            "code_4": "AMST",
            "final_code": "AMS",
            "population": 872680,
        }

        entry = CityCodeEntry(**expected_fields)

        # One attribute comparison per constructor keyword.
        for field_name, expected_value in expected_fields.items():
            assert getattr(entry, field_name) == expected_value

    def test_entry_optional_population(self):
        """Leaving out ``population`` yields an entry whose population is None."""
        without_population = dict(
            city_name="Test",
            ascii_name="Test",
            geonames_id=123,
            country_code="XX",
            code_3="TES",
            code_4="TEST",
            final_code="TES",
        )

        entry = CityCodeEntry(**without_population)

        assert entry.population is None