- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
342 lines
13 KiB
Python
342 lines
13 KiB
Python
"""
|
|
Tests for city code disambiguation module.
|
|
|
|
Tests the CityCodeDisambiguator class which resolves conflicts when multiple
|
|
cities generate identical 3-letter codes.
|
|
"""
|
|
|
|
import pytest
|
|
from glam_extractor.geocoding.city_code_disambiguation import (
|
|
CityCodeDisambiguator,
|
|
CityCodeEntry
|
|
)
|
|
|
|
|
|
class TestCityCodeDisambiguator:
    """Test the CityCodeDisambiguator class.

    Each test builds its own disambiguator, registers cities through
    ``add_city(name, ascii_name, geonames_id, country_code, population)``,
    calls ``build()`` and then checks the codes or reports it produces.
    """

    def test_basic_3_letter_code_generation(self):
        """Test basic 3-letter code extraction."""
        disambiguator = CityCodeDisambiguator()

        # Test internal method
        assert disambiguator._get_3_letter_code("Amsterdam") == "AMS"
        assert disambiguator._get_3_letter_code("Rotterdam") == "ROT"
        assert disambiguator._get_3_letter_code("Den Haag") == "DEN"

    def test_basic_4_letter_code_generation(self):
        """Test 4-letter code extraction."""
        disambiguator = CityCodeDisambiguator()

        assert disambiguator._get_4_letter_code("Hardenberg") == "HARD"
        assert disambiguator._get_4_letter_code("Harlingen") == "HARL"
        assert disambiguator._get_4_letter_code("Haren") == "HARE"

    def test_no_collision_uses_3_letter_code(self):
        """Cities without collisions should use 3-letter codes."""
        disambiguator = CityCodeDisambiguator()

        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.build()

        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"
        assert disambiguator.get_code("Rotterdam", "NL") == "ROT"

    def test_hardenberg_harlingen_haren_collision(self):
        """
        Test the real-world collision: Hardenberg, Harlingen, Haren.

        All three generate 'HAR' with 3 letters.
        Should be disambiguated to HARD, HARL, HARE (4 letters).
        """
        disambiguator = CityCodeDisambiguator()

        # Add the three colliding cities (real GeoNames data)
        disambiguator.add_city("Hardenberg", "Hardenberg", 2755251, "NL", 57913)
        disambiguator.add_city("Harlingen", "Harlingen", 2755249, "NL", 16234)
        disambiguator.add_city("Haren", "Haren", 2755267, "NL", 19132)

        disambiguator.build()

        # All should get 4-letter codes (no numeric suffix needed)
        assert disambiguator.get_code("Hardenberg", "NL") == "HARD"
        assert disambiguator.get_code("Harlingen", "NL") == "HARL"
        assert disambiguator.get_code("Haren", "NL") == "HARE"

    def test_collision_with_numeric_suffix(self):
        """
        Test collision that requires numeric suffix.

        If two cities have identical 4-letter codes, add numeric suffix
        based on population ranking.
        """
        disambiguator = CityCodeDisambiguator()

        # Create synthetic collision: both start with "TEST"
        disambiguator.add_city("Testa", "Testa", 1001, "XX", 100000)  # Larger
        disambiguator.add_city("Testb", "Testb", 1002, "XX", 50000)  # Smaller

        disambiguator.build()

        # Larger city gets base code, smaller gets numeric suffix
        assert disambiguator.get_code("Testa", "XX") == "TEST"
        assert disambiguator.get_code("Testb", "XX") == "TEST1"

    def test_population_ranking_for_numeric_suffix(self):
        """Test that numeric suffixes are assigned by population (descending)."""
        disambiguator = CityCodeDisambiguator()

        # Three cities with identical 4-letter codes
        disambiguator.add_city("TestCity1", "TestCity1", 1001, "XX", 200000)
        disambiguator.add_city("TestCity2", "TestCity2", 1002, "XX", 150000)
        disambiguator.add_city("TestCity3", "TestCity3", 1003, "XX", 100000)

        disambiguator.build()

        # Largest should get base code
        assert disambiguator.get_code("TestCity1", "XX") == "TEST"
        assert disambiguator.get_code("TestCity2", "XX") == "TEST1"
        assert disambiguator.get_code("TestCity3", "XX") == "TEST2"

    def test_none_population_goes_last(self):
        """Cities with None population should be ranked last."""
        disambiguator = CityCodeDisambiguator()

        # The city without a population is registered first on purpose, so
        # insertion order alone cannot produce the expected ranking.
        disambiguator.add_city("TestCity1", "TestCity1", 1001, "XX", None)
        disambiguator.add_city("TestCity2", "TestCity2", 1002, "XX", 100000)

        disambiguator.build()

        # City with population gets base code
        assert disambiguator.get_code("TestCity2", "XX") == "TEST"
        assert disambiguator.get_code("TestCity1", "XX") == "TEST1"

    def test_get_code_before_build_raises_error(self):
        """Calling get_code() before build() should raise ValueError."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)

        # Raw string keeps the regex escapes readable; the pattern is unchanged.
        with pytest.raises(ValueError, match=r"Must call build\(\) before get_code"):
            disambiguator.get_code("Amsterdam", "NL")

    def test_get_code_for_unknown_city_returns_none(self):
        """get_code() should return None for cities not in table."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.build()

        assert disambiguator.get_code("Unknown", "NL") is None

    def test_get_all_codes_for_country(self):
        """Test get_all_codes() returns all city codes for a country."""
        disambiguator = CityCodeDisambiguator()

        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.add_city("Paris", "Paris", 2988507, "FR", 2165423)

        disambiguator.build()

        nl_codes = disambiguator.get_all_codes("NL")

        assert nl_codes == {
            "Amsterdam": "AMS",
            "Rotterdam": "ROT",
        }

        # French city should not appear
        assert "Paris" not in nl_codes

    def test_collision_report_generation(self):
        """Test get_collision_report() generates readable output."""
        disambiguator = CityCodeDisambiguator()

        disambiguator.add_city("Hardenberg", "Hardenberg", 2755251, "NL", 57913)
        disambiguator.add_city("Harlingen", "Harlingen", 2755249, "NL", 16234)
        disambiguator.add_city("Haren", "Haren", 2755267, "NL", 19132)
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)

        disambiguator.build()

        report = disambiguator.get_collision_report("NL")

        # Report should mention HAR collision
        assert "HAR" in report
        assert "Hardenberg" in report
        assert "Harlingen" in report
        assert "Haren" in report

        # Amsterdam should NOT appear (no collision)
        assert "Amsterdam" not in report

        # Should show disambiguated codes
        assert "HARD" in report
        assert "HARL" in report
        assert "HARE" in report

    def test_collision_report_no_collisions(self):
        """Test collision report when there are no collisions."""
        disambiguator = CityCodeDisambiguator()

        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)

        disambiguator.build()

        report = disambiguator.get_collision_report("NL")

        assert "No collisions found" in report

    def test_statistics_calculation(self):
        """Test get_statistics() returns correct metrics."""
        disambiguator = CityCodeDisambiguator()

        # Add cities with HAR collision
        disambiguator.add_city("Hardenberg", "Hardenberg", 2755251, "NL", 57913)
        disambiguator.add_city("Harlingen", "Harlingen", 2755249, "NL", 16234)
        disambiguator.add_city("Haren", "Haren", 2755267, "NL", 19132)

        # Add non-colliding cities
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)

        disambiguator.build()

        stats = disambiguator.get_statistics("NL")

        assert stats['country_code'] == "NL"
        assert stats['total_cities'] == 5
        assert stats['collision_groups'] == 1  # HAR group
        assert stats['cities_in_collisions'] == 3  # Hardenberg, Harlingen, Haren
        assert stats['code_3_final'] == 2  # Amsterdam, Rotterdam
        assert stats['code_4_final'] == 3  # Hardenberg, Harlingen, Haren
        assert stats['code_numeric_final'] == 0
        # The rate is a float ratio (3/5); compare with a tolerance so the
        # test does not depend on how the implementation computes or rounds it.
        assert stats['collision_rate'] == pytest.approx(0.6)

    def test_case_insensitive_country_codes(self):
        """Country codes should be case-insensitive."""
        disambiguator = CityCodeDisambiguator()

        # Registered in lower case, looked up in every casing below.
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "nl", 872680)
        disambiguator.build()

        # All variants should work
        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"
        assert disambiguator.get_code("Amsterdam", "nl") == "AMS"
        assert disambiguator.get_code("Amsterdam", "Nl") == "AMS"

    def test_multi_country_disambiguation(self):
        """Test disambiguation across multiple countries."""
        disambiguator = CityCodeDisambiguator()

        # Same city name in different countries
        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.add_city("Amsterdam", "Amsterdam", 5110266, "US", 18355)

        disambiguator.build()

        # Each country should have independent codes
        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"
        assert disambiguator.get_code("Amsterdam", "US") == "AMS"

    def test_accent_handling_in_city_names(self):
        """Test that accents are properly normalized in code generation."""
        disambiguator = CityCodeDisambiguator()

        # City names with accents
        disambiguator.add_city("São Paulo", "Sao Paulo", 3448439, "BR", 12252023)
        disambiguator.add_city("Zürich", "Zurich", 2657896, "CH", 415367)

        disambiguator.build()

        # Lookups use the accented display name; codes are plain ASCII.
        assert disambiguator.get_code("São Paulo", "BR") == "SAO"
        assert disambiguator.get_code("Zürich", "CH") == "ZUR"

    def test_rebuild_clears_previous_state(self):
        """Calling build() multiple times should work correctly."""
        disambiguator = CityCodeDisambiguator()

        disambiguator.add_city("Amsterdam", "Amsterdam", 2759794, "NL", 872680)
        disambiguator.build()

        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"

        # Add more cities and rebuild
        disambiguator.add_city("Rotterdam", "Rotterdam", 2747891, "NL", 651446)
        disambiguator.build()

        # Both should be accessible
        assert disambiguator.get_code("Amsterdam", "NL") == "AMS"
        assert disambiguator.get_code("Rotterdam", "NL") == "ROT"

    def test_real_world_dutch_collision_examples(self):
        """
        Test a further real Dutch city collision from the dataset.

        Based on the collision report, which lists cities such as
        Nieuwerkerk (multiple instances) and Veldhoven/Velden. Only the
        Veldhoven/Velden pair is exercised here.

        Both names start with "VELD", so the 4-letter code cannot tell them
        apart and a population-ranked numeric suffix is expected.
        """
        disambiguator = CityCodeDisambiguator()

        # VEL collision group
        disambiguator.add_city("Veldhoven", "Veldhoven", 2745340, "NL", 45531)
        disambiguator.add_city("Velden", "Velden", 2745349, "NL", 3852)

        disambiguator.build()

        # Larger city keeps the 4-letter base code; the smaller one gets a
        # numeric suffix because its 4-letter code is identical.
        assert disambiguator.get_code("Veldhoven", "NL") == "VELD"
        assert disambiguator.get_code("Velden", "NL") == "VELD1"  # Smaller population

    def test_empty_disambiguator(self):
        """Test behavior with no cities added."""
        disambiguator = CityCodeDisambiguator()
        disambiguator.build()

        stats = disambiguator.get_statistics("NL")

        assert stats['total_cities'] == 0
        assert stats['collision_groups'] == 0
        assert stats['collision_rate'] == 0.0
|
|
|
|
|
|
class TestCityCodeEntry:
    """Checks for the CityCodeEntry dataclass."""

    def test_entry_creation(self):
        """Every constructor argument is readable back as an attribute."""
        expected_fields = {
            "city_name": "Amsterdam",
            "ascii_name": "Amsterdam",
            "geonames_id": 2759794,
            "country_code": "NL",
            "code_3": "AMS",
            "code_4": "AMST",
            "final_code": "AMS",
            "population": 872680,
        }

        amsterdam = CityCodeEntry(**expected_fields)

        # Verify the fields in the same order they were supplied.
        for attribute, value in expected_fields.items():
            assert getattr(amsterdam, attribute) == value

    def test_entry_optional_population(self):
        """Omitting population leaves it as None."""
        required_only = {
            "city_name": "Test",
            "ascii_name": "Test",
            "geonames_id": 123,
            "country_code": "XX",
            "code_3": "TES",
            "code_4": "TEST",
            "final_code": "TES",
        }

        placeholder = CityCodeEntry(**required_only)

        assert placeholder.population is None
|