- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
494 lines
18 KiB
Python
494 lines
18 KiB
Python
"""Comprehensive tests for GeoNames lookup functionality."""
|
|
|
|
import pytest
|
|
from pathlib import Path
|
|
|
|
from glam_extractor.geocoding.geonames_lookup import (
|
|
GeoNamesDB,
|
|
CityInfo,
|
|
)
|
|
|
|
|
|
class TestCityInfo:
    """Exercise ``CityInfo.get_abbreviation`` on hand-built records."""

    @staticmethod
    def _abbreviation_of(**fields):
        """Build a ``CityInfo`` from ``fields`` and return its abbreviation."""
        record = CityInfo(**fields)
        return record.get_abbreviation()

    def test_get_abbreviation_simple(self):
        """A plain single-word name (Amsterdam) is expected to yield "AMS"."""
        abbreviation = self._abbreviation_of(
            geonames_id=2759794,
            name="Amsterdam",
            ascii_name="Amsterdam",
            country_code="NL",
            admin1_code="NH",
            admin1_name="North Holland",
            admin2_code=None,
            latitude=52.37403,
            longitude=4.88969,
            feature_code="PPLC",
            population=741636,
            elevation=13,
            timezone="Europe/Amsterdam",
        )
        assert abbreviation == "AMS"

    def test_get_abbreviation_with_space(self):
        """A name containing a space (The Hague) is expected to yield "THE"."""
        abbreviation = self._abbreviation_of(
            geonames_id=2747373,
            name="The Hague",
            ascii_name="The Hague",
            country_code="NL",
            admin1_code="ZH",
            admin1_name="South Holland",
            admin2_code=None,
            latitude=52.07667,
            longitude=4.29861,
            feature_code="PPLA",
            population=474292,
            elevation=1,
            timezone="Europe/Amsterdam",
        )
        assert abbreviation == "THE"

    def test_get_abbreviation_with_apostrophe_and_hyphen(self):
        """Punctuation in 's-Hertogenbosch must not appear in the abbreviation."""
        abbreviation = self._abbreviation_of(
            geonames_id=2747351,
            name="'s-Hertogenbosch",
            ascii_name="'s-Hertogenbosch",
            country_code="NL",
            admin1_code="NB",
            admin1_name="North Brabant",
            admin2_code=None,
            latitude=51.69917,
            longitude=5.30417,
            feature_code="PPLA",
            population=134520,
            elevation=7,
            timezone="Europe/Amsterdam",
        )
        # The apostrophe and hyphen are skipped, leaving the letters S-H-E.
        assert abbreviation == "SHE"

    def test_get_abbreviation_accented_chars(self):
        """An accented name (São Paulo) is expected to yield the ASCII "SAO"."""
        abbreviation = self._abbreviation_of(
            geonames_id=3448439,
            name="São Paulo",
            ascii_name="Sao Paulo",
            country_code="BR",
            admin1_code="27",
            admin1_name="São Paulo",
            admin2_code=None,
            latitude=-23.5475,
            longitude=-46.63611,
            feature_code="PPLA",
            population=10021295,
            elevation=760,
            timezone="America/Sao_Paulo",
        )
        assert abbreviation == "SAO"
|
|
|
|
|
|
class TestGeoNamesDB:
    """Test the GeoNamesDB database interface (initialisation and city lookup)."""

    @pytest.fixture
    def db(self):
        """Yield a GeoNamesDB instance and close it once the test finishes."""
        # Uses default path (data/reference/geonames.db)
        database = GeoNamesDB()
        yield database
        # Teardown: release the connection explicitly instead of leaving one
        # open instance per test for the garbage collector to clean up.
        database.close()

    def test_init_default_path(self, db):
        """Test database initialization with default path."""
        assert db.conn is not None
        assert db.db_path.exists()

    def test_init_invalid_path(self):
        """Test database initialization with invalid path raises error."""
        with pytest.raises(FileNotFoundError):
            GeoNamesDB(db_path=Path("/nonexistent/path.db"))

    def test_lookup_city_simple(self, db):
        """Test basic city lookup."""
        city = db.lookup_city("Amsterdam", "NL")
        assert city is not None
        assert city.name == "Amsterdam"
        assert city.country_code == "NL"
        assert city.admin1_code == "07"  # GeoNames numeric code for North Holland
        assert city.admin1_name == "North Holland"
        assert city.geonames_id == 2759794

    def test_lookup_city_case_insensitive(self, db):
        """Test city lookup is case-insensitive."""
        city1 = db.lookup_city("Amsterdam", "NL")
        city2 = db.lookup_city("amsterdam", "NL")
        city3 = db.lookup_city("AMSTERDAM", "NL")

        assert city1 is not None
        assert city2 is not None
        assert city3 is not None
        assert city1.geonames_id == city2.geonames_id == city3.geonames_id

    def test_lookup_city_not_found(self, db):
        """Test lookup of non-existent city returns None."""
        city = db.lookup_city("NonExistentCity", "NL")
        assert city is None

    def test_lookup_city_wrong_country(self, db):
        """Test lookup with wrong country code returns None."""
        city = db.lookup_city("Amsterdam", "FR")
        assert city is None

    def test_admin1_name_from_city(self, db):
        """Test that admin1_name is populated from city lookup."""
        city = db.lookup_city("Amsterdam", "NL")
        assert city is not None
        assert city.admin1_name == "North Holland"

    def test_admin1_name_multiple_cities(self, db):
        """Test admin1_name for multiple cities in different provinces."""
        cities = [
            ("Amsterdam", "North Holland"),
            ("Rotterdam", "South Holland"),
            ("Utrecht", "Utrecht"),
        ]
        for city_name, expected_province in cities:
            city = db.lookup_city(city_name, "NL")
            # Name the failing city so a failure inside the loop is
            # attributable without re-running under a debugger.
            assert city is not None, f"{city_name} not found"
            assert city.admin1_name == expected_province, (
                f"{city_name}: expected name {expected_province}, "
                f"got {city.admin1_name}"
            )
|
|
|
|
|
|
class TestGeoNamesLookup:
    """Test GeoNamesDB high-level interface."""

    @pytest.fixture
    def lookup(self):
        """Yield a GeoNamesDB instance and close it once the test finishes."""
        database = GeoNamesDB()
        yield database
        # Teardown: release the connection explicitly rather than leaving it
        # to garbage collection.
        database.close()

    def test_dutch_cities(self, lookup):
        """Test lookups for major Dutch cities."""
        cities = {
            "Amsterdam": "AMS",
            "Rotterdam": "ROT",
            "The Hague": "THE",
            "Utrecht": "UTR",
            "Eindhoven": "EIN",
            "Groningen": "GRO",
            "Tilburg": "TIL",
            "Almere Stad": "ALM",  # Almere is "Almere Stad" in GeoNames
            "Breda": "BRE",
            "Nijmegen": "NIJ",
        }

        for city_name, expected_abbr in cities.items():
            city = lookup.lookup_city(city_name, "NL")
            assert city is not None, f"{city_name} not found"
            # Compute once so the value checked and the value reported match.
            actual_abbr = city.get_abbreviation()
            assert actual_abbr == expected_abbr, \
                f"{city_name}: expected {expected_abbr}, got {actual_abbr}"

    def test_dutch_aliases(self, lookup):
        """Test Dutch city name aliases."""
        # Den Haag → The Hague
        city = lookup.lookup_city("Den Haag", "NL")
        assert city is not None
        assert city.get_abbreviation() == "THE"
        assert city.name == "The Hague"

        # Den Bosch → 's-Hertogenbosch
        city = lookup.lookup_city("Den Bosch", "NL")
        assert city is not None
        assert city.get_abbreviation() == "SHE"
        assert city.name == "'s-Hertogenbosch"

    def test_global_cities(self, lookup):
        """Test lookups for major global cities."""
        cities = {
            ("Paris", "FR"): "PAR",
            ("London", "GB"): "LON",
            ("Tokyo", "JP"): "TOK",
            ("New York", "US"): "NEW",
            ("Berlin", "DE"): "BER",
            ("Madrid", "ES"): "MAD",
            ("Rome", "IT"): "ROM",
            ("Rio de Janeiro", "BR"): "RIO",
            ("Sydney", "AU"): "SYD",
            ("Toronto", "CA"): "TOR",
        }

        for (city_name, country), expected_abbr in cities.items():
            city = lookup.lookup_city(city_name, country)
            assert city is not None, f"{city_name}, {country} not found"
            # Compute once so the value checked and the value reported match.
            actual_abbr = city.get_abbreviation()
            assert actual_abbr == expected_abbr, \
                f"{city_name}: expected {expected_abbr}, got {actual_abbr}"

    def test_cities_with_special_characters(self, lookup):
        """Test cities with apostrophes, hyphens, and accents."""
        # 's-Hertogenbosch (NL)
        city = lookup.lookup_city("'s-Hertogenbosch", "NL")
        assert city is not None
        assert city.get_abbreviation() == "SHE"

        # São Paulo (BR)
        city = lookup.lookup_city("Sao Paulo", "BR")
        assert city is not None
        assert city.get_abbreviation() == "SAO"

    def test_cities_with_parentheticals(self, lookup):
        """Test city names with parenthetical notes."""
        # Dutch dataset often has "(Ov.)" etc. - should be stripped
        city = lookup.lookup_city("Zwolle (Ov.)", "NL")
        assert city is not None
        assert city.name == "Zwolle"

        city = lookup.lookup_city("Groningen (Gr.)", "NL")
        assert city is not None
        assert city.name == "Groningen"

    def test_whitespace_normalization(self, lookup):
        """Test that leading/trailing whitespace is handled."""
        city1 = lookup.lookup_city(" Amsterdam ", "NL")
        city2 = lookup.lookup_city("Amsterdam", "NL")

        assert city1 is not None
        assert city2 is not None
        assert city1.geonames_id == city2.geonames_id

    def test_province_code_lookups(self, lookup):
        """Test admin1 code and name lookups (GeoNames numeric codes)."""
        provinces = {
            # GeoNames uses numeric codes, not ISO 3166-2
            ("Amsterdam", "NL"): ("07", "North Holland"),
            ("Rotterdam", "NL"): ("11", "South Holland"),
            ("The Hague", "NL"): ("11", "South Holland"),
            ("Utrecht", "NL"): ("09", "Utrecht"),
            ("Groningen", "NL"): ("04", "Groningen"),
            ("Maastricht", "NL"): ("05", "Limburg"),
        }

        for (city_name, country), (expected_code, expected_name) in provinces.items():
            city = lookup.lookup_city(city_name, country)
            assert city is not None, f"{city_name} not found"
            assert city.admin1_code == expected_code, \
                f"{city_name}: expected code {expected_code}, got {city.admin1_code}"
            assert city.admin1_name == expected_name, \
                f"{city_name}: expected name {expected_name}, got {city.admin1_name}"

    def test_caching(self, lookup):
        """Test that repeated lookups are cached."""
        # First lookup
        city1 = lookup.lookup_city("Amsterdam", "NL")

        # Second lookup (should be cached)
        city2 = lookup.lookup_city("Amsterdam", "NL")

        # Should return same object (identity check)
        assert city1 is city2
|
|
|
|
|
|
class TestEdgeCases:
    """Test edge cases and known issues."""

    @pytest.fixture
    def lookup(self):
        """Yield a GeoNamesDB instance and close it once the test finishes."""
        database = GeoNamesDB()
        yield database
        # Teardown: release the connection explicitly rather than leaving it
        # to garbage collection.
        database.close()

    def test_missing_dutch_cities(self, lookup):
        """Test the 6 known missing Dutch cities."""
        # These cities are not in GeoNames (1.6% of ISIL registry)
        missing_cities = [
            "Avereest",  # Typo or very small locality
            "IJsselsein",  # Typo: should be IJsselstein
            "Kralendijk",  # Bonaire (Caribbean, BQ not NL)
            "Selingen",  # Tiny village
            "s-Heerenberg",  # Should have apostrophe
            "St. Annaparochie",  # Missing from GeoNames
        ]

        for city_name in missing_cities:
            city = lookup.lookup_city(city_name, "NL")
            assert city is None, f"{city_name} should not be found"

    def test_alternative_spellings(self, lookup):
        """Test that we handle common alternative spellings."""
        # 's-Hertogenbosch variations
        variations = [
            "'s-Hertogenbosch",
            "s-Hertogenbosch",
            "'s Hertogenbosch",
            "Den Bosch",
        ]

        for variant in variations:
            city = lookup.lookup_city(variant, "NL")
            assert city is not None, f"{variant} should be found"
            assert city.get_abbreviation() == "SHE"

    def test_bonaire_caribbean_territory(self, lookup):
        """Test that Bonaire cities use BQ country code, not NL."""
        # Kralendijk is in Bonaire (BQ), not Netherlands (NL)
        city = lookup.lookup_city("Kralendijk", "BQ")
        # This might still be None if BQ cities aren't in GeoNames
        # Just documenting the correct country code
        # Compare against None explicitly (as the other tests do) so the
        # check does not depend on how the result object defines truthiness.
        if city is not None:
            assert city.country_code == "BQ"
|
|
|
|
|
|
class TestSearchAndQuery:
    """Test search and query methods."""

    @pytest.fixture
    def db(self):
        """Yield a GeoNamesDB instance and close it once the test finishes."""
        database = GeoNamesDB()
        yield database
        # Teardown: release the connection explicitly rather than leaving it
        # to garbage collection.
        database.close()

    def test_search_cities_with_pattern(self, db):
        """Test search_cities with wildcard pattern."""
        # Search for Amsterdam*
        results = db.search_cities("Amsterdam%", "NL", limit=5)
        assert len(results) > 0
        assert all(city.name.startswith("Amsterdam") for city in results)
        assert all(city.country_code == "NL" for city in results)

    def test_search_cities_without_country(self, db):
        """Test search_cities across all countries."""
        results = db.search_cities("Paris%", limit=10)
        assert len(results) > 0
        # Should find Paris in France and possibly other countries
        assert any(city.country_code == "FR" for city in results)

    def test_search_cities_ordered_by_population(self, db):
        """Test that search results are ordered by population."""
        results = db.search_cities("Amsterdam%", "NL", limit=5)
        if len(results) > 1:
            # Check that populations are in descending order
            populations = [city.population for city in results if city.population]
            assert populations == sorted(populations, reverse=True)

    def test_get_cities_by_country(self, db):
        """Test get_cities_by_country for Netherlands."""
        cities = db.get_cities_by_country("NL", min_population=100000, limit=20)
        assert len(cities) > 0
        assert all(city.country_code == "NL" for city in cities)
        assert all(city.population is None or city.population >= 100000 for city in cities)

    def test_get_cities_by_country_ordered(self, db):
        """Test that get_cities_by_country returns cities ordered by population."""
        cities = db.get_cities_by_country("NL", min_population=0, limit=10)
        assert len(cities) > 0
        # Check descending population order (ignoring None values)
        populations = [city.population for city in cities if city.population]
        assert populations == sorted(populations, reverse=True)

    def test_get_city_abbreviation_convenience(self, db):
        """Test get_city_abbreviation convenience method."""
        # Test standard 3-letter codes (without disambiguation)
        abbr = db.get_city_abbreviation("Amsterdam", "NL", use_disambiguation=False)
        assert abbr == "AMS"

        abbr = db.get_city_abbreviation("Tokyo", "JP", use_disambiguation=False)
        assert abbr == "TOK"

    def test_get_city_abbreviation_not_found(self, db):
        """Test get_city_abbreviation returns None for missing cities."""
        abbr = db.get_city_abbreviation("NonExistentCity", "XX")
        assert abbr is None

    def test_get_stats(self, db):
        """Test database statistics retrieval."""
        stats = db.get_stats()

        # Check expected keys
        assert 'total_cities' in stats
        assert 'total_countries' in stats
        assert 'top_countries' in stats
        assert 'db_path' in stats
        assert 'db_size_mb' in stats

        # Check reasonable values
        assert stats['total_cities'] > 1000000  # Should have millions of cities
        assert stats['total_countries'] > 200  # GeoNames covers 247 countries
        assert len(stats['top_countries']) == 10
        assert stats['db_size_mb'] > 0

        # Top countries should be tuples of (code, count)
        for country_code, count in stats['top_countries']:
            assert isinstance(country_code, str)
            assert isinstance(count, int)
            assert len(country_code) == 2  # ISO alpha-2
            assert count > 0
|
|
|
|
|
|
class TestPerformance:
    """Test performance characteristics."""

    @pytest.fixture
    def lookup(self):
        """Yield a GeoNamesDB instance and close it once the test finishes."""
        database = GeoNamesDB()
        yield database
        # Teardown: release the connection explicitly rather than leaving it
        # to garbage collection.
        database.close()

    def test_batch_lookups_fast(self, lookup):
        """Test that batch lookups are fast (thanks to caching)."""
        import time

        cities = [
            ("Amsterdam", "NL"),
            ("Rotterdam", "NL"),
            ("The Hague", "NL"),
            ("Utrecht", "NL"),
            ("Eindhoven", "NL"),
        ] * 10  # 50 lookups total

        # perf_counter() is a monotonic, high-resolution clock; time.time()
        # follows the wall clock and can jump while the loop is running.
        start = time.perf_counter()
        for city_name, country in cities:
            lookup.lookup_city(city_name, country)
        elapsed = time.perf_counter() - start

        # 50 lookups should take < 100ms (with caching, most are instant)
        assert elapsed < 0.1, f"Batch lookups took {elapsed:.3f}s (expected < 0.1s)"

    def test_unique_lookups_reasonable(self, lookup):
        """Test that even uncached lookups are reasonably fast."""
        import time

        # 20 unique Dutch cities (uncached)
        cities = [
            "Amsterdam", "Rotterdam", "The Hague", "Utrecht", "Eindhoven",
            "Tilburg", "Groningen", "Almere", "Breda", "Nijmegen",
            "Enschede", "Haarlem", "Arnhem", "Zaanstad", "Amersfoort",
            "Apeldoorn", "Hoofddorp", "Maastricht", "Leiden", "Dordrecht",
        ]

        # perf_counter() is a monotonic, high-resolution clock; time.time()
        # follows the wall clock and can jump while the loop is running.
        start = time.perf_counter()
        for city_name in cities:
            lookup.lookup_city(city_name, "NL")
        elapsed = time.perf_counter() - start

        # 20 unique lookups should take < 200ms from SQLite
        assert elapsed < 0.2, f"20 unique lookups took {elapsed:.3f}s (expected < 0.2s)"
|
|
|
|
|
|
class TestDatabaseLifecycle:
    """Check that a GeoNamesDB connection can be opened and released."""

    def test_close_connection(self):
        """Open a database, confirm it has a connection, then close it."""
        database = GeoNamesDB()
        assert database.conn is not None

        # Explicit shutdown. The closed state itself is not asserted here:
        # observing it would mean provoking an exception from the closed
        # connection.
        database.close()

    def test_context_manager_style(self):
        """Use a database for one lookup, then drop the reference to it."""
        database = GeoNamesDB()
        amsterdam = database.lookup_city("Amsterdam", "NL")
        assert amsterdam is not None

        # Dropping the only reference is expected to trigger __del__ and
        # close the connection.
        del database
|