- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
500 lines
17 KiB
Python
500 lines
17 KiB
Python
"""
|
|
GeoNames database lookup for global city data.
|
|
|
|
Provides fast, offline lookups of city information from the GeoNames database.
|
|
Used for GHCID generation and geographic standardization across 247 countries.
|
|
|
|
Includes city code disambiguation to resolve collisions (e.g., Hardenberg, Harlingen, Haren → HAR).
|
|
"""
|
|
|
|
import sqlite3
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import List, Optional, TYPE_CHECKING
|
|
from functools import lru_cache
|
|
|
|
if TYPE_CHECKING:
|
|
from glam_extractor.geocoding.city_code_disambiguation import CityCodeDisambiguator
|
|
|
|
from glam_extractor.geocoding.city_code_disambiguation import CityCodeDisambiguator
|
|
|
|
|
|
@dataclass
class CityInfo:
    """Information about a city from GeoNames.

    The attributes mirror the columns of the ``cities`` table that
    ``GeoNamesDB`` reads rows from.
    """

    geonames_id: int
    name: str
    ascii_name: str
    country_code: str
    admin1_code: Optional[str]
    admin1_name: Optional[str]
    admin2_code: Optional[str]
    latitude: float
    longitude: float
    feature_code: str
    population: Optional[int]
    elevation: Optional[int]
    timezone: Optional[str]

    def get_abbreviation(self) -> str:
        """
        Get 3-letter city abbreviation for GHCID.

        The code is derived from ``ascii_name``: combining accent marks are
        removed, non-alphanumeric characters (apostrophes, hyphens, spaces,
        etc.) are skipped, and the result is uppercased.

        Multi-word names that start with an article are handled specially:

        - "The X": the article itself is the code ("THE").
        - la / el / los / las / le / o / a: first letter of the article plus
          the first 2 alphanumeric characters of the rest of the name.

        Examples:
            Amsterdam → AMS
            Rotterdam → ROT
            The Hague → THE
            's-Hertogenbosch → SHE (skips ' and -)
            Sao Paulo → SAO
            La Serena → LSE (L from La + SE from Serena)
            El Paso → EPA (E from El + PA from Paso)

        Returns:
            Uppercase code of up to 3 characters. It is shorter only when the
            name contains fewer than 3 usable characters (empty for an empty
            name).
        """
        import unicodedata

        # Decompose accented characters and drop the combining marks
        # (category 'Mn'), so that e.g. "São" becomes "Sao".
        decomposed = unicodedata.normalize('NFD', self.ascii_name)
        normalized = ''.join(
            c for c in decomposed if unicodedata.category(c) != 'Mn'
        )

        words = normalized.split()
        articles = ('la', 'el', 'los', 'las', 'le', 'the', 'o', 'a')

        if len(words) > 1 and words[0].lower() in articles:
            # "The Hague" → "THE" (not "THA"): the article is the intuitive code.
            if words[0].lower() == 'the':
                return words[0][:3].upper()

            # Other articles: first letter of the article + the first two
            # alphanumeric characters that follow it. Punctuation is skipped
            # here as well, so it can never leak into the code, and later
            # words are used when the second word alone is too short.
            remainder = ''.join(c for c in ''.join(words[1:]) if c.isalnum())
            return (words[0][0] + remainder[:2]).upper()

        # Default: first three alphanumeric characters of the whole name.
        cleaned = ''.join(c for c in normalized if c.isalnum())
        return cleaned[:3].upper()
|
|
|
|
|
|
class GeoNamesDB:
    """
    Fast lookup interface to GeoNames SQLite database.

    Provides city lookups by name and country for global GLAM institutions.
    Instances own one SQLite connection; release it with ``close()`` or by
    using the instance as a context manager.

    Example:
        >>> db = GeoNamesDB()
        >>> city = db.lookup_city("Amsterdam", "NL")
        >>> print(city.get_abbreviation())
        AMS
        >>> print(city.geonames_id)
        2759794
    """

    # Dutch city name mappings (Dutch → GeoNames English)
    DUTCH_CITY_ALIASES = {
        "Den Haag": "The Hague",
        "Den Bosch": "'s-Hertogenbosch",
        "'s Hertogenbosch": "'s-Hertogenbosch",
        "s-Hertogenbosch": "'s-Hertogenbosch",
    }

    # Maximum number of memoized lookups kept per instance.
    _LOOKUP_CACHE_SIZE = 10000

    # WHERE fragments tried in order by the city lookup: exact name,
    # ASCII name, then case-insensitive name. These are fixed constants;
    # user-supplied values are always passed as bound parameters.
    _LOOKUP_PREDICATES = (
        "name = ?",
        "ascii_name = ?",
        "LOWER(name) = LOWER(?)",
    )

    def __init__(self, db_path: Optional[Path] = None, enable_disambiguation: bool = True):
        """
        Initialize GeoNames database connection.

        Args:
            db_path: Path to geonames.db SQLite file (a ``Path`` or a plain
                     string). Defaults to data/reference/geonames.db
            enable_disambiguation: If True, build city code disambiguation tables
                                   for collision resolution (default: True)

        Raises:
            FileNotFoundError: If the database file does not exist.
        """
        if db_path is None:
            # Default path relative to project root
            project_root = Path(__file__).parent.parent.parent.parent
            db_path = project_root / "data" / "reference" / "geonames.db"
        else:
            # Accept plain strings as well as Path objects.
            db_path = Path(db_path)

        if not db_path.exists():
            raise FileNotFoundError(
                f"GeoNames database not found at {db_path}. "
                "Run scripts/build_geonames_db.py to create it."
            )

        self.db_path = db_path
        self.conn = sqlite3.connect(str(db_path))
        self.conn.row_factory = sqlite3.Row

        # Initialize disambiguation (tables are built lazily per country)
        self.enable_disambiguation = enable_disambiguation
        self._disambiguator: Optional['CityCodeDisambiguator'] = None
        self._disambiguation_built: set = set()  # Countries with built disambiguation

        # Per-instance memo for _lookup_city_raw. Keeping it on the instance
        # (instead of a class-level lru_cache keyed on self) lets the
        # instance and its connection be released normally.
        self._lookup_cache: dict = {}

    def __enter__(self):
        """Return the instance so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving the ``with`` block."""
        self.close()
        return False  # never suppress exceptions

    def __del__(self):
        """Close database connection."""
        # 'conn' is missing when __init__ raised before connecting.
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()

    @staticmethod
    def _normalize_city_name(city_name: str, country_code: str) -> str:
        """
        Normalize city name for lookup.

        Handles:
        - Trailing/leading whitespace
        - Parenthetical clarifications like "(Ov.)" or "(Groningen)"
        - Dutch city name aliases (Den Haag → The Hague)

        Args:
            city_name: Raw city name from data
            country_code: ISO country code (case-insensitive)

        Returns:
            Normalized city name
        """
        normalized = city_name.strip()

        # Remove parenthetical clarifications: "Hengelo (Ov.)" → "Hengelo"
        if '(' in normalized:
            normalized = normalized.split('(')[0].strip()

        # Apply country-specific aliases. The comparison is case-insensitive
        # because the queries uppercase the country code too, so "nl" and
        # "NL" must resolve to the same city.
        if (country_code or "").upper() == "NL":
            normalized = GeoNamesDB.DUTCH_CITY_ALIASES.get(normalized, normalized)

        return normalized

    def lookup_city(
        self,
        city_name: str,
        country_code: str,
        prefer_larger: bool = True
    ) -> Optional["CityInfo"]:
        """
        Look up a city by name and country.

        Args:
            city_name: Name of the city (e.g., "Amsterdam", "Rio de Janeiro")
            country_code: ISO 3166-1 alpha-2 country code (e.g., "NL", "BR")
            prefer_larger: If multiple matches, prefer city with larger population.
                NOTE(review): the query always orders by population
                descending, so this flag currently has no effect.

        Returns:
            CityInfo if found, None otherwise

        Example:
            >>> db = GeoNamesDB()
            >>> city = db.lookup_city("Rio de Janeiro", "BR")
            >>> print(f"{city.name}: {city.latitude}, {city.longitude}")
            Rio de Janeiro: -22.90278, -43.2075
        """
        # Normalize city name (strip whitespace, handle aliases)
        normalized_name = self._normalize_city_name(city_name, country_code)

        # Uppercase up front so "nl" and "NL" share one cache entry.
        return self._lookup_city_raw(normalized_name, country_code.upper(), prefer_larger)

    def _lookup_city_raw(
        self,
        city_name: str,
        country_code: str,
        prefer_larger: bool = True
    ) -> Optional["CityInfo"]:
        """
        Internal cached lookup method (after normalization).

        Results (including misses) are memoized per instance in a bounded,
        least-recently-used dictionary of at most ``_LOOKUP_CACHE_SIZE``
        entries.
        """
        # setdefault keeps this working for instances created without __init__.
        cache = self.__dict__.setdefault('_lookup_cache', {})
        key = (city_name, country_code, prefer_larger)

        if key in cache:
            # Re-insert to mark the entry as most recently used
            # (dicts preserve insertion order).
            cache[key] = cache.pop(key)
            return cache[key]

        result = self._query_city(city_name, country_code)

        if len(cache) >= self._LOOKUP_CACHE_SIZE:
            # Evict the least recently used entry (first key in the dict).
            del cache[next(iter(cache))]
        cache[key] = result
        return result

    def _query_city(self, city_name: str, country_code: str) -> Optional["CityInfo"]:
        """Run the uncached lookup: exact name, then ASCII name, then case-insensitive."""
        cursor = self.conn.cursor()
        params = (city_name, country_code.upper())

        for predicate in self._LOOKUP_PREDICATES:
            cursor.execute(f"""
                SELECT * FROM cities
                WHERE {predicate} AND country_code = ?
                ORDER BY population DESC NULLS LAST
                LIMIT 1
            """, params)
            row = cursor.fetchone()
            if row:
                return self._row_to_cityinfo(row)

        return None

    def get_city_abbreviation(
        self,
        city_name: str,
        country_code: str,
        use_disambiguation: bool = True
    ) -> Optional[str]:
        """
        Get city abbreviation (3-4 letters, potentially with numeric suffix).

        If disambiguation is enabled, uses collision-free codes.
        Otherwise, returns standard 3-letter codes (may have collisions).

        Args:
            city_name: Name of the city
            country_code: ISO 3166-1 alpha-2 country code
            use_disambiguation: If True, use disambiguation table (default: True)

        Returns:
            City abbreviation (e.g., "AMS", "HARD", "HARL") or None

        Example:
            >>> db = GeoNamesDB()
            >>> db.get_city_abbreviation("Tokyo", "JP")
            'TOK'
            >>> db.get_city_abbreviation("Hardenberg", "NL")
            'HARD'  # Disambiguated from Harlingen (HARL) and Haren (HARE)
        """
        # Try disambiguation first if enabled
        if use_disambiguation and self.enable_disambiguation:
            # Ensure disambiguation is built for this country
            self._ensure_disambiguation_built(country_code)

            if self._disambiguator:
                # The table is filled with GeoNames names under the uppercased
                # country code, so query with the normalized name first
                # ("Den Haag" → "The Hague") and keep the raw name as a
                # fallback for anything that matched before.
                normalized_name = self._normalize_city_name(city_name, country_code)
                for candidate in dict.fromkeys((normalized_name, city_name)):
                    disambiguated_code = self._disambiguator.get_code(
                        candidate, country_code.upper()
                    )
                    if disambiguated_code:
                        return disambiguated_code

        # Fall back to standard 3-letter code
        city = self.lookup_city(city_name, country_code)
        return city.get_abbreviation() if city else None

    def _ensure_disambiguation_built(self, country_code: str):
        """
        Ensure disambiguation table is built for a country.

        Lazily builds disambiguation on first request for a country.
        Does nothing when disambiguation is disabled.

        Args:
            country_code: ISO 3166-1 alpha-2 country code
        """
        if not self.enable_disambiguation:
            return

        country_code = country_code.upper()

        # Already built for this country?
        if country_code in self._disambiguation_built:
            return

        # Initialize disambiguator if needed
        if self._disambiguator is None:
            self._disambiguator = CityCodeDisambiguator()

        # Get all cities in the country (up to 100000 rows)
        cities = self.get_cities_by_country(country_code, min_population=0, limit=100000)

        # Add to disambiguator
        for city in cities:
            self._disambiguator.add_city(
                city_name=city.name,
                ascii_name=city.ascii_name,
                geonames_id=city.geonames_id,
                country_code=country_code,
                population=city.population
            )

        # Build disambiguation table
        self._disambiguator.build()

        # Mark as built
        self._disambiguation_built.add(country_code)

    def get_disambiguation_report(self, country_code: str) -> str:
        """
        Get disambiguation report for a country.

        Shows collision groups and how they were resolved.

        Args:
            country_code: ISO 3166-1 alpha-2 country code

        Returns:
            Formatted report string, or a fixed message when disambiguation
            is not enabled

        Example:
            >>> db = GeoNamesDB()
            >>> print(db.get_disambiguation_report("NL"))
        """
        self._ensure_disambiguation_built(country_code)

        if not self._disambiguator:
            return "Disambiguation is not enabled"

        # Uppercase to match the key the table was built under.
        return self._disambiguator.get_collision_report(country_code.upper())

    def get_disambiguation_statistics(self, country_code: str) -> dict:
        """
        Get disambiguation statistics for a country.

        Args:
            country_code: ISO 3166-1 alpha-2 country code

        Returns:
            Dictionary with statistics, or ``{'error': ...}`` when
            disambiguation is not enabled

        Example:
            >>> db = GeoNamesDB()
            >>> stats = db.get_disambiguation_statistics("NL")
            >>> print(f"Collision rate: {stats['collision_rate']:.1%}")
        """
        self._ensure_disambiguation_built(country_code)

        if not self._disambiguator:
            return {'error': 'Disambiguation is not enabled'}

        # Uppercase to match the key the table was built under.
        return self._disambiguator.get_statistics(country_code.upper())

    def search_cities(
        self,
        name_pattern: str,
        country_code: Optional[str] = None,
        limit: int = 10
    ) -> List["CityInfo"]:
        """
        Search for cities matching a name pattern.

        Args:
            name_pattern: City name or pattern (supports % wildcards)
            country_code: Optional country filter
            limit: Maximum results to return

        Returns:
            List of matching cities, ordered by population

        Example:
            >>> db = GeoNamesDB()
            >>> cities = db.search_cities("Amsterdam%", "NL")
            >>> for city in cities:
            ...     print(f"{city.name} (pop: {city.population})")
            Amsterdam (pop: 741636)
            Amsterdam-Zuidoost (pop: 86916)
        """
        cursor = self.conn.cursor()

        if country_code:
            cursor.execute("""
                SELECT * FROM cities
                WHERE name LIKE ? AND country_code = ?
                ORDER BY population DESC NULLS LAST
                LIMIT ?
            """, (name_pattern, country_code.upper(), limit))
        else:
            cursor.execute("""
                SELECT * FROM cities
                WHERE name LIKE ?
                ORDER BY population DESC NULLS LAST
                LIMIT ?
            """, (name_pattern, limit))

        return [self._row_to_cityinfo(row) for row in cursor.fetchall()]

    def get_cities_by_country(
        self,
        country_code: str,
        min_population: int = 0,
        limit: int = 100
    ) -> List["CityInfo"]:
        """
        Get all cities in a country.

        Rows whose population is NULL are always included, regardless of
        ``min_population``.

        Args:
            country_code: ISO 3166-1 alpha-2 country code
            min_population: Minimum population threshold
            limit: Maximum results to return

        Returns:
            List of cities, ordered by population descending

        Example:
            >>> db = GeoNamesDB()
            >>> cities = db.get_cities_by_country("NL", min_population=100000)
            >>> print(f"Found {len(cities)} major Dutch cities")
        """
        cursor = self.conn.cursor()

        cursor.execute("""
            SELECT * FROM cities
            WHERE country_code = ? AND (population >= ? OR population IS NULL)
            ORDER BY population DESC NULLS LAST
            LIMIT ?
        """, (country_code.upper(), min_population, limit))

        return [self._row_to_cityinfo(row) for row in cursor.fetchall()]

    def get_stats(self) -> dict:
        """
        Get database statistics.

        Returns:
            Dictionary with total_cities, total_countries, top_countries
            (up to 10 ``(country_code, count)`` pairs, largest first),
            db_path and db_size_mb.
        """
        cursor = self.conn.cursor()

        cursor.execute("SELECT COUNT(*) FROM cities")
        total_cities = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(DISTINCT country_code) FROM cities")
        total_countries = cursor.fetchone()[0]

        cursor.execute("SELECT country_code, COUNT(*) as count FROM cities GROUP BY country_code ORDER BY count DESC LIMIT 10")
        top_countries = [(row[0], row[1]) for row in cursor.fetchall()]

        return {
            'total_cities': total_cities,
            'total_countries': total_countries,
            'top_countries': top_countries,
            'db_path': str(self.db_path),
            'db_size_mb': self.db_path.stat().st_size / 1024 / 1024
        }

    def _row_to_cityinfo(self, row: sqlite3.Row) -> "CityInfo":
        """Convert database row to CityInfo object."""
        return CityInfo(
            geonames_id=row['geonames_id'],
            name=row['name'],
            ascii_name=row['ascii_name'],
            country_code=row['country_code'],
            admin1_code=row['admin1_code'],
            admin1_name=row['admin1_name'],
            admin2_code=row['admin2_code'],
            latitude=row['latitude'],
            longitude=row['longitude'],
            feature_code=row['feature_code'],
            population=row['population'],
            elevation=row['elevation'],
            timezone=row['timezone']
        )

    def close(self):
        """Explicitly close the database connection."""
        self.conn.close()
|