Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
185 lines
5.7 KiB
Python
185 lines
5.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Belgian city enrichment v2 - with city name aliases.
|
|
"""
|
|
|
|
import re
|
|
import sqlite3
|
|
import unicodedata
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Belgian city aliases (Dutch names → GeoNames names)
|
|
BELGIAN_CITY_ALIASES = {
|
|
'sint-lambrechts-woluwe': 'Woluwe-Saint-Lambert',
|
|
'sint-pieters-woluwe': 'Woluwe-Saint-Pierre',
|
|
'sint-stevens-woluwe': 'Sint-Stevens-Woluwe',
|
|
'oostende': 'Ostend',
|
|
'gent': 'Gent',
|
|
'brugge': 'Brugge',
|
|
'brussel': 'Brussels',
|
|
'antwerpen': 'Antwerpen',
|
|
'luik': 'Liège',
|
|
'liège': 'Liège',
|
|
'leuven': 'Leuven',
|
|
'mechelen': 'Mechelen',
|
|
'aalst': 'Aalst',
|
|
'hasselt': 'Hasselt',
|
|
'kortrijk': 'Kortrijk',
|
|
'sint-niklaas': 'Sint-Niklaas',
|
|
'genk': 'Genk',
|
|
'roeselare': 'Roeselare',
|
|
# Merged municipalities (2019)
|
|
'kluisbergen': 'Kluisbergen',
|
|
'lievegem': 'Nevele', # Lievegem was created from Nevele, Waarschoot, Zomergem, Lovendegem
|
|
'kruisem': 'Kruishoutem', # Kruisem was created from Kruishoutem and Zingem
|
|
'lierde': 'Sint-Maria-Lierde',
|
|
'maarkedal': 'Etikhove', # Maarkedal includes Etikhove
|
|
# Other
|
|
'de haan': 'De Haan',
|
|
'lint': 'Lint',
|
|
'herne': 'Herne',
|
|
}
|
|
|
|
# GeoNames admin1 region name → ISO 3166-2:BE region code.
# Both GeoNames spellings of the Brussels region share the same code.
BELGIAN_ADMIN1_MAP = {
    **dict.fromkeys(('Brussels Capital', 'Brussels'), 'BRU'),
    'Flanders': 'VLG',
    'Wallonia': 'WAL',
}
|
|
|
|
def normalize_city_name(name):
    """Return a lookup key for *name*, or ``None`` when *name* is falsy.

    The key is the lower-cased name with combining marks (accents) removed
    and surrounding whitespace trimmed.
    """
    if name:
        # NFD splits each accented character into base letter + combining
        # mark (category "Mn"), so dropping the marks strips the accents.
        decomposed = unicodedata.normalize('NFD', name.lower())
        base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
        return ''.join(base_chars).strip()
    return None
|
|
|
|
def _city_row_to_dict(row):
    """Convert one ``cities`` result row into the dict returned by lookup_city()."""
    keys = (
        'name',
        'ascii_name',
        'admin1_name',
        'latitude',
        'longitude',
        'geonames_id',
        'population',
    )
    return dict(zip(keys, row))


def _escape_like(value):
    """Escape LIKE wildcards so *value* is matched literally (used with ESCAPE '\\')."""
    return value.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')


def lookup_city(city_name, conn):
    """Look up a Belgian city in the GeoNames ``cities`` table, with alias support.

    Args:
        city_name: City name as found in the source data (any case, may
            carry accents or surrounding whitespace).
        conn: Open ``sqlite3`` connection to a database that has a ``cities``
            table with the columns selected below.

    Returns:
        A dict with the keys ``name``, ``ascii_name``, ``admin1_name``,
        ``latitude``, ``longitude``, ``geonames_id`` and ``population`` for
        the most populous match, or ``None`` when the name is empty/blank or
        nothing matches.

    An exact (case-insensitive) match is tried first; a substring match is
    the fallback.
    """
    normalized = normalize_city_name(city_name)
    if not normalized:
        # None, '' or whitespace-only. A blank name must not reach the LIKE
        # fallback, where '% %' would match any city containing a space.
        return None

    # Check alias first; otherwise search for the name as given (trimmed).
    lookup_name = BELGIAN_CITY_ALIASES.get(normalized, city_name.strip())

    # SQLite's LOWER() only folds ASCII, so "LIÈGE" would match neither
    # ``name`` nor ``ascii_name``. The accent-stripped form is therefore
    # also compared against ``ascii_name``.
    folded_name = normalize_city_name(lookup_name)

    cursor = conn.cursor()
    try:
        # Try exact match
        cursor.execute("""
            SELECT name, ascii_name, admin1_name, latitude, longitude, geonames_id, population
            FROM cities
            WHERE country_code='BE'
              AND (LOWER(name)=LOWER(?) OR LOWER(ascii_name)=LOWER(?) OR LOWER(ascii_name)=?)
            ORDER BY population DESC LIMIT 1
        """, (lookup_name, lookup_name, folded_name))
        row = cursor.fetchone()
        if row:
            return _city_row_to_dict(row)

        # Try partial match; wildcards in the input are escaped so that only
        # the surrounding '%' act as wildcards.
        name_pattern = f"%{_escape_like(lookup_name)}%"
        folded_pattern = f"%{_escape_like(folded_name)}%"
        cursor.execute("""
            SELECT name, ascii_name, admin1_name, latitude, longitude, geonames_id, population
            FROM cities
            WHERE country_code='BE'
              AND (name LIKE ? ESCAPE '\\'
                   OR ascii_name LIKE ? ESCAPE '\\'
                   OR ascii_name LIKE ? ESCAPE '\\')
            ORDER BY population DESC LIMIT 1
        """, (name_pattern, name_pattern, folded_pattern))
        row = cursor.fetchone()
        if row:
            return _city_row_to_dict(row)

        return None
    finally:
        cursor.close()
|
|
|
|
def generate_city_code(city_name):
    """Generate an upper-case city code of up to three letters.

    Rules, applied after stripping accents and every character that is not
    an ASCII letter, whitespace or hyphen:

    * one word: its first three letters (hyphens ignored), e.g.
      ``'Gent'`` → ``'GEN'``;
    * leading article (de, het, le, la, ...): the article's initial plus the
      first two letters of the next word, e.g. ``'De Haan'`` → ``'DHA'``;
    * otherwise: the initials of the first three words, so a two-word name
      yields only two letters.

    Args:
        city_name: City name; may contain accents and punctuation.

    Returns:
        The code, or ``'XXX'`` when *city_name* is empty or contains no ASCII
        letters. NOTE(review): ``'XXX'`` is the unresolved-city placeholder
        this script looks for in file names and city values — confirm it is
        the desired fallback for callers of this function.
    """
    if not city_name:
        return 'XXX'

    normalized = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name)

    # Drop edge hyphens and words that consist only of hyphens, so a
    # separator can never be picked as an "initial".
    words = [w for w in (part.strip('-') for part in clean.split()) if w]
    if not words:
        # Previously this case raised IndexError on words[0].
        return 'XXX'

    articles = {'de', 'het', 'le', 'la', 'les', 'den', 'der', 'des'}

    if len(words) == 1:
        # Slice the word itself (not the untrimmed string) and skip hyphens
        # so the code consists of letters only.
        return words[0].replace('-', '')[:3].upper()
    if words[0].lower() in articles:
        return (words[0][0] + words[1].replace('-', '')[:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()
|
|
|
|
# Matches a "city:" or "city_name:" field and captures the rest of that line.
# The separator is limited to spaces/tabs: with "\s*" an empty value would
# let the match run over the newline and capture the following line instead.
_CITY_FIELD_RE = re.compile(r'city(?:_name)?:[ \t]*([^\n]+)')


def _extract_city_name(content):
    """Return the first usable city value in *content*, or ``None``.

    Surrounding whitespace and quotes are removed; an empty value or the
    placeholder ``XXX`` counts as "no city".
    """
    match = _CITY_FIELD_RE.search(content)
    if not match:
        return None
    city_name = match.group(1).strip().strip('"\'')
    if not city_name or city_name == 'XXX':
        return None
    return city_name


def main():
    """Report which Belgian ``XXX`` custodian files can be resolved via GeoNames.

    Scans ``data/custodian`` (relative to the parent of this script's
    directory) for ``BE-*-XXX-*.yaml`` files, reads the city recorded in each
    file and looks it up in ``data/reference/geonames.db``. This is a dry run:
    matches are only printed and counted, no file is modified.

    Raises:
        SystemExit: If the GeoNames database file does not exist.
    """
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Belgian City Enrichment v2")
    print("=" * 50)

    # sqlite3.connect() silently creates an empty database when the file is
    # missing, which would only surface later as "no such table: cities".
    if not geonames_db.is_file():
        raise SystemExit(f"GeoNames database not found: {geonames_db}")

    conn = sqlite3.connect(str(geonames_db))
    try:
        # Find Belgian XXX files (sorted so the report order is reproducible)
        xxx_files = sorted(custodian_dir.glob('BE-*-XXX-*.yaml'))
        print(f"Found {len(xxx_files)} Belgian XXX files")

        updated = 0
        not_found = []

        for file_path in xxx_files:
            content = file_path.read_text(encoding='utf-8')

            # Check if there's city info in the file already
            city_name = _extract_city_name(content)
            if city_name is None:
                continue

            geo_data = lookup_city(city_name, conn)
            if geo_data:
                print(f"✓ {file_path.name}: {city_name} → {geo_data['name']}")
                updated += 1
                # Would update file here
            else:
                not_found.append((file_path.name, city_name))

        print(f"\nUpdated: {updated}")
        print(f"Not found: {len(not_found)}")
        if not_found:
            print("\nCities not found:")
            # Only the first 20 misses are listed to keep the report short.
            for fname, city in not_found[:20]:
                print(f" {fname}: {city}")
    finally:
        # Release the connection even if reading a file or a query fails.
        conn.close()


if __name__ == '__main__':
    main()
|