Enrichment scripts for country-specific city data: - enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py - enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py - enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py Location resolution utilities: - resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames - resolve_cities_wikidata.py - Use Wikidata P131 for city resolution - resolve_country_codes.py - Standardize country codes - resolve_cz_xx_regions.py - Fix Czech XX region codes - resolve_locations_by_name.py - Name-based location lookup - resolve_regions_from_city.py - Derive regions from city data - update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data CH-Annotator integration: - create_custodian_from_ch_annotator.py - Create custodians from annotations - add_ch_annotator_location_claims.py - Add location claims - extract_locations_ch_annotator.py - Extract locations from annotations Migration and fixes: - migrate_egyptian_from_ch.py - Migrate Egyptian data - migrate_web_archives.py - Migrate web archive data - fix_belgian_cities.py - Fix Belgian city data
465 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Belgian custodian files with city data from ISIL registry.
|
|
|
|
Strategy:
|
|
1. First try to get city from enriched source file (fast)
|
|
2. If not found, scrape the Belgian ISIL website (slow, 1 req/sec)
|
|
|
|
Usage:
|
|
python scripts/enrich_belgian_cities.py [--dry-run]
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import sys
|
|
import time
|
|
import urllib.request
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Belgian admin1 codes (GeoNames uses BRU, VLG, WAL)
# Currently an identity mapping; kept as a dict so non-GeoNames region codes
# could be translated here later without touching call sites.
BELGIAN_ADMIN1_MAP = {
    'BRU': 'BRU',  # Brussels Capital Region
    'VLG': 'VLG',  # Flanders (Vlaanderen)
    'WAL': 'WAL',  # Wallonia (Wallonië)
}

# Belgian city name aliases (Dutch/French variants)
# Maps spellings seen in the ISIL registry to the name looked up in the local
# GeoNames database (see lookup_city_in_geonames); identity entries pin the
# preferred canonical spelling.
BELGIAN_CITY_ALIASES = {
    'Brussel': 'Brussels',
    'Bruxelles': 'Brussels',
    'Antwerpen': 'Antwerpen',
    'Anvers': 'Antwerpen',
    'Gent': 'Gent',
    'Gand': 'Gent',
    'Luik': 'Liège',
    'Liege': 'Liège',
    'Bergen': 'Mons',
    'Namen': 'Namur',
    'Mechelen': 'Mechelen',
    'Malines': 'Mechelen',
    'Leuven': 'Leuven',
    'Louvain': 'Leuven',
    'Elsene': 'Ixelles',
    'Ukkel': 'Uccle',
    'Oudergem': 'Auderghem',
    'Watermaal-Bosvoorde': 'Watermael-Boitsfort',
    'Sint-Gillis': 'Saint-Gilles',
    'Sint-Jans-Molenbeek': 'Molenbeek-Saint-Jean',
    'Schaarbeek': 'Schaerbeek',
    'Etterbeek': 'Etterbeek',
    'Vorst': 'Forest',
    'Anderlecht': 'Anderlecht',
    'Jette': 'Jette',
    'Koekelberg': 'Koekelberg',
    'Evere': 'Evere',
    'Sint-Pieters-Woluwe': 'Woluwe-Saint-Pierre',
    'Sint-Lambrechts-Woluwe': 'Woluwe-Saint-Lambert',
    'Ganshoren': 'Ganshoren',
}
|
|
|
|
|
|
def load_isil_city_lookup(enriched_file: str) -> dict:
    """Load ISIL -> city mapping from enriched Belgian ISIL file.

    Records in the enriched dump each begin with an ``id: BE-...`` line;
    the text is split on those boundaries and the first city listed under a
    record's ``locations:`` section is kept.
    """
    with open(enriched_file, 'r', encoding='utf-8') as fh:
        text = fh.read()

    id_pattern = re.compile(r'^id: (BE-\w+)')
    city_pattern = re.compile(r'locations:\s*\n-\s*city:\s*(\S.*)')

    mapping = {}
    # Whatever precedes the first "id: BE-" line is a header chunk; drop it.
    for record in re.split(r'\n(?=id: BE-)', text)[1:]:
        id_found = id_pattern.search(record)
        if id_found is None:
            continue
        city_found = city_pattern.search(record)
        if city_found is not None:
            mapping[id_found.group(1)] = city_found.group(1).strip()

    return mapping
|
|
|
|
|
|
def load_isil_source_urls(enriched_file: str) -> dict:
    """Load ISIL -> source_url mapping for web scraping fallback.

    Only records that carry both an ``id: BE-...`` line and an
    ``source_url: https://isil.kbr.be/...`` line contribute an entry.
    """
    with open(enriched_file, 'r', encoding='utf-8') as fh:
        records = re.split(r'\n(?=id: BE-)', fh.read())[1:]  # drop header chunk

    matched = (
        (re.search(r'^id: (BE-\w+)', rec),
         re.search(r'source_url:\s*(https://isil\.kbr\.be/\S+)', rec))
        for rec in records
    )
    return {i.group(1): u.group(1) for i, u in matched if i and u}
|
|
|
|
|
|
def scrape_city_from_isil_website(url: str) -> str | None:
    """Scrape city from Belgian ISIL website.

    Fetches *url*, locates the address table cell after "Walk up adress"
    (NOTE(review): "adress" is presumably the site's own spelling — confirm
    against the live markup before "fixing" it), then pulls the city out of
    a ", <4-digit postcode> City" tail.  Returns the city name, or None when
    no address parses or the request fails; errors are printed, not raised.
    """
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'GLAM-Enricher/1.0'})
        with urllib.request.urlopen(req, timeout=10) as response:
            html = response.read().decode('utf-8')

        # Look for address pattern: "Street, POSTCODE City"
        # Belgian postal codes are 4 digits
        address_match = re.search(r'Walk up adress.*?<td class="output"[^>]*>([^<]+)</td>', html, re.DOTALL | re.IGNORECASE)
        if address_match:
            address = address_match.group(1)
            # Parse city from address: "Veldstraat 53, 9910 Knesselare"
            city_match = re.search(r',\s*(\d{4})\s+([A-Za-zÀ-ÿ\s\-\']+)', address)
            if city_match:
                city = city_match.group(2).strip()
                # Clean up trailing HTML entities
                city = re.sub(r'&\w+;.*$', '', city).strip()
                return city

        return None
    except Exception as e:
        # Best-effort scraper: report and move on so one bad page does not
        # abort the whole enrichment run.
        print(f" Error scraping {url}: {e}")
        return None
|
|
|
|
|
|
def generate_city_code(city_name: str) -> str:
    """Generate a 3-letter uppercase city code from a city name.

    The name is transliterated to ASCII (NFD decomposition, combining marks
    dropped), reduced to letters/whitespace/hyphens, and split on whitespace:

    * one word  -> first three letters            ("Gent" -> "GEN")
    * two words -> 1st initial + 2 of the 2nd     ("La Louviere" -> "LLO")
    * 3+ words  -> initials of the first three    ("Braine le Comte" -> "BLC")

    Returns ``'XXX'`` (the placeholder code) when no letters remain after
    cleaning — previously this raised IndexError on empty/numeric input.
    """
    import unicodedata

    # Strip diacritics: decompose, then drop combining marks (category 'Mn').
    normalized = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')

    # Keep only letters, whitespace and hyphens before splitting into words.
    clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name)
    words = clean.split()

    if not words:
        # Nothing usable in the name; fall back to the XXX placeholder.
        return 'XXX'
    if len(words) == 1:
        return words[0][:3].upper()
    if len(words) == 2:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()
|
|
|
|
|
|
def lookup_city_in_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None:
|
|
"""Look up city in GeoNames database."""
|
|
cursor = conn.cursor()
|
|
|
|
# Check aliases first
|
|
normalized_name = BELGIAN_CITY_ALIASES.get(city_name, city_name)
|
|
|
|
# Try exact match first
|
|
cursor.execute('''
|
|
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
|
|
FROM cities
|
|
WHERE country_code = 'BE'
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
|
|
ORDER BY population DESC
|
|
LIMIT 1
|
|
''', (normalized_name, normalized_name))
|
|
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return {
|
|
'name': row[0],
|
|
'ascii_name': row[1],
|
|
'admin1_code': row[2],
|
|
'admin1_name': row[3],
|
|
'latitude': row[4],
|
|
'longitude': row[5],
|
|
'geonames_id': row[6],
|
|
'population': row[7],
|
|
'feature_code': row[8],
|
|
}
|
|
|
|
# Try original name if alias was used
|
|
if normalized_name != city_name:
|
|
cursor.execute('''
|
|
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
|
|
FROM cities
|
|
WHERE country_code = 'BE'
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
|
|
ORDER BY population DESC
|
|
LIMIT 1
|
|
''', (city_name, city_name))
|
|
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return {
|
|
'name': row[0],
|
|
'ascii_name': row[1],
|
|
'admin1_code': row[2],
|
|
'admin1_name': row[3],
|
|
'latitude': row[4],
|
|
'longitude': row[5],
|
|
'geonames_id': row[6],
|
|
'population': row[7],
|
|
'feature_code': row[8],
|
|
}
|
|
|
|
# Try fuzzy match with LIKE
|
|
cursor.execute('''
|
|
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
|
|
FROM cities
|
|
WHERE country_code = 'BE'
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
AND (LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?))
|
|
ORDER BY population DESC
|
|
LIMIT 1
|
|
''', (f'{city_name}%', f'{city_name}%'))
|
|
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return {
|
|
'name': row[0],
|
|
'ascii_name': row[1],
|
|
'admin1_code': row[2],
|
|
'admin1_name': row[3],
|
|
'latitude': row[4],
|
|
'longitude': row[5],
|
|
'geonames_id': row[6],
|
|
'population': row[7],
|
|
'feature_code': row[8],
|
|
}
|
|
|
|
return None
|
|
|
|
|
|
def update_custodian_file(file_path: Path, city_name: str, geo_data: dict, dry_run: bool = False) -> bool:
    """Rewrite one custodian YAML file with resolved city data.

    Builds a new GHCID from the GeoNames result (region code + generated
    3-letter city code), replaces every occurrence of the old GHCID in the
    file text, rewrites the location_resolution block, inserts a new
    ghcid_history entry directly under the ``ghcid_history:`` key (newest
    first), and renames the file to match the new GHCID.

    Returns True when the file was (or, in dry-run mode, would be) changed;
    False when the file is malformed or the GHCID is already correct.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract current GHCID
    ghcid_match = re.search(r'ghcid_current:\s*(\S+)', content)
    if not ghcid_match:
        print(f" WARNING: No ghcid_current found in {file_path.name}")
        return False

    old_ghcid = ghcid_match.group(1)

    # Generate new GHCID components
    region_code = BELGIAN_ADMIN1_MAP.get(geo_data['admin1_code'], geo_data['admin1_code'])
    city_code = generate_city_code(geo_data['ascii_name'] or geo_data['name'])

    # Build new GHCID: BE-XX-XXX-{type}-{abbrev}[-suffix]
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        type_code = parts[3]
        # The abbrev/suffix tail may itself contain hyphens, so rejoin it.
        abbrev_and_suffix = '-'.join(parts[4:])
        new_ghcid = f"BE-{region_code}-{city_code}-{type_code}-{abbrev_and_suffix}"
    else:
        print(f" WARNING: Unexpected GHCID format: {old_ghcid}")
        return False

    if old_ghcid == new_ghcid:
        # City/region already resolved to the same code; nothing to do.
        return False

    # Calculate new filename
    old_filename = file_path.name
    new_filename = old_filename.replace(old_ghcid, new_ghcid)
    new_file_path = file_path.parent / new_filename

    # Update content (replaces *every* occurrence of the old GHCID,
    # including references inside existing history entries).
    new_content = content.replace(old_ghcid, new_ghcid)

    # Update location_resolution section: the regex captures the key line
    # plus all of its indented child lines, which are replaced wholesale.
    old_resolution = re.search(
        r'location_resolution:\s*\n((?:\s+\S.*\n)*)',
        new_content
    )

    if old_resolution:
        # NOTE(review): the indentation inside this YAML template was
        # reconstructed from a whitespace-mangled copy of the source —
        # confirm it matches the custodian files' actual layout.
        new_resolution = f"""location_resolution:
  country_code: BE
  region_code: {region_code}
  region_name: {geo_data['admin1_name']}
  city_code: {city_code}
  city_name: {geo_data['name']}
  geonames_id: {geo_data['geonames_id']}
  feature_code: {geo_data['feature_code']}
  latitude: {geo_data['latitude']}
  longitude: {geo_data['longitude']}
  method: BELGIAN_ISIL_REGISTRY
  resolution_date: '{datetime.now(timezone.utc).isoformat()}'
"""
        new_content = new_content[:old_resolution.start()] + new_resolution + new_content[old_resolution.end():]

    # Add GHCID history entry
    timestamp = datetime.now(timezone.utc).isoformat()
    history_entry = f""" - ghcid: {new_ghcid}
   valid_from: '{timestamp}'
   reason: City enrichment from Belgian ISIL registry - {city_name} resolved to {geo_data['name']} ({region_code})
"""

    history_match = re.search(r'ghcid_history:\s*\n', new_content)
    if history_match:
        insert_pos = history_match.end()
        new_content = new_content[:insert_pos] + history_entry + new_content[insert_pos:]

    if dry_run:
        print(f" DRY RUN: Would rename {old_filename} -> {new_filename}")
        print(f" GHCID: {old_ghcid} -> {new_ghcid}")
        return True

    # Write updated content
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(new_content)

    # Rename file
    if new_file_path != file_path:
        file_path.rename(new_file_path)

    return True
|
|
|
|
|
|
def main():
    """Enrich Belgian custodian files that still carry the XXX city placeholder.

    For each ``BE-*-XXX-*.yaml`` file under data/custodian: read its ISIL
    code, resolve the city via the enriched ISIL dump (falling back to
    scraping the Belgian ISIL website at <= 1 request/sec), look the city up
    in the local GeoNames SQLite database, then rewrite and rename the file.
    A markdown report is written to reports/ at the end (in dry-run mode too,
    flagged as such).  Pass ``--dry-run`` to preview changes.
    """
    dry_run = '--dry-run' in sys.argv

    # Paths (repository layout: this script lives in <repo>/scripts/)
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    enriched_file = base_dir / 'data' / 'instances' / 'belgium_isil_institutions_enriched.yaml'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Belgian City Enrichment Script")
    print("=" * 50)

    if dry_run:
        print("DRY RUN MODE - No changes will be made")

    # Load lookups
    print(f"\nLoading ISIL city lookup from {enriched_file.name}...")
    isil_city_lookup = load_isil_city_lookup(str(enriched_file))
    isil_url_lookup = load_isil_source_urls(str(enriched_file))
    print(f" Found {len(isil_city_lookup)} ISIL codes with city data")
    print(f" Found {len(isil_url_lookup)} ISIL codes with source URLs")

    # Connect to GeoNames
    print(f"\nConnecting to GeoNames database...")
    conn = sqlite3.connect(str(geonames_db))

    # Find Belgian XXX files
    print(f"\nFinding Belgian custodian files with XXX placeholder...")
    xxx_files = list(custodian_dir.glob('BE-*-XXX-*.yaml'))
    print(f" Found {len(xxx_files)} files to process")

    # Counters for the summary / report.
    updated = 0
    no_isil = 0
    no_city = 0
    no_geonames = 0
    scraped = 0
    errors = 0
    not_found_cities = []

    try:
        for file_path in xxx_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Find ISIL code
                isil_match = re.search(r'identifier_value:\s*(BE-\w+)', content)
                if not isil_match:
                    no_isil += 1
                    continue

                isil_code = isil_match.group(1)

                # Strategy 1: Look up city from enriched file (fast path)
                city_name = isil_city_lookup.get(isil_code)

                # Strategy 2: Scrape from website if not in lookup
                if not city_name and isil_code in isil_url_lookup:
                    url = isil_url_lookup[isil_code]
                    print(f" Scraping {isil_code} from {url}...")
                    city_name = scrape_city_from_isil_website(url)
                    if city_name:
                        scraped += 1
                        print(f" Found: {city_name}")
                    time.sleep(1)  # Rate limit: max 1 request/sec to the registry

                if not city_name:
                    no_city += 1
                    continue

                # Look up in GeoNames
                geo_data = lookup_city_in_geonames(city_name, conn)
                if not geo_data:
                    no_geonames += 1
                    not_found_cities.append((file_path.name, isil_code, city_name))
                    continue

                # Update file
                if update_custodian_file(file_path, city_name, geo_data, dry_run):
                    updated += 1
                    if not dry_run:
                        print(f" Updated: {file_path.name} -> {city_name} ({geo_data['admin1_code']})")

            except Exception as e:
                # Per-file guard: one bad file must not abort the whole run.
                errors += 1
                print(f" ERROR processing {file_path.name}: {e}")
    finally:
        # Close the DB even if an unexpected error escapes the loop.
        conn.close()

    # Summary
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Total XXX files: {len(xxx_files)}")
    print(f"Updated: {updated}")
    print(f"Scraped from website: {scraped}")
    print(f"No ISIL in file: {no_isil}")
    print(f"No city found: {no_city}")
    print(f"City not in GeoNames: {no_geonames}")
    print(f"Errors: {errors}")
    print(f"Remaining XXX: {len(xxx_files) - updated}")

    if not_found_cities:
        print(f"\nCities not found in GeoNames:")
        for fname, isil, city in not_found_cities[:20]:
            print(f" {isil}: {city}")
        if len(not_found_cities) > 20:
            print(f" ... and {len(not_found_cities) - 20} more")

    # Generate report; ensure the reports directory exists first so a fresh
    # checkout does not crash here after doing all the work above.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_path = base_dir / 'reports' / f'BELGIAN_CITY_ENRICHMENT_{timestamp}.md'
    report_path.parent.mkdir(parents=True, exist_ok=True)

    with open(report_path, 'w') as f:
        f.write(f"# Belgian City Enrichment Report\n\n")
        f.write(f"**Date**: {datetime.now().isoformat()}\n")
        f.write(f"**Dry Run**: {dry_run}\n\n")
        f.write(f"## Summary\n\n")
        f.write(f"| Metric | Count |\n")
        f.write(f"|--------|-------|\n")
        f.write(f"| Total XXX files | {len(xxx_files)} |\n")
        f.write(f"| Updated | {updated} |\n")
        f.write(f"| Scraped from website | {scraped} |\n")
        f.write(f"| No ISIL in file | {no_isil} |\n")
        f.write(f"| No city found | {no_city} |\n")
        f.write(f"| City not in GeoNames | {no_geonames} |\n")
        f.write(f"| Errors | {errors} |\n")
        f.write(f"| Remaining XXX | {len(xxx_files) - updated} |\n")

        if not_found_cities:
            f.write(f"\n## Cities Not Found in GeoNames\n\n")
            f.write(f"| File | ISIL | City |\n")
            f.write(f"|------|------|------|\n")
            for fname, isil, city in not_found_cities:
                f.write(f"| {fname} | {isil} | {city} |\n")

    print(f"\nReport written to: {report_path}")
|
|
|
|
|
|
# Script entry point: run the enrichment when executed directly.
if __name__ == '__main__':
    main()
|