glam/scripts/enrich_swiss_isil_cities.py
kempersc e45c1a3c85 feat(scripts): add city enrichment and location resolution utilities
Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
2025-12-07 14:26:59 +01:00

559 lines
20 KiB
Python

#!/usr/bin/env python3
"""
Enrich Swiss ISIL custodian files with city data from the Swiss ISIL website.
For Swiss custodian files with XXX city placeholder, this script:
1. Loads the source CH-Annotator file to get ISIL URLs by institution name
2. Fetches the institution page from isil.nb.admin.ch
3. Extracts city (Location) and address data
4. Reverse geocodes using GeoNames to get proper city code
5. Updates the GHCID with correct city code
6. Renames the file if GHCID changes
Usage:
python scripts/enrich_swiss_isil_cities.py [--dry-run] [--limit N]
"""
import argparse
import hashlib
import os
import re
import shutil
import sqlite3
import time
import uuid
import yaml
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple
# --- Paths -------------------------------------------------------------------
PROJECT_ROOT = Path(__file__).parent.parent
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
REPORTS_DIR = PROJECT_ROOT / "reports"
SWISS_CH_ANNOTATOR_FILE = PROJECT_ROOT / "data" / "instances" / "switzerland_isil_ch_annotator.yaml"

# GHCID namespace for deterministic UUIDv5 generation.
# NOTE(review): this value is byte-identical to RFC 4122's DNS namespace
# (uuid.NAMESPACE_DNS). Confirm it was chosen deliberately and not copied
# from an example — changing it would change every generated GHCID UUID.
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')

# Rate limiting: seconds to sleep between requests to isil.nb.admin.ch.
REQUEST_DELAY = 1.0

# Swiss canton name -> ISO 3166-2:CH code, covering English, German, French
# and Italian spellings. The original listed several names twice with the
# same value ('Fribourg', 'Lucerne', 'Ticino', 'Valais', 'Vaud'); those
# redundant duplicate keys were removed — the resulting mapping is unchanged.
SWISS_CANTON_CODES = {
    # English names
    'Aargau': 'AG', 'Appenzell Ausserrhoden': 'AR', 'Appenzell Innerrhoden': 'AI',
    'Basel-Landschaft': 'BL', 'Basel-Stadt': 'BS', 'Bern': 'BE', 'Fribourg': 'FR',
    'Geneva': 'GE', 'Glarus': 'GL', 'Graubünden': 'GR', 'Jura': 'JU', 'Lucerne': 'LU',
    'Neuchâtel': 'NE', 'Nidwalden': 'NW', 'Obwalden': 'OW', 'Schaffhausen': 'SH',
    'Schwyz': 'SZ', 'Solothurn': 'SO', 'St. Gallen': 'SG', 'Thurgau': 'TG',
    'Ticino': 'TI', 'Uri': 'UR', 'Valais': 'VS', 'Vaud': 'VD', 'Zug': 'ZG', 'Zürich': 'ZH',
    # German names (where they differ from the above)
    'Genf': 'GE', 'Luzern': 'LU', 'Neuenburg': 'NE', 'Wallis': 'VS', 'Waadt': 'VD',
    # French names (where they differ)
    'Genève': 'GE',
    # Italian names (where they differ)
    'Ginevra': 'GE', 'Grigioni': 'GR', 'Vallese': 'VS',
}
def load_swiss_isil_lookup() -> Dict[str, str]:
    """Build a mapping from institution name to its isil.nb.admin.ch URL.

    Reads the Swiss CH-Annotator source YAML; for every dict entry that has
    a name and lists an isil.nb.admin.ch URL among its digital_platforms,
    the first such URL is recorded under the institution's name. Returns an
    empty dict when the source file is missing or empty.
    """
    lookup: Dict[str, str] = {}
    if not SWISS_CH_ANNOTATOR_FILE.exists():
        print(f"Warning: Swiss CH-Annotator file not found: {SWISS_CH_ANNOTATOR_FILE}")
        return lookup
    print("Loading Swiss CH-Annotator source file...")
    with open(SWISS_CH_ANNOTATOR_FILE, 'r', encoding='utf-8') as fh:
        entries = yaml.safe_load(fh)
    if not entries:
        return lookup
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        name = entry.get('name', '')
        if not name:
            continue
        # First isil.nb.admin.ch platform URL wins for this institution.
        isil_url = next(
            (
                platform.get('platform_url', '')
                for platform in entry.get('digital_platforms', [])
                if isinstance(platform, dict)
                and 'isil.nb.admin.ch' in platform.get('platform_url', '')
            ),
            None,
        )
        if isil_url is not None:
            lookup[name] = isil_url
    print(f" Loaded {len(lookup)} institutions with ISIL URLs")
    return lookup
def generate_city_code(city_name: str) -> str:
    """Derive a short uppercase city code from a city name.

    Diacritics are stripped via NFD decomposition, and common French/German
    articles and prepositions are ignored. One significant word yields its
    first three letters; several words yield the initials of the first three.
    Returns the 'XXX' placeholder for an empty name. Note: single words
    shorter than three letters produce a code shorter than three characters.
    """
    import unicodedata
    if not city_name:
        return 'XXX'
    # Drop combining marks after decomposition (é -> e, ü -> u, ...).
    decomposed = unicodedata.normalize('NFD', city_name)
    plain = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # Filler words that should not contribute letters to the code.
    filler = {'de', 'la', 'le', 'les', 'du', 'des', 'von', 'am', 'im', 'an', 'der', 'die', 'das'}
    tokens = plain.split()
    significant = [t for t in tokens if t.lower() not in filler] or tokens
    if len(significant) == 1:
        # Single word: first three letters.
        return significant[0][:3].upper()
    # Multiple words: initials of up to three words.
    return ''.join(t[0] for t in significant[:3]).upper()
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the deterministic UUIDv5 (string form) for a GHCID string."""
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Return a UUIDv8-style identifier derived from SHA-256(ghcid_string).

    Uses the first 16 bytes of the SHA-256 digest, forcing the version
    nibble to 8 and the variant bits to the RFC 4122 pattern (10xx), so the
    output is a well-formed, deterministic UUID string.
    """
    raw = bytearray(hashlib.sha256(ghcid_string.encode('utf-8')).digest()[:16])
    raw[6] = (raw[6] & 0x0F) | 0x80  # version nibble -> 8
    raw[8] = (raw[8] & 0x3F) | 0x80  # variant bits -> 10xx (RFC 4122)
    return str(uuid.UUID(bytes=bytes(raw)))
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return a 64-bit numeric ID: the big-endian integer value of the
    first eight bytes of SHA-256(ghcid_string)."""
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], byteorder='big')
def fetch_isil_page(isil_url: str, session: requests.Session) -> Optional[Dict]:
    """Fetch a Swiss ISIL institution page and extract address fields.

    Walks the page's <dt>/<dd> definition pairs and returns a dict that may
    contain 'city', 'postal_code', 'street_address', 'canton' and 'region'
    (the canton mapped through SWISS_CANTON_CODES, falling back to the first
    two letters uppercased, or None for degenerate values). Returns None
    when no city was found or on any request/parse error.
    """
    # Straightforward <dt> label -> result key pairs; 'Canton' needs extras.
    plain_fields = {
        'Location': 'city',
        'Zip code': 'postal_code',
        'Street and number': 'street_address',
    }
    try:
        response = session.get(isil_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        extracted: Dict = {}
        for dt in soup.find_all('dt'):
            dd = dt.find_next_sibling('dd')
            if not dd:
                continue
            label = dt.get_text(strip=True)
            value = dd.get_text(strip=True)
            if label in plain_fields:
                extracted[plain_fields[label]] = value
            elif label == 'Canton':
                extracted['canton'] = value
                fallback = value[:2].upper() if len(value) >= 2 else None
                extracted['region'] = SWISS_CANTON_CODES.get(value, fallback)
        # A page without a Location entry is treated as a miss.
        return extracted if extracted.get('city') else None
    except Exception as e:
        print(f" Error fetching {isil_url}: {e}")
        return None
def reverse_geocode_city(city_name: str, region_code: str, country_code: str, db_path: Path) -> Optional[Dict]:
    """Look up a city in the local GeoNames SQLite database.

    Tries an exact (including case-insensitive) name match first — narrowed
    to the canton's GeoNames admin1 code when the canton is known — then
    falls back to a prefix (LIKE) match on either name column. Both queries
    restrict to populated-place feature codes and prefer the largest
    population.

    Args:
        city_name: City name as extracted from the ISIL page.
        region_code: ISO canton code (e.g. 'ZH') used to narrow the search.
        country_code: ISO country code ('CH' for this script).
        db_path: Path to the GeoNames SQLite database file.

    Returns:
        A dict with geonames_id, names, coordinates, population, feature
        code and admin1 data, or None when nothing matched or the lookup
        failed (errors are printed, not raised).
    """
    # GeoNames uses numeric admin1 codes for Swiss cantons.
    swiss_admin1_map = {
        'AG': '01', 'AR': '15', 'AI': '16', 'BL': '06', 'BS': '05',
        'BE': '02', 'FR': '04', 'GE': '07', 'GL': '08', 'GR': '03',
        'JU': '26', 'LU': '09', 'NE': '10', 'NW': '11', 'OW': '12',
        'SH': '14', 'SZ': '17', 'SO': '13', 'SG': '18', 'TG': '20',
        'TI': '21', 'UR': '19', 'VS': '22', 'VD': '23', 'ZG': '25', 'ZH': '24'
    }
    # Column names matching the SELECT list below, used to build the result.
    columns = ('geonames_id', 'geonames_name', 'ascii_name', 'latitude',
               'longitude', 'population', 'feature_code', 'admin1_code',
               'admin1_name')
    base_select = """
        SELECT geonames_id, name, ascii_name, latitude, longitude,
               population, feature_code, admin1_code, admin1_name
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
    """
    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            admin1_code = swiss_admin1_map.get(region_code)
            # Exact match first (optionally constrained to the canton).
            query = base_select + " AND (name = ? OR ascii_name = ? OR LOWER(name) = LOWER(?))"
            params = [country_code, city_name, city_name, city_name]
            if admin1_code:
                query += " AND admin1_code = ?"
                params.append(admin1_code)
            cursor.execute(query + " ORDER BY population DESC LIMIT 1", params)
            row = cursor.fetchone()
            if row is None:
                # Fuzzy fallback: prefix match on either name column.
                cursor.execute(
                    base_select
                    + " AND (name LIKE ? OR ascii_name LIKE ?)"
                    + " ORDER BY population DESC LIMIT 1",
                    (country_code, f"{city_name}%", f"{city_name}%"),
                )
                row = cursor.fetchone()
            return dict(zip(columns, row)) if row else None
        finally:
            # BUG FIX: the original leaked the connection whenever the exact
            # match succeeded — it returned before reaching conn.close().
            conn.close()
    except Exception as e:
        print(f" GeoNames lookup error: {e}")
        return None
def process_file(file_path: Path, session: requests.Session, isil_lookup: Dict[str, str], dry_run: bool = True) -> Dict:
    """Process a single custodian file.

    Loads the YAML custodian record, and — if its GHCID is Swiss with the
    '-XXX-' city placeholder — resolves the institution's ISIL page URL,
    scrapes the city, derives a city code, rewrites the GHCID (and its
    derived UUID/numeric forms, history, location_resolution and GHCID
    identifier), writes the file back, and renames it to match the new
    GHCID. With dry_run=True nothing is written or renamed.

    Returns a result dict whose 'status' is one of: unchanged / skipped /
    no_isil_url / no_city_found / would_update / updated / error (main()
    tallies these), plus old/new GHCID, city, optional error text, and
    'renamed_to' when the file was moved.
    """
    result = {
        'status': 'unchanged',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'error': None
    }
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        if not data:
            result['status'] = 'error'
            result['error'] = 'Empty file'
            return result
        # Check if this is a Swiss file with XXX city placeholder;
        # anything else is out of scope for this script.
        ghcid_current = data.get('ghcid', {}).get('ghcid_current', '')
        if not ghcid_current.startswith('CH-') or '-XXX-' not in ghcid_current:
            result['status'] = 'skipped'
            return result
        result['old_ghcid'] = ghcid_current
        # Get institution name for the ISIL lookup; fall back to the
        # custodian_name claim when original_entry has no name.
        inst_name = data.get('original_entry', {}).get('name', '')
        if not inst_name:
            inst_name = data.get('custodian_name', {}).get('claim_value', '')
        # Find ISIL URL - first try lookup by name in the CH-Annotator file.
        isil_url = isil_lookup.get(inst_name)
        # Then check ISIL-scheme identifiers recorded in this file.
        if not isil_url:
            identifiers = data.get('identifiers', [])
            for ident in identifiers:
                if isinstance(ident, dict) and ident.get('identifier_scheme') == 'ISIL':
                    url = ident.get('identifier_url', '')
                    if 'isil.nb.admin.ch' in url:
                        isil_url = url
                        break
        # Also check original_entry.identifiers as a last resort.
        if not isil_url:
            original_identifiers = data.get('original_entry', {}).get('identifiers', [])
            for ident in original_identifiers:
                if isinstance(ident, dict) and ident.get('identifier_scheme') == 'ISIL':
                    url = ident.get('identifier_url', '')
                    if 'isil.nb.admin.ch' in url:
                        isil_url = url
                        break
        if not isil_url:
            result['status'] = 'no_isil_url'
            result['error'] = f'No ISIL URL found for: {inst_name}'
            return result
        # Convert to proper page URL format: normalize any '?isil=' URL to
        # the English institution page so fetch_isil_page sees known labels.
        if '?isil=' in isil_url:
            isil_code = isil_url.split('?isil=')[-1]
            # Convert to institution page URL
            isil_url = f"https://www.isil.nb.admin.ch/en/?isil={isil_code}"
        # Fetch city data from ISIL website (rate-limited per request).
        time.sleep(REQUEST_DELAY)
        isil_data = fetch_isil_page(isil_url, session)
        if not isil_data or not isil_data.get('city'):
            result['status'] = 'no_city_found'
            return result
        city_name = isil_data['city']
        result['city'] = city_name
        # Get region from GHCID (second dash-separated part) or ISIL data.
        parts = ghcid_current.split('-')
        region_code = parts[1] if len(parts) > 1 else isil_data.get('region', 'XX')
        # Generate city code from the scraped city name.
        city_code = generate_city_code(city_name)
        # Try to get GeoNames data for coordinates (best-effort; may be None).
        geonames_data = reverse_geocode_city(city_name, region_code, 'CH', GEONAMES_DB)
        # Build new GHCID by substituting the placeholder segment only.
        # Format: CH-{region}-{city}-{type}-{abbrev}[-{suffix}]
        new_ghcid = ghcid_current.replace('-XXX-', f'-{city_code}-')
        result['new_ghcid'] = new_ghcid
        if new_ghcid == ghcid_current:
            # generate_city_code returned 'XXX' again — nothing to do.
            result['status'] = 'unchanged'
            return result
        if dry_run:
            result['status'] = 'would_update'
            return result
        # --- Mutation phase (live mode only) ---
        now = datetime.now(timezone.utc).isoformat()
        # Update GHCID and all derived identifier forms in lockstep.
        data['ghcid']['ghcid_current'] = new_ghcid
        data['ghcid']['ghcid_uuid'] = generate_ghcid_uuid(new_ghcid)
        data['ghcid']['ghcid_uuid_sha256'] = generate_ghcid_uuid_sha256(new_ghcid)
        data['ghcid']['ghcid_numeric'] = generate_ghcid_numeric(new_ghcid)
        # Record how this location was resolved, for provenance.
        location_resolution = {
            'method': 'SWISS_ISIL_ENRICHMENT',
            'city_name': city_name,
            'city_code': city_code,
            'region_code': region_code,
            'country_code': 'CH',
            'enrichment_date': now,
            'source_url': isil_url
        }
        if geonames_data:
            location_resolution.update({
                'geonames_id': geonames_data['geonames_id'],
                'geonames_name': geonames_data['geonames_name'],
                'feature_code': geonames_data['feature_code'],
                'population': geonames_data['population'],
                'latitude': geonames_data['latitude'],
                'longitude': geonames_data['longitude']
            })
        data['ghcid']['location_resolution'] = location_resolution
        # Add GHCID history entry (newest first).
        history = data['ghcid'].get('ghcid_history', [])
        if history:
            # Close previous entry before prepending the new one.
            history[0]['valid_to'] = now
        history.insert(0, {
            'ghcid': new_ghcid,
            'ghcid_numeric': data['ghcid']['ghcid_numeric'],
            'valid_from': now,
            'valid_to': None,
            'reason': f'City code updated from Swiss ISIL enrichment: {city_name} -> {city_code}'
        })
        data['ghcid']['ghcid_history'] = history
        # Update location in original_entry if it exists, filling in city
        # (and address details when scraped) only where city was missing.
        if 'locations' in data.get('original_entry', {}):
            for loc in data['original_entry']['locations']:
                if isinstance(loc, dict) and not loc.get('city'):
                    loc['city'] = city_name
                    if isil_data.get('postal_code'):
                        loc['postal_code'] = isil_data['postal_code']
                    if isil_data.get('street_address'):
                        loc['street_address'] = isil_data['street_address']
        # Update the GHCID-scheme identifier to the new value.
        for ident in data.get('identifiers', []):
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
                ident['identifier_value'] = new_ghcid
        # Write updated file in place.
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        # Rename file to match the new GHCID; skipped silently if the
        # target name already exists (file then keeps its old name).
        new_filename = f"{new_ghcid}.yaml"
        new_path = file_path.parent / new_filename
        if new_path != file_path and not new_path.exists():
            shutil.move(file_path, new_path)
            result['renamed_to'] = str(new_path.name)
        result['status'] = 'updated'
        return result
    except Exception as e:
        # Any unexpected failure is reported per-file rather than aborting
        # the whole batch run.
        result['status'] = 'error'
        result['error'] = str(e)
        return result
def main() -> None:
    """CLI entry point.

    Finds all Swiss custodian files whose GHCID carries the XXX city
    placeholder, enriches each via process_file(), prints a summary to
    stdout, and writes a timestamped markdown report to the reports
    directory. Supports --dry-run, --limit N and --verbose flags.
    """
    parser = argparse.ArgumentParser(description='Enrich Swiss ISIL custodian files with city data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show verbose output')
    args = parser.parse_args()
    print("=" * 60)
    print("SWISS ISIL CITY ENRICHMENT")
    print("=" * 60)
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
    # Find Swiss files with XXX city placeholder by filename pattern.
    swiss_xxx_files = list(CUSTODIAN_DIR.glob("CH-*-XXX-*.yaml"))
    if args.limit:
        swiss_xxx_files = swiss_xxx_files[:args.limit]
        print(f"Limited to {args.limit} files")
    print(f"Found {len(swiss_xxx_files)} Swiss files with XXX city placeholder")
    print()
    # Load Swiss ISIL lookup (institution name -> URL) from the
    # CH-Annotator source file.
    isil_lookup = load_swiss_isil_lookup()
    # Process files with a shared session (identifying User-Agent).
    session = requests.Session()
    session.headers['User-Agent'] = 'GLAMDataExtractor/1.0 (heritage-data-enrichment)'
    # Counters keyed by process_file() status values.
    stats = {
        'updated': 0,
        'would_update': 0,
        'unchanged': 0,
        'skipped': 0,
        'no_isil_url': 0,
        'no_city_found': 0,
        'error': 0
    }
    cities_found = {}  # city name -> occurrence count
    errors = []  # "filename: message" strings for the summary
    for i, file_path in enumerate(swiss_xxx_files, 1):
        # Progress line every 100 files (or per-file when verbose).
        if i % 100 == 0 or args.verbose:
            print(f"Progress: {i}/{len(swiss_xxx_files)}")
        result = process_file(file_path, session, isil_lookup, dry_run=args.dry_run)
        stats[result['status']] = stats.get(result['status'], 0) + 1
        if result.get('city'):
            cities_found[result['city']] = cities_found.get(result['city'], 0) + 1
        if result.get('error'):
            errors.append(f"{file_path.name}: {result['error']}")
        if args.verbose and result['status'] in ('updated', 'would_update'):
            print(f" {file_path.name}")
            print(f" City: {result.get('city')}")
            print(f" {result['old_ghcid']} -> {result['new_ghcid']}")
    # Print summary to stdout.
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {len(swiss_xxx_files)}")
    print()
    print("Results:")
    for status, count in sorted(stats.items()):
        if count > 0:
            print(f" {status}: {count}")
    if cities_found:
        print()
        print(f"Cities found: {len(cities_found)} unique")
        print("Top 10 cities:")
        # Sort by descending count.
        for city, count in sorted(cities_found.items(), key=lambda x: -x[1])[:10]:
            print(f" {city}: {count}")
    if errors:
        print()
        print(f"Errors ({len(errors)}):")
        for err in errors[:10]:
            print(f" {err}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")
    # Save a timestamped markdown report alongside other reports.
    REPORTS_DIR.mkdir(exist_ok=True)
    report_file = REPORTS_DIR / f"SWISS_ISIL_ENRICHMENT_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
    with open(report_file, 'w') as f:
        f.write("# Swiss ISIL City Enrichment Report\n\n")
        f.write(f"**Date**: {datetime.now().isoformat()}\n")
        f.write(f"**Mode**: {'Dry Run' if args.dry_run else 'Live'}\n\n")
        f.write("## Summary\n\n")
        f.write(f"- Total files processed: {len(swiss_xxx_files)}\n")
        for status, count in sorted(stats.items()):
            if count > 0:
                f.write(f"- {status}: {count}\n")
        if cities_found:
            f.write(f"\n## Cities Found ({len(cities_found)} unique)\n\n")
            for city, count in sorted(cities_found.items(), key=lambda x: -x[1]):
                f.write(f"- {city}: {count}\n")
    print()
    print(f"Report saved to: {report_file}")


if __name__ == '__main__':
    main()