Enrichment scripts for country-specific city data: - enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py - enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py - enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py Location resolution utilities: - resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames - resolve_cities_wikidata.py - Use Wikidata P131 for city resolution - resolve_country_codes.py - Standardize country codes - resolve_cz_xx_regions.py - Fix Czech XX region codes - resolve_locations_by_name.py - Name-based location lookup - resolve_regions_from_city.py - Derive regions from city data - update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data CH-Annotator integration: - create_custodian_from_ch_annotator.py - Create custodians from annotations - add_ch_annotator_location_claims.py - Add location claims - extract_locations_ch_annotator.py - Extract locations from annotations Migration and fixes: - migrate_egyptian_from_ch.py - Migrate Egyptian data - migrate_web_archives.py - Migrate web archive data - fix_belgian_cities.py - Fix Belgian city data
559 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Swiss ISIL custodian files with city data from the Swiss ISIL website.
|
|
|
|
For Swiss custodian files with XXX city placeholder, this script:
|
|
1. Loads the source CH-Annotator file to get ISIL URLs by institution name
|
|
2. Fetches the institution page from isil.nb.admin.ch
|
|
3. Extracts city (Location) and address data
|
|
4. Reverse geocodes using GeoNames to get proper city code
|
|
5. Updates the GHCID with correct city code
|
|
6. Renames the file if GHCID changes
|
|
|
|
Usage:
|
|
python scripts/enrich_swiss_isil_cities.py [--dry-run] [--limit N]
|
|
"""
|
|
|
|
import argparse
|
|
import hashlib
|
|
import os
|
|
import re
|
|
import shutil
|
|
import sqlite3
|
|
import time
|
|
import uuid
|
|
import yaml
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
|
|
# Paths (all relative to the repository root, one level above scripts/)
PROJECT_ROOT = Path(__file__).parent.parent
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
REPORTS_DIR = PROJECT_ROOT / "reports"
SWISS_CH_ANNOTATOR_FILE = PROJECT_ROOT / "data" / "instances" / "switzerland_isil_ch_annotator.yaml"


# GHCID namespace for deterministic UUID v5 generation.
# (This value equals the RFC 4122 DNS namespace UUID; it is pinned here as a
# literal so GHCID UUIDs stay stable regardless of stdlib constants.)
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')


# Rate limiting
REQUEST_DELAY = 1.0  # seconds between requests to isil.nb.admin.ch


# Swiss canton name -> ISO 3166-2:CH code. Covers the English names plus the
# German/French/Italian variants that appear on ISIL pages. Duplicate entries
# that mapped the same spelling to the same code were removed (the original
# dict repeated 'Lucerne', 'Valais', 'Vaud', 'Fribourg' and 'Ticino').
SWISS_CANTON_CODES = {
    'Aargau': 'AG', 'Appenzell Ausserrhoden': 'AR', 'Appenzell Innerrhoden': 'AI',
    'Basel-Landschaft': 'BL', 'Basel-Stadt': 'BS', 'Bern': 'BE', 'Fribourg': 'FR',
    'Geneva': 'GE', 'Glarus': 'GL', 'Graubünden': 'GR', 'Jura': 'JU', 'Lucerne': 'LU',
    'Neuchâtel': 'NE', 'Nidwalden': 'NW', 'Obwalden': 'OW', 'Schaffhausen': 'SH',
    'Schwyz': 'SZ', 'Solothurn': 'SO', 'St. Gallen': 'SG', 'Thurgau': 'TG',
    'Ticino': 'TI', 'Uri': 'UR', 'Valais': 'VS', 'Vaud': 'VD', 'Zug': 'ZG', 'Zürich': 'ZH',
    # German names
    'Genf': 'GE', 'Luzern': 'LU', 'Neuenburg': 'NE', 'Wallis': 'VS', 'Waadt': 'VD',
    # French names
    'Genève': 'GE',
    # Italian names
    'Ginevra': 'GE', 'Grigioni': 'GR', 'Vallese': 'VS',
}
|
|
|
|
|
|
def load_swiss_isil_lookup() -> Dict[str, str]:
    """Build an institution-name -> ISIL-URL mapping from the CH-Annotator file.

    Reads the Swiss CH-Annotator YAML source and, for each named entry,
    keeps the first digital-platform URL hosted on isil.nb.admin.ch.
    Returns an empty dict when the file is missing or empty.
    """
    lookup: Dict[str, str] = {}

    if not SWISS_CH_ANNOTATOR_FILE.exists():
        print(f"Warning: Swiss CH-Annotator file not found: {SWISS_CH_ANNOTATOR_FILE}")
        return lookup

    print(f"Loading Swiss CH-Annotator source file...")
    with open(SWISS_CH_ANNOTATOR_FILE, 'r', encoding='utf-8') as f:
        entries = yaml.safe_load(f)

    if not entries:
        return lookup

    for entry in entries:
        if not isinstance(entry, dict):
            continue
        name = entry.get('name', '')
        if not name:
            continue

        # First ISIL-hosted platform URL wins; remaining platforms are ignored.
        platform_urls = (
            p.get('platform_url', '')
            for p in entry.get('digital_platforms', [])
            if isinstance(p, dict)
        )
        isil_url = next((u for u in platform_urls if 'isil.nb.admin.ch' in u), None)
        if isil_url is not None:
            lookup[name] = isil_url

    print(f" Loaded {len(lookup)} institutions with ISIL URLs")
    return lookup
|
|
|
|
|
|
def generate_city_code(city_name: str) -> str:
    """Derive a 3-letter uppercase city code from a city name.

    Diacritics are stripped first, then common articles/prepositions are
    ignored (unless nothing else remains). A single remaining word yields
    its first three letters; multiple words yield up to three initials.
    An empty name yields the 'XXX' placeholder.
    """
    if not city_name:
        return 'XXX'

    import unicodedata

    # Decompose accented characters (NFD) and drop the combining marks.
    decomposed = unicodedata.normalize('NFD', city_name)
    plain = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Articles/prepositions that carry no identifying information.
    stop_words = {'de', 'la', 'le', 'les', 'du', 'des', 'von', 'am', 'im', 'an', 'der', 'die', 'das'}
    tokens = plain.split()
    kept = [t for t in tokens if t.lower() not in stop_words] or tokens

    if len(kept) == 1:
        # Single word: first 3 letters.
        return kept[0][:3].upper()
    # Multiple words: initials of the first three.
    return ''.join(t[0] for t in kept[:3]).upper()
|
|
|
|
|
|
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the deterministic UUID v5 of *ghcid_string* under the GHCID namespace."""
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
|
|
|
|
|
|
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Return a UUIDv8-style identifier derived from SHA-256(*ghcid_string*).

    The first 16 hash bytes are used; the version nibble is forced to 8 and
    the variant bits to the RFC 4122 layout, so the result parses as a UUID.
    """
    raw = bytearray(hashlib.sha256(ghcid_string.encode('utf-8')).digest()[:16])
    raw[6] = (raw[6] & 0x0F) | 0x80  # high nibble of byte 6 -> version 8
    raw[8] = (raw[8] & 0x3F) | 0x80  # top bits of byte 8 -> RFC 4122 variant
    return str(uuid.UUID(bytes=bytes(raw)))
|
|
|
|
|
|
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return a 64-bit numeric ID: the big-endian integer value of the first
    eight SHA-256 bytes of *ghcid_string*."""
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], byteorder='big')
|
|
|
|
|
|
def fetch_isil_page(isil_url: str, session: requests.Session) -> Optional[Dict]:
    """Fetch a Swiss ISIL institution page and extract its address fields.

    Scrapes the page's dt/dd definition lists for the labels 'Location',
    'Zip code', 'Street and number' and 'Canton'. The canton name is also
    mapped to its ISO code (falling back to the first two letters).
    Returns None when no city is found or the request fails; request errors
    are printed rather than raised.
    """
    # Simple dt-label -> result-key dispatch for the plain address fields.
    field_keys = {
        'Location': 'city',
        'Zip code': 'postal_code',
        'Street and number': 'street_address',
    }
    try:
        response = session.get(isil_url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        parsed = {}

        for dt in soup.find_all('dt'):
            dd = dt.find_next_sibling('dd')
            if not dd:
                continue
            label = dt.get_text(strip=True)
            value = dd.get_text(strip=True)

            if label in field_keys:
                parsed[field_keys[label]] = value
            elif label == 'Canton':
                parsed['canton'] = value
                parsed['region'] = SWISS_CANTON_CODES.get(value, value[:2].upper() if len(value) >= 2 else None)

        return parsed if parsed.get('city') else None

    except Exception as e:
        print(f" Error fetching {isil_url}: {e}")
        return None
|
|
|
|
|
|
def reverse_geocode_city(city_name: str, region_code: str, country_code: str, db_path: Path) -> Optional[Dict]:
    """Look up a city in the GeoNames SQLite database.

    Tries an exact (case-insensitive) name match first — restricted to the
    canton when *region_code* maps to a known GeoNames admin1 code — then
    falls back to a prefix (LIKE) match. Both queries prefer the most
    populous candidate.

    Returns a dict with geonames_id, geonames_name, ascii_name, latitude,
    longitude, population, feature_code, admin1_code and admin1_name, or
    None when nothing matches or the lookup fails (errors are printed,
    not raised).
    """
    # ISO 3166-2:CH canton code -> GeoNames admin1 code for Switzerland.
    swiss_admin1_map = {
        'AG': '01', 'AR': '15', 'AI': '16', 'BL': '06', 'BS': '05',
        'BE': '02', 'FR': '04', 'GE': '07', 'GL': '08', 'GR': '03',
        'JU': '26', 'LU': '09', 'NE': '10', 'NW': '11', 'OW': '12',
        'SH': '14', 'SZ': '17', 'SO': '13', 'SG': '18', 'TG': '20',
        'TI': '21', 'UR': '19', 'VS': '22', 'VD': '23', 'ZG': '25', 'ZH': '24'
    }

    # Result keys in the same order as the SELECT column list below.
    columns = ('geonames_id', 'geonames_name', 'ascii_name', 'latitude',
               'longitude', 'population', 'feature_code', 'admin1_code',
               'admin1_name')
    base_query = """
        SELECT geonames_id, name, ascii_name, latitude, longitude,
               population, feature_code, admin1_code, admin1_name
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
    """

    try:
        conn = sqlite3.connect(db_path)
        # try/finally guarantees the handle is released on every path; the
        # previous version leaked the connection whenever the exact match
        # succeeded (early return) or an exception was raised.
        try:
            cursor = conn.cursor()
            admin1_code = swiss_admin1_map.get(region_code)

            # 1) Exact match, optionally canton-restricted.
            exact = base_query + " AND (name = ? OR ascii_name = ? OR LOWER(name) = LOWER(?))"
            params = [country_code, city_name, city_name, city_name]
            if admin1_code:
                exact += " AND admin1_code = ?"
                params.append(admin1_code)
            cursor.execute(exact + " ORDER BY population DESC LIMIT 1", params)
            row = cursor.fetchone()

            # 2) Fuzzy fallback: prefix match on either name column.
            if row is None:
                cursor.execute(
                    base_query + " AND (name LIKE ? OR ascii_name LIKE ?)"
                                 " ORDER BY population DESC LIMIT 1",
                    (country_code, f"{city_name}%", f"{city_name}%"))
                row = cursor.fetchone()

            return dict(zip(columns, row)) if row else None
        finally:
            conn.close()

    except Exception as e:
        print(f" GeoNames lookup error: {e}")
        return None
|
|
|
|
|
|
def process_file(file_path: Path, session: requests.Session, isil_lookup: Dict[str, str], dry_run: bool = True) -> Dict:
    """Enrich one Swiss custodian YAML file whose GHCID has an XXX city placeholder.

    Pipeline: locate the institution's ISIL URL (by-name lookup first, then
    the file's identifier lists), scrape the city from isil.nb.admin.ch,
    derive a 3-letter city code, rebuild the GHCID, and - unless dry_run -
    rewrite the YAML, append a GHCID history entry, and rename the file to
    match the new GHCID.

    Returns a result dict with keys status, old_ghcid, new_ghcid, city and
    error (plus renamed_to when the file was moved). status is one of:
    unchanged, skipped, no_isil_url, no_city_found, would_update, updated,
    error. All exceptions are captured into the 'error' status.
    """
    result = {
        'status': 'unchanged',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'error': None
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            result['status'] = 'error'
            result['error'] = 'Empty file'
            return result

        # Check if this is a Swiss file with XXX city placeholder
        ghcid_current = data.get('ghcid', {}).get('ghcid_current', '')
        if not ghcid_current.startswith('CH-') or '-XXX-' not in ghcid_current:
            result['status'] = 'skipped'
            return result

        result['old_ghcid'] = ghcid_current

        # Get institution name for lookup (original_entry.name preferred,
        # custodian_name claim as fallback)
        inst_name = data.get('original_entry', {}).get('name', '')
        if not inst_name:
            inst_name = data.get('custodian_name', {}).get('claim_value', '')

        # Find ISIL URL - first try lookup by name
        isil_url = isil_lookup.get(inst_name)

        # Then check identifiers in the file
        if not isil_url:
            identifiers = data.get('identifiers', [])
            for ident in identifiers:
                if isinstance(ident, dict) and ident.get('identifier_scheme') == 'ISIL':
                    url = ident.get('identifier_url', '')
                    if 'isil.nb.admin.ch' in url:
                        isil_url = url
                        break

        # Also check original_entry.identifiers
        if not isil_url:
            original_identifiers = data.get('original_entry', {}).get('identifiers', [])
            for ident in original_identifiers:
                if isinstance(ident, dict) and ident.get('identifier_scheme') == 'ISIL':
                    url = ident.get('identifier_url', '')
                    if 'isil.nb.admin.ch' in url:
                        isil_url = url
                        break

        if not isil_url:
            result['status'] = 'no_isil_url'
            result['error'] = f'No ISIL URL found for: {inst_name}'
            return result

        # Convert to proper page URL format (normalize any ?isil=... URL to
        # the English institution page)
        if '?isil=' in isil_url:
            isil_code = isil_url.split('?isil=')[-1]
            # Convert to institution page URL
            isil_url = f"https://www.isil.nb.admin.ch/en/?isil={isil_code}"

        # Fetch city data from ISIL website (throttled to be polite to the host)
        time.sleep(REQUEST_DELAY)
        isil_data = fetch_isil_page(isil_url, session)

        if not isil_data or not isil_data.get('city'):
            result['status'] = 'no_city_found'
            return result

        city_name = isil_data['city']
        result['city'] = city_name

        # Get region from GHCID or ISIL data (GHCID format: CH-{region}-...)
        parts = ghcid_current.split('-')
        region_code = parts[1] if len(parts) > 1 else isil_data.get('region', 'XX')

        # Generate city code
        city_code = generate_city_code(city_name)

        # Try to get GeoNames data for coordinates
        geonames_data = reverse_geocode_city(city_name, region_code, 'CH', GEONAMES_DB)

        # Build new GHCID
        # Format: CH-{region}-{city}-{type}-{abbrev}[-{suffix}]
        new_ghcid = ghcid_current.replace('-XXX-', f'-{city_code}-')
        result['new_ghcid'] = new_ghcid

        if new_ghcid == ghcid_current:
            result['status'] = 'unchanged'
            return result

        if dry_run:
            result['status'] = 'would_update'
            return result

        # Update the data (single UTC timestamp reused for all fields below)
        now = datetime.now(timezone.utc).isoformat()

        # Update GHCID (all derived IDs must be regenerated together so they
        # stay consistent with the new GHCID string)
        data['ghcid']['ghcid_current'] = new_ghcid
        data['ghcid']['ghcid_uuid'] = generate_ghcid_uuid(new_ghcid)
        data['ghcid']['ghcid_uuid_sha256'] = generate_ghcid_uuid_sha256(new_ghcid)
        data['ghcid']['ghcid_numeric'] = generate_ghcid_numeric(new_ghcid)

        # Update location_resolution (provenance record of this enrichment)
        location_resolution = {
            'method': 'SWISS_ISIL_ENRICHMENT',
            'city_name': city_name,
            'city_code': city_code,
            'region_code': region_code,
            'country_code': 'CH',
            'enrichment_date': now,
            'source_url': isil_url
        }

        if geonames_data:
            location_resolution.update({
                'geonames_id': geonames_data['geonames_id'],
                'geonames_name': geonames_data['geonames_name'],
                'feature_code': geonames_data['feature_code'],
                'population': geonames_data['population'],
                'latitude': geonames_data['latitude'],
                'longitude': geonames_data['longitude']
            })

        data['ghcid']['location_resolution'] = location_resolution

        # Add GHCID history entry (history is newest-first; the previous head
        # entry is closed by setting its valid_to)
        history = data['ghcid'].get('ghcid_history', [])
        if history:
            # Close previous entry
            history[0]['valid_to'] = now

        history.insert(0, {
            'ghcid': new_ghcid,
            'ghcid_numeric': data['ghcid']['ghcid_numeric'],
            'valid_from': now,
            'valid_to': None,
            'reason': f'City code updated from Swiss ISIL enrichment: {city_name} -> {city_code}'
        })
        data['ghcid']['ghcid_history'] = history

        # Update location in original_entry if exists (only fills empty city
        # fields; never overwrites existing location data)
        if 'locations' in data.get('original_entry', {}):
            for loc in data['original_entry']['locations']:
                if isinstance(loc, dict) and not loc.get('city'):
                    loc['city'] = city_name
                    if isil_data.get('postal_code'):
                        loc['postal_code'] = isil_data['postal_code']
                    if isil_data.get('street_address'):
                        loc['street_address'] = isil_data['street_address']

        # Update identifiers
        for ident in data.get('identifiers', []):
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
                ident['identifier_value'] = new_ghcid

        # Write updated file
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Rename file if GHCID changed (skipped when the target name already
        # exists, to avoid clobbering another custodian file)
        new_filename = f"{new_ghcid}.yaml"
        new_path = file_path.parent / new_filename

        if new_path != file_path and not new_path.exists():
            shutil.move(file_path, new_path)
            result['renamed_to'] = str(new_path.name)

        result['status'] = 'updated'
        return result

    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
        return result
|
|
|
|
|
|
def main():
    """CLI entry point: enrich all Swiss custodian files with XXX city codes.

    Globs CUSTODIAN_DIR for CH-*-XXX-*.yaml files, processes each via
    process_file(), prints a summary, and writes a markdown report to
    REPORTS_DIR. Supports --dry-run, --limit N and --verbose flags.
    """
    parser = argparse.ArgumentParser(description='Enrich Swiss ISIL custodian files with city data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show verbose output')
    args = parser.parse_args()

    print("=" * 60)
    print("SWISS ISIL CITY ENRICHMENT")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")

    # Find Swiss files with XXX city placeholder
    swiss_xxx_files = list(CUSTODIAN_DIR.glob("CH-*-XXX-*.yaml"))

    if args.limit:
        swiss_xxx_files = swiss_xxx_files[:args.limit]
        print(f"Limited to {args.limit} files")

    print(f"Found {len(swiss_xxx_files)} Swiss files with XXX city placeholder")
    print()

    # Load Swiss ISIL lookup from CH-Annotator source file
    isil_lookup = load_swiss_isil_lookup()

    # Process files with a shared session (connection reuse + identifying UA)
    session = requests.Session()
    session.headers['User-Agent'] = 'GLAMDataExtractor/1.0 (heritage-data-enrichment)'

    # Pre-seeded with every known status so the summary lists zero-count
    # outcomes consistently; unknown statuses are still counted via .get().
    stats = {
        'updated': 0,
        'would_update': 0,
        'unchanged': 0,
        'skipped': 0,
        'no_isil_url': 0,
        'no_city_found': 0,
        'error': 0
    }

    cities_found = {}
    errors = []

    for i, file_path in enumerate(swiss_xxx_files, 1):
        if i % 100 == 0 or args.verbose:
            print(f"Progress: {i}/{len(swiss_xxx_files)}")

        result = process_file(file_path, session, isil_lookup, dry_run=args.dry_run)
        stats[result['status']] = stats.get(result['status'], 0) + 1

        if result.get('city'):
            cities_found[result['city']] = cities_found.get(result['city'], 0) + 1

        if result.get('error'):
            errors.append(f"{file_path.name}: {result['error']}")

        if args.verbose and result['status'] in ('updated', 'would_update'):
            print(f" {file_path.name}")
            print(f" City: {result.get('city')}")
            print(f" {result['old_ghcid']} -> {result['new_ghcid']}")

    # Print summary
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {len(swiss_xxx_files)}")
    print()
    print("Results:")
    for status, count in sorted(stats.items()):
        if count > 0:
            print(f" {status}: {count}")

    if cities_found:
        print()
        print(f"Cities found: {len(cities_found)} unique")
        print("Top 10 cities:")
        for city, count in sorted(cities_found.items(), key=lambda x: -x[1])[:10]:
            print(f" {city}: {count}")

    if errors:
        print()
        print(f"Errors ({len(errors)}):")
        for err in errors[:10]:
            print(f" {err}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")

    # Save report (parents=True so a missing reports/ tree is created too)
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report_file = REPORTS_DIR / f"SWISS_ISIL_ENRICHMENT_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"

    # encoding='utf-8' is required: city names contain non-ASCII characters,
    # which crashed here under non-UTF-8 default locales.
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("# Swiss ISIL City Enrichment Report\n\n")
        f.write(f"**Date**: {datetime.now().isoformat()}\n")
        f.write(f"**Mode**: {'Dry Run' if args.dry_run else 'Live'}\n\n")
        f.write("## Summary\n\n")
        f.write(f"- Total files processed: {len(swiss_xxx_files)}\n")
        for status, count in sorted(stats.items()):
            if count > 0:
                f.write(f"- {status}: {count}\n")

        if cities_found:
            f.write(f"\n## Cities Found ({len(cities_found)} unique)\n\n")
            for city, count in sorted(cities_found.items(), key=lambda x: -x[1]):
                f.write(f"- {city}: {count}\n")

    print()
    print(f"Report saved to: {report_file}")


if __name__ == '__main__':
    main()
|