glam/scripts/enrich_czech_cities.py
kempersc e45c1a3c85 feat(scripts): add city enrichment and location resolution utilities
Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
2025-12-07 14:26:59 +01:00

791 lines
29 KiB
Python

#!/usr/bin/env python3
"""
Enrich Czech custodian files with city data from the CH-Annotator source file.
For Czech custodian files with XXX city placeholder, this script:
1. Loads the source CH-Annotator file (czech_unified_ch_annotator.yaml)
2. Matches by name, ARON UUID, or Wikidata ID to get city/coordinates
3. Falls back to Wikidata P131 lookup via SPARQL for missing data
4. Updates the GHCID with correct city code
5. Renames the file if GHCID changes
Usage:
python scripts/enrich_czech_cities.py [--dry-run] [--limit N]
"""
import argparse
import hashlib
import os
import re
import shutil
import sqlite3
import time
import uuid
import yaml
import requests
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple
# Paths (all relative to the repository root, one level above scripts/)
PROJECT_ROOT = Path(__file__).parent.parent
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
REPORTS_DIR = PROJECT_ROOT / "reports"
CZECH_CH_ANNOTATOR_FILE = PROJECT_ROOT / "data" / "instances" / "czech_unified_ch_annotator.yaml"
# GHCID namespace for deterministic UUID v5 generation.
# NOTE(review): this value equals uuid.NAMESPACE_DNS — presumably intentional; confirm.
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
# Rate limiting for Wikidata SPARQL requests (seconds between requests)
REQUEST_DELAY = 1.0
# Czech region mapping (GeoNames admin1 code -> ISO 3166-2:CZ region code)
CZECH_ADMIN1_MAP = {
    '52': 'JC',  # Jihočeský (South Bohemian)
    '78': 'JM',  # Jihomoravský (South Moravian)
    '81': 'KA',  # Karlovarský (Karlovy Vary)
    '82': 'VY',  # Vysočina (Vysočina)
    '51': 'KR',  # Královéhradecký (Hradec Králové)
    '53': 'LI',  # Liberecký (Liberec)
    '84': 'MO',  # Moravskoslezský (Moravian-Silesian)
    '85': 'OL',  # Olomoucký (Olomouc)
    '86': 'PA',  # Pardubický (Pardubice)
    '54': 'PL',  # Plzeňský (Plzeň)
    '10': 'PR',  # Praha (Prague)
    '55': 'ST',  # Středočeský (Central Bohemian)
    '56': 'US',  # Ústecký (Ústí nad Labem)
    '87': 'ZL',  # Zlínský (Zlín)
}
# Region name (as spelled in the CH-Annotator source data) -> ISO region code
CZECH_REGION_NAMES = {
    'Jihočeský': 'JC',
    'Jihomoravský': 'JM',
    'Karlovarský': 'KA',
    'Vysočina': 'VY',
    'Královéhradecký': 'KR',
    'Liberecký': 'LI',
    'Moravskoslezský': 'MO',
    'Olomoucký': 'OL',
    'Pardubický': 'PA',
    'Plzeňský': 'PL',
    'Hlavní město Praha': 'PR',
    'Praha': 'PR',
    'Středočeský': 'ST',
    'Ústecký': 'US',
    'Zlínský': 'ZL',
}
def extract_city_from_name(name: str) -> Optional[str]:
    """Try to extract a city name from Czech institution name patterns.

    Czech institution names frequently embed the seat city in the locative
    case after a preposition, e.g. "Muzeum v Praze", "Galerie ve Šlapanicích",
    or in river compounds such as "... nad Metují". The first pattern that
    matches wins; the captured locative form is converted to an approximate
    nominative before returning.

    Args:
        name: Institution name (may be empty or None).

    Returns:
        Best-effort nominative city name, or None if nothing matched.
    """
    if not name:
        return None
    # Fix: removed a redundant function-local `import re` — the module
    # already imports `re` at the top of the file.
    # Patterns: "v/ve + City" (locative case) and "nad/pod + City".
    patterns = [
        # "v CityName" - most common
        r'\bv\s+([A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ][a-záčďéěíňóřšťúůýž]+(?:\s+[A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ][a-záčďéěíňóřšťúůýž]+)*)',
        # "ve CityName" (before consonant clusters)
        r'\bve\s+([A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ][a-záčďéěíňóřšťúůýž]+(?:\s+[A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ][a-záčďéěíňóřšťúůýž]+)*)',
        # "nad CityName" or "pod CityName"
        r'\b(?:nad|pod)\s+([A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ][a-záčďéěíňóřšťúůýž]+)',
    ]
    for pattern in patterns:
        match = re.search(pattern, name)
        if match:
            # Convert locative case to nominative (approximation).
            # Common endings: -ě/-e -> -a, -ích -> -y, -ové -> -ov
            return convert_locative_to_nominative(match.group(1))
    return None
# Known locative -> nominative city-name forms. Czech declension is complex,
# so this table only covers the common cases seen in institution names;
# unknown forms are returned unchanged.
_LOCATIVE_TO_NOMINATIVE = {
    'Praze': 'Praha',
    'Brně': 'Brno',
    'Hradci Králové': 'Hradec Králové',
    'Havlíčkově Brodě': 'Havlíčkův Brod',
    'Liberci': 'Liberec',
    'Olomouci': 'Olomouc',
    'Plzni': 'Plzeň',
    'Ostravě': 'Ostrava',
    'Ústí nad Labem': 'Ústí nad Labem',  # no change
    'Opavě': 'Opava',
}


def convert_locative_to_nominative(city: str) -> str:
    """Convert a Czech locative-case city name to nominative (best effort).

    Fixes over the original: the linear scan over (locative, nominative)
    pairs is replaced by an O(1) dict lookup, and a dead
    `if city.endswith(...): pass` branch (generic ending transformations
    that were never implemented) has been removed.

    Args:
        city: City name, possibly in the locative case.

    Returns:
        The nominative form if known, otherwise the input unchanged.
    """
    return _LOCATIVE_TO_NOMINATIVE.get(city, city)
def normalize_czech_name(name: str) -> str:
    """Normalize a Czech institution name for lookup-table matching.

    Strips common legal-form suffixes ("o.p.s.", "p.o.", "s.r.o.",
    "příspěvková organizace"), collapses whitespace runs, and trims any
    stray separators left behind by the removals.

    Args:
        name: Raw institution name (may be empty or None).

    Returns:
        The normalized name, or '' for falsy input.
    """
    if not name:
        return ''
    # Legal-form suffixes to drop; order matters for compound forms.
    legal_forms = (
        'o. p. s.',
        'o.p.s.',
        'p. o.',
        'p.o.',
        's. r. o.',
        's.r.o.',
        'příspěvková organizace',
        ', příspěvková organizace',
        ', p. o.',
    )
    cleaned = name
    for form in legal_forms:
        cleaned = cleaned.replace(form, '')
    # Collapse internal whitespace, then trim leftover punctuation/dashes.
    cleaned = ' '.join(cleaned.split())
    return cleaned.strip(' -,')
def load_czech_source_data() -> Dict[str, Dict]:
    """Load the Czech CH-Annotator source file and build lookup tables.

    Reads czech_unified_ch_annotator.yaml and indexes each entry's first
    location (entries without a city are skipped) three ways:
    by institution name (exact, lowercase, normalized, and all alternative
    names), by ARON UUID, and by Wikidata ID.

    Returns:
        {'by_name': ..., 'by_aron_uuid': ..., 'by_wikidata': ...} — each a
        dict mapping the key to a location-data dict. All three are empty
        when the source file is missing or empty.
    """
    by_name: Dict[str, Dict] = {}
    by_aron_uuid: Dict[str, Dict] = {}
    by_wikidata: Dict[str, Dict] = {}
    tables = {'by_name': by_name, 'by_aron_uuid': by_aron_uuid, 'by_wikidata': by_wikidata}

    def _index_name(raw: str, payload: Dict) -> None:
        """Register one name under exact, lowercase, and normalized keys."""
        by_name[raw] = payload
        by_name[raw.lower()] = payload
        normalized = normalize_czech_name(raw)
        if normalized and normalized != raw:
            by_name[normalized] = payload
            by_name[normalized.lower()] = payload

    if not CZECH_CH_ANNOTATOR_FILE.exists():
        print(f"Warning: Czech CH-Annotator file not found: {CZECH_CH_ANNOTATOR_FILE}")
        return tables
    print("Loading Czech CH-Annotator source file...")
    with open(CZECH_CH_ANNOTATOR_FILE, 'r', encoding='utf-8') as f:
        entries = yaml.safe_load(f)
    if not entries:
        return tables
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        # Only the first location is used; entries without a city are useless here.
        locations = entry.get('locations', [])
        if not locations:
            continue
        loc = locations[0] if locations else {}
        if not loc.get('city'):
            continue
        location_data = {
            'city': loc.get('city'),
            'region': loc.get('region'),
            'region_code': CZECH_REGION_NAMES.get(loc.get('region', ''), None),
            'postal_code': loc.get('postal_code'),
            'street_address': loc.get('street_address'),
            'latitude': loc.get('latitude'),
            'longitude': loc.get('longitude'),
            'name': entry.get('name', '')
        }
        primary = entry.get('name', '')
        if primary:
            _index_name(primary, location_data)
        for alt in entry.get('alternative_names', []):
            if alt:
                _index_name(alt, location_data)
        # Identifier-based indexes (ARON UUID and Wikidata Q-id).
        for ident in entry.get('identifiers', []):
            if not isinstance(ident, dict):
                continue
            scheme = ident.get('identifier_scheme', '')
            value = ident.get('identifier_value', '')
            if scheme == 'ARON_UUID' and value:
                by_aron_uuid[value] = location_data
            elif scheme == 'Wikidata' and value:
                by_wikidata[value] = location_data
    print(f" Loaded {len(by_name)} by name, {len(by_aron_uuid)} by ARON UUID, {len(by_wikidata)} by Wikidata")
    return tables
def generate_city_code(city_name: str) -> str:
    """Derive a short uppercase city code (up to 3 letters) from a city name.

    Diacritics are stripped via NFD decomposition. Czech prepositions
    ("nad", "pod", "u", ...) are ignored when selecting significant words.
    One significant word yields its first three letters; several yield
    their initials (at most three). Empty input maps to the 'XXX'
    placeholder.
    """
    if not city_name:
        return 'XXX'
    import unicodedata
    # Decompose and drop combining marks to get a plain-ASCII-ish name.
    decomposed = unicodedata.normalize('NFD', city_name)
    plain = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # Czech articles/prepositions carry no identifying information.
    stopwords = {'nad', 'pod', 'u', 'v', 've', 'na', 'do', 'z', 'ze', 'k', 'ke', 'o', 's', 'se'}
    tokens = plain.split()
    keep = [t for t in tokens if t.lower() not in stopwords] or tokens
    if len(keep) == 1:
        # Single word: first three letters.
        return keep[0][:3].upper()
    # Multiple words: initials of up to three words.
    return ''.join(t[0] for t in keep[:3]).upper()
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the deterministic UUID v5 (as a string) for a GHCID string.

    Uses the module-level GHCID_NAMESPACE, so equal GHCID strings always
    map to the same UUID.
    """
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Return a UUID-v8-style identifier derived from SHA-256 of the GHCID.

    The first 16 bytes of the digest are stamped with version nibble 8 and
    the RFC 4122 variant bits, then rendered as a canonical UUID string.
    """
    digest16 = bytearray(hashlib.sha256(ghcid_string.encode('utf-8')).digest()[:16])
    digest16[6] = (digest16[6] & 0x0F) | 0x80  # version nibble -> 8
    digest16[8] = (digest16[8] & 0x3F) | 0x80  # RFC 4122 variant bits
    return str(uuid.UUID(bytes=bytes(digest16)))
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return a deterministic 64-bit integer ID for a GHCID string.

    Interprets the first 8 bytes of SHA-256(ghcid_string) as a big-endian
    unsigned integer.
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], byteorder='big')
def fetch_wikidata_location(wikidata_id: str, session: requests.Session) -> Optional[Dict]:
    """Fetch city/region/coordinates for a Wikidata entity via SPARQL.

    Follows P131 ("located in the administrative territorial entity",
    transitively) to find an enclosing city (instance/subclass of Q515)
    and, optionally, a Czech region (Q20916591), with labels resolved in
    Czech first, then English.

    Args:
        wikidata_id: Wikidata Q-identifier (e.g. "Q1085"); anything that is
            falsy or does not start with 'Q' returns None immediately.
        session: Shared requests session (connection pooling; a per-request
            User-Agent header is also set explicitly below).

    Returns:
        Dict with 'city', 'region', 'region_code', 'latitude', 'longitude'
        and 'source' keys, or None when there is no result or on any
        request/parse error (best-effort lookup).
    """
    if not wikidata_id or not wikidata_id.startswith('Q'):
        return None
    query = f"""
    SELECT ?cityLabel ?regionLabel ?coords WHERE {{
        wd:{wikidata_id} wdt:P131* ?city .
        ?city wdt:P31/wdt:P279* wd:Q515 . # city
        OPTIONAL {{ ?city wdt:P625 ?coords }}
        OPTIONAL {{
            wd:{wikidata_id} wdt:P131+ ?region .
            ?region wdt:P31 wd:Q20916591 . # Czech region
        }}
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "cs,en" }}
    }}
    LIMIT 1
    """
    try:
        response = session.get(
            'https://query.wikidata.org/sparql',
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': 'GLAMDataExtractor/1.0'},
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
        results = data.get('results', {}).get('bindings', [])
        if results:
            result = results[0]
            city = result.get('cityLabel', {}).get('value', '')
            region = result.get('regionLabel', {}).get('value', '')
            coords = result.get('coords', {}).get('value', '')
            lat, lon = None, None
            if coords and coords.startswith('Point('):
                # WKT literal is "Point(lon lat)" — longitude comes first.
                match = re.match(r'Point\(([^ ]+) ([^)]+)\)', coords)
                if match:
                    lon, lat = float(match.group(1)), float(match.group(2))
            return {
                'city': city,
                'region': region,
                'region_code': CZECH_REGION_NAMES.get(region, None),
                'latitude': lat,
                'longitude': lon,
                'source': 'wikidata_sparql'
            }
    except Exception as e:
        # Best-effort: report the failure and fall through to None rather
        # than aborting the whole enrichment run.
        print(f" Wikidata SPARQL error: {e}")
    return None
def reverse_geocode_city(city_name: str, country_code: str, db_path: Path) -> Optional[Dict]:
    """Look up a populated place in the GeoNames SQLite DB.

    Tries an exact (including case-insensitive) name match first, then a
    prefix (LIKE 'name%') match, preferring the most populous candidate.
    The GeoNames admin1 code is mapped to an ISO 3166-2:CZ region code via
    CZECH_ADMIN1_MAP (non-Czech admin1 codes yield region_code=None).

    Fix over the original: the sqlite connection is now closed in a
    `finally` block, so it is no longer leaked when an exception is raised
    between connect() and close().

    Args:
        city_name: City name to resolve (diacritics allowed).
        country_code: ISO country-code filter (e.g. 'CZ').
        db_path: Path to the GeoNames SQLite database file.

    Returns:
        Dict with geonames id/name, coordinates, population, feature and
        admin1 info plus 'region_code', or None if not found or on error.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        # Exact match first (native name, ASCII name, or case-insensitive).
        cursor.execute("""
            SELECT geonames_id, name, ascii_name, latitude, longitude,
                   population, feature_code, admin1_code, admin1_name
            FROM cities
            WHERE country_code = ?
              AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
              AND (name = ? OR ascii_name = ? OR LOWER(name) = LOWER(?))
            ORDER BY population DESC
            LIMIT 1
        """, (country_code, city_name, city_name, city_name))
        row = cursor.fetchone()
        if not row:
            # Fall back to a prefix match.
            cursor.execute("""
                SELECT geonames_id, name, ascii_name, latitude, longitude,
                       population, feature_code, admin1_code, admin1_name
                FROM cities
                WHERE country_code = ?
                  AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
                  AND (name LIKE ? OR ascii_name LIKE ?)
                ORDER BY population DESC
                LIMIT 1
            """, (country_code, f"{city_name}%", f"{city_name}%"))
            row = cursor.fetchone()
        if not row:
            return None
        admin1_code = row[7]
        region_code = CZECH_ADMIN1_MAP.get(admin1_code, None)
        return {
            'geonames_id': row[0],
            'geonames_name': row[1],
            'ascii_name': row[2],
            'latitude': row[3],
            'longitude': row[4],
            'population': row[5],
            'feature_code': row[6],
            'admin1_code': admin1_code,
            'admin1_name': row[8],
            'region_code': region_code
        }
    except Exception as e:
        # Best-effort lookup: report and return None instead of aborting.
        print(f" GeoNames lookup error: {e}")
        return None
    finally:
        # Always release the connection, including on the error path.
        if conn is not None:
            conn.close()
def process_file(file_path: Path, lookup: Dict, session: requests.Session, dry_run: bool = True) -> Dict:
    """Process a single custodian YAML file, resolving its XXX city code.

    For a Czech custodian file whose current GHCID contains the '-XXX-'
    city placeholder, tries to resolve the city via (in order): the
    CH-Annotator lookup tables by name / normalized name, ARON UUID,
    Wikidata ID; then, as a last resort, by extracting the city from the
    institution name and validating it against GeoNames. On success it
    rewrites the GHCID (city code, optionally region code), regenerates
    the derived UUID/numeric IDs, records location_resolution and a GHCID
    history entry, updates locations/identifiers/provenance, writes the
    file back, and renames it to match the new GHCID.

    Args:
        file_path: Path to the custodian YAML file.
        lookup: Tables from load_czech_source_data().
        session: Shared HTTP session (currently unused — the SPARQL
            fallback is disabled below, but the parameter is kept for it).
        dry_run: When True, compute the change but do not write anything.

    Returns:
        Result dict with 'status' in {'updated', 'would_update',
        'unchanged', 'skipped', 'no_city_found', 'error'}, plus
        'old_ghcid', 'new_ghcid', 'city', 'error', and (when the file was
        moved) 'renamed_to'.
    """
    result = {
        'status': 'unchanged',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'error': None
    }
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        if not data:
            result['status'] = 'error'
            result['error'] = 'Empty file'
            return result
        # Only Czech files that still carry the XXX city placeholder.
        ghcid_current = data.get('ghcid', {}).get('ghcid_current', '')
        if not ghcid_current.startswith('CZ-') or '-XXX-' not in ghcid_current:
            result['status'] = 'skipped'
            return result
        result['old_ghcid'] = ghcid_current
        # Institution name for the name-based lookups.
        inst_name = data.get('original_entry', {}).get('name', '')
        if not inst_name:
            inst_name = data.get('custodian_name', {}).get('claim_value', '')
        # Collect ARON UUID / Wikidata identifiers from both identifier lists;
        # top-level identifiers win over original_entry ones.
        aron_uuid = None
        wikidata_id = None
        for ident in data.get('identifiers', []):
            if isinstance(ident, dict):
                scheme = ident.get('identifier_scheme', '')
                value = ident.get('identifier_value', '')
                if scheme == 'ARON_UUID':
                    aron_uuid = value
                elif scheme == 'Wikidata':
                    wikidata_id = value
        for ident in data.get('original_entry', {}).get('identifiers', []):
            if isinstance(ident, dict):
                scheme = ident.get('identifier_scheme', '')
                value = ident.get('identifier_value', '')
                if scheme == 'ARON_UUID' and not aron_uuid:
                    aron_uuid = value
                elif scheme == 'Wikidata' and not wikidata_id:
                    wikidata_id = value
        # Resolution cascade: exact name -> lowercase -> normalized ->
        # normalized lowercase -> ARON UUID -> Wikidata ID.
        location_data = None
        location_source = None
        if inst_name:
            location_data = lookup['by_name'].get(inst_name)
            if location_data:
                location_source = 'source_by_name'
            else:
                location_data = lookup['by_name'].get(inst_name.lower())
                if location_data:
                    location_source = 'source_by_name_lower'
                else:
                    normalized = normalize_czech_name(inst_name)
                    if normalized:
                        location_data = lookup['by_name'].get(normalized)
                        if location_data:
                            location_source = 'source_by_normalized_name'
                        else:
                            location_data = lookup['by_name'].get(normalized.lower())
                            if location_data:
                                location_source = 'source_by_normalized_name_lower'
        if not location_data and aron_uuid:
            location_data = lookup['by_aron_uuid'].get(aron_uuid)
            if location_data:
                location_source = 'source_by_aron_uuid'
        if not location_data and wikidata_id:
            location_data = lookup['by_wikidata'].get(wikidata_id)
            if location_data:
                location_source = 'source_by_wikidata'
        # Fallback to Wikidata SPARQL (deliberately disabled - too slow)
        # if not location_data and wikidata_id:
        #     time.sleep(REQUEST_DELAY)
        #     location_data = fetch_wikidata_location(wikidata_id, session)
        #     if location_data:
        #         location_source = 'wikidata_sparql'
        # Last resort: extract the city from the institution name itself and
        # accept it only when GeoNames confirms it as a Czech place.
        if not location_data or not location_data.get('city'):
            extracted_city = extract_city_from_name(inst_name)
            if extracted_city:
                geonames_data = reverse_geocode_city(extracted_city, 'CZ', GEONAMES_DB)
                if geonames_data:
                    location_data = {
                        'city': geonames_data.get('geonames_name', extracted_city),
                        'region_code': geonames_data.get('region_code'),
                        'geonames_id': geonames_data.get('geonames_id'),
                        'geonames_name': geonames_data.get('geonames_name'),
                        'latitude': geonames_data.get('latitude'),
                        'longitude': geonames_data.get('longitude'),
                    }
                    location_source = 'extracted_from_name'
        if not location_data or not location_data.get('city'):
            result['status'] = 'no_city_found'
            result['error'] = f'No location data for: {inst_name}'
            return result
        city_name = location_data['city']
        result['city'] = city_name
        city_code = generate_city_code(city_name)
        # Fill the region code (and coordinates) from GeoNames if the
        # source data did not provide one.
        region_code = location_data.get('region_code')
        if not region_code:
            geonames_data = reverse_geocode_city(city_name, 'CZ', GEONAMES_DB)
            if geonames_data:
                region_code = geonames_data.get('region_code')
                location_data['geonames_id'] = geonames_data.get('geonames_id')
                location_data['geonames_name'] = geonames_data.get('geonames_name')
                if not location_data.get('latitude'):
                    location_data['latitude'] = geonames_data.get('latitude')
                    location_data['longitude'] = geonames_data.get('longitude')
        # Build the new GHCID: part[1] is the region code, part[2] the city
        # code (assumes the CC-RR-CCC-... layout when >= 5 parts).
        parts = ghcid_current.split('-')
        if len(parts) >= 5:
            parts[2] = city_code
            if region_code:
                parts[1] = region_code
            new_ghcid = '-'.join(parts)
        else:
            new_ghcid = ghcid_current.replace('-XXX-', f'-{city_code}-')
        result['new_ghcid'] = new_ghcid
        if new_ghcid == ghcid_current:
            result['status'] = 'unchanged'
            return result
        if dry_run:
            result['status'] = 'would_update'
            return result
        # --- Mutation phase (live mode only) ---
        now = datetime.now(timezone.utc).isoformat()
        # Regenerate all GHCID-derived identifiers.
        data['ghcid']['ghcid_current'] = new_ghcid
        data['ghcid']['ghcid_uuid'] = generate_ghcid_uuid(new_ghcid)
        data['ghcid']['ghcid_uuid_sha256'] = generate_ghcid_uuid_sha256(new_ghcid)
        data['ghcid']['ghcid_numeric'] = generate_ghcid_numeric(new_ghcid)
        # Record how the location was resolved.
        location_resolution = {
            'method': 'CZECH_CH_ANNOTATOR_ENRICHMENT',
            'city_name': city_name,
            'city_code': city_code,
            'country_code': 'CZ',
            'enrichment_date': now,
            'source': location_source
        }
        if region_code:
            location_resolution['region_code'] = region_code
            location_resolution['region_name'] = location_data.get('region', f'CZ-{region_code}')
        if location_data.get('geonames_id'):
            location_resolution['geonames_id'] = location_data['geonames_id']
            location_resolution['geonames_name'] = location_data['geonames_name']
        if location_data.get('latitude'):
            location_resolution['latitude'] = location_data['latitude']
            location_resolution['longitude'] = location_data['longitude']
        data['ghcid']['location_resolution'] = location_resolution
        # GHCID history: close the previous entry, prepend the new one.
        history = data['ghcid'].get('ghcid_history', [])
        if history and isinstance(history, list) and len(history) > 0:
            if isinstance(history[0], dict):
                history[0]['valid_to'] = now
        history.insert(0, {
            'ghcid': new_ghcid,
            'ghcid_numeric': data['ghcid']['ghcid_numeric'],
            'valid_from': now,
            'valid_to': None,
            'reason': f'City code updated from Czech CH-Annotator enrichment: {city_name} -> {city_code}'
        })
        data['ghcid']['ghcid_history'] = history
        # Propagate the resolved location into original_entry.locations.
        if 'original_entry' in data:
            if 'locations' not in data['original_entry'] or not data['original_entry']['locations']:
                data['original_entry']['locations'] = [{}]
            for loc in data['original_entry']['locations']:
                if isinstance(loc, dict):
                    loc['city'] = city_name
                    if location_data.get('postal_code'):
                        loc['postal_code'] = location_data['postal_code']
                    if location_data.get('street_address'):
                        loc['street_address'] = location_data['street_address']
                    if location_data.get('latitude'):
                        loc['latitude'] = location_data['latitude']
                        loc['longitude'] = location_data['longitude']
                    if region_code:
                        loc['region'] = location_data.get('region', f'CZ-{region_code}')
        # Keep the GHCID identifier entry in sync.
        for ident in data.get('identifiers', []):
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
                ident['identifier_value'] = new_ghcid
        # Provenance note (notes may be a string, a list, or something else).
        notes = data.get('provenance', {}).get('notes', [])
        if isinstance(notes, str):
            notes = [notes]
        if not isinstance(notes, list):
            notes = []
        notes.append(f'City resolved {now[:19]}Z: {city_name} -> {city_code} via {location_source}')
        data['provenance'] = data.get('provenance', {})
        data['provenance']['notes'] = notes
        # Persist and rename the file to match the new GHCID (never
        # overwrite an existing file of that name).
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        new_filename = f"{new_ghcid}.yaml"
        new_path = file_path.parent / new_filename
        if new_path != file_path and not new_path.exists():
            shutil.move(file_path, new_path)
            result['renamed_to'] = str(new_path.name)
        result['status'] = 'updated'
        return result
    except Exception as e:
        # Record the failure per-file so one bad file does not stop the run.
        result['status'] = 'error'
        result['error'] = str(e)
        import traceback
        traceback.print_exc()
    return result
def main():
    """CLI entry point: resolve XXX city placeholders in Czech custodian files.

    Finds CZ-*-XXX-*.yaml files under the custodian directory, resolves each
    via process_file(), prints a summary (per-status counts, top cities,
    first errors), and writes a Markdown report to REPORTS_DIR.

    Fixes over the original: the report file is opened with
    encoding='utf-8' (the report contains Czech city names with diacritics,
    which could fail or be garbled under a non-UTF-8 locale default), and
    REPORTS_DIR is created with parents=True so a missing parent directory
    does not abort the run.
    """
    parser = argparse.ArgumentParser(description='Enrich Czech custodian files with city data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show verbose output')
    args = parser.parse_args()
    print("=" * 60)
    print("CZECH CITY ENRICHMENT")
    print("=" * 60)
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
    # Candidate files: Czech custodians whose GHCID city code is the
    # XXX placeholder (encoded in the filename).
    czech_xxx_files = list(CUSTODIAN_DIR.glob("CZ-*-XXX-*.yaml"))
    if args.limit:
        czech_xxx_files = czech_xxx_files[:args.limit]
        print(f"Limited to {args.limit} files")
    print(f"Found {len(czech_xxx_files)} Czech files with XXX city placeholder")
    print()
    lookup = load_czech_source_data()
    # Shared session for the (currently disabled) SPARQL fallback.
    session = requests.Session()
    session.headers['User-Agent'] = 'GLAMDataExtractor/1.0 (heritage-data-enrichment)'
    stats = {
        'updated': 0,
        'would_update': 0,
        'unchanged': 0,
        'skipped': 0,
        'no_city_found': 0,
        'error': 0
    }
    cities_found = {}
    errors = []
    for i, file_path in enumerate(czech_xxx_files, 1):
        if i % 100 == 0 or args.verbose:
            print(f"Progress: {i}/{len(czech_xxx_files)}")
        result = process_file(file_path, lookup, session, dry_run=args.dry_run)
        stats[result['status']] = stats.get(result['status'], 0) + 1
        if result.get('city'):
            cities_found[result['city']] = cities_found.get(result['city'], 0) + 1
        if result.get('error'):
            errors.append(f"{file_path.name}: {result['error']}")
        if args.verbose and result['status'] in ('updated', 'would_update'):
            print(f" {file_path.name}")
            print(f" City: {result.get('city')}")
            print(f" {result['old_ghcid']} -> {result['new_ghcid']}")
    # Console summary.
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {len(czech_xxx_files)}")
    print()
    print("Results:")
    for status, count in sorted(stats.items()):
        if count > 0:
            print(f" {status}: {count}")
    if cities_found:
        print()
        print(f"Cities found: {len(cities_found)} unique")
        print("Top 10 cities:")
        for city, count in sorted(cities_found.items(), key=lambda x: -x[1])[:10]:
            print(f" {city}: {count}")
    if errors:
        print()
        print(f"Errors ({len(errors)}):")
        for err in errors[:10]:
            print(f" {err}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")
    # Markdown report.
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report_file = REPORTS_DIR / f"CZECH_CITY_ENRICHMENT_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("# Czech City Enrichment Report\n\n")
        f.write(f"**Date**: {datetime.now().isoformat()}\n")
        f.write(f"**Mode**: {'Dry Run' if args.dry_run else 'Live'}\n\n")
        f.write("## Summary\n\n")
        f.write(f"- Total files processed: {len(czech_xxx_files)}\n")
        for status, count in sorted(stats.items()):
            if count > 0:
                f.write(f"- {status}: {count}\n")
        if cities_found:
            f.write(f"\n## Cities Found ({len(cities_found)} unique)\n\n")
            for city, count in sorted(cities_found.items(), key=lambda x: -x[1]):
                f.write(f"- {city}: {count}\n")
    print()
    print(f"Report saved to: {report_file}")


if __name__ == '__main__':
    main()