Enrichment scripts for country-specific city data: - enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py - enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py - enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py Location resolution utilities: - resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames - resolve_cities_wikidata.py - Use Wikidata P131 for city resolution - resolve_country_codes.py - Standardize country codes - resolve_cz_xx_regions.py - Fix Czech XX region codes - resolve_locations_by_name.py - Name-based location lookup - resolve_regions_from_city.py - Derive regions from city data - update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data CH-Annotator integration: - create_custodian_from_ch_annotator.py - Create custodians from annotations - add_ch_annotator_location_claims.py - Add location claims - extract_locations_ch_annotator.py - Extract locations from annotations Migration and fixes: - migrate_egyptian_from_ch.py - Migrate Egyptian data - migrate_web_archives.py - Migrate web archive data - fix_belgian_cities.py - Fix Belgian city data
269 lines
9.5 KiB
Python
269 lines
9.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Resolve CZ-XX (unknown region) files to proper ISO 3166-2:CZ region codes.
|
|
|
|
This script updates 36 Czech institution files that have placeholder XX region codes
|
|
to their correct ISO 3166-2:CZ region codes based on researched location data.
|
|
|
|
Research completed 2025-12-07 via GeoNames database and web searches.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Lookup table: GeoNames admin1 code -> ISO 3166-2:CZ region code.
# Both sides are kept as strings; the trailing comment names the region.
ADMIN1_TO_ISO = {
    "52": "10",  # Prague
    "78": "64",  # South Moravian (Jihomoravský)
    "79": "31",  # South Bohemian (Jihočeský)
    "80": "63",  # Vysočina
    "81": "41",  # Karlovy Vary
    "82": "52",  # Hradec Králové
    "83": "51",  # Liberec
    "84": "71",  # Olomouc
    "85": "80",  # Moravian-Silesian (Moravskoslezský)
    "86": "53",  # Pardubice
    "87": "32",  # Plzeň
    "88": "20",  # Central Bohemian (Středočeský)
    "89": "42",  # Ústí nad Labem
    "90": "72",  # Zlín
}
|
|
|
|
# Research results, keyed by the old filename suffix (the part that follows
# "CZ-XX-XXX-"). Each value is a 5-tuple:
#   (new_region_code, new_city_code, city_name, geonames_id, admin1_code)

# Prague and Brno host most of the institutions, so their tuples are shared.
_PRAGUE = ("10", "PRA", "Prague", 3067696, "52")
_BRNO = ("64", "BRN", "Brno", 3078610, "78")

RESOLUTIONS = {
    # Archives (A)
    "A-SAČTÚ": _PRAGUE,
    "A-SAČÚZK": _PRAGUE,
    "A-SAERÚ": ("63", "JIH", "Jihlava", 3074199, "80"),
    "A-SAÚPOHS": _BRNO,
    "A-BSS": ("51", "MAS", "Malá Skála", 3071223, "83"),
    "A-PSJAK": ("53", "BNO", "Brandýs nad Orlicí", 3078836, "86"),
    "A-ZI": _PRAGUE,  # Admin location

    # Galleries (G)
    "G-GAU": ("52", "HOS", "Hostinné", 3075058, "82"),
    "G-GVP": ("20", "MLB", "Mladá Boleslav", 3070544, "88"),

    # Libraries (L) - Many are research institutes in Prague/Brno
    "L-SÚPRO": _PRAGUE,  # ABE064
    "L-ÚRB": _PRAGUE,  # ABE444
    "L-ÚSLOZ": _PRAGUE,  # ABE215
    "L-VŠZFA": _PRAGUE,
    "L-VŠZR": _PRAGUE,
    "L-VÚB": _BRNO,  # BOC006
    "L-VÚI": _PRAGUE,  # ABC043
    "L-VÚP": _PRAGUE,  # ABC066
    "L-VÚRV": _PRAGUE,  # ABC162
    "L-VUTÚTMŘP": _BRNO,
    "L-VVÚNP": _BRNO,  # BOF045
    "L-ZVVZVÚV": _PRAGUE,  # ABF127

    # Museums (M)
    "M-BMOP": ("32", "KPC", "Klenčí pod Čerchovem", 3073644, "87"),
    "M-MD": _PRAGUE,
    "M-MH": ("20", "KNC", "Kostelec nad Černými Lesy", 3073152, "88"),
    "M-MJD": ("32", "CHU", "Chudenice", 3077528, "87"),
    "M-MKISMDAH": ("63", "HUM", "Humpolec", 3074723, "80"),
    "M-MMGLK": ("20", "POD", "Poděbrady", 3068107, "88"),
    "M-MMM": ("42", "MIK", "Mikulášovice", 3070725, "89"),  # Mikcentrum!
    "M-MMSR": _PRAGUE,
    "M-MRV": ("51", "DES", "Desná", 3077198, "83"),
    "M-MSČ": ("20", "OST", "Ostředek", 3068792, "88"),
    "M-MTZSŘ": ("52", "DEO", "Deštné v Orlických horách", 3077191, "82"),
    "M-MVBŽS": ("31", "VOD", "Vodňany", 3062642, "79"),
    "M-PDEHAM": ("53", "HOL", "Holice", 3075599, "86"),
    "M-PMJH": ("31", "HUS", "Husinec", 3074686, "79"),
    "M-PZV": ("51", "PNJ", "Paseky nad Jizerou", 3068552, "83"),
}
|
|
|
|
|
|
def generate_city_code(city_name: str) -> str:
    """Derive a short upper-case city code from a city name.

    Combining accent marks are stripped first. The Czech connector words
    'nad', 'pod', 'v', 'u' and 'na' are then ignored (case-insensitively).

    * Exactly one remaining word: its first three letters.
    * Two or more remaining words: the initials of the first three of them
      (so a two-word name yields a two-letter code).
    * No remaining words: the first three characters of the accent-stripped
      name.
    """
    import unicodedata

    # NFD splits each accented letter into base letter + combining mark
    # (category 'Mn'); dropping the marks leaves the plain base letters.
    decomposed = unicodedata.normalize('NFD', city_name)
    plain = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    connectors = {'nad', 'pod', 'v', 'u', 'na'}
    kept = [token for token in plain.split() if token.lower() not in connectors]

    if not kept:
        # Name was empty or consisted only of connector words.
        return plain[:3].upper()
    if len(kept) == 1:
        return kept[0][:3].upper()
    return ''.join(token[0].upper() for token in kept[:3])
|
|
|
|
|
|
def update_yaml_file(filepath: Path, resolution: tuple) -> tuple:
    """
    Update a YAML file with resolved region/city data.

    The document's current GHCID must have the form
    ``CZ-XX-XXX-{TYPE}-{ABBREV}``; it is rewritten to
    ``CZ-{REGION}-{CITY}-{TYPE}-{ABBREV}``. The updated document is written to
    ``{new_ghcid}.yaml`` in the same directory and the original file is
    removed when the name changed.

    Args:
        filepath: Path of the YAML file to update.
        resolution: ``(region_code, city_code, city_name, geonames_id,
            admin1_code)`` tuple.

    Returns: (old_ghcid, new_ghcid, new_filepath) on success, or
        (None, None, None) when the file is skipped. A file is skipped, with
        a warning and without touching the disk, when the document has no
        ``ghcid`` mapping, when the GHCID does not match the expected
        pattern, or when the target file already exists.
    """
    region_code, city_code, city_name, geonames_id, admin1_code = resolution

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty file parses to None and a scalar/list document has no 'ghcid'
    # section; report those instead of crashing on attribute access.
    ghcid = data.get('ghcid') if isinstance(data, dict) else None
    if not isinstance(ghcid, dict):
        print(f" WARNING: No ghcid section in: {filepath.name}")
        return None, None, None

    # Extract current GHCID
    old_ghcid = ghcid.get('ghcid_current', '')

    # Build new GHCID
    # Pattern: CZ-XX-XXX-{TYPE}-{ABBREV} -> CZ-{REGION}-{CITY}-{TYPE}-{ABBREV}
    match = None
    if isinstance(old_ghcid, str):
        match = re.match(r'CZ-XX-XXX-([A-Z])-(.+)$', old_ghcid)
    if not match:
        print(f" WARNING: Could not parse GHCID: {old_ghcid}")
        return None, None, None

    inst_type, abbrev = match.groups()
    new_ghcid = f"CZ-{region_code}-{city_code}-{inst_type}-{abbrev}"
    new_filepath = filepath.parent / f"{new_ghcid}.yaml"

    # Refuse to clobber an unrelated record that already owns the new name;
    # otherwise it would be overwritten and the source file deleted.
    if new_filepath != filepath and new_filepath.exists():
        print(f" WARNING: Target already exists, not overwriting: {new_filepath.name}")
        return None, None, None

    timestamp = datetime.now(timezone.utc).isoformat()
    region_name = get_region_name(region_code)

    # Update ghcid section
    ghcid['ghcid_current'] = new_ghcid
    ghcid['location_resolution'] = {
        'method': 'GEONAMES_RESEARCH',
        'country_code': 'CZ',
        'region_code': region_code,
        'region_name': region_name,
        'city_code': city_code,
        'city_name': city_name,
        'geonames_id': geonames_id,
        'admin1_code': admin1_code,
        'resolution_timestamp': timestamp,
        'research_date': '2025-12-07',
        'research_method': 'GeoNames database + web search verification'
    }

    # Add history entry (a missing key and an explicit YAML null both start
    # a fresh list)
    history = ghcid.get('ghcid_history')
    if history is None:
        history = ghcid['ghcid_history'] = []
    history.append({
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f'Region resolved via GeoNames research: XX→{region_code}, city: {city_name} (GeoNames ID: {geonames_id})'
    })

    # Update provenance notes, creating the section/list when absent or null
    provenance = data.get('provenance')
    if provenance is None:
        provenance = data['provenance'] = {}
    notes = provenance.get('notes')
    if notes is None:
        notes = provenance['notes'] = []
    notes.append(
        f'Region resolved {timestamp[:10]}: XX→CZ-{region_code} ({city_name}) via GeoNames research'
    )

    # Update location, creating the section when absent or null
    location = data.get('location')
    if location is None:
        location = data['location'] = {}
    location['city'] = city_name
    location['country'] = 'CZ'
    location['region'] = region_name
    location['geonames_id'] = geonames_id

    # Write updated YAML
    with open(new_filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Remove old file if different
    if new_filepath != filepath:
        filepath.unlink()

    return old_ghcid, new_ghcid, new_filepath
|
|
|
|
|
|
def get_region_name(region_code: str) -> str:
    """Translate an ISO 3166-2:CZ region code into its English region name.

    Returns 'Unknown' for any code that is not in the table.
    """
    code_name_pairs = (
        ('10', 'Prague'),
        ('20', 'Central Bohemian'),
        ('31', 'South Bohemian'),
        ('32', 'Plzeň'),
        ('41', 'Karlovy Vary'),
        ('42', 'Ústí nad Labem'),
        ('51', 'Liberec'),
        ('52', 'Hradec Králové'),
        ('53', 'Pardubice'),
        ('63', 'Vysočina'),
        ('64', 'South Moravian'),
        ('71', 'Olomouc'),
        ('72', 'Zlín'),
        ('80', 'Moravian-Silesian'),
    )
    lookup = dict(code_name_pairs)
    try:
        return lookup[region_code]
    except KeyError:
        return 'Unknown'
|
|
|
|
|
|
def main(custodian_dir=None):
    """Resolve every CZ-XX-XXX custodian file in a directory and print a report.

    Args:
        custodian_dir: Directory holding the custodian YAML files, as a
            ``str`` or ``Path``. When omitted, the original hard-coded
            location is used, so a bare ``main()`` call behaves as before.

    Each ``CZ-XX-XXX-*.yaml`` file is looked up in ``RESOLUTIONS`` by the
    part of its name after ``CZ-XX-XXX-`` and passed to ``update_yaml_file``.
    Per-file results, a summary and any remaining ``CZ-XX-*.yaml`` files are
    printed to stdout. Returns None.
    """
    import unicodedata

    if custodian_dir is None:
        custodian_dir = '/Users/kempersc/apps/glam/data/custodian'
    custodian_dir = Path(custodian_dir)

    # Find all CZ-XX-XXX files
    xx_files = sorted(custodian_dir.glob('CZ-XX-XXX-*.yaml'))
    print(f"Found {len(xx_files)} CZ-XX-XXX files to resolve")

    resolved = 0
    failed = 0
    suffix_pattern = re.compile(r'CZ-XX-XXX-(.+)$')

    for filepath in xx_files:
        filename = filepath.stem
        # Extract suffix (e.g., "A-SAČTÚ" from "CZ-XX-XXX-A-SAČTÚ")
        suffix_match = suffix_pattern.match(filename)
        if not suffix_match:
            print(f" SKIP: Could not parse filename: {filename}")
            failed += 1
            continue

        suffix = suffix_match.group(1)

        # Some file systems (e.g. macOS HFS+) report names in decomposed
        # (NFD) form, which would not equal a composed key even though the
        # text looks identical; retry with the NFC form before giving up.
        resolution = RESOLUTIONS.get(suffix)
        if resolution is None:
            resolution = RESOLUTIONS.get(unicodedata.normalize('NFC', suffix))
        if resolution is None:
            print(f" SKIP: No resolution for: {suffix}")
            failed += 1
            continue

        # Top-level boundary: one bad file must not abort the whole batch,
        # so any error is reported and counted as a failure.
        try:
            old_ghcid, new_ghcid, new_filepath = update_yaml_file(filepath, resolution)
        except Exception as e:
            print(f" ✗ Error processing {filepath.name}: {e}")
            failed += 1
            continue

        if old_ghcid and new_ghcid:
            print(f" ✓ {old_ghcid} → {new_ghcid}")
            resolved += 1
        else:
            print(f" ✗ Failed to update: {filepath.name}")
            failed += 1

    print(f"\n{'='*60}")
    print(f"SUMMARY: Resolved {resolved}/{len(xx_files)} files")
    if failed:
        print(f" Failed: {failed}")

    # Verify no CZ-XX files remain
    remaining = list(custodian_dir.glob('CZ-XX-*.yaml'))
    print(f"\nRemaining CZ-XX files: {len(remaining)}")
    for f in remaining:
        print(f" - {f.name}")
|
|
|
|
|
|
# Run the resolver only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|