feat(scripts): add city enrichment and location resolution utilities
Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
This commit is contained in:
parent
4825f57951
commit
e45c1a3c85
22 changed files with 9349 additions and 0 deletions
203
scripts/add_ch_annotator_location_claims.py
Normal file
203
scripts/add_ch_annotator_location_claims.py
Normal file
|
|
@ -0,0 +1,203 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Add CH-Annotator compliant location claims to recently resolved Czech institution files.
|
||||
|
||||
This script adds location claims (city, region, country, geonames_id) to the
|
||||
ch_annotator.entity_claims array with proper 5-component provenance:
|
||||
1. namespace (geonames)
|
||||
2. path (xpath-style path to GeoNames resource)
|
||||
3. timestamp (ISO 8601)
|
||||
4. agent (glm4.6)
|
||||
5. context_convention (ch_annotator-v1_7_0)
|
||||
|
||||
Per AGENTS.md Rule 5: Additive only - never delete existing data.
|
||||
Per AGENTS.md Rule 10: CH-Annotator is the entity annotation convention.
|
||||
"""
|
||||
|
||||
import os
|
||||
import yaml
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# Configuration
|
||||
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
|
||||
RESEARCH_DATE = "2025-12-07"
|
||||
|
||||
|
||||
def find_resolved_files():
    """Return the sorted Czech custodian files resolved on RESEARCH_DATE.

    A file counts as "resolved" when its raw text contains the exact
    ``research_date: '<RESEARCH_DATE>'`` marker; unreadable files are
    reported and skipped.
    """
    marker = f"research_date: '{RESEARCH_DATE}'"
    matches = []

    for candidate in CUSTODIAN_DIR.glob("CZ-*.yaml"):
        try:
            text = candidate.read_text(encoding='utf-8')
        except Exception as e:
            print(f"Error reading {candidate}: {e}")
            continue
        if marker in text:
            matches.append(candidate)

    return sorted(matches)
|
||||
|
||||
|
||||
def add_location_claims(yaml_file: Path) -> "bool | None":
    """Add CH-Annotator location claims to a custodian file.

    Reads the custodian YAML, derives city/region/country/geonames_id from
    ``ghcid.location_resolution`` (falling back to the plain ``location``
    mapping), appends up to 4 claims to ``ch_annotator.entity_claims`` with
    full 5-component provenance, and rewrites the file. Additive only:
    existing claims are never removed (AGENTS.md Rule 5).

    Returns:
        True  - claims were added and the file was rewritten.
        False - file skipped (empty, data missing, or claims already exist).
        None  - unexpected error; None is still falsy for truth-value
                callers, but lets main() count errors separately from skips
                (previously errors returned False and were tallied as skips).
    """
    try:
        with open(yaml_file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            print(f" SKIP: Empty file {yaml_file.name}")
            return False

        # Resolution data wins; the raw `location` mapping is the fallback.
        location_resolution = data.get('ghcid', {}).get('location_resolution', {})
        location = data.get('location', {})

        if not location_resolution.get('geonames_id'):
            print(f" SKIP: No GeoNames ID in {yaml_file.name}")
            return False

        # Extract location values
        city_name = location_resolution.get('city_name') or location.get('city')
        region_name = location_resolution.get('region_name') or location.get('region')
        country_code = location_resolution.get('country_code') or location.get('country')
        geonames_id = location_resolution.get('geonames_id') or location.get('geonames_id')
        resolution_timestamp = location_resolution.get('resolution_timestamp')

        # Region is optional; the other three fields are mandatory.
        if not all([city_name, country_code, geonames_id]):
            print(f" SKIP: Missing required location data in {yaml_file.name}")
            return False

        # Ensure ch_annotator.entity_claims exists (never clobber data)
        if 'ch_annotator' not in data:
            data['ch_annotator'] = {}
        if 'entity_claims' not in data['ch_annotator']:
            data['ch_annotator']['entity_claims'] = []

        entity_claims = data['ch_annotator']['entity_claims']

        # Idempotence guard: location_city marks a previous successful run.
        existing_claim_types = {c.get('claim_type') for c in entity_claims if c}
        if 'location_city' in existing_claim_types:
            print(f" SKIP: Location claims already exist in {yaml_file.name}")
            return False

        # Prefer the original resolution timestamp; fall back to "now" (UTC).
        timestamp = resolution_timestamp or datetime.now(timezone.utc).isoformat()

        def make_provenance(path_suffix: str) -> dict:
            """Build the common 5-component provenance record."""
            return {
                'namespace': 'geonames',
                'path': f'/cities/{geonames_id}{path_suffix}',
                'timestamp': timestamp,
                'agent': 'glm4.6',  # Z.AI GLM 4.6 - preferred model
                'context_convention': 'ch_annotator-v1_7_0'
            }

        # Add location_city claim
        entity_claims.append({
            'claim_type': 'location_city',
            'claim_value': city_name,
            'property_uri': 'schema:addressLocality',
            'provenance': make_provenance('/name'),
            'confidence': 0.95,
            'resolution_method': 'GEONAMES_RESEARCH'
        })

        # Add location_region claim (only if a region was resolved)
        if region_name:
            entity_claims.append({
                'claim_type': 'location_region',
                'claim_value': region_name,
                'property_uri': 'schema:addressRegion',
                'provenance': make_provenance('/admin1'),
                'confidence': 0.95,
                'resolution_method': 'GEONAMES_RESEARCH'
            })

        # Add location_country claim
        entity_claims.append({
            'claim_type': 'location_country',
            'claim_value': country_code,
            'property_uri': 'schema:addressCountry',
            'provenance': make_provenance('/country'),
            'confidence': 0.98,
            'resolution_method': 'GEONAMES_RESEARCH'
        })

        # Add geonames_id claim (stringified for stable YAML round-trips)
        entity_claims.append({
            'claim_type': 'geonames_id',
            'claim_value': str(geonames_id),
            'property_uri': 'gn:geonamesId',
            'provenance': make_provenance(''),
            'confidence': 0.98,
            'resolution_method': 'GEONAMES_RESEARCH'
        })

        # Write back to file (keys kept in insertion order)
        with open(yaml_file, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        print(f" ADDED: 4 location claims to {yaml_file.name}")
        return True

    except Exception as e:
        print(f" ERROR: {yaml_file.name}: {e}")
        # Return None (not False) so callers can distinguish real errors
        # from ordinary skips; main()'s `else` error branch relies on this.
        return None
|
||||
|
||||
|
||||
def main():
    """Drive the claim-addition run and print a summary report."""
    banner = "=" * 70

    print(banner)
    print("CH-Annotator Location Claims Addition Script")
    print(banner)
    print(f"Looking for files resolved on: {RESEARCH_DATE}")
    print()

    # Collect candidate files
    resolved_files = find_resolved_files()
    print(f"Found {len(resolved_files)} resolved files")
    print()

    # Tally outcomes: truthy -> added, exactly False -> skipped,
    # anything else (e.g. None) -> error.
    tally = {'added': 0, 'skipped': 0, 'errors': 0}
    for yaml_file in resolved_files:
        outcome = add_location_claims(yaml_file)
        if outcome:
            tally['added'] += 1
        elif outcome is False:
            tally['skipped'] += 1
        else:
            tally['errors'] += 1

    # Summary
    print()
    print(banner)
    print("SUMMARY")
    print(banner)
    print(f"Files processed: {len(resolved_files)}")
    print(f"Claims added: {tally['added']}")
    print(f"Skipped: {tally['skipped']}")
    print(f"Errors: {tally['errors']}")
    print()

    if tally['added'] > 0:
        print("CH-Annotator location claims added successfully!")
        print("Each file now has 4 new claims:")
        print(" - location_city (schema:addressLocality)")
        print(" - location_region (schema:addressRegion)")
        print(" - location_country (schema:addressCountry)")
        print(" - geonames_id (gn:geonamesId)")


if __name__ == "__main__":
    main()
|
||||
547
scripts/create_custodian_from_ch_annotator.py
Normal file
547
scripts/create_custodian_from_ch_annotator.py
Normal file
|
|
@ -0,0 +1,547 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create custodian files from CH-Annotator data for unmatched institutions.
|
||||
|
||||
This script:
|
||||
1. Loads CH-Annotator files from data/instances/*_ch_annotator.yaml
|
||||
2. Checks which institutions don't have custodian files yet
|
||||
3. Generates GHCID for each new institution
|
||||
4. Creates custodian files in data/custodian/
|
||||
|
||||
Usage:
|
||||
python scripts/create_custodian_from_ch_annotator.py [--dry-run] [--limit N]
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import yaml
|
||||
import json
|
||||
import re
|
||||
import uuid
|
||||
import hashlib
|
||||
import argparse
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
# Paths
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
CH_ANNOTATOR_DIR = PROJECT_ROOT / "data" / "instances"
|
||||
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
|
||||
REPORTS_DIR = PROJECT_ROOT / "reports"
|
||||
INDEX_FILE = Path("/tmp/custodian_index.json")
|
||||
|
||||
# GHCID namespace UUID for deterministic UUID v5 generation.
# NOTE(review): 6ba7b810-9dad-11d1-80b4-00c04fd430c8 is the RFC 4122 *DNS*
# namespace (uuid.NAMESPACE_DNS), not the URL namespace (6ba7b811-...) as the
# original comment claimed. Every stored GHCID UUID depends on this exact
# value, so it must not be changed without a migration.
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')

# Institution type to GHCID code mapping: the single letter becomes the 4th
# segment of a GHCID. Unknown types fall back to 'U' at the call site.
TYPE_TO_CODE = {
    'GALLERY': 'G',
    'LIBRARY': 'L',
    'ARCHIVE': 'A',
    'MUSEUM': 'M',
    'OFFICIAL_INSTITUTION': 'O',
    'RESEARCH_CENTER': 'R',
    'CORPORATION': 'C',
    'UNKNOWN': 'U',
    'BOTANICAL_ZOO': 'B',
    'EDUCATION_PROVIDER': 'E',
    'COLLECTING_SOCIETY': 'S',
    'FEATURES': 'F',
    'INTANGIBLE_HERITAGE_GROUP': 'I',
    'MIXED': 'X',
    'PERSONAL_COLLECTION': 'P',
    'HOLY_SITES': 'H',
    'DIGITAL_PLATFORM': 'D',
    'NGO': 'N',
    'TASTE_SMELL': 'T',
}

# Prepositions/articles to skip when building abbreviations. Mixes Dutch,
# English, French, German, Spanish, Portuguese and Italian stop words plus
# common conjunctions; duplicates across languages are harmless in a set.
SKIP_WORDS = {
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
    'a', 'an', 'the', 'of', 'at', 'on', 'to', 'for', 'with', 'from', 'by',
    'le', 'la', 'les', 'un', 'une', 'des', 'du', 'à', 'au', 'aux', 'en',
    'der', 'die', 'das', 'dem', 'ein', 'eine', 'von', 'zu', 'für', 'mit',
    'el', 'la', 'los', 'las', 'un', 'una', 'del', 'al', 'con', 'por', 'para',
    'o', 'os', 'as', 'um', 'uma', 'do', 'da', 'dos', 'das', 'em', 'no', 'na',
    'il', 'lo', 'i', 'gli', 'di', 'del', 'dello', 'della', 'nel', 'nella',
    'and', 'or', 'but', 'und', 'oder', 'et', 'ou', 'e', 'y', 'o',
}
|
||||
|
||||
|
||||
def normalize_name(name: str) -> str:
    """Lower-case *name*, drop punctuation, and collapse runs of whitespace."""
    if not name:
        return ""
    # Remove everything that is neither a word character nor whitespace.
    stripped = re.sub(r'[^\w\s]', '', name.lower())
    # split()/join collapses and trims all whitespace in one pass.
    return ' '.join(stripped.split())
|
||||
|
||||
|
||||
def normalize_wikidata(qid: str) -> str:
    """Reduce a Wikidata identifier (URI or bare QID) to an upper-case QID."""
    if not qid:
        return ""
    text = str(qid)
    # Full entity URIs keep only the final path component (the QID).
    if '/' in text:
        text = text.rsplit('/', 1)[-1]
    return text.strip().upper()
|
||||
|
||||
|
||||
def generate_abbreviation(name: str, max_len: int = 10) -> str:
    """Build an initialism from the significant words of *name*.

    Stop words (SKIP_WORDS) and pure numbers are ignored; if nothing
    survives the filter, the first three raw words are used instead.
    Returns "UNK" when no initials can be formed.
    """
    if not name:
        return "UNK"

    # Punctuation becomes spaces so hyphenated/joined words split cleanly.
    tokens = re.sub(r'[^\w\s]', ' ', name).split()

    meaningful = [t for t in tokens if t.lower() not in SKIP_WORDS and not t.isdigit()]
    if not meaningful:
        meaningful = tokens[:3]  # fallback: first 3 raw words

    initials = ''.join(t[0].upper() for t in meaningful if t)
    return initials[:max_len] if initials else "UNK"
|
||||
|
||||
|
||||
def name_to_snake_case(name: str) -> str:
    """Convert *name* to an ASCII snake_case suffix, capped at 50 chars."""
    import unicodedata

    # NFD-decompose and drop combining marks to strip diacritics.
    decomposed = unicodedata.normalize('NFD', name)
    ascii_text = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    text = ascii_text.lower()
    text = re.sub(r"[''`\",.:;!?()[\]{}]", '', text)  # drop punctuation
    text = re.sub(r'[\s\-]+', '_', text)              # spaces/hyphens -> _
    text = re.sub(r'[^a-z0-9_]', '', text)            # purge leftovers
    text = re.sub(r'_+', '_', text).strip('_')        # collapse/trim _
    return text[:50]
|
||||
|
||||
|
||||
def generate_ghcid(
    country_code: str,
    region_code: str,
    city_code: str,
    institution_type: str,
    abbreviation: str,
    name_suffix: Optional[str] = None
) -> str:
    """Assemble a GHCID: COUNTRY-REGION-CITY-TYPE-ABBREV[-name_suffix].

    The institution type is mapped to its single-letter code via
    TYPE_TO_CODE; unrecognised types get 'U' (unknown).
    """
    segments = [
        country_code,
        region_code,
        city_code,
        TYPE_TO_CODE.get(institution_type, 'U'),
        abbreviation,
    ]
    if name_suffix:
        segments.append(name_suffix)
    return '-'.join(segments)
|
||||
|
||||
|
||||
def generate_ghcid_uuid(ghcid: str) -> str:
    """Derive the deterministic UUID v5 for *ghcid* under GHCID_NAMESPACE."""
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid)
    return str(derived)
|
||||
|
||||
|
||||
def generate_ghcid_uuid_sha256(ghcid: str) -> str:
    """Generate UUID v8 (SHA-256 based) from GHCID string."""
    sha256_hash = hashlib.sha256(ghcid.encode()).hexdigest()
    # Format as UUID v8.
    # NOTE(review): this layout hard-codes the version nibble to '8' but then
    # resumes at hash char 13 (char 12 is silently skipped), and the variant
    # nibble of the fourth group is taken straight from the hash instead of
    # being forced to the RFC 4122 '10xx' pattern - so the result is not a
    # strictly RFC-compliant UUIDv8. Stored identifiers depend on this exact
    # string, so do not "fix" the formatting without a data migration.
    uuid_str = f"{sha256_hash[:8]}-{sha256_hash[8:12]}-8{sha256_hash[13:16]}-{sha256_hash[16:20]}-{sha256_hash[20:32]}"
    return uuid_str
|
||||
|
||||
|
||||
def generate_ghcid_numeric(ghcid: str) -> int:
    """Derive a deterministic unsigned 64-bit integer from *ghcid*.

    Uses the first 8 bytes of the SHA-256 digest, interpreted big-endian.
    """
    digest = hashlib.sha256(ghcid.encode()).digest()
    return int.from_bytes(digest[:8], byteorder='big')
|
||||
|
||||
|
||||
def load_custodian_index() -> Dict:
    """Load the custodian lookup index, building and caching it if absent.

    The index maps identifiers to custodian file paths under four keys:
    'by_wikidata', 'by_name', 'by_isil', 'by_ghcid'. ('by_isil' is reserved
    and never populated here.) A JSON copy is cached at INDEX_FILE so
    repeated runs skip the full directory scan.

    Returns:
        The index dict described above.
    """
    if INDEX_FILE.exists():
        with open(INDEX_FILE, 'r') as f:
            return json.load(f)

    # Build index by scanning every custodian YAML file once.
    print("Building custodian index...")
    index = {'by_wikidata': {}, 'by_name': {}, 'by_isil': {}, 'by_ghcid': {}}

    for f in CUSTODIAN_DIR.glob("*.yaml"):
        try:
            with open(f, 'r') as fh:
                content = fh.read()

            # The filename stem IS the GHCID by convention.
            ghcid = f.stem
            index['by_ghcid'][ghcid] = str(f)

            # Regex extraction is much cheaper than full YAML parsing here.
            match = re.search(r'wikidata_entity_id:\s*["\']?(Q\d+)', content)
            if match:
                index['by_wikidata'][match.group(1).upper()] = str(f)

            # Extract the institution name from the `organisatie:` field.
            match = re.search(r'organisatie:\s*(.+?)$', content, re.MULTILINE)
            if match:
                name = match.group(1).strip().strip('"\'')
                index['by_name'][normalize_name(name)] = str(f)

        except Exception as e:
            # Best-effort scan: report and skip unreadable files. The
            # previous bare `except: pass` also swallowed KeyboardInterrupt
            # and SystemExit and hid broken files entirely.
            print(f"Warning: could not index {f}: {e}")

    with open(INDEX_FILE, 'w') as f:
        json.dump(index, f)

    return index
|
||||
|
||||
|
||||
def institution_exists(inst: Dict, index: Dict) -> bool:
    """Return True when *inst* matches an indexed custodian file.

    Matching is tried by Wikidata QID first, then by normalized name.
    """
    # Wikidata QID match takes precedence over name matching.
    wikidata_lookup = index['by_wikidata']
    for ident in inst.get('identifiers', []):
        if ident.get('identifier_scheme', '').upper() != 'WIKIDATA':
            continue
        qid = normalize_wikidata(ident.get('identifier_value', ''))
        if qid and qid in wikidata_lookup:
            return True

    # Fall back to a normalized-name match.
    normalized = normalize_name(inst.get('name', ''))
    return bool(normalized and normalized in index['by_name'])
|
||||
|
||||
|
||||
def sanitize_code(code: str, max_len: int = 2) -> str:
    """Upper-case, ASCII-only code for use in filenames and GHCIDs.

    Strips diacritics via NFD decomposition, keeps only alphanumerics, and
    truncates to *max_len*. Falls back to the placeholder "XX" (max_len 2)
    or "XXX" (otherwise) when the input is empty or nothing survives.
    """
    import unicodedata

    placeholder = "XX" if max_len == 2 else "XXX"
    if not code:
        return placeholder

    # Decompose, then discard combining marks (category 'Mn') -> no accents.
    decomposed = unicodedata.normalize('NFD', str(code))
    plain = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    alnum = re.sub(r'[^a-zA-Z0-9]', '', plain)
    if not alnum:
        return placeholder

    return alnum[:max_len].upper()
|
||||
|
||||
|
||||
def extract_location_info(inst: Dict) -> Tuple[str, str, str]:
    """Return (country, region, city) codes from the institution's first location.

    Defaults are the placeholders "XX"/"XX"/"XXX" when no location data is
    available. Region values that are already two alphabetic characters are
    treated as ready-made codes; anything else is sanitized down to two
    characters. City names are always sanitized to a three-letter code.
    """
    country_code, region_code, city_code = "XX", "XX", "XXX"

    locations = inst.get('locations', [])
    if locations:
        primary = locations[0]
        country_code = primary.get('country', 'XX') or 'XX'

        region_raw = primary.get('region', 'XX') or 'XX'
        if len(region_raw) == 2 and region_raw.isalpha():
            # Already a 2-letter code - just normalize the case.
            region_code = region_raw.upper()
        else:
            # Full region name - reduce to a sanitized 2-letter code.
            region_code = sanitize_code(region_raw, 2)

        city = primary.get('city', '')
        if city:
            city_code = sanitize_code(city, 3)

    return country_code, region_code, city_code
|
||||
|
||||
|
||||
def create_custodian_file(inst: Dict, source_file: str, index: Dict) -> Tuple[Optional[Path], str]:
    """
    Create a custodian YAML file for an institution and update the index.

    Generates a (collision-resolved) GHCID plus its derived UUID/numeric
    identifiers, assembles the full custodian record, writes it to
    CUSTODIAN_DIR/<ghcid>.yaml, and registers it in the in-memory index.

    Returns: (file_path, 'created') on success, or (None, 'error: ...') on
    failure. NOTE(review): an 'exists' status was documented originally but
    is never produced - existence is the caller's job via institution_exists.
    """
    try:
        name = inst.get('name', 'Unknown Institution')
        institution_type = inst.get('institution_type', 'UNKNOWN')

        # Extract country/region/city codes from the first location entry.
        country_code, region_code, city_code = extract_location_info(inst)

        # Initialism built from the significant words of the name.
        abbreviation = generate_abbreviation(name)

        # Generate base GHCID
        base_ghcid = generate_ghcid(country_code, region_code, city_code, institution_type, abbreviation)

        # Collision check against already-indexed GHCIDs; resolved by
        # appending a snake_case name suffix. NOTE(review): if the suffixed
        # GHCID also collides, the existing file would be overwritten.
        ghcid = base_ghcid
        if ghcid in index['by_ghcid']:
            # Add name suffix to resolve collision
            name_suffix = name_to_snake_case(name)
            ghcid = generate_ghcid(country_code, region_code, city_code, institution_type, abbreviation, name_suffix)

        # Derived identifiers: deterministic UUID v5, SHA-256-based UUID,
        # 64-bit numeric ID, plus one random per-record UUID v4.
        ghcid_uuid = generate_ghcid_uuid(ghcid)
        ghcid_uuid_sha256 = generate_ghcid_uuid_sha256(ghcid)
        ghcid_numeric = generate_ghcid_numeric(ghcid)
        record_id = str(uuid.uuid4())

        # Single timestamp reused everywhere for internal consistency.
        timestamp = datetime.now(timezone.utc).isoformat()

        # Build custodian data structure (key order is preserved on dump).
        custodian_data = {
            'original_entry': {
                'name': name,
                'institution_type': institution_type,
                'source': f'CH-Annotator ({source_file})',
                'identifiers': inst.get('identifiers', []),
                'locations': inst.get('locations', []),
            },
            'processing_timestamp': timestamp,
            'ghcid': {
                'ghcid_current': ghcid,
                'ghcid_original': ghcid,
                'ghcid_uuid': ghcid_uuid,
                'ghcid_uuid_sha256': ghcid_uuid_sha256,
                'ghcid_numeric': ghcid_numeric,
                'record_id': record_id,
                'generation_timestamp': timestamp,
                'location_resolution': {
                    'country_code': country_code,
                    'region_code': region_code,
                    'city_code': city_code,
                    'method': 'CH_ANNOTATOR_SOURCE',
                },
                # History starts with a single entry; later renames append.
                'ghcid_history': [{
                    'ghcid': ghcid,
                    'ghcid_numeric': ghcid_numeric,
                    'valid_from': timestamp,
                    'reason': f'Initial GHCID from CH-Annotator ({source_file})',
                }],
            },
            'custodian_name': {
                'claim_type': 'custodian_name',
                'claim_value': name,
                'source_type': 'ch_annotator',
            },
            'identifiers': [
                {'identifier_scheme': 'GHCID', 'identifier_value': ghcid},
                {'identifier_scheme': 'GHCID_UUID', 'identifier_value': ghcid_uuid},
                {'identifier_scheme': 'GHCID_UUID_SHA256', 'identifier_value': ghcid_uuid_sha256},
                {'identifier_scheme': 'GHCID_NUMERIC', 'identifier_value': str(ghcid_numeric)},
                {'identifier_scheme': 'RECORD_ID', 'identifier_value': record_id},
            ],
            'provenance': {
                'data_source': inst.get('provenance', {}).get('data_source', 'CH_ANNOTATOR'),
                'data_tier': inst.get('provenance', {}).get('data_tier', 'TIER_3_CROWD_SOURCED'),
                'extraction_date': inst.get('provenance', {}).get('extraction_date', timestamp),
                'extraction_method': f'Created from CH-Annotator file: {source_file}',
                'confidence_score': inst.get('provenance', {}).get('confidence_score', 0.8),
            },
            'ch_annotator': inst.get('ch_annotator', {}),
        }

        # Carry over the institution's original identifiers, excluding
        # schemes we generate ourselves (avoids stale duplicates).
        for ident in inst.get('identifiers', []):
            scheme = ident.get('identifier_scheme', '').upper()
            if scheme not in ['GHCID', 'GHCID_UUID', 'GHCID_UUID_SHA256', 'GHCID_NUMERIC', 'RECORD_ID']:
                custodian_data['identifiers'].append(ident)

        # Add Wikidata enrichment from the first WIKIDATA identifier, if any.
        for ident in inst.get('identifiers', []):
            if ident.get('identifier_scheme', '').upper() == 'WIKIDATA':
                custodian_data['wikidata_enrichment'] = {
                    'wikidata_entity_id': ident.get('identifier_value', '').split('/')[-1],
                    # Assumes the institution name doubles as the English
                    # label - not verified against Wikidata here.
                    'wikidata_label_en': name,
                }
                break

        # Record where/how this file was created inside the ch_annotator
        # block (only when the source actually carried ch_annotator data).
        if 'ch_annotator' in custodian_data and custodian_data['ch_annotator']:
            custodian_data['ch_annotator']['integration_note'] = {
                'created_from': source_file,
                'creation_date': timestamp,
                'creation_method': 'create_custodian_from_ch_annotator.py',
            }

        # Write the file, named after the GHCID.
        file_path = CUSTODIAN_DIR / f"{ghcid}.yaml"

        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(custodian_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)

        # Keep the in-memory index current so later institutions in the same
        # run see this file (by_wikidata is intentionally not updated here).
        index['by_ghcid'][ghcid] = str(file_path)
        if normalize_name(name):
            index['by_name'][normalize_name(name)] = str(file_path)

        return file_path, 'created'

    except Exception as e:
        return None, f'error: {e}'
|
||||
|
||||
|
||||
def load_ch_annotator_file(path: Path) -> List[Dict]:
    """Load the institution list from a CH-Annotator YAML file.

    Files contain either a bare list of institutions or a mapping with an
    'institutions' key; any other payload yields an empty list.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        payload = yaml.safe_load(handle)

    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict):
        return payload.get('institutions', [])
    return []
|
||||
|
||||
|
||||
def main():
    """Create custodian files for every unmatched CH-Annotator institution.

    Scans data/instances/*_ch_annotator.yaml, skips institutions that
    already have a custodian file (by Wikidata QID or normalized name),
    creates the rest, and writes a markdown report to REPORTS_DIR.
    Returns 0 (used as the process exit code).
    """
    parser = argparse.ArgumentParser(description='Create custodian files from CH-Annotator data')
    parser.add_argument('--dry-run', action='store_true', help='Preview without creating files')
    parser.add_argument('--limit', type=int, default=0, help='Limit institutions per file (0=unlimited)')
    parser.add_argument('--skip-large', action='store_true', help='Skip files with >5000 institutions')
    args = parser.parse_args()

    print("=" * 60)
    print("Create Custodian Files from CH-Annotator Data")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN MODE - No files will be created")

    # Load (or build) the identifier -> file-path lookup index.
    print("\n1. Loading custodian index...")
    index = load_custodian_index()
    print(f" Indexed: {len(index.get('by_ghcid', {}))} GHCIDs, "
          f"{len(index.get('by_wikidata', {}))} Wikidata, "
          f"{len(index.get('by_name', {}))} names")

    # Find CH-Annotator files
    ch_files = sorted(CH_ANNOTATOR_DIR.glob("*_ch_annotator.yaml"))
    print(f"\n2. Found {len(ch_files)} CH-Annotator files")

    # Aggregate counters across all source files.
    total_stats = {
        'processed': 0,
        'created': 0,
        'skipped_exists': 0,
        'errors': 0,
        'by_source': {},
    }

    for ch_file in ch_files:
        print(f"\n--- {ch_file.name} ---")

        try:
            institutions = load_ch_annotator_file(ch_file)
            print(f" Loaded {len(institutions)} institutions")

            if args.skip_large and len(institutions) > 5000:
                print(f" SKIPPING (>5000 institutions)")
                continue

            file_stats = {'processed': 0, 'created': 0, 'skipped': 0, 'errors': 0}

            for i, inst in enumerate(institutions):
                # --limit counts processed institutions per source file.
                if args.limit and file_stats['processed'] >= args.limit:
                    print(f" Reached limit of {args.limit}")
                    break

                # Progress heartbeat every 500 institutions.
                if i % 500 == 0 and i > 0:
                    print(f" Progress: {i}/{len(institutions)}, created: {file_stats['created']}")

                file_stats['processed'] += 1
                total_stats['processed'] += 1

                # Skip institutions that already have a custodian file.
                if institution_exists(inst, index):
                    file_stats['skipped'] += 1
                    total_stats['skipped_exists'] += 1
                    continue

                # Create the custodian file (unless dry-running).
                # NOTE(review): nesting reconstructed from a source whose
                # indentation was lost - the `else` is read as the dry-run
                # arm, counting the institution as would-be 'created'.
                # Confirm against the original file before relying on it.
                if not args.dry_run:
                    path, status = create_custodian_file(inst, ch_file.name, index)

                    if status == 'created':
                        file_stats['created'] += 1
                        total_stats['created'] += 1
                    elif 'error' in status:
                        file_stats['errors'] += 1
                        total_stats['errors'] += 1
                else:
                    file_stats['created'] += 1
                    total_stats['created'] += 1

            print(f" Processed: {file_stats['processed']}, Created: {file_stats['created']}, "
                  f"Skipped: {file_stats['skipped']}, Errors: {file_stats['errors']}")

            total_stats['by_source'][ch_file.name] = file_stats

        except Exception as e:
            # A broken source file aborts only that file, not the whole run.
            print(f" ERROR: {e}")
            total_stats['errors'] += 1

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total processed: {total_stats['processed']}")
    print(f"Files created: {total_stats['created']}")
    print(f"Skipped (already exist): {total_stats['skipped_exists']}")
    print(f"Errors: {total_stats['errors']}")

    # Save a markdown report (skipped on dry runs).
    if not args.dry_run:
        REPORTS_DIR.mkdir(exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_path = REPORTS_DIR / f"CUSTODIAN_CREATION_REPORT_{timestamp}.md"

        report = f"""# Custodian File Creation Report

Generated: {datetime.now(timezone.utc).isoformat()}

## Summary

| Metric | Count |
|--------|-------|
| Institutions processed | {total_stats['processed']} |
| Custodian files created | {total_stats['created']} |
| Skipped (already exist) | {total_stats['skipped_exists']} |
| Errors | {total_stats['errors']} |

## By Source File

| Source File | Processed | Created | Skipped | Errors |
|-------------|-----------|---------|---------|--------|
"""
        for source, stats in total_stats['by_source'].items():
            report += f"| {source} | {stats['processed']} | {stats['created']} | {stats['skipped']} | {stats['errors']} |\n"

        with open(report_path, 'w') as f:
            f.write(report)

        print(f"\nReport saved to: {report_path}")

    return 0


if __name__ == '__main__':
    sys.exit(main())
|
||||
515
scripts/enrich_austrian_cities.py
Normal file
515
scripts/enrich_austrian_cities.py
Normal file
|
|
@ -0,0 +1,515 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Enrich Austrian custodian files with city data.
|
||||
|
||||
Strategy:
|
||||
1. Use coordinates for reverse geocoding when available
|
||||
2. Extract city names from institution names (Wien, Salzburg, Graz, etc.)
|
||||
3. Validate against GeoNames database
|
||||
|
||||
Usage:
|
||||
python scripts/enrich_austrian_cities.py [--dry-run]
|
||||
"""
|
||||
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
import unicodedata
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# Austrian admin1 codes (GeoNames → ISO 3166-2:AT).
# Maps the two-digit GeoNames admin1 code for Austria to the short
# Bundesland letter code used elsewhere (e.g. '09' → 'W' for Wien).
AUSTRIAN_ADMIN1_MAP = {
    '01': 'B',   # Burgenland
    '02': 'K',   # Carinthia (Kärnten)
    '03': 'NO',  # Lower Austria (Niederösterreich)
    '04': 'OO',  # Upper Austria (Oberösterreich)
    '05': 'S',   # Salzburg
    '06': 'ST',  # Styria (Steiermark)
    '07': 'T',   # Tyrol (Tirol)
    '08': 'V',   # Vorarlberg
    '09': 'W',   # Vienna (Wien)
}
|
||||
|
||||
# Known Austrian cities in institution names
|
||||
AUSTRIAN_CITY_PATTERNS = [
|
||||
# Major cities
|
||||
(r'\bWien\b', 'Wien'),
|
||||
(r'\bVienna\b', 'Wien'),
|
||||
(r'\bGraz\b', 'Graz'),
|
||||
(r'\bLinz\b', 'Linz'),
|
||||
(r'\bSalzburg\b', 'Salzburg'),
|
||||
(r'\bInnsbruck\b', 'Innsbruck'),
|
||||
(r'\bKlagenfurt\b', 'Klagenfurt'),
|
||||
(r'\bVillach\b', 'Villach'),
|
||||
(r'\bWels\b', 'Wels'),
|
||||
(r'\bSt\.?\s*Pölten\b', 'Sankt Pölten'),
|
||||
(r'\bSankt\s+Pölten\b', 'Sankt Pölten'),
|
||||
(r'\bDornbirn\b', 'Dornbirn'),
|
||||
(r'\bWiener\s+Neustadt\b', 'Wiener Neustadt'),
|
||||
(r'\bSteyr\b', 'Steyr'),
|
||||
(r'\bFeldkirch\b', 'Feldkirch'),
|
||||
(r'\bBregenz\b', 'Bregenz'),
|
||||
(r'\bLeonding\b', 'Leonding'),
|
||||
(r'\bKlosterneuburg\b', 'Klosterneuburg'),
|
||||
(r'\bBaden\b', 'Baden'),
|
||||
(r'\bLeoben\b', 'Leoben'),
|
||||
(r'\bKrems\b', 'Krems an der Donau'),
|
||||
(r'\bAmstetten\b', 'Amstetten'),
|
||||
(r'\bMödling\b', 'Mödling'),
|
||||
(r'\bKapfenberg\b', 'Kapfenberg'),
|
||||
(r'\bLustenau\b', 'Lustenau'),
|
||||
(r'\bHallein\b', 'Hallein'),
|
||||
(r'\bKufstein\b', 'Kufstein'),
|
||||
(r'\bTraun\b', 'Traun'),
|
||||
(r'\bAnsfelden\b', 'Ansfelden'),
|
||||
(r'\bHohenems\b', 'Hohenems'),
|
||||
(r'\bSchwechat\b', 'Schwechat'),
|
||||
(r'\bBraunau\b', 'Braunau am Inn'),
|
||||
(r'\bStockerau\b', 'Stockerau'),
|
||||
(r'\bSaalfelden\b', 'Saalfelden am Steinernen Meer'),
|
||||
(r'\bTernitz\b', 'Ternitz'),
|
||||
(r'\bPerchtoldsdorf\b', 'Perchtoldsdorf'),
|
||||
(r'\bEisenstädter?\b', 'Eisenstadt'),
|
||||
(r'\bEisenstadt\b', 'Eisenstadt'),
|
||||
(r'\bTelfs\b', 'Telfs'),
|
||||
(r'\bWolfsberg\b', 'Wolfsberg'),
|
||||
(r'\bHard\b', 'Hard'),
|
||||
(r'\bKorneuburg\b', 'Korneuburg'),
|
||||
(r'\bNeunkirchen\b', 'Neunkirchen'),
|
||||
(r'\bRied\b', 'Ried im Innkreis'),
|
||||
(r'\bBad\s+Ischl\b', 'Bad Ischl'),
|
||||
(r'\bGmunden\b', 'Gmunden'),
|
||||
(r'\bWörgl\b', 'Wörgl'),
|
||||
(r'\bMelk\b', 'Melk'),
|
||||
(r'\bZell\s+am\s+See\b', 'Zell am See'),
|
||||
(r'\bMistelbach\b', 'Mistelbach'),
|
||||
(r'\bVöcklabruck\b', 'Vöcklabruck'),
|
||||
(r'\bMarchtrenk\b', 'Marchtrenk'),
|
||||
(r'\bEnns\b', 'Enns'),
|
||||
(r'\bBruck\s+an\s+der\s+Mur\b', 'Bruck an der Mur'),
|
||||
(r'\bSpittal\b', 'Spittal an der Drau'),
|
||||
(r'\bSchwaz\b', 'Schwaz'),
|
||||
(r'\bVoitsberg\b', 'Voitsberg'),
|
||||
(r'\bRankweil\b', 'Rankweil'),
|
||||
(r'\bBad\s+Vöslau\b', 'Bad Vöslau'),
|
||||
(r'\bTulln\b', 'Tulln an der Donau'),
|
||||
(r'\bGänserndorf\b', 'Gänserndorf'),
|
||||
(r'\bHollabrunn\b', 'Hollabrunn'),
|
||||
(r'\bLienz\b', 'Lienz'),
|
||||
(r'\bHall\s+in\s+Tirol\b', 'Hall in Tirol'),
|
||||
(r'\bFeldkirchen\b', 'Feldkirchen in Kärnten'),
|
||||
(r'\bZwettl\b', 'Zwettl'),
|
||||
(r'\bWaidhofen\b', 'Waidhofen an der Ybbs'),
|
||||
(r'\bMattersburg\b', 'Mattersburg'),
|
||||
(r'\bOberwart\b', 'Oberwart'),
|
||||
(r'\bJudenburg\b', 'Judenburg'),
|
||||
(r'\bPöchlarn\b', 'Pöchlarn'),
|
||||
(r'\bFranziskanerplatz\b', 'Wien'), # Common Vienna address
|
||||
(r'\bJosefsplatz\b', 'Wien'), # Hofburg, Vienna
|
||||
|
||||
# Regional references → capital cities
|
||||
(r'\bTiroler\b', 'Innsbruck'), # Amt der Tiroler Landesregierung
|
||||
(r'\bBurgenländische\b', 'Eisenstadt'), # Burgenländische Landesbibliothek
|
||||
(r'\bKärnt(?:en|ner)\b', 'Klagenfurt'), # Kärnten/Kärntner → Klagenfurt
|
||||
(r'\bVorarlberg(?:er)?\b', 'Feldkirch'), # Vorarlberg
|
||||
(r'\bSteiermark\b', 'Graz'), # Steiermark
|
||||
(r'\bSteiermärk\b', 'Graz'), # Steiermärkisch
|
||||
(r'\bOÖ\b', 'Linz'), # OÖ = Oberösterreich
|
||||
(r'\bOberösterreich\b', 'Linz'), # Oberösterreich
|
||||
(r'\bNiederösterreich\b', 'Sankt Pölten'), # Niederösterreich
|
||||
(r'\bNÖ\b', 'Sankt Pölten'), # NÖ = Niederösterreich
|
||||
(r'\bSalzburg(?:er)?\b', 'Salzburg'), # Salzburger Festspiele
|
||||
|
||||
# Small towns mentioned in institution names
|
||||
(r'\bKaltenleutgeben\b', 'Kaltenleutgeben'),
|
||||
(r'\bLambach\b', 'Lambach'),
|
||||
(r'\bSeitenstetten\b', 'Seitenstetten'),
|
||||
(r'\bMattsee\b', 'Mattsee'),
|
||||
(r'\bPöggstall\b', 'Pöggstall'),
|
||||
(r'\bLaxenburg\b', 'Laxenburg'),
|
||||
(r'\bEggenburg\b', 'Eggenburg'),
|
||||
(r'\bPressbaum\b', 'Pressbaum'),
|
||||
(r'\bSeeburg\b', 'Seekirchen am Wallersee'), # Schloss Seeburg
|
||||
(r'\bSchotten(?:stift)?\b', 'Wien'), # Schottenstift is in Vienna
|
||||
(r'\bAlbertina\b', 'Wien'), # Albertina is in Vienna
|
||||
(r'\bMozarteum\b', 'Salzburg'), # Mozarteum is in Salzburg
|
||||
(r'\bParacelsus\b', 'Salzburg'), # Paracelsus Medizinische Privatuniversität
|
||||
(r'\bJoanneum\b', 'Graz'), # FH Joanneum is in Graz
|
||||
(r'\bParlament\b', 'Wien'), # Parlamentsbibliothek
|
||||
(r'\bBundeskanzleramt\b', 'Wien'), # Federal Chancellery
|
||||
(r'\bBundesministerium\b', 'Wien'), # Federal Ministries
|
||||
(r'\bBundesdenkmalamt\b', 'Wien'), # Federal Monument Office
|
||||
(r'\bÖsterreich(?:ische[rn]?)?\b', 'Wien'), # Austrian national institutions
|
||||
(r'\bIST\s*Austria\b', 'Klosterneuburg'), # Institute of Science and Technology Austria
|
||||
(r'\bInstitute\s+of\s+Science\s+and\s+Technology\s+Austria\b', 'Klosterneuburg'), # Full name
|
||||
(r'\bRapid(?:eum)?\b', 'Wien'), # SK Rapid Vienna
|
||||
(r'\bMetalab\b', 'Wien'), # Metalab hackerspace Vienna
|
||||
(r'\bSigmund\s+Freud\b', 'Wien'), # Sigmund Freud museum Vienna
|
||||
(r'\bMax\s+Perutz\b', 'Wien'), # Max Perutz Library (Vienna Biocenter)
|
||||
|
||||
# Additional specific institutions
|
||||
(r'\bAnton\s+Bruckner\b', 'Linz'), # Anton Bruckner Private University
|
||||
(r'\bbifeb\b', 'Strobl'), # Bundesinstitut für Erwachsenenbildung
|
||||
(r'\bBundesinstitut\s+für\s+Erwachsenenbildung\b', 'Strobl'),
|
||||
(r'\bZeitgenossen\b', 'Krems an der Donau'), # Archiv der Zeitgenossen
|
||||
(r'\bCompass[-\s]Verlag\b', 'Wien'), # Compass-Verlag
|
||||
(r'\bErnst\s+Krenek\b', 'Krems an der Donau'), # Ernst Krenek Institut
|
||||
(r'\bFrauensolidarität\b', 'Wien'), # Frauensolidarität
|
||||
(r'\bGeoSphere\b', 'Wien'), # GeoSphere Austria
|
||||
(r'\bHochschule\s+Burgenland\b', 'Eisenstadt'), # FH Burgenland
|
||||
(r'\bAgrar[-\s]und\s+Umweltpädagogik\b', 'Wien'), # Hochschule für Agrar
|
||||
(r'\bHochschule\s+für\s+Agrar\b', 'Wien'), # Hochschule für Agrar (full)
|
||||
(r'\bHöhere\s+Studien\b', 'Wien'), # IHS
|
||||
(r'\bInterdisciplinary\s+Transformation\b', 'Wien'), # ITU
|
||||
(r'\bJAM\s+Music\s+Lab\b', 'Wien'), # JAM Music Lab
|
||||
(r'\bKDZ\b', 'Wien'), # KDZ Zentrum
|
||||
(r'\bNew\s+Design\s+University\b', 'Sankt Pölten'), # NDU
|
||||
(r'\bPädagogische\s+Hochschule\s+Tirol\b', 'Innsbruck'), # PH Tirol
|
||||
(r'\bPädagogische\s+Hochschule\s+Burgenland\b', 'Eisenstadt'), # PPH Burgenland
|
||||
(r'\bShared\s+Archiving\b', 'Wien'), # SAA
|
||||
(r'\bVerbund\s+für\s+Bildung\b', 'Wien'), # VBKV
|
||||
(r'\bVilla\s+North\b', 'Wien'), # Villa North
|
||||
(r'\bInformationswissenschaft\b', 'Graz'), # VFI
|
||||
(r'\bErinnerungskultur\b', 'Villach'), # ZEG is in Villach, not Graz
|
||||
(r'\bParlament(?:s)?(?:direktion|bibliothek)?\b', 'Wien'), # Parlamentsbibliothek
|
||||
]
|
||||
|
||||
|
||||
def load_source_data(source_file: str) -> dict:
    """Load Austrian source data keyed by ISIL code.

    Parses the CH-Annotator YAML export and returns a mapping of
    ISIL code -> {'name': institution name, 'coords': (lat, lon) or None}.
    Institutions without an ISIL identifier are skipped.

    Args:
        source_file: Path to a YAML file containing an ``institutions`` list.

    Returns:
        Dict mapping ISIL identifier values to name/coordinate records.
    """
    import yaml  # local import: PyYAML only needed when loading source data

    with open(source_file, 'r', encoding='utf-8') as f:
        # Guard against an empty YAML file, where safe_load returns None.
        data = yaml.safe_load(f) or {}

    lookup = {}
    for inst in data.get('institutions', []):
        # Use the first ISIL identifier found for this institution.
        isil = None
        for ident in inst.get('identifiers', []):
            if ident.get('identifier_scheme') == 'ISIL':
                isil = ident.get('identifier_value')
                break

        if isil:
            locs = inst.get('locations', [])
            coords = None
            # Explicit None checks so a legitimate 0.0 coordinate is not
            # discarded as falsy (the previous truthiness test dropped it).
            if locs and locs[0].get('latitude') is not None and locs[0].get('longitude') is not None:
                coords = (locs[0]['latitude'], locs[0]['longitude'])

            lookup[isil] = {
                'name': inst.get('name', ''),
                'coords': coords,
            }

    return lookup
|
||||
|
||||
|
||||
def extract_city_from_name(name: str) -> str | None:
    """Return the canonical city for the first matching name pattern.

    Scans AUSTRIAN_CITY_PATTERNS in order and returns the city mapped to
    the first regex that matches (case-insensitively), or None if no
    pattern matches.
    """
    return next(
        (city
         for pattern, city in AUSTRIAN_CITY_PATTERNS
         if re.search(pattern, name, re.IGNORECASE)),
        None,
    )
|
||||
|
||||
|
||||
def generate_city_code(city_name: str) -> str:
    """Generate a 3-letter uppercase city code from a city name.

    Rules:
    - one word:  first three letters ("Wien" -> "WIE")
    - two words: first letter of word 1 + first two of word 2 ("Bad Ischl" -> "BIS")
    - 3+ words:  initials of the first three words ("Zell am See" -> "ZAS")

    Diacritics are stripped first (NFD decomposition, combining marks
    removed), so e.g. "Moedling"-style codes come out as "MOD" for
    "Mödling". Returns '' for names with no ASCII letters instead of
    raising IndexError.
    """
    # Strip accents: decompose, then drop combining marks (category 'Mn').
    normalized = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')

    # Keep only letters, whitespace and hyphens before splitting into words.
    clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name)
    words = clean.split()

    if not words:
        return ''  # defensive: nothing alphabetic left after cleaning
    if len(words) == 1:
        return words[0][:3].upper()
    if len(words) == 2:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()
|
||||
|
||||
|
||||
def reverse_geocode(lat: float, lon: float, conn: sqlite3.Connection) -> dict | None:
|
||||
"""Reverse geocode coordinates to find nearest Austrian city."""
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute('''
|
||||
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code,
|
||||
((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq
|
||||
FROM cities
|
||||
WHERE country_code = 'AT'
|
||||
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
||||
ORDER BY distance_sq
|
||||
LIMIT 1
|
||||
''', (lat, lat, lon, lon))
|
||||
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
return {
|
||||
'name': row[0],
|
||||
'ascii_name': row[1],
|
||||
'admin1_code': row[2],
|
||||
'admin1_name': row[3],
|
||||
'latitude': row[4],
|
||||
'longitude': row[5],
|
||||
'geonames_id': row[6],
|
||||
'population': row[7],
|
||||
'feature_code': row[8],
|
||||
}
|
||||
return None
|
||||
|
||||
|
||||
def lookup_city_in_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None:
|
||||
"""Look up city in GeoNames database."""
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Try exact match
|
||||
cursor.execute('''
|
||||
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
|
||||
FROM cities
|
||||
WHERE country_code = 'AT'
|
||||
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
||||
AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
|
||||
ORDER BY population DESC
|
||||
LIMIT 1
|
||||
''', (city_name, city_name))
|
||||
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
return {
|
||||
'name': row[0],
|
||||
'ascii_name': row[1],
|
||||
'admin1_code': row[2],
|
||||
'admin1_name': row[3],
|
||||
'latitude': row[4],
|
||||
'longitude': row[5],
|
||||
'geonames_id': row[6],
|
||||
'population': row[7],
|
||||
'feature_code': row[8],
|
||||
}
|
||||
|
||||
# Try fuzzy match
|
||||
cursor.execute('''
|
||||
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
|
||||
FROM cities
|
||||
WHERE country_code = 'AT'
|
||||
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
||||
AND (LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?))
|
||||
ORDER BY population DESC
|
||||
LIMIT 1
|
||||
''', (f'{city_name}%', f'{city_name}%'))
|
||||
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
return {
|
||||
'name': row[0],
|
||||
'ascii_name': row[1],
|
||||
'admin1_code': row[2],
|
||||
'admin1_name': row[3],
|
||||
'latitude': row[4],
|
||||
'longitude': row[5],
|
||||
'geonames_id': row[6],
|
||||
'population': row[7],
|
||||
'feature_code': row[8],
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def update_custodian_file(file_path: Path, city_name: str, geo_data: dict, method: str, dry_run: bool = False) -> bool:
    """Update a custodian file with resolved city data.

    Derives a new GHCID with the resolved region/city codes, rewrites every
    occurrence in the file, replaces the location_resolution section,
    prepends a ghcid_history entry, and renames the file to match the new
    GHCID.

    Args:
        file_path: Path to the custodian YAML file.
        city_name: City name as extracted/resolved (used in the history reason).
        geo_data: GeoNames record (name, ascii_name, admin1_code, ...).
        method: Resolution method label (e.g. REVERSE_GEOCODE).
        dry_run: When True, only report what would change.

    Returns:
        True when the file was (or would be) updated, False otherwise.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # The current GHCID is required to derive the new one.
    ghcid_match = re.search(r'ghcid_current:\s*(\S+)', content)
    if not ghcid_match:
        return False

    old_ghcid = ghcid_match.group(1)

    # Map GeoNames admin1 code to the project region code; fall back to the
    # raw admin1 code when unmapped.
    region_code = AUSTRIAN_ADMIN1_MAP.get(geo_data['admin1_code'], geo_data['admin1_code'])
    city_code = generate_city_code(geo_data['ascii_name'] or geo_data['name'])

    # GHCID layout: COUNTRY-REGION-CITY-TYPE-ABBREV[-SUFFIX...]
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        type_code = parts[3]
        abbrev_and_suffix = '-'.join(parts[4:])
        new_ghcid = f"AT-{region_code}-{city_code}-{type_code}-{abbrev_and_suffix}"
    else:
        return False

    if old_ghcid == new_ghcid:
        return False  # nothing to change

    # New filename mirrors the GHCID change.
    old_filename = file_path.name
    new_filename = old_filename.replace(old_ghcid, new_ghcid)
    new_file_path = file_path.parent / new_filename

    # Replace every occurrence of the old GHCID in the file body.
    new_content = content.replace(old_ghcid, new_ghcid)

    # Locate the existing location_resolution block (key plus its indented
    # children) so it can be replaced wholesale.
    old_resolution = re.search(r'location_resolution:\s*\n((?:\s+\S.*\n)*)', new_content)

    if old_resolution:
        # NOTE(review): the YAML indentation inside this f-string was
        # reconstructed from a mangled diff -- verify against an actual
        # custodian file before relying on it.
        new_resolution = f"""location_resolution:
  country_code: AT
  region_code: {region_code}
  region_name: {geo_data['admin1_name']}
  city_code: {city_code}
  city_name: {geo_data['name']}
  geonames_id: {geo_data['geonames_id']}
  feature_code: {geo_data['feature_code']}
  latitude: {geo_data['latitude']}
  longitude: {geo_data['longitude']}
  method: {method}
  resolution_date: '{datetime.now(timezone.utc).isoformat()}'
"""
        new_content = new_content[:old_resolution.start()] + new_resolution + new_content[old_resolution.end():]

    # Prepend a history entry directly under the ghcid_history key.
    timestamp = datetime.now(timezone.utc).isoformat()
    history_entry = f""" - ghcid: {new_ghcid}
   valid_from: '{timestamp}'
   reason: City enrichment from {method} - {city_name} resolved to {geo_data['name']} ({region_code})
"""

    history_match = re.search(r'ghcid_history:\s*\n', new_content)
    if history_match:
        insert_pos = history_match.end()
        new_content = new_content[:insert_pos] + history_entry + new_content[insert_pos:]

    if dry_run:
        print(f" DRY RUN: {old_filename} -> {new_filename}")
        return True

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(new_content)

    # Rename only when the GHCID change actually altered the filename.
    if new_file_path != file_path:
        file_path.rename(new_file_path)

    return True
|
||||
|
||||
|
||||
def main():
    """Resolve city placeholders (XXX) in Austrian custodian files.

    Two-stage strategy per file: reverse-geocode coordinates from the source
    YAML when available, otherwise extract a city name from the institution
    name and look it up in GeoNames. Writes a markdown summary report.
    Pass --dry-run to preview changes without touching files.
    """
    dry_run = '--dry-run' in sys.argv

    # Paths are relative to the repository root (script lives in scripts/).
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    source_file = base_dir / 'data' / 'instances' / 'austria_complete_ch_annotator.yaml'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Austrian City Enrichment Script")
    print("=" * 50)

    if dry_run:
        print("DRY RUN MODE")

    # Load source data (ISIL -> name/coordinates lookup)
    print(f"\nLoading source data from {source_file.name}...")
    source_lookup = load_source_data(str(source_file))
    print(f" Found {len(source_lookup)} ISIL entries")

    coords_count = sum(1 for v in source_lookup.values() if v['coords'])
    print(f" {coords_count} entries have coordinates")

    conn = sqlite3.connect(str(geonames_db))

    # Custodian files still carrying the XXX city placeholder.
    print(f"\nFinding Austrian XXX files...")
    xxx_files = list(custodian_dir.glob('AT-*-XXX-*.yaml'))
    print(f" Found {len(xxx_files)} files")

    updated = 0
    by_coords = 0
    by_name = 0
    no_city = 0
    no_geonames = 0  # NOTE(review): counter is never incremented below
    errors = 0

    for file_path in xxx_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Find ISIL code
            isil_match = re.search(r'identifier_value:\s*(AT-\w+)', content)
            isil_code = isil_match.group(1) if isil_match else None

            # Get institution name (first claim_value in the file)
            name_match = re.search(r'claim_value:\s*(.+)', content)
            inst_name = name_match.group(1).strip() if name_match else ''

            geo_data = None
            method = None
            city_name = None

            # Strategy 1: Use coordinates for reverse geocoding
            if isil_code and isil_code in source_lookup:
                source_data = source_lookup[isil_code]
                if source_data['coords']:
                    lat, lon = source_data['coords']
                    geo_data = reverse_geocode(lat, lon, conn)
                    if geo_data:
                        method = 'REVERSE_GEOCODE'
                        city_name = geo_data['name']
                        by_coords += 1

            # Strategy 2: Extract city from institution name
            if not geo_data:
                city_name = extract_city_from_name(inst_name)
                if city_name:
                    geo_data = lookup_city_in_geonames(city_name, conn)
                    if geo_data:
                        method = 'NAME_EXTRACTION'
                        by_name += 1

            # Neither strategy produced a GeoNames match -> leave file alone.
            if not geo_data:
                no_city += 1
                continue

            if update_custodian_file(file_path, city_name, geo_data, method, dry_run):
                updated += 1
                if not dry_run:
                    print(f" Updated: {file_path.name} -> {city_name} ({method})")

        except Exception as e:
            # Keep going on per-file failures; report them in the summary.
            errors += 1
            print(f" ERROR: {file_path.name}: {e}")

    conn.close()

    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Total XXX files: {len(xxx_files)}")
    print(f"Updated: {updated}")
    print(f" By coordinates: {by_coords}")
    print(f" By name extraction: {by_name}")
    print(f"No city found: {no_city}")
    print(f"Errors: {errors}")
    print(f"Remaining XXX: {len(xxx_files) - updated}")

    # Generate a timestamped markdown report
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_path = base_dir / 'reports' / f'AUSTRIAN_CITY_ENRICHMENT_{timestamp}.md'

    with open(report_path, 'w') as f:
        f.write(f"# Austrian City Enrichment Report\n\n")
        f.write(f"**Date**: {datetime.now().isoformat()}\n")
        f.write(f"**Dry Run**: {dry_run}\n\n")
        f.write(f"## Summary\n\n")
        f.write(f"| Metric | Count |\n")
        f.write(f"|--------|-------|\n")
        f.write(f"| Total XXX files | {len(xxx_files)} |\n")
        f.write(f"| Updated | {updated} |\n")
        f.write(f"| By coordinates | {by_coords} |\n")
        f.write(f"| By name extraction | {by_name} |\n")
        f.write(f"| No city found | {no_city} |\n")
        f.write(f"| Errors | {errors} |\n")
        f.write(f"| Remaining XXX | {len(xxx_files) - updated} |\n")

    print(f"\nReport: {report_path}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
465
scripts/enrich_belgian_cities.py
Normal file
465
scripts/enrich_belgian_cities.py
Normal file
|
|
@ -0,0 +1,465 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Enrich Belgian custodian files with city data from ISIL registry.
|
||||
|
||||
Strategy:
|
||||
1. First try to get city from enriched source file (fast)
|
||||
2. If not found, scrape the Belgian ISIL website (slow, 1 req/sec)
|
||||
|
||||
Usage:
|
||||
python scripts/enrich_belgian_cities.py [--dry-run]
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# Belgian admin1 codes (GeoNames uses BRU, VLG, WAL)
|
||||
# Identity mapping: GeoNames admin1 codes already match the project's region
# codes for Belgium; kept for symmetry with other countries' scripts.
BELGIAN_ADMIN1_MAP = {
    'BRU': 'BRU',  # Brussels Capital Region
    'VLG': 'VLG',  # Flanders (Vlaanderen)
    'WAL': 'WAL',  # Wallonia (Wallonië)
}
|
||||
|
||||
# Belgian city name aliases (Dutch/French variants)
|
||||
# Dutch/French name variants -> the form stored in GeoNames. Identity
# entries are kept so callers can treat this as an authoritative list of
# known spellings.
BELGIAN_CITY_ALIASES = {
    'Brussel': 'Brussels',
    'Bruxelles': 'Brussels',
    'Antwerpen': 'Antwerpen',
    'Anvers': 'Antwerpen',
    'Gent': 'Gent',
    'Gand': 'Gent',
    'Luik': 'Liège',
    'Liege': 'Liège',
    'Bergen': 'Mons',
    'Namen': 'Namur',
    'Mechelen': 'Mechelen',
    'Malines': 'Mechelen',
    'Leuven': 'Leuven',
    'Louvain': 'Leuven',
    # Brussels-region municipalities (Dutch -> French/GeoNames form).
    'Elsene': 'Ixelles',
    'Ukkel': 'Uccle',
    'Oudergem': 'Auderghem',
    'Watermaal-Bosvoorde': 'Watermael-Boitsfort',
    'Sint-Gillis': 'Saint-Gilles',
    'Sint-Jans-Molenbeek': 'Molenbeek-Saint-Jean',
    'Schaarbeek': 'Schaerbeek',
    'Etterbeek': 'Etterbeek',
    'Vorst': 'Forest',
    'Anderlecht': 'Anderlecht',
    'Jette': 'Jette',
    'Koekelberg': 'Koekelberg',
    'Evere': 'Evere',
    'Sint-Pieters-Woluwe': 'Woluwe-Saint-Pierre',
    'Sint-Lambrechts-Woluwe': 'Woluwe-Saint-Lambert',
    'Ganshoren': 'Ganshoren',
}
|
||||
|
||||
|
||||
def load_isil_city_lookup(enriched_file: str) -> dict:
    """Build an ISIL -> city mapping from the enriched Belgian ISIL file.

    Records are recognized as chunks starting with an ``id: BE-...`` line;
    the leading chunk (file header) is skipped. Only records whose first
    location carries a city are included.
    """
    with open(enriched_file, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    lookup = {}
    # Split into per-institution records at each 'id: BE-' line.
    for record in re.split(r'\n(?=id: BE-)', raw)[1:]:
        id_hit = re.search(r'^id: (BE-\w+)', record)
        if id_hit is None:
            continue
        # First city listed in the record's locations section.
        city_hit = re.search(r'locations:\s*\n-\s*city:\s*(\S.*)', record)
        if city_hit is not None:
            lookup[id_hit.group(1)] = city_hit.group(1).strip()
    return lookup
|
||||
|
||||
|
||||
def load_isil_source_urls(enriched_file: str) -> dict:
    """Build an ISIL -> source_url mapping for the web-scraping fallback.

    Only records that carry both an ``id: BE-...`` line and an
    isil.kbr.be source_url are included.
    """
    with open(enriched_file, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    pairs = {}
    records = re.split(r'\n(?=id: BE-)', raw)[1:]  # skip the file header
    for record in records:
        hits = (
            re.search(r'^id: (BE-\w+)', record),
            re.search(r'source_url:\s*(https://isil\.kbr\.be/\S+)', record),
        )
        if all(hits):
            pairs[hits[0].group(1)] = hits[1].group(1)
    return pairs
|
||||
|
||||
|
||||
def scrape_city_from_isil_website(url: str) -> str | None:
    """Scrape the city name from a Belgian ISIL registry page.

    Fetches the page, finds the walk-up address table cell, and parses the
    city out of a Belgian "Street, POSTCODE City" address line.

    Args:
        url: Detail-page URL on isil.kbr.be.

    Returns:
        The city name, or None on any fetch or parse failure (errors are
        printed, never raised).
    """
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'GLAM-Enricher/1.0'})
        with urllib.request.urlopen(req, timeout=10) as response:
            html = response.read().decode('utf-8')

        # Look for address pattern: "Street, POSTCODE City"
        # Belgian postal codes are 4 digits
        # NOTE(review): "Walk up adress" presumably matches a literal
        # (misspelled) label on the KBR site -- confirm before "fixing"
        # the spelling here.
        address_match = re.search(r'Walk up adress.*?<td class="output"[^>]*>([^<]+)</td>', html, re.DOTALL | re.IGNORECASE)
        if address_match:
            address = address_match.group(1)
            # Parse city from address: "Veldstraat 53, 9910 Knesselare"
            city_match = re.search(r',\s*(\d{4})\s+([A-Za-zÀ-ÿ\s\-\']+)', address)
            if city_match:
                city = city_match.group(2).strip()
                # Clean up trailing HTML entities
                city = re.sub(r'&\w+;.*$', '', city).strip()
                return city

        return None
    except Exception as e:
        # Best-effort scraper: log and treat any failure as "no city".
        print(f" Error scraping {url}: {e}")
        return None
|
||||
|
||||
|
||||
def generate_city_code(city_name: str) -> str:
    """Generate a 3-letter uppercase city code from a city name.

    Rules:
    - one word:  first three letters ("Gent" -> "GEN")
    - two words: first letter of word 1 + first two of word 2 ("De Haan" -> "DHA")
    - 3+ words:  initials of the first three words

    Diacritics are stripped first (NFD decomposition, combining marks
    removed). Returns '' for names with no ASCII letters instead of
    raising IndexError.
    """
    import unicodedata
    # Strip accents: decompose, then drop combining marks (category 'Mn').
    normalized = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')

    # Keep only letters, whitespace and hyphens before splitting into words.
    clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name)
    words = clean.split()

    if not words:
        return ''  # defensive: nothing alphabetic left after cleaning
    if len(words) == 1:
        return words[0][:3].upper()
    if len(words) == 2:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()
|
||||
|
||||
|
||||
def _be_city_row_to_dict(row) -> dict:
    """Map a cities-table row onto a city info dict (replaces the three
    identical dict literals the original repeated per query)."""
    keys = ('name', 'ascii_name', 'admin1_code', 'admin1_name',
            'latitude', 'longitude', 'geonames_id', 'population',
            'feature_code')
    return dict(zip(keys, row))


def lookup_city_in_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None:
    """Look up a Belgian city in the GeoNames database.

    Resolution order:
      1. exact case-insensitive match on the aliased name
         (Dutch/French variants normalized via BELGIAN_CITY_ALIASES)
      2. exact match on the original name, when an alias was applied
      3. prefix LIKE match on the original name
    Within each step, the most populous candidate wins.

    Args:
        city_name: City name to resolve.
        conn: Open connection to the GeoNames SQLite database.

    Returns:
        Dict with city attributes, or None when nothing matches.
    """
    base_query = '''
        SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
        FROM cities
        WHERE country_code = 'BE'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
          AND {condition}
        ORDER BY population DESC
        LIMIT 1
    '''
    exact = '(LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))'
    fuzzy = '(LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?))'

    # Check aliases first (Dutch/French variants -> GeoNames spelling).
    normalized_name = BELGIAN_CITY_ALIASES.get(city_name, city_name)

    attempts = [(exact, (normalized_name, normalized_name))]
    if normalized_name != city_name:
        # Alias was applied: also try the caller's original spelling.
        attempts.append((exact, (city_name, city_name)))
    attempts.append((fuzzy, (f'{city_name}%', f'{city_name}%')))

    cursor = conn.cursor()
    for condition, params in attempts:
        cursor.execute(base_query.format(condition=condition), params)
        row = cursor.fetchone()
        if row:
            return _be_city_row_to_dict(row)
    return None
|
||||
|
||||
|
||||
def update_custodian_file(file_path: Path, city_name: str, geo_data: dict, dry_run: bool = False) -> bool:
    """Update a Belgian custodian file with resolved city data.

    Derives a new GHCID with the resolved region/city codes, rewrites every
    occurrence in the file, replaces the location_resolution section,
    prepends a ghcid_history entry, and renames the file to match.

    Args:
        file_path: Path to the custodian YAML file.
        city_name: City name from the ISIL registry (used in history reason).
        geo_data: GeoNames record (name, ascii_name, admin1_code, ...).
        dry_run: When True, only report what would change.

    Returns:
        True when the file was (or would be) updated, False otherwise.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract current GHCID
    ghcid_match = re.search(r'ghcid_current:\s*(\S+)', content)
    if not ghcid_match:
        print(f" WARNING: No ghcid_current found in {file_path.name}")
        return False

    old_ghcid = ghcid_match.group(1)

    # Generate new GHCID components (fall back to the raw admin1 code
    # when it is not covered by the map).
    region_code = BELGIAN_ADMIN1_MAP.get(geo_data['admin1_code'], geo_data['admin1_code'])
    city_code = generate_city_code(geo_data['ascii_name'] or geo_data['name'])

    # Build new GHCID: BE-XX-XXX-{type}-{abbrev}[-suffix]
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        type_code = parts[3]
        abbrev_and_suffix = '-'.join(parts[4:])
        new_ghcid = f"BE-{region_code}-{city_code}-{type_code}-{abbrev_and_suffix}"
    else:
        print(f" WARNING: Unexpected GHCID format: {old_ghcid}")
        return False

    if old_ghcid == new_ghcid:
        return False  # nothing to change

    # Calculate new filename (mirrors the GHCID change)
    old_filename = file_path.name
    new_filename = old_filename.replace(old_ghcid, new_ghcid)
    new_file_path = file_path.parent / new_filename

    # Update content: replace every occurrence of the old GHCID
    new_content = content.replace(old_ghcid, new_ghcid)

    # Update location_resolution section (key plus its indented children)
    old_resolution = re.search(
        r'location_resolution:\s*\n((?:\s+\S.*\n)*)',
        new_content
    )

    if old_resolution:
        # NOTE(review): the YAML indentation inside this f-string was
        # reconstructed from a mangled diff -- verify against an actual
        # custodian file before relying on it.
        new_resolution = f"""location_resolution:
  country_code: BE
  region_code: {region_code}
  region_name: {geo_data['admin1_name']}
  city_code: {city_code}
  city_name: {geo_data['name']}
  geonames_id: {geo_data['geonames_id']}
  feature_code: {geo_data['feature_code']}
  latitude: {geo_data['latitude']}
  longitude: {geo_data['longitude']}
  method: BELGIAN_ISIL_REGISTRY
  resolution_date: '{datetime.now(timezone.utc).isoformat()}'
"""
        new_content = new_content[:old_resolution.start()] + new_resolution + new_content[old_resolution.end():]

    # Add GHCID history entry directly under the ghcid_history key
    timestamp = datetime.now(timezone.utc).isoformat()
    history_entry = f""" - ghcid: {new_ghcid}
   valid_from: '{timestamp}'
   reason: City enrichment from Belgian ISIL registry - {city_name} resolved to {geo_data['name']} ({region_code})
"""

    history_match = re.search(r'ghcid_history:\s*\n', new_content)
    if history_match:
        insert_pos = history_match.end()
        new_content = new_content[:insert_pos] + history_entry + new_content[insert_pos:]

    if dry_run:
        print(f" DRY RUN: Would rename {old_filename} -> {new_filename}")
        print(f" GHCID: {old_ghcid} -> {new_ghcid}")
        return True

    # Write updated content
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(new_content)

    # Rename file only when the GHCID change altered the filename
    if new_file_path != file_path:
        file_path.rename(new_file_path)

    return True
|
||||
|
||||
|
||||
def main():
    """Resolve city placeholders (XXX) in Belgian custodian files.

    Per file: look up the city by ISIL code in the enriched registry dump;
    if absent, scrape the KBR ISIL website (rate-limited to 1 req/sec);
    then resolve the city in GeoNames and rewrite the custodian file.
    Writes a markdown summary report. Pass --dry-run to preview changes.
    """
    dry_run = '--dry-run' in sys.argv

    # Paths (relative to the repository root; script lives in scripts/)
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    enriched_file = base_dir / 'data' / 'instances' / 'belgium_isil_institutions_enriched.yaml'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Belgian City Enrichment Script")
    print("=" * 50)

    if dry_run:
        print("DRY RUN MODE - No changes will be made")

    # Load lookups (ISIL -> city, ISIL -> registry URL)
    print(f"\nLoading ISIL city lookup from {enriched_file.name}...")
    isil_city_lookup = load_isil_city_lookup(str(enriched_file))
    isil_url_lookup = load_isil_source_urls(str(enriched_file))
    print(f" Found {len(isil_city_lookup)} ISIL codes with city data")
    print(f" Found {len(isil_url_lookup)} ISIL codes with source URLs")

    # Connect to GeoNames
    print(f"\nConnecting to GeoNames database...")
    conn = sqlite3.connect(str(geonames_db))

    # Find Belgian XXX files (city placeholder still unresolved)
    print(f"\nFinding Belgian custodian files with XXX placeholder...")
    xxx_files = list(custodian_dir.glob('BE-*-XXX-*.yaml'))
    print(f" Found {len(xxx_files)} files to process")

    # Process files
    updated = 0
    no_isil = 0
    no_city = 0
    no_geonames = 0
    scraped = 0
    errors = 0
    not_found_cities = []  # (filename, isil, city) triples for the report

    for file_path in xxx_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Find ISIL code
            isil_match = re.search(r'identifier_value:\s*(BE-\w+)', content)
            if not isil_match:
                no_isil += 1
                continue

            isil_code = isil_match.group(1)

            # Strategy 1: Look up city from enriched file
            city_name = isil_city_lookup.get(isil_code)

            # Strategy 2: Scrape from website if not in lookup
            if not city_name and isil_code in isil_url_lookup:
                url = isil_url_lookup[isil_code]
                print(f" Scraping {isil_code} from {url}...")
                city_name = scrape_city_from_isil_website(url)
                if city_name:
                    scraped += 1
                    print(f" Found: {city_name}")
                time.sleep(1)  # Rate limit
            # (sleep applies whether or not the scrape succeeded)

            if not city_name:
                no_city += 1
                continue

            # Look up in GeoNames
            geo_data = lookup_city_in_geonames(city_name, conn)
            if not geo_data:
                no_geonames += 1
                not_found_cities.append((file_path.name, isil_code, city_name))
                continue

            # Update file
            if update_custodian_file(file_path, city_name, geo_data, dry_run):
                updated += 1
                if not dry_run:
                    print(f" Updated: {file_path.name} -> {city_name} ({geo_data['admin1_code']})")

        except Exception as e:
            # Keep going on per-file failures; count them for the summary.
            errors += 1
            print(f" ERROR processing {file_path.name}: {e}")

    conn.close()

    # Summary
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Total XXX files: {len(xxx_files)}")
    print(f"Updated: {updated}")
    print(f"Scraped from website: {scraped}")
    print(f"No ISIL in file: {no_isil}")
    print(f"No city found: {no_city}")
    print(f"City not in GeoNames: {no_geonames}")
    print(f"Errors: {errors}")
    print(f"Remaining XXX: {len(xxx_files) - updated}")

    if not_found_cities:
        # Only preview the first 20 on stdout; the report lists them all.
        print(f"\nCities not found in GeoNames:")
        for fname, isil, city in not_found_cities[:20]:
            print(f" {isil}: {city}")
        if len(not_found_cities) > 20:
            print(f" ... and {len(not_found_cities) - 20} more")

    # Generate a timestamped markdown report
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_path = base_dir / 'reports' / f'BELGIAN_CITY_ENRICHMENT_{timestamp}.md'

    with open(report_path, 'w') as f:
        f.write(f"# Belgian City Enrichment Report\n\n")
        f.write(f"**Date**: {datetime.now().isoformat()}\n")
        f.write(f"**Dry Run**: {dry_run}\n\n")
        f.write(f"## Summary\n\n")
        f.write(f"| Metric | Count |\n")
        f.write(f"|--------|-------|\n")
        f.write(f"| Total XXX files | {len(xxx_files)} |\n")
        f.write(f"| Updated | {updated} |\n")
        f.write(f"| Scraped from website | {scraped} |\n")
        f.write(f"| No ISIL in file | {no_isil} |\n")
        f.write(f"| No city found | {no_city} |\n")
        f.write(f"| City not in GeoNames | {no_geonames} |\n")
        f.write(f"| Errors | {errors} |\n")
        f.write(f"| Remaining XXX | {len(xxx_files) - updated} |\n")

        if not_found_cities:
            f.write(f"\n## Cities Not Found in GeoNames\n\n")
            f.write(f"| File | ISIL | City |\n")
            f.write(f"|------|------|------|\n")
            for fname, isil, city in not_found_cities:
                f.write(f"| {fname} | {isil} | {city} |\n")

    print(f"\nReport written to: {report_path}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
185
scripts/enrich_belgian_v2.py
Normal file
185
scripts/enrich_belgian_v2.py
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Belgian city enrichment v2 - with city name aliases.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sqlite3
|
||||
import unicodedata
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# Belgian city aliases (Dutch names → GeoNames names).
# IMPORTANT: keys must be lowercase AND accent-free, because lookup_city()
# matches them against the output of normalize_city_name(), which lowercases
# and strips diacritics.
BELGIAN_CITY_ALIASES = {
    'sint-lambrechts-woluwe': 'Woluwe-Saint-Lambert',
    'sint-pieters-woluwe': 'Woluwe-Saint-Pierre',
    'sint-stevens-woluwe': 'Sint-Stevens-Woluwe',
    'oostende': 'Ostend',
    'gent': 'Gent',
    'brugge': 'Brugge',
    'brussel': 'Brussels',
    'antwerpen': 'Antwerpen',
    'luik': 'Liège',
    # BUG FIX: key was 'liège' — normalized input never contains diacritics,
    # so the accented key could never match. Accent-free key restored.
    'liege': 'Liège',
    'leuven': 'Leuven',
    'mechelen': 'Mechelen',
    'aalst': 'Aalst',
    'hasselt': 'Hasselt',
    'kortrijk': 'Kortrijk',
    'sint-niklaas': 'Sint-Niklaas',
    'genk': 'Genk',
    'roeselare': 'Roeselare',
    # Merged municipalities (2019)
    'kluisbergen': 'Kluisbergen',
    'lievegem': 'Nevele',  # Lievegem was created from Nevele, Waarschoot, Zomergem, Lovendegem
    'kruisem': 'Kruishoutem',  # Kruisem was created from Kruishoutem and Zingem
    'lierde': 'Sint-Maria-Lierde',
    'maarkedal': 'Etikhove',  # Maarkedal includes Etikhove
    # Other
    'de haan': 'De Haan',
    'lint': 'Lint',
    'herne': 'Herne',
}
||||
|
||||
# Belgian admin1 mapping (GeoNames → ISO 3166-2:BE)
# Maps GeoNames first-level administrative names to the three ISO 3166-2:BE
# region codes (Brussels has two GeoNames spellings, both → BRU).
# NOTE(review): not referenced anywhere in this script's visible code —
# presumably consumed by a later enrichment pass; confirm before removing.
BELGIAN_ADMIN1_MAP = {
    'Brussels Capital': 'BRU',
    'Brussels': 'BRU',
    'Flanders': 'VLG',
    'Wallonia': 'WAL',
}
||||
|
||||
def normalize_city_name(name):
    """Return *name* lowercased, diacritic-free and whitespace-stripped.

    Decomposes the lowercased string to NFD and drops combining marks, so
    e.g. 'Liège' becomes 'liege'. Falsy input (None, '') yields None.
    """
    if not name:
        return None
    decomposed = unicodedata.normalize('NFD', name.lower())
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept).strip()
||||
|
||||
def lookup_city(city_name, conn):
    """Look up a Belgian city in the GeoNames SQLite database.

    The name is normalized and mapped through BELGIAN_CITY_ALIASES (Dutch
    name variants and merged municipalities) before querying. An exact
    case-insensitive match on name/ascii_name is tried first, then a
    substring match; both prefer the most populous hit.

    Args:
        city_name: City name as found in the source data (may be None/'').
        conn: Open sqlite3 connection exposing a 'cities' table.

    Returns:
        Dict with name, ascii_name, admin1_name, latitude, longitude,
        geonames_id and population, or None when nothing matches.
    """
    if not city_name:
        return None

    normalized = normalize_city_name(city_name)
    # Alias table maps normalized Dutch spellings to GeoNames spellings.
    lookup_name = BELGIAN_CITY_ALIASES.get(normalized, city_name)

    columns = ('name', 'ascii_name', 'admin1_name', 'latitude', 'longitude',
               'geonames_id', 'population')
    base_query = """
        SELECT name, ascii_name, admin1_name, latitude, longitude, geonames_id, population
        FROM cities
        WHERE country_code='BE' AND ({})
        ORDER BY population DESC LIMIT 1
    """

    cursor = conn.cursor()
    # Exact match first, then a broader substring match. (DRY: the original
    # duplicated both the query text and the result-dict construction.)
    attempts = (
        ('LOWER(name)=LOWER(?) OR LOWER(ascii_name)=LOWER(?)',
         (lookup_name, lookup_name)),
        ('name LIKE ? OR ascii_name LIKE ?',
         (f"%{lookup_name}%", f"%{lookup_name}%")),
    )
    for condition, params in attempts:
        cursor.execute(base_query.format(condition), params)
        row = cursor.fetchone()
        if row:
            return dict(zip(columns, row))

    return None
||||
|
||||
def generate_city_code(city_name):
    """Generate a 3-letter uppercase city code from *city_name*.

    Diacritics are stripped and non-letter characters (except spaces and
    hyphens) removed before splitting into words. Rules:

    - single word: its first three letters ('Brussels' -> 'BRU');
    - leading article (de/het/le/...): article initial plus the first two
      letters of the next word ('De Haan' -> 'DHA');
    - two plain words: initial of the first plus the first two letters of
      the second ('Bad Aussee' -> 'BAU'). FIX: the original fell through to
      the initials branch here and produced only a 2-letter code, despite
      the documented 3-letter contract (and inconsistent with the sibling
      get_city_code() helpers in this repo);
    - three or more words: the initials of the first three words.

    Returns '' for input with no usable letters (the original raised
    IndexError in that case).
    """
    normalized = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name)
    words = clean.split()

    articles = {'de', 'het', 'le', 'la', 'les', 'den', 'der', 'des'}

    if not words:
        return ''
    if len(words) == 1:
        return words[0][:3].upper()
    if words[0].lower() in articles or len(words) == 2:
        # Article+word and plain two-word names both pad to three letters.
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()
||||
|
||||
def main():
    """Scan Belgian BE-*-XXX custodian files and report resolvable cities.

    NOTE(review): this pass is read-only — it counts and prints which city
    names resolve against the GeoNames database but never modifies a file
    (see the 'Would update file here' marker below). Confirm whether the
    write step was intentionally deferred.
    """
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Belgian City Enrichment v2")
    print("=" * 50)

    conn = sqlite3.connect(str(geonames_db))

    # Find Belgian XXX files
    xxx_files = list(custodian_dir.glob('BE-*-XXX-*.yaml'))
    print(f"Found {len(xxx_files)} Belgian XXX files")

    updated = 0
    not_found = []

    for file_path in xxx_files:
        # Files are parsed with regexes rather than a YAML loader; this only
        # works because the fields of interest are simple one-line scalars.
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Get institution name
        # NOTE(review): inst_name is extracted but never used below.
        name_match = re.search(r'claim_value:\s*(.+)', content)
        inst_name = name_match.group(1).strip() if name_match else ''

        # Try to extract city from filename or name
        # Belgian cities often in the file details - let's look at the log
        # The scraper was finding cities from ISIL website

        # Check if there's city info in the file already
        # Matches either 'city:' or 'city_name:' keys; 'XXX' is the
        # unresolved-placeholder value and is skipped.
        city_match = re.search(r'city(?:_name)?:\s*([^\n]+)', content)
        if city_match:
            city_name = city_match.group(1).strip().strip('"\'')
            if city_name and city_name != 'XXX':
                geo_data = lookup_city(city_name, conn)
                if geo_data:
                    print(f"✓ {file_path.name}: {city_name} → {geo_data['name']}")
                    updated += 1
                    # Would update file here
                else:
                    not_found.append((file_path.name, city_name))

    print(f"\nUpdated: {updated}")
    print(f"Not found: {len(not_found)}")
    if not_found:
        # Cap the console listing at 20 unresolved cities.
        print("\nCities not found:")
        for fname, city in not_found[:20]:
            print(f"  {fname}: {city}")

    conn.close()


if __name__ == '__main__':
    main()
||||
424
scripts/enrich_bulgarian_cities.py
Executable file
424
scripts/enrich_bulgarian_cities.py
Executable file
|
|
@ -0,0 +1,424 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Enrich Bulgarian custodian files with proper city codes from GeoNames.
|
||||
Maps Cyrillic city names to ASCII equivalents and resolves admin1 regions.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import yaml
|
||||
|
||||
# Bulgarian Cyrillic to ASCII city name mapping
# Based on standard transliteration
# Keys are the Cyrillic city names exactly as they appear in the source
# custodian data; values are the Latin spellings used by GeoNames
# (ascii_name), so they can be fed straight into the GeoNames lookup.
CYRILLIC_TO_ASCII = {
    # Major cities found in XXX files
    'Самоков': 'Samokov',
    'Асеновград': 'Asenovgrad',
    'Казанлък': 'Kazanlak',
    'Карлово': 'Karlovo',
    'Котел': 'Kotel',
    'Димитровград': 'Dimitrovgrad',
    'Исперих': 'Isperih',
    'Панагюрище': 'Panagyurishte',
    'Раднево': 'Radnevo',
    'Белица': 'Belitsa',
    'Гоце Делчев': 'Gotse Delchev',
    'Горна Оряховица': 'Gorna Oryahovitsa',
    'Якоруда': 'Yakoruda',
    'Хаджидимово': 'Hadzhidimovo',
    'Генерал Тодоров': 'General Todorov',
    'Черноморец': 'Chernomorets',
    'Плоски': 'Ploski',
    'Плетена': 'Pletena',
    'Дюлево': 'Dyulevo',
    'Левуново': 'Levunovo',
    'Гълъбово': 'Galabovo',
    'Абланица': 'Ablanitsa',
    # Additional common cities
    'София': 'Sofia',
    'Пловдив': 'Plovdiv',
    'Варна': 'Varna',
    'Бургас': 'Burgas',
    'Русе': 'Ruse',
    'Стара Загора': 'Stara Zagora',
    'Плевен': 'Pleven',
    'Сливен': 'Sliven',
    'Добрич': 'Dobrich',
    'Шумен': 'Shumen',
    'Перник': 'Pernik',
    'Хасково': 'Haskovo',
    'Благоевград': 'Blagoevgrad',
    'Велико Търново': 'Veliko Tarnovo',
    'Враца': 'Vratsa',
    'Габрово': 'Gabrovo',
    'Пазарджик': 'Pazardzhik',
    'Ямбол': 'Yambol',
    'Кърджали': 'Kardzhali',
    'Монтана': 'Montana',
    'Разград': 'Razgrad',
    'Силистра': 'Silistra',
    'Смолян': 'Smolyan',
    'Търговище': 'Targovishte',
    'Кюстендил': 'Kyustendil',
    'Ловеч': 'Lovech',
    'Видин': 'Vidin',
}
||||
|
||||
# Bulgarian admin1 GeoNames code to ISO 3166-2:BG mapping
# Keys are the numeric admin1_code strings GeoNames uses for Bulgaria's 28
# provinces ('38'–'65'); values are the 3-letter region codes embedded in
# the GHCID. Cities whose admin1_code is missing from this table fall back
# to 'XX' in process_file().
ADMIN1_TO_ISO = {
    '38': 'BLG',  # Blagoevgrad
    '39': 'BGS',  # Burgas
    '40': 'DOB',  # Dobrich
    '41': 'GAB',  # Gabrovo
    '42': 'SOF',  # Sofia-Capital (also SFO for city)
    '43': 'KHO',  # Haskovo (officially HKV but using KHO)
    '44': 'KRZ',  # Kardzhali
    '45': 'KNL',  # Kyustendil
    '46': 'LOV',  # Lovech
    '47': 'MON',  # Montana
    '48': 'PAZ',  # Pazardzhik
    '49': 'PER',  # Pernik
    '50': 'PVN',  # Pleven
    '51': 'PDV',  # Plovdiv
    '52': 'RAZ',  # Razgrad
    '53': 'RSE',  # Ruse
    '54': 'SHU',  # Shumen
    '55': 'SLS',  # Silistra
    '56': 'SLV',  # Sliven
    '57': 'SML',  # Smolyan
    '58': 'SFO',  # Sofia (Province)
    '59': 'SZR',  # Stara Zagora
    '60': 'TGV',  # Targovishte
    '61': 'VAR',  # Varna
    '62': 'VTR',  # Veliko Tarnovo
    '63': 'VID',  # Vidin
    '64': 'VRC',  # Vratsa
    '65': 'JAM',  # Yambol
}
||||
|
||||
|
||||
def get_city_code(city_name: str) -> str:
    """Derive a short uppercase city code from *city_name*.

    - one word: its first three letters ('Sofia' -> 'SOF')
    - two words: initial of the first word plus the first two letters of
      the second ('Stara Zagora' -> 'SZA')
    - three or more words: the initials of the first three words
    """
    cleaned = city_name.strip()
    parts = cleaned.split()

    if len(parts) == 1:
        return cleaned[:3].upper()
    if len(parts) == 2:
        first, second = parts
        return (first[0] + second[:2]).upper()
    return ''.join(word[0] for word in parts[:3]).upper()
||||
|
||||
|
||||
# Character-level Bulgarian Cyrillic → Latin transliteration table.
# Built once at import time (the original rebuilt this dict on every call);
# str.translate applies it in a single C-level pass and leaves characters
# without an entry unchanged — identical behavior to the original loop.
_BG_TRANSLIT_TABLE = str.maketrans({
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd',
    'е': 'e', 'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y',
    'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o',
    'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u',
    'ф': 'f', 'х': 'h', 'ц': 'ts', 'ч': 'ch', 'ш': 'sh',
    'щ': 'sht', 'ъ': 'a', 'ь': '', 'ю': 'yu', 'я': 'ya',
    'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D',
    'Е': 'E', 'Ж': 'Zh', 'З': 'Z', 'И': 'I', 'Й': 'Y',
    'К': 'K', 'Л': 'L', 'М': 'M', 'Н': 'N', 'О': 'O',
    'П': 'P', 'Р': 'R', 'С': 'S', 'Т': 'T', 'У': 'U',
    'Ф': 'F', 'Х': 'H', 'Ц': 'Ts', 'Ч': 'Ch', 'Ш': 'Sh',
    'Щ': 'Sht', 'Ъ': 'A', 'Ь': '', 'Ю': 'Yu', 'Я': 'Ya',
})


def transliterate_cyrillic(text: str) -> str:
    """Transliterate Bulgarian Cyrillic *text* to Latin.

    Known place names are resolved through the curated CYRILLIC_TO_ASCII
    mapping first; anything else falls back to character-by-character
    transliteration via _BG_TRANSLIT_TABLE. Non-Cyrillic characters pass
    through unchanged.
    """
    if text in CYRILLIC_TO_ASCII:
        return CYRILLIC_TO_ASCII[text]
    return text.translate(_BG_TRANSLIT_TABLE)
||||
|
||||
|
||||
def lookup_city_in_geonames(conn: sqlite3.Connection, city_name: str) -> dict | None:
    """Look up a Bulgarian city in the GeoNames database.

    The Cyrillic name is converted to Latin via CYRILLIC_TO_ASCII (curated)
    or transliterate_cyrillic() (fallback), then matched against the
    'cities' table: first exactly, then as a prefix. Only populated-place
    feature codes are considered and the most populous hit wins.

    Args:
        conn: Open SQLite connection to the GeoNames database.
        city_name: City name in Bulgarian Cyrillic.

    Returns:
        Dict with name, ascii_name, admin1_code, admin1_name, geonames_id,
        latitude, longitude, population and feature_code, or None.
    """
    cursor = conn.cursor()

    # First try direct ASCII lookup, then fall back to transliteration.
    ascii_name = CYRILLIC_TO_ASCII.get(city_name) or transliterate_cyrillic(city_name)

    columns = ('name', 'ascii_name', 'admin1_code', 'admin1_name', 'geonames_id',
               'latitude', 'longitude', 'population', 'feature_code')
    base_query = """
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code
        FROM cities
        WHERE country_code='BG'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
          AND ({})
        ORDER BY population DESC
        LIMIT 1
    """

    # Exact match first, then a prefix match. (DRY: the original duplicated
    # both the query text and the result-dict construction.)
    attempts = (
        ('ascii_name = ? OR name = ?', (ascii_name, ascii_name)),
        ('ascii_name LIKE ? OR name LIKE ?', (f'{ascii_name}%', f'{ascii_name}%')),
    )
    for condition, params in attempts:
        cursor.execute(base_query.format(condition), params)
        row = cursor.fetchone()
        if row:
            return dict(zip(columns, row))

    return None
||||
|
||||
|
||||
def process_file(filepath: Path, conn: sqlite3.Connection, dry_run: bool = False) -> dict:
    """Process a single Bulgarian custodian file.

    Loads the YAML, resolves the Cyrillic city via GeoNames, rewrites the
    GHCID from BG-XX-XXX-{type}-{abbrev} to BG-{region}-{city}-{type}-{abbrev},
    records the change in location_resolution / ghcid_history / identifiers,
    writes the file back and renames it after the new GHCID.

    Args:
        filepath: Custodian YAML file to update.
        conn: Open SQLite connection to the GeoNames database.
        dry_run: When True, compute the new GHCID but write nothing.

    Returns:
        Result dict with 'status' in {'updated', 'would_update', 'skipped',
        'error', 'collision'}, old/new GHCIDs, city names, and an 'error'
        message when applicable.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city_cyrillic': None,
        'city_ascii': None,
        'error': None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['status'] = 'error'
        result['error'] = f'Failed to load YAML: {e}'
        return result

    if not data:
        result['status'] = 'error'
        result['error'] = 'Empty YAML file'
        return result

    # Get current GHCID
    ghcid_data = data.get('ghcid', {})
    old_ghcid = ghcid_data.get('ghcid_current', '')
    result['old_ghcid'] = old_ghcid

    # Check if it's a BG-XX-XXX file (i.e. region and city still unresolved)
    if not old_ghcid.startswith('BG-XX-XXX-'):
        result['status'] = 'skipped'
        result['error'] = 'Not a BG-XX-XXX file'
        return result

    # Extract city from original_entry or locations
    # Only the first location entry is consulted.
    city_cyrillic = None

    if 'original_entry' in data and 'locations' in data['original_entry']:
        locations = data['original_entry']['locations']
        if locations and isinstance(locations, list) and len(locations) > 0:
            city_cyrillic = locations[0].get('city')

    if not city_cyrillic:
        result['status'] = 'error'
        result['error'] = 'No city found in original_entry'
        return result

    result['city_cyrillic'] = city_cyrillic

    # Look up city in GeoNames
    city_info = lookup_city_in_geonames(conn, city_cyrillic)

    if not city_info:
        result['status'] = 'error'
        result['error'] = f'City not found in GeoNames: {city_cyrillic}'
        return result

    result['city_ascii'] = city_info['ascii_name']

    # Get region code (falls back to 'XX' for unmapped admin1 codes)
    admin1_code = city_info['admin1_code']
    region_code = ADMIN1_TO_ISO.get(admin1_code, 'XX')

    # Generate city code
    city_code = get_city_code(city_info['ascii_name'])

    # Build new GHCID
    # Extract type and abbreviation from old GHCID
    # Format: BG-XX-XXX-{type}-{abbrev}
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        inst_type = parts[3]
        abbreviation = '-'.join(parts[4:])  # May contain hyphens
    else:
        result['status'] = 'error'
        result['error'] = f'Invalid GHCID format: {old_ghcid}'
        return result

    new_ghcid = f'BG-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    if dry_run:
        result['status'] = 'would_update'
        return result

    # Update the GHCID data
    timestamp = datetime.now(timezone.utc).isoformat()

    # Update ghcid section
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'GEONAMES_LOOKUP',
        'country_code': 'BG',
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'city_name_cyrillic': city_cyrillic,
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'resolution_date': timestamp,
    }

    # Add to GHCID history
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []

    # Mark old GHCID as ended (only entries without an existing valid_to)
    for entry in data['ghcid']['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp

    # Add new GHCID entry
    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': data['ghcid'].get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'City resolved via GeoNames: {city_cyrillic} → {city_info["ascii_name"]} ({region_code})',
    })

    # Update identifiers
    if 'identifiers' in data:
        for identifier in data['identifiers']:
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid

    # Calculate new file path
    new_filename = f'{new_ghcid}.yaml'
    new_filepath = filepath.parent / new_filename

    # Write updated data
    # NOTE(review): the file is rewritten in place BEFORE the collision check
    # below, so on a 'collision' the old-named file already carries the new
    # GHCID while keeping its old name — confirm this is intended.
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file
    if filepath != new_filepath and not new_filepath.exists():
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)
    elif new_filepath.exists() and filepath != new_filepath:
        result['status'] = 'collision'
        result['error'] = f'Target file already exists: {new_filepath}'
        return result

    result['status'] = 'updated'
    return result
||||
|
||||
|
||||
def main():
    """CLI entry point: enrich BG-XX-XXX custodian files with GeoNames data.

    Parses --dry-run / --limit, iterates the matching custodian YAML files,
    delegates each to process_file(), and prints a per-file line followed by
    a summary and an error list.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Enrich Bulgarian custodian files with GeoNames data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    args = parser.parse_args()

    # FIX: resolve paths relative to the repository root instead of a
    # hard-coded user home directory, consistent with the sibling
    # enrichment scripts (e.g. enrich_belgian_v2.py).
    base_dir = Path(__file__).resolve().parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    if not geonames_db.exists():
        print(f'ERROR: GeoNames database not found: {geonames_db}')
        return

    # Find all Bulgarian XXX files
    files = sorted(custodian_dir.glob('BG-XX-XXX-*.yaml'))

    if args.limit:
        files = files[:args.limit]

    print(f'Found {len(files)} Bulgarian XXX files')
    print(f'Dry run: {args.dry_run}')
    print()

    # One shared connection for all lookups (read-only access pattern).
    conn = sqlite3.connect(str(geonames_db))

    stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}
    errors = []

    for filepath in files:
        result = process_file(filepath, conn, dry_run=args.dry_run)
        stats[result['status']] = stats.get(result['status'], 0) + 1

        if result['status'] == 'updated' or result['status'] == 'would_update':
            print(f"✓ {result['city_cyrillic']} → {result['city_ascii']}: {result['old_ghcid']} → {result['new_ghcid']}")
        elif result['status'] == 'error':
            print(f"✗ {filepath.name}: {result['error']}")
            errors.append(result)
        elif result['status'] == 'collision':
            print(f"⚠ {filepath.name}: {result['error']}")

    conn.close()

    print()
    print('=' * 60)
    print('Summary:')
    print(f"  Updated: {stats.get('updated', 0)}")
    print(f"  Would update: {stats.get('would_update', 0)}")
    print(f"  Errors: {stats.get('error', 0)}")
    print(f"  Collisions: {stats.get('collision', 0)}")
    print(f"  Skipped: {stats.get('skipped', 0)}")

    if errors:
        print()
        print('Errors:')
        for err in errors:
            print(f"  - {err['file']}: {err['error']}")


if __name__ == '__main__':
    main()
||||
459
scripts/enrich_cities_google.py
Executable file
459
scripts/enrich_cities_google.py
Executable file
|
|
@ -0,0 +1,459 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Enrich custodian files with city/region data using Google Places API.
|
||||
|
||||
This is a generic script that works for any country's XXX files.
|
||||
|
||||
Usage:
|
||||
python scripts/enrich_cities_google.py --country KR [--dry-run] [--limit N]
|
||||
python scripts/enrich_cities_google.py --country AR [--dry-run] [--limit N]
|
||||
python scripts/enrich_cities_google.py --all [--dry-run] [--limit N]
|
||||
|
||||
Environment Variables:
|
||||
GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API enabled
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import sqlite3
|
||||
import re
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
import httpx
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables from a local .env file, if present.
load_dotenv()

# Configuration
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
# FIX: resolve data paths relative to the repository root instead of a
# hard-coded user home directory, consistent with the sibling enrichment
# scripts (e.g. enrich_belgian_v2.py).
_REPO_ROOT = Path(__file__).resolve().parent.parent
GEONAMES_DB = _REPO_ROOT / "data" / "reference" / "geonames.db"
CUSTODIAN_DIR = _REPO_ROOT / "data" / "custodian"

# Google Places API (New) text-search endpoint and request pacing.
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
REQUEST_DELAY = 0.3  # seconds between API calls, to stay within quota

# Country name mapping for search queries (ISO 3166-1 alpha-2 → English name)
COUNTRY_NAMES = {
    'KR': 'South Korea',
    'AR': 'Argentina',
    'US': 'United States',
    'IN': 'India',
    'JM': 'Jamaica',
    'UZ': 'Uzbekistan',
    'UA': 'Ukraine',
    'TJ': 'Tajikistan',
    'OM': 'Oman',
    'NL': 'Netherlands',
    'NA': 'Namibia',
    'ML': 'Mali',
    'LK': 'Sri Lanka',
    'LB': 'Lebanon',
    'IT': 'Italy',
    'IR': 'Iran',
    'EC': 'Ecuador',
    'DK': 'Denmark',
    'CU': 'Cuba',
    'CO': 'Colombia',
    'BR': 'Brazil',
    'MX': 'Mexico',
    'JP': 'Japan',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'FR': 'France',
    'GB': 'United Kingdom',
}
||||
|
||||
|
||||
def get_city_code(city_name: str) -> str:
    """Generate a 3-letter uppercase city code from *city_name*.

    Common municipal suffixes (' City', ' Town', '-shi', '-ku', '-gun',
    '-cho', ' District') are stripped first. Then: one word -> its first
    three letters; two words -> initial of the first plus the first two
    letters of the second; otherwise the initials of the first three words.
    """
    name = city_name.strip()
    # Drop any trailing municipal suffixes before deriving the code.
    for suffix in (' City', ' Town', '-shi', '-ku', '-gun', '-cho', ' District'):
        if name.endswith(suffix):
            name = name[: len(name) - len(suffix)]

    parts = name.split()

    if len(parts) == 1:
        return name[:3].upper()
    if len(parts) == 2:
        return (parts[0][0] + parts[1][:2]).upper()
    return ''.join(word[0] for word in parts[:3]).upper()
||||
|
||||
|
||||
def search_google_places(query: str, api_key: str) -> Optional[dict]:
    """Run a Places API (New) text search and return the top hit, or None.

    Any HTTP or parsing failure is printed and swallowed so one bad lookup
    does not abort a long batch run.
    """
    request_headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": api_key,
        "X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.location,places.addressComponents,places.types,places.id,places.websiteUri"
    }
    body = {
        "textQuery": query,
        "languageCode": "en"
    }

    try:
        resp = httpx.post(TEXT_SEARCH_URL, json=body, headers=request_headers, timeout=30)
        resp.raise_for_status()
        payload = resp.json()
        if "places" in payload and len(payload["places"]) > 0:
            return payload["places"][0]
        return None
    except Exception as exc:
        print(f"  Error searching Google Places: {exc}")
        return None
||||
|
||||
|
||||
def extract_location_from_google(place: dict) -> dict:
    """Flatten a Google Places result into a simple location dict.

    Returns a dict with keys city, region, latitude, longitude,
    formatted_address, place_id and website; every value defaults to None
    when *place* is falsy or the component is missing. The 'locality'
    address component becomes the city, 'administrative_area_level_1' the
    region, and 'sublocality_level_1' is used as a city fallback.
    """
    info = dict.fromkeys(
        ('city', 'region', 'latitude', 'longitude',
         'formatted_address', 'place_id', 'website'))

    if not place:
        return info

    info['place_id'] = place.get('id')
    info['formatted_address'] = place.get('formattedAddress')
    info['website'] = place.get('websiteUri')

    coords = place.get('location', {})
    info['latitude'] = coords.get('latitude')
    info['longitude'] = coords.get('longitude')

    for component in place.get('addressComponents', []):
        kinds = component.get('types', [])
        text = component.get('longText', '')
        if 'locality' in kinds:
            info['city'] = text
        elif 'administrative_area_level_1' in kinds:
            info['region'] = text
        elif 'sublocality_level_1' in kinds and not info['city']:
            info['city'] = text

    return info
||||
|
||||
|
||||
def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float, country_code: str) -> Optional[dict]:
    """Reverse geocode coordinates to the nearest GeoNames populated place.

    Distance is the naive squared difference in degrees (no haversine),
    which is adequate for picking the closest city within one country.
    Only populated-place feature codes are considered.

    Returns a dict with name, ascii_name, admin1_code, admin1_name,
    geonames_id, latitude, longitude, population and feature_code, or None
    when the country has no matching rows.
    """
    query = """
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code,
               ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as dist_sq
        FROM cities
        WHERE country_code = ?
        AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY dist_sq
        LIMIT 1
    """
    row = conn.cursor().execute(query, (lat, lat, lon, lon, country_code)).fetchone()
    if row is None:
        return None

    keys = ('name', 'ascii_name', 'admin1_code', 'admin1_name', 'geonames_id',
            'latitude', 'longitude', 'population', 'feature_code')
    # zip drops the trailing dist_sq column, which callers do not need.
    return dict(zip(keys, row))
||||
|
||||
|
||||
def get_region_code(admin1_code: str, country_code: str, admin1_name: str) -> str:
    """Derive a short uppercase region code for GHCID construction.

    Fallback order: 'XX' when admin1_code is empty; the admin1_code itself
    (uppercased) when it is already at most 3 characters; otherwise an
    abbreviation of admin1_name (first two letters of a single-word name,
    initials of the first two words otherwise); finally the first two
    characters of admin1_code. *country_code* is accepted for interface
    compatibility but not consulted.
    """
    if not admin1_code:
        return 'XX'

    if len(admin1_code) <= 3:
        return admin1_code.upper()

    if admin1_name:
        pieces = admin1_name.split()
        if len(pieces) == 1:
            return admin1_name[:2].upper()
        return ''.join(word[0] for word in pieces[:2]).upper()

    return admin1_code[:2].upper()
||||
|
||||
|
||||
def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str,
                 country_code: str, country_name: str, dry_run: bool = False) -> dict:
    """Process a single custodian file.

    Resolves the institution's location by searching Google Places for the
    institution name, reverse-geocoding the returned coordinates against
    GeoNames, then rewriting the GHCID's region/city segments, recording
    provenance (location_resolution, google_maps_enrichment, ghcid_history,
    identifiers) and renaming the file to match the new GHCID.

    Args:
        filepath: Custodian YAML file to update.
        conn: Open SQLite connection to the GeoNames database.
        api_key: Google Places API key.
        country_code: ISO 3166-1 alpha-2 code used in the GHCID prefix.
        country_name: English country name appended to the search query.
        dry_run: When True, compute the new GHCID but write nothing.

    Returns:
        Result dict with 'status' in {'updated', 'would_update', 'skipped',
        'error', 'collision'}, old/new GHCIDs, resolved city/region, and an
        'error' message when applicable.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'region': None,
        'error': None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['status'] = 'error'
        result['error'] = f'Failed to load YAML: {e}'
        return result

    if not data:
        result['status'] = 'error'
        result['error'] = 'Empty YAML file'
        return result

    ghcid_data = data.get('ghcid', {})
    old_ghcid = ghcid_data.get('ghcid_current', '')
    result['old_ghcid'] = old_ghcid

    # Match both patterns:
    # 1. {country}-XX-XXX-... (no region, no city)
    # 2. {country}-{region}-XXX-... (has region, no city)
    # Both are covered by one regex because 'XX' itself matches [A-Z]{2}.
    xxx_pattern = re.compile(rf'^{country_code}-[A-Z]{{2}}-XXX-')
    if not xxx_pattern.match(old_ghcid):
        result['status'] = 'skipped'
        result['error'] = f'Not a {country_code}-*-XXX file'
        return result

    # Get institution name (preferred claim, falling back to the raw entry)
    name = data.get('custodian_name', {}).get('claim_value', '')
    if not name:
        name = data.get('original_entry', {}).get('name', '')

    if not name:
        result['status'] = 'error'
        result['error'] = 'No institution name found'
        return result

    # Search Google Places; sleep afterwards to respect rate limits.
    search_query = f"{name} {country_name}"
    print(f"  Searching: {name[:50]}...")
    place = search_google_places(search_query, api_key)
    time.sleep(REQUEST_DELAY)

    if not place:
        result['status'] = 'error'
        result['error'] = 'Not found in Google Places'
        return result

    location_info = extract_location_from_google(place)

    if not location_info['latitude'] or not location_info['longitude']:
        result['status'] = 'error'
        result['error'] = 'No coordinates from Google'
        return result

    # Lookup in GeoNames (nearest populated place to Google's coordinates)
    city_info = lookup_city_geonames(conn, location_info['latitude'],
                                     location_info['longitude'], country_code)

    if not city_info:
        result['status'] = 'error'
        result['error'] = 'City not found in GeoNames'
        return result

    region_code = get_region_code(city_info['admin1_code'], country_code, city_info['admin1_name'])
    city_code = get_city_code(city_info['ascii_name'])

    result['city'] = city_info['ascii_name']
    result['region'] = city_info['admin1_name']

    # Build new GHCID from the old one's {type}-{abbrev} tail
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        inst_type = parts[3]
        abbreviation = '-'.join(parts[4:])
    else:
        result['status'] = 'error'
        result['error'] = f'Invalid GHCID format: {old_ghcid}'
        return result

    new_ghcid = f'{country_code}-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    if dry_run:
        result['status'] = 'would_update'
        return result

    # Update the data
    timestamp = datetime.now(timezone.utc).isoformat()

    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'GOOGLE_PLACES_GEONAMES',
        'country_code': country_code,
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'google_place_id': location_info.get('place_id'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'resolution_date': timestamp,
    }

    data['google_maps_enrichment'] = {
        'place_id': location_info.get('place_id'),
        'formatted_address': location_info.get('formatted_address'),
        'website': location_info.get('website'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'enriched_at': timestamp,
        'source': 'Google Places API (New)',
    }

    # Update GHCID history
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []

    # Close the old GHCID's validity window (only entries without valid_to)
    for entry in data['ghcid']['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp

    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': data['ghcid'].get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'Location resolved via Google Places + GeoNames: {city_info["ascii_name"]} ({region_code})',
    })

    if 'identifiers' in data:
        for identifier in data['identifiers']:
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid

    # Write and rename
    # NOTE(review): the file is rewritten in place BEFORE the collision check
    # below, so on a 'collision' the old-named file already carries the new
    # GHCID while keeping its old name — confirm this is intended.
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    new_filename = f'{new_ghcid}.yaml'
    new_filepath = filepath.parent / new_filename

    if filepath != new_filepath and not new_filepath.exists():
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)
    elif new_filepath.exists() and filepath != new_filepath:
        result['status'] = 'collision'
        result['error'] = f'Target file exists: {new_filepath.name}'
        return result

    result['status'] = 'updated'
    return result
||||
|
||||
|
||||
def main():
    """CLI entry point: enrich custodian files that still carry the XXX city
    placeholder with Google Places + GeoNames data.

    Requires the GOOGLE_PLACES_TOKEN environment variable and the local
    GeoNames SQLite database; exits with status 1 if either is missing.
    Processes one country (--country CODE) or every country that has
    placeholder files (--all), printing per-country and total statistics.
    """
    parser = argparse.ArgumentParser(description='Enrich custodian files with Google Places data')
    parser.add_argument('--country', type=str, help='Country code (e.g., KR, AR, US)')
    parser.add_argument('--all', action='store_true', help='Process all countries with XXX files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, help='Limit number of files per country')
    args = parser.parse_args()

    # Fail fast on missing prerequisites before touching any files.
    if not GOOGLE_PLACES_TOKEN:
        print("ERROR: GOOGLE_PLACES_TOKEN environment variable is required")
        sys.exit(1)

    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    # Determine which countries to process
    if args.all:
        # Find all countries with XXX files (either XX-XXX or {region}-XXX).
        # The first two characters of the filename are the ISO country code.
        countries = set()
        for f in CUSTODIAN_DIR.glob('*-*-XXX-*.yaml'):
            cc = f.name[:2]
            if cc in COUNTRY_NAMES:
                countries.add(cc)
        countries = sorted(countries)
    elif args.country:
        countries = [args.country.upper()]
    else:
        print("ERROR: Specify --country CODE or --all")
        sys.exit(1)

    # Single shared connection for all lookups; closed after the loop.
    conn = sqlite3.connect(str(GEONAMES_DB))

    total_stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}

    for country_code in countries:
        country_name = COUNTRY_NAMES.get(country_code, country_code)

        files = sorted(CUSTODIAN_DIR.glob(f'{country_code}-*-XXX-*.yaml'))

        if args.limit:
            files = files[:args.limit]

        if not files:
            continue

        print(f"\n{'='*60}")
        print(f"Processing {country_code} ({country_name}): {len(files)} files")
        print('='*60)

        # Per-country counters, folded into total_stats below.
        stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}

        for filepath in files:
            print(f"Processing: {filepath.name}")
            result = process_file(filepath, conn, GOOGLE_PLACES_TOKEN,
                                  country_code, country_name, dry_run=args.dry_run)
            # .get with default tolerates statuses not pre-seeded in stats.
            stats[result['status']] = stats.get(result['status'], 0) + 1

            if result['status'] in ('updated', 'would_update'):
                print(f" ✓ {result['city']} ({result['region']}): {result['old_ghcid']} → {result['new_ghcid']}")
            elif result['status'] == 'error':
                print(f" ✗ {result['error']}")
            elif result['status'] == 'collision':
                print(f" ⚠ {result['error']}")

        print(f"\n{country_code} Summary: Updated={stats.get('updated', 0)}, "
              f"Would update={stats.get('would_update', 0)}, "
              f"Errors={stats.get('error', 0)}")

        for k, v in stats.items():
            total_stats[k] = total_stats.get(k, 0) + v

    conn.close()

    print()
    print('='*60)
    print('TOTAL Summary:')
    print(f" Updated: {total_stats.get('updated', 0)}")
    print(f" Would update: {total_stats.get('would_update', 0)}")
    print(f" Errors: {total_stats.get('error', 0)}")
    print(f" Collisions: {total_stats.get('collision', 0)}")
    print(f" Skipped: {total_stats.get('skipped', 0)}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
791
scripts/enrich_czech_cities.py
Normal file
791
scripts/enrich_czech_cities.py
Normal file
|
|
@ -0,0 +1,791 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Enrich Czech custodian files with city data from the CH-Annotator source file.
|
||||
|
||||
For Czech custodian files with XXX city placeholder, this script:
|
||||
1. Loads the source CH-Annotator file (czech_unified_ch_annotator.yaml)
|
||||
2. Matches by name, ARON UUID, or Wikidata ID to get city/coordinates
|
||||
3. Falls back to Wikidata P131 lookup via SPARQL for missing data
|
||||
4. Updates the GHCID with correct city code
|
||||
5. Renames the file if GHCID changes
|
||||
|
||||
Usage:
|
||||
python scripts/enrich_czech_cities.py [--dry-run] [--limit N]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sqlite3
|
||||
import time
|
||||
import uuid
|
||||
import yaml
|
||||
import requests
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
# Paths — all relative to the repository root (parent of scripts/)
PROJECT_ROOT = Path(__file__).parent.parent
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
REPORTS_DIR = PROJECT_ROOT / "reports"
CZECH_CH_ANNOTATOR_FILE = PROJECT_ROOT / "data" / "instances" / "czech_unified_ch_annotator.yaml"

# GHCID namespace for UUID generation
# NOTE(review): this value is byte-identical to uuid.NAMESPACE_DNS — confirm
# that a project-specific namespace was not intended instead.
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')

# Rate limiting for Wikidata SPARQL requests (seconds between calls)
REQUEST_DELAY = 1.0

# Czech region mapping (GeoNames admin1 code -> ISO 3166-2:CZ region code)
CZECH_ADMIN1_MAP = {
    '52': 'JC',  # Jihočeský (South Bohemian)
    '78': 'JM',  # Jihomoravský (South Moravian)
    '81': 'KA',  # Karlovarský (Karlovy Vary)
    '82': 'VY',  # Vysočina (Vysočina)
    '51': 'KR',  # Královéhradecký (Hradec Králové)
    '53': 'LI',  # Liberecký (Liberec)
    '84': 'MO',  # Moravskoslezský (Moravian-Silesian)
    '85': 'OL',  # Olomoucký (Olomouc)
    '86': 'PA',  # Pardubický (Pardubice)
    '54': 'PL',  # Plzeňský (Plzeň)
    '10': 'PR',  # Praha (Prague)
    '55': 'ST',  # Středočeský (Central Bohemian)
    '56': 'US',  # Ústecký (Ústí nad Labem)
    '87': 'ZL',  # Zlínský (Zlín)
}

# Region name to code mapping (region names as they appear in the source data)
CZECH_REGION_NAMES = {
    'Jihočeský': 'JC',
    'Jihomoravský': 'JM',
    'Karlovarský': 'KA',
    'Vysočina': 'VY',
    'Královéhradecký': 'KR',
    'Liberecký': 'LI',
    'Moravskoslezský': 'MO',
    'Olomoucký': 'OL',
    'Pardubický': 'PA',
    'Plzeňský': 'PL',
    'Hlavní město Praha': 'PR',
    'Praha': 'PR',
    'Středočeský': 'ST',
    'Ústecký': 'US',
    'Zlínský': 'ZL',
}
|
||||
|
||||
|
||||
def extract_city_from_name(name: str) -> Optional[str]:
    """Try to extract a city name from Czech institution name patterns.

    Looks for locative phrases such as "v Praze", "ve Šlapanicích" or
    "nad Metují" / "pod ..." and converts the captured city name back to
    nominative case (best effort).

    Args:
        name: Institution name, possibly empty or None.

    Returns:
        The extracted city name in (approximate) nominative case, or None
        when no pattern matches.
    """
    if not name:
        return None

    # Czech letter classes, including diacritics, used to capture
    # capitalized multi-word city names.
    upper = 'A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ'
    lower = 'a-záčďéěíňóřšťúůýž'
    word = f'[{upper}][{lower}]+'

    # Fix vs. original: the separate "v" and "ve" patterns are merged into
    # one (\bve?\s+), so the leftmost phrase in the name wins instead of the
    # match priority depending on pattern order; the redundant function-local
    # `import re` is dropped (re is imported at module level).
    patterns = [
        # "v CityName" / "ve CityName" (locative case) — most common.
        rf'\bve?\s+({word}(?:\s+{word})*)',
        # "nad CityName" or "pod CityName" (river/hill qualifiers).
        rf'\b(?:nad|pod)\s+({word})',
    ]

    for pattern in patterns:
        match = re.search(pattern, name)
        if match:
            # Convert locative case to nominative (approximation).
            return convert_locative_to_nominative(match.group(1))

    return None
|
||||
|
||||
|
||||
def convert_locative_to_nominative(city: str) -> str:
    """Convert a Czech city name from locative case to nominative (best effort).

    Czech declension is complex, so only well-known city names are mapped
    explicitly; any other input is returned unchanged.

    Args:
        city: City name in locative case (e.g. "Praze").

    Returns:
        The nominative form when known (e.g. "Praha"), otherwise ``city``.
    """
    # Fix vs. original: a dict lookup replaces the linear scan over a list of
    # pairs, and the dead `if city.endswith(...): pass` branch is removed.
    replacements = {
        'Praze': 'Praha',
        'Brně': 'Brno',
        'Hradci Králové': 'Hradec Králové',
        'Havlíčkově Brodě': 'Havlíčkův Brod',
        'Liberci': 'Liberec',
        'Olomouci': 'Olomouc',
        'Plzni': 'Plzeň',
        'Ostravě': 'Ostrava',
        'Ústí nad Labem': 'Ústí nad Labem',  # identical in both cases
        'Opavě': 'Opava',
    }
    return replacements.get(city, city)
|
||||
|
||||
|
||||
def normalize_czech_name(name: str) -> str:
    """Normalize a Czech institution name for matching.

    Strips common legal-form suffixes (p. o., o. p. s., s. r. o.,
    "příspěvková organizace"), collapses runs of whitespace, and trims
    leading/trailing separators.

    Args:
        name: Raw institution name; may be empty or None.

    Returns:
        The normalized name, or '' for empty input.
    """
    if not name:
        return ''

    # Fix vs. original: suffixes are ordered longest-first so the
    # comma-prefixed variants (', příspěvková organizace', ', p. o.') are
    # removed as a unit — in the original list order their shorter
    # substrings were stripped first, leaving a stray comma mid-name.
    suffixes = [
        ', příspěvková organizace',
        'příspěvková organizace',
        ', p. o.',
        'o. p. s.',
        'o.p.s.',
        's. r. o.',
        's.r.o.',
        'p. o.',
        'p.o.',
    ]

    result = name
    for suffix in suffixes:
        result = result.replace(suffix, '')

    # Collapse internal whitespace and trim leftover separators.
    result = ' '.join(result.split())
    return result.strip(' -,')
|
||||
|
||||
|
||||
def load_czech_source_data() -> Dict[str, Dict]:
    """Load the Czech CH-Annotator source file and build lookup tables.

    Returns a dict with three indexes over the source entries:
    'by_name' (exact, lowercased, and legal-form-normalized names and
    alternative names), 'by_aron_uuid', and 'by_wikidata'. All indexes map
    to the same per-entry location dict (city, region, region_code,
    postal_code, street_address, latitude, longitude, name). Entries
    without a city are skipped. Returns empty indexes when the source file
    is missing or empty.
    """
    by_name = {}
    by_aron_uuid = {}
    by_wikidata = {}

    if not CZECH_CH_ANNOTATOR_FILE.exists():
        print(f"Warning: Czech CH-Annotator file not found: {CZECH_CH_ANNOTATOR_FILE}")
        return {'by_name': by_name, 'by_aron_uuid': by_aron_uuid, 'by_wikidata': by_wikidata}

    print(f"Loading Czech CH-Annotator source file...")
    with open(CZECH_CH_ANNOTATOR_FILE, 'r', encoding='utf-8') as f:
        entries = yaml.safe_load(f)

    if not entries:
        return {'by_name': by_name, 'by_aron_uuid': by_aron_uuid, 'by_wikidata': by_wikidata}

    for entry in entries:
        if not isinstance(entry, dict):
            continue

        # Extract location data; only the first location of an entry is used.
        locations = entry.get('locations', [])
        if not locations:
            continue

        loc = locations[0] if locations else {}
        if not loc.get('city'):
            continue

        # One shared dict per entry; every index below points at it.
        location_data = {
            'city': loc.get('city'),
            'region': loc.get('region'),
            'region_code': CZECH_REGION_NAMES.get(loc.get('region', ''), None),
            'postal_code': loc.get('postal_code'),
            'street_address': loc.get('street_address'),
            'latitude': loc.get('latitude'),
            'longitude': loc.get('longitude'),
            'name': entry.get('name', '')
        }

        # Index by name (exact and normalized); lowercase variants allow
        # case-insensitive lookups without changing the caller's key.
        name = entry.get('name', '')
        if name:
            by_name[name] = location_data
            by_name[name.lower()] = location_data
            # Also normalized version (legal-form suffixes stripped).
            normalized = normalize_czech_name(name)
            if normalized and normalized != name:
                by_name[normalized] = location_data
                by_name[normalized.lower()] = location_data

        # Index by alternative names
        for alt_name in entry.get('alternative_names', []):
            if alt_name:
                by_name[alt_name] = location_data
                by_name[alt_name.lower()] = location_data
                normalized = normalize_czech_name(alt_name)
                if normalized and normalized != alt_name:
                    by_name[normalized] = location_data
                    by_name[normalized.lower()] = location_data

        # Index by ARON UUID and Wikidata
        for ident in entry.get('identifiers', []):
            if not isinstance(ident, dict):
                continue
            scheme = ident.get('identifier_scheme', '')
            value = ident.get('identifier_value', '')
            if scheme == 'ARON_UUID' and value:
                by_aron_uuid[value] = location_data
            elif scheme == 'Wikidata' and value:
                by_wikidata[value] = location_data

    print(f"  Loaded {len(by_name)} by name, {len(by_aron_uuid)} by ARON UUID, {len(by_wikidata)} by Wikidata")
    return {'by_name': by_name, 'by_aron_uuid': by_aron_uuid, 'by_wikidata': by_wikidata}
|
||||
|
||||
|
||||
def generate_city_code(city_name: str) -> str:
    """Generate a 3-letter city code from a city name.

    Diacritics are stripped; Czech prepositions ("nad", "pod", ...) are
    ignored. A single significant word contributes its first three letters;
    multiple words contribute their initials (up to three). Empty input
    yields the 'XXX' placeholder.
    """
    if not city_name:
        return 'XXX'

    # Strip diacritics: decompose to NFD, then drop combining marks.
    import unicodedata
    decomposed = unicodedata.normalize('NFD', city_name)
    plain = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Ignore Czech articles/prepositions when picking significant words.
    skip_words = {'nad', 'pod', 'u', 'v', 've', 'na', 'do', 'z', 'ze', 'k', 'ke', 'o', 's', 'se'}
    tokens = plain.split()
    meaningful = [t for t in tokens if t.lower() not in skip_words] or tokens

    if len(meaningful) == 1:
        # Single significant word: first 3 letters.
        return meaningful[0][:3].upper()
    # Multiple words: initials of up to three of them.
    return ''.join(t[0] for t in meaningful[:3]).upper()
|
||||
|
||||
|
||||
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Generate a deterministic UUID v5 string from a GHCID string."""
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
|
||||
|
||||
|
||||
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Generate a UUID v8-style identifier from the SHA-256 of the GHCID string.

    The first 16 bytes of the digest are stamped with version 8 and the
    RFC 4122 variant bits, then formatted as a canonical UUID string.
    """
    digest = bytearray(hashlib.sha256(ghcid_string.encode('utf-8')).digest()[:16])
    digest[6] = (digest[6] & 0x0F) | 0x80  # version nibble -> 8
    digest[8] = (digest[8] & 0x3F) | 0x80  # RFC 4122 variant bits
    return str(uuid.UUID(bytes=bytes(digest)))
|
||||
|
||||
|
||||
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Generate a 64-bit numeric ID from the SHA-256 hash of the GHCID string."""
    full_digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(full_digest[:8], byteorder='big')
|
||||
|
||||
|
||||
def fetch_wikidata_location(wikidata_id: str, session: requests.Session) -> Optional[Dict]:
    """Fetch a location via Wikidata SPARQL (P131 'located in admin entity').

    Args:
        wikidata_id: Wikidata item ID ('Q...'); anything else returns None.
        session: requests session used for the SPARQL HTTP call.

    Returns:
        A dict with city, region, region_code, latitude, longitude and
        source='wikidata_sparql', or None on invalid input, no result, or
        any request/parse error (the error is printed, not raised).
    """
    if not wikidata_id or not wikidata_id.startswith('Q'):
        return None

    # P131* walks up the administrative hierarchy; Q515 restricts to cities,
    # Q20916591 identifies Czech regions for the optional region binding.
    query = f"""
    SELECT ?cityLabel ?regionLabel ?coords WHERE {{
      wd:{wikidata_id} wdt:P131* ?city .
      ?city wdt:P31/wdt:P279* wd:Q515 . # city
      OPTIONAL {{ ?city wdt:P625 ?coords }}
      OPTIONAL {{
        wd:{wikidata_id} wdt:P131+ ?region .
        ?region wdt:P31 wd:Q20916591 . # Czech region
      }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "cs,en" }}
    }}
    LIMIT 1
    """

    try:
        response = session.get(
            'https://query.wikidata.org/sparql',
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': 'GLAMDataExtractor/1.0'},
            timeout=30
        )
        response.raise_for_status()
        data = response.json()

        results = data.get('results', {}).get('bindings', [])
        if results:
            result = results[0]
            city = result.get('cityLabel', {}).get('value', '')
            region = result.get('regionLabel', {}).get('value', '')
            coords = result.get('coords', {}).get('value', '')

            lat, lon = None, None
            if coords and coords.startswith('Point('):
                # Parse WKT "Point(lon lat)" — note longitude comes first.
                match = re.match(r'Point\(([^ ]+) ([^)]+)\)', coords)
                if match:
                    lon, lat = float(match.group(1)), float(match.group(2))

            return {
                'city': city,
                'region': region,
                'region_code': CZECH_REGION_NAMES.get(region, None),
                'latitude': lat,
                'longitude': lon,
                'source': 'wikidata_sparql'
            }
    except Exception as e:
        # Best-effort lookup: log and fall through to None on any failure.
        print(f" Wikidata SPARQL error: {e}")

    return None
|
||||
|
||||
|
||||
def reverse_geocode_city(city_name: str, country_code: str, db_path: Path) -> Optional[Dict]:
    """Look up a city in the GeoNames SQLite database.

    Tries an exact (case-insensitive) name match first, then falls back to
    a prefix LIKE match, always preferring the most populous candidate.

    Args:
        city_name: City name to resolve.
        country_code: ISO country code used to filter candidates.
        db_path: Path to the GeoNames SQLite database.

    Returns:
        A dict with geonames_id, names, coordinates, population, feature
        code, admin1 info and the mapped ISO region_code, or None when no
        match is found or any error occurs (the error is printed).
    """
    # Fix vs. original: the connection is now closed in a finally block —
    # previously an exception between connect() and close() leaked the
    # connection because the broad except returned without closing it.
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Try exact match first (most populous candidate wins).
        cursor.execute("""
            SELECT geonames_id, name, ascii_name, latitude, longitude,
                   population, feature_code, admin1_code, admin1_name
            FROM cities
            WHERE country_code = ?
              AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
              AND (name = ? OR ascii_name = ? OR LOWER(name) = LOWER(?))
            ORDER BY population DESC
            LIMIT 1
        """, (country_code, city_name, city_name, city_name))

        row = cursor.fetchone()

        if not row:
            # Fall back to a prefix match.
            cursor.execute("""
                SELECT geonames_id, name, ascii_name, latitude, longitude,
                       population, feature_code, admin1_code, admin1_name
                FROM cities
                WHERE country_code = ?
                  AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
                  AND (name LIKE ? OR ascii_name LIKE ?)
                ORDER BY population DESC
                LIMIT 1
            """, (country_code, f"{city_name}%", f"{city_name}%"))
            row = cursor.fetchone()

        if row:
            admin1_code = row[7]
            region_code = CZECH_ADMIN1_MAP.get(admin1_code, None)
            return {
                'geonames_id': row[0],
                'geonames_name': row[1],
                'ascii_name': row[2],
                'latitude': row[3],
                'longitude': row[4],
                'population': row[5],
                'feature_code': row[6],
                'admin1_code': admin1_code,
                'admin1_name': row[8],
                'region_code': region_code
            }

        return None

    except Exception as e:
        # Best-effort lookup: log and return None on any DB error.
        print(f" GeoNames lookup error: {e}")
        return None
    finally:
        if conn is not None:
            conn.close()
|
||||
|
||||
|
||||
def process_file(file_path: Path, lookup: Dict, session: requests.Session, dry_run: bool = True) -> Dict:
    """Resolve the XXX city placeholder in one Czech custodian YAML file.

    Resolution order: source lookup by name (exact / lowercase /
    legal-form-normalized), then by ARON UUID, then by Wikidata ID, then a
    fallback that extracts the city from the institution name and validates
    it against GeoNames. On success (unless dry_run) the GHCID and its
    derived IDs are regenerated, history and provenance are appended, the
    file is rewritten, and it is renamed to the new GHCID.

    Args:
        file_path: Path to the custodian YAML file.
        lookup: Indexes from load_czech_source_data().
        session: Requests session; currently unused because the SPARQL
            fallback below is commented out.
        dry_run: When True, report 'would_update' without writing anything.

    Returns:
        A dict with 'status' ('updated', 'would_update', 'unchanged',
        'skipped', 'no_city_found' or 'error'), 'old_ghcid', 'new_ghcid',
        'city', 'error', and — after a successful rename — 'renamed_to'.
    """
    result = {
        'status': 'unchanged',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'error': None
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            result['status'] = 'error'
            result['error'] = 'Empty file'
            return result

        # Check if this is a Czech file with XXX city placeholder
        ghcid_current = data.get('ghcid', {}).get('ghcid_current', '')
        if not ghcid_current.startswith('CZ-') or '-XXX-' not in ghcid_current:
            result['status'] = 'skipped'
            return result

        result['old_ghcid'] = ghcid_current

        # Get institution name for lookup
        inst_name = data.get('original_entry', {}).get('name', '')
        if not inst_name:
            inst_name = data.get('custodian_name', {}).get('claim_value', '')

        # Get identifiers for lookup
        aron_uuid = None
        wikidata_id = None
        for ident in data.get('identifiers', []):
            if isinstance(ident, dict):
                scheme = ident.get('identifier_scheme', '')
                value = ident.get('identifier_value', '')
                if scheme == 'ARON_UUID':
                    aron_uuid = value
                elif scheme == 'Wikidata':
                    wikidata_id = value

        # Also check original_entry.identifiers (top-level values win).
        for ident in data.get('original_entry', {}).get('identifiers', []):
            if isinstance(ident, dict):
                scheme = ident.get('identifier_scheme', '')
                value = ident.get('identifier_value', '')
                if scheme == 'ARON_UUID' and not aron_uuid:
                    aron_uuid = value
                elif scheme == 'Wikidata' and not wikidata_id:
                    wikidata_id = value

        # Try to find location data from source
        location_data = None
        location_source = None

        # Try by name first (exact, lowercase, normalized, normalized lowercase)
        if inst_name:
            location_data = lookup['by_name'].get(inst_name)
            if location_data:
                location_source = 'source_by_name'
            else:
                # Try lowercase
                location_data = lookup['by_name'].get(inst_name.lower())
                if location_data:
                    location_source = 'source_by_name_lower'
                else:
                    # Try normalized
                    normalized = normalize_czech_name(inst_name)
                    if normalized:
                        location_data = lookup['by_name'].get(normalized)
                        if location_data:
                            location_source = 'source_by_normalized_name'
                        else:
                            location_data = lookup['by_name'].get(normalized.lower())
                            if location_data:
                                location_source = 'source_by_normalized_name_lower'

        # Try by ARON UUID
        if not location_data and aron_uuid:
            location_data = lookup['by_aron_uuid'].get(aron_uuid)
            if location_data:
                location_source = 'source_by_aron_uuid'

        # Try by Wikidata
        if not location_data and wikidata_id:
            location_data = lookup['by_wikidata'].get(wikidata_id)
            if location_data:
                location_source = 'source_by_wikidata'

        # Fallback to Wikidata SPARQL (skip for now - too slow)
        # if not location_data and wikidata_id:
        #     time.sleep(REQUEST_DELAY)
        #     location_data = fetch_wikidata_location(wikidata_id, session)
        #     if location_data:
        #         location_source = 'wikidata_sparql'

        # Fallback: extract city from institution name and validate it
        # against GeoNames before trusting it.
        if not location_data or not location_data.get('city'):
            extracted_city = extract_city_from_name(inst_name)
            if extracted_city:
                # Validate against GeoNames
                geonames_data = reverse_geocode_city(extracted_city, 'CZ', GEONAMES_DB)
                if geonames_data:
                    location_data = {
                        'city': geonames_data.get('geonames_name', extracted_city),
                        'region_code': geonames_data.get('region_code'),
                        'geonames_id': geonames_data.get('geonames_id'),
                        'geonames_name': geonames_data.get('geonames_name'),
                        'latitude': geonames_data.get('latitude'),
                        'longitude': geonames_data.get('longitude'),
                    }
                    location_source = 'extracted_from_name'

        if not location_data or not location_data.get('city'):
            result['status'] = 'no_city_found'
            result['error'] = f'No location data for: {inst_name}'
            return result

        city_name = location_data['city']
        result['city'] = city_name

        # Generate city code
        city_code = generate_city_code(city_name)

        # Get region code; a GeoNames lookup fills in missing region and
        # coordinates when the source entry lacked them.
        region_code = location_data.get('region_code')
        if not region_code:
            # Try to get from GeoNames
            geonames_data = reverse_geocode_city(city_name, 'CZ', GEONAMES_DB)
            if geonames_data:
                region_code = geonames_data.get('region_code')
                location_data['geonames_id'] = geonames_data.get('geonames_id')
                location_data['geonames_name'] = geonames_data.get('geonames_name')
                if not location_data.get('latitude'):
                    location_data['latitude'] = geonames_data.get('latitude')
                    location_data['longitude'] = geonames_data.get('longitude')

        # Build new GHCID
        parts = ghcid_current.split('-')
        if len(parts) >= 5:
            # Replace XXX with city code, and update region if we have it
            parts[2] = city_code
            if region_code:
                parts[1] = region_code
            new_ghcid = '-'.join(parts)
        else:
            new_ghcid = ghcid_current.replace('-XXX-', f'-{city_code}-')

        result['new_ghcid'] = new_ghcid

        if new_ghcid == ghcid_current:
            result['status'] = 'unchanged'
            return result

        if dry_run:
            result['status'] = 'would_update'
            return result

        # Update the data
        now = datetime.now(timezone.utc).isoformat()

        # Update GHCID and all IDs derived from it.
        data['ghcid']['ghcid_current'] = new_ghcid
        data['ghcid']['ghcid_uuid'] = generate_ghcid_uuid(new_ghcid)
        data['ghcid']['ghcid_uuid_sha256'] = generate_ghcid_uuid_sha256(new_ghcid)
        data['ghcid']['ghcid_numeric'] = generate_ghcid_numeric(new_ghcid)

        # Update location_resolution
        location_resolution = {
            'method': 'CZECH_CH_ANNOTATOR_ENRICHMENT',
            'city_name': city_name,
            'city_code': city_code,
            'country_code': 'CZ',
            'enrichment_date': now,
            'source': location_source
        }

        if region_code:
            location_resolution['region_code'] = region_code
            location_resolution['region_name'] = location_data.get('region', f'CZ-{region_code}')

        if location_data.get('geonames_id'):
            location_resolution['geonames_id'] = location_data['geonames_id']
            location_resolution['geonames_name'] = location_data['geonames_name']

        if location_data.get('latitude'):
            location_resolution['latitude'] = location_data['latitude']
            location_resolution['longitude'] = location_data['longitude']

        data['ghcid']['location_resolution'] = location_resolution

        # Add GHCID history entry (newest first: close the previous entry,
        # then insert the new one at the head).
        history = data['ghcid'].get('ghcid_history', [])
        if history and isinstance(history, list) and len(history) > 0:
            # Close previous entry
            if isinstance(history[0], dict):
                history[0]['valid_to'] = now

        history.insert(0, {
            'ghcid': new_ghcid,
            'ghcid_numeric': data['ghcid']['ghcid_numeric'],
            'valid_from': now,
            'valid_to': None,
            'reason': f'City code updated from Czech CH-Annotator enrichment: {city_name} -> {city_code}'
        })
        data['ghcid']['ghcid_history'] = history

        # Update location in original_entry if exists
        if 'original_entry' in data:
            if 'locations' not in data['original_entry'] or not data['original_entry']['locations']:
                data['original_entry']['locations'] = [{}]
            for loc in data['original_entry']['locations']:
                if isinstance(loc, dict):
                    loc['city'] = city_name
                    if location_data.get('postal_code'):
                        loc['postal_code'] = location_data['postal_code']
                    if location_data.get('street_address'):
                        loc['street_address'] = location_data['street_address']
                    if location_data.get('latitude'):
                        loc['latitude'] = location_data['latitude']
                        loc['longitude'] = location_data['longitude']
                    if region_code:
                        loc['region'] = location_data.get('region', f'CZ-{region_code}')

        # Update identifiers
        for ident in data.get('identifiers', []):
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
                ident['identifier_value'] = new_ghcid

        # Add provenance note (coerce notes to a list first — the field may
        # be a plain string or missing in older files).
        notes = data.get('provenance', {}).get('notes', [])
        if isinstance(notes, str):
            notes = [notes]
        if not isinstance(notes, list):
            notes = []
        notes.append(f'City resolved {now[:19]}Z: {city_name} -> {city_code} via {location_source}')
        data['provenance'] = data.get('provenance', {})
        data['provenance']['notes'] = notes

        # Write updated file
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Rename file if GHCID changed (silently kept in place if the
        # target filename already exists).
        new_filename = f"{new_ghcid}.yaml"
        new_path = file_path.parent / new_filename

        if new_path != file_path and not new_path.exists():
            shutil.move(file_path, new_path)
            result['renamed_to'] = str(new_path.name)

        result['status'] = 'updated'
        return result

    except Exception as e:
        # Any unexpected failure is reported per-file rather than aborting
        # the whole batch.
        result['status'] = 'error'
        result['error'] = str(e)
        import traceback
        traceback.print_exc()
        return result
|
||||
|
||||
|
||||
def main():
    """CLI entry point: enrich all CZ-*-XXX-* custodian files with city data.

    Loads the CH-Annotator lookup tables, processes each placeholder file
    via process_file(), prints a console summary, and writes a timestamped
    Markdown report to the reports/ directory.
    """
    parser = argparse.ArgumentParser(description='Enrich Czech custodian files with city data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show verbose output')
    args = parser.parse_args()

    print("=" * 60)
    print("CZECH CITY ENRICHMENT")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")

    # Find Czech files with XXX city placeholder
    czech_xxx_files = list(CUSTODIAN_DIR.glob("CZ-*-XXX-*.yaml"))

    if args.limit:
        czech_xxx_files = czech_xxx_files[:args.limit]
        print(f"Limited to {args.limit} files")

    print(f"Found {len(czech_xxx_files)} Czech files with XXX city placeholder")
    print()

    # Load Czech source data
    lookup = load_czech_source_data()

    # Process files. The session is passed through for the (currently
    # disabled) Wikidata SPARQL fallback in process_file().
    session = requests.Session()
    session.headers['User-Agent'] = 'GLAMDataExtractor/1.0 (heritage-data-enrichment)'

    stats = {
        'updated': 0,
        'would_update': 0,
        'unchanged': 0,
        'skipped': 0,
        'no_city_found': 0,
        'error': 0
    }

    cities_found = {}
    errors = []

    for i, file_path in enumerate(czech_xxx_files, 1):
        # Progress heartbeat every 100 files unless verbose.
        if i % 100 == 0 or args.verbose:
            print(f"Progress: {i}/{len(czech_xxx_files)}")

        result = process_file(file_path, lookup, session, dry_run=args.dry_run)
        stats[result['status']] = stats.get(result['status'], 0) + 1

        if result.get('city'):
            cities_found[result['city']] = cities_found.get(result['city'], 0) + 1

        if result.get('error'):
            errors.append(f"{file_path.name}: {result['error']}")

        if args.verbose and result['status'] in ('updated', 'would_update'):
            print(f" {file_path.name}")
            print(f" City: {result.get('city')}")
            print(f" {result['old_ghcid']} -> {result['new_ghcid']}")

    # Print summary
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {len(czech_xxx_files)}")
    print()
    print("Results:")
    for status, count in sorted(stats.items()):
        if count > 0:
            print(f" {status}: {count}")

    if cities_found:
        print()
        print(f"Cities found: {len(cities_found)} unique")
        print("Top 10 cities:")
        for city, count in sorted(cities_found.items(), key=lambda x: -x[1])[:10]:
            print(f" {city}: {count}")

    if errors:
        print()
        print(f"Errors ({len(errors)}):")
        for err in errors[:10]:
            print(f" {err}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")

    # Save report
    REPORTS_DIR.mkdir(exist_ok=True)
    report_file = REPORTS_DIR / f"CZECH_CITY_ENRICHMENT_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"

    with open(report_file, 'w') as f:
        f.write("# Czech City Enrichment Report\n\n")
        f.write(f"**Date**: {datetime.now().isoformat()}\n")
        f.write(f"**Mode**: {'Dry Run' if args.dry_run else 'Live'}\n\n")
        f.write("## Summary\n\n")
        f.write(f"- Total files processed: {len(czech_xxx_files)}\n")
        for status, count in sorted(stats.items()):
            if count > 0:
                f.write(f"- {status}: {count}\n")

        if cities_found:
            f.write(f"\n## Cities Found ({len(cities_found)} unique)\n\n")
            for city, count in sorted(cities_found.items(), key=lambda x: -x[1]):
                f.write(f"- {city}: {count}\n")

    print()
    print(f"Report saved to: {report_file}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
449
scripts/enrich_czech_cities_fast.py
Normal file
449
scripts/enrich_czech_cities_fast.py
Normal file
|
|
@ -0,0 +1,449 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fast Czech city enrichment - extracts cities from institution names.
|
||||
|
||||
This is a simplified script that:
|
||||
1. Extracts city names from Czech institution name patterns (v/ve + City)
|
||||
2. Converts from Czech locative case to nominative
|
||||
3. Validates against GeoNames
|
||||
4. Updates custodian files with city codes
|
||||
|
||||
Usage:
|
||||
python scripts/enrich_czech_cities_fast.py [--dry-run] [--limit N]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sqlite3
|
||||
import uuid
|
||||
import yaml
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
|
||||
|
||||
# Paths (resolved relative to the repository root, so the script can be
# launched from any working directory)
PROJECT_ROOT = Path(__file__).parent.parent
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
REPORTS_DIR = PROJECT_ROOT / "reports"

# GHCID namespace for UUID generation.
# NOTE: this literal is the standard RFC 4122 DNS namespace
# (uuid.NAMESPACE_DNS), so GHCID UUIDs are uuid5(NAMESPACE_DNS, ghcid).
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
|
||||
|
||||
# Czech region mapping (GeoNames admin1 to ISO 3166-2:CZ).
# Values are the legacy two-letter ISO 3166-2:CZ region codes (e.g. PR =
# Praha); keys are assumed to be GeoNames admin1 codes for CZ — TODO
# confirm against the admin1 values actually stored in geonames.db.
CZECH_ADMIN1_MAP = {
    '52': 'JC', '78': 'JM', '81': 'KA', '82': 'VY', '51': 'KR',
    '53': 'LI', '84': 'MO', '85': 'OL', '86': 'PA', '54': 'PL',
    '10': 'PR', '55': 'ST', '56': 'US', '87': 'ZL',
}
|
||||
|
||||
# Czech locative to nominative mappings.
# Fix: the original literal defined 'Prostějově' and 'Klatovech' twice;
# duplicate keys in a dict literal silently overwrite and hide data
# errors, so each key now appears exactly once.
LOCATIVE_TO_NOMINATIVE = {
    # Major cities
    'Praze': 'Praha',
    'Brně': 'Brno',
    'Ostravě': 'Ostrava',
    'Plzni': 'Plzeň',
    'Olomouci': 'Olomouc',
    'Liberci': 'Liberec',
    'Opavě': 'Opava',
    'Hradci Králové': 'Hradec Králové',
    'Českých Budějovicích': 'České Budějovice',
    'Pardubicích': 'Pardubice',
    'Zlíně': 'Zlín',
    'Kladně': 'Kladno',
    'Havlíčkově Brodě': 'Havlíčkův Brod',

    # Medium cities
    'Prostějově': 'Prostějov',
    'Domažlicích': 'Domažlice',
    'Litoměřicích': 'Litoměřice',
    'Klatovech': 'Klatovy',
    'Kopřivnici': 'Kopřivnice',
    'Pacově': 'Pacov',
    'Táboře': 'Tábor',
    'Písku': 'Písek',
    'Trutnově': 'Trutnov',
    'Chebu': 'Cheb',
    'Karviné': 'Karviná',
    'Havířově': 'Havířov',
    'Mostě': 'Most',
    'Chomutově': 'Chomutov',
    'Teplicích': 'Teplice',
    'Děčíně': 'Děčín',
    'Jablonci nad Nisou': 'Jablonec nad Nisou',
    'Mladé Boleslavi': 'Mladá Boleslav',
    'Příbrami': 'Příbram',
    'Kolíně': 'Kolín',
    'Jihlavě': 'Jihlava',
    'Třebíči': 'Třebíč',
    'Znojmě': 'Znojmo',
    'Břeclavi': 'Břeclav',
    'Hodoníně': 'Hodonín',
    'Vyškově': 'Vyškov',
    'Kroměříži': 'Kroměříž',
    'Vsetíně': 'Vsetín',
    'Frýdku-Místku': 'Frýdek-Místek',
    'Novém Jičíně': 'Nový Jičín',
    'Šumperku': 'Šumperk',
    'Přerově': 'Přerov',
    'Uherském Hradišti': 'Uherské Hradiště',
    'Svitavách': 'Svitavy',
    'Chrudimi': 'Chrudim',
    'Ústí nad Orlicí': 'Ústí nad Orlicí',
    'Náchodě': 'Náchod',
    'Rychnově nad Kněžnou': 'Rychnov nad Kněžnou',
    'Semilech': 'Semily',
    'Jičíně': 'Jičín',
    'České Lípě': 'Česká Lípa',
    'Lounech': 'Louny',
    'Rakovníku': 'Rakovník',
    'Berouně': 'Beroun',
    'Benešově': 'Benešov',
    'Kutné Hoře': 'Kutná Hora',
    'Nymburce': 'Nymburk',
    'Mělníku': 'Mělník',
    'Sokolově': 'Sokolov',
    'Rokycanech': 'Rokycany',
    'Strakonicích': 'Strakonice',
    'Českém Krumlově': 'Český Krumlov',
    'Jindřichově Hradci': 'Jindřichův Hradec',
    'Pelhřimově': 'Pelhřimov',
    'Žďáru nad Sázavou': 'Žďár nad Sázavou',

    # Compound patterns with "nad" (river names, locative -> nominative)
    'Metují': 'Metuje',  # Nové Město nad Metují
    'Nisou': 'Nisa',
    'Labem': 'Labe',
    'Sázavou': 'Sázava',
    'Kněžnou': 'Kněžná',
    'Orlicí': 'Orlice',
}
|
||||
|
||||
|
||||
def convert_locative_to_nominative(city: str) -> str:
    """Map a Czech city name from locative to nominative case.

    Tries an exact lookup first, then a case-insensitive scan of the
    mapping table, and finally returns the input unchanged when no
    mapping is known.
    """
    exact = LOCATIVE_TO_NOMINATIVE.get(city)
    if exact is not None:
        return exact

    folded = city.lower()
    return next(
        (nom for loc, nom in LOCATIVE_TO_NOMINATIVE.items() if loc.lower() == folded),
        city,
    )
|
||||
|
||||
|
||||
def extract_city_from_name(name: str) -> Optional[str]:
    """Extract a city name from a Czech institution name.

    Matches the prepositions "v"/"ve" ("in") followed by a capitalised
    place name in the locative case (e.g. "Knihovna v Praze" -> "Praze")
    and converts it to the nominative form ("Praha").

    Args:
        name: Institution name, possibly empty.

    Returns:
        The nominative city name, or None when no pattern matches.

    Fix: the original kept two near-identical 200-character patterns
    differing only in "v" vs "ve"; they are merged via an optional "e".
    The earliest "v "/"ve " occurrence now wins (previously "v" matches
    anywhere in the string took precedence over earlier "ve" matches).
    """
    if not name:
        return None

    pattern = (
        r'\bve?\s+'
        r'([A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ][a-záčďéěíňóřšťúůýž]+'
        r'(?:\s+(?:nad|pod)?\s*[A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ][a-záčďéěíňóřšťúůýž]+)*)'
    )

    match = re.search(pattern, name)
    if match:
        return convert_locative_to_nominative(match.group(1))

    return None
|
||||
|
||||
|
||||
def generate_city_code(city_name: str) -> str:
    """Generate a 3-letter uppercase city code from a city name.

    Diacritics are stripped (NFD normalisation, combining marks
    removed), short Czech prepositions are ignored, and the code is
    built from the first three letters of a single significant word or
    the initials of up to three words.

    Fix: whitespace-only input previously fell through and returned the
    empty string; it now returns the 'XXX' placeholder like empty input.
    """
    if not city_name or not city_name.strip():
        return 'XXX'

    import unicodedata
    normalized = unicodedata.normalize('NFD', city_name)
    # Dropping category 'Mn' (non-spacing marks) removes the diacritics
    # exposed by NFD decomposition.
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')

    # Prepositions carry no identifying information for the code.
    skip_words = {'nad', 'pod', 'u', 'v', 've', 'na', 'do', 'z', 'ze', 'k', 'ke'}
    words = ascii_name.split()
    significant_words = [w for w in words if w.lower() not in skip_words]

    if not significant_words:
        significant_words = words

    if len(significant_words) == 1:
        return significant_words[0][:3].upper()
    return ''.join(w[0] for w in significant_words[:3]).upper()
|
||||
|
||||
|
||||
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Derive the deterministic name-based (v5) UUID for a GHCID string."""
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
|
||||
|
||||
|
||||
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Build a deterministic UUID from the SHA-256 digest of a GHCID.

    The first 16 digest bytes are stamped with version 8 (the custom
    version: high nibble of byte 6 set to 0x8) and the RFC 4122 variant
    bits (top bits of byte 8 set to 10) before formatting.
    """
    raw = bytearray(hashlib.sha256(ghcid_string.encode('utf-8')).digest()[:16])
    raw[6] = (raw[6] & 0x0F) | 0x80  # version nibble -> 8
    raw[8] = (raw[8] & 0x3F) | 0x80  # RFC 4122 variant
    return str(uuid.UUID(bytes=bytes(raw)))
|
||||
|
||||
|
||||
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Fold a GHCID string into a stable unsigned 64-bit integer.

    Uses the big-endian value of the first 8 bytes of the SHA-256
    digest, so the result is deterministic across runs and machines.
    """
    return int.from_bytes(hashlib.sha256(ghcid_string.encode('utf-8')).digest()[:8], 'big')
|
||||
|
||||
|
||||
def lookup_city_geonames(city_name: str, db_path: Path) -> Optional[Dict]:
    """Look up a Czech city in the local GeoNames SQLite database.

    Tries an exact (and case-insensitive) name match first, then falls
    back to a prefix match; the most populous candidate wins.

    Args:
        city_name: Nominative city name to look up.
        db_path: Path to the geonames.db SQLite file.

    Returns:
        A dict with id, names, coordinates, population, feature code and
        the mapped region code, or None when the city is unknown or the
        query fails (best-effort: errors are printed, not raised).

    Fix: the connection previously leaked when a query raised; it is now
    closed in a try/finally regardless of query outcome.
    """
    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()

            # Try exact match
            cursor.execute("""
                SELECT geonames_id, name, ascii_name, latitude, longitude,
                       population, feature_code, admin1_code
                FROM cities
                WHERE country_code = 'CZ'
                  AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC')
                  AND (name = ? OR ascii_name = ? OR LOWER(name) = LOWER(?))
                ORDER BY population DESC
                LIMIT 1
            """, (city_name, city_name, city_name))

            row = cursor.fetchone()

            if not row:
                # Try prefix match
                cursor.execute("""
                    SELECT geonames_id, name, ascii_name, latitude, longitude,
                           population, feature_code, admin1_code
                    FROM cities
                    WHERE country_code = 'CZ'
                      AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC')
                      AND (name LIKE ? OR ascii_name LIKE ?)
                    ORDER BY population DESC
                    LIMIT 1
                """, (f"{city_name}%", f"{city_name}%"))
                row = cursor.fetchone()
        finally:
            conn.close()

        if row:
            admin1_code = row[7]
            return {
                'geonames_id': row[0],
                'geonames_name': row[1],
                'ascii_name': row[2],
                'latitude': row[3],
                'longitude': row[4],
                'population': row[5],
                'feature_code': row[6],
                'admin1_code': admin1_code,
                'region_code': CZECH_ADMIN1_MAP.get(admin1_code),
            }

        return None

    except Exception as e:
        # Best-effort lookup: log and treat any failure as "not found".
        print(f"  GeoNames error: {e}")
        return None
|
||||
|
||||
|
||||
def process_file(file_path: Path, dry_run: bool = True) -> Dict:
    """Resolve the XXX city placeholder in a single CZ custodian YAML file.

    Extracts the city from the institution name, validates it against
    GeoNames, rebuilds the GHCID with the resolved region/city codes,
    rewrites the file in place and renames it to the new GHCID.

    Args:
        file_path: Path to a CZ-*-XXX-*.yaml custodian file.
        dry_run: When True, compute the change but do not write/rename.

    Returns:
        Result dict with 'status' ('unchanged', 'skipped', 'updated',
        'would_update', 'no_city_in_name', 'city_not_in_geonames' or
        'error'), plus 'old_ghcid', 'new_ghcid', 'city' and 'error'.
    """
    result = {'status': 'unchanged', 'old_ghcid': None, 'new_ghcid': None, 'city': None, 'error': None}

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            result['status'] = 'error'
            result['error'] = 'Empty file'
            return result

        # Only Czech records still carrying the XXX city placeholder.
        ghcid_current = data.get('ghcid', {}).get('ghcid_current', '')
        if not ghcid_current.startswith('CZ-') or '-XXX-' not in ghcid_current:
            result['status'] = 'skipped'
            return result

        result['old_ghcid'] = ghcid_current

        # Get institution name (original entry preferred over claim)
        inst_name = data.get('original_entry', {}).get('name', '')
        if not inst_name:
            inst_name = data.get('custodian_name', {}).get('claim_value', '')

        # Try to extract city from name
        extracted_city = extract_city_from_name(inst_name)
        if not extracted_city:
            result['status'] = 'no_city_in_name'
            return result

        # Validate against GeoNames
        geonames_data = lookup_city_geonames(extracted_city, GEONAMES_DB)
        if not geonames_data:
            result['status'] = 'city_not_in_geonames'
            result['error'] = f'City not found in GeoNames: {extracted_city}'
            return result

        city_name = geonames_data['geonames_name']
        city_code = generate_city_code(city_name)
        region_code = geonames_data.get('region_code')

        result['city'] = city_name

        # Build new GHCID: slot 2 is the city code, slot 1 the region.
        parts = ghcid_current.split('-')
        if len(parts) >= 5:
            parts[2] = city_code
            if region_code:
                parts[1] = region_code
            new_ghcid = '-'.join(parts)
        else:
            # Malformed GHCID: fall back to a plain placeholder swap.
            new_ghcid = ghcid_current.replace('-XXX-', f'-{city_code}-')

        result['new_ghcid'] = new_ghcid

        if new_ghcid == ghcid_current:
            result['status'] = 'unchanged'
            return result

        if dry_run:
            result['status'] = 'would_update'
            return result

        # Update the data: the GHCID changed, so all derived identifiers
        # (UUIDs, numeric hash) must be regenerated in lockstep.
        now = datetime.now(timezone.utc).isoformat()

        data['ghcid']['ghcid_current'] = new_ghcid
        data['ghcid']['ghcid_uuid'] = generate_ghcid_uuid(new_ghcid)
        data['ghcid']['ghcid_uuid_sha256'] = generate_ghcid_uuid_sha256(new_ghcid)
        data['ghcid']['ghcid_numeric'] = generate_ghcid_numeric(new_ghcid)

        data['ghcid']['location_resolution'] = {
            'method': 'EXTRACTED_FROM_NAME',
            'city_name': city_name,
            'city_code': city_code,
            'region_code': region_code,
            'country_code': 'CZ',
            'enrichment_date': now,
            'geonames_id': geonames_data['geonames_id'],
            'geonames_name': geonames_data['geonames_name'],
            'latitude': geonames_data['latitude'],
            'longitude': geonames_data['longitude'],
        }

        # Add history entry: close out the previous head entry (if any)
        # and prepend the new GHCID as the current one.
        history = data['ghcid'].get('ghcid_history', [])
        if history and isinstance(history[0], dict):
            history[0]['valid_to'] = now
        history.insert(0, {
            'ghcid': new_ghcid,
            'ghcid_numeric': data['ghcid']['ghcid_numeric'],
            'valid_from': now,
            'reason': f'City extracted from name: {city_name} -> {city_code}'
        })
        data['ghcid']['ghcid_history'] = history

        # Update identifiers
        for ident in data.get('identifiers', []):
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
                ident['identifier_value'] = new_ghcid

        # Write updated file
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Rename file to match the new GHCID; an existing target is left
        # untouched (rename is silently skipped in that case).
        new_filename = f"{new_ghcid}.yaml"
        new_path = file_path.parent / new_filename
        if new_path != file_path and not new_path.exists():
            shutil.move(file_path, new_path)
            result['renamed_to'] = str(new_path.name)

        result['status'] = 'updated'
        return result

    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
        return result
|
||||
|
||||
|
||||
def main():
    """CLI entry point: enrich all CZ-*-XXX-* custodian files.

    Processes every Czech custodian file with an unresolved city code,
    prints a summary of statuses and discovered cities, and writes a
    timestamped Markdown report to REPORTS_DIR.
    """
    parser = argparse.ArgumentParser(description='Fast Czech city enrichment from names')
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int)
    parser.add_argument('--verbose', '-v', action='store_true')
    args = parser.parse_args()

    print("=" * 60)
    print("CZECH CITY ENRICHMENT (Fast Mode)")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN MODE")

    czech_xxx_files = list(CUSTODIAN_DIR.glob("CZ-*-XXX-*.yaml"))
    if args.limit:
        czech_xxx_files = czech_xxx_files[:args.limit]

    print(f"Found {len(czech_xxx_files)} Czech files with XXX placeholder")

    stats = {}          # status -> count
    cities_found = {}   # city name -> count

    for i, file_path in enumerate(czech_xxx_files, 1):
        if i % 50 == 0:
            print(f"Progress: {i}/{len(czech_xxx_files)}")

        result = process_file(file_path, dry_run=args.dry_run)
        stats[result['status']] = stats.get(result['status'], 0) + 1

        if result.get('city'):
            cities_found[result['city']] = cities_found.get(result['city'], 0) + 1

        if args.verbose and result['status'] in ('updated', 'would_update'):
            print(f" {result['old_ghcid']} -> {result['new_ghcid']} ({result['city']})")

    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total processed: {len(czech_xxx_files)}")
    for status, count in sorted(stats.items()):
        if count > 0:
            print(f" {status}: {count}")

    if cities_found:
        print(f"\nCities found: {len(cities_found)} unique")
        print("Top 10:")
        # Sort by descending count.
        for city, count in sorted(cities_found.items(), key=lambda x: -x[1])[:10]:
            print(f" {city}: {count}")

    # Save report
    REPORTS_DIR.mkdir(exist_ok=True)
    report_file = REPORTS_DIR / f"CZECH_CITY_FAST_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
    with open(report_file, 'w') as f:
        f.write(f"# Czech City Enrichment (Fast Mode)\n\n")
        f.write(f"**Date**: {datetime.now().isoformat()}\n")
        f.write(f"**Mode**: {'Dry Run' if args.dry_run else 'Live'}\n\n")
        f.write(f"## Results\n")
        for status, count in sorted(stats.items()):
            f.write(f"- {status}: {count}\n")

    print(f"\nReport: {report_file}")
|
||||
480
scripts/enrich_japanese_cities.py
Executable file
480
scripts/enrich_japanese_cities.py
Executable file
|
|
@ -0,0 +1,480 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Enrich Japanese custodian files with city/region data using Google Places API.
|
||||
|
||||
This script:
|
||||
1. Finds Japanese XXX files (no city/region resolved)
|
||||
2. Uses Google Places API to search for each institution
|
||||
3. Extracts location data (city, prefecture, coordinates)
|
||||
4. Updates GHCID with proper region/city codes
|
||||
5. Adds Google Maps enrichment data
|
||||
|
||||
Usage:
|
||||
python scripts/enrich_japanese_cities.py [--dry-run] [--limit N]
|
||||
|
||||
Environment Variables:
|
||||
GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API enabled
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import sqlite3
|
||||
import re
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
import httpx
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
# Configuration
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
# Fix: resolve data paths relative to the repository root instead of a
# hard-coded absolute user path, matching the other enrichment scripts
# (e.g. enrich_czech_cities_fast.py) so the script is portable.
PROJECT_ROOT = Path(__file__).parent.parent
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"

# Google Places API
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
REQUEST_DELAY = 0.3  # Rate limiting (seconds between API requests)
|
||||
|
||||
# Japanese prefecture mapping: GeoNames admin1_code -> two-letter
# prefecture code used in GHCID region slots.
# NOTE(review): the original header called these "ISO 3166-2:JP" codes,
# but ISO 3166-2:JP uses numeric codes (JP-01..JP-47); these two-letter
# values appear to be a project convention — confirm against the GHCID
# spec. Keys are assumed to match the admin1 codes in geonames.db.
ADMIN1_TO_ISO = {
    '01': 'AI',  # Aichi
    '02': 'AK',  # Akita
    '03': 'AO',  # Aomori
    '04': 'CH',  # Chiba
    '05': 'EH',  # Ehime
    '06': 'FI',  # Fukui
    '07': 'FO',  # Fukuoka
    '08': 'FS',  # Fukushima
    '09': 'GI',  # Gifu
    '10': 'GU',  # Gunma
    '11': 'HS',  # Hiroshima
    '12': 'HO',  # Hokkaido
    '13': 'HG',  # Hyogo
    '14': 'IB',  # Ibaraki
    '15': 'IS',  # Ishikawa
    '16': 'IW',  # Iwate
    '17': 'KA',  # Kagawa
    '18': 'KS',  # Kagoshima
    '19': 'KN',  # Kanagawa
    '20': 'KC',  # Kochi
    '21': 'KM',  # Kumamoto
    '22': 'KY',  # Kyoto
    '23': 'ME',  # Mie
    '24': 'MG',  # Miyagi
    '25': 'MZ',  # Miyazaki
    '26': 'NN',  # Nagano
    '27': 'NS',  # Nagasaki
    '28': 'NR',  # Nara
    '29': 'NI',  # Niigata
    '30': 'OT',  # Oita
    '31': 'OK',  # Okayama
    '32': 'OS',  # Osaka
    '33': 'SG',  # Saga
    '34': 'ST',  # Saitama
    '35': 'SI',  # Shiga
    '36': 'SM',  # Shimane
    '37': 'SZ',  # Shizuoka
    '38': 'TC',  # Tochigi
    '39': 'TS',  # Tokushima
    '40': 'TK',  # Tokyo
    '41': 'TT',  # Tottori
    '42': 'TY',  # Toyama
    '43': 'WK',  # Wakayama
    '44': 'YG',  # Yamagata
    '45': 'YM',  # Yamaguchi
    '46': 'YN',  # Yamanashi
    '47': 'ON',  # Okinawa
}
|
||||
|
||||
# Reverse mapping for lookup by prefecture name (as returned in Google
# address components); includes common suffixed spellings.
# Fix: the alternative-spellings section redundantly redefined
# 'Hokkaido' (already mapped above) — duplicate dict key removed.
PREFECTURE_TO_ISO = {
    'Aichi': 'AI', 'Akita': 'AK', 'Aomori': 'AO', 'Chiba': 'CH', 'Ehime': 'EH',
    'Fukui': 'FI', 'Fukuoka': 'FO', 'Fukushima': 'FS', 'Gifu': 'GI', 'Gunma': 'GU',
    'Hiroshima': 'HS', 'Hokkaido': 'HO', 'Hyogo': 'HG', 'Hyōgo': 'HG',
    'Ibaraki': 'IB', 'Ishikawa': 'IS', 'Iwate': 'IW', 'Kagawa': 'KA',
    'Kagoshima': 'KS', 'Kanagawa': 'KN', 'Kochi': 'KC', 'Kumamoto': 'KM',
    'Kyoto': 'KY', 'Mie': 'ME', 'Miyagi': 'MG', 'Miyazaki': 'MZ',
    'Nagano': 'NN', 'Nagasaki': 'NS', 'Nara': 'NR', 'Niigata': 'NI',
    'Oita': 'OT', 'Okayama': 'OK', 'Osaka': 'OS', 'Saga': 'SG',
    'Saitama': 'ST', 'Shiga': 'SI', 'Shimane': 'SM', 'Shizuoka': 'SZ',
    'Tochigi': 'TC', 'Tokushima': 'TS', 'Tokyo': 'TK', 'Tottori': 'TT',
    'Toyama': 'TY', 'Wakayama': 'WK', 'Yamagata': 'YG', 'Yamaguchi': 'YM',
    'Yamanashi': 'YN', 'Okinawa': 'ON',
    # Alternative spellings from address strings
    'Tokyo To': 'TK', 'Osaka Fu': 'OS', 'Kyoto Fu': 'KY',
    'Aichi Ken': 'AI', 'Hyogo Ken': 'HG', 'Kanagawa Ken': 'KN',
}
|
||||
|
||||
|
||||
def get_city_code(city_name: str) -> str:
    """Derive a 3-letter uppercase city code from a (romanised) city name.

    Common Japanese administrative suffixes (Shi, Ku, Cho, ...) are
    stripped first; then a single word contributes its first three
    letters, two words contribute 1+2 initials, and three or more words
    contribute one initial each.
    """
    base = city_name.strip()
    # Drop administrative suffixes before abbreviating.
    for tail in (' Shi', ' Ku', ' Cho', ' Machi', ' Mura', ' Gun', ' City', '-shi', '-ku'):
        if base.endswith(tail):
            base = base[:-len(tail)]

    parts = base.split()
    if len(parts) == 1:
        code = base[:3]
    elif len(parts) == 2:
        code = parts[0][0] + parts[1][:2]
    else:
        code = ''.join(p[0] for p in parts[:3])
    return code.upper()
|
||||
|
||||
|
||||
def search_google_places(query: str, api_key: str, country_bias: str = "JP") -> Optional[dict]:
    """Search the Google Places (New) Text Search API for a location.

    Args:
        query: Free-text search string.
        api_key: Google Cloud API key with the Places API enabled.
        country_bias: CLDR region code used to bias results.

    Returns:
        The first matching place dict, or None on no match or request
        failure (errors are printed, not raised — best-effort lookup).

    Fix: ``country_bias`` was previously accepted but never used; it is
    now sent as the request's ``regionCode`` so results are biased
    toward the intended country.
    """
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": api_key,
        "X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.location,places.addressComponents,places.types,places.id,places.websiteUri"
    }

    payload = {
        "textQuery": query,
        "languageCode": "en",
        "regionCode": country_bias,
    }

    try:
        response = httpx.post(TEXT_SEARCH_URL, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()

        if "places" in data and len(data["places"]) > 0:
            return data["places"][0]
        return None
    except Exception as e:
        print(f" Error searching Google Places: {e}")
        return None
|
||||
|
||||
|
||||
def extract_location_from_google(place: dict) -> dict:
    """Flatten a Google Places result into a simple location dict.

    Returns a dict with keys city, prefecture, prefecture_code,
    latitude, longitude, formatted_address, place_id and website; any
    field that cannot be determined is left as None. A falsy ``place``
    yields the all-None dict.
    """
    info = dict.fromkeys((
        'city', 'prefecture', 'prefecture_code', 'latitude', 'longitude',
        'formatted_address', 'place_id', 'website',
    ))

    if not place:
        return info

    info['place_id'] = place.get('id')
    info['formatted_address'] = place.get('formattedAddress')
    info['website'] = place.get('websiteUri')

    # Coordinates
    coords = place.get('location', {})
    info['latitude'] = coords.get('latitude')
    info['longitude'] = coords.get('longitude')

    # Walk the address components: locality wins as city, the level-1
    # admin area is the prefecture, and a ward (sublocality) stands in
    # for the city only when no locality was seen.
    for component in place.get('addressComponents', []):
        kinds = component.get('types', [])
        text = component.get('longText', '')

        if 'locality' in kinds:
            info['city'] = text
        elif 'administrative_area_level_1' in kinds:
            info['prefecture'] = text
            info['prefecture_code'] = PREFECTURE_TO_ISO.get(text)
        elif 'sublocality_level_1' in kinds and not info['city']:
            info['city'] = text

    return info
|
||||
|
||||
|
||||
def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float) -> Optional[dict]:
    """Find the GeoNames populated place in Japan nearest to (lat, lon).

    Nearest-neighbour is approximated by squared degree distance with no
    latitude correction — presumably adequate at Japanese latitudes for
    choosing the closest city (TODO confirm if reused elsewhere).
    Returns a dict of city attributes, or None if the table has no
    matching rows.
    """
    query = """
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code,
               ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as dist_sq
        FROM cities
        WHERE country_code = 'JP'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY dist_sq
        LIMIT 1
    """
    record = conn.execute(query, (lat, lat, lon, lon)).fetchone()
    if record is None:
        return None

    keys = ('name', 'ascii_name', 'admin1_code', 'admin1_name', 'geonames_id',
            'latitude', 'longitude', 'population', 'feature_code')
    return dict(zip(keys, record))
|
||||
|
||||
|
||||
def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str, dry_run: bool = False) -> dict:
    """Resolve the XX/XXX placeholders in one JP custodian YAML file.

    Searches Google Places for the institution, reverse-geocodes the
    returned coordinates against GeoNames, rebuilds the GHCID with the
    resolved prefecture/city codes, rewrites the file and renames it.

    Args:
        filepath: Path to a JP-XX-XXX-*.yaml custodian file.
        conn: Open SQLite connection to geonames.db.
        api_key: Google Places API key.
        dry_run: When True, compute the change but do not write/rename.

    Returns:
        Result dict with 'status' ('updated', 'would_update', 'skipped',
        'collision' or 'error') plus old/new GHCIDs, city, prefecture
        and an error message where applicable.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'prefecture': None,
        'error': None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['status'] = 'error'
        result['error'] = f'Failed to load YAML: {e}'
        return result

    if not data:
        result['status'] = 'error'
        result['error'] = 'Empty YAML file'
        return result

    # Get current GHCID
    ghcid_data = data.get('ghcid', {})
    old_ghcid = ghcid_data.get('ghcid_current', '')
    result['old_ghcid'] = old_ghcid

    # Only files whose region AND city are still unresolved placeholders.
    if not old_ghcid.startswith('JP-XX-XXX-'):
        result['status'] = 'skipped'
        result['error'] = 'Not a JP-XX-XXX file'
        return result

    # Get institution name for search (claim preferred over original entry)
    name = data.get('custodian_name', {}).get('claim_value', '')
    if not name:
        name = data.get('original_entry', {}).get('name', '')

    if not name:
        result['status'] = 'error'
        result['error'] = 'No institution name found'
        return result

    # Search Google Places (rate-limited between calls)
    print(f" Searching: {name[:50]}...")
    place = search_google_places(f"{name} Japan", api_key)
    time.sleep(REQUEST_DELAY)

    if not place:
        result['status'] = 'error'
        result['error'] = 'Not found in Google Places'
        return result

    # Extract location
    location_info = extract_location_from_google(place)

    if not location_info['latitude'] or not location_info['longitude']:
        result['status'] = 'error'
        result['error'] = 'No coordinates from Google'
        return result

    # Lookup in GeoNames for city code
    city_info = lookup_city_geonames(conn, location_info['latitude'], location_info['longitude'])

    if not city_info:
        result['status'] = 'error'
        result['error'] = 'City not found in GeoNames'
        return result

    # Determine region code (GeoNames admin1 first, Google address as fallback)
    admin1_code = city_info['admin1_code']
    region_code = ADMIN1_TO_ISO.get(admin1_code, 'XX')

    if region_code == 'XX':
        # Try from Google address
        region_code = location_info.get('prefecture_code', 'XX')

    # Generate city code
    city_code = get_city_code(city_info['ascii_name'])

    result['city'] = city_info['ascii_name']
    result['prefecture'] = city_info['admin1_name']

    # Build new GHCID: JP-<region>-<city>-<type>-<abbrev>; the abbreviation
    # may itself contain dashes, so it is re-joined from the tail parts.
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        inst_type = parts[3]
        abbreviation = '-'.join(parts[4:])
    else:
        result['status'] = 'error'
        result['error'] = f'Invalid GHCID format: {old_ghcid}'
        return result

    new_ghcid = f'JP-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    if dry_run:
        result['status'] = 'would_update'
        return result

    # Update the data
    timestamp = datetime.now(timezone.utc).isoformat()

    # Update ghcid section
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'GOOGLE_PLACES_GEONAMES',
        'country_code': 'JP',
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'google_place_id': location_info.get('place_id'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'resolution_date': timestamp,
    }

    # Add Google Maps enrichment
    data['google_maps_enrichment'] = {
        'place_id': location_info.get('place_id'),
        'formatted_address': location_info.get('formatted_address'),
        'website': location_info.get('website'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'enriched_at': timestamp,
        'source': 'Google Places API (New)',
    }

    # Update location in original_entry (first location entry only)
    if 'original_entry' in data and 'locations' in data['original_entry']:
        if data['original_entry']['locations']:
            data['original_entry']['locations'][0]['city'] = city_info['ascii_name']
            data['original_entry']['locations'][0]['region'] = city_info['admin1_name']
            if location_info['latitude']:
                data['original_entry']['locations'][0]['latitude'] = location_info['latitude']
                data['original_entry']['locations'][0]['longitude'] = location_info['longitude']

    # Add to GHCID history: close any open entry for the old GHCID, then
    # append the new one.
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []

    for entry in data['ghcid']['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp

    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': data['ghcid'].get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'Location resolved via Google Places + GeoNames: {city_info["ascii_name"]} ({region_code})',
    })

    # Update identifiers
    if 'identifiers' in data:
        for identifier in data['identifiers']:
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid

    # Write updated data
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file; a pre-existing target is reported as a collision and
    # the already-rewritten file keeps its old name.
    new_filename = f'{new_ghcid}.yaml'
    new_filepath = filepath.parent / new_filename

    if filepath != new_filepath and not new_filepath.exists():
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)
    elif new_filepath.exists() and filepath != new_filepath:
        result['status'] = 'collision'
        result['error'] = f'Target file exists: {new_filepath.name}'
        return result

    result['status'] = 'updated'
    return result
|
||||
|
||||
|
||||
def main():
    """CLI entry point: enrich all JP-XX-XXX-* custodian files.

    Validates the API key and GeoNames database, processes each file
    via Google Places + GeoNames, and prints a per-status summary plus
    the first ten errors for manual follow-up.
    """
    parser = argparse.ArgumentParser(description='Enrich Japanese custodian files with Google Places data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    args = parser.parse_args()

    if not GOOGLE_PLACES_TOKEN:
        print("ERROR: GOOGLE_PLACES_TOKEN environment variable is required")
        print("Set it in .env file or export GOOGLE_PLACES_TOKEN=...")
        sys.exit(1)

    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    # Find Japanese XXX files
    files = sorted(CUSTODIAN_DIR.glob('JP-XX-XXX-*.yaml'))

    if args.limit:
        files = files[:args.limit]

    print(f"Found {len(files)} Japanese XXX files")
    print(f"Dry run: {args.dry_run}")
    print()

    conn = sqlite3.connect(str(GEONAMES_DB))

    stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}
    errors = []

    for filepath in files:
        print(f"Processing: {filepath.name}")
        result = process_file(filepath, conn, GOOGLE_PLACES_TOKEN, dry_run=args.dry_run)
        stats[result['status']] = stats.get(result['status'], 0) + 1

        if result['status'] in ('updated', 'would_update'):
            print(f" ✓ {result['city']} ({result['prefecture']}): {result['old_ghcid']} → {result['new_ghcid']}")
        elif result['status'] == 'error':
            print(f" ✗ {result['error']}")
            errors.append(result)
        elif result['status'] == 'collision':
            print(f" ⚠ {result['error']}")

    conn.close()

    print()
    print('=' * 60)
    print('Summary:')
    print(f" Updated: {stats.get('updated', 0)}")
    print(f" Would update: {stats.get('would_update', 0)}")
    print(f" Errors: {stats.get('error', 0)}")
    print(f" Collisions: {stats.get('collision', 0)}")
    print(f" Skipped: {stats.get('skipped', 0)}")

    if errors:
        print()
        print('Files with errors (may need manual research):')
        for err in errors[:10]:
            print(f" - {Path(err['file']).name}: {err['error']}")
|
||||
559
scripts/enrich_swiss_isil_cities.py
Normal file
559
scripts/enrich_swiss_isil_cities.py
Normal file
|
|
@ -0,0 +1,559 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Enrich Swiss ISIL custodian files with city data from the Swiss ISIL website.
|
||||
|
||||
For Swiss custodian files with XXX city placeholder, this script:
|
||||
1. Loads the source CH-Annotator file to get ISIL URLs by institution name
|
||||
2. Fetches the institution page from isil.nb.admin.ch
|
||||
3. Extracts city (Location) and address data
|
||||
4. Reverse geocodes using GeoNames to get proper city code
|
||||
5. Updates the GHCID with correct city code
|
||||
6. Renames the file if GHCID changes
|
||||
|
||||
Usage:
|
||||
python scripts/enrich_swiss_isil_cities.py [--dry-run] [--limit N]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sqlite3
|
||||
import time
|
||||
import uuid
|
||||
import yaml
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
# Paths
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
|
||||
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
|
||||
REPORTS_DIR = PROJECT_ROOT / "reports"
|
||||
SWISS_CH_ANNOTATOR_FILE = PROJECT_ROOT / "data" / "instances" / "switzerland_isil_ch_annotator.yaml"
|
||||
|
||||
# GHCID namespace for UUID generation
|
||||
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
|
||||
|
||||
# Rate limiting
|
||||
REQUEST_DELAY = 1.0 # seconds between requests
|
||||
|
||||
# Swiss canton codes (already ISO 3166-2)
|
||||
SWISS_CANTON_CODES = {
|
||||
'Aargau': 'AG', 'Appenzell Ausserrhoden': 'AR', 'Appenzell Innerrhoden': 'AI',
|
||||
'Basel-Landschaft': 'BL', 'Basel-Stadt': 'BS', 'Bern': 'BE', 'Fribourg': 'FR',
|
||||
'Geneva': 'GE', 'Glarus': 'GL', 'Graubünden': 'GR', 'Jura': 'JU', 'Lucerne': 'LU',
|
||||
'Neuchâtel': 'NE', 'Nidwalden': 'NW', 'Obwalden': 'OW', 'Schaffhausen': 'SH',
|
||||
'Schwyz': 'SZ', 'Solothurn': 'SO', 'St. Gallen': 'SG', 'Thurgau': 'TG',
|
||||
'Ticino': 'TI', 'Uri': 'UR', 'Valais': 'VS', 'Vaud': 'VD', 'Zug': 'ZG', 'Zürich': 'ZH',
|
||||
# German names
|
||||
'Genf': 'GE', 'Luzern': 'LU', 'Neuenburg': 'NE', 'Wallis': 'VS', 'Waadt': 'VD',
|
||||
# French names
|
||||
'Genève': 'GE', 'Lucerne': 'LU', 'Valais': 'VS', 'Vaud': 'VD', 'Fribourg': 'FR',
|
||||
# Italian names
|
||||
'Ginevra': 'GE', 'Grigioni': 'GR', 'Ticino': 'TI', 'Vallese': 'VS',
|
||||
}
|
||||
|
||||
|
||||
def load_swiss_isil_lookup() -> Dict[str, str]:
    """Build a lookup from institution name to its Swiss ISIL registry URL.

    Reads the Swiss CH-Annotator source YAML; for every entry that lists a
    digital platform hosted on isil.nb.admin.ch, the first such platform
    URL is recorded under the entry's name. Returns an empty mapping when
    the source file is missing or empty.
    """
    name_to_url: Dict[str, str] = {}

    if not SWISS_CH_ANNOTATOR_FILE.exists():
        print(f"Warning: Swiss CH-Annotator file not found: {SWISS_CH_ANNOTATOR_FILE}")
        return name_to_url

    print(f"Loading Swiss CH-Annotator source file...")
    with open(SWISS_CH_ANNOTATOR_FILE, 'r', encoding='utf-8') as f:
        entries = yaml.safe_load(f)

    if not entries:
        return name_to_url

    for entry in entries:
        if not isinstance(entry, dict):
            continue

        inst_name = entry.get('name', '')
        if not inst_name:
            continue

        # The first digital platform pointing at the national ISIL registry wins.
        for platform in entry.get('digital_platforms', []):
            if not isinstance(platform, dict):
                continue
            candidate = platform.get('platform_url', '')
            if 'isil.nb.admin.ch' in candidate:
                name_to_url[inst_name] = candidate
                break

    print(f" Loaded {len(name_to_url)} institutions with ISIL URLs")
    return name_to_url
|
||||
|
||||
|
||||
def generate_city_code(city_name: str) -> str:
    """Derive a 3-letter uppercase city code from a city name.

    Diacritics are stripped (NFD decomposition, combining marks dropped)
    and filler words (articles/prepositions) are ignored. A single
    significant word yields its first three letters; several yield the
    initials of the first three. An empty name yields 'XXX'.
    """
    if not city_name:
        return 'XXX'

    # Strip diacritics: decompose, then drop the combining marks.
    import unicodedata
    decomposed = unicodedata.normalize('NFD', city_name)
    plain = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Articles/prepositions that carry no information for the code.
    filler = {'de', 'la', 'le', 'les', 'du', 'des', 'von', 'am', 'im', 'an', 'der', 'die', 'das'}
    tokens = plain.split()
    meaningful = [t for t in tokens if t.lower() not in filler] or tokens

    if len(meaningful) == 1:
        # Single significant word: first 3 letters.
        return meaningful[0][:3].upper()
    # Multi-word name: initials of up to the first three words.
    return ''.join(t[0] for t in meaningful[:3]).upper()
|
||||
|
||||
|
||||
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Generate deterministic UUID v5 from GHCID string.

    Uses the module-level GHCID_NAMESPACE as the UUIDv5 namespace, so the
    same GHCID string always maps to the same UUID string.
    """
    return str(uuid.uuid5(GHCID_NAMESPACE, ghcid_string))
|
||||
|
||||
|
||||
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Generate a UUID-v8-style identifier from the SHA-256 of the GHCID string.

    The first 16 digest bytes are used, with the version nibble forced to 8
    and the variant bits set to the RFC 4122 form (10xx), so the result is
    a syntactically valid UUID fully determined by the input.
    """
    raw = bytearray(hashlib.sha256(ghcid_string.encode('utf-8')).digest()[:16])
    raw[6] = (raw[6] & 0x0F) | 0x80  # force version nibble to 8
    raw[8] = (raw[8] & 0x3F) | 0x80  # force RFC 4122 variant bits
    return str(uuid.UUID(bytes=bytes(raw)))
|
||||
|
||||
|
||||
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Derive a deterministic 64-bit integer ID from a GHCID string.

    Interprets the first 8 bytes of the SHA-256 digest as a big-endian
    unsigned integer.
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], 'big')
|
||||
|
||||
|
||||
def fetch_isil_page(isil_url: str, session: requests.Session) -> Optional[Dict]:
    """Fetch and parse Swiss ISIL institution page.

    Scrapes the <dt>/<dd> definition pairs on the page and collects the
    city ('Location'), postal code, street address and canton. The canton
    name is additionally mapped to a two-letter region code via
    SWISS_CANTON_CODES, falling back to the first two letters uppercased.

    Returns the collected dict, or None when the request fails or the page
    has no 'Location' entry (a page without a city is unusable here).
    """
    try:
        response = session.get(isil_url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        result = {}

        # Find all dt/dd pairs in the definition lists
        for dt in soup.find_all('dt'):
            label = dt.get_text(strip=True)
            dd = dt.find_next_sibling('dd')
            if dd:
                value = dd.get_text(strip=True)

                if label == 'Location':
                    result['city'] = value
                elif label == 'Zip code':
                    result['postal_code'] = value
                elif label == 'Street and number':
                    result['street_address'] = value
                elif label == 'Canton':
                    result['canton'] = value
                    # NOTE(review): the fallback assumes the first two letters of
                    # an unmapped canton name form a usable code — confirm that
                    # SWISS_CANTON_CODES covers every name the site emits.
                    result['region'] = SWISS_CANTON_CODES.get(value, value[:2].upper() if len(value) >= 2 else None)

        return result if result.get('city') else None

    except Exception as e:
        # Best-effort scrape: report and let the caller treat it as "no data".
        print(f" Error fetching {isil_url}: {e}")
        return None
|
||||
|
||||
|
||||
def reverse_geocode_city(city_name: str, region_code: str, country_code: str, db_path: Path) -> Optional[Dict]:
    """Look up a city in the GeoNames database to get coordinates and proper data.

    Tries a case-insensitive exact name match first (constrained to the
    canton's GeoNames admin1 code when the canton is known), then falls
    back to a prefix match, always preferring the most populous candidate.

    Args:
        city_name: City name as scraped from the ISIL page.
        region_code: Two-letter Swiss canton code (used to pick admin1).
        country_code: ISO country code ('CH' for this script).
        db_path: Path to the GeoNames SQLite database.

    Returns:
        Dict with geonames_id, geonames_name, ascii_name, latitude,
        longitude, population, feature_code, admin1_code and admin1_name,
        or None when no match is found or the lookup fails.
    """
    # Keys in the same order as the SELECT column lists below.
    columns = ('geonames_id', 'geonames_name', 'ascii_name', 'latitude',
               'longitude', 'population', 'feature_code', 'admin1_code',
               'admin1_name')

    try:
        conn = sqlite3.connect(db_path)
        # BUG FIX: the connection was previously closed only on the fuzzy-match
        # path; an exact match (or an exception) leaked it. try/finally closes
        # it on every path.
        try:
            cursor = conn.cursor()

            # Swiss canton code -> GeoNames admin1 code.
            swiss_admin1_map = {
                'AG': '01', 'AR': '15', 'AI': '16', 'BL': '06', 'BS': '05',
                'BE': '02', 'FR': '04', 'GE': '07', 'GL': '08', 'GR': '03',
                'JU': '26', 'LU': '09', 'NE': '10', 'NW': '11', 'OW': '12',
                'SH': '14', 'SZ': '17', 'SO': '13', 'SG': '18', 'TG': '20',
                'TI': '21', 'UR': '19', 'VS': '22', 'VD': '23', 'ZG': '25', 'ZH': '24'
            }

            admin1_code = swiss_admin1_map.get(region_code)

            # Exact (case-insensitive) match, restricted to proper settlements.
            query = """
                SELECT geonames_id, name, ascii_name, latitude, longitude,
                       population, feature_code, admin1_code, admin1_name
                FROM cities
                WHERE country_code = ?
                AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
                AND (name = ? OR ascii_name = ? OR LOWER(name) = LOWER(?))
            """

            if admin1_code:
                # Known canton: only accept settlements inside it.
                query += " AND admin1_code = ?"
                cursor.execute(query + " ORDER BY population DESC LIMIT 1",
                               (country_code, city_name, city_name, city_name, admin1_code))
            else:
                cursor.execute(query + " ORDER BY population DESC LIMIT 1",
                               (country_code, city_name, city_name, city_name))

            row = cursor.fetchone()

            if row is None:
                # Fall back to a prefix match across the whole country.
                cursor.execute("""
                    SELECT geonames_id, name, ascii_name, latitude, longitude,
                           population, feature_code, admin1_code, admin1_name
                    FROM cities
                    WHERE country_code = ?
                    AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
                    AND (name LIKE ? OR ascii_name LIKE ?)
                    ORDER BY population DESC
                    LIMIT 1
                """, (country_code, f"{city_name}%", f"{city_name}%"))
                row = cursor.fetchone()

            if row:
                return dict(zip(columns, row))
            return None
        finally:
            conn.close()

    except Exception as e:
        # Best-effort lookup: report and let the caller proceed without it.
        print(f" GeoNames lookup error: {e}")
        return None
|
||||
|
||||
|
||||
def process_file(file_path: Path, session: requests.Session, isil_lookup: Dict[str, str], dry_run: bool = True) -> Dict:
    """Process a single custodian file.

    For a Swiss custodian file whose GHCID still carries the '-XXX-' city
    placeholder, resolves the city via the institution's ISIL registry
    page and (unless dry_run) rewrites the GHCID string plus its derived
    forms, records the change in ghcid_history, updates locations and
    identifiers, and renames the file to the new GHCID.

    Args:
        file_path: Custodian YAML file to process.
        session: Shared HTTP session used for the ISIL page fetch.
        isil_lookup: Institution name -> ISIL URL mapping.
        dry_run: When True, only report what would change.

    Returns:
        Dict with 'status' (updated / would_update / unchanged / skipped /
        no_isil_url / no_city_found / error), 'old_ghcid', 'new_ghcid',
        'city' and 'error' (plus 'renamed_to' when the file was moved).
    """
    result = {
        'status': 'unchanged',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'error': None
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            result['status'] = 'error'
            result['error'] = 'Empty file'
            return result

        # Check if this is a Swiss file with XXX city placeholder
        ghcid_current = data.get('ghcid', {}).get('ghcid_current', '')
        if not ghcid_current.startswith('CH-') or '-XXX-' not in ghcid_current:
            result['status'] = 'skipped'
            return result

        result['old_ghcid'] = ghcid_current

        # Get institution name for lookup
        inst_name = data.get('original_entry', {}).get('name', '')
        if not inst_name:
            inst_name = data.get('custodian_name', {}).get('claim_value', '')

        # Find ISIL URL - first try lookup by name
        isil_url = isil_lookup.get(inst_name)

        # Then check identifiers in the file
        if not isil_url:
            identifiers = data.get('identifiers', [])
            for ident in identifiers:
                if isinstance(ident, dict) and ident.get('identifier_scheme') == 'ISIL':
                    url = ident.get('identifier_url', '')
                    if 'isil.nb.admin.ch' in url:
                        isil_url = url
                        break

        # Also check original_entry.identifiers
        if not isil_url:
            original_identifiers = data.get('original_entry', {}).get('identifiers', [])
            for ident in original_identifiers:
                if isinstance(ident, dict) and ident.get('identifier_scheme') == 'ISIL':
                    url = ident.get('identifier_url', '')
                    if 'isil.nb.admin.ch' in url:
                        isil_url = url
                        break

        if not isil_url:
            result['status'] = 'no_isil_url'
            result['error'] = f'No ISIL URL found for: {inst_name}'
            return result

        # Convert to proper page URL format
        if '?isil=' in isil_url:
            isil_code = isil_url.split('?isil=')[-1]
            # Convert to institution page URL
            isil_url = f"https://www.isil.nb.admin.ch/en/?isil={isil_code}"

        # Fetch city data from ISIL website (rate-limited between requests)
        time.sleep(REQUEST_DELAY)
        isil_data = fetch_isil_page(isil_url, session)

        if not isil_data or not isil_data.get('city'):
            result['status'] = 'no_city_found'
            return result

        city_name = isil_data['city']
        result['city'] = city_name

        # Get region from GHCID or ISIL data
        # Region is the second GHCID segment: CH-{region}-{city}-...
        parts = ghcid_current.split('-')
        region_code = parts[1] if len(parts) > 1 else isil_data.get('region', 'XX')

        # Generate city code
        city_code = generate_city_code(city_name)

        # Try to get GeoNames data for coordinates
        geonames_data = reverse_geocode_city(city_name, region_code, 'CH', GEONAMES_DB)

        # Build new GHCID
        # Format: CH-{region}-{city}-{type}-{abbrev}[-{suffix}]
        new_ghcid = ghcid_current.replace('-XXX-', f'-{city_code}-')
        result['new_ghcid'] = new_ghcid

        if new_ghcid == ghcid_current:
            result['status'] = 'unchanged'
            return result

        if dry_run:
            result['status'] = 'would_update'
            return result

        # Update the data
        now = datetime.now(timezone.utc).isoformat()

        # Update GHCID and all derived identifier forms so they stay in sync
        data['ghcid']['ghcid_current'] = new_ghcid
        data['ghcid']['ghcid_uuid'] = generate_ghcid_uuid(new_ghcid)
        data['ghcid']['ghcid_uuid_sha256'] = generate_ghcid_uuid_sha256(new_ghcid)
        data['ghcid']['ghcid_numeric'] = generate_ghcid_numeric(new_ghcid)

        # Update location_resolution
        location_resolution = {
            'method': 'SWISS_ISIL_ENRICHMENT',
            'city_name': city_name,
            'city_code': city_code,
            'region_code': region_code,
            'country_code': 'CH',
            'enrichment_date': now,
            'source_url': isil_url
        }

        if geonames_data:
            location_resolution.update({
                'geonames_id': geonames_data['geonames_id'],
                'geonames_name': geonames_data['geonames_name'],
                'feature_code': geonames_data['feature_code'],
                'population': geonames_data['population'],
                'latitude': geonames_data['latitude'],
                'longitude': geonames_data['longitude']
            })

        data['ghcid']['location_resolution'] = location_resolution

        # Add GHCID history entry (newest first)
        history = data['ghcid'].get('ghcid_history', [])
        if history:
            # Close previous entry
            history[0]['valid_to'] = now

        history.insert(0, {
            'ghcid': new_ghcid,
            'ghcid_numeric': data['ghcid']['ghcid_numeric'],
            'valid_from': now,
            'valid_to': None,
            'reason': f'City code updated from Swiss ISIL enrichment: {city_name} -> {city_code}'
        })
        data['ghcid']['ghcid_history'] = history

        # Update location in original_entry if exists
        # (additive: only fill in missing city/address fields)
        if 'locations' in data.get('original_entry', {}):
            for loc in data['original_entry']['locations']:
                if isinstance(loc, dict) and not loc.get('city'):
                    loc['city'] = city_name
                    if isil_data.get('postal_code'):
                        loc['postal_code'] = isil_data['postal_code']
                    if isil_data.get('street_address'):
                        loc['street_address'] = isil_data['street_address']

        # Update identifiers
        for ident in data.get('identifiers', []):
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
                ident['identifier_value'] = new_ghcid

        # Write updated file
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Rename file if GHCID changed; on a name collision the updated
        # content stays under the old filename.
        new_filename = f"{new_ghcid}.yaml"
        new_path = file_path.parent / new_filename

        if new_path != file_path and not new_path.exists():
            shutil.move(file_path, new_path)
            result['renamed_to'] = str(new_path.name)

        result['status'] = 'updated'
        return result

    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
        return result
|
||||
|
||||
|
||||
def main():
    """CLI entry point: enrich Swiss custodian files with city data.

    Finds CH-*-XXX-*.yaml files (XXX city placeholder), resolves each via
    process_file(), prints progress and a summary, and writes a markdown
    report to REPORTS_DIR. Supports --dry-run, --limit N and --verbose.
    """
    parser = argparse.ArgumentParser(description='Enrich Swiss ISIL custodian files with city data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show verbose output')
    args = parser.parse_args()

    print("=" * 60)
    print("SWISS ISIL CITY ENRICHMENT")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")

    # Find Swiss files with XXX city placeholder
    swiss_xxx_files = list(CUSTODIAN_DIR.glob("CH-*-XXX-*.yaml"))

    if args.limit:
        swiss_xxx_files = swiss_xxx_files[:args.limit]
        print(f"Limited to {args.limit} files")

    print(f"Found {len(swiss_xxx_files)} Swiss files with XXX city placeholder")
    print()

    # Load Swiss ISIL lookup from CH-Annotator source file
    isil_lookup = load_swiss_isil_lookup()

    # Process files with one shared HTTP session (connection reuse)
    session = requests.Session()
    session.headers['User-Agent'] = 'GLAMDataExtractor/1.0 (heritage-data-enrichment)'

    # Counters per process_file() status value
    stats = {
        'updated': 0,
        'would_update': 0,
        'unchanged': 0,
        'skipped': 0,
        'no_isil_url': 0,
        'no_city_found': 0,
        'error': 0
    }

    cities_found = {}
    errors = []

    for i, file_path in enumerate(swiss_xxx_files, 1):
        if i % 100 == 0 or args.verbose:
            print(f"Progress: {i}/{len(swiss_xxx_files)}")

        result = process_file(file_path, session, isil_lookup, dry_run=args.dry_run)
        stats[result['status']] = stats.get(result['status'], 0) + 1

        if result.get('city'):
            cities_found[result['city']] = cities_found.get(result['city'], 0) + 1

        if result.get('error'):
            errors.append(f"{file_path.name}: {result['error']}")

        if args.verbose and result['status'] in ('updated', 'would_update'):
            print(f" {file_path.name}")
            print(f" City: {result.get('city')}")
            print(f" {result['old_ghcid']} -> {result['new_ghcid']}")

    # Print summary
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {len(swiss_xxx_files)}")
    print()
    print("Results:")
    for status, count in sorted(stats.items()):
        if count > 0:
            print(f" {status}: {count}")

    if cities_found:
        print()
        print(f"Cities found: {len(cities_found)} unique")
        print("Top 10 cities:")
        # Sort by descending count
        for city, count in sorted(cities_found.items(), key=lambda x: -x[1])[:10]:
            print(f" {city}: {count}")

    if errors:
        print()
        print(f"Errors ({len(errors)}):")
        for err in errors[:10]:
            print(f" {err}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")

    # Save report
    REPORTS_DIR.mkdir(exist_ok=True)
    report_file = REPORTS_DIR / f"SWISS_ISIL_ENRICHMENT_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"

    with open(report_file, 'w') as f:
        f.write("# Swiss ISIL City Enrichment Report\n\n")
        f.write(f"**Date**: {datetime.now().isoformat()}\n")
        f.write(f"**Mode**: {'Dry Run' if args.dry_run else 'Live'}\n\n")
        f.write("## Summary\n\n")
        f.write(f"- Total files processed: {len(swiss_xxx_files)}\n")
        for status, count in sorted(stats.items()):
            if count > 0:
                f.write(f"- {status}: {count}\n")

        if cities_found:
            f.write(f"\n## Cities Found ({len(cities_found)} unique)\n\n")
            for city, count in sorted(cities_found.items(), key=lambda x: -x[1]):
                f.write(f"- {city}: {count}\n")

    print()
    print(f"Report saved to: {report_file}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
567
scripts/extract_locations_ch_annotator.py
Executable file
567
scripts/extract_locations_ch_annotator.py
Executable file
|
|
@ -0,0 +1,567 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract and resolve locations from custodian files using CH-Annotator convention.
|
||||
|
||||
This script follows CH-Annotator v1.7.0 TOPONYM (TOP) hypernym for:
|
||||
- TOP.SET: Settlements (cities, towns, villages)
|
||||
- TOP.REG: Regions (provinces, states)
|
||||
- TOP.CTY: Countries
|
||||
|
||||
Following AGENTS.md Rules:
|
||||
- Rule 5: Additive only - never delete existing data
|
||||
- Rule 10: CH-Annotator is the entity annotation convention
|
||||
- GHCID settlement standardization: GeoNames is authoritative
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import yaml
|
||||
import sqlite3
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List, Tuple
|
||||
|
||||
# GeoNames database path
|
||||
GEONAMES_DB = Path(__file__).parent.parent / "data/reference/geonames.db"
|
||||
|
||||
# Feature codes for proper settlements (EXCLUDE PPLX neighborhoods)
|
||||
SETTLEMENT_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
||||
|
||||
# Admin1 to ISO 3166-2 mappings by country
|
||||
# GeoNames admin1 code -> ISO 3166-2 region code, per country.
# BUG FIX: the 'CZ' dict previously listed the key '78' twice ('78': '20'
# Central Bohemia and '78': '64' South Moravia); Python keeps only the last
# duplicate, so the Central Bohemia entry was silently dead. The shadowed
# entry has been removed; see the NOTE in the 'CZ' dict.
ADMIN1_TO_ISO = {
    'BE': {
        'BRU': 'BRU',  # Brussels-Capital
        'VLG': 'VLG',  # Flanders
        'WAL': 'WAL',  # Wallonia
        'VAN': 'VAN',  # Antwerp
        'VBR': 'VBR',  # Flemish Brabant
        'VLI': 'VLI',  # Limburg
        'VOV': 'VOV',  # East Flanders
        'VWV': 'VWV',  # West Flanders
        'WBR': 'WBR',  # Walloon Brabant
        'WHT': 'WHT',  # Hainaut
        'WLG': 'WLG',  # Liège
        'WLX': 'WLX',  # Luxembourg
        'WNA': 'WNA',  # Namur
    },
    'AT': {
        '01': '1',  # Burgenland
        '02': '2',  # Kärnten
        '03': '3',  # Niederösterreich
        '04': '4',  # Oberösterreich
        '05': '5',  # Salzburg
        '06': '6',  # Steiermark
        '07': '7',  # Tirol
        '08': '8',  # Vorarlberg
        '09': '9',  # Wien
    },
    'BG': {
        '42': '22',  # Sofia City
        '41': '23',  # Sofia Province
        '01': '01',  # Blagoevgrad
        '02': '02',  # Burgas
        '03': '03',  # Varna
        '04': '04',  # Veliko Tarnovo
        '05': '05',  # Vidin
        '06': '06',  # Vratsa
        '07': '07',  # Gabrovo
        '08': '08',  # Dobrich
        '09': '09',  # Kardzhali
        '10': '10',  # Kyustendil
        '11': '11',  # Lovech
        '12': '12',  # Montana
        '13': '13',  # Pazardzhik
        '14': '14',  # Pernik
        '15': '15',  # Pleven
        '16': '16',  # Plovdiv
        '17': '17',  # Razgrad
        '18': '18',  # Ruse
        '19': '19',  # Silistra
        '20': '20',  # Sliven
        '21': '21',  # Smolyan
        '24': '24',  # Stara Zagora
        '25': '25',  # Targovishte
        '26': '26',  # Haskovo
        '27': '27',  # Shumen
        '28': '28',  # Yambol
    },
    'CH': {
        'AG': 'AG', 'AI': 'AI', 'AR': 'AR', 'BE': 'BE', 'BL': 'BL',
        'BS': 'BS', 'FR': 'FR', 'GE': 'GE', 'GL': 'GL', 'GR': 'GR',
        'JU': 'JU', 'LU': 'LU', 'NE': 'NE', 'NW': 'NW', 'OW': 'OW',
        'SG': 'SG', 'SH': 'SH', 'SO': 'SO', 'SZ': 'SZ', 'TG': 'TG',
        'TI': 'TI', 'UR': 'UR', 'VD': 'VD', 'VS': 'VS', 'ZG': 'ZG',
        'ZH': 'ZH',
    },
    'CZ': {
        '52': '10',  # Prague
        # NOTE(review): Central Bohemia (ISO CZ-20) was previously mapped from
        # GeoNames admin1 '78', which collides with South Moravia below and was
        # therefore never in effect. Its correct GeoNames code must be verified
        # against the local geonames.db before it can be re-added.
        '79': '31',  # South Bohemia
        '80': '32',  # Plzeň
        '81': '41',  # Karlovy Vary
        '82': '42',  # Ústí nad Labem
        '83': '51',  # Liberec
        '84': '52',  # Hradec Králové
        '85': '53',  # Pardubice
        '86': '63',  # Vysočina
        '78': '64',  # South Moravia
        '87': '71',  # Olomouc
        '88': '72',  # Zlín
        '89': '80',  # Moravia-Silesia
    },
}
|
||||
|
||||
|
||||
def connect_geonames() -> Optional[sqlite3.Connection]:
    """Open the GeoNames SQLite database.

    Returns a live connection, or None (after printing an error) when the
    database file does not exist.
    """
    if GEONAMES_DB.exists():
        return sqlite3.connect(str(GEONAMES_DB))
    print(f"Error: GeoNames database not found at {GEONAMES_DB}")
    return None
|
||||
|
||||
|
||||
def extract_toponym_from_name(name: str, country: str) -> Optional[str]:
    """
    Extract TOPONYM (TOP.SET) from institution name using CH-Annotator patterns.

    CH-Annotator TOP.SET pattern:
    - City/town names embedded in institution names
    - Often after prepositions: "in", "van", "de", "of", etc.
    - Or as suffix/prefix in compound names

    Tried in order: (1) keyword patterns like "bibliotheek <city>";
    (2) parenthetical city names; (3) known hyphenated Belgian compounds;
    (4) a capitalized last word.

    Returns extracted city name or None.
    """
    if not name:
        return None

    # Normalize
    name_lower = name.lower()

    # Pattern 1: Explicit city indicators
    # "bibliotheek [CityName]", "museum [CityName]", etc.
    city_patterns = [
        r'bibliotheek\s+(\w+)',
        r'bibliothek\s+(\w+)',
        r'museum\s+(\w+)',
        r'archief\s+(\w+)',
        r'archiv\s+(\w+)',
        r'archive\s+(\w+)',
        r'openbare\s+bibliotheek\s+(\w+)',
        r'gemeentelijke.*bibliotheek\s+(\w+)',
        r'stedelijke.*bibliotheek\s+(\w+)',
        r'stadsarchief\s+(\w+)',
    ]

    for pattern in city_patterns:
        match = re.search(pattern, name_lower)
        if match:
            city = match.group(1)
            # Filter out generic words
            if city not in ('van', 'de', 'het', 'der', 'voor', 'en', 'vzw', 'bv', 'nv'):
                return city.title()

    # Pattern 2: Parenthetical city names
    # "Institution Name (City)" or "City Name (Alias)"
    paren_match = re.search(r'\(([^)]+)\)', name)
    if paren_match:
        paren_content = paren_match.group(1).strip()
        # Check for "(Bib CityName)" pattern - extract last word
        bib_match = re.match(r'(?:Bib|OB|POB|Bibliotheek)\s+(\w+)', paren_content, re.IGNORECASE)
        if bib_match:
            return bib_match.group(1).title()
        # Check if it looks like a city name (capitalized, not too long).
        # BUG FIX: whitespace-only parentheses used to raise IndexError on
        # words[0]; guard on a non-empty word list first.
        words = paren_content.split()
        if words and len(words) <= 3 and words[0][0].isupper():
            return paren_content

    # Pattern 3: Hyphenated city names (Belgian pattern)
    # "Brussel-Stad", "Sint-Niklaas"
    # BUG FIX: the old pattern (\w+-\w+) captured only the first two segments,
    # so a three-part compound like "Molenbeek-Saint-Jean" could never match
    # the known list. (?:-\w+)+ captures the whole compound.
    hyphen_match = re.search(r'(\w+(?:-\w+)+)', name)
    if hyphen_match:
        compound = hyphen_match.group(1)
        # Check against known Belgian compound cities
        known_compounds = ['sint-niklaas', 'sint-truiden', 'brussel-stad',
                           'la-louvière', 'molenbeek-saint-jean']
        if compound.lower() in known_compounds:
            return compound.title()

    # Pattern 4: Last word as city (common pattern)
    # "Historisch Museum [CityName]"
    words = name.split()
    if len(words) >= 2:
        last_word = words[-1].strip('()')
        # Check if last word is capitalized and not a common suffix.
        # BUG FIX: a last word that strips to '' (e.g. a lone ')') used to
        # raise IndexError on last_word[0]; require it non-empty.
        if (last_word and last_word[0].isupper() and
                last_word.lower() not in ('vzw', 'bv', 'nv', 'asbl', 'bibliotheek',
                                          'museum', 'archief', 'archiv')):
            return last_word

    return None
|
||||
|
||||
|
||||
def lookup_city_in_geonames(city_name: str, country: str, conn: sqlite3.Connection) -> Optional[Dict]:
    """
    Look up a settlement by name in the GeoNames database.

    A case-insensitive exact match is tried first; for names of at least
    four characters a prefix match is used as fallback (the length floor
    avoids false positives). The most populous candidate wins.

    Returns a dict with geonames_id, geonames_name, admin1_code,
    region_code (ISO 3166-2: admin2 for Belgian provinces, admin1 —
    translated via ADMIN1_TO_ISO where a mapping exists — elsewhere),
    latitude, longitude, feature_code and population, or None.
    """
    select_template = """
        SELECT geonames_id, name, ascii_name, admin1_code, admin2_code, latitude, longitude, feature_code, population
        FROM cities
        WHERE country_code = ?
        AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        AND ({name_clause})
        ORDER BY population DESC
        LIMIT 1
    """

    cursor = conn.cursor()

    # Exact case-insensitive match on name or ascii_name.
    cursor.execute(
        select_template.format(name_clause="LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?)"),
        (country, city_name, city_name))
    row = cursor.fetchone()

    # Prefix fallback, only for names long enough to be unambiguous.
    if row is None and len(city_name) >= 4:
        prefix = f"{city_name}%"
        cursor.execute(
            select_template.format(name_clause="LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?)"),
            (country, prefix, prefix))
        row = cursor.fetchone()

    if row is None:
        return None

    geonames_id, name, ascii_name, admin1_code, admin2_code, lat, lon, feature_code, population = row

    # Region code: Belgium keys its provinces off admin2; other countries
    # use admin1, translated through ADMIN1_TO_ISO when a mapping exists.
    if country == 'BE':
        region_code = admin2_code or admin1_code or 'XX'
    elif country in ADMIN1_TO_ISO and admin1_code in ADMIN1_TO_ISO[country]:
        region_code = ADMIN1_TO_ISO[country][admin1_code]
    else:
        region_code = admin1_code or 'XX'

    return {
        'geonames_id': geonames_id,
        'geonames_name': ascii_name or name,
        'admin1_code': admin1_code,
        'region_code': region_code,
        'latitude': lat,
        'longitude': lon,
        'feature_code': feature_code,
        'population': population,
    }
|
||||
|
||||
|
||||
def generate_city_code(city_name: str) -> str:
    """Generate a 3-letter uppercase city code from a name.

    A single-word name yields its first three characters; a multi-word
    name yields the initials of (up to) the first three words.
    """
    parts = city_name.split()
    if len(parts) == 1:
        return city_name[:3].upper()
    # Multi-word (or empty) name: initials, capped at three letters.
    return ''.join(p[0] for p in parts if p)[:3].upper()
|
||||
|
||||
|
||||
def update_file_with_location(filepath: Path, location_data: Dict, city_name: str,
                              dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """Update custodian file with resolved location following CH-Annotator convention.

    Fills XX/XXX placeholders in ghcid.location_resolution, rewrites the
    GHCID string, appends a ghcid_history entry, records a TOP.SET entity
    claim with 5-component provenance, and adds a provenance note.  Unless
    dry_run, the file is rewritten and renamed to match the new GHCID.

    Args:
        filepath: Custodian YAML file to update.
        location_data: GeoNames match dict (region_code, geonames_name,
            geonames_id, feature_code, optional latitude/longitude).
        city_name: Toponym extracted from the institution name.
        dry_run: When True, compute everything but write/rename nothing.

    Returns:
        (success, new_path) — new_path is the renamed location, or None when
        the filename did not change (or in dry-run mode, since no rename
        is performed but the prospective path may still be returned).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False, None

    # Files without a ghcid section cannot be updated.
    if 'ghcid' not in data:
        return False, None

    ghcid = data['ghcid']
    if 'location_resolution' not in ghcid:
        ghcid['location_resolution'] = {}

    loc_res = ghcid['location_resolution']
    country_code = loc_res.get('country_code', '')
    old_region = loc_res.get('region_code', 'XX')
    old_city = loc_res.get('city_code', 'XXX')

    if not country_code:
        return False, None

    # Only update if we have XX or XXX to resolve
    if old_region != 'XX' and old_city != 'XXX':
        return False, None

    region_code = location_data['region_code']
    city_code = generate_city_code(location_data['geonames_name'])

    # Update location resolution with CH-Annotator provenance
    if old_region == 'XX':
        loc_res['region_code'] = region_code
    if old_city == 'XXX':
        loc_res['city_code'] = city_code
        loc_res['city_name'] = location_data['geonames_name']

    loc_res['geonames_id'] = location_data['geonames_id']
    loc_res['feature_code'] = location_data['feature_code']
    loc_res['method'] = 'CH_ANNOTATOR_TOP_SET'
    loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat()
    loc_res['extracted_toponym'] = city_name

    # Coordinates are optional in the lookup result.
    if location_data.get('latitude'):
        loc_res['latitude'] = location_data['latitude']
        loc_res['longitude'] = location_data['longitude']

    # Update GHCID string
    old_ghcid = ghcid.get('ghcid_current', '')
    new_ghcid = old_ghcid

    # NOTE(review): str.replace substitutes EVERY occurrence — if the
    # '-XX-' / '-XXX-' substrings appear more than once in a GHCID this
    # rewrites all of them; verify GHCIDs cannot contain these elsewhere.
    if old_region == 'XX':
        new_ghcid = new_ghcid.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-')
    if old_city == 'XXX':
        new_ghcid = new_ghcid.replace(f'-XXX-', f'-{city_code}-')

    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid

        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []

        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'reason': f"Location resolved via CH-Annotator TOP.SET extraction: {city_name} -> {location_data['geonames_name']} (GeoNames:{location_data['geonames_id']})"
        })

    # Add CH-Annotator entity claim for location
    if 'ch_annotator' not in data:
        data['ch_annotator'] = {}

    if 'entity_claims' not in data['ch_annotator']:
        data['ch_annotator']['entity_claims'] = []

    # Add TOP.SET claim (5-component provenance: namespace, path,
    # timestamp, agent, context_convention).
    data['ch_annotator']['entity_claims'].append({
        'claim_type': 'location_settlement',
        'claim_value': location_data['geonames_name'],
        'property_uri': 'schema:location',
        'hypernym_code': 'TOP.SET',
        'hypernym_label': 'SETTLEMENT',
        'provenance': {
            'namespace': 'geonames',
            'path': f"/geonames/{location_data['geonames_id']}",
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'agent': 'extract_locations_ch_annotator.py',
            'context_convention': 'ch_annotator-v1_7_0',
        },
        'confidence': 0.85,
        'extraction_source': {
            'field': 'institution_name',
            'extracted_text': city_name,
            'method': 'pattern_matching',
        },
    })

    # Add provenance note (normalize a scalar 'notes' into a list first).
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    elif isinstance(data['provenance']['notes'], str):
        data['provenance']['notes'] = [data['provenance']['notes']]

    data['provenance']['notes'].append(
        f"Location resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"CH-Annotator TOP.SET extraction '{city_name}' -> {location_data['geonames_name']} "
        f"(GeoNames:{location_data['geonames_id']}, Region:{region_code})"
    )

    # Determine new filename (mirrors the GHCID substitutions above).
    new_filename = filepath.name
    if old_region == 'XX':
        new_filename = new_filename.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-')
    if old_city == 'XXX':
        new_filename = new_filename.replace(f'-XXX-', f'-{city_code}-')

    new_filepath = filepath.parent / new_filename

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Rename only when the target does not already exist (avoids
        # clobbering a previously resolved file with the same code).
        if new_filepath != filepath and not new_filepath.exists():
            filepath.rename(new_filepath)

    return True, new_filepath if new_filepath != filepath else None
||||
def main():
    """Main entry point.

    Scans custodian YAML files whose names contain unresolved XX region or
    XXX city codes, extracts a toponym from the institution name, resolves
    it against GeoNames, and updates the files (dry-run by default).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Extract locations using CH-Annotator TOPONYM convention'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    # Connect to GeoNames
    conn = connect_geonames()
    if not conn:
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("CH-ANNOTATOR TOPONYM (TOP.SET) LOCATION EXTRACTION")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print(f"Convention: ch_annotator-v1_7_0")
    print()

    # Find files with XX region codes or XXX city codes
    files_to_process = []

    for filepath in custodian_dir.glob('*-XX-*.yaml'):
        files_to_process.append(filepath)
    for filepath in custodian_dir.glob('*-XXX-*.yaml'):
        # Avoid double-counting files that matched both globs.
        if filepath not in files_to_process:
            files_to_process.append(filepath)

    print(f"Found {len(files_to_process)} files with XX/XXX codes")

    # Load candidate files, filtering by country and limit.
    file_data = []
    files_processed = 0  # NOTE(review): never incremented — dead variable.
    for filepath in files_to_process:
        # Apply limit AFTER country filtering
        if len(file_data) >= args.limit:
            break

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            # Get country code
            country = None
            if 'ghcid' in data and 'location_resolution' in data['ghcid']:
                country = data['ghcid']['location_resolution'].get('country_code')

            if not country:
                continue

            if args.country and country != args.country:
                continue

            # Get institution name: prefer the annotated claim, fall back
            # to the original source entry.
            name = None
            if 'custodian_name' in data:
                name = data['custodian_name'].get('claim_value')
            if not name and 'original_entry' in data:
                name = data['original_entry'].get('name')

            if not name:
                continue

            file_data.append({
                'filepath': filepath,
                'data': data,
                'country': country,
                'name': name,
            })
        except Exception as e:
            print(f"Error loading {filepath}: {e}")

    print(f"Processing {len(file_data)} files")
    print()

    # Process each file, tallying outcomes for the summary.
    resolved = 0
    renamed = 0
    no_toponym = 0
    no_geonames = 0

    for f in file_data:
        filepath = f['filepath']
        name = f['name']
        country = f['country']

        # Extract toponym using CH-Annotator patterns
        toponym = extract_toponym_from_name(name, country)

        if not toponym:
            no_toponym += 1
            continue

        # Look up in GeoNames
        location = lookup_city_in_geonames(toponym, country, conn)

        if not location:
            no_geonames += 1
            print(f" No GeoNames match for '{toponym}' in {country}")
            continue

        print(f"Processing {filepath.name}...")
        print(f" Name: {name}")
        print(f" TOP.SET: {toponym} -> {location['geonames_name']} (Region: {location['region_code']})")

        # Update file
        success, new_path = update_file_with_location(filepath, location, toponym, dry_run=dry_run)

        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f" Renamed: {filepath.name} -> {new_path.name}")

    conn.close()

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"No toponym extracted: {no_toponym}")
    print(f"No GeoNames match: {no_geonames}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")


if __name__ == '__main__':
    main()
226
scripts/fix_belgian_cities.py
Normal file
226
scripts/fix_belgian_cities.py
Normal file
|
|
@ -0,0 +1,226 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fix remaining Belgian XXX files by re-scraping ISIL website with correct city extraction.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sqlite3
|
||||
import time
|
||||
import unicodedata
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from urllib.request import urlopen, Request
|
||||
|
||||
# Belgian admin1 mapping
# Maps GeoNames admin1 (region) names to the ISO 3166-2:BE region codes
# used inside GHCIDs.
BELGIAN_ADMIN1_MAP = {
    'Brussels Capital': 'BRU',
    'Brussels': 'BRU',
    'Flanders': 'VLG',
    'Wallonia': 'WAL',
}

# City name aliases (Dutch → GeoNames)
# Keys are the lower-cased local spellings seen on the ISIL pages; values
# are the names under which GeoNames lists the same settlement.
CITY_ALIASES = {
    'sint-lambrechts-woluwe': 'Woluwe-Saint-Lambert',
    'sint-pieters-woluwe': 'Woluwe-Saint-Pierre',
    'oostende': 'Ostend',
    'brussel': 'Brussels',
    'bruxelles': 'Brussels',
}
||||
def scrape_isil_city(isil_code):
    """Fetch the Belgian ISIL registry page and extract (city, postal_code).

    Tries an address-cell pattern first ("…, POSTCODE City</td>"), then a
    looser "POSTCODE City" pattern anywhere in the page.  Returns
    (None, None) when the fetch fails or no pattern matches.
    """
    url = f"https://isil.kbr.be/{isil_code}"
    # Ordered patterns: group(1) = 4-digit postal code, group(2) = city.
    patterns = (
        r',\s*(\d{4})\s+([A-Za-zÀ-ÿ\s\-\']+)</td>',
        r'(\d{4})\s+([A-Za-zÀ-ÿ\s\-\']+)',
    )
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0 GLAM-Scraper/1.0'})
        with urlopen(req, timeout=10) as response:
            html = response.read().decode('utf-8')
        for pattern in patterns:
            m = re.search(pattern, html)
            if m:
                return m.group(2).strip(), m.group(1)
    except Exception as e:
        print(f" Error scraping {isil_code}: {e}")
    return None, None
||||
def lookup_city(city_name, conn):
    """Resolve a Belgian city name against the GeoNames `cities` table.

    Applies Dutch→GeoNames aliases first, then performs a case-insensitive
    exact match on name/ascii_name, preferring the most populous match and
    excluding PPLX (section-of-place) features.

    Returns:
        Dict with name, ascii_name, admin1_name, latitude, longitude,
        geonames_id and population — or None when nothing matches.
    """
    if not city_name:
        return None

    # Check alias table (keys are lower-cased local spellings).
    lookup_name = CITY_ALIASES.get(city_name.lower().strip(), city_name)

    cursor = conn.cursor()
    cursor.execute("""
        SELECT name, ascii_name, admin1_name, latitude, longitude, geonames_id, population, feature_code
        FROM cities
        WHERE country_code='BE'
        AND (LOWER(name)=LOWER(?) OR LOWER(ascii_name)=LOWER(?))
        AND feature_code NOT IN ('PPLX')
        ORDER BY population DESC LIMIT 1
    """, (lookup_name, lookup_name))

    row = cursor.fetchone()
    if row is None:
        return None

    # feature_code (row[7]) is selected for the filter but not returned.
    keys = ('name', 'ascii_name', 'admin1_name', 'latitude',
            'longitude', 'geonames_id', 'population')
    return dict(zip(keys, row))
||||
def generate_city_code(city_name):
    """Build a 3-letter uppercase code from a (possibly accented) city name.

    Diacritics are stripped via NFD decomposition; a leading Dutch/French
    article contributes its initial plus two letters of the next word;
    otherwise multi-word names use the initials of the first three words.
    """
    # Strip diacritics: decompose, then drop combining marks.
    decomposed = unicodedata.normalize('NFD', city_name)
    stripped = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    clean = re.sub(r'[^a-zA-Z\s-]', '', stripped)
    words = clean.split()

    articles = {'de', 'het', 'le', 'la', 'les', 'den', 'der', 'des'}

    if len(words) == 1:
        return clean[:3].upper()
    if words[0].lower() in articles:
        # Article keeps its initial; the next word supplies two letters.
        return (words[0][0] + words[1][:2]).upper()
    # General case: one initial per word, at most three.
    return ''.join(w[0] for w in words[:3]).upper()
||||
def update_file(file_path, geo_data, method='ISIL_SCRAPE'):
    """Resolve the region/city placeholders of a Belgian custodian file.

    Rewrites the GHCID (and its other occurrences in the file), fills in
    the location_resolution placeholders, appends a ghcid_history entry,
    and renames the file to match the new GHCID.

    Args:
        file_path: Path to the custodian YAML file.
        geo_data: GeoNames match dict (from lookup_city): name, admin1_name,
            geonames_id, ...
        method: Resolution method recorded in the history entry.

    Returns:
        True when the file was updated (and possibly renamed); False when
        no GHCID was found or nothing needed to change.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    city_code = generate_city_code(geo_data['name'])
    region_code = BELGIAN_ADMIN1_MAP.get(geo_data['admin1_name'], 'XX')

    # Locate the current GHCID.
    old_ghcid_match = re.search(r'ghcid_current:\s*([^\n]+)', content)
    if not old_ghcid_match:
        return False

    old_ghcid = old_ghcid_match.group(1).strip()

    # BUGFIX: files may arrive as BE-XX-XXX-... (nothing resolved) or as
    # BE-<REGION>-XXX-... (region already resolved).  The previous anchor
    # r'^BE-XX-XXX-' silently skipped the latter even though main() globs
    # BE-*-XXX-*.yaml.  Keep an already-resolved region as-is.
    prefix_match = re.match(r'^BE-(\w+)-XXX-', old_ghcid)
    if not prefix_match:
        return False
    existing_region = prefix_match.group(1)
    if existing_region != 'XX':
        region_code = existing_region

    old_prefix = f'BE-{existing_region}-XXX-'
    new_prefix = f'BE-{region_code}-{city_code}-'
    new_ghcid = old_ghcid.replace(old_prefix, new_prefix, 1)

    if new_ghcid == old_ghcid:
        return False

    # Update every occurrence of the GHCID.
    content = content.replace(f'ghcid_current: {old_ghcid}', f'ghcid_current: {new_ghcid}')
    content = content.replace(f'ghcid_original: {old_ghcid}', f'ghcid_original: {new_ghcid}')
    content = content.replace(f'identifier_value: {old_ghcid}', f'identifier_value: {new_ghcid}')
    content = content.replace(f"ghcid: {old_ghcid}", f"ghcid: {new_ghcid}")

    # Fill location_resolution placeholders (no-ops when already set).
    content = re.sub(r'region_code:\s*XX', f'region_code: {region_code}', content)
    content = re.sub(r'city_code:\s*XXX', f'city_code: {city_code}', content)

    # Record the resolution in ghcid_history.
    timestamp = datetime.now(timezone.utc).isoformat()
    history_entry = f"""
  - ghcid: {new_ghcid}
    valid_from: '{timestamp}'
    reason: City resolved via {method} - {geo_data['name']} (GeoNames ID {geo_data['geonames_id']})"""

    history_match = re.search(r'(ghcid_history:\s*\n)', content)
    if history_match:
        insert_pos = history_match.end()
        content = content[:insert_pos] + history_entry + content[insert_pos:]

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

    # Rename the file so its name matches the new GHCID.
    new_filename = file_path.name.replace(old_prefix, new_prefix, 1)
    if new_filename != file_path.name:
        new_path = file_path.parent / new_filename
        file_path.rename(new_path)

    return True
||||
def main():
    """Fix Belgian custodian files with unresolved XXX city codes.

    For each BE-*-XXX-*.yaml file: reads its ISIL identifier, scrapes the
    city from the Belgian ISIL registry, resolves it against the local
    GeoNames SQLite database, and rewrites/renames the file (skipped in
    --dry-run mode).
    """
    import sys
    dry_run = '--dry-run' in sys.argv

    # Repository layout: scripts/ lives one level under the project root.
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Belgian City Fix Script")
    print("=" * 50)
    if dry_run:
        print("DRY RUN MODE\n")

    conn = sqlite3.connect(str(geonames_db))

    xxx_files = list(custodian_dir.glob('BE-*-XXX-*.yaml'))
    print(f"Found {len(xxx_files)} Belgian XXX files\n")

    updated = 0
    not_found = []

    for file_path in xxx_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Get ISIL code.
        # NOTE(review): r'identifier_value:\s*(BE-\w+)' matches the first
        # BE-prefixed identifier in the file — confirm the ISIL is always
        # the first identifier_value, or this may pick up another code.
        isil_match = re.search(r'identifier_value:\s*(BE-\w+)', content)
        if not isil_match:
            continue

        isil_code = isil_match.group(1)

        # Scrape city from website
        city, postal = scrape_isil_city(isil_code)
        if not city:
            print(f"✗ {file_path.name}: No city found for {isil_code}")
            not_found.append((file_path.name, isil_code, 'scrape failed'))
            time.sleep(1)
            continue

        # Lookup in GeoNames
        geo_data = lookup_city(city, conn)
        if not geo_data:
            print(f"? {file_path.name}: {city} not in GeoNames")
            not_found.append((file_path.name, isil_code, city))
            time.sleep(1)
            continue

        if dry_run:
            print(f"✓ {file_path.name}: {isil_code} → {city} ({geo_data['name']})")
        else:
            if update_file(file_path, geo_data):
                print(f"✓ Updated: {file_path.name} → {geo_data['name']}")
                updated += 1

        time.sleep(1)  # Rate limit

    print(f"\n{'=' * 50}")
    print(f"Updated: {updated}")
    print(f"Not found: {len(not_found)}")

    if not_found:
        print("\nNot resolved:")
        for fname, isil, city in not_found:
            print(f" {fname}: {isil} → {city}")

    conn.close()


if __name__ == '__main__':
    main()
154
scripts/migrate_egyptian_from_ch.py
Normal file
154
scripts/migrate_egyptian_from_ch.py
Normal file
|
|
@ -0,0 +1,154 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Migrate Egyptian institutions incorrectly placed under CH (Switzerland) to EG (Egypt).
|
||||
"""
|
||||
|
||||
import re
|
||||
import sqlite3
|
||||
import unicodedata
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# Egyptian city mapping
# Maps city name -> ISO 3166-2:EG region code ('region') and the 3-letter
# city code ('city_code') used when rebuilding GHCIDs.
EGYPTIAN_CITIES = {
    'Cairo': {'region': 'C', 'city_code': 'CAI'},
    'Alexandria': {'region': 'ALX', 'city_code': 'ALX'},
    'Giza': {'region': 'GZ', 'city_code': 'GIZ'},
    'Assiut': {'region': 'AST', 'city_code': 'ASS'},
    'Helwan': {'region': 'C', 'city_code': 'HEL'},
    '6th of October City': {'region': 'GZ', 'city_code': 'OCT'},
    'Ain Shams': {'region': 'C', 'city_code': 'ASH'},
    'Maadi': {'region': 'C', 'city_code': 'MAA'},
    'New Cairo': {'region': 'C', 'city_code': 'NCA'},
}
||||
def extract_city_from_name(name):
    """Map an institution name to an Egyptian city via keyword matching.

    Rules are checked in order and the first hit wins; anything without a
    recognized keyword defaults to Cairo, where most Egyptian institutions
    without an explicit city are located.
    """
    lowered = name.lower()

    # Ordered (keywords, city) rules.
    rules = (
        (('cairo', 'ain shams', 'helwan'), 'Cairo'),
        (('alexandria',), 'Alexandria'),
        (('assiut', 'asyut'), 'Assiut'),
        (('giza', 'october'), 'Giza'),
        # Most Egyptian institutions without a city are in Cairo.
        (('nile', 'maadi'), 'Cairo'),
        # National institutions default to the capital.
        (('egypt', 'egyptian'), 'Cairo'),
    )
    for keywords, city in rules:
        if any(kw in lowered for kw in keywords):
            return city
    return 'Cairo'  # Default
||||
def update_file(file_path, city_name, dry_run=False):
    """Rewrite a CH-XX-XXX custodian file into the EG (Egypt) namespace.

    Replaces the GHCID prefix and every other occurrence of the GHCID,
    switches country codes to EG, fills the XX/XXX location placeholders,
    appends a ghcid_history entry, and renames the file.

    Args:
        file_path: Custodian YAML file (pathlib.Path).
        city_name: Egyptian city; unknown names fall back to Cairo's codes.
        dry_run: When True, return the planned GHCID change without writing.

    Returns:
        (True, (old_ghcid, new_ghcid)) on success, (False, None) when no
        ghcid_current line was found.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Unknown cities default to Cairo's region/city codes.
    city_info = EGYPTIAN_CITIES.get(city_name, {'region': 'C', 'city_code': 'CAI'})
    region_code = city_info['region']
    city_code = city_info['city_code']

    # Get current GHCID
    old_ghcid_match = re.search(r'ghcid_current:\s*([^\n]+)', content)
    if not old_ghcid_match:
        return False, None

    old_ghcid = old_ghcid_match.group(1).strip()

    # Create new GHCID with EG namespace
    new_ghcid = re.sub(r'^CH-XX-XXX-', f'EG-{region_code}-{city_code}-', old_ghcid)

    if dry_run:
        return True, (old_ghcid, new_ghcid)

    # NOTE(review): if old_ghcid does not start with CH-XX-XXX-, new_ghcid
    # equals old_ghcid and every replacement below is a no-op, yet a
    # history entry claiming migration is still appended — consider
    # returning early in that case.

    # Update all GHCID references
    content = content.replace(f'ghcid_current: {old_ghcid}', f'ghcid_current: {new_ghcid}')
    content = content.replace(f'ghcid_original: {old_ghcid}', f'ghcid_original: {new_ghcid}')
    content = content.replace(f'identifier_value: {old_ghcid}', f'identifier_value: {new_ghcid}')
    content = content.replace(f"ghcid: {old_ghcid}", f"ghcid: {new_ghcid}")

    # Update country code
    content = re.sub(r'country:\s*CH', 'country: EG', content)
    content = re.sub(r'country_code:\s*CH', 'country_code: EG', content)

    # Update location_resolution
    content = re.sub(r'region_code:\s*XX', f'region_code: {region_code}', content)
    content = re.sub(r'city_code:\s*XXX', f'city_code: {city_code}', content)

    # Add history entry
    timestamp = datetime.now(timezone.utc).isoformat()
    history_entry = f"""
  - ghcid: {new_ghcid}
    valid_from: '{timestamp}'
    reason: Migrated from CH to EG namespace - {city_name}"""

    history_match = re.search(r'(ghcid_history:\s*\n)', content)
    if history_match:
        insert_pos = history_match.end()
        content = content[:insert_pos] + history_entry + content[insert_pos:]

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

    # Rename file
    old_filename = file_path.name
    new_filename = old_filename.replace('CH-XX-XXX-', f'EG-{region_code}-{city_code}-')
    if new_filename != old_filename:
        new_path = file_path.parent / new_filename
        file_path.rename(new_path)

    return True, (old_ghcid, new_ghcid)
||||
def main():
    """Migrate Egyptian institutions mis-filed under CH (Switzerland) to EG.

    Scans CH-XX-XXX-*.yaml custodian files, keyword-matches their name
    against Egyptian institutions/cities, and rewrites matching files into
    the EG namespace (previewed only with --dry-run).
    """
    import sys
    dry_run = '--dry-run' in sys.argv

    # Repository layout: scripts/ lives one level under the project root.
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'

    print("Egyptian Institution Migration (CH → EG)")
    print("=" * 50)
    if dry_run:
        print("DRY RUN MODE\n")

    # Find CH-XX-XXX files that are actually Egyptian
    xxx_files = list(custodian_dir.glob('CH-XX-XXX-*.yaml'))
    print(f"Found {len(xxx_files)} CH-XX-XXX files\n")

    migrated = 0
    # Keywords that identify an Egyptian institution (cities, landmarks,
    # and well-known university acronyms: GUC, AUC, BUE).
    egyptian_keywords = ['egypt', 'cairo', 'alexandria', 'ain shams', 'helwan', 'assiut',
                         'giza', 'nile', 'al-azhar', 'dar al-kutub', 'guc', 'auc', 'bue']

    for file_path in xxx_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Check if this is an Egyptian institution.
        # NOTE(review): this matches the FIRST claim_value in the file —
        # confirm that is always the institution name.
        name_match = re.search(r'claim_value:\s*(.+)', content)
        if not name_match:
            continue

        inst_name = name_match.group(1).strip().lower()

        is_egyptian = any(keyword in inst_name for keyword in egyptian_keywords)
        if not is_egyptian:
            continue

        city = extract_city_from_name(inst_name)
        success, ghcid_change = update_file(file_path, city, dry_run)

        if success:
            if dry_run:
                print(f" {file_path.name}")
                print(f" → {ghcid_change[0]} → {ghcid_change[1]}")
            else:
                print(f"✓ Migrated: {file_path.name} → {city}")
                migrated += 1

    print(f"\n{'=' * 50}")
    print(f"Migrated: {migrated}")


if __name__ == '__main__':
    main()
||||
426
scripts/migrate_web_archives.py
Normal file
426
scripts/migrate_web_archives.py
Normal file
|
|
@ -0,0 +1,426 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Migrate web archives from /data/nde/enriched/entries/web/ to /data/custodian/{GHCID}/web/
|
||||
|
||||
This script:
|
||||
1. Builds a mapping from entry_index -> GHCID by scanning custodian files
|
||||
2. Moves (or symlinks) web archive folders to the appropriate custodian folder
|
||||
3. Creates a DuckDB database with web archive metadata for DuckLake ingestion
|
||||
|
||||
Usage:
|
||||
python scripts/migrate_web_archives.py --dry-run # Preview changes
|
||||
python scripts/migrate_web_archives.py --execute # Actually migrate
|
||||
python scripts/migrate_web_archives.py --build-ducklake # Create DuckDB tables
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import yaml
|
||||
import shutil
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, Optional, List, Any
|
||||
import json
|
||||
|
||||
# Try to import duckdb for DuckLake ingestion
|
||||
try:
|
||||
import duckdb
|
||||
HAS_DUCKDB = True
|
||||
except ImportError:
|
||||
HAS_DUCKDB = False
|
||||
print("Warning: duckdb not installed. DuckLake ingestion disabled.")
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Paths
# NOTE(review): hard-coded absolute path — breaks on any other machine;
# consider deriving from __file__ or an environment variable.
BASE_DIR = Path("/Users/kempersc/apps/glam")
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
WEB_ARCHIVE_SOURCE = BASE_DIR / "data" / "nde" / "enriched" / "entries" / "web"
DUCKLAKE_DB = BASE_DIR / "data" / "ducklake" / "web_archives.duckdb"
# Pre-built "entry_index GHCID" lines (one pair per line, space-separated).
MAPPING_FILE = WEB_ARCHIVE_SOURCE / "_entry_to_ghcid.txt"
||||
def build_entry_index_to_ghcid_mapping() -> Dict[int, str]:
    """
    Load mapping from pre-built file (created via ripgrep for speed).
    Falls back to scanning YAML files if file doesn't exist.

    Returns:
        Dict mapping entry_index (int) to GHCID (str, e.g., "NL-GE-GEN-S-HKG")
    """
    mapping: Dict[int, str] = {}

    # Fast path: pre-built "entry_index GHCID" lines, space-separated.
    if MAPPING_FILE.exists():
        logger.info(f"Loading mapping from {MAPPING_FILE}")
        with open(MAPPING_FILE, 'r') as f:
            for raw_line in f:
                fields = raw_line.strip().split(' ', 1)
                if len(fields) == 2 and fields[0].isdigit():
                    mapping[int(fields[0])] = fields[1]
        logger.info(f"Loaded {len(mapping)} entries from mapping file")
        return mapping

    # Slow path: scan every custodian YAML for an integer entry_index.
    logger.info("Mapping file not found, scanning custodian files...")
    custodian_files = list(CUSTODIAN_DIR.glob("*.yaml"))
    logger.info(f"Scanning {len(custodian_files)} custodian files...")

    for filepath in custodian_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if data and 'entry_index' in data:
                idx = data['entry_index']
                if isinstance(idx, int):
                    # The filename stem is the GHCID, e.g. "NL-GE-GEN-S-HKG".
                    mapping[idx] = filepath.stem
        except Exception as e:
            logger.debug(f"Error reading {filepath}: {e}")
            continue

    logger.info(f"Built mapping for {len(mapping)} entries with entry_index")
    return mapping
|
||||
def get_web_archive_folders() -> List[Path]:
    """Return web archive entry folders (all-digit names), sorted numerically."""
    entries = [p for p in WEB_ARCHIVE_SOURCE.iterdir()
               if p.is_dir() and p.name.isdigit()]
    return sorted(entries, key=lambda p: int(p.name))
||||
def parse_metadata(metadata_path: Path) -> Optional[Dict[str, Any]]:
    """Load a web archive's metadata.yaml.

    Returns the parsed document, or None (with an error logged) when the
    file is missing or is not valid YAML.
    """
    try:
        with open(metadata_path, 'r', encoding='utf-8') as fh:
            parsed = yaml.safe_load(fh)
    except Exception as e:
        logger.error(f"Failed to parse {metadata_path}: {e}")
        return None
    return parsed
||||
def migrate_web_archive(source_folder: Path, ghcid: str, dry_run: bool = True) -> bool:
    """
    Migrate a web archive folder to the custodian's web/ folder.

    Copies each domain subfolder into data/custodian/{ghcid}/web/.
    NOTE(review): this COPIES (shutil.copytree) and leaves the source in
    place, while the module docstring says "moves (or symlinks)" — confirm
    which is intended.

    Args:
        source_folder: Path to source web archive (e.g., .../web/0183/historischekringgente.nl/)
        ghcid: Target GHCID (e.g., "NL-GE-GEN-S-HKG")
        dry_run: If True, only preview changes

    Returns:
        True if successful (existing targets are skipped, not treated as
        failures); False when the source has no domain folders or a copy
        fails.
    """
    target_dir = CUSTODIAN_DIR / ghcid / "web"

    # Find domain subfolder(s) — each entry folder holds one folder per domain.
    domain_folders = [d for d in source_folder.iterdir() if d.is_dir()]

    if not domain_folders:
        logger.warning(f"No domain folders in {source_folder}")
        return False

    for domain_folder in domain_folders:
        domain_name = domain_folder.name
        target_path = target_dir / domain_name

        if dry_run:
            logger.info(f"[DRY-RUN] Would migrate: {domain_folder} -> {target_path}")
        else:
            try:
                target_dir.mkdir(parents=True, exist_ok=True)
                # Never overwrite an already-migrated archive.
                if target_path.exists():
                    logger.warning(f"Target already exists: {target_path}")
                    continue
                shutil.copytree(domain_folder, target_path)
                logger.info(f"Migrated: {domain_folder} -> {target_path}")
            except Exception as e:
                # Abort on the first copy failure.
                logger.error(f"Failed to migrate {domain_folder}: {e}")
                return False

    return True
|
||||
def build_ducklake_database(mapping: Dict[int, str]):
    """
    Create DuckDB database with web archive metadata for DuckLake.

    Tables:
    - web_archives: Archive metadata (ghcid, url, timestamp, stats)
    - web_pages: Individual pages with extraction counts
    - web_claims: Extracted claims/entities from annotations

    Args:
        mapping: Numeric web-archive folder index -> GHCID. Folders
            without a mapping entry are skipped.

    The database content is rebuilt from scratch on every run: existing
    rows are deleted before re-insertion.
    """
    if not HAS_DUCKDB:
        logger.error("DuckDB not installed. Cannot build DuckLake database.")
        return

    DUCKLAKE_DB.parent.mkdir(parents=True, exist_ok=True)

    con = duckdb.connect(str(DUCKLAKE_DB))

    # Create tables
    con.execute("""
        CREATE TABLE IF NOT EXISTS web_archives (
            ghcid VARCHAR PRIMARY KEY,
            entry_index INTEGER,
            domain VARCHAR,
            url VARCHAR,
            archive_timestamp TIMESTAMP,
            archive_method VARCHAR,
            total_pages INTEGER,
            processed_pages INTEGER,
            warc_file VARCHAR,
            warc_size_bytes BIGINT,
            has_annotations BOOLEAN DEFAULT FALSE
        )
    """)

    con.execute("""
        CREATE TABLE IF NOT EXISTS web_pages (
            id INTEGER PRIMARY KEY,
            ghcid VARCHAR,
            page_title VARCHAR,
            source_path VARCHAR,
            archived_file VARCHAR,
            extractions_count INTEGER,
            FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid)
        )
    """)

    con.execute("""
        CREATE TABLE IF NOT EXISTS web_claims (
            id INTEGER PRIMARY KEY,
            ghcid VARCHAR,
            claim_id VARCHAR,
            claim_type VARCHAR,
            text_content VARCHAR,
            hypernym VARCHAR,
            hyponym VARCHAR,
            class_uri VARCHAR,
            xpath VARCHAR,
            recognition_confidence FLOAT,
            linking_confidence FLOAT,
            wikidata_id VARCHAR,
            FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid)
        )
    """)

    # Clear existing data (children first, so FK references stay valid)
    con.execute("DELETE FROM web_claims")
    con.execute("DELETE FROM web_pages")
    con.execute("DELETE FROM web_archives")

    # Synthetic integer primary keys for pages and claims
    page_id = 0
    claim_id_counter = 0

    web_folders = get_web_archive_folders()
    logger.info(f"Processing {len(web_folders)} web archive folders for DuckLake...")

    for folder in web_folders:
        entry_index = int(folder.name)
        ghcid = mapping.get(entry_index)

        if not ghcid:
            logger.debug(f"No GHCID mapping for entry {entry_index}")
            continue

        # Find domain folder
        domain_folders = [d for d in folder.iterdir() if d.is_dir()]

        for domain_folder in domain_folders:
            metadata_path = domain_folder / "metadata.yaml"
            if not metadata_path.exists():
                continue

            metadata = parse_metadata(metadata_path)
            if not metadata:
                continue

            # Check for annotations
            annotations_path = domain_folder / "annotations_v1.7.0.yaml"
            has_annotations = annotations_path.exists()

            # Parse warc info
            warc_info = metadata.get('warc', {})

            # Insert archive record
            try:
                archive_ts = metadata.get('archive_timestamp')
                if archive_ts:
                    # Normalize a trailing 'Z' so fromisoformat accepts it
                    archive_ts = datetime.fromisoformat(archive_ts.replace('Z', '+00:00'))

                con.execute("""
                    INSERT INTO web_archives VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, [
                    ghcid,
                    entry_index,
                    domain_folder.name,
                    metadata.get('url'),
                    archive_ts,
                    metadata.get('archive_method'),
                    metadata.get('total_pages', 0),
                    metadata.get('processed_pages', 0),
                    warc_info.get('warc_file'),
                    warc_info.get('warc_size_bytes', 0),
                    has_annotations
                ])
            except Exception as e:
                # e.g. duplicate ghcid (PK) or unparsable timestamp:
                # skip this whole domain folder.
                logger.debug(f"Error inserting archive {ghcid}: {e}")
                continue

            # Insert pages
            for page in metadata.get('pages', []):
                page_id += 1
                try:
                    con.execute("""
                        INSERT INTO web_pages VALUES (?, ?, ?, ?, ?, ?)
                    """, [
                        page_id,
                        ghcid,
                        page.get('title'),
                        page.get('source_path'),
                        page.get('archived_file'),
                        page.get('extractions_count', 0)
                    ])
                except Exception as e:
                    logger.debug(f"Error inserting page: {e}")

            # Insert claims from annotations
            if has_annotations:
                try:
                    with open(annotations_path, 'r', encoding='utf-8') as f:
                        annotations = yaml.safe_load(f)

                    session = annotations.get('session', {})
                    claims = session.get('claims', {})

                    # Process entity claims
                    for claim in claims.get('entity', []):
                        claim_id_counter += 1
                        provenance = claim.get('provenance', {})
                        con.execute("""
                            INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        """, [
                            claim_id_counter,
                            ghcid,
                            claim.get('claim_id'),
                            claim.get('claim_type'),
                            claim.get('text_content'),
                            claim.get('hypernym'),
                            claim.get('hyponym'),
                            claim.get('class_uri'),
                            provenance.get('path'),
                            claim.get('recognition_confidence', 0),
                            claim.get('linking_confidence', 0),
                            claim.get('wikidata_id')
                        ])

                    # Process aggregate claims (no hypernym/hyponym/class_uri/
                    # wikidata_id; confidence comes from provenance instead)
                    for claim in claims.get('aggregate', []):
                        claim_id_counter += 1
                        provenance = claim.get('provenance', {})
                        con.execute("""
                            INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        """, [
                            claim_id_counter,
                            ghcid,
                            claim.get('claim_id'),
                            claim.get('claim_type'),
                            claim.get('text_content'),
                            None,
                            None,
                            None,
                            provenance.get('path'),
                            provenance.get('confidence', 0),
                            0,
                            None
                        ])
                except Exception as e:
                    logger.debug(f"Error processing annotations for {ghcid}: {e}")

    # Create indices
    con.execute("CREATE INDEX IF NOT EXISTS idx_pages_ghcid ON web_pages(ghcid)")
    con.execute("CREATE INDEX IF NOT EXISTS idx_claims_ghcid ON web_claims(ghcid)")
    con.execute("CREATE INDEX IF NOT EXISTS idx_claims_type ON web_claims(claim_type)")
    con.execute("CREATE INDEX IF NOT EXISTS idx_claims_hypernym ON web_claims(hypernym)")

    # Get stats
    archive_count = con.execute("SELECT COUNT(*) FROM web_archives").fetchone()[0]
    page_count = con.execute("SELECT COUNT(*) FROM web_pages").fetchone()[0]
    claim_count = con.execute("SELECT COUNT(*) FROM web_claims").fetchone()[0]

    con.close()

    logger.info(f"DuckLake database created at: {DUCKLAKE_DB}")
    logger.info(f" - Archives: {archive_count}")
    logger.info(f" - Pages: {page_count}")
    logger.info(f" - Claims: {claim_count}")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: build the mapping, then run one of four modes.

    Modes (at least one flag is required):
    --build-mapping   print a sample of the entry-index -> GHCID mapping
    --build-ducklake  build the DuckDB/DuckLake database only
    --dry-run         preview the migration without copying
    --execute         actually copy archives into custodian folders
    """
    parser = argparse.ArgumentParser(description="Migrate web archives to custodian folders")
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without executing')
    parser.add_argument('--execute', action='store_true', help='Actually migrate files')
    parser.add_argument('--build-ducklake', action='store_true', help='Build DuckDB database only')
    parser.add_argument('--build-mapping', action='store_true', help='Just build and show mapping')
    args = parser.parse_args()

    # No mode selected: show usage and exit non-zero.
    if not any([args.dry_run, args.execute, args.build_ducklake, args.build_mapping]):
        parser.print_help()
        sys.exit(1)

    # Build the mapping (entry index -> GHCID); needed by every mode.
    mapping = build_entry_index_to_ghcid_mapping()

    if args.build_mapping:
        print(f"\nMapping has {len(mapping)} entries")
        print("\nSample entries:")
        # NOTE(review): `idx` from enumerate is unused; only the first
        # 20 sorted entries are shown.
        for idx, (entry_idx, ghcid) in enumerate(sorted(mapping.items())[:20]):
            print(f" {entry_idx:04d} -> {ghcid}")
        return

    if args.build_ducklake:
        build_ducklake_database(mapping)
        return

    # Migration mode
    web_folders = get_web_archive_folders()
    logger.info(f"Found {len(web_folders)} web archive folders")

    migrated = 0
    skipped = 0
    no_mapping = 0

    for folder in web_folders:
        entry_index = int(folder.name)
        ghcid = mapping.get(entry_index)

        if not ghcid:
            logger.debug(f"No GHCID for entry {entry_index}")
            no_mapping += 1
            continue

        # dry_run unless --execute was passed explicitly.
        success = migrate_web_archive(folder, ghcid, dry_run=not args.execute)
        if success:
            migrated += 1
        else:
            skipped += 1

    print(f"\n{'[DRY-RUN] ' if args.dry_run else ''}Migration summary:")
    print(f" - Migrated: {migrated}")
    print(f" - Skipped: {skipped}")
    print(f" - No mapping: {no_mapping}")
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()
|
||||
301
scripts/resolve_cities_from_file_coords.py
Executable file
301
scripts/resolve_cities_from_file_coords.py
Executable file
|
|
@ -0,0 +1,301 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Resolve XXX city codes using coordinates already in the file (locations[].latitude/longitude).
|
||||
|
||||
This script handles files that already have coordinates but haven't been geocoded yet.
|
||||
|
||||
Following AGENTS.md Rules:
|
||||
- Rule 5: Additive only - never delete existing data
|
||||
- GHCID settlement standardization: GeoNames is authoritative
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import yaml
|
||||
import sqlite3
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
# GeoNames database
|
||||
GEONAMES_DB = Path(__file__).parent.parent / "data/reference/geonames.db"
|
||||
CUSTODIAN_DIR = Path(__file__).parent.parent / "data/custodian"
|
||||
|
||||
# Feature codes for proper settlements (EXCLUDE PPLX neighborhoods)
|
||||
SETTLEMENT_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
||||
|
||||
# Netherlands admin1 code mapping
|
||||
NL_ADMIN1_MAP = {
|
||||
'01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR', '05': 'LI',
|
||||
'06': 'NB', '07': 'NH', '09': 'UT', '10': 'ZE', '11': 'ZH',
|
||||
'15': 'OV', '16': 'FL'
|
||||
}
|
||||
|
||||
# Belgian admin2 to ISO mapping
|
||||
BE_ADMIN2_MAP = {
|
||||
'VAN': 'VAN', 'VBR': 'VBR', 'VLI': 'VLI', 'VOV': 'VOV', 'VWV': 'VWV',
|
||||
'WBR': 'WBR', 'WHT': 'WHT', 'WLG': 'WLG', 'WLX': 'WLX', 'WNA': 'WNA', 'BRU': 'BRU'
|
||||
}
|
||||
|
||||
|
||||
def generate_city_code(name: str) -> str:
    """Derive a short (2-4 letter) uppercase city code from a settlement name.

    Rules: single word -> first three letters; Dutch article + word ->
    article initial plus two letters of the main word; multi-word ->
    initials of up to the first three words. Returns 'XXX' when no
    letters survive cleaning.
    """
    import re
    import unicodedata

    # Strip diacritics: NFD decomposition, then drop combining marks.
    decomposed = unicodedata.normalize('NFD', name)
    plain = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Keep only ASCII letters, whitespace and hyphens before tokenising.
    cleaned = re.sub(r'[^a-zA-Z\s-]', '', plain)
    words = cleaned.split()

    if not words:
        return 'XXX'

    # Dutch definite articles commonly leading place names. (The "'s"
    # entry cannot match after apostrophes are stripped; bare "s" covers
    # that case when the article stands as its own word.)
    dutch_articles = {'de', 'het', 'den', "'s", 's'}

    if len(words) == 1:
        # Single word: take first 3 letters.
        return words[0][:3].upper()
    if words[0].lower() in dutch_articles:
        # Article + word: article initial + first 2 letters of main word.
        return (words[0][0] + words[1][:2]).upper()
    # Multi-word: initials of the first three words.
    return ''.join(w[0] for w in words[:3]).upper()
|
||||
|
||||
|
||||
def reverse_geocode(lat: float, lon: float, country: str, conn: sqlite3.Connection) -> Optional[Dict]:
    """Reverse geocode coordinates to the nearest GeoNames settlement.

    Only proper settlements are considered (SETTLEMENT_FEATURE_CODES;
    PPLX neighbourhoods excluded), restricted to the given country.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.
        country: ISO 3166-1 alpha-2 country code restricting the search.
        conn: Open connection to the GeoNames SQLite database.

    Returns:
        Dict with the nearest settlement's GeoNames fields, or None
        when the country has no matching settlements.
    """
    import math

    cursor = conn.cursor()

    # A degree of longitude spans cos(latitude) times the ground distance
    # of a degree of latitude. Weight the squared longitude delta
    # accordingly (equirectangular approximation); the previous plain
    # squared-degree metric over-weighted longitude and could pick the
    # wrong "nearest" city, increasingly so at higher latitudes.
    lon_weight = math.cos(math.radians(lat)) ** 2

    # SETTLEMENT_FEATURE_CODES is a module-level constant tuple, so
    # interpolating it into the SQL is safe (no user input involved).
    cursor.execute(f'''
        SELECT geonames_id, name, ascii_name, admin1_code, admin2_code,
               latitude, longitude, feature_code, population
        FROM cities
        WHERE country_code = ?
          AND feature_code IN {SETTLEMENT_FEATURE_CODES}
        ORDER BY ((latitude - ?) * (latitude - ?) + ? * (longitude - ?) * (longitude - ?))
        LIMIT 1
    ''', (country, lat, lat, lon_weight, lon, lon))

    row = cursor.fetchone()
    if not row:
        return None

    return {
        'geonames_id': row[0],
        'name': row[1],
        'ascii_name': row[2],
        'admin1_code': row[3],
        'admin2_code': row[4],
        'latitude': row[5],
        'longitude': row[6],
        'feature_code': row[7],
        'population': row[8],
    }
|
||||
|
||||
|
||||
def get_region_code(country: str, admin1_code: str, admin2_code: str) -> str:
    """Map GeoNames admin codes to an ISO 3166-2 style region code.

    NL uses the numeric admin1 -> province mapping; BE uses the admin2
    province mapping with admin1 as fallback; all other countries use
    admin1 directly. 'XX' marks an unresolvable region.
    """
    if country == 'NL':
        # Dutch provinces: numeric GeoNames admin1 codes -> ISO letters.
        return NL_ADMIN1_MAP.get(admin1_code, 'XX')
    if country == 'BE':
        # Belgian provinces are keyed on admin2; fall back to admin1.
        fallback = admin1_code if admin1_code else 'XX'
        return BE_ADMIN2_MAP.get(admin2_code, fallback)
    # Elsewhere the GeoNames admin1 code is used as-is when present.
    return admin1_code if admin1_code else 'XX'
|
||||
|
||||
|
||||
def find_coords_in_file(data: Dict) -> Optional[tuple]:
    """Find latitude/longitude in a custodian record.

    Checks original_entry.locations first, then top-level locations.

    Returns:
        (latitude, longitude, country) for the first entry carrying both
        coordinates, or None. The country falls back to the record's
        resolved country code (original_entry case) or 'XX'.
    """
    # Prefer coordinates recorded on the original source entry.
    if 'original_entry' in data:
        for entry in data['original_entry'].get('locations', []):
            if 'latitude' in entry and 'longitude' in entry:
                fallback = data.get('ghcid', {}).get('location_resolution', {}).get('country_code', 'XX')
                return (entry['latitude'], entry['longitude'], entry.get('country', fallback))

    # Otherwise look at top-level locations.
    for entry in data.get('locations', []):
        if 'latitude' in entry and 'longitude' in entry:
            return (entry['latitude'], entry['longitude'], entry.get('country', 'XX'))

    return None
|
||||
|
||||
|
||||
def process_file(filepath: Path, conn: sqlite3.Connection, apply: bool) -> bool:
    """Process a single file with XXX city code and coordinates.

    Reads the YAML record, reverse-geocodes the coordinates found in it,
    and (when `apply` is True) rewrites the GHCID, location_resolution,
    history, file content and filename in place.

    Args:
        filepath: Custodian YAML file whose GHCID contains an XXX city code.
        conn: Open connection to the GeoNames SQLite database.
        apply: When False, only report what would be resolved.

    Returns:
        True when the city was (or would be) resolved.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False

    if not data:
        return False

    # Get coordinates from file
    coords = find_coords_in_file(data)
    if not coords:
        return False

    lat, lon, country = coords
    print(f" Coords: {lat:.4f}, {lon:.4f} ({country})")

    # Reverse geocode
    city_data = reverse_geocode(lat, lon, country, conn)
    if not city_data:
        print(f" No GeoNames match for {country}")
        return False

    city_code = generate_city_code(city_data['ascii_name'])
    region_code = get_region_code(country, city_data['admin1_code'], city_data.get('admin2_code', ''))

    print(f" City: {city_data['name']} ({city_code}), Region: {region_code}")

    # Dry-run stops after reporting the match.
    if not apply:
        return True

    # Update GHCID
    ghcid = data.get('ghcid', {})
    current = ghcid.get('ghcid_current', '')

    # Parse current GHCID (expected shape: CC-REGION-CITY-TYPE-ORG...)
    parts = current.split('-')
    if len(parts) < 5:
        print(f" Invalid GHCID format: {current}")
        return False

    # Update city code (and region if still XX)
    old_region = parts[1]
    old_city = parts[2]

    if old_city != 'XXX':
        print(f" City already resolved: {old_city}")
        return False

    # Update parts
    if old_region == 'XX' and region_code != 'XX':
        parts[1] = region_code
    parts[2] = city_code

    new_ghcid = '-'.join(parts)

    # Update data (Rule 5: additive — existing keys are augmented, not removed)
    ghcid['ghcid_current'] = new_ghcid
    loc_res = ghcid.get('location_resolution', {})
    loc_res['city_code'] = city_code
    loc_res['city_name'] = city_data['name']
    loc_res['geonames_id'] = city_data['geonames_id']
    loc_res['feature_code'] = city_data['feature_code']
    if old_region == 'XX' and region_code != 'XX':
        loc_res['region_code'] = region_code
    loc_res['method'] = 'REVERSE_GEOCODE_FROM_FILE_COORDS'
    loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat()
    ghcid['location_resolution'] = loc_res

    # Add to history
    history = ghcid.get('ghcid_history', [])
    history.append({
        'ghcid': new_ghcid,
        'valid_from': datetime.now(timezone.utc).isoformat(),
        'reason': f'City resolved via reverse geocoding: XXX->{city_code} ({city_data["name"]})'
    })
    ghcid['ghcid_history'] = history
    data['ghcid'] = ghcid

    # Calculate new filename. The first replacement (region+city) only
    # survives when the region was XX and is now resolved; otherwise the
    # second assignment recomputes from old_name and replaces city only.
    old_name = filepath.name
    new_name = old_name.replace(f'{old_region}-XXX', f'{parts[1]}-{city_code}')
    if old_region != 'XX' or region_code == 'XX':
        new_name = old_name.replace('-XXX-', f'-{city_code}-')

    new_path = filepath.parent / new_name

    # Write and rename
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # NOTE(review): unlike the Wikidata resolver, this rename does not
    # check new_path.exists() first — on POSIX an existing target would
    # be silently overwritten. Confirm this is intended.
    if new_path != filepath:
        filepath.rename(new_path)
        print(f" Renamed: {old_name} -> {new_name}")

    return True
|
||||
|
||||
|
||||
def main():
    """CLI entry point: resolve XXX city codes from in-file coordinates.

    Scans CUSTODIAN_DIR for *-XXX-* YAML files, keeps those whose text
    mentions latitude/longitude, and runs process_file on up to --limit
    of them. Dry run by default; --apply writes and renames files.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Resolve XXX city codes using coordinates in files')
    parser.add_argument('--limit', type=int, default=100, help='Max files to process')
    parser.add_argument('--apply', action='store_true', help='Apply changes (default: dry run)')
    parser.add_argument('--country', help='Filter by country code')
    args = parser.parse_args()

    print("=" * 70)
    print("CITY RESOLUTION FROM FILE COORDINATES")
    print("=" * 70)
    print(f"Mode: {'APPLYING CHANGES' if args.apply else 'DRY RUN'}")
    print()

    # Connect to GeoNames
    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    conn = sqlite3.connect(str(GEONAMES_DB))

    # Find XXX files with coordinates
    xxx_files = []
    for f in CUSTODIAN_DIR.glob('*.yaml'):
        if '-XXX-' in f.name:
            # Country filter matches on the filename's leading country code.
            if args.country and not f.name.startswith(f'{args.country}-'):
                continue
            xxx_files.append(f)

    print(f"Found {len(xxx_files)} files with XXX codes")

    # Filter to files with coordinates. Cheap substring scan of the raw
    # text avoids a full YAML parse per candidate.
    files_with_coords = []
    for f in xxx_files:
        try:
            with open(f, 'r', encoding='utf-8') as fp:
                content = fp.read()
                if 'latitude:' in content and 'longitude:' in content:
                    files_with_coords.append(f)
        # NOTE(review): bare except silently skips unreadable files;
        # consider `except OSError` with a logged warning.
        except:
            pass

    print(f"Processing {min(len(files_with_coords), args.limit)} files with coordinates")
    print()

    resolved = 0
    renamed = 0

    for f in files_with_coords[:args.limit]:
        print(f"Processing {f.name}...")
        if process_file(f, conn, args.apply):
            resolved += 1
            # In apply mode a resolved file is counted as renamed too.
            if args.apply:
                renamed += 1

    conn.close()

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {min(len(files_with_coords), args.limit)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()
|
||||
317
scripts/resolve_cities_wikidata.py
Executable file
317
scripts/resolve_cities_wikidata.py
Executable file
|
|
@ -0,0 +1,317 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Resolve XXX city codes using Wikidata P159 (headquarters) or P625 (coordinates).
|
||||
|
||||
This script handles files with XXX city codes by:
|
||||
1. Getting Wikidata ID from the file
|
||||
2. Querying P625 (coordinates) or P159 (headquarters location)
|
||||
3. Reverse geocoding to GeoNames to find the nearest city
|
||||
|
||||
Following AGENTS.md Rules:
|
||||
- Rule 5: Additive only - never delete existing data
|
||||
- GHCID settlement standardization: GeoNames is authoritative
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import yaml
|
||||
import json
|
||||
import time
|
||||
import sqlite3
|
||||
import urllib.request
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, Tuple
|
||||
|
||||
# GeoNames database
|
||||
GEONAMES_DB = Path(__file__).parent.parent / "data/reference/geonames.db"
|
||||
|
||||
# Feature codes for proper settlements (EXCLUDE PPLX neighborhoods)
|
||||
SETTLEMENT_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
||||
|
||||
|
||||
def get_wikidata_location(wikidata_id: str) -> Optional[Tuple[float, float]]:
    """Resolve a Wikidata entity to (latitude, longitude).

    Tries the entity's own coordinate location (P625) first; failing
    that, follows the headquarters location (P159) and uses that
    entity's P625. Returns None when neither yields coordinates or on
    any request/parse failure (errors are printed, not raised).
    """
    headers = {'User-Agent': 'GLAM-Extractor/1.0 (heritage research project)'}
    url = f'https://www.wikidata.org/w/api.php?action=wbgetentities&ids={wikidata_id}&props=claims&format=json'

    try:
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = json.loads(response.read().decode('utf-8'))

        claims = payload['entities'][wikidata_id]['claims']

        # Prefer the entity's own coordinate location (P625).
        if 'P625' in claims:
            value = claims['P625'][0]['mainsnak']['datavalue']['value']
            return (value['latitude'], value['longitude'])

        # Fall back to the headquarters entity (P159) and read its P625.
        if 'P159' in claims:
            hq_id = claims['P159'][0]['mainsnak']['datavalue']['value']['id']
            time.sleep(0.5)  # Rate limiting

            hq_url = f'https://www.wikidata.org/w/api.php?action=wbgetentities&ids={hq_id}&props=claims&format=json'
            hq_request = urllib.request.Request(hq_url, headers=headers)
            with urllib.request.urlopen(hq_request, timeout=30) as hq_response:
                hq_payload = json.loads(hq_response.read().decode('utf-8'))

            hq_claims = hq_payload['entities'][hq_id]['claims']
            if 'P625' in hq_claims:
                value = hq_claims['P625'][0]['mainsnak']['datavalue']['value']
                return (value['latitude'], value['longitude'])

        return None
    except Exception as e:
        print(f" Error fetching Wikidata {wikidata_id}: {e}")
        return None
|
||||
|
||||
|
||||
def reverse_geocode(lat: float, lon: float, country: str, conn: sqlite3.Connection) -> Optional[Dict]:
    """Reverse geocode coordinates to the nearest GeoNames settlement.

    Only proper settlements are considered (SETTLEMENT_FEATURE_CODES;
    PPLX neighbourhoods excluded), restricted to the given country.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.
        country: ISO 3166-1 alpha-2 country code restricting the search.
        conn: Open connection to the GeoNames SQLite database.

    Returns:
        Dict with the nearest settlement's GeoNames fields plus its
        squared (weighted) degree distance, or None when nothing matches.
    """
    import math

    cursor = conn.cursor()

    # A degree of longitude spans cos(latitude) times the ground distance
    # of a degree of latitude. Weight the squared longitude delta
    # accordingly (equirectangular approximation); the previous plain
    # squared-degree metric over-weighted longitude and could pick the
    # wrong "nearest" city, increasingly so at higher latitudes.
    lon_weight = math.cos(math.radians(lat)) ** 2

    # SETTLEMENT_FEATURE_CODES is a module-level constant tuple, so
    # interpolating it into the SQL is safe (no user input involved).
    cursor.execute(f'''
        SELECT geonames_id, name, ascii_name, admin1_code, admin2_code,
               latitude, longitude, feature_code, population,
               ((latitude - ?) * (latitude - ?) + ? * (longitude - ?) * (longitude - ?)) as distance_sq
        FROM cities
        WHERE country_code = ?
          AND feature_code IN {SETTLEMENT_FEATURE_CODES}
        ORDER BY distance_sq
        LIMIT 1
    ''', (lat, lat, lon_weight, lon, lon, country))

    row = cursor.fetchone()
    if not row:
        return None

    return {
        'geonames_id': row[0],
        'name': row[1],
        'ascii_name': row[2],
        'admin1_code': row[3],
        'admin2_code': row[4],
        'latitude': row[5],
        'longitude': row[6],
        'feature_code': row[7],
        'population': row[8],
        'distance_sq': row[9],
    }
|
||||
|
||||
|
||||
def generate_city_code(city_name: str) -> str:
    """Build a 3-letter uppercase city code from a city name.

    Single-word names use their first three characters; multi-word
    names use up to three word initials.
    """
    tokens = city_name.split()
    if len(tokens) == 1:
        # One word: first three characters of the raw name.
        return city_name[:3].upper()
    # Several (or zero) words: concatenate initials, capped at three.
    return ''.join(token[0] for token in tokens if token)[:3].upper()
|
||||
|
||||
|
||||
def process_file(filepath: Path, conn: sqlite3.Connection, dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """Process a single file to resolve an XXX city code via Wikidata.

    Looks up the record's Wikidata ID, fetches coordinates (P625/P159),
    reverse-geocodes them against GeoNames, and updates the GHCID,
    location_resolution, history, provenance notes, file content and
    filename (unless dry_run).

    Args:
        filepath: Custodian YAML file to process.
        conn: Open connection to the GeoNames SQLite database.
        dry_run: When True (default), compute but do not write/rename.

    Returns:
        (resolved, new_path) — new_path is the renamed file path, or
        None when the filename did not change.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False, None

    # Check if has XXX city code
    # NOTE(review): yaml.safe_load returns None for an empty file, which
    # would raise AttributeError on data.get below — confirm inputs are
    # always non-empty YAML mappings.
    ghcid = data.get('ghcid', {})
    loc_res = ghcid.get('location_resolution', {})

    if loc_res.get('city_code', '') != 'XXX':
        return False, None

    country = loc_res.get('country_code', '')
    if not country:
        return False, None

    # Get Wikidata ID (original_entry takes precedence over enrichment)
    wikidata_id = None
    if 'original_entry' in data and 'wikidata_id' in data['original_entry']:
        wikidata_id = data['original_entry']['wikidata_id']
    elif 'wikidata_enrichment' in data and 'wikidata_entity_id' in data['wikidata_enrichment']:
        wikidata_id = data['wikidata_enrichment']['wikidata_entity_id']

    if not wikidata_id:
        return False, None

    # Get coordinates from Wikidata (network call)
    coords = get_wikidata_location(wikidata_id)
    if not coords:
        print(f" No coordinates for {wikidata_id}")
        return False, None

    lat, lon = coords
    print(f" Coords: {lat:.4f}, {lon:.4f}")

    # Reverse geocode
    city_data = reverse_geocode(lat, lon, country, conn)
    if not city_data:
        print(f" No GeoNames match in {country}")
        return False, None

    city_name = city_data['ascii_name'] or city_data['name']
    city_code = generate_city_code(city_name)

    print(f" City: {city_name} ({city_code})")

    # Update file (Rule 5: additive — existing keys are augmented)
    # NOTE(review): old_city_code is assigned but never used.
    old_city_code = loc_res.get('city_code', 'XXX')
    loc_res['city_code'] = city_code
    loc_res['city_label'] = city_name
    loc_res['geonames_id'] = city_data['geonames_id']
    loc_res['method'] = 'WIKIDATA_COORDS_REVERSE_GEOCODE'
    loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat()

    # Update GHCID string
    old_ghcid = ghcid.get('ghcid_current', '')
    new_ghcid = old_ghcid.replace(f'-XXX-', f'-{city_code}-')
    ghcid['ghcid_current'] = new_ghcid

    # Add to history
    if 'ghcid_history' not in ghcid:
        ghcid['ghcid_history'] = []
    ghcid['ghcid_history'].append({
        'ghcid': new_ghcid,
        'valid_from': datetime.now(timezone.utc).isoformat(),
        'reason': f"City resolved via Wikidata {wikidata_id} coordinates: XXX->{city_code} ({city_name})"
    })

    # Add provenance note; normalize a legacy string-valued `notes`
    # field into a list before appending.
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    elif isinstance(data['provenance']['notes'], str):
        data['provenance']['notes'] = [data['provenance']['notes']]

    data['provenance']['notes'].append(
        f"City resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XXX->{city_code} via Wikidata {wikidata_id} coords ({lat:.4f},{lon:.4f}) -> {city_name} (GeoNames:{city_data['geonames_id']})"
    )

    # Determine new filename
    new_filename = filepath.name.replace(f'-XXX-', f'-{city_code}-')
    new_filepath = filepath.parent / new_filename

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Never overwrite an existing file on rename.
        if new_filepath != filepath and not new_filepath.exists():
            filepath.rename(new_filepath)

    return True, new_filepath if new_filepath != filepath else None
|
||||
|
||||
|
||||
def main():
    """CLI entry point: resolve XXX city codes via Wikidata coordinates.

    Collects up to --limit *-XXX-* YAML files that carry a Wikidata ID,
    then resolves each with process_file. Dry run unless --apply is
    passed; calls are rate-limited with a 0.5s sleep between files.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Resolve XXX city codes using Wikidata coordinates')
    parser.add_argument('--apply', action='store_true', help='Actually apply the fixes')
    parser.add_argument('--path', type=str, default='data/custodian', help='Path to custodian files')
    parser.add_argument('--limit', type=int, default=50, help='Limit number of files to process')
    parser.add_argument('--country', type=str, help='Only process files for a specific country')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    # Connect to GeoNames
    if not GEONAMES_DB.exists():
        print(f"Error: GeoNames database not found at {GEONAMES_DB}")
        sys.exit(1)

    conn = sqlite3.connect(GEONAMES_DB)
    dry_run = not args.apply

    print("=" * 70)
    print("WIKIDATA COORDINATES CITY RESOLUTION")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Find files with XXX city codes
    files_to_process = list(custodian_dir.glob('*-XXX-*.yaml'))
    print(f"Found {len(files_to_process)} files with XXX codes")

    # Filter and collect files with Wikidata IDs (pre-scan so the
    # --limit applies to resolvable files, not raw candidates)
    file_data = []
    for filepath in files_to_process:
        if len(file_data) >= args.limit:
            break

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            country = data.get('ghcid', {}).get('location_resolution', {}).get('country_code', '')
            if args.country and country != args.country:
                continue

            # Check for Wikidata ID (same precedence as process_file)
            wikidata_id = None
            if 'original_entry' in data and 'wikidata_id' in data['original_entry']:
                wikidata_id = data['original_entry']['wikidata_id']
            elif 'wikidata_enrichment' in data and 'wikidata_entity_id' in data['wikidata_enrichment']:
                wikidata_id = data['wikidata_enrichment']['wikidata_entity_id']

            if not wikidata_id:
                continue

            file_data.append({
                'filepath': filepath,
                'wikidata_id': wikidata_id,
                'country': country,
            })
        # NOTE(review): unreadable/unparsable files are silently skipped.
        except Exception:
            pass

    print(f"Processing {len(file_data)} files with Wikidata IDs")
    print()

    resolved = 0
    renamed = 0

    for f in file_data:
        filepath = f['filepath']
        print(f"Processing {filepath.name}...")
        print(f" Wikidata: {f['wikidata_id']}")

        success, new_path = process_file(filepath, conn, dry_run=dry_run)

        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f" Renamed: {filepath.name} -> {new_path.name}")

        time.sleep(0.5)  # Rate limiting between Wikidata calls

    conn.close()

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()
|
||||
472
scripts/resolve_country_codes.py
Normal file
472
scripts/resolve_country_codes.py
Normal file
|
|
@ -0,0 +1,472 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Resolve XX country codes using Wikidata P17 (country) lookup.
|
||||
|
||||
This script:
|
||||
1. Finds files with XX country code
|
||||
2. Extracts Wikidata IDs from the files
|
||||
3. Queries Wikidata P17 to get country
|
||||
4. Updates files with resolved country code
|
||||
5. Renames files to match new GHCID
|
||||
|
||||
Following AGENTS.md Rules:
|
||||
- Rule 5: Additive only - never delete existing data
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import yaml
|
||||
import json
|
||||
import re
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List, Tuple
|
||||
|
||||
|
||||
# Wikidata entity ID to ISO 3166-1 alpha-2 country code mapping
|
||||
WIKIDATA_COUNTRY_TO_ISO = {
|
||||
'Q213': 'CZ', # Czechia
|
||||
'Q40': 'AT', # Austria
|
||||
'Q183': 'DE', # Germany
|
||||
'Q36': 'PL', # Poland
|
||||
'Q39': 'CH', # Switzerland
|
||||
'Q31': 'BE', # Belgium
|
||||
'Q142': 'FR', # France
|
||||
'Q145': 'GB', # United Kingdom
|
||||
'Q38': 'IT', # Italy
|
||||
'Q29': 'ES', # Spain
|
||||
'Q55': 'NL', # Netherlands
|
||||
'Q30': 'US', # United States
|
||||
'Q17': 'JP', # Japan
|
||||
'Q884': 'KR', # South Korea
|
||||
'Q148': 'CN', # China
|
||||
'Q668': 'IN', # India
|
||||
'Q155': 'BR', # Brazil
|
||||
'Q96': 'MX', # Mexico
|
||||
'Q414': 'AR', # Argentina
|
||||
'Q298': 'CL', # Chile
|
||||
'Q45': 'PT', # Portugal
|
||||
'Q27': 'IE', # Ireland
|
||||
'Q20': 'NO', # Norway
|
||||
'Q35': 'DK', # Denmark
|
||||
'Q34': 'SE', # Sweden
|
||||
'Q33': 'FI', # Finland
|
||||
'Q211': 'LV', # Latvia
|
||||
'Q37': 'LT', # Lithuania
|
||||
'Q191': 'EE', # Estonia
|
||||
'Q159': 'RU', # Russia
|
||||
'Q212': 'UA', # Ukraine
|
||||
'Q184': 'BY', # Belarus
|
||||
'Q219': 'BG', # Bulgaria
|
||||
'Q218': 'RO', # Romania
|
||||
'Q28': 'HU', # Hungary
|
||||
'Q214': 'SK', # Slovakia
|
||||
'Q215': 'SI', # Slovenia
|
||||
'Q224': 'HR', # Croatia
|
||||
'Q225': 'BA', # Bosnia and Herzegovina
|
||||
'Q117': 'GH', # Ghana
|
||||
'Q115': 'ET', # Ethiopia
|
||||
'Q1033': 'NG', # Nigeria
|
||||
'Q258': 'ZA', # South Africa
|
||||
'Q916': 'AO', # Angola
|
||||
'Q1008': 'CI', # Ivory Coast
|
||||
'Q114': 'KE', # Kenya
|
||||
'Q1044': 'SN', # Senegal
|
||||
'Q262': 'DZ', # Algeria
|
||||
'Q1028': 'MA', # Morocco
|
||||
'Q948': 'TN', # Tunisia
|
||||
'Q79': 'EG', # Egypt
|
||||
'Q1030': 'LY', # Libya
|
||||
'Q265': 'UZ', # Uzbekistan
|
||||
'Q232': 'KZ', # Kazakhstan
|
||||
'Q863': 'TJ', # Tajikistan
|
||||
'Q874': 'TM', # Turkmenistan
|
||||
'Q813': 'KG', # Kyrgyzstan
|
||||
'Q889': 'AF', # Afghanistan
|
||||
'Q794': 'IR', # Iran
|
||||
'Q796': 'IQ', # Iraq
|
||||
'Q858': 'SY', # Syria
|
||||
'Q801': 'IL', # Israel
|
||||
'Q810': 'JO', # Jordan
|
||||
'Q822': 'LB', # Lebanon
|
||||
'Q846': 'QA', # Qatar
|
||||
'Q878': 'AE', # United Arab Emirates
|
||||
'Q851': 'SA', # Saudi Arabia
|
||||
'Q805': 'YE', # Yemen
|
||||
'Q842': 'OM', # Oman
|
||||
'Q398': 'BH', # Bahrain
|
||||
'Q817': 'KW', # Kuwait
|
||||
'Q16': 'CA', # Canada
|
||||
'Q408': 'AU', # Australia
|
||||
'Q664': 'NZ', # New Zealand
|
||||
'Q869': 'TH', # Thailand
|
||||
'Q881': 'VN', # Vietnam
|
||||
'Q928': 'PH', # Philippines
|
||||
'Q252': 'ID', # Indonesia
|
||||
'Q833': 'MY', # Malaysia
|
||||
'Q334': 'SG', # Singapore
|
||||
'Q836': 'MM', # Myanmar
|
||||
'Q424': 'KH', # Cambodia
|
||||
'Q819': 'LA', # Laos
|
||||
'Q865': 'TW', # Taiwan
|
||||
'Q921': 'BN', # Brunei
|
||||
'Q399': 'AM', # Armenia
|
||||
'Q230': 'GE', # Georgia
|
||||
'Q227': 'AZ', # Azerbaijan
|
||||
'Q217': 'MD', # Moldova
|
||||
'Q229': 'CY', # Cyprus
|
||||
'Q41': 'GR', # Greece
|
||||
'Q43': 'TR', # Turkey
|
||||
'Q221': 'MK', # North Macedonia
|
||||
'Q222': 'AL', # Albania
|
||||
'Q403': 'RS', # Serbia
|
||||
'Q236': 'ME', # Montenegro
|
||||
'Q23635': 'XK', # Kosovo
|
||||
'Q347': 'LI', # Liechtenstein
|
||||
'Q32': 'LU', # Luxembourg
|
||||
'Q235': 'MC', # Monaco
|
||||
'Q238': 'SM', # San Marino
|
||||
'Q237': 'VA', # Vatican City
|
||||
'Q228': 'AD', # Andorra
|
||||
'Q233': 'MT', # Malta
|
||||
'Q189': 'IS', # Iceland
|
||||
'Q219060': 'PS', # Palestine
|
||||
# Add more as needed
|
||||
}
|
||||
|
||||
|
||||
def extract_wikidata_ids(data: Dict[str, Any]) -> List[str]:
    """Extract all Wikidata entity IDs (QIDs) from custodian data.

    Scans three locations, in order:
      1. the top-level ``identifiers`` array,
      2. ``original_entry.identifiers``,
      3. ``wikidata_enrichment.wikidata_entity_id``.

    Args:
        data: Parsed custodian YAML document.

    Returns:
        Ordered list of unique QIDs (values starting with 'Q').
    """
    wikidata_ids: List[str] = []

    def _add(value: str) -> None:
        # Accept only Q-prefixed entity IDs; dedupe while preserving order.
        # (Previously the top-level identifiers loop skipped the duplicate
        # check, so the same QID could appear twice in the result.)
        if value.startswith('Q') and value not in wikidata_ids:
            wikidata_ids.append(value)

    # Top-level identifiers array.
    for ident in (data.get('identifiers') or []):
        if ident.get('identifier_scheme') == 'Wikidata':
            _add(ident.get('identifier_value', ''))

    # original_entry.identifiers — guard against an explicit null in YAML.
    for ident in ((data.get('original_entry') or {}).get('identifiers') or []):
        if ident.get('identifier_scheme') == 'Wikidata':
            _add(ident.get('identifier_value', ''))

    # wikidata_enrichment block, if present.
    if 'wikidata_enrichment' in data:
        _add(data['wikidata_enrichment'].get('wikidata_entity_id', ''))

    return wikidata_ids
|
||||
|
||||
|
||||
def query_wikidata_countries(wikidata_ids: List[str]) -> Dict[str, str]:
    """Query Wikidata for P17 (country) in batch.

    Args:
        wikidata_ids: QIDs (e.g. 'Q213') to look up.

    Returns:
        Mapping of QID -> ISO 3166-1 alpha-2 country code. Entities whose
        country is not listed in WIKIDATA_COUNTRY_TO_ISO are silently
        omitted; on any query failure an empty dict is returned.
    """
    if not wikidata_ids:
        return {}

    # Build a single SPARQL VALUES clause so all QIDs go in one request.
    values = ' '.join([f'wd:{qid}' for qid in wikidata_ids])

    query = f"""
    SELECT ?item ?country WHERE {{
      VALUES ?item {{ {values} }}
      ?item wdt:P17 ?country.
    }}
    """

    url = "https://query.wikidata.org/sparql"
    headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
    }

    # POST the query as a urlencoded body (avoids URL-length limits for
    # large VALUES lists).
    data = urllib.parse.urlencode({'query': query}).encode('utf-8')

    try:
        request = urllib.request.Request(url, data=data, headers=headers)
        with urllib.request.urlopen(request, timeout=60) as response:
            result = json.loads(response.read().decode('utf-8'))
        bindings = result.get('results', {}).get('bindings', [])
    except Exception as e:
        # Best-effort: report and return empty so the caller can continue.
        print(f" Wikidata SPARQL error: {e}")
        return {}

    country_map = {}
    for row in bindings:
        item_uri = row.get('item', {}).get('value', '')
        country_uri = row.get('country', {}).get('value', '')

        if item_uri and country_uri:
            # URIs look like http://www.wikidata.org/entity/Q213 —
            # the QID is the last path segment.
            qid = item_uri.split('/')[-1]
            country_qid = country_uri.split('/')[-1]

            # Keep only countries we can map to an ISO alpha-2 code.
            if country_qid in WIKIDATA_COUNTRY_TO_ISO:
                country_map[qid] = WIKIDATA_COUNTRY_TO_ISO[country_qid]

    return country_map
|
||||
|
||||
|
||||
def update_custodian_file(filepath: Path, country_code: str, dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """Update a custodian file with resolved country code.

    Rewrites the GHCID prefix (XX-XX-... -> {country_code}-XX-...), records
    the resolution method/timestamp, appends a ghcid_history entry and a
    provenance note (additive only), and renames the file to match.

    Args:
        filepath: Path to the custodian YAML file.
        country_code: Resolved ISO 3166-1 alpha-2 code.
        dry_run: When True (default), compute everything but write nothing.

    Returns:
        (success, new_path): new_path is the renamed file path, or None
        when the filename did not change.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False, None

    if 'ghcid' not in data:
        return False, None

    ghcid = data['ghcid']
    if 'location_resolution' not in ghcid:
        ghcid['location_resolution'] = {}

    loc_res = ghcid['location_resolution']

    # Only touch files still carrying the XX placeholder country code.
    old_country = loc_res.get('country_code', 'XX')
    if old_country != 'XX':
        return False, None

    # Update country code and record how/when it was resolved.
    loc_res['country_code'] = country_code
    loc_res['method'] = 'WIKIDATA_P17'
    loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat()

    # Update GHCID string (XX-XX- prefix -> {country}-XX-; region stays XX).
    old_ghcid = ghcid.get('ghcid_current', '')
    new_ghcid = old_ghcid.replace('XX-XX-', f'{country_code}-XX-')

    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid

        # Append to history rather than overwrite (additive-only rule).
        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []

        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'reason': f"Country resolved via Wikidata P17: XX→{country_code}"
        })

    # Add provenance note; normalize a legacy scalar `notes` into a list.
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    elif isinstance(data['provenance']['notes'], str):
        data['provenance']['notes'] = [data['provenance']['notes']]

    data['provenance']['notes'].append(
        f"Country resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX→{country_code} via Wikidata P17"
    )

    # Determine new filename (mirrors the GHCID prefix change).
    old_filename = filepath.name
    new_filename = old_filename.replace('XX-XX-', f'{country_code}-XX-')
    new_filepath = filepath.parent / new_filename

    if not dry_run:
        # Write updated file in place first, then rename.
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Rename only if the target does not already exist (avoid clobbering).
        if new_filepath != filepath and not new_filepath.exists():
            filepath.rename(new_filepath)

    return True, new_filepath if new_filepath != filepath else None
|
||||
|
||||
|
||||
def main():
    """Main entry point.

    Resolves XX country codes in custodian YAML files:
      1. glob XX-*.yaml (up to --limit),
      2. extract Wikidata QIDs from each file,
      3. batch-query Wikidata P17 for countries,
      4. fall back to inferring the country from the source-file name,
      5. update + rename files (dry run unless --apply).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Resolve XX country codes using Wikidata P17 lookup'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("COUNTRY CODE RESOLUTION VIA WIKIDATA P17")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Find files with XX country code (filename prefix encodes the country).
    files_to_process = list(custodian_dir.glob('XX-*.yaml'))[:args.limit]

    print(f"Found {len(files_to_process)} files with XX country code")
    print()

    # Load files and extract Wikidata IDs.
    file_data = []
    for filepath in files_to_process:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            wikidata_ids = extract_wikidata_ids(data)

            file_data.append({
                'filepath': filepath,
                'data': data,
                'wikidata_ids': wikidata_ids
            })
        except Exception as e:
            print(f"Error loading {filepath}: {e}")

    print(f"Loaded {len(file_data)} files")

    # Split into files that can use the Wikidata path vs. the fallback path.
    with_wikidata = [f for f in file_data if f['wikidata_ids']]
    without_wikidata = [f for f in file_data if not f['wikidata_ids']]

    print(f" With Wikidata IDs: {len(with_wikidata)}")
    print(f" Without Wikidata IDs: {len(without_wikidata)}")
    print()

    # Collect the distinct QIDs across all files for batch querying.
    all_wikidata_ids = []
    for f in with_wikidata:
        all_wikidata_ids.extend(f['wikidata_ids'])
    all_wikidata_ids = list(set(all_wikidata_ids))

    print(f"Querying Wikidata for {len(all_wikidata_ids)} entities...")

    # Batch in groups of 50 to keep individual SPARQL queries small.
    all_countries = {}
    for i in range(0, len(all_wikidata_ids), 50):
        batch = all_wikidata_ids[i:i+50]
        countries = query_wikidata_countries(batch)
        all_countries.update(countries)
        if i + 50 < len(all_wikidata_ids):
            import time
            time.sleep(1)  # Rate limiting

    print(f" Retrieved country for {len(all_countries)} entities")
    print()

    # Process files.
    resolved = 0
    renamed = 0
    no_country = []

    # First process files with Wikidata IDs.
    for f in with_wikidata:
        filepath = f['filepath']
        wikidata_ids = f['wikidata_ids']

        # Use the first QID for which a country was resolved.
        country_code = None
        for wid in wikidata_ids:
            if wid in all_countries:
                country_code = all_countries[wid]
                break

        if not country_code:
            no_country.append(filepath.name)
            continue

        # Update (and possibly rename) the file.
        success, new_path = update_custodian_file(filepath, country_code, dry_run=dry_run)

        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f" {filepath.name} → {new_path.name}")
            else:
                print(f" Updated: {filepath.name}")

    # Now process files without Wikidata IDs using source-based inference:
    # the name of the file the record was ingested from often encodes the
    # country (e.g. "czech_libraries.csv").
    source_resolved = 0
    for f in without_wikidata:
        filepath = f['filepath']
        data = f['data']

        # Try to infer country from source file.
        country_code = None
        source = data.get('original_entry', {}).get('source', '')

        # Czech source patterns
        if 'czech' in source.lower() or 'cz_' in source.lower():
            country_code = 'CZ'
        # Austrian source patterns
        elif 'austria' in source.lower() or 'at_' in source.lower():
            country_code = 'AT'
        # German source patterns
        elif 'german' in source.lower() or 'de_' in source.lower():
            country_code = 'DE'
        # Swiss source patterns
        elif 'swiss' in source.lower() or 'switzerland' in source.lower() or 'ch_' in source.lower():
            country_code = 'CH'
        # Belgian source patterns
        elif 'belgium' in source.lower() or 'belgian' in source.lower() or 'be_' in source.lower():
            country_code = 'BE'
        # Dutch source patterns
        elif 'dutch' in source.lower() or 'netherlands' in source.lower() or 'nl_' in source.lower():
            country_code = 'NL'
        # Japanese source patterns
        elif 'japan' in source.lower() or 'jp_' in source.lower():
            country_code = 'JP'

        if country_code:
            success, new_path = update_custodian_file(filepath, country_code, dry_run=dry_run)
            if success:
                source_resolved += 1
                resolved += 1
                if new_path:
                    renamed += 1
                    print(f" [source-inferred] {filepath.name} → {new_path.name}")
        else:
            no_country.append(filepath.name)

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"With Wikidata IDs: {len(with_wikidata)}")
    print(f"Source-inferred: {source_resolved}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"No country found: {len(no_country)}")
    print(f"Without Wikidata IDs: {len(without_wikidata)}")

    # Only enumerate the unresolved files when the list is short enough
    # to be useful on screen.
    if no_country and len(no_country) <= 20:
        print()
        print("Files without country resolution:")
        for name in no_country:
            print(f" - {name}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")
|
||||
269
scripts/resolve_cz_xx_regions.py
Normal file
269
scripts/resolve_cz_xx_regions.py
Normal file
|
|
@ -0,0 +1,269 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Resolve CZ-XX (unknown region) files to proper ISO 3166-2:CZ region codes.
|
||||
|
||||
This script updates 36 Czech institution files that have placeholder XX region codes
|
||||
to their correct ISO 3166-2:CZ region codes based on researched location data.
|
||||
|
||||
Research completed 2025-12-07 via GeoNames database and web searches.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import yaml
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# GeoNames Admin1 → ISO 3166-2:CZ region code mapping
|
||||
ADMIN1_TO_ISO = {
|
||||
'52': '10', # Prague
|
||||
'78': '64', # South Moravian (Jihomoravský)
|
||||
'79': '31', # South Bohemian (Jihočeský)
|
||||
'80': '63', # Vysočina
|
||||
'81': '41', # Karlovy Vary
|
||||
'82': '52', # Hradec Králové
|
||||
'83': '51', # Liberec
|
||||
'84': '71', # Olomouc
|
||||
'85': '80', # Moravian-Silesian (Moravskoslezský)
|
||||
'86': '53', # Pardubice
|
||||
'87': '32', # Plzeň
|
||||
'88': '20', # Central Bohemian (Středočeský)
|
||||
'89': '42', # Ústí nad Labem
|
||||
'90': '72', # Zlín
|
||||
}
|
||||
|
||||
# Research results: mapping from old filename suffix to resolution data
|
||||
# Format: (new_region_code, new_city_code, city_name, geonames_id, admin1_code)
|
||||
RESOLUTIONS = {
|
||||
# Archives (A)
|
||||
'A-SAČTÚ': ('10', 'PRA', 'Prague', 3067696, '52'),
|
||||
'A-SAČÚZK': ('10', 'PRA', 'Prague', 3067696, '52'),
|
||||
'A-SAERÚ': ('63', 'JIH', 'Jihlava', 3074199, '80'),
|
||||
'A-SAÚPOHS': ('64', 'BRN', 'Brno', 3078610, '78'),
|
||||
'A-BSS': ('51', 'MAS', 'Malá Skála', 3071223, '83'),
|
||||
'A-PSJAK': ('53', 'BNO', 'Brandýs nad Orlicí', 3078836, '86'),
|
||||
'A-ZI': ('10', 'PRA', 'Prague', 3067696, '52'), # Admin location
|
||||
|
||||
# Galleries (G)
|
||||
'G-GAU': ('52', 'HOS', 'Hostinné', 3075058, '82'),
|
||||
'G-GVP': ('20', 'MLB', 'Mladá Boleslav', 3070544, '88'),
|
||||
|
||||
# Libraries (L) - Many are research institutes in Prague/Brno
|
||||
'L-SÚPRO': ('10', 'PRA', 'Prague', 3067696, '52'), # ABE064
|
||||
'L-ÚRB': ('10', 'PRA', 'Prague', 3067696, '52'), # ABE444
|
||||
'L-ÚSLOZ': ('10', 'PRA', 'Prague', 3067696, '52'), # ABE215
|
||||
'L-VŠZFA': ('10', 'PRA', 'Prague', 3067696, '52'),
|
||||
'L-VŠZR': ('10', 'PRA', 'Prague', 3067696, '52'),
|
||||
'L-VÚB': ('64', 'BRN', 'Brno', 3078610, '78'), # BOC006
|
||||
'L-VÚI': ('10', 'PRA', 'Prague', 3067696, '52'), # ABC043
|
||||
'L-VÚP': ('10', 'PRA', 'Prague', 3067696, '52'), # ABC066
|
||||
'L-VÚRV': ('10', 'PRA', 'Prague', 3067696, '52'), # ABC162
|
||||
'L-VUTÚTMŘP': ('64', 'BRN', 'Brno', 3078610, '78'),
|
||||
'L-VVÚNP': ('64', 'BRN', 'Brno', 3078610, '78'), # BOF045
|
||||
'L-ZVVZVÚV': ('10', 'PRA', 'Prague', 3067696, '52'), # ABF127
|
||||
|
||||
# Museums (M)
|
||||
'M-BMOP': ('32', 'KPC', 'Klenčí pod Čerchovem', 3073644, '87'),
|
||||
'M-MD': ('10', 'PRA', 'Prague', 3067696, '52'),
|
||||
'M-MH': ('20', 'KNC', 'Kostelec nad Černými Lesy', 3073152, '88'),
|
||||
'M-MJD': ('32', 'CHU', 'Chudenice', 3077528, '87'),
|
||||
'M-MKISMDAH': ('63', 'HUM', 'Humpolec', 3074723, '80'),
|
||||
'M-MMGLK': ('20', 'POD', 'Poděbrady', 3068107, '88'),
|
||||
'M-MMM': ('42', 'MIK', 'Mikulášovice', 3070725, '89'), # Mikcentrum!
|
||||
'M-MMSR': ('10', 'PRA', 'Prague', 3067696, '52'),
|
||||
'M-MRV': ('51', 'DES', 'Desná', 3077198, '83'),
|
||||
'M-MSČ': ('20', 'OST', 'Ostředek', 3068792, '88'),
|
||||
'M-MTZSŘ': ('52', 'DEO', 'Deštné v Orlických horách', 3077191, '82'),
|
||||
'M-MVBŽS': ('31', 'VOD', 'Vodňany', 3062642, '79'),
|
||||
'M-PDEHAM': ('53', 'HOL', 'Holice', 3075599, '86'),
|
||||
'M-PMJH': ('31', 'HUS', 'Husinec', 3074686, '79'),
|
||||
'M-PZV': ('51', 'PNJ', 'Paseky nad Jizerou', 3068552, '83'),
|
||||
}
|
||||
|
||||
|
||||
def generate_city_code(city_name: str) -> str:
    """Generate 3-letter city code from city name.

    Strips diacritics, drops Czech connective words ('nad', 'pod', ...),
    then uses the first three letters of a single-word name or the
    initials (up to three) of a multi-word name.
    """
    import unicodedata

    # Decompose accented characters and drop the combining marks.
    decomposed = unicodedata.normalize('NFD', city_name)
    plain = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Connective words carry no identifying information for the code.
    connectives = {'nad', 'pod', 'v', 'u', 'na'}
    significant = [word for word in plain.split() if word.lower() not in connectives]

    if len(significant) >= 2:
        # Multi-word name: initials of up to three significant words.
        return ''.join(word[0].upper() for word in significant[:3])
    if significant:
        # Single significant word: its first three letters.
        return significant[0][:3].upper()
    # Nothing significant left (e.g. empty input): raw de-accented prefix.
    return plain[:3].upper()
|
||||
|
||||
|
||||
def update_yaml_file(filepath: Path, resolution: tuple) -> tuple:
    """
    Update a YAML file with resolved region/city data.

    Rewrites the GHCID from the CZ-XX-XXX placeholder form to the resolved
    CZ-{region}-{city} form, records the resolution in location_resolution,
    appends a ghcid_history entry and a provenance note, then writes the
    data to a file named after the new GHCID and removes the old file.

    Args:
        filepath: Path to a CZ-XX-XXX-*.yaml custodian file.
        resolution: (region_code, city_code, city_name, geonames_id,
            admin1_code) — one entry from RESOLUTIONS.

    Returns: (old_ghcid, new_ghcid, new_filepath)
        All three are None when the existing GHCID cannot be parsed.
    """
    region_code, city_code, city_name, geonames_id, admin1_code = resolution

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Parse YAML
    data = yaml.safe_load(content)

    # Extract current GHCID
    old_ghcid = data.get('ghcid', {}).get('ghcid_current', '')

    # Build new GHCID
    # Pattern: CZ-XX-XXX-{TYPE}-{ABBREV} -> CZ-{REGION}-{CITY}-{TYPE}-{ABBREV}
    match = re.match(r'CZ-XX-XXX-([A-Z])-(.+)$', old_ghcid)
    if not match:
        print(f" WARNING: Could not parse GHCID: {old_ghcid}")
        return None, None, None

    inst_type, abbrev = match.groups()
    new_ghcid = f"CZ-{region_code}-{city_code}-{inst_type}-{abbrev}"

    timestamp = datetime.now(timezone.utc).isoformat()

    # Update ghcid section (location_resolution is fully replaced here —
    # the old placeholder carried no information worth keeping).
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'GEONAMES_RESEARCH',
        'country_code': 'CZ',
        'region_code': region_code,
        'region_name': get_region_name(region_code),
        'city_code': city_code,
        'city_name': city_name,
        'geonames_id': geonames_id,
        'admin1_code': admin1_code,
        'resolution_timestamp': timestamp,
        'research_date': '2025-12-07',
        'research_method': 'GeoNames database + web search verification'
    }

    # Add history entry (append-only).
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []

    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f'Region resolved via GeoNames research: XX→{region_code}, city: {city_name} (GeoNames ID: {geonames_id})'
    })

    # Update provenance notes (append-only; date-only prefix of timestamp).
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    data['provenance']['notes'].append(
        f'Region resolved {timestamp[:10]}: XX→CZ-{region_code} ({city_name}) via GeoNames research'
    )

    # Update location if present (created if absent).
    if 'location' not in data:
        data['location'] = {}
    data['location']['city'] = city_name
    data['location']['country'] = 'CZ'
    data['location']['region'] = get_region_name(region_code)
    data['location']['geonames_id'] = geonames_id

    # Write updated YAML under the new GHCID-based filename.
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    with open(new_filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Remove old file if the name changed (write succeeded at this point).
    if new_filepath != filepath:
        filepath.unlink()

    return old_ghcid, new_ghcid, new_filepath
|
||||
|
||||
|
||||
def get_region_name(region_code: str) -> str:
    """Get region name from ISO 3166-2:CZ code.

    Returns 'Unknown' for any code not in the ISO 3166-2:CZ table.
    """
    return {
        '10': 'Prague',
        '20': 'Central Bohemian',
        '31': 'South Bohemian',
        '32': 'Plzeň',
        '41': 'Karlovy Vary',
        '42': 'Ústí nad Labem',
        '51': 'Liberec',
        '52': 'Hradec Králové',
        '53': 'Pardubice',
        '63': 'Vysočina',
        '64': 'South Moravian',
        '71': 'Olomouc',
        '72': 'Zlín',
        '80': 'Moravian-Silesian',
    }.get(region_code, 'Unknown')
|
||||
|
||||
|
||||
def main():
    """Resolve all CZ-XX-XXX custodian files to proper region codes.

    Globs CZ-XX-XXX-*.yaml in the custodian directory, looks each file's
    suffix up in the researched RESOLUTIONS table, rewrites and renames it
    via update_yaml_file(), then prints a summary and lists any CZ-XX
    files still remaining.
    """
    # NOTE(review): hard-coded absolute path — ties the script to one
    # developer's machine; consider making this a CLI argument.
    custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')

    # Find all CZ-XX-XXX files
    xx_files = list(custodian_dir.glob('CZ-XX-XXX-*.yaml'))
    print(f"Found {len(xx_files)} CZ-XX-XXX files to resolve")

    resolved = 0
    failed = 0

    for filepath in sorted(xx_files):
        filename = filepath.stem
        # Extract suffix (e.g., "A-SAČTÚ" from "CZ-XX-XXX-A-SAČTÚ")
        suffix_match = re.match(r'CZ-XX-XXX-(.+)$', filename)
        if not suffix_match:
            # Bug fix: the message previously printed a literal placeholder
            # instead of interpolating the offending filename, making skips
            # impossible to trace.
            print(f" SKIP: Could not parse filename: {filename}")
            failed += 1
            continue

        suffix = suffix_match.group(1)

        if suffix not in RESOLUTIONS:
            print(f" SKIP: No resolution for: {suffix}")
            failed += 1
            continue

        resolution = RESOLUTIONS[suffix]
        try:
            old_ghcid, new_ghcid, new_filepath = update_yaml_file(filepath, resolution)
            if old_ghcid and new_ghcid:
                print(f" ✓ {old_ghcid} → {new_ghcid}")
                resolved += 1
            else:
                print(f" ✗ Failed to update: {filepath.name}")
                failed += 1
        except Exception as e:
            # Keep going on per-file errors; totals are reported below.
            print(f" ✗ Error processing {filepath.name}: {e}")
            failed += 1

    print(f"\n{'='*60}")
    print(f"SUMMARY: Resolved {resolved}/{len(xx_files)} files")
    if failed:
        print(f" Failed: {failed}")

    # Verify no CZ-XX files remain (any leftovers need manual research).
    remaining = list(custodian_dir.glob('CZ-XX-*.yaml'))
    print(f"\nRemaining CZ-XX files: {len(remaining)}")
    if remaining:
        for f in remaining:
            print(f" - {f.name}")
|
||||
353
scripts/resolve_locations_by_name.py
Executable file
353
scripts/resolve_locations_by_name.py
Executable file
|
|
@ -0,0 +1,353 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Resolve XX region codes using city names extracted from institution names.
|
||||
|
||||
This script handles files without coordinates or Wikidata IDs by:
|
||||
1. Extracting city names from institution names
|
||||
2. Looking up cities in GeoNames database
|
||||
3. Mapping to ISO 3166-2 region codes
|
||||
|
||||
Following AGENTS.md Rules:
|
||||
- Rule 5: Additive only - never delete existing data
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import yaml
|
||||
import sqlite3
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List, Tuple
|
||||
|
||||
# Belgian city name patterns
|
||||
BELGIAN_CITIES = {
|
||||
'brussel': 'BRU', 'bruxelles': 'BRU', 'brussels': 'BRU',
|
||||
'antwerpen': 'VAN', 'anvers': 'VAN', 'antwerp': 'VAN',
|
||||
'gent': 'VOV', 'ghent': 'VOV', 'gand': 'VOV',
|
||||
'brugge': 'VWV', 'bruges': 'VWV',
|
||||
'leuven': 'VBR', 'louvain': 'VBR',
|
||||
'mechelen': 'VAN', 'malines': 'VAN',
|
||||
'hasselt': 'VLI',
|
||||
'luik': 'WLG', 'liège': 'WLG', 'liege': 'WLG',
|
||||
'charleroi': 'WHT',
|
||||
'namur': 'WNA', 'namen': 'WNA',
|
||||
'mons': 'WHT', 'bergen': 'WHT',
|
||||
'tournai': 'WHT', 'doornik': 'WHT',
|
||||
'kortrijk': 'VWV', 'courtrai': 'VWV',
|
||||
'oostende': 'VWV', 'ostende': 'VWV',
|
||||
'aalst': 'VOV', 'alost': 'VOV',
|
||||
'sint-niklaas': 'VOV',
|
||||
'dendermonde': 'VOV',
|
||||
'genk': 'VLI',
|
||||
'roeselare': 'VWV',
|
||||
'mouscron': 'WHT', 'moeskroen': 'WHT',
|
||||
'tienen': 'VBR', 'tirlemont': 'VBR',
|
||||
'ieper': 'VWV', 'ypres': 'VWV',
|
||||
'turnhout': 'VAN',
|
||||
'waregem': 'VWV',
|
||||
'lokeren': 'VOV',
|
||||
'beveren': 'VOV',
|
||||
'vilvoorde': 'VBR',
|
||||
'dilbeek': 'VBR',
|
||||
'schoten': 'VAN',
|
||||
'brasschaat': 'VAN',
|
||||
'boom': 'VAN',
|
||||
'mortsel': 'VAN',
|
||||
'temse': 'VOV',
|
||||
'herzele': 'VOV',
|
||||
'brecht': 'VAN',
|
||||
'oudenaarde': 'VOV',
|
||||
'rotselaar': 'VBR',
|
||||
'niel': 'VAN',
|
||||
'lint': 'VAN',
|
||||
'ravels': 'VAN',
|
||||
'bree': 'VLI',
|
||||
'peer': 'VLI',
|
||||
'meeuwen': 'VLI',
|
||||
'gruitrode': 'VLI',
|
||||
'arlon': 'WLX', 'aarlen': 'WLX',
|
||||
'bastogne': 'WLX', 'bastenaken': 'WLX',
|
||||
}
|
||||
|
||||
# Austrian state codes
|
||||
AUSTRIAN_STATES = {
|
||||
'wien': '9', 'vienna': '9',
|
||||
'salzburg': '5',
|
||||
'tirol': '7', 'tyrol': '7', 'innsbruck': '7',
|
||||
'vorarlberg': '8', 'bregenz': '8',
|
||||
'kärnten': '2', 'carinthia': '2', 'klagenfurt': '2',
|
||||
'steiermark': '6', 'styria': '6', 'graz': '6',
|
||||
'oberösterreich': '4', 'upper austria': '4', 'linz': '4',
|
||||
'niederösterreich': '3', 'lower austria': '3', 'st. pölten': '3',
|
||||
'burgenland': '1', 'eisenstadt': '1',
|
||||
}
|
||||
|
||||
# Bulgarian province codes
|
||||
BULGARIAN_PROVINCES = {
|
||||
'sofia': '22', 'софія': '22',
|
||||
'plovdiv': '16', 'пловдив': '16',
|
||||
'varna': '03', 'варна': '03',
|
||||
'burgas': '02', 'бургас': '02',
|
||||
'ruse': '18', 'русе': '18',
|
||||
'stara zagora': '24',
|
||||
'pleven': '15', 'плевен': '15',
|
||||
}
|
||||
|
||||
# Swiss canton codes (abbreviated)
|
||||
SWISS_CANTONS = {
|
||||
'zürich': 'ZH', 'zurich': 'ZH',
|
||||
'bern': 'BE', 'berne': 'BE',
|
||||
'luzern': 'LU', 'lucerne': 'LU',
|
||||
'genève': 'GE', 'geneva': 'GE', 'genf': 'GE',
|
||||
'basel': 'BS',
|
||||
'lausanne': 'VD',
|
||||
'winterthur': 'ZH',
|
||||
'st. gallen': 'SG', 'st gallen': 'SG',
|
||||
'lugano': 'TI',
|
||||
'biel': 'BE', 'bienne': 'BE',
|
||||
'thun': 'BE',
|
||||
'fribourg': 'FR', 'freiburg': 'FR',
|
||||
'schaffhausen': 'SH',
|
||||
'chur': 'GR',
|
||||
'neuchâtel': 'NE', 'neuchatel': 'NE',
|
||||
'sion': 'VS',
|
||||
'aarau': 'AG',
|
||||
'baden': 'AG',
|
||||
}
|
||||
|
||||
|
||||
def extract_city_from_name(name: str, country: str) -> Optional[Tuple[str, str]]:
    """
    Extract city name from institution name.

    Scans the country-specific lookup table for a city name occurring as a
    substring of the (lowercased) institution name.

    Returns (city_name, region_code) or None.
    """
    # Dispatch to the per-country lookup table instead of an if/elif chain.
    tables = {
        'BE': BELGIAN_CITIES,
        'AT': AUSTRIAN_STATES,
        'BG': BULGARIAN_PROVINCES,
        'CH': SWISS_CANTONS,
    }
    table = tables.get(country)
    if table is None:
        return None

    haystack = name.lower()
    for city, region in table.items():
        if city in haystack:
            return (city.title(), region)

    return None
|
||||
|
||||
|
||||
def update_file_with_region(filepath: Path, region_code: str, city_name: str,
                            dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """Update a custodian file with resolved region code.

    Rewrites the GHCID ({country}-XX-... -> {country}-{region}-...), records
    the NAME_LOOKUP resolution, appends a ghcid_history entry and a
    provenance note, and renames the file to match.

    Args:
        filepath: Path to the custodian YAML file.
        region_code: Resolved region code for the file's country.
        city_name: City name that produced the match (stored as region_name
            — NOTE(review): this stores a city name in a region_name field;
            confirm that is intentional).
        dry_run: When True (default), compute everything but write nothing.

    Returns:
        (success, new_path): new_path is the renamed file path, or None
        when the filename did not change.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False, None

    if 'ghcid' not in data:
        return False, None

    ghcid = data['ghcid']
    if 'location_resolution' not in ghcid:
        ghcid['location_resolution'] = {}

    loc_res = ghcid['location_resolution']
    country_code = loc_res.get('country_code', '')

    # A country code is required to build the GHCID replacement pattern.
    if not country_code:
        return False, None

    # Only touch files still carrying the XX placeholder region code.
    old_region = loc_res.get('region_code', 'XX')

    if old_region != 'XX':
        return False, None

    # Update location resolution with the resolved region and method.
    loc_res['region_code'] = region_code
    loc_res['region_name'] = city_name
    loc_res['method'] = 'NAME_LOOKUP'
    loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat()

    # Update GHCID string ({country}-XX- -> {country}-{region}-).
    old_ghcid = ghcid.get('ghcid_current', '')
    new_ghcid = old_ghcid.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-')

    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid

        # Append to history rather than overwrite (additive-only rule).
        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []

        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'reason': f"Region resolved via name lookup: XX->{region_code} (city: {city_name})"
        })

    # Add provenance note; normalize a legacy scalar `notes` into a list.
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    elif isinstance(data['provenance']['notes'], str):
        data['provenance']['notes'] = [data['provenance']['notes']]

    data['provenance']['notes'].append(
        f"Region resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX->{region_code} via name lookup (city: {city_name})"
    )

    # Determine new filename (mirrors the GHCID change).
    new_filename = filepath.name.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-')
    new_filepath = filepath.parent / new_filename

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Rename only if the target does not already exist (avoid clobbering).
        if new_filepath != filepath and not new_filepath.exists():
            filepath.rename(new_filepath)

    return True, new_filepath if new_filepath != filepath else None
|
||||
|
||||
|
||||
def main():
    """Main entry point.

    CLI workflow:
      1. Glob custodian YAML files whose name contains '-XX-' (unresolved region).
      2. Load each (up to --limit), keeping only records that expose both a
         country code and an institution name.
      3. Try to extract a city (and thus a region code) from the institution
         name via extract_city_from_name(), then apply the change through
         update_file_with_region().

    Without --apply this is a dry run: nothing is written or renamed.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using city names from institution names'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("REGION RESOLUTION VIA NAME LOOKUP")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Find files with XX region codes (region component encoded in the filename)
    files_to_process = []

    for filepath in custodian_dir.glob('*-XX-*.yaml'):
        files_to_process.append(filepath)

    print(f"Found {len(files_to_process)} files with XX region codes")

    # Load files and extract institution names
    file_data = []
    for filepath in files_to_process[:args.limit]:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            # Get country code (only source considered: ghcid.location_resolution)
            country = None
            if 'ghcid' in data and 'location_resolution' in data['ghcid']:
                country = data['ghcid']['location_resolution'].get('country_code')

            if not country:
                continue

            if args.country and country != args.country:
                continue

            # Get institution name: prefer the claim value, fall back to the
            # original source entry's name.
            name = None
            if 'custodian_name' in data:
                name = data['custodian_name'].get('claim_value')
            if not name and 'original_entry' in data:
                name = data['original_entry'].get('name')

            if not name:
                continue

            file_data.append({
                'filepath': filepath,
                'data': data,
                'country': country,
                'name': name
            })
        except Exception as e:
            # Log and keep going: one unreadable file must not stop the batch.
            print(f"Error loading {filepath}: {e}")

    print(f"Processing {len(file_data)} files with institution names")
    print()

    # Process each file
    resolved = 0
    renamed = 0
    no_match = 0

    for f in file_data:
        filepath = f['filepath']
        name = f['name']
        country = f['country']

        # Try to extract city from name
        result = extract_city_from_name(name, country)

        if not result:
            no_match += 1
            continue

        city_name, region_code = result

        print(f"Processing {filepath.name}...")
        print(f" Name: {name}")
        print(f" City: {city_name} -> Region: {region_code}")

        # Update file (update_file_with_region honors dry_run; new_path is
        # non-None only when the GHCID change implies a rename)
        success, new_path = update_file_with_region(filepath, region_code, city_name, dry_run=dry_run)

        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f" {filepath.name} -> {new_path.name}")

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"No city match: {no_match}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")
|
||||
568
scripts/resolve_regions_from_city.py
Normal file
568
scripts/resolve_regions_from_city.py
Normal file
|
|
@ -0,0 +1,568 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Resolve XX region codes using city names already in the file.
|
||||
|
||||
This script handles files that have city data but unknown region codes.
|
||||
It looks up the city in GeoNames to get the admin1 (region) code.
|
||||
|
||||
Following AGENTS.md Rules:
|
||||
- Rule 5: Additive only - never delete existing data
|
||||
- GHCID settlement standardization: GeoNames is authoritative
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import yaml
|
||||
import sqlite3
|
||||
import re
|
||||
import unicodedata
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List, Tuple
|
||||
|
||||
# GeoNames database
# SQLite file with a `cities` table (see lookup_city_region for the columns read).
GEONAMES_DB = Path(__file__).parent.parent / "data/reference/geonames.db"
# Custodian YAML files, named <COUNTRY>-<REGION>-... (region 'XX' = unresolved).
CUSTODIAN_DIR = Path(__file__).parent.parent / "data/custodian"

# Feature codes for proper settlements (EXCLUDE PPLX neighborhoods)
SETTLEMENT_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
||||
|
||||
# Country-specific region code mappings (GeoNames admin1 → ISO 3166-2)
|
||||
COUNTRY_ADMIN_MAPS = {
|
||||
'NL': {
|
||||
'01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR', '05': 'LI',
|
||||
'06': 'NB', '07': 'NH', '09': 'UT', '10': 'ZE', '11': 'ZH',
|
||||
'15': 'OV', '16': 'FL'
|
||||
},
|
||||
'BE': {
|
||||
'VAN': 'VAN', 'VBR': 'VBR', 'VLI': 'VLI', 'VOV': 'VOV', 'VWV': 'VWV',
|
||||
'WBR': 'WBR', 'WHT': 'WHT', 'WLG': 'WLG', 'WLX': 'WLX', 'WNA': 'WNA',
|
||||
'BRU': 'BRU'
|
||||
},
|
||||
# Georgia: GeoNames admin1 → ISO 3166-2:GE
|
||||
'GE': {
|
||||
'51': 'TB', # Tbilisi
|
||||
'04': 'AJ', # Adjara
|
||||
'67': 'KA', # Kakheti
|
||||
'66': 'IM', # Imereti
|
||||
'68': 'KK', # Kvemo Kartli
|
||||
'69': 'MM', # Mtskheta-Mtianeti
|
||||
'70': 'RL', # Racha-Lechkhumi and Kvemo Svaneti
|
||||
'71': 'SZ', # Samegrelo and Zemo Svaneti
|
||||
'72': 'SJ', # Samtskhe-Javakheti
|
||||
'73': 'SK', # Shida Kartli
|
||||
'65': 'GU', # Guria
|
||||
},
|
||||
# Czech Republic: GeoNames admin1 → ISO 3166-2:CZ (2-digit NUTS codes)
|
||||
# Source: https://en.wikipedia.org/wiki/ISO_3166-2:CZ
|
||||
'CZ': {
|
||||
'52': '10', # Prague (Praha)
|
||||
'88': '20', # Central Bohemian (Středočeský kraj)
|
||||
'79': '31', # South Bohemian (Jihočeský kraj)
|
||||
'87': '32', # Plzeň Region (Plzeňský kraj)
|
||||
'81': '41', # Karlovy Vary Region (Karlovarský kraj)
|
||||
'89': '42', # Ústí nad Labem Region (Ústecký kraj)
|
||||
'83': '51', # Liberec Region (Liberecký kraj)
|
||||
'82': '52', # Hradec Králové Region (Královéhradecký kraj)
|
||||
'86': '53', # Pardubice Region (Pardubický kraj)
|
||||
'80': '63', # Vysočina Region
|
||||
'78': '64', # South Moravian (Jihomoravský kraj)
|
||||
'84': '71', # Olomouc Region (Olomoucký kraj)
|
||||
'90': '72', # Zlín Region (Zlínský kraj)
|
||||
'85': '80', # Moravian-Silesian (Moravskoslezský kraj)
|
||||
},
|
||||
# Austria: GeoNames admin1 → ISO 3166-2:AT
|
||||
'AT': {
|
||||
'01': '1', # Burgenland
|
||||
'02': '2', # Kärnten (Carinthia)
|
||||
'03': '3', # Niederösterreich (Lower Austria)
|
||||
'04': '4', # Oberösterreich (Upper Austria)
|
||||
'05': '5', # Salzburg
|
||||
'06': '6', # Steiermark (Styria)
|
||||
'07': '7', # Tirol (Tyrol)
|
||||
'08': '8', # Vorarlberg
|
||||
'09': '9', # Wien (Vienna)
|
||||
},
|
||||
# Bulgaria: GeoNames admin1 → ISO 3166-2:BG (2-letter province codes)
|
||||
'BG': {
|
||||
'38': '01', # Blagoevgrad
|
||||
'39': '02', # Burgas
|
||||
'40': '08', # Dobrich
|
||||
'41': '07', # Gabrovo
|
||||
'42': '26', # Haskovo
|
||||
'43': '09', # Kardzhali (Kurdzhali)
|
||||
'44': '10', # Kyustendil
|
||||
'45': '11', # Lovech
|
||||
'46': '12', # Montana
|
||||
'47': '13', # Pazardzhik
|
||||
'48': '14', # Pernik
|
||||
'49': '15', # Pleven
|
||||
'50': '16', # Plovdiv
|
||||
'51': '17', # Razgrad
|
||||
'52': '18', # Ruse
|
||||
'53': '27', # Shumen
|
||||
'54': '19', # Silistra
|
||||
'55': '20', # Sliven
|
||||
'56': '21', # Smolyan
|
||||
'57': '23', # Sofia (Sofiya-Grad)
|
||||
'58': '22', # Sofia Province (Sofiya)
|
||||
'59': '24', # Stara Zagora
|
||||
'60': '25', # Targovishte
|
||||
'61': '03', # Varna
|
||||
'62': '04', # Veliko Tarnovo
|
||||
'63': '05', # Vidin
|
||||
'64': '06', # Vratsa
|
||||
'65': '28', # Yambol
|
||||
},
|
||||
# Switzerland: GeoNames already uses ISO 3166-2:CH canton codes
|
||||
'CH': {
|
||||
'AG': 'AG', 'AI': 'AI', 'AR': 'AR', 'BE': 'BE', 'BL': 'BL',
|
||||
'BS': 'BS', 'FR': 'FR', 'GE': 'GE', 'GL': 'GL', 'GR': 'GR',
|
||||
'JU': 'JU', 'LU': 'LU', 'NE': 'NE', 'NW': 'NW', 'OW': 'OW',
|
||||
'SG': 'SG', 'SH': 'SH', 'SO': 'SO', 'SZ': 'SZ', 'TG': 'TG',
|
||||
'TI': 'TI', 'UR': 'UR', 'VD': 'VD', 'VS': 'VS', 'ZG': 'ZG',
|
||||
'ZH': 'ZH',
|
||||
},
|
||||
# Vietnam: GeoNames admin1 codes are the ISO 3166-2:VN codes (use directly)
|
||||
# GeoNames uses 2-digit codes that match ISO 3166-2:VN province codes
|
||||
'VN': {
|
||||
'01': 'HN', # Hanoi (Ha Noi)
|
||||
'31': 'HP', # Hai Phong
|
||||
'48': 'DN', # Da Nang (Đà Nẵng)
|
||||
'79': 'SG', # Ho Chi Minh City (Saigon)
|
||||
'92': 'CT', # Can Tho
|
||||
'75': 'DNa', # Dong Nai
|
||||
'24': 'BN', # Bac Ninh
|
||||
'22': 'QN', # Quang Ninh (Quảng Ninh)
|
||||
'38': 'TH', # Thanh Hoa (Thanh Hóa)
|
||||
'46': 'TTH', # Thua Thien-Hue (Thừa Thiên Huế)
|
||||
'40': 'NA', # Nghe An (Nghệ An)
|
||||
'04': 'CB', # Cao Bang
|
||||
'37': 'NB', # Ninh Binh
|
||||
'56': 'KH', # Khanh Hoa
|
||||
'66': 'DLK', # Dak Lak
|
||||
'68': 'LDG', # Lam Dong
|
||||
'91': 'AG', # An Giang
|
||||
'86': 'VL', # Vinh Long
|
||||
'82': 'DTP', # Dong Thap
|
||||
'80': 'TNi', # Tay Ninh
|
||||
'96': 'CMa', # Ca Mau
|
||||
'51': 'QNg', # Quang Ngai
|
||||
'52': 'GL', # Gia Lai
|
||||
'19': 'TN', # Thai Nguyen
|
||||
'25': 'PT', # Phu Tho
|
||||
},
|
||||
# Japan: GeoNames admin1 → ISO 3166-2:JP (2-digit prefecture codes)
|
||||
# See: https://en.wikipedia.org/wiki/ISO_3166-2:JP
|
||||
'JP': {
|
||||
'01': '23', # Aichi
|
||||
'02': '05', # Akita
|
||||
'03': '02', # Aomori
|
||||
'04': '12', # Chiba
|
||||
'05': '38', # Ehime
|
||||
'06': '18', # Fukui
|
||||
'07': '40', # Fukuoka
|
||||
'08': '07', # Fukushima
|
||||
'09': '21', # Gifu
|
||||
'10': '10', # Gunma
|
||||
'11': '34', # Hiroshima
|
||||
'12': '01', # Hokkaido
|
||||
'13': '28', # Hyogo
|
||||
'14': '08', # Ibaraki
|
||||
'15': '17', # Ishikawa
|
||||
'16': '03', # Iwate
|
||||
'17': '37', # Kagawa
|
||||
'18': '46', # Kagoshima
|
||||
'19': '14', # Kanagawa
|
||||
'20': '39', # Kochi
|
||||
'21': '43', # Kumamoto
|
||||
'22': '26', # Kyoto
|
||||
'23': '24', # Mie
|
||||
'24': '04', # Miyagi
|
||||
'25': '45', # Miyazaki
|
||||
'26': '20', # Nagano
|
||||
'27': '42', # Nagasaki
|
||||
'28': '29', # Nara
|
||||
'29': '15', # Niigata
|
||||
'30': '44', # Oita
|
||||
'31': '33', # Okayama
|
||||
'32': '27', # Osaka
|
||||
'33': '41', # Saga
|
||||
'34': '11', # Saitama
|
||||
'35': '25', # Shiga
|
||||
'36': '32', # Shimane
|
||||
'37': '22', # Shizuoka
|
||||
'38': '09', # Tochigi
|
||||
'39': '36', # Tokushima
|
||||
'40': '13', # Tokyo
|
||||
'41': '31', # Tottori
|
||||
'42': '16', # Toyama
|
||||
'43': '30', # Wakayama
|
||||
'44': '06', # Yamagata
|
||||
'45': '35', # Yamaguchi
|
||||
'46': '19', # Yamanashi
|
||||
'47': '47', # Okinawa
|
||||
},
|
||||
# Egypt: GeoNames admin1 → ISO 3166-2:EG
|
||||
# See: https://en.wikipedia.org/wiki/ISO_3166-2:EG
|
||||
'EG': {
|
||||
'01': 'DK', # Dakahlia
|
||||
'02': 'BA', # Red Sea (Al Bahr al Ahmar)
|
||||
'03': 'BH', # Beheira
|
||||
'04': 'FYM', # Faiyum
|
||||
'05': 'GH', # Gharbia
|
||||
'06': 'ALX', # Alexandria
|
||||
'07': 'IS', # Ismailia
|
||||
'08': 'GZ', # Giza
|
||||
'09': 'MNF', # Monufia
|
||||
'10': 'MN', # Minya
|
||||
'11': 'C', # Cairo
|
||||
'12': 'KB', # Qalyubia
|
||||
'13': 'WAD', # New Valley (Al Wadi al Jadid)
|
||||
'14': 'SHR', # Sharqia
|
||||
'15': 'SUZ', # Suez
|
||||
'16': 'ASN', # Aswan
|
||||
'17': 'AST', # Asyut
|
||||
'18': 'BNS', # Beni Suweif
|
||||
'19': 'PTS', # Port Said
|
||||
'20': 'DT', # Damietta
|
||||
'21': 'KFS', # Kafr el-Sheikh
|
||||
'22': 'MT', # Matruh
|
||||
'23': 'KN', # Qena
|
||||
'24': 'SHG', # Sohag
|
||||
'26': 'JS', # South Sinai
|
||||
'27': 'SIN', # North Sinai
|
||||
'28': 'LX', # Luxor
|
||||
},
|
||||
}
|
||||
|
||||
# City name translations (native → GeoNames ASCII name)
|
||||
# Many cities in GeoNames use English/anglicized names
|
||||
CITY_NAME_TRANSLATIONS = {
|
||||
# German → English
|
||||
'wien': 'vienna',
|
||||
'munchen': 'munich',
|
||||
'koln': 'cologne',
|
||||
'nurnberg': 'nuremberg',
|
||||
'braunschweig': 'brunswick',
|
||||
# Czech → GeoNames (use normalized/ASCII keys)
|
||||
'praha': 'prague',
|
||||
'plzen': 'pilsen', # Plzeň → plzen after normalization
|
||||
'brno': 'brno',
|
||||
'ostrava': 'ostrava',
|
||||
# Swiss cities
|
||||
'geneve': 'geneva',
|
||||
'zurich': 'zurich',
|
||||
'bern': 'berne',
|
||||
'basel': 'basle',
|
||||
# Italian cities
|
||||
'roma': 'rome',
|
||||
'milano': 'milan',
|
||||
'napoli': 'naples',
|
||||
'firenze': 'florence',
|
||||
'venezia': 'venice',
|
||||
'torino': 'turin',
|
||||
# Austrian special cases (use normalized keys after diacritics removal)
|
||||
# GeoNames uses 'oe' for ö, so 'Sankt Poelten'
|
||||
'st. polten': 'sankt poelten',
|
||||
'st polten': 'sankt poelten',
|
||||
'sankt polten': 'sankt poelten',
|
||||
# Japanese cities - complex administrative format to GeoNames
|
||||
# Format: "District Gun City Machi/Cho" → just the city name
|
||||
'haga gun motegi machi': 'motegi',
|
||||
'motegi machi': 'motegi',
|
||||
# Egyptian landmarks → Cairo
|
||||
'nile corniche': 'cairo',
|
||||
}
|
||||
|
||||
|
||||
def normalize_city_name(name: str) -> str:
    """Return *name* lowercased, stripped, and with diacritics removed.

    Decomposes to NFD so that combining marks become separate code points,
    then drops every mark (Unicode category 'Mn').
    """
    decomposed = unicodedata.normalize('NFD', name)
    without_marks = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(without_marks).lower().strip()
|
||||
|
||||
|
||||
def clean_city_name(city: str) -> str:
    """Extract base city name from complex strings like 'Praha 1' or 'Zlín - Louky'.

    Removes district numbers and anything after them ('Praha 1',
    'Praha 9 - Běchovice' -> 'Praha') and district/suburb suffixes after a
    dash ('Zlín - Louky' -> 'Zlín').
    """
    # District numbers and everything after them ("Praha 1", "Praha 9 - ...").
    # This pattern also consumes postal-code tails such as "123 45 ...", which
    # made the former third substitution (r'\s+\d{3}\s*\d{2}.*$') unreachable
    # dead code; it has been removed.
    city = re.sub(r'\s+\d+.*$', '', city)
    # Suffixes after a dash ("Zlín - Louky").
    city = re.sub(r'\s*-\s*.*$', '', city)
    return city.strip()
|
||||
|
||||
|
||||
def lookup_city_region(city_name: str, country: str, conn: sqlite3.Connection) -> Optional[Dict]:
    """Look up city in GeoNames and return region info.

    The name is cleaned (district suffixes removed), diacritics stripped,
    then mapped through CITY_NAME_TRANSLATIONS (native -> GeoNames English
    name) before querying. Matching prefers an exact ascii_name match and
    falls back to a prefix (LIKE) match; ties are broken by population so
    the largest settlement wins. Returns a dict of GeoNames columns, or
    None when nothing matches.
    """
    cursor = conn.cursor()

    # Clean city name
    base_city = clean_city_name(city_name)
    normalized = normalize_city_name(base_city)

    # Check for translated name (native → GeoNames)
    if normalized in CITY_NAME_TRANSLATIONS:
        translated = CITY_NAME_TRANSLATIONS[normalized]
    else:
        translated = normalized

    # Try translated name first, then normalized.
    # NOTE: SETTLEMENT_FEATURE_CODES is interpolated via f-string; this is
    # safe because it is a constant tuple of string literals whose Python
    # repr is valid SQL — user-supplied values go through "?" placeholders.
    row = None
    for search_name in [translated, normalized]:
        cursor.execute(f'''
            SELECT geonames_id, name, ascii_name, admin1_code, admin2_code,
                   latitude, longitude, feature_code, population
            FROM cities
            WHERE country_code = ?
              AND feature_code IN {SETTLEMENT_FEATURE_CODES}
              AND LOWER(ascii_name) = ?
            ORDER BY population DESC
            LIMIT 1
        ''', (country, search_name))

        row = cursor.fetchone()
        if row:
            break

    # If no match, try LIKE search with normalized name (prefix match)
    if not row:
        cursor.execute(f'''
            SELECT geonames_id, name, ascii_name, admin1_code, admin2_code,
                   latitude, longitude, feature_code, population
            FROM cities
            WHERE country_code = ?
              AND feature_code IN {SETTLEMENT_FEATURE_CODES}
              AND LOWER(ascii_name) LIKE ?
            ORDER BY population DESC
            LIMIT 1
        ''', (country, f'{normalized}%'))
        row = cursor.fetchone()

    if not row:
        return None

    return {
        'geonames_id': row[0],
        'name': row[1],
        'ascii_name': row[2],
        'admin1_code': row[3],
        'admin2_code': row[4],
        'latitude': row[5],
        'longitude': row[6],
        'feature_code': row[7],
        'population': row[8],
    }
|
||||
|
||||
|
||||
def get_region_code(country: str, admin1_code: Optional[str], admin2_code: Optional[str] = None) -> str:
    """Convert GeoNames admin codes to ISO 3166-2 region codes.

    Countries listed in COUNTRY_ADMIN_MAPS get a table lookup (Belgium keys
    its provinces on admin2 rather than admin1); everything else passes the
    GeoNames admin1 code straight through. 'XX' means unresolvable.
    """
    mapping = COUNTRY_ADMIN_MAPS.get(country)
    if mapping is None:
        # No country-specific table: use the GeoNames code as-is.
        return admin1_code if admin1_code else 'XX'
    if country == 'BE' and admin2_code:
        # Belgian provinces live in admin2; fall back to admin1 when unmapped.
        return mapping.get(admin2_code, admin1_code or 'XX')
    if not admin1_code:
        return 'XX'
    return mapping.get(admin1_code, admin1_code)
|
||||
|
||||
|
||||
def find_city_in_file(data: Dict) -> Optional[Tuple[str, str]]:
    """Return (city, country) extracted from a custodian record, or None.

    The country comes from ghcid.location_resolution when present; the city
    is taken from the first location entry with a non-empty 'city', looking
    at original_entry.locations first and the top-level locations second.
    A location entry may also supply the country when ghcid did not.
    """
    country = data.get('ghcid', {}).get('location_resolution', {}).get('country_code')

    city = None
    candidate_lists = (
        data.get('original_entry', {}).get('locations', []),
        data.get('locations', []),
    )
    for locations in candidate_lists:
        if city:
            break
        for loc in locations:
            if loc.get('city'):
                city = loc['city']
                if not country and 'country' in loc:
                    country = loc['country']
                break

    if city and country:
        return (city, country)
    return None
|
||||
|
||||
|
||||
def process_file(filepath: Path, conn: sqlite3.Connection, apply: bool) -> bool:
    """Process a single file with XX region code.

    Resolves the region by looking up the file's own city name in GeoNames.
    Returns True when a region could be resolved; with apply=True the YAML
    is rewritten, GHCID history appended, and the file renamed to carry the
    new region code. With apply=False nothing is written (dry run).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Unreadable/unparseable file: report and skip.
        print(f" Error reading {filepath}: {e}")
        return False

    if not data:
        return False

    # Check if region is already resolved
    ghcid = data.get('ghcid', {})
    loc_res = ghcid.get('location_resolution', {})
    if loc_res.get('region_code', 'XX') != 'XX':
        return False

    # Find city name
    city_info = find_city_in_file(data)
    if not city_info:
        return False

    city_name, country = city_info
    print(f" City: {city_name} ({country})")

    # Look up in GeoNames
    city_data = lookup_city_region(city_name, country, conn)
    if not city_data:
        print(f" No GeoNames match for '{city_name}'")
        return False

    region_code = get_region_code(country, city_data['admin1_code'], city_data.get('admin2_code'))
    if region_code == 'XX':
        print(f" Could not determine region for admin1={city_data['admin1_code']}")
        return False

    print(f" Found: {city_data['name']} -> Region {region_code}")

    # Dry run stops here: the file is resolvable but untouched.
    if not apply:
        return True

    # Update GHCID — format is COUNTRY-REGION-CITY-TYPE-ABBR[...]; the
    # region is the second dash-separated component.
    current = ghcid.get('ghcid_current', '')
    parts = current.split('-')
    if len(parts) < 5:
        print(f" Invalid GHCID format: {current}")
        return False

    old_region = parts[1]
    if old_region != 'XX':
        print(f" Region already set: {old_region}")
        return False

    parts[1] = region_code
    new_ghcid = '-'.join(parts)

    # Update data
    ghcid['ghcid_current'] = new_ghcid
    loc_res['region_code'] = region_code
    loc_res['region_name'] = f"{country}-{region_code}"
    loc_res['geonames_id'] = city_data['geonames_id']
    loc_res['method'] = 'GEONAMES_CITY_LOOKUP'
    loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat()
    # Reattach explicitly: loc_res is a fresh dict when the key was absent.
    ghcid['location_resolution'] = loc_res

    # Add to history (additive only — existing entries are preserved)
    history = ghcid.get('ghcid_history', [])
    history.append({
        'ghcid': new_ghcid,
        'valid_from': datetime.now(timezone.utc).isoformat(),
        'reason': f'Region resolved via GeoNames city lookup: XX->{region_code} ({city_data["name"]})'
    })
    ghcid['ghcid_history'] = history
    data['ghcid'] = ghcid

    # Calculate new filename (filename carries the region component too)
    old_name = filepath.name
    new_name = old_name.replace(f'{country}-XX-', f'{country}-{region_code}-')
    new_path = filepath.parent / new_name

    # Write and rename
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if new_path != filepath:
        filepath.rename(new_path)
        print(f" Renamed: {old_name} -> {new_name}")

    return True
|
||||
|
||||
|
||||
def main():
    """CLI entry point: resolve region codes for '*-XX-*' custodian files
    that already carry a city name, using GeoNames.

    Flags:
        --limit N     maximum number of files to process (default 100)
        --apply       write changes; without it the run is a dry run
        --country CC  only handle files whose name starts with 'CC-'
    """
    import argparse
    parser = argparse.ArgumentParser(description='Resolve XX region codes using city names in files')
    parser.add_argument('--limit', type=int, default=100, help='Max files to process')
    parser.add_argument('--apply', action='store_true', help='Apply changes (default: dry run)')
    parser.add_argument('--country', help='Filter by country code')
    args = parser.parse_args()

    print("=" * 70)
    print("REGION RESOLUTION FROM FILE CITY NAMES")
    print("=" * 70)
    print(f"Mode: {'APPLYING CHANGES' if args.apply else 'DRY RUN'}")
    print()

    # Connect to GeoNames
    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    conn = sqlite3.connect(str(GEONAMES_DB))
    try:
        # Find XX files, optionally filtered by country prefix
        xx_files = []
        for f in CUSTODIAN_DIR.glob('*.yaml'):
            if '-XX-' in f.name:
                if args.country and not f.name.startswith(f'{args.country}-'):
                    continue
                xx_files.append(f)

        print(f"Found {len(xx_files)} files with XX region codes")

        # Cheap pre-filter: keep files that textually contain 'city:' —
        # process_file() does the real YAML parsing.
        files_with_cities = []
        for f in xx_files:
            try:
                with open(f, 'r', encoding='utf-8') as fp:
                    content = fp.read()
                if 'city:' in content:
                    files_with_cities.append(f)
            except (OSError, UnicodeDecodeError):
                # Skip unreadable files without aborting the batch.
                # (Was a bare `except: pass`, which also swallowed
                # KeyboardInterrupt and SystemExit.)
                continue

        print(f"Processing {min(len(files_with_cities), args.limit)} files with city names")
        print()

        resolved = 0
        renamed = 0

        for f in files_with_cities[:args.limit]:
            print(f"Processing {f.name}...")
            if process_file(f, conn, args.apply):
                resolved += 1
                if args.apply:
                    renamed += 1
    finally:
        # Always release the SQLite connection, even if processing raised.
        conn.close()

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {min(len(files_with_cities), args.limit)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
|
||||
619
scripts/update_ghcid_with_geonames.py
Normal file
619
scripts/update_ghcid_with_geonames.py
Normal file
|
|
@ -0,0 +1,619 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Update GHCID region and city codes using GeoNames reverse geocoding.
|
||||
|
||||
For custodian files that have coordinates, this script:
|
||||
1. Reverse geocodes coordinates to find the nearest GeoNames city
|
||||
2. Extracts proper admin1_code (region) and city code
|
||||
3. Updates the GHCID with correct codes
|
||||
4. Renames the file if GHCID changes
|
||||
|
||||
Usage:
|
||||
python scripts/update_ghcid_with_geonames.py [--dry-run] [--limit N] [--country CODE]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sqlite3
|
||||
import uuid
|
||||
import yaml
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
# Paths
# All locations are resolved relative to the repository root (scripts/..).
PROJECT_ROOT = Path(__file__).parent.parent
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
REPORTS_DIR = PROJECT_ROOT / "reports"

# GHCID namespace for UUID generation
# NOTE: this value equals the RFC 4122 DNS namespace (uuid.NAMESPACE_DNS),
# so uuid5(GHCID_NAMESPACE, ghcid) is deterministic per GHCID string.
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')

# Country-specific region code mappings (GeoNames admin1_code -> ISO 3166-2)
# This handles cases where GeoNames codes differ from ISO codes
REGION_CODE_MAPPINGS = {
    'NL': {
        '01': 'DR',  # Drenthe
        '02': 'FR',  # Friesland
        '03': 'GE',  # Gelderland
        '04': 'GR',  # Groningen
        '05': 'LI',  # Limburg
        '06': 'NB',  # Noord-Brabant
        '07': 'NH',  # Noord-Holland
        '09': 'UT',  # Utrecht
        '10': 'ZE',  # Zeeland
        '11': 'ZH',  # Zuid-Holland
        '15': 'OV',  # Overijssel
        '16': 'FL',  # Flevoland
    },
    # Japan uses prefecture numbers which are fine as-is (2-digit)
    # Most countries can use admin1_code directly
}

# Type code mapping
# institution_type string -> single-letter GHCID type component
# (see generate_ghcid; 'U' is also the fallback for unknown types).
TYPE_TO_CODE = {
    'GALLERY': 'G', 'LIBRARY': 'L', 'ARCHIVE': 'A', 'MUSEUM': 'M',
    'OFFICIAL_INSTITUTION': 'O', 'RESEARCH_CENTER': 'R', 'CORPORATION': 'C',
    'UNKNOWN': 'U', 'BOTANICAL_ZOO': 'B', 'EDUCATION_PROVIDER': 'E',
    'COLLECTING_SOCIETY': 'S', 'FEATURES': 'F', 'INTANGIBLE_HERITAGE_GROUP': 'I',
    'MIXED': 'X', 'PERSONAL_COLLECTION': 'P', 'HOLY_SITES': 'H',
    'DIGITAL_PLATFORM': 'D', 'NGO': 'N', 'TASTE_SMELL': 'T',
}
|
||||
|
||||
|
||||
def get_geonames_connection() -> sqlite3.Connection:
    """Get connection to GeoNames database.

    Opens GEONAMES_DB; the caller is responsible for closing the connection.
    """
    return sqlite3.connect(GEONAMES_DB)
|
||||
|
||||
|
||||
def reverse_geocode(lat: float, lon: float, country_code: str, conn: sqlite3.Connection) -> Optional[Dict]:
    """
    Find nearest GeoNames city for given coordinates.

    Distance is squared Euclidean in degrees — adequate for picking the
    nearest settlement, not a geodesic distance. Records with feature code
    PPLX (neighborhoods) are excluded; only proper settlements qualify.
    Returns a dict of city fields plus 'distance_sq', or None.
    """
    query = """
        SELECT
            geonames_id, name, ascii_name, admin1_code, admin1_name,
            latitude, longitude, feature_code, population,
            ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY distance_sq
        LIMIT 1
    """
    row = conn.execute(query, (lat, lat, lon, lon, country_code)).fetchone()
    if row is None:
        return None

    field_names = ('geonames_id', 'city_name', 'ascii_name', 'admin1_code',
                   'admin1_name', 'latitude', 'longitude', 'feature_code',
                   'population', 'distance_sq')
    return dict(zip(field_names, row))
|
||||
|
||||
|
||||
def generate_city_code(name: str) -> str:
    """Generate a 3-letter uppercase city code from *name*.

    Diacritics are stripped (NFD decomposition, combining marks dropped),
    non-alphanumeric characters removed, then the first three characters
    are uppercased. Returns "XXX" when nothing usable remains.
    """
    import unicodedata
    if not name:
        return "XXX"

    # Strip diacritics: decompose, then drop combining marks ('Mn').
    stripped = ''.join(
        ch for ch in unicodedata.normalize('NFD', name)
        if unicodedata.category(ch) != 'Mn'
    )

    # Keep only ASCII letters and digits.
    alnum = re.sub(r'[^a-zA-Z0-9]', '', stripped)
    if not alnum:
        return "XXX"
    return alnum[:3].upper()
|
||||
|
||||
|
||||
def get_region_code(country_code: str, admin1_code: str) -> str:
    """Return the 2-letter region code for a GeoNames admin1 code.

    Prefers the country-specific REGION_CODE_MAPPINGS table when an entry
    exists; otherwise uses the admin1 code itself, truncated to two
    uppercase characters. Returns "XX" when no admin1 code is available.
    """
    if not admin1_code:
        return "XX"

    # Country-specific override table (GeoNames code differs from ISO).
    mapped = REGION_CODE_MAPPINGS.get(country_code, {}).get(admin1_code)
    if mapped:
        return mapped

    # Default: the GeoNames code itself, clipped to two characters.
    return admin1_code[:2].upper()
|
||||
|
||||
|
||||
def generate_ghcid(country_code: str, region_code: str, city_code: str,
                   institution_type: str, abbreviation: str,
                   name_suffix: Optional[str] = None) -> str:
    """Assemble a GHCID string: COUNTRY-REGION-CITY-TYPE-ABBR[-SUFFIX].

    The institution type is collapsed to its single-letter code via
    TYPE_TO_CODE ('U' for unknown types).
    """
    components = [
        country_code,
        region_code,
        city_code,
        TYPE_TO_CODE.get(institution_type, 'U'),
        abbreviation,
    ]
    if name_suffix:
        components.append(name_suffix)
    return '-'.join(components)
|
||||
|
||||
|
||||
def generate_ghcid_uuid(ghcid: str) -> str:
    """Generate UUID v5 from GHCID.

    Deterministic: GHCID_NAMESPACE is fixed (the RFC 4122 DNS namespace),
    so the same GHCID always yields the same UUID.
    """
    return str(uuid.uuid5(GHCID_NAMESPACE, ghcid))
|
||||
|
||||
|
||||
def generate_ghcid_uuid_sha256(ghcid: str) -> str:
    """Generate a UUID-shaped ID (version nibble '8') from the GHCID's SHA-256.

    The hex digest is sliced into the 8-4-4-4-12 layout; hex digit 12 is
    replaced by the literal version nibble '8'.
    NOTE(review): the RFC 4122 variant bits (first nibble of group 4) are
    not forced to 8/9/a/b, so the output is not always a strictly valid
    UUIDv8 — kept as-is so existing stored IDs stay stable.
    """
    digest = hashlib.sha256(ghcid.encode()).hexdigest()
    groups = (digest[:8], digest[8:12], '8' + digest[13:16],
              digest[16:20], digest[20:32])
    return '-'.join(groups)
|
||||
|
||||
|
||||
def generate_ghcid_numeric(ghcid: str) -> int:
    """Derive a stable 64-bit integer ID from the GHCID.

    Takes the first 8 bytes of the SHA-256 digest, interpreted big-endian.
    """
    first_eight = hashlib.sha256(ghcid.encode()).digest()[:8]
    return int.from_bytes(first_eight, byteorder='big')
|
||||
|
||||
|
||||
def extract_coordinates(data: Dict) -> Optional[Tuple[float, float]]:
    """Extract (latitude, longitude) from custodian data, or None.

    Sources are tried in order: original_entry.locations[0], top-level
    locations[0], then google_maps_enrichment. Only the FIRST entry of
    each locations list is considered; values are coerced to float.
    """
    candidates = []
    for locs in (data.get('original_entry', {}).get('locations', []),
                 data.get('locations', [])):
        if locs and isinstance(locs, list):
            candidates.append(locs[0])
    candidates.append(data.get('google_maps_enrichment', {}))

    for source in candidates:
        lat = source.get('latitude')
        lon = source.get('longitude')
        if lat is not None and lon is not None:
            return (float(lat), float(lon))
    return None
|
||||
|
||||
|
||||
def extract_country_code(data: Dict) -> str:
    """Return the country code for a custodian record, or 'XX'.

    Preference order: ghcid.location_resolution.country_code (unless it is
    the placeholder 'XX'), then the first original_entry location, then
    the first top-level location.
    """
    resolved = data.get('ghcid', {}).get('location_resolution', {}).get('country_code')
    if resolved and resolved != 'XX':
        return resolved

    for locs in (data.get('original_entry', {}).get('locations', []),
                 data.get('locations', [])):
        if locs:
            country = locs[0].get('country')
            if country:
                return country

    return 'XX'
|
||||
|
||||
|
||||
def extract_abbreviation_from_ghcid(ghcid: str) -> str:
    """Return the abbreviation (5th dash-separated GHCID component), or "UNK"."""
    components = ghcid.split('-')
    return components[4] if len(components) >= 5 else "UNK"
|
||||
|
||||
|
||||
def extract_name_suffix_from_ghcid(ghcid: str) -> Optional[str]:
    """Return everything after the 5th GHCID component (re-joined with '-'), or None."""
    components = ghcid.split('-')
    if len(components) <= 5:
        return None
    return '-'.join(components[5:])
|
||||
|
||||
|
||||
def validate_ch_annotator_entity(data: Dict) -> Tuple[bool, str]:
    """
    Validate that the entity has a valid CH-Annotator profile for heritage institutions.

    Returns (is_valid, entity_subtype).
    Valid subtypes for enrichment: GRP.HER.* (heritage institutions)
    """
    classification = data.get('ch_annotator', {}).get('entity_classification', {})
    hypernym = classification.get('hypernym', '')
    subtype = classification.get('subtype', '')

    # Heritage institution subtype prefixes accepted for enrichment.
    heritage_prefixes = (
        'GRP.HER',      # Generic heritage institution
        'GRP.HER.GAL',  # Gallery
        'GRP.HER.LIB',  # Library
        'GRP.HER.ARC',  # Archive
        'GRP.HER.MUS',  # Museum
        'GRP.HER.RES',  # Research center
        'GRP.HER.EDU',  # Education provider
        'GRP.HER.REL',  # Religious heritage site
        'GRP.HER.BOT',  # Botanical/zoo
        'GRP.HER.MIX',  # Mixed type
    )

    # A declared subtype under any heritage prefix is accepted verbatim.
    if subtype and subtype.startswith(heritage_prefixes):
        return (True, subtype)

    inst_type = data.get('original_entry', {}).get('institution_type', '')

    # Fallback: a GROUP hypernym plus a recognized institution type lets us
    # synthesize a heritage subtype from the type's first three letters.
    if hypernym == 'GRP' and inst_type in TYPE_TO_CODE:
        return (True, f'GRP.HER.{inst_type[:3]}')

    # No valid CH-Annotator profile - but still allow processing if has institution_type
    if inst_type and inst_type != 'UNKNOWN':
        return (True, f'INFERRED.{inst_type}')

    return (False, '')
|
||||
|
||||
|
||||
def process_file(filepath: Path, conn: sqlite3.Connection, dry_run: bool = False,
                 require_ch_annotator: bool = False) -> Dict:
    """
    Process a single custodian file.

    Reads the custodian YAML, reverse-geocodes its coordinates against the
    GeoNames database, regenerates the GHCID from the geocoded location, and
    (unless unchanged or in dry-run) rewrites the file under the new GHCID,
    updating identifiers and GHCID history. The old file is removed when the
    GHCID (and hence the filename) changed.

    Args:
        filepath: Path to custodian YAML file
        conn: GeoNames database connection
        dry_run: If True, don't write changes
        require_ch_annotator: If True, skip files without valid CH-Annotator entity profile

    Returns dict with processing results; 'status' is one of: 'updated',
    'would_update', 'unchanged', 'already_geocoded', 'no_coordinates',
    'no_country', 'geocode_failed', 'invalid_entity_profile', 'error'.
    """
    result = {
        'file': filepath.name,
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'geonames_match': None,
        'entity_profile': None,
        'error': None,
    }

    try:
        # Explicit UTF-8: the YAML contains non-ASCII names and the platform
        # default encoding is not guaranteed to be UTF-8.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            result['status'] = 'error'
            result['error'] = 'Empty file'
            return result

        # Validate CH-Annotator entity profile (recorded even when not required).
        is_valid_entity, entity_subtype = validate_ch_annotator_entity(data)
        result['entity_profile'] = entity_subtype

        if require_ch_annotator and not is_valid_entity:
            result['status'] = 'invalid_entity_profile'
            result['error'] = 'No valid CH-Annotator GRP.HER.* entity profile'
            return result

        # Get current GHCID
        current_ghcid = data.get('ghcid', {}).get('ghcid_current')
        if not current_ghcid:
            result['status'] = 'error'
            result['error'] = 'No GHCID found'
            return result

        result['old_ghcid'] = current_ghcid

        # Skip files that were already resolved via reverse geocoding.
        resolution = data.get('ghcid', {}).get('location_resolution', {})
        if resolution.get('method') == 'REVERSE_GEOCODE' and resolution.get('geonames_id'):
            result['status'] = 'already_geocoded'
            return result

        # Extract coordinates
        coords = extract_coordinates(data)
        if not coords:
            result['status'] = 'no_coordinates'
            return result

        lat, lon = coords
        country_code = extract_country_code(data)

        # 'XX' is the unresolved-country placeholder; geocoding needs a real code.
        if country_code == 'XX':
            result['status'] = 'no_country'
            return result

        # Reverse geocode against GeoNames.
        geo_result = reverse_geocode(lat, lon, country_code, conn)
        if not geo_result:
            result['status'] = 'geocode_failed'
            return result

        result['geonames_match'] = {
            'city': geo_result['city_name'],
            'admin1': geo_result['admin1_name'],
            'geonames_id': geo_result['geonames_id'],
        }

        # Generate new region/city codes from the geocoded match.
        new_region_code = get_region_code(country_code, geo_result['admin1_code'])
        new_city_code = generate_city_code(geo_result['ascii_name'])

        # Carry over the abbreviation and name suffix from the existing GHCID.
        abbreviation = extract_abbreviation_from_ghcid(current_ghcid)
        name_suffix = extract_name_suffix_from_ghcid(current_ghcid)

        # Get institution type
        inst_type = data.get('original_entry', {}).get('institution_type', 'UNKNOWN')

        # Generate new GHCID
        new_ghcid = generate_ghcid(country_code, new_region_code, new_city_code,
                                   inst_type, abbreviation, name_suffix)

        result['new_ghcid'] = new_ghcid

        # Nothing to do when reverse geocoding reproduces the current GHCID.
        if new_ghcid == current_ghcid:
            result['status'] = 'unchanged'
            return result

        if dry_run:
            result['status'] = 'would_update'
            return result

        # ---- Apply the update ----
        timestamp = datetime.now(timezone.utc).isoformat()

        # Update GHCID section (all derived identifier forms).
        data['ghcid']['ghcid_current'] = new_ghcid
        data['ghcid']['ghcid_uuid'] = generate_ghcid_uuid(new_ghcid)
        data['ghcid']['ghcid_uuid_sha256'] = generate_ghcid_uuid_sha256(new_ghcid)
        data['ghcid']['ghcid_numeric'] = generate_ghcid_numeric(new_ghcid)

        # Update location_resolution with the GeoNames provenance.
        data['ghcid']['location_resolution'] = {
            'method': 'REVERSE_GEOCODE',
            'country_code': country_code,
            'region_code': new_region_code,
            'region_name': geo_result['admin1_name'],
            'city_code': new_city_code,
            'city_name': geo_result['city_name'],
            'geonames_id': geo_result['geonames_id'],
            'feature_code': geo_result['feature_code'],
            'resolution_date': timestamp,
        }

        # Add to GHCID history
        history = data['ghcid'].get('ghcid_history', [])

        # Mark the most recent (head) history entry as superseded.
        if history:
            history[0]['valid_to'] = timestamp
            history[0]['superseded_by'] = new_ghcid

        # Prepend the new GHCID entry (history is newest-first).
        history.insert(0, {
            'ghcid': new_ghcid,
            'ghcid_numeric': generate_ghcid_numeric(new_ghcid),
            'valid_from': timestamp,
            'reason': f'Updated via GeoNames reverse geocoding (matched {geo_result["city_name"]}, geonames:{geo_result["geonames_id"]})',
        })

        data['ghcid']['ghcid_history'] = history

        # Keep the parallel identifiers list in sync with the new GHCID.
        for ident in data.get('identifiers', []):
            if ident.get('identifier_scheme') == 'GHCID':
                ident['identifier_value'] = new_ghcid
            elif ident.get('identifier_scheme') == 'GHCID_UUID':
                ident['identifier_value'] = generate_ghcid_uuid(new_ghcid)
            elif ident.get('identifier_scheme') == 'GHCID_UUID_SHA256':
                ident['identifier_value'] = generate_ghcid_uuid_sha256(new_ghcid)
            elif ident.get('identifier_scheme') == 'GHCID_NUMERIC':
                ident['identifier_value'] = str(generate_ghcid_numeric(new_ghcid))

        # Write the updated data under the new GHCID-derived filename.
        new_filename = f"{new_ghcid}.yaml"
        new_filepath = CUSTODIAN_DIR / new_filename

        # Explicit UTF-8 to match allow_unicode=True output.
        with open(new_filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Remove the old file when the rename actually moved it.
        if filepath != new_filepath:
            os.remove(filepath)

        result['status'] = 'updated'
        return result

    except Exception as e:
        # Boundary handler: any per-file failure is reported, not raised,
        # so a single bad file does not abort the batch run.
        result['status'] = 'error'
        result['error'] = str(e)
        return result
|
||||
|
||||
|
||||
def main():
    """CLI entry point: reverse-geocode custodian files and update their GHCIDs.

    Processes every YAML file in CUSTODIAN_DIR (optionally filtered by
    --country and truncated by --limit), prints a console summary, and
    writes a markdown report to REPORTS_DIR.
    """
    parser = argparse.ArgumentParser(description='Update GHCID with GeoNames data')
    parser.add_argument('--dry-run', action='store_true', help='Show changes without applying')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--country', type=str, help='Only process files for specific country')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output')
    parser.add_argument('--require-ch-annotator', action='store_true',
                        help='Only process files with valid CH-Annotator GRP.HER.* entity profile')
    args = parser.parse_args()

    print("=" * 60)
    print("Update GHCID with GeoNames Reverse Geocoding")
    print("=" * 60)
    print()

    if args.dry_run:
        print("*** DRY RUN - No changes will be made ***")
        print()

    if args.require_ch_annotator:
        print("*** Requiring CH-Annotator entity profile (GRP.HER.*) ***")
        print()

    # Connect to GeoNames
    if not GEONAMES_DB.exists():
        print(f"Error: GeoNames database not found at {GEONAMES_DB}")
        return

    conn = get_geonames_connection()
    print("Connected to GeoNames database")

    # Get list of files
    files = list(CUSTODIAN_DIR.glob("*.yaml"))
    print(f"Found {len(files)} custodian files")

    # Filter by country if specified (filenames start with the country code).
    if args.country:
        files = [f for f in files if f.name.startswith(f"{args.country}-")]
        print(f"Filtered to {len(files)} files for country {args.country}")

    # Apply limit
    if args.limit:
        files = files[:args.limit]
        print(f"Limited to {args.limit} files")

    print()

    # Per-status counters; process_file may introduce statuses beyond these
    # seeds, which is why increments below use stats.get().
    stats = {
        'updated': 0,
        'unchanged': 0,
        'already_geocoded': 0,
        'no_coordinates': 0,
        'no_country': 0,
        'geocode_failed': 0,
        'would_update': 0,
        'invalid_entity_profile': 0,
        'error': 0,
    }

    updates = []
    entity_profiles_seen = {}

    for i, filepath in enumerate(files):
        if (i + 1) % 500 == 0:
            print(f"Progress: {i + 1}/{len(files)}")

        result = process_file(filepath, conn, args.dry_run, args.require_ch_annotator)
        stats[result['status']] = stats.get(result['status'], 0) + 1

        # Track entity profiles. The key always exists in the result (it is
        # initialized to None), so map falsy values to 'NONE' explicitly.
        profile = result.get('entity_profile') or 'NONE'
        entity_profiles_seen[profile] = entity_profiles_seen.get(profile, 0) + 1

        if result['status'] in ('updated', 'would_update'):
            updates.append(result)
            if args.verbose:
                print(f" {result['old_ghcid']} -> {result['new_ghcid']}")
                print(f" Matched: {result['geonames_match']}")
                print(f" Entity: {result.get('entity_profile', 'N/A')}")

    conn.close()

    # Print summary
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {len(files)}")
    print()
    print("Results:")
    print(f" Updated: {stats.get('updated', 0)}")
    print(f" Would update (dry-run): {stats.get('would_update', 0)}")
    print(f" Unchanged: {stats.get('unchanged', 0)}")
    print(f" Already geocoded: {stats.get('already_geocoded', 0)}")
    print(f" No coordinates: {stats.get('no_coordinates', 0)}")
    print(f" No country code: {stats.get('no_country', 0)}")
    print(f" Geocode failed: {stats.get('geocode_failed', 0)}")
    print(f" Invalid entity profile: {stats.get('invalid_entity_profile', 0)}")
    print(f" Errors: {stats.get('error', 0)}")

    # Print entity profile breakdown (top 10 by frequency).
    if entity_profiles_seen:
        print()
        print("CH-Annotator Entity Profiles:")
        for profile, count in sorted(entity_profiles_seen.items(), key=lambda x: -x[1])[:10]:
            print(f" {profile}: {count}")

    # Save report
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    report_file = REPORTS_DIR / f"GEONAMES_UPDATE_REPORT_{timestamp}.md"

    with open(report_file, 'w') as f:
        f.write("# GeoNames GHCID Update Report\n\n")
        f.write(f"Generated: {datetime.now(timezone.utc).isoformat()}\n\n")
        f.write("## Summary\n\n")
        f.write("| Metric | Count |\n")
        f.write("|--------|-------|\n")
        f.write(f"| Files processed | {len(files)} |\n")
        f.write(f"| Updated | {stats.get('updated', 0)} |\n")
        f.write(f"| Would update | {stats.get('would_update', 0)} |\n")
        f.write(f"| Unchanged | {stats.get('unchanged', 0)} |\n")
        f.write(f"| Already geocoded | {stats.get('already_geocoded', 0)} |\n")
        f.write(f"| No coordinates | {stats.get('no_coordinates', 0)} |\n")
        # Previously missing: keep the report consistent with the console summary.
        f.write(f"| No country code | {stats.get('no_country', 0)} |\n")
        f.write(f"| Geocode failed | {stats.get('geocode_failed', 0)} |\n")
        f.write(f"| Invalid entity profile | {stats.get('invalid_entity_profile', 0)} |\n")
        f.write(f"| Errors | {stats.get('error', 0)} |\n")

        # Entity profile breakdown
        if entity_profiles_seen:
            f.write("\n## CH-Annotator Entity Profiles\n\n")
            f.write("| Entity Profile | Count |\n")
            f.write("|---------------|-------|\n")
            for profile, count in sorted(entity_profiles_seen.items(), key=lambda x: -x[1]):
                f.write(f"| {profile} | {count} |\n")

        if updates:
            f.write("\n## Updates\n\n")
            f.write("| Old GHCID | New GHCID | Matched City | Entity Profile |\n")
            f.write("|-----------|-----------|-------------|----------------|\n")
            for u in updates[:100]:  # Limit to first 100
                city = u.get('geonames_match', {}).get('city', 'N/A')
                profile = u.get('entity_profile', 'N/A')
                f.write(f"| {u['old_ghcid']} | {u['new_ghcid']} | {city} | {profile} |\n")

            if len(updates) > 100:
                f.write(f"\n*... and {len(updates) - 100} more updates*\n")

    print()
    print(f"Report saved to: {report_file}")
|
||||
Loading…
Reference in a new issue