def add_location_claims(yaml_file: Path) -> "bool | None":
    """
    Add CH-Annotator location claims to a custodian file.

    Reads the YAML file, takes city / region / country from
    ``ghcid.location_resolution`` (falling back to the top-level ``location``
    mapping), appends one claim per available value to
    ``ch_annotator.entity_claims`` and rewrites the file in place.

    Args:
        yaml_file: Path of the custodian YAML file to update.

    Returns:
        True if claims were appended and the file was rewritten.
        False if the file was skipped (empty, no GeoNames ID, missing city or
        country, or a ``location_city`` claim is already present).
        None if reading, processing or writing raised an exception; ``main``
        counts this outcome separately from skips, so it must not be False.
    """
    try:
        with open(yaml_file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            print(f" SKIP: Empty file {yaml_file.name}")
            return False

        # `or {}` also covers keys that are present in the YAML but null.
        location_resolution = (data.get('ghcid') or {}).get('location_resolution') or {}
        location = data.get('location') or {}

        # The GeoNames ID must come from location_resolution; there is no
        # fallback because files without it are skipped right here.
        geonames_id = location_resolution.get('geonames_id')
        if not geonames_id:
            print(f" SKIP: No GeoNames ID in {yaml_file.name}")
            return False

        city_name = location_resolution.get('city_name') or location.get('city')
        region_name = location_resolution.get('region_name') or location.get('region')
        country_code = location_resolution.get('country_code') or location.get('country')
        resolution_timestamp = location_resolution.get('resolution_timestamp')

        if not (city_name and country_code):
            print(f" SKIP: Missing required location data in {yaml_file.name}")
            return False

        # Create the containers only when absent or null, never replacing
        # existing content (additive-only rule from the module docstring).
        ch_annotator = data.get('ch_annotator')
        if ch_annotator is None:
            ch_annotator = data['ch_annotator'] = {}
        entity_claims = ch_annotator.get('entity_claims')
        if entity_claims is None:
            entity_claims = ch_annotator['entity_claims'] = []

        existing_claim_types = {
            claim.get('claim_type') for claim in entity_claims if isinstance(claim, dict)
        }
        if 'location_city' in existing_claim_types:
            print(f" SKIP: Location claims already exist in {yaml_file.name}")
            return False

        # Prefer the timestamp recorded when the location was resolved.
        timestamp = resolution_timestamp or datetime.now(timezone.utc).isoformat()

        def make_provenance(path_suffix: str):
            """Build the 5-component provenance record for one claim."""
            return {
                'namespace': 'geonames',
                'path': f'/cities/{geonames_id}{path_suffix}',
                'timestamp': timestamp,
                # NOTE(review): the module docstring names
                # 'opencode-claude-sonnet-4' as the agent; confirm which
                # value is intended before the next bulk run.
                'agent': 'glm4.6',  # Z.AI GLM 4.6 - preferred model
                'context_convention': 'ch_annotator-v1_7_0'
            }

        def make_claim(claim_type, value, property_uri, path_suffix, confidence):
            """Build one entity claim with GeoNames provenance."""
            return {
                'claim_type': claim_type,
                'claim_value': value,
                'property_uri': property_uri,
                'provenance': make_provenance(path_suffix),
                'confidence': confidence,
                'resolution_method': 'GEONAMES_RESEARCH'
            }

        new_claims = [
            make_claim('location_city', city_name, 'schema:addressLocality', '/name', 0.95),
        ]
        if region_name:  # the region claim is optional
            new_claims.append(
                make_claim('location_region', region_name, 'schema:addressRegion', '/admin1', 0.95)
            )
        new_claims.append(
            make_claim('location_country', country_code, 'schema:addressCountry', '/country', 0.98)
        )
        new_claims.append(
            make_claim('geonames_id', str(geonames_id), 'gn:geonamesId', '', 0.98)
        )
        entity_claims.extend(new_claims)

        # Serialise fully before opening the file for writing, so a dump
        # failure cannot leave a truncated custodian file behind.
        serialized = yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False)
        with open(yaml_file, 'w', encoding='utf-8') as f:
            f.write(serialized)

        # Report the real number: the region claim is skipped when unknown.
        print(f" ADDED: {len(new_claims)} location claims to {yaml_file.name}")
        return True

    except Exception as e:
        print(f" ERROR: {yaml_file.name}: {e}")
        return None
Summary + print() + print("=" * 70) + print("SUMMARY") + print("=" * 70) + print(f"Files processed: {len(resolved_files)}") + print(f"Claims added: {added_count}") + print(f"Skipped: {skipped_count}") + print(f"Errors: {error_count}") + print() + + if added_count > 0: + print("CH-Annotator location claims added successfully!") + print("Each file now has 4 new claims:") + print(" - location_city (schema:addressLocality)") + print(" - location_region (schema:addressRegion)") + print(" - location_country (schema:addressCountry)") + print(" - geonames_id (gn:geonamesId)") + + +if __name__ == "__main__": + main() diff --git a/scripts/create_custodian_from_ch_annotator.py b/scripts/create_custodian_from_ch_annotator.py new file mode 100644 index 0000000000..7f79af1e8a --- /dev/null +++ b/scripts/create_custodian_from_ch_annotator.py @@ -0,0 +1,547 @@ +#!/usr/bin/env python3 +""" +Create custodian files from CH-Annotator data for unmatched institutions. + +This script: +1. Loads CH-Annotator files from data/instances/*_ch_annotator.yaml +2. Checks which institutions don't have custodian files yet +3. Generates GHCID for each new institution +4. 
Creates custodian files in data/custodian/ + +Usage: + python scripts/create_custodian_from_ch_annotator.py [--dry-run] [--limit N] +""" + +import os +import sys +import yaml +import json +import re +import uuid +import hashlib +import argparse +from datetime import datetime, timezone +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Any + +# Paths +PROJECT_ROOT = Path(__file__).parent.parent +CH_ANNOTATOR_DIR = PROJECT_ROOT / "data" / "instances" +CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian" +REPORTS_DIR = PROJECT_ROOT / "reports" +INDEX_FILE = Path("/tmp/custodian_index.json") + +# GHCID namespace UUID for deterministic UUID generation +GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8') # URL namespace + +# Institution type to GHCID code mapping +TYPE_TO_CODE = { + 'GALLERY': 'G', + 'LIBRARY': 'L', + 'ARCHIVE': 'A', + 'MUSEUM': 'M', + 'OFFICIAL_INSTITUTION': 'O', + 'RESEARCH_CENTER': 'R', + 'CORPORATION': 'C', + 'UNKNOWN': 'U', + 'BOTANICAL_ZOO': 'B', + 'EDUCATION_PROVIDER': 'E', + 'COLLECTING_SOCIETY': 'S', + 'FEATURES': 'F', + 'INTANGIBLE_HERITAGE_GROUP': 'I', + 'MIXED': 'X', + 'PERSONAL_COLLECTION': 'P', + 'HOLY_SITES': 'H', + 'DIGITAL_PLATFORM': 'D', + 'NGO': 'N', + 'TASTE_SMELL': 'T', +} + +# Prepositions/articles to skip in abbreviations +SKIP_WORDS = { + 'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des', + 'a', 'an', 'the', 'of', 'at', 'on', 'to', 'for', 'with', 'from', 'by', + 'le', 'la', 'les', 'un', 'une', 'des', 'du', 'à', 'au', 'aux', 'en', + 'der', 'die', 'das', 'dem', 'ein', 'eine', 'von', 'zu', 'für', 'mit', + 'el', 'la', 'los', 'las', 'un', 'una', 'del', 'al', 'con', 'por', 'para', + 'o', 'os', 'as', 'um', 'uma', 'do', 'da', 'dos', 'das', 'em', 'no', 'na', + 'il', 'lo', 'i', 'gli', 'di', 'del', 'dello', 'della', 'nel', 'nella', + 'and', 'or', 'but', 'und', 'oder', 'et', 'ou', 'e', 'y', 'o', +} + + +def normalize_name(name: str) -> str: + """Normalize name for comparison.""" + 
def name_to_snake_case(name: str) -> str:
    """Convert name to snake_case for file suffix.

    Diacritics are stripped, the text is lower-cased, punctuation is removed,
    runs of whitespace/hyphens become a single underscore, and anything
    outside ``[a-z0-9_]`` is dropped.

    Args:
        name: Institution name; ``None`` or an empty string is accepted.

    Returns:
        At most 50 characters with no leading or trailing underscore.
        Returns an empty string when the name is empty or contains no ASCII
        letters or digits after normalisation.
    """
    import unicodedata

    if not name:
        return ""

    # Decompose accented characters and drop the combining marks.
    normalized = unicodedata.normalize('NFD', name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')

    # Lowercase and clean
    lower = ascii_name.lower()
    no_punct = re.sub(r"[''`\",.:;!?()[\]{}]", '', lower)
    underscored = re.sub(r'[\s\-]+', '_', no_punct)
    clean = re.sub(r'[^a-z0-9_]', '', underscored)
    final = re.sub(r'_+', '_', clean).strip('_')

    # Cutting at 50 characters can land right after a word separator, so
    # strip again to keep the suffix free of a dangling underscore.
    return final[:50].rstrip('_')
def load_custodian_index() -> Dict:
    """Load the custodian index from the JSON cache, or build it from disk.

    If ``INDEX_FILE`` exists and parses as JSON it is returned as-is.
    Otherwise every ``*.yaml`` file in ``CUSTODIAN_DIR`` is scanned with
    regular expressions (no YAML parsing) and the result is cached to
    ``INDEX_FILE``.

    Returns:
        A dict with the keys ``by_wikidata``, ``by_name``, ``by_isil`` and
        ``by_ghcid``, each mapping an identifier to a file path string.
        ``by_isil`` is created empty; nothing in this function fills it.

    NOTE(review): an existing cache is trusted without any freshness check,
    so files added after the cache was written are not seen until the cache
    file is removed.
    """
    if INDEX_FILE.exists():
        try:
            with open(INDEX_FILE, 'r', encoding='utf-8') as cache:
                return json.load(cache)
        except (OSError, ValueError) as e:
            # ValueError covers JSONDecodeError and UnicodeDecodeError:
            # a damaged cache is rebuilt instead of aborting the run.
            print(f"Warning: ignoring unreadable index cache {INDEX_FILE}: {e}")

    # Build index
    print("Building custodian index...")
    index = {'by_wikidata': {}, 'by_name': {}, 'by_isil': {}, 'by_ghcid': {}}

    wikidata_re = re.compile(r'wikidata_entity_id:\s*["\']?(Q\d+)')
    name_re = re.compile(r'organisatie:\s*(.+?)$', re.MULTILINE)

    for custodian_path in CUSTODIAN_DIR.glob("*.yaml"):
        # The filename stem is the GHCID. Register it before reading so an
        # unreadable file still blocks reuse of its GHCID.
        index['by_ghcid'][custodian_path.stem] = str(custodian_path)

        try:
            content = custodian_path.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            print(f"Warning: could not read {custodian_path.name}: {e}")
            continue

        wikidata_match = wikidata_re.search(content)
        if wikidata_match:
            index['by_wikidata'][wikidata_match.group(1).upper()] = str(custodian_path)

        name_match = name_re.search(content)
        if name_match:
            name = name_match.group(1).strip().strip('"\'')
            index['by_name'][normalize_name(name)] = str(custodian_path)

    try:
        with open(INDEX_FILE, 'w', encoding='utf-8') as cache:
            json.dump(index, cache)
    except OSError as e:
        # The cache is only an optimisation; the in-memory index is valid.
        print(f"Warning: could not write index cache {INDEX_FILE}: {e}")

    return index
def create_custodian_file(inst: Dict, source_file: str, index: Dict) -> Tuple[Optional[Path], str]:
    """
    Create a custodian file for an institution.

    Builds a GHCID from the institution's location, type and name, derives
    the UUID / numeric identifiers from it, writes the record as YAML to
    ``CUSTODIAN_DIR/<ghcid>.yaml`` and registers it in ``index``.

    Args:
        inst: Institution record from a CH-Annotator file. It is not modified.
        source_file: Name of the CH-Annotator file, recorded as provenance.
        index: Custodian index; updated in place when a file is created.

    Returns: (file_path, status) where status is 'created', 'exists', or
        'error: <message>'. 'exists' means the target GHCID is already taken
        even after adding the name suffix; the existing file is left as-is.
    """
    try:
        # `or` also covers keys that are present but null in the source.
        name = inst.get('name') or 'Unknown Institution'
        institution_type = inst.get('institution_type') or 'UNKNOWN'
        source_identifiers = inst.get('identifiers') or []
        source_provenance = inst.get('provenance') or {}

        # Extract location
        country_code, region_code, city_code = extract_location_info(inst)

        # Generate abbreviation
        abbreviation = generate_abbreviation(name)

        def is_taken(candidate: str) -> bool:
            """True if the GHCID is in the index or its file is on disk."""
            # The disk check matters when the cached index is stale.
            return candidate in index['by_ghcid'] or (CUSTODIAN_DIR / f"{candidate}.yaml").exists()

        # Generate base GHCID
        ghcid = generate_ghcid(country_code, region_code, city_code, institution_type, abbreviation)

        # Check for collision
        if is_taken(ghcid):
            # Names with no ASCII letters or digits give an empty snake_case
            # suffix; fall back to a short hash so the collision is resolved.
            name_suffix = name_to_snake_case(name) or hashlib.sha256(name.encode('utf-8')).hexdigest()[:8]
            ghcid = generate_ghcid(country_code, region_code, city_code, institution_type, abbreviation, name_suffix)

        file_path = CUSTODIAN_DIR / f"{ghcid}.yaml"
        if is_taken(ghcid):
            # Never overwrite an existing custodian file.
            return file_path, 'exists'

        # Generate UUIDs
        ghcid_uuid = generate_ghcid_uuid(ghcid)
        ghcid_uuid_sha256 = generate_ghcid_uuid_sha256(ghcid)
        ghcid_numeric = generate_ghcid_numeric(ghcid)
        record_id = str(uuid.uuid4())

        timestamp = datetime.now(timezone.utc).isoformat()

        # Build custodian data structure
        custodian_data = {
            'original_entry': {
                'name': name,
                'institution_type': institution_type,
                'source': f'CH-Annotator ({source_file})',
                'identifiers': source_identifiers,
                'locations': inst.get('locations') or [],
            },
            'processing_timestamp': timestamp,
            'ghcid': {
                'ghcid_current': ghcid,
                'ghcid_original': ghcid,
                'ghcid_uuid': ghcid_uuid,
                'ghcid_uuid_sha256': ghcid_uuid_sha256,
                'ghcid_numeric': ghcid_numeric,
                'record_id': record_id,
                'generation_timestamp': timestamp,
                'location_resolution': {
                    'country_code': country_code,
                    'region_code': region_code,
                    'city_code': city_code,
                    'method': 'CH_ANNOTATOR_SOURCE',
                },
                'ghcid_history': [{
                    'ghcid': ghcid,
                    'ghcid_numeric': ghcid_numeric,
                    'valid_from': timestamp,
                    'reason': f'Initial GHCID from CH-Annotator ({source_file})',
                }],
            },
            'custodian_name': {
                'claim_type': 'custodian_name',
                'claim_value': name,
                'source_type': 'ch_annotator',
            },
            'identifiers': [
                {'identifier_scheme': 'GHCID', 'identifier_value': ghcid},
                {'identifier_scheme': 'GHCID_UUID', 'identifier_value': ghcid_uuid},
                {'identifier_scheme': 'GHCID_UUID_SHA256', 'identifier_value': ghcid_uuid_sha256},
                {'identifier_scheme': 'GHCID_NUMERIC', 'identifier_value': str(ghcid_numeric)},
                {'identifier_scheme': 'RECORD_ID', 'identifier_value': record_id},
            ],
            'provenance': {
                'data_source': source_provenance.get('data_source', 'CH_ANNOTATOR'),
                'data_tier': source_provenance.get('data_tier', 'TIER_3_CROWD_SOURCED'),
                'extraction_date': source_provenance.get('extraction_date', timestamp),
                'extraction_method': f'Created from CH-Annotator file: {source_file}',
                'confidence_score': source_provenance.get('confidence_score', 0.8),
            },
            # Shallow copy: the integration note added below must not leak
            # back into the caller's record.
            'ch_annotator': dict(inst.get('ch_annotator') or {}),
        }

        generated_schemes = {'GHCID', 'GHCID_UUID', 'GHCID_UUID_SHA256', 'GHCID_NUMERIC', 'RECORD_ID'}
        wikidata_qid = ''
        for ident in source_identifiers:
            scheme = str(ident.get('identifier_scheme') or '').upper()

            # Add original identifiers. Copies are appended because the same
            # dict objects already sit under original_entry, and PyYAML
            # writes anchors/aliases for objects referenced twice.
            if scheme not in generated_schemes:
                custodian_data['identifiers'].append(dict(ident))

            # Add Wikidata enrichment if available (first Wikidata ID wins)
            if scheme == 'WIKIDATA' and 'wikidata_enrichment' not in custodian_data:
                entity_id = str(ident.get('identifier_value') or '').split('/')[-1]
                custodian_data['wikidata_enrichment'] = {
                    'wikidata_entity_id': entity_id,
                    'wikidata_label_en': name,
                }
                wikidata_qid = normalize_wikidata(entity_id)

        # Add integration note to ch_annotator
        if custodian_data['ch_annotator']:
            custodian_data['ch_annotator']['integration_note'] = {
                'created_from': source_file,
                'creation_date': timestamp,
                'creation_method': 'create_custodian_from_ch_annotator.py',
            }

        # Create file
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(custodian_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)

        # Update index so later records in the same run see this custodian,
        # whether they are matched by GHCID, name or Wikidata ID.
        index['by_ghcid'][ghcid] = str(file_path)
        normalized_name = normalize_name(name)
        if normalized_name:
            index['by_name'][normalized_name] = str(file_path)
        if wikidata_qid:
            index['by_wikidata'][wikidata_qid] = str(file_path)

        return file_path, 'created'

    except Exception as e:
        return None, f'error: {e}'
Found {len(ch_files)} CH-Annotator files") + + # Process files + total_stats = { + 'processed': 0, + 'created': 0, + 'skipped_exists': 0, + 'errors': 0, + 'by_source': {}, + } + + for ch_file in ch_files: + print(f"\n--- {ch_file.name} ---") + + try: + institutions = load_ch_annotator_file(ch_file) + print(f" Loaded {len(institutions)} institutions") + + if args.skip_large and len(institutions) > 5000: + print(f" SKIPPING (>5000 institutions)") + continue + + file_stats = {'processed': 0, 'created': 0, 'skipped': 0, 'errors': 0} + + for i, inst in enumerate(institutions): + if args.limit and file_stats['processed'] >= args.limit: + print(f" Reached limit of {args.limit}") + break + + if i % 500 == 0 and i > 0: + print(f" Progress: {i}/{len(institutions)}, created: {file_stats['created']}") + + file_stats['processed'] += 1 + total_stats['processed'] += 1 + + # Check if exists + if institution_exists(inst, index): + file_stats['skipped'] += 1 + total_stats['skipped_exists'] += 1 + continue + + # Create file + if not args.dry_run: + path, status = create_custodian_file(inst, ch_file.name, index) + + if status == 'created': + file_stats['created'] += 1 + total_stats['created'] += 1 + elif 'error' in status: + file_stats['errors'] += 1 + total_stats['errors'] += 1 + else: + file_stats['created'] += 1 + total_stats['created'] += 1 + + print(f" Processed: {file_stats['processed']}, Created: {file_stats['created']}, " + f"Skipped: {file_stats['skipped']}, Errors: {file_stats['errors']}") + + total_stats['by_source'][ch_file.name] = file_stats + + except Exception as e: + print(f" ERROR: {e}") + total_stats['errors'] += 1 + + # Print summary + print("\n" + "=" * 60) + print("SUMMARY") + print("=" * 60) + print(f"Total processed: {total_stats['processed']}") + print(f"Files created: {total_stats['created']}") + print(f"Skipped (already exist): {total_stats['skipped_exists']}") + print(f"Errors: {total_stats['errors']}") + + # Save report + if not args.dry_run: + 
REPORTS_DIR.mkdir(exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + report_path = REPORTS_DIR / f"CUSTODIAN_CREATION_REPORT_{timestamp}.md" + + report = f"""# Custodian File Creation Report + +Generated: {datetime.now(timezone.utc).isoformat()} + +## Summary + +| Metric | Count | +|--------|-------| +| Institutions processed | {total_stats['processed']} | +| Custodian files created | {total_stats['created']} | +| Skipped (already exist) | {total_stats['skipped_exists']} | +| Errors | {total_stats['errors']} | + +## By Source File + +| Source File | Processed | Created | Skipped | Errors | +|-------------|-----------|---------|---------|--------| +""" + for source, stats in total_stats['by_source'].items(): + report += f"| {source} | {stats['processed']} | {stats['created']} | {stats['skipped']} | {stats['errors']} |\n" + + with open(report_path, 'w') as f: + f.write(report) + + print(f"\nReport saved to: {report_path}") + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/scripts/enrich_austrian_cities.py b/scripts/enrich_austrian_cities.py new file mode 100644 index 0000000000..d4a6a87f0d --- /dev/null +++ b/scripts/enrich_austrian_cities.py @@ -0,0 +1,515 @@ +#!/usr/bin/env python3 +""" +Enrich Austrian custodian files with city data. + +Strategy: +1. Use coordinates for reverse geocoding when available +2. Extract city names from institution names (Wien, Salzburg, Graz, etc.) +3. 
Validate against GeoNames database + +Usage: + python scripts/enrich_austrian_cities.py [--dry-run] +""" + +import re +import sqlite3 +import sys +import unicodedata +from datetime import datetime, timezone +from pathlib import Path + +# Austrian admin1 codes (GeoNames → ISO 3166-2:AT) +AUSTRIAN_ADMIN1_MAP = { + '01': 'B', # Burgenland + '02': 'K', # Carinthia (Kärnten) + '03': 'NO', # Lower Austria (Niederösterreich) + '04': 'OO', # Upper Austria (Oberösterreich) + '05': 'S', # Salzburg + '06': 'ST', # Styria (Steiermark) + '07': 'T', # Tyrol (Tirol) + '08': 'V', # Vorarlberg + '09': 'W', # Vienna (Wien) +} + +# Known Austrian cities in institution names +AUSTRIAN_CITY_PATTERNS = [ + # Major cities + (r'\bWien\b', 'Wien'), + (r'\bVienna\b', 'Wien'), + (r'\bGraz\b', 'Graz'), + (r'\bLinz\b', 'Linz'), + (r'\bSalzburg\b', 'Salzburg'), + (r'\bInnsbruck\b', 'Innsbruck'), + (r'\bKlagenfurt\b', 'Klagenfurt'), + (r'\bVillach\b', 'Villach'), + (r'\bWels\b', 'Wels'), + (r'\bSt\.?\s*Pölten\b', 'Sankt Pölten'), + (r'\bSankt\s+Pölten\b', 'Sankt Pölten'), + (r'\bDornbirn\b', 'Dornbirn'), + (r'\bWiener\s+Neustadt\b', 'Wiener Neustadt'), + (r'\bSteyr\b', 'Steyr'), + (r'\bFeldkirch\b', 'Feldkirch'), + (r'\bBregenz\b', 'Bregenz'), + (r'\bLeonding\b', 'Leonding'), + (r'\bKlosterneuburg\b', 'Klosterneuburg'), + (r'\bBaden\b', 'Baden'), + (r'\bLeoben\b', 'Leoben'), + (r'\bKrems\b', 'Krems an der Donau'), + (r'\bAmstetten\b', 'Amstetten'), + (r'\bMödling\b', 'Mödling'), + (r'\bKapfenberg\b', 'Kapfenberg'), + (r'\bLustenau\b', 'Lustenau'), + (r'\bHallein\b', 'Hallein'), + (r'\bKufstein\b', 'Kufstein'), + (r'\bTraun\b', 'Traun'), + (r'\bAnsfelden\b', 'Ansfelden'), + (r'\bHohenems\b', 'Hohenems'), + (r'\bSchwechat\b', 'Schwechat'), + (r'\bBraunau\b', 'Braunau am Inn'), + (r'\bStockerau\b', 'Stockerau'), + (r'\bSaalfelden\b', 'Saalfelden am Steinernen Meer'), + (r'\bTernitz\b', 'Ternitz'), + (r'\bPerchtoldsdorf\b', 'Perchtoldsdorf'), + (r'\bEisenstädter?\b', 'Eisenstadt'), + 
(r'\bEisenstadt\b', 'Eisenstadt'), + (r'\bTelfs\b', 'Telfs'), + (r'\bWolfsberg\b', 'Wolfsberg'), + (r'\bHard\b', 'Hard'), + (r'\bKorneuburg\b', 'Korneuburg'), + (r'\bNeunkirchen\b', 'Neunkirchen'), + (r'\bRied\b', 'Ried im Innkreis'), + (r'\bBad\s+Ischl\b', 'Bad Ischl'), + (r'\bGmunden\b', 'Gmunden'), + (r'\bWörgl\b', 'Wörgl'), + (r'\bMelk\b', 'Melk'), + (r'\bZell\s+am\s+See\b', 'Zell am See'), + (r'\bMistelbach\b', 'Mistelbach'), + (r'\bVöcklabruck\b', 'Vöcklabruck'), + (r'\bMarchtrenk\b', 'Marchtrenk'), + (r'\bEnns\b', 'Enns'), + (r'\bBruck\s+an\s+der\s+Mur\b', 'Bruck an der Mur'), + (r'\bSpittal\b', 'Spittal an der Drau'), + (r'\bSchwaz\b', 'Schwaz'), + (r'\bVoitsberg\b', 'Voitsberg'), + (r'\bRankweil\b', 'Rankweil'), + (r'\bBad\s+Vöslau\b', 'Bad Vöslau'), + (r'\bTulln\b', 'Tulln an der Donau'), + (r'\bGänserndorf\b', 'Gänserndorf'), + (r'\bHollabrunn\b', 'Hollabrunn'), + (r'\bLienz\b', 'Lienz'), + (r'\bHall\s+in\s+Tirol\b', 'Hall in Tirol'), + (r'\bFeldkirchen\b', 'Feldkirchen in Kärnten'), + (r'\bZwettl\b', 'Zwettl'), + (r'\bWaidhofen\b', 'Waidhofen an der Ybbs'), + (r'\bMattersburg\b', 'Mattersburg'), + (r'\bOberwart\b', 'Oberwart'), + (r'\bJudenburg\b', 'Judenburg'), + (r'\bPöchlarn\b', 'Pöchlarn'), + (r'\bFranziskanerplatz\b', 'Wien'), # Common Vienna address + (r'\bJosefsplatz\b', 'Wien'), # Hofburg, Vienna + + # Regional references → capital cities + (r'\bTiroler\b', 'Innsbruck'), # Amt der Tiroler Landesregierung + (r'\bBurgenländische\b', 'Eisenstadt'), # Burgenländische Landesbibliothek + (r'\bKärnt(?:en|ner)\b', 'Klagenfurt'), # Kärnten/Kärntner → Klagenfurt + (r'\bVorarlberg(?:er)?\b', 'Feldkirch'), # Vorarlberg + (r'\bSteiermark\b', 'Graz'), # Steiermark + (r'\bSteiermärk\b', 'Graz'), # Steiermärkisch + (r'\bOÖ\b', 'Linz'), # OÖ = Oberösterreich + (r'\bOberösterreich\b', 'Linz'), # Oberösterreich + (r'\bNiederösterreich\b', 'Sankt Pölten'), # Niederösterreich + (r'\bNÖ\b', 'Sankt Pölten'), # NÖ = Niederösterreich + (r'\bSalzburg(?:er)?\b', 
'Salzburg'), # Salzburger Festspiele + + # Small towns mentioned in institution names + (r'\bKaltenleutgeben\b', 'Kaltenleutgeben'), + (r'\bLambach\b', 'Lambach'), + (r'\bSeitenstetten\b', 'Seitenstetten'), + (r'\bMattsee\b', 'Mattsee'), + (r'\bPöggstall\b', 'Pöggstall'), + (r'\bLaxenburg\b', 'Laxenburg'), + (r'\bEggenburg\b', 'Eggenburg'), + (r'\bPressbaum\b', 'Pressbaum'), + (r'\bSeeburg\b', 'Seekirchen am Wallersee'), # Schloss Seeburg + (r'\bSchotten(?:stift)?\b', 'Wien'), # Schottenstift is in Vienna + (r'\bAlbertina\b', 'Wien'), # Albertina is in Vienna + (r'\bMozarteum\b', 'Salzburg'), # Mozarteum is in Salzburg + (r'\bParacelsus\b', 'Salzburg'), # Paracelsus Medizinische Privatuniversität + (r'\bJoanneum\b', 'Graz'), # FH Joanneum is in Graz + (r'\bParlament\b', 'Wien'), # Parlamentsbibliothek + (r'\bBundeskanzleramt\b', 'Wien'), # Federal Chancellery + (r'\bBundesministerium\b', 'Wien'), # Federal Ministries + (r'\bBundesdenkmalamt\b', 'Wien'), # Federal Monument Office + (r'\bÖsterreich(?:ische[rn]?)?\b', 'Wien'), # Austrian national institutions + (r'\bIST\s*Austria\b', 'Klosterneuburg'), # Institute of Science and Technology Austria + (r'\bInstitute\s+of\s+Science\s+and\s+Technology\s+Austria\b', 'Klosterneuburg'), # Full name + (r'\bRapid(?:eum)?\b', 'Wien'), # SK Rapid Vienna + (r'\bMetalab\b', 'Wien'), # Metalab hackerspace Vienna + (r'\bSigmund\s+Freud\b', 'Wien'), # Sigmund Freud museum Vienna + (r'\bMax\s+Perutz\b', 'Wien'), # Max Perutz Library (Vienna Biocenter) + + # Additional specific institutions + (r'\bAnton\s+Bruckner\b', 'Linz'), # Anton Bruckner Private University + (r'\bbifeb\b', 'Strobl'), # Bundesinstitut für Erwachsenenbildung + (r'\bBundesinstitut\s+für\s+Erwachsenenbildung\b', 'Strobl'), + (r'\bZeitgenossen\b', 'Krems an der Donau'), # Archiv der Zeitgenossen + (r'\bCompass[-\s]Verlag\b', 'Wien'), # Compass-Verlag + (r'\bErnst\s+Krenek\b', 'Krems an der Donau'), # Ernst Krenek Institut + (r'\bFrauensolidarität\b', 'Wien'), # 
def load_source_data(source_file: str) -> dict:
    """Load Austrian source data with coordinates and ISIL codes.

    Args:
        source_file: Path of a YAML file with a top-level ``institutions``
            list.

    Returns:
        A dict keyed by ISIL code. Each value has ``name`` (the institution
        name, or '' if absent) and ``coords``: a ``(latitude, longitude)``
        tuple from the first location, or None when either value is missing.
        Institutions without an ISIL identifier are left out. An empty file
        gives an empty dict.
    """
    import yaml

    with open(source_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or {}

    lookup = {}
    # `or []` covers keys that are present in the YAML but null.
    for inst in data.get('institutions') or []:
        # Get ISIL code (value of the first ISIL identifier, if any)
        isil = next(
            (
                ident.get('identifier_value')
                for ident in inst.get('identifiers') or []
                if ident.get('identifier_scheme') == 'ISIL'
            ),
            None,
        )
        if not isil:
            continue

        coords = None
        locs = inst.get('locations') or []
        if locs:
            lat = locs[0].get('latitude')
            lon = locs[0].get('longitude')
            # Compare against None: 0.0 is a valid coordinate value.
            if lat is not None and lon is not None:
                coords = (lat, lon)

        lookup[isil] = {
            'name': inst.get('name', ''),
            'coords': coords,
        }

    return lookup
return None + + +def generate_city_code(city_name: str) -> str: + """Generate 3-letter city code from city name.""" + normalized = unicodedata.normalize('NFD', city_name) + ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn') + + clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name) + words = clean.split() + + if len(words) == 1: + return words[0][:3].upper() + else: + if len(words) == 2: + return (words[0][0] + words[1][:2]).upper() + else: + return ''.join(w[0] for w in words[:3]).upper() + + +def reverse_geocode(lat: float, lon: float, conn: sqlite3.Connection) -> dict | None: + """Reverse geocode coordinates to find nearest Austrian city.""" + cursor = conn.cursor() + + cursor.execute(''' + SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code, + ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq + FROM cities + WHERE country_code = 'AT' + AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') + ORDER BY distance_sq + LIMIT 1 + ''', (lat, lat, lon, lon)) + + row = cursor.fetchone() + if row: + return { + 'name': row[0], + 'ascii_name': row[1], + 'admin1_code': row[2], + 'admin1_name': row[3], + 'latitude': row[4], + 'longitude': row[5], + 'geonames_id': row[6], + 'population': row[7], + 'feature_code': row[8], + } + return None + + +def lookup_city_in_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None: + """Look up city in GeoNames database.""" + cursor = conn.cursor() + + # Try exact match + cursor.execute(''' + SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code + FROM cities + WHERE country_code = 'AT' + AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') + AND (LOWER(name) = LOWER(?) 
OR LOWER(ascii_name) = LOWER(?)) + ORDER BY population DESC + LIMIT 1 + ''', (city_name, city_name)) + + row = cursor.fetchone() + if row: + return { + 'name': row[0], + 'ascii_name': row[1], + 'admin1_code': row[2], + 'admin1_name': row[3], + 'latitude': row[4], + 'longitude': row[5], + 'geonames_id': row[6], + 'population': row[7], + 'feature_code': row[8], + } + + # Try fuzzy match + cursor.execute(''' + SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code + FROM cities + WHERE country_code = 'AT' + AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') + AND (LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?)) + ORDER BY population DESC + LIMIT 1 + ''', (f'{city_name}%', f'{city_name}%')) + + row = cursor.fetchone() + if row: + return { + 'name': row[0], + 'ascii_name': row[1], + 'admin1_code': row[2], + 'admin1_name': row[3], + 'latitude': row[4], + 'longitude': row[5], + 'geonames_id': row[6], + 'population': row[7], + 'feature_code': row[8], + } + + return None + + +def update_custodian_file(file_path: Path, city_name: str, geo_data: dict, method: str, dry_run: bool = False) -> bool: + """Update a custodian file with city data.""" + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + ghcid_match = re.search(r'ghcid_current:\s*(\S+)', content) + if not ghcid_match: + return False + + old_ghcid = ghcid_match.group(1) + + region_code = AUSTRIAN_ADMIN1_MAP.get(geo_data['admin1_code'], geo_data['admin1_code']) + city_code = generate_city_code(geo_data['ascii_name'] or geo_data['name']) + + parts = old_ghcid.split('-') + if len(parts) >= 5: + type_code = parts[3] + abbrev_and_suffix = '-'.join(parts[4:]) + new_ghcid = f"AT-{region_code}-{city_code}-{type_code}-{abbrev_and_suffix}" + else: + return False + + if old_ghcid == new_ghcid: + return False + + old_filename = file_path.name + new_filename = old_filename.replace(old_ghcid, new_ghcid) + 
new_file_path = file_path.parent / new_filename + + new_content = content.replace(old_ghcid, new_ghcid) + + old_resolution = re.search(r'location_resolution:\s*\n((?:\s+\S.*\n)*)', new_content) + + if old_resolution: + new_resolution = f"""location_resolution: + country_code: AT + region_code: {region_code} + region_name: {geo_data['admin1_name']} + city_code: {city_code} + city_name: {geo_data['name']} + geonames_id: {geo_data['geonames_id']} + feature_code: {geo_data['feature_code']} + latitude: {geo_data['latitude']} + longitude: {geo_data['longitude']} + method: {method} + resolution_date: '{datetime.now(timezone.utc).isoformat()}' +""" + new_content = new_content[:old_resolution.start()] + new_resolution + new_content[old_resolution.end():] + + timestamp = datetime.now(timezone.utc).isoformat() + history_entry = f""" - ghcid: {new_ghcid} + valid_from: '{timestamp}' + reason: City enrichment from {method} - {city_name} resolved to {geo_data['name']} ({region_code}) +""" + + history_match = re.search(r'ghcid_history:\s*\n', new_content) + if history_match: + insert_pos = history_match.end() + new_content = new_content[:insert_pos] + history_entry + new_content[insert_pos:] + + if dry_run: + print(f" DRY RUN: {old_filename} -> {new_filename}") + return True + + with open(file_path, 'w', encoding='utf-8') as f: + f.write(new_content) + + if new_file_path != file_path: + file_path.rename(new_file_path) + + return True + + +def main(): + dry_run = '--dry-run' in sys.argv + + base_dir = Path(__file__).parent.parent + custodian_dir = base_dir / 'data' / 'custodian' + source_file = base_dir / 'data' / 'instances' / 'austria_complete_ch_annotator.yaml' + geonames_db = base_dir / 'data' / 'reference' / 'geonames.db' + + print("Austrian City Enrichment Script") + print("=" * 50) + + if dry_run: + print("DRY RUN MODE") + + # Load source data + print(f"\nLoading source data from {source_file.name}...") + source_lookup = load_source_data(str(source_file)) + print(f" Found 
{len(source_lookup)} ISIL entries") + + coords_count = sum(1 for v in source_lookup.values() if v['coords']) + print(f" {coords_count} entries have coordinates") + + conn = sqlite3.connect(str(geonames_db)) + + print(f"\nFinding Austrian XXX files...") + xxx_files = list(custodian_dir.glob('AT-*-XXX-*.yaml')) + print(f" Found {len(xxx_files)} files") + + updated = 0 + by_coords = 0 + by_name = 0 + no_city = 0 + no_geonames = 0 + errors = 0 + + for file_path in xxx_files: + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Find ISIL code + isil_match = re.search(r'identifier_value:\s*(AT-\w+)', content) + isil_code = isil_match.group(1) if isil_match else None + + # Get institution name + name_match = re.search(r'claim_value:\s*(.+)', content) + inst_name = name_match.group(1).strip() if name_match else '' + + geo_data = None + method = None + city_name = None + + # Strategy 1: Use coordinates for reverse geocoding + if isil_code and isil_code in source_lookup: + source_data = source_lookup[isil_code] + if source_data['coords']: + lat, lon = source_data['coords'] + geo_data = reverse_geocode(lat, lon, conn) + if geo_data: + method = 'REVERSE_GEOCODE' + city_name = geo_data['name'] + by_coords += 1 + + # Strategy 2: Extract city from institution name + if not geo_data: + city_name = extract_city_from_name(inst_name) + if city_name: + geo_data = lookup_city_in_geonames(city_name, conn) + if geo_data: + method = 'NAME_EXTRACTION' + by_name += 1 + + if not geo_data: + no_city += 1 + continue + + if update_custodian_file(file_path, city_name, geo_data, method, dry_run): + updated += 1 + if not dry_run: + print(f" Updated: {file_path.name} -> {city_name} ({method})") + + except Exception as e: + errors += 1 + print(f" ERROR: {file_path.name}: {e}") + + conn.close() + + print("\n" + "=" * 50) + print("SUMMARY") + print("=" * 50) + print(f"Total XXX files: {len(xxx_files)}") + print(f"Updated: {updated}") + print(f" By coordinates: 
{by_coords}") + print(f" By name extraction: {by_name}") + print(f"No city found: {no_city}") + print(f"Errors: {errors}") + print(f"Remaining XXX: {len(xxx_files) - updated}") + + # Generate report + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + report_path = base_dir / 'reports' / f'AUSTRIAN_CITY_ENRICHMENT_{timestamp}.md' + + with open(report_path, 'w') as f: + f.write(f"# Austrian City Enrichment Report\n\n") + f.write(f"**Date**: {datetime.now().isoformat()}\n") + f.write(f"**Dry Run**: {dry_run}\n\n") + f.write(f"## Summary\n\n") + f.write(f"| Metric | Count |\n") + f.write(f"|--------|-------|\n") + f.write(f"| Total XXX files | {len(xxx_files)} |\n") + f.write(f"| Updated | {updated} |\n") + f.write(f"| By coordinates | {by_coords} |\n") + f.write(f"| By name extraction | {by_name} |\n") + f.write(f"| No city found | {no_city} |\n") + f.write(f"| Errors | {errors} |\n") + f.write(f"| Remaining XXX | {len(xxx_files) - updated} |\n") + + print(f"\nReport: {report_path}") + + +if __name__ == '__main__': + main() diff --git a/scripts/enrich_belgian_cities.py b/scripts/enrich_belgian_cities.py new file mode 100644 index 0000000000..df7a33b202 --- /dev/null +++ b/scripts/enrich_belgian_cities.py @@ -0,0 +1,465 @@ +#!/usr/bin/env python3 +""" +Enrich Belgian custodian files with city data from ISIL registry. + +Strategy: +1. First try to get city from enriched source file (fast) +2. 
If not found, scrape the Belgian ISIL website (slow, 1 req/sec) + +Usage: + python scripts/enrich_belgian_cities.py [--dry-run] +""" + +import os +import re +import sqlite3 +import sys +import time +import urllib.request +from datetime import datetime, timezone +from pathlib import Path + +# Belgian admin1 codes (GeoNames uses BRU, VLG, WAL) +BELGIAN_ADMIN1_MAP = { + 'BRU': 'BRU', # Brussels Capital Region + 'VLG': 'VLG', # Flanders (Vlaanderen) + 'WAL': 'WAL', # Wallonia (Wallonië) +} + +# Belgian city name aliases (Dutch/French variants) +BELGIAN_CITY_ALIASES = { + 'Brussel': 'Brussels', + 'Bruxelles': 'Brussels', + 'Antwerpen': 'Antwerpen', + 'Anvers': 'Antwerpen', + 'Gent': 'Gent', + 'Gand': 'Gent', + 'Luik': 'Liège', + 'Liege': 'Liège', + 'Bergen': 'Mons', + 'Namen': 'Namur', + 'Mechelen': 'Mechelen', + 'Malines': 'Mechelen', + 'Leuven': 'Leuven', + 'Louvain': 'Leuven', + 'Elsene': 'Ixelles', + 'Ukkel': 'Uccle', + 'Oudergem': 'Auderghem', + 'Watermaal-Bosvoorde': 'Watermael-Boitsfort', + 'Sint-Gillis': 'Saint-Gilles', + 'Sint-Jans-Molenbeek': 'Molenbeek-Saint-Jean', + 'Schaarbeek': 'Schaerbeek', + 'Etterbeek': 'Etterbeek', + 'Vorst': 'Forest', + 'Anderlecht': 'Anderlecht', + 'Jette': 'Jette', + 'Koekelberg': 'Koekelberg', + 'Evere': 'Evere', + 'Sint-Pieters-Woluwe': 'Woluwe-Saint-Pierre', + 'Sint-Lambrechts-Woluwe': 'Woluwe-Saint-Lambert', + 'Ganshoren': 'Ganshoren', +} + + +def load_isil_city_lookup(enriched_file: str) -> dict: + """Load ISIL -> city mapping from enriched Belgian ISIL file.""" + with open(enriched_file, 'r', encoding='utf-8') as f: + content = f.read() + + # Split by 'id:' at start of line + entries = re.split(r'\n(?=id: BE-)', content) + + lookup = {} + for entry in entries[1:]: # Skip header + # Extract ISIL + isil_match = re.search(r'^id: (BE-\w+)', entry) + if not isil_match: + continue + isil = isil_match.group(1) + + # Extract city from locations section + city_match = re.search(r'locations:\s*\n-\s*city:\s*(\S.*)', entry) + if 
def load_isil_source_urls(enriched_file: str) -> dict:
    """Load ISIL -> source_url mapping for web scraping fallback.

    The file is read as plain text and cut into records at every line that
    begins with ``id: BE-``. Whatever precedes the first such cut is treated
    as a header and ignored. A record contributes to the result only when it
    has both an ISIL id and an ``https://isil.kbr.be/`` source URL.

    Args:
        enriched_file: Path to the enriched Belgian ISIL file (UTF-8).

    Returns:
        Dict mapping ISIL code to its source URL.
    """
    with open(enriched_file, 'r', encoding='utf-8') as handle:
        text = handle.read()

    # re.split always yields at least one piece; the first one is the header.
    _header, *records = re.split(r'\n(?=id: BE-)', text)

    urls = {}
    for record in records:
        code = re.search(r'^id: (BE-\w+)', record)
        if code is None:
            continue
        link = re.search(r'source_url:\s*(https://isil\.kbr\.be/\S+)', record)
        if link is None:
            continue
        urls[code.group(1)] = link.group(1)

    return urls
def lookup_city_in_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None:
    """Look up city in GeoNames database.

    Queries are tried in order and the first one that yields a row wins:

    1. exact, case-insensitive match on the alias-resolved name;
    2. exact match on the name as given (only when an alias changed it);
    3. prefix ``LIKE`` match on the name as given.

    Every query is limited to Belgian populated places and returns the most
    populous match.

    Args:
        city_name: City name as found in the source data.
        conn: Open SQLite connection exposing the ``cities`` table.

    Returns:
        Dict with keys ``name``, ``ascii_name``, ``admin1_code``,
        ``admin1_name``, ``latitude``, ``longitude``, ``geonames_id``,
        ``population`` and ``feature_code``, or ``None`` if nothing matched.
    """
    columns = (
        'name', 'ascii_name', 'admin1_code', 'admin1_name', 'latitude',
        'longitude', 'geonames_id', 'population', 'feature_code',
    )

    exact_sql = '''
        SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
        FROM cities
        WHERE country_code = 'BE'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
          AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
        ORDER BY population DESC
        LIMIT 1
    '''
    prefix_sql = '''
        SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
        FROM cities
        WHERE country_code = 'BE'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
          AND (LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?))
        ORDER BY population DESC
        LIMIT 1
    '''

    # Dutch/French variants are mapped to the spelling expected in GeoNames.
    canonical = BELGIAN_CITY_ALIASES.get(city_name, city_name)

    attempts = [(exact_sql, canonical)]
    if canonical != city_name:
        # The alias found nothing useful on its own; retry with the raw name.
        attempts.append((exact_sql, city_name))
    # The prefix fallback deliberately uses the raw name, not the alias.
    attempts.append((prefix_sql, f'{city_name}%'))

    cursor = conn.cursor()
    for sql, term in attempts:
        cursor.execute(sql, (term, term))
        row = cursor.fetchone()
        if row:
            return dict(zip(columns, row))

    return None
f"BE-{region_code}-{city_code}-{type_code}-{abbrev_and_suffix}" + else: + print(f" WARNING: Unexpected GHCID format: {old_ghcid}") + return False + + if old_ghcid == new_ghcid: + return False + + # Calculate new filename + old_filename = file_path.name + new_filename = old_filename.replace(old_ghcid, new_ghcid) + new_file_path = file_path.parent / new_filename + + # Update content + new_content = content.replace(old_ghcid, new_ghcid) + + # Update location_resolution section + old_resolution = re.search( + r'location_resolution:\s*\n((?:\s+\S.*\n)*)', + new_content + ) + + if old_resolution: + new_resolution = f"""location_resolution: + country_code: BE + region_code: {region_code} + region_name: {geo_data['admin1_name']} + city_code: {city_code} + city_name: {geo_data['name']} + geonames_id: {geo_data['geonames_id']} + feature_code: {geo_data['feature_code']} + latitude: {geo_data['latitude']} + longitude: {geo_data['longitude']} + method: BELGIAN_ISIL_REGISTRY + resolution_date: '{datetime.now(timezone.utc).isoformat()}' +""" + new_content = new_content[:old_resolution.start()] + new_resolution + new_content[old_resolution.end():] + + # Add GHCID history entry + timestamp = datetime.now(timezone.utc).isoformat() + history_entry = f""" - ghcid: {new_ghcid} + valid_from: '{timestamp}' + reason: City enrichment from Belgian ISIL registry - {city_name} resolved to {geo_data['name']} ({region_code}) +""" + + history_match = re.search(r'ghcid_history:\s*\n', new_content) + if history_match: + insert_pos = history_match.end() + new_content = new_content[:insert_pos] + history_entry + new_content[insert_pos:] + + if dry_run: + print(f" DRY RUN: Would rename {old_filename} -> {new_filename}") + print(f" GHCID: {old_ghcid} -> {new_ghcid}") + return True + + # Write updated content + with open(file_path, 'w', encoding='utf-8') as f: + f.write(new_content) + + # Rename file + if new_file_path != file_path: + file_path.rename(new_file_path) + + return True + + +def main(): + 
dry_run = '--dry-run' in sys.argv + + # Paths + base_dir = Path(__file__).parent.parent + custodian_dir = base_dir / 'data' / 'custodian' + enriched_file = base_dir / 'data' / 'instances' / 'belgium_isil_institutions_enriched.yaml' + geonames_db = base_dir / 'data' / 'reference' / 'geonames.db' + + print("Belgian City Enrichment Script") + print("=" * 50) + + if dry_run: + print("DRY RUN MODE - No changes will be made") + + # Load lookups + print(f"\nLoading ISIL city lookup from {enriched_file.name}...") + isil_city_lookup = load_isil_city_lookup(str(enriched_file)) + isil_url_lookup = load_isil_source_urls(str(enriched_file)) + print(f" Found {len(isil_city_lookup)} ISIL codes with city data") + print(f" Found {len(isil_url_lookup)} ISIL codes with source URLs") + + # Connect to GeoNames + print(f"\nConnecting to GeoNames database...") + conn = sqlite3.connect(str(geonames_db)) + + # Find Belgian XXX files + print(f"\nFinding Belgian custodian files with XXX placeholder...") + xxx_files = list(custodian_dir.glob('BE-*-XXX-*.yaml')) + print(f" Found {len(xxx_files)} files to process") + + # Process files + updated = 0 + no_isil = 0 + no_city = 0 + no_geonames = 0 + scraped = 0 + errors = 0 + not_found_cities = [] + + for file_path in xxx_files: + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Find ISIL code + isil_match = re.search(r'identifier_value:\s*(BE-\w+)', content) + if not isil_match: + no_isil += 1 + continue + + isil_code = isil_match.group(1) + + # Strategy 1: Look up city from enriched file + city_name = isil_city_lookup.get(isil_code) + + # Strategy 2: Scrape from website if not in lookup + if not city_name and isil_code in isil_url_lookup: + url = isil_url_lookup[isil_code] + print(f" Scraping {isil_code} from {url}...") + city_name = scrape_city_from_isil_website(url) + if city_name: + scraped += 1 + print(f" Found: {city_name}") + time.sleep(1) # Rate limit + + if not city_name: + no_city += 1 + continue + + # 
Look up in GeoNames + geo_data = lookup_city_in_geonames(city_name, conn) + if not geo_data: + no_geonames += 1 + not_found_cities.append((file_path.name, isil_code, city_name)) + continue + + # Update file + if update_custodian_file(file_path, city_name, geo_data, dry_run): + updated += 1 + if not dry_run: + print(f" Updated: {file_path.name} -> {city_name} ({geo_data['admin1_code']})") + + except Exception as e: + errors += 1 + print(f" ERROR processing {file_path.name}: {e}") + + conn.close() + + # Summary + print("\n" + "=" * 50) + print("SUMMARY") + print("=" * 50) + print(f"Total XXX files: {len(xxx_files)}") + print(f"Updated: {updated}") + print(f"Scraped from website: {scraped}") + print(f"No ISIL in file: {no_isil}") + print(f"No city found: {no_city}") + print(f"City not in GeoNames: {no_geonames}") + print(f"Errors: {errors}") + print(f"Remaining XXX: {len(xxx_files) - updated}") + + if not_found_cities: + print(f"\nCities not found in GeoNames:") + for fname, isil, city in not_found_cities[:20]: + print(f" {isil}: {city}") + if len(not_found_cities) > 20: + print(f" ... 
and {len(not_found_cities) - 20} more") + + # Generate report + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + report_path = base_dir / 'reports' / f'BELGIAN_CITY_ENRICHMENT_{timestamp}.md' + + with open(report_path, 'w') as f: + f.write(f"# Belgian City Enrichment Report\n\n") + f.write(f"**Date**: {datetime.now().isoformat()}\n") + f.write(f"**Dry Run**: {dry_run}\n\n") + f.write(f"## Summary\n\n") + f.write(f"| Metric | Count |\n") + f.write(f"|--------|-------|\n") + f.write(f"| Total XXX files | {len(xxx_files)} |\n") + f.write(f"| Updated | {updated} |\n") + f.write(f"| Scraped from website | {scraped} |\n") + f.write(f"| No ISIL in file | {no_isil} |\n") + f.write(f"| No city found | {no_city} |\n") + f.write(f"| City not in GeoNames | {no_geonames} |\n") + f.write(f"| Errors | {errors} |\n") + f.write(f"| Remaining XXX | {len(xxx_files) - updated} |\n") + + if not_found_cities: + f.write(f"\n## Cities Not Found in GeoNames\n\n") + f.write(f"| File | ISIL | City |\n") + f.write(f"|------|------|------|\n") + for fname, isil, city in not_found_cities: + f.write(f"| {fname} | {isil} | {city} |\n") + + print(f"\nReport written to: {report_path}") + + +if __name__ == '__main__': + main() diff --git a/scripts/enrich_belgian_v2.py b/scripts/enrich_belgian_v2.py new file mode 100644 index 0000000000..6f111a6c06 --- /dev/null +++ b/scripts/enrich_belgian_v2.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 +""" +Belgian city enrichment v2 - with city name aliases. 
def normalize_city_name(name):
    """Normalize city name for lookup.

    The name is lower-cased, decomposed (NFD), stripped of combining marks
    (Unicode category ``Mn``) and trimmed of surrounding whitespace.

    Returns ``None`` for a missing or empty name.
    """
    if not name:
        return None

    decomposed = unicodedata.normalize('NFD', name.lower())
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars).strip()
def generate_city_code(city_name):
    """Generate 3-letter city code.

    Diacritics are removed and every character other than ASCII letters,
    whitespace and hyphens is dropped. The remaining text is split on
    whitespace (a hyphenated name therefore counts as one word) and the code
    is built as follows:

    * one word: its first three letters;
    * leading article (de, het, le, la, les, den, der, des): the article's
      first letter plus the first two letters of the next word;
    * otherwise: the initials of the first three words (two-word names
      therefore produce a two-letter code).

    Args:
        city_name: City name, possibly with diacritics or stray whitespace.

    Returns:
        The upper-cased code, or ``''`` when the name contains no usable
        letters.
    """
    normalized = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name)
    words = clean.split()

    # Nothing usable (empty, digits only, ...): return an empty code rather
    # than failing on words[0] below.
    if not words:
        return ''

    articles = {'de', 'het', 'le', 'la', 'les', 'den', 'der', 'des'}

    if len(words) == 1:
        # Slice the word itself, not the cleaned string, so leading
        # whitespace in the input cannot end up in the code.
        return words[0][:3].upper()
    if words[0].lower() in articles:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()
file_path in xxx_files: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Get institution name + name_match = re.search(r'claim_value:\s*(.+)', content) + inst_name = name_match.group(1).strip() if name_match else '' + + # Try to extract city from filename or name + # Belgian cities often in the file details - let's look at the log + # The scraper was finding cities from ISIL website + + # Check if there's city info in the file already + city_match = re.search(r'city(?:_name)?:\s*([^\n]+)', content) + if city_match: + city_name = city_match.group(1).strip().strip('"\'') + if city_name and city_name != 'XXX': + geo_data = lookup_city(city_name, conn) + if geo_data: + print(f"✓ {file_path.name}: {city_name} → {geo_data['name']}") + updated += 1 + # Would update file here + else: + not_found.append((file_path.name, city_name)) + + print(f"\nUpdated: {updated}") + print(f"Not found: {len(not_found)}") + if not_found: + print("\nCities not found:") + for fname, city in not_found[:20]: + print(f" {fname}: {city}") + + conn.close() + +if __name__ == '__main__': + main() diff --git a/scripts/enrich_bulgarian_cities.py b/scripts/enrich_bulgarian_cities.py new file mode 100755 index 0000000000..f8a936d6ca --- /dev/null +++ b/scripts/enrich_bulgarian_cities.py @@ -0,0 +1,424 @@ +#!/usr/bin/env python3 +""" +Enrich Bulgarian custodian files with proper city codes from GeoNames. +Maps Cyrillic city names to ASCII equivalents and resolves admin1 regions. 
+""" + +import os +import re +import sqlite3 +from pathlib import Path +from datetime import datetime, timezone + +import yaml + +# Bulgarian Cyrillic to ASCII city name mapping +# Based on standard transliteration +CYRILLIC_TO_ASCII = { + # Major cities found in XXX files + 'Самоков': 'Samokov', + 'Асеновград': 'Asenovgrad', + 'Казанлък': 'Kazanlak', + 'Карлово': 'Karlovo', + 'Котел': 'Kotel', + 'Димитровград': 'Dimitrovgrad', + 'Исперих': 'Isperih', + 'Панагюрище': 'Panagyurishte', + 'Раднево': 'Radnevo', + 'Белица': 'Belitsa', + 'Гоце Делчев': 'Gotse Delchev', + 'Горна Оряховица': 'Gorna Oryahovitsa', + 'Якоруда': 'Yakoruda', + 'Хаджидимово': 'Hadzhidimovo', + 'Генерал Тодоров': 'General Todorov', + 'Черноморец': 'Chernomorets', + 'Плоски': 'Ploski', + 'Плетена': 'Pletena', + 'Дюлево': 'Dyulevo', + 'Левуново': 'Levunovo', + 'Гълъбово': 'Galabovo', + 'Абланица': 'Ablanitsa', + # Additional common cities + 'София': 'Sofia', + 'Пловдив': 'Plovdiv', + 'Варна': 'Varna', + 'Бургас': 'Burgas', + 'Русе': 'Ruse', + 'Стара Загора': 'Stara Zagora', + 'Плевен': 'Pleven', + 'Сливен': 'Sliven', + 'Добрич': 'Dobrich', + 'Шумен': 'Shumen', + 'Перник': 'Pernik', + 'Хасково': 'Haskovo', + 'Благоевград': 'Blagoevgrad', + 'Велико Търново': 'Veliko Tarnovo', + 'Враца': 'Vratsa', + 'Габрово': 'Gabrovo', + 'Пазарджик': 'Pazardzhik', + 'Ямбол': 'Yambol', + 'Кърджали': 'Kardzhali', + 'Монтана': 'Montana', + 'Разград': 'Razgrad', + 'Силистра': 'Silistra', + 'Смолян': 'Smolyan', + 'Търговище': 'Targovishte', + 'Кюстендил': 'Kyustendil', + 'Ловеч': 'Lovech', + 'Видин': 'Vidin', +} + +# Bulgarian admin1 GeoNames code to ISO 3166-2:BG mapping +ADMIN1_TO_ISO = { + '38': 'BLG', # Blagoevgrad + '39': 'BGS', # Burgas + '40': 'DOB', # Dobrich + '41': 'GAB', # Gabrovo + '42': 'SOF', # Sofia-Capital (also SFO for city) + '43': 'KHO', # Haskovo (officially HKV but using KHO) + '44': 'KRZ', # Kardzhali + '45': 'KNL', # Kyustendil + '46': 'LOV', # Lovech + '47': 'MON', # Montana + '48': 'PAZ', # 
def get_city_code(city_name: str) -> str:
    """Generate 3-letter city code from city name.

    The name is trimmed and split on whitespace, then:

    * one word: its first three characters;
    * two words: first character of the first word plus the first two
      characters of the second;
    * any other count: the initials of up to the first three words
      (an empty name gives an empty code).

    The result is upper-cased.
    """
    tokens = city_name.strip().split()
    count = len(tokens)

    if count == 1:
        code = tokens[0][:3]
    elif count == 2:
        code = tokens[0][0] + tokens[1][:2]
    else:
        code = ''.join(token[0] for token in tokens[:3])

    return code.upper()
''.join(result) + + +def lookup_city_in_geonames(conn: sqlite3.Connection, city_name: str) -> dict | None: + """Look up city in GeoNames database.""" + cursor = conn.cursor() + + # First try direct ASCII lookup + ascii_name = CYRILLIC_TO_ASCII.get(city_name) or transliterate_cyrillic(city_name) + + # Try exact match first + cursor.execute(""" + SELECT name, ascii_name, admin1_code, admin1_name, geonames_id, + latitude, longitude, population, feature_code + FROM cities + WHERE country_code='BG' + AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') + AND (ascii_name = ? OR name = ?) + ORDER BY population DESC + LIMIT 1 + """, (ascii_name, ascii_name)) + + row = cursor.fetchone() + if row: + return { + 'name': row[0], + 'ascii_name': row[1], + 'admin1_code': row[2], + 'admin1_name': row[3], + 'geonames_id': row[4], + 'latitude': row[5], + 'longitude': row[6], + 'population': row[7], + 'feature_code': row[8], + } + + # Try fuzzy match with LIKE + cursor.execute(""" + SELECT name, ascii_name, admin1_code, admin1_name, geonames_id, + latitude, longitude, population, feature_code + FROM cities + WHERE country_code='BG' + AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') + AND (ascii_name LIKE ? OR name LIKE ?) 
def _mapping(value) -> dict:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def process_file(filepath: Path, conn: sqlite3.Connection, dry_run: bool = False) -> dict:
    """Resolve the city of one Bulgarian ``BG-XX-XXX-*`` custodian file.

    The city is read from ``original_entry.locations[0].city``, looked up with
    ``lookup_city_in_geonames`` and converted into a region code (through
    ``ADMIN1_TO_ISO``, falling back to ``'XX'``) and a city code (through
    ``get_city_code``).  The GHCID, its history, the ``GHCID`` identifier and
    the file name are then updated.

    The rename-collision check runs *before* anything is modified, so a
    collision leaves the file on disk untouched (content and file name keep
    agreeing with each other).

    Args:
        filepath: YAML custodian file to process.
        conn: Open connection to the GeoNames SQLite database.
        dry_run: When true, compute the new GHCID but write nothing.

    Returns:
        A report dict with the keys ``file``, ``status``, ``old_ghcid``,
        ``new_ghcid``, ``city_cyrillic``, ``city_ascii`` and ``error``, plus
        ``new_file`` when the file was renamed.  ``status`` is one of
        ``'skipped'``, ``'error'``, ``'would_update'``, ``'collision'`` or
        ``'updated'``.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city_cyrillic': None,
        'city_ascii': None,
        'error': None,
    }

    def fail(message: str, status: str = 'error') -> dict:
        # Record the outcome on the shared report and hand it back.
        result['status'] = status
        result['error'] = message
        return result

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        return fail(f'Failed to load YAML: {e}')

    if not data:
        return fail('Empty YAML file')
    if not isinstance(data, dict):
        return fail('Unexpected YAML structure: top level is not a mapping')

    # Get current GHCID (tolerates a missing or null ``ghcid`` section).
    ghcid_data = _mapping(data.get('ghcid'))
    old_ghcid = ghcid_data.get('ghcid_current') or ''
    result['old_ghcid'] = old_ghcid

    if not str(old_ghcid).startswith('BG-XX-XXX-'):
        return fail('Not a BG-XX-XXX file', status='skipped')

    # Extract the city from the first original location, if there is one.
    locations = _mapping(data.get('original_entry')).get('locations')
    city_cyrillic = None
    if isinstance(locations, list) and locations:
        city_cyrillic = _mapping(locations[0]).get('city')
    if not city_cyrillic:
        return fail('No city found in original_entry')
    result['city_cyrillic'] = city_cyrillic

    city_info = lookup_city_in_geonames(conn, city_cyrillic)
    if not city_info:
        return fail(f'City not found in GeoNames: {city_cyrillic}')
    result['city_ascii'] = city_info['ascii_name']

    region_code = ADMIN1_TO_ISO.get(city_info['admin1_code'], 'XX')
    city_code = get_city_code(city_info['ascii_name'])

    # Old format: BG-XX-XXX-{type}-{abbrev}; the abbreviation may itself
    # contain hyphens, so everything after the type is re-joined.
    parts = old_ghcid.split('-')
    if len(parts) < 5:
        return fail(f'Invalid GHCID format: {old_ghcid}')
    inst_type = parts[3]
    abbreviation = '-'.join(parts[4:])

    new_ghcid = f'BG-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    if dry_run:
        result['status'] = 'would_update'
        return result

    # Detect a name collision before touching the file.
    new_filepath = filepath.parent / f'{new_ghcid}.yaml'
    needs_rename = new_filepath != filepath
    if needs_rename and new_filepath.exists():
        return fail(f'Target file already exists: {new_filepath}', status='collision')

    timestamp = datetime.now(timezone.utc).isoformat()

    ghcid_data['ghcid_current'] = new_ghcid
    ghcid_data['location_resolution'] = {
        'method': 'GEONAMES_LOOKUP',
        'country_code': 'BG',
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'city_name_cyrillic': city_cyrillic,
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'resolution_date': timestamp,
    }

    history = ghcid_data.get('ghcid_history')
    if not isinstance(history, list):
        history = []
        ghcid_data['ghcid_history'] = history

    # Close the still-open history entry of the old GHCID.
    for entry in history:
        if (isinstance(entry, dict)
                and entry.get('ghcid') == old_ghcid
                and not entry.get('valid_to')):
            entry['valid_to'] = timestamp

    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': ghcid_data.get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'City resolved via GeoNames: {city_cyrillic} → {city_info["ascii_name"]} ({region_code})',
    })

    for identifier in data.get('identifiers') or []:
        if isinstance(identifier, dict) and identifier.get('identifier_scheme') == 'GHCID':
            identifier['identifier_value'] = new_ghcid

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if needs_rename:
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)

    result['status'] = 'updated'
    return result
result['status'] == 'would_update': + print(f"✓ {result['city_cyrillic']} → {result['city_ascii']}: {result['old_ghcid']} → {result['new_ghcid']}") + elif result['status'] == 'error': + print(f"✗ {filepath.name}: {result['error']}") + errors.append(result) + elif result['status'] == 'collision': + print(f"⚠ {filepath.name}: {result['error']}") + + conn.close() + + print() + print('=' * 60) + print('Summary:') + print(f" Updated: {stats.get('updated', 0)}") + print(f" Would update: {stats.get('would_update', 0)}") + print(f" Errors: {stats.get('error', 0)}") + print(f" Collisions: {stats.get('collision', 0)}") + print(f" Skipped: {stats.get('skipped', 0)}") + + if errors: + print() + print('Errors:') + for err in errors: + print(f" - {err['file']}: {err['error']}") + + +if __name__ == '__main__': + main() diff --git a/scripts/enrich_cities_google.py b/scripts/enrich_cities_google.py new file mode 100755 index 0000000000..a62c81fd4d --- /dev/null +++ b/scripts/enrich_cities_google.py @@ -0,0 +1,459 @@ +#!/usr/bin/env python3 +""" +Enrich custodian files with city/region data using Google Places API. + +This is a generic script that works for any country's XXX files. 
def get_city_code(city_name: str) -> str:
    """Generate an upper-case city code (normally 3 letters) from a city name.

    Administrative suffixes (" City", " Town", "-shi", "-ku", "-gun", "-cho",
    " District") are stripped first; they are tried once each, in that order.
    The remaining words are then encoded:

    * one word            -> its first three letters ("Mexico City" -> "MEX");
    * two words           -> first letter of the first word plus the first
      two letters of the second ("New York" -> "NYO");
    * three or more words -> initials of the first three words.

    If nothing is left after stripping, the ``'XXX'`` placeholder used for an
    unresolved city is returned instead of an empty string.
    """
    name = city_name.strip()
    for suffix in [' City', ' Town', '-shi', '-ku', '-gun', '-cho', ' District']:
        if name.endswith(suffix):
            name = name[:-len(suffix)]

    words = name.split()
    if not words:
        return 'XXX'

    if len(words) == 1:
        # Slice the word itself, not ``name``: after suffix removal ``name``
        # may still carry trailing whitespace that would leak into the code.
        code = words[0][:3]
    elif len(words) == 2:
        code = words[0][0] + words[1][:2]
    else:
        code = ''.join(word[0] for word in words[:3])
    return code.upper()
def _mapping(value) -> dict:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str,
                 country_code: str, country_name: str, dry_run: bool = False) -> dict:
    """Resolve the city of one ``{country}-{region}-XXX-*`` custodian file.

    The institution name is searched in Google Places, the returned
    coordinates are matched to the nearest GeoNames city with
    ``lookup_city_geonames``, and the GHCID, its history, the ``GHCID``
    identifier and the file name are updated accordingly.

    Notes on behavior:

    * The region segment of the old GHCID may be any 1-3 character
      upper-case alphanumeric code, matching what ``get_region_code`` can
      emit (short GeoNames admin1 codes are passed through and may be
      numeric).
    * Coordinates are tested against ``None`` so that a latitude or
      longitude of exactly ``0.0`` is not mistaken for "missing".
    * The rename-collision check runs before anything is modified, so a
      collision leaves the file on disk untouched.
    * A Google Places request is made even in dry-run mode.

    Args:
        filepath: YAML custodian file to process.
        conn: Open connection to the GeoNames SQLite database.
        api_key: Google Places API key.
        country_code: Two-letter country code the file must belong to.
        country_name: Country name appended to the search query.
        dry_run: When true, compute the new GHCID but write nothing.

    Returns:
        A report dict with the keys ``file``, ``status``, ``old_ghcid``,
        ``new_ghcid``, ``city``, ``region`` and ``error``, plus ``new_file``
        when the file was renamed.  ``status`` is one of ``'skipped'``,
        ``'error'``, ``'would_update'``, ``'collision'`` or ``'updated'``.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'region': None,
        'error': None,
    }

    def fail(message: str, status: str = 'error') -> dict:
        # Record the outcome on the shared report and hand it back.
        result['status'] = status
        result['error'] = message
        return result

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        return fail(f'Failed to load YAML: {e}')

    if not data:
        return fail('Empty YAML file')
    if not isinstance(data, dict):
        return fail('Unexpected YAML structure: top level is not a mapping')

    ghcid_data = _mapping(data.get('ghcid'))
    old_ghcid = ghcid_data.get('ghcid_current') or ''
    result['old_ghcid'] = old_ghcid

    # Accept both {country}-XX-XXX-... and {country}-{region}-XXX-...
    xxx_pattern = re.compile(rf'^{re.escape(country_code)}-[A-Z0-9]{{1,3}}-XXX-')
    if not xxx_pattern.match(str(old_ghcid)):
        return fail(f'Not a {country_code}-*-XXX file', status='skipped')

    # Institution name: prefer the curated claim, fall back to the source entry.
    name = (_mapping(data.get('custodian_name')).get('claim_value')
            or _mapping(data.get('original_entry')).get('name')
            or '')
    if not name:
        return fail('No institution name found')

    search_query = f"{name} {country_name}"
    print(f" Searching: {name[:50]}...")
    place = search_google_places(search_query, api_key)
    time.sleep(REQUEST_DELAY)  # stay under the API rate limit

    if not place:
        return fail('Not found in Google Places')

    location_info = extract_location_from_google(place)
    latitude = location_info['latitude']
    longitude = location_info['longitude']
    if latitude is None or longitude is None:
        return fail('No coordinates from Google')

    city_info = lookup_city_geonames(conn, latitude, longitude, country_code)
    if not city_info:
        return fail('City not found in GeoNames')

    region_code = get_region_code(city_info['admin1_code'], country_code, city_info['admin1_name'])
    city_code = get_city_code(city_info['ascii_name'])

    result['city'] = city_info['ascii_name']
    result['region'] = city_info['admin1_name']

    # Old format: {country}-{region}-XXX-{type}-{abbrev}; the abbreviation
    # may itself contain hyphens, so everything after the type is re-joined.
    parts = old_ghcid.split('-')
    if len(parts) < 5:
        return fail(f'Invalid GHCID format: {old_ghcid}')
    inst_type = parts[3]
    abbreviation = '-'.join(parts[4:])

    new_ghcid = f'{country_code}-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    if dry_run:
        result['status'] = 'would_update'
        return result

    # Detect a name collision before touching the file.
    new_filepath = filepath.parent / f'{new_ghcid}.yaml'
    needs_rename = new_filepath != filepath
    if needs_rename and new_filepath.exists():
        return fail(f'Target file exists: {new_filepath.name}', status='collision')

    timestamp = datetime.now(timezone.utc).isoformat()

    ghcid_data['ghcid_current'] = new_ghcid
    ghcid_data['location_resolution'] = {
        'method': 'GOOGLE_PLACES_GEONAMES',
        'country_code': country_code,
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'google_place_id': location_info.get('place_id'),
        'latitude': latitude,
        'longitude': longitude,
        'resolution_date': timestamp,
    }

    data['google_maps_enrichment'] = {
        'place_id': location_info.get('place_id'),
        'formatted_address': location_info.get('formatted_address'),
        'website': location_info.get('website'),
        'latitude': latitude,
        'longitude': longitude,
        'enriched_at': timestamp,
        'source': 'Google Places API (New)',
    }

    history = ghcid_data.get('ghcid_history')
    if not isinstance(history, list):
        history = []
        ghcid_data['ghcid_history'] = history

    # Close the still-open history entry of the old GHCID.
    for entry in history:
        if (isinstance(entry, dict)
                and entry.get('ghcid') == old_ghcid
                and not entry.get('valid_to')):
            entry['valid_to'] = timestamp

    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': ghcid_data.get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'Location resolved via Google Places + GeoNames: {city_info["ascii_name"]} ({region_code})',
    })

    for identifier in data.get('identifiers') or []:
        if isinstance(identifier, dict) and identifier.get('identifier_scheme') == 'GHCID':
            identifier['identifier_value'] = new_ghcid

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if needs_rename:
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)

    result['status'] = 'updated'
    return result
print(f"Processing {country_code} ({country_name}): {len(files)} files") + print('='*60) + + stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0} + + for filepath in files: + print(f"Processing: {filepath.name}") + result = process_file(filepath, conn, GOOGLE_PLACES_TOKEN, + country_code, country_name, dry_run=args.dry_run) + stats[result['status']] = stats.get(result['status'], 0) + 1 + + if result['status'] in ('updated', 'would_update'): + print(f" ✓ {result['city']} ({result['region']}): {result['old_ghcid']} → {result['new_ghcid']}") + elif result['status'] == 'error': + print(f" ✗ {result['error']}") + elif result['status'] == 'collision': + print(f" ⚠ {result['error']}") + + print(f"\n{country_code} Summary: Updated={stats.get('updated', 0)}, " + f"Would update={stats.get('would_update', 0)}, " + f"Errors={stats.get('error', 0)}") + + for k, v in stats.items(): + total_stats[k] = total_stats.get(k, 0) + v + + conn.close() + + print() + print('='*60) + print('TOTAL Summary:') + print(f" Updated: {total_stats.get('updated', 0)}") + print(f" Would update: {total_stats.get('would_update', 0)}") + print(f" Errors: {total_stats.get('error', 0)}") + print(f" Collisions: {total_stats.get('collision', 0)}") + print(f" Skipped: {total_stats.get('skipped', 0)}") + + +if __name__ == '__main__': + main() diff --git a/scripts/enrich_czech_cities.py b/scripts/enrich_czech_cities.py new file mode 100644 index 0000000000..70b6648b4a --- /dev/null +++ b/scripts/enrich_czech_cities.py @@ -0,0 +1,791 @@ +#!/usr/bin/env python3 +""" +Enrich Czech custodian files with city data from the CH-Annotator source file. + +For Czech custodian files with XXX city placeholder, this script: +1. Loads the source CH-Annotator file (czech_unified_ch_annotator.yaml) +2. Matches by name, ARON UUID, or Wikidata ID to get city/coordinates +3. Falls back to Wikidata P131 lookup via SPARQL for missing data +4. Updates the GHCID with correct city code +5. 
Renames the file if GHCID changes + +Usage: + python scripts/enrich_czech_cities.py [--dry-run] [--limit N] +""" + +import argparse +import hashlib +import os +import re +import shutil +import sqlite3 +import time +import uuid +import yaml +import requests +from datetime import datetime, timezone +from pathlib import Path +from typing import Dict, List, Optional, Tuple + + +# Paths +PROJECT_ROOT = Path(__file__).parent.parent +CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian" +GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db" +REPORTS_DIR = PROJECT_ROOT / "reports" +CZECH_CH_ANNOTATOR_FILE = PROJECT_ROOT / "data" / "instances" / "czech_unified_ch_annotator.yaml" + +# GHCID namespace for UUID generation +GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8') + +# Rate limiting for Wikidata +REQUEST_DELAY = 1.0 + +# Czech region mapping (GeoNames admin1 to ISO 3166-2:CZ) +CZECH_ADMIN1_MAP = { + '52': 'JC', # Jihočeský (South Bohemian) + '78': 'JM', # Jihomoravský (South Moravian) + '81': 'KA', # Karlovarský (Karlovy Vary) + '82': 'VY', # Vysočina (Vysočina) + '51': 'KR', # Královéhradecký (Hradec Králové) + '53': 'LI', # Liberecký (Liberec) + '84': 'MO', # Moravskoslezský (Moravian-Silesian) + '85': 'OL', # Olomoucký (Olomouc) + '86': 'PA', # Pardubický (Pardubice) + '54': 'PL', # Plzeňský (Plzeň) + '10': 'PR', # Praha (Prague) + '55': 'ST', # Středočeský (Central Bohemian) + '56': 'US', # Ústecký (Ústí nad Labem) + '87': 'ZL', # Zlínský (Zlín) +} + +# Region name to code mapping (from source data) +CZECH_REGION_NAMES = { + 'Jihočeský': 'JC', + 'Jihomoravský': 'JM', + 'Karlovarský': 'KA', + 'Vysočina': 'VY', + 'Královéhradecký': 'KR', + 'Liberecký': 'LI', + 'Moravskoslezský': 'MO', + 'Olomoucký': 'OL', + 'Pardubický': 'PA', + 'Plzeňský': 'PL', + 'Hlavní město Praha': 'PR', + 'Praha': 'PR', + 'Středočeský': 'ST', + 'Ústecký': 'US', + 'Zlínský': 'ZL', +} + + +def extract_city_from_name(name: str) -> Optional[str]: + """Try to extract city 
def convert_locative_to_nominative(city: str) -> str:
    """Convert a Czech city name from locative to nominative case (best effort).

    Czech declension is too irregular for a generic rule, so only a fixed
    table of known locative forms is translated.  Any other input, including
    names that are already nominative, is returned unchanged.
    """
    known_locatives = {
        'Praze': 'Praha',
        'Brně': 'Brno',
        'Hradci Králové': 'Hradec Králové',
        'Havlíčkově Brodě': 'Havlíčkův Brod',
        'Liberci': 'Liberec',
        'Olomouci': 'Olomouc',
        'Plzni': 'Plzeň',
        'Ostravě': 'Ostrava',
        'Opavě': 'Opava',
    }
    return known_locatives.get(city, city)
matching.""" + if not name: + return '' + + # Remove common suffixes and legal forms + suffixes = [ + 'o. p. s.', + 'o.p.s.', + 'p. o.', + 'p.o.', + 's. r. o.', + 's.r.o.', + 'příspěvková organizace', + ', příspěvková organizace', + ', p. o.', + ] + + result = name + for suffix in suffixes: + result = result.replace(suffix, '') + + # Clean up extra whitespace + result = ' '.join(result.split()) + result = result.strip(' -,') + + return result + + +def load_czech_source_data() -> Dict[str, Dict]: + """Load Czech CH-Annotator source file and create lookup tables.""" + by_name = {} + by_aron_uuid = {} + by_wikidata = {} + + if not CZECH_CH_ANNOTATOR_FILE.exists(): + print(f"Warning: Czech CH-Annotator file not found: {CZECH_CH_ANNOTATOR_FILE}") + return {'by_name': by_name, 'by_aron_uuid': by_aron_uuid, 'by_wikidata': by_wikidata} + + print(f"Loading Czech CH-Annotator source file...") + with open(CZECH_CH_ANNOTATOR_FILE, 'r', encoding='utf-8') as f: + entries = yaml.safe_load(f) + + if not entries: + return {'by_name': by_name, 'by_aron_uuid': by_aron_uuid, 'by_wikidata': by_wikidata} + + for entry in entries: + if not isinstance(entry, dict): + continue + + # Extract location data + locations = entry.get('locations', []) + if not locations: + continue + + loc = locations[0] if locations else {} + if not loc.get('city'): + continue + + location_data = { + 'city': loc.get('city'), + 'region': loc.get('region'), + 'region_code': CZECH_REGION_NAMES.get(loc.get('region', ''), None), + 'postal_code': loc.get('postal_code'), + 'street_address': loc.get('street_address'), + 'latitude': loc.get('latitude'), + 'longitude': loc.get('longitude'), + 'name': entry.get('name', '') + } + + # Index by name (exact and normalized) + name = entry.get('name', '') + if name: + by_name[name] = location_data + by_name[name.lower()] = location_data + # Also normalized version + normalized = normalize_czech_name(name) + if normalized and normalized != name: + by_name[normalized] = 
def generate_city_code(city_name: str) -> str:
    """Generate an upper-case city code from a (Czech) city name.

    Diacritics are removed first.  Czech prepositions such as "nad", "pod" or
    "v" are ignored unless the name consists of nothing else.  Then:

    * one significant word  -> its first three letters ("Praha" -> "PRA");
    * several               -> initials of the first three significant words
      ("Nové Město na Moravě" -> "NMM").

    Note that a name with exactly two significant words therefore yields a
    two-letter code ("Ústí nad Labem" -> "UL").

    Empty, whitespace-only or otherwise wordless input returns the ``'XXX'``
    placeholder rather than an empty string.
    """
    if not city_name:
        return 'XXX'

    # Remove diacritics: decompose, then drop the combining marks.
    import unicodedata
    decomposed = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    words = ascii_name.split()
    if not words:
        return 'XXX'

    # Czech articles/prepositions to skip
    skip_words = {'nad', 'pod', 'u', 'v', 've', 'na', 'do', 'z', 'ze', 'k', 'ke', 'o', 's', 'se'}
    significant_words = [w for w in words if w.lower() not in skip_words] or words

    if len(significant_words) == 1:
        return significant_words[0][:3].upper()
    return ''.join(w[0] for w in significant_words[:3]).upper()
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Derive a deterministic UUID-v8-style identifier from a GHCID string.

    The first 128 bits of the SHA-256 digest of the UTF-8 encoded GHCID are
    used as the UUID value, with the version nibble forced to 8 and the
    variant bits forced to ``10``.
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    value = int.from_bytes(digest[:16], 'big')

    # Version: high nibble of byte 6 (bits 76-79) becomes 0b1000.
    value = (value & ~(0xF << 76)) | (0x8 << 76)
    # Variant: top two bits of byte 8 (bits 62-63) become 0b10.
    value = (value & ~(0x3 << 62)) | (0x2 << 62)

    return str(uuid.UUID(int=value))
def reverse_geocode_city(city_name: str, country_code: str, db_path: Path) -> Optional[Dict]:
    """Look up a city by name in the GeoNames SQLite database.

    An exact match on ``name`` / ``ascii_name`` (``name`` also compared
    case-insensitively) is tried first; if it finds nothing, a prefix match
    is tried.  When several rows match, the most populous one wins.

    Args:
        city_name: City name to search for.  A blank name returns ``None``
            (as a prefix it would match every city in the country).
        country_code: Two-letter country code restricting the search.
        db_path: Path of the GeoNames SQLite database.

    Returns:
        A dict with ``geonames_id``, ``geonames_name``, ``ascii_name``,
        ``latitude``, ``longitude``, ``population``, ``feature_code``,
        ``admin1_code``, ``admin1_name`` and ``region_code``, or ``None`` when
        no city matches or the lookup fails (the error is printed).
        ``region_code`` is derived from ``CZECH_ADMIN1_MAP`` and is therefore
        only filled in for ``country_code == 'CZ'``.
    """
    if not city_name or not city_name.strip():
        return None

    query_template = """
        SELECT geonames_id, name, ascii_name, latitude, longitude,
               population, feature_code, admin1_code, admin1_name
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
          AND ({condition})
        ORDER BY population DESC
        LIMIT 1
    """
    # Only fixed SQL fragments are formatted in; all values stay bound parameters.
    exact_query = query_template.format(
        condition="name = ? OR ascii_name = ? OR LOWER(name) = LOWER(?)")
    prefix_query = query_template.format(
        condition="name LIKE ? ESCAPE '\\' OR ascii_name LIKE ? ESCAPE '\\'")

    # Escape LIKE metacharacters so the name is matched literally.
    like_prefix = (city_name.replace('\\', '\\\\')
                            .replace('%', '\\%')
                            .replace('_', '\\_')) + '%'

    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            cursor.execute(exact_query, (country_code, city_name, city_name, city_name))
            row = cursor.fetchone()

            if not row:
                cursor.execute(prefix_query, (country_code, like_prefix, like_prefix))
                row = cursor.fetchone()
        finally:
            # Always release the connection, even when a query raises.
            conn.close()

        if not row:
            return None

        admin1_code = row[7]
        # The admin1 -> region mapping is Czech-specific.
        region_code = CZECH_ADMIN1_MAP.get(admin1_code) if country_code == 'CZ' else None
        return {
            'geonames_id': row[0],
            'geonames_name': row[1],
            'ascii_name': row[2],
            'latitude': row[3],
            'longitude': row[4],
            'population': row[5],
            'feature_code': row[6],
            'admin1_code': admin1_code,
            'admin1_name': row[8],
            'region_code': region_code,
        }

    except Exception as e:
        print(f" GeoNames lookup error: {e}")
        return None
{}).get('claim_value', '') + + # Get identifiers for lookup + aron_uuid = None + wikidata_id = None + for ident in data.get('identifiers', []): + if isinstance(ident, dict): + scheme = ident.get('identifier_scheme', '') + value = ident.get('identifier_value', '') + if scheme == 'ARON_UUID': + aron_uuid = value + elif scheme == 'Wikidata': + wikidata_id = value + + # Also check original_entry.identifiers + for ident in data.get('original_entry', {}).get('identifiers', []): + if isinstance(ident, dict): + scheme = ident.get('identifier_scheme', '') + value = ident.get('identifier_value', '') + if scheme == 'ARON_UUID' and not aron_uuid: + aron_uuid = value + elif scheme == 'Wikidata' and not wikidata_id: + wikidata_id = value + + # Try to find location data from source + location_data = None + location_source = None + + # Try by name first + if inst_name: + location_data = lookup['by_name'].get(inst_name) + if location_data: + location_source = 'source_by_name' + else: + # Try lowercase + location_data = lookup['by_name'].get(inst_name.lower()) + if location_data: + location_source = 'source_by_name_lower' + else: + # Try normalized + normalized = normalize_czech_name(inst_name) + if normalized: + location_data = lookup['by_name'].get(normalized) + if location_data: + location_source = 'source_by_normalized_name' + else: + location_data = lookup['by_name'].get(normalized.lower()) + if location_data: + location_source = 'source_by_normalized_name_lower' + + # Try by ARON UUID + if not location_data and aron_uuid: + location_data = lookup['by_aron_uuid'].get(aron_uuid) + if location_data: + location_source = 'source_by_aron_uuid' + + # Try by Wikidata + if not location_data and wikidata_id: + location_data = lookup['by_wikidata'].get(wikidata_id) + if location_data: + location_source = 'source_by_wikidata' + + # Fallback to Wikidata SPARQL (skip for now - too slow) + # if not location_data and wikidata_id: + # time.sleep(REQUEST_DELAY) + # location_data = 
fetch_wikidata_location(wikidata_id, session) + # if location_data: + # location_source = 'wikidata_sparql' + + # Fallback: extract city from institution name + if not location_data or not location_data.get('city'): + extracted_city = extract_city_from_name(inst_name) + if extracted_city: + # Validate against GeoNames + geonames_data = reverse_geocode_city(extracted_city, 'CZ', GEONAMES_DB) + if geonames_data: + location_data = { + 'city': geonames_data.get('geonames_name', extracted_city), + 'region_code': geonames_data.get('region_code'), + 'geonames_id': geonames_data.get('geonames_id'), + 'geonames_name': geonames_data.get('geonames_name'), + 'latitude': geonames_data.get('latitude'), + 'longitude': geonames_data.get('longitude'), + } + location_source = 'extracted_from_name' + + if not location_data or not location_data.get('city'): + result['status'] = 'no_city_found' + result['error'] = f'No location data for: {inst_name}' + return result + + city_name = location_data['city'] + result['city'] = city_name + + # Generate city code + city_code = generate_city_code(city_name) + + # Get region code + region_code = location_data.get('region_code') + if not region_code: + # Try to get from GeoNames + geonames_data = reverse_geocode_city(city_name, 'CZ', GEONAMES_DB) + if geonames_data: + region_code = geonames_data.get('region_code') + location_data['geonames_id'] = geonames_data.get('geonames_id') + location_data['geonames_name'] = geonames_data.get('geonames_name') + if not location_data.get('latitude'): + location_data['latitude'] = geonames_data.get('latitude') + location_data['longitude'] = geonames_data.get('longitude') + + # Build new GHCID + parts = ghcid_current.split('-') + if len(parts) >= 5: + # Replace XXX with city code, and update region if we have it + parts[2] = city_code + if region_code: + parts[1] = region_code + new_ghcid = '-'.join(parts) + else: + new_ghcid = ghcid_current.replace('-XXX-', f'-{city_code}-') + + result['new_ghcid'] = new_ghcid 
+ + if new_ghcid == ghcid_current: + result['status'] = 'unchanged' + return result + + if dry_run: + result['status'] = 'would_update' + return result + + # Update the data + now = datetime.now(timezone.utc).isoformat() + + # Update GHCID + data['ghcid']['ghcid_current'] = new_ghcid + data['ghcid']['ghcid_uuid'] = generate_ghcid_uuid(new_ghcid) + data['ghcid']['ghcid_uuid_sha256'] = generate_ghcid_uuid_sha256(new_ghcid) + data['ghcid']['ghcid_numeric'] = generate_ghcid_numeric(new_ghcid) + + # Update location_resolution + location_resolution = { + 'method': 'CZECH_CH_ANNOTATOR_ENRICHMENT', + 'city_name': city_name, + 'city_code': city_code, + 'country_code': 'CZ', + 'enrichment_date': now, + 'source': location_source + } + + if region_code: + location_resolution['region_code'] = region_code + location_resolution['region_name'] = location_data.get('region', f'CZ-{region_code}') + + if location_data.get('geonames_id'): + location_resolution['geonames_id'] = location_data['geonames_id'] + location_resolution['geonames_name'] = location_data['geonames_name'] + + if location_data.get('latitude'): + location_resolution['latitude'] = location_data['latitude'] + location_resolution['longitude'] = location_data['longitude'] + + data['ghcid']['location_resolution'] = location_resolution + + # Add GHCID history entry + history = data['ghcid'].get('ghcid_history', []) + if history and isinstance(history, list) and len(history) > 0: + # Close previous entry + if isinstance(history[0], dict): + history[0]['valid_to'] = now + + history.insert(0, { + 'ghcid': new_ghcid, + 'ghcid_numeric': data['ghcid']['ghcid_numeric'], + 'valid_from': now, + 'valid_to': None, + 'reason': f'City code updated from Czech CH-Annotator enrichment: {city_name} -> {city_code}' + }) + data['ghcid']['ghcid_history'] = history + + # Update location in original_entry if exists + if 'original_entry' in data: + if 'locations' not in data['original_entry'] or not data['original_entry']['locations']: + 
def main():
    """Run the Czech city enrichment over all ``CZ-*-XXX-*.yaml`` custodian files.

    Command-line flags: ``--dry-run`` (report only), ``--limit N`` (process at
    most N files) and ``--verbose``/``-v``. Prints a summary and writes a
    Markdown report into ``REPORTS_DIR``.
    """
    parser = argparse.ArgumentParser(description='Enrich Czech custodian files with city data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show verbose output')
    args = parser.parse_args()

    print("=" * 60)
    print("CZECH CITY ENRICHMENT")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")

    # Find Czech files with XXX city placeholder. Sorted so that --limit
    # selects the same subset on every run (glob order is arbitrary).
    czech_xxx_files = sorted(CUSTODIAN_DIR.glob("CZ-*-XXX-*.yaml"))

    if args.limit:
        czech_xxx_files = czech_xxx_files[:args.limit]
        print(f"Limited to {args.limit} files")

    total = len(czech_xxx_files)
    print(f"Found {total} Czech files with XXX city placeholder")
    print()

    # Load Czech source data
    lookup = load_czech_source_data()

    # Process files
    session = requests.Session()
    session.headers['User-Agent'] = 'GLAMDataExtractor/1.0 (heritage-data-enrichment)'

    stats = {
        'updated': 0,
        'would_update': 0,
        'unchanged': 0,
        'skipped': 0,
        'no_city_found': 0,
        'error': 0
    }

    cities_found = {}
    errors = []

    for i, file_path in enumerate(czech_xxx_files, 1):
        if i % 100 == 0 or args.verbose:
            print(f"Progress: {i}/{total}")

        result = process_file(file_path, lookup, session, dry_run=args.dry_run)
        status = result['status']
        # .get() keeps the tally working for statuses not pre-seeded above.
        stats[status] = stats.get(status, 0) + 1

        city = result.get('city')
        if city:
            cities_found[city] = cities_found.get(city, 0) + 1

        if result.get('error'):
            errors.append(f"{file_path.name}: {result['error']}")

        if args.verbose and status in ('updated', 'would_update'):
            print(f" {file_path.name}")
            print(f" City: {result.get('city')}")
            print(f" {result['old_ghcid']} -> {result['new_ghcid']}")

    cities_by_count = sorted(cities_found.items(), key=lambda x: -x[1])

    # Print summary
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {total}")
    print()
    print("Results:")
    for status, count in sorted(stats.items()):
        if count > 0:
            print(f" {status}: {count}")

    if cities_found:
        print()
        print(f"Cities found: {len(cities_found)} unique")
        print("Top 10 cities:")
        for city, count in cities_by_count[:10]:
            print(f" {city}: {count}")

    if errors:
        print()
        print(f"Errors ({len(errors)}):")
        for err in errors[:10]:
            print(f" {err}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")

    # Save report
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    generated_at = datetime.now()
    report_file = REPORTS_DIR / f"CZECH_CITY_ENRICHMENT_{generated_at.strftime('%Y%m%d_%H%M%S')}.md"

    # City names contain Czech diacritics, so the encoding must not depend on
    # the platform default.
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("# Czech City Enrichment Report\n\n")
        f.write(f"**Date**: {generated_at.isoformat()}\n")
        f.write(f"**Mode**: {'Dry Run' if args.dry_run else 'Live'}\n\n")
        f.write("## Summary\n\n")
        f.write(f"- Total files processed: {total}\n")
        for status, count in sorted(stats.items()):
            if count > 0:
                f.write(f"- {status}: {count}\n")

        if cities_found:
            f.write(f"\n## Cities Found ({len(cities_found)} unique)\n\n")
            for city, count in cities_by_count:
                f.write(f"- {city}: {count}\n")

    print()
    print(f"Report saved to: {report_file}")
Updates custodian files with city codes + +Usage: + python scripts/enrich_czech_cities_fast.py [--dry-run] [--limit N] +""" + +import argparse +import hashlib +import os +import re +import shutil +import sqlite3 +import uuid +import yaml +from datetime import datetime, timezone +from pathlib import Path +from typing import Dict, Optional + + +# Paths +PROJECT_ROOT = Path(__file__).parent.parent +CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian" +GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db" +REPORTS_DIR = PROJECT_ROOT / "reports" + +# GHCID namespace for UUID generation +GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8') + +# Czech region mapping (GeoNames admin1 to ISO 3166-2:CZ) +CZECH_ADMIN1_MAP = { + '52': 'JC', '78': 'JM', '81': 'KA', '82': 'VY', '51': 'KR', + '53': 'LI', '84': 'MO', '85': 'OL', '86': 'PA', '54': 'PL', + '10': 'PR', '55': 'ST', '56': 'US', '87': 'ZL', +} + +# Czech locative to nominative mappings +LOCATIVE_TO_NOMINATIVE = { + # Major cities + 'Praze': 'Praha', + 'Brně': 'Brno', + 'Ostravě': 'Ostrava', + 'Plzni': 'Plzeň', + 'Olomouci': 'Olomouc', + 'Liberci': 'Liberec', + 'Opavě': 'Opava', + 'Hradci Králové': 'Hradec Králové', + 'Českých Budějovicích': 'České Budějovice', + 'Pardubicích': 'Pardubice', + 'Zlíně': 'Zlín', + 'Kladně': 'Kladno', + 'Havlíčkově Brodě': 'Havlíčkův Brod', + + # Medium cities + 'Prostějově': 'Prostějov', + 'Domažlicích': 'Domažlice', + 'Litoměřicích': 'Litoměřice', + 'Klatovech': 'Klatovy', + 'Kopřivnici': 'Kopřivnice', + 'Pacově': 'Pacov', + 'Táboře': 'Tábor', + 'Písku': 'Písek', + 'Trutnově': 'Trutnov', + 'Chebu': 'Cheb', + 'Karviné': 'Karviná', + 'Havířově': 'Havířov', + 'Mostě': 'Most', + 'Chomutově': 'Chomutov', + 'Teplicích': 'Teplice', + 'Děčíně': 'Děčín', + 'Jablonci nad Nisou': 'Jablonec nad Nisou', + 'Mladé Boleslavi': 'Mladá Boleslav', + 'Příbrami': 'Příbram', + 'Kolíně': 'Kolín', + 'Jihlavě': 'Jihlava', + 'Třebíči': 'Třebíč', + 'Znojmě': 'Znojmo', + 'Břeclavi': 
# One capitalised Czech word, optionally hyphenated (e.g. "Frýdku-Místku").
_CITY_WORD = r'[A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ][a-záčďéěíňóřšťúůýž]+'
_CITY_TOKEN = rf'{_CITY_WORD}(?:-{_CITY_WORD})*'

# Preposition "v"/"ve" followed by one or more capitalised words, optionally
# joined by "nad"/"pod" (e.g. "v Ústí nad Labem").
_CITY_AFTER_PREPOSITION = re.compile(
    rf'\bve?\s+({_CITY_TOKEN}(?:\s+(?:nad|pod)?\s*{_CITY_TOKEN})*)'
)


def _lookup_locative(city: str) -> Optional[str]:
    """Return the nominative for a known locative form, or ``None`` if unknown."""
    exact = LOCATIVE_TO_NOMINATIVE.get(city)
    if exact is not None:
        return exact

    lowered = city.lower()
    for locative, nominative in LOCATIVE_TO_NOMINATIVE.items():
        if locative.lower() == lowered:
            return nominative

    return None


def convert_locative_to_nominative(city: str) -> str:
    """Convert a Czech locative city name to nominative.

    Looks the name up in ``LOCATIVE_TO_NOMINATIVE``, first exactly and then
    case-insensitively. Names without a mapping are returned unchanged.
    """
    known = _lookup_locative(city)
    return city if known is None else known


def extract_city_from_name(name: str) -> Optional[str]:
    """Extract a city name from a Czech institution name ("v/ve + City").

    The leftmost "v"/"ve" followed by capitalised words is used. The captured
    words are converted from locative to nominative. Because the capture is
    greedy, trailing capitalised words that are not part of the city (e.g.
    "v Praze Národní ...") are dropped when a shorter leading part is a known
    locative form. If no part is known, the full capture is returned as-is.

    Returns:
        The (converted) city name, or ``None`` when ``name`` is empty or
        contains no "v/ve + City" pattern.
    """
    if not name:
        return None

    match = _CITY_AFTER_PREPOSITION.search(name)
    if not match:
        return None

    candidate = match.group(1)
    words = candidate.split()

    # Longest known prefix wins, so "Hradci Králové" beats a bare "Hradci".
    for end in range(len(words), 0, -1):
        known = _lookup_locative(' '.join(words[:end]))
        if known is not None:
            return known

    return candidate
def generate_city_code(city_name: str) -> str:
    """Generate an upper-case city code from a city name.

    Diacritics are stripped first. A single-word name yields its first three
    letters; a multi-word name yields the initials of up to three words,
    ignoring short connecting words such as "nad"/"pod" unless nothing else
    is left. Empty or whitespace-only input yields the placeholder ``'XXX'``.

    Note that the result can be shorter than three characters (a two-letter
    name, or a name with two significant words).
    """
    if not city_name or not city_name.strip():
        # Whitespace-only input has no words and would yield an empty code.
        return 'XXX'

    import unicodedata
    # NFD splits accented letters into base letter + combining mark (Mn).
    normalized = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')

    skip_words = {'nad', 'pod', 'u', 'v', 've', 'na', 'do', 'z', 'ze', 'k', 'ke'}
    words = ascii_name.split()
    significant_words = [w for w in words if w.lower() not in skip_words]

    if not significant_words:
        significant_words = words

    if len(significant_words) == 1:
        return significant_words[0][:3].upper()
    return ''.join(w[0] for w in significant_words[:3]).upper()


def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the name-based UUID (uuid5) of a GHCID under ``GHCID_NAMESPACE``."""
    return str(uuid.uuid5(GHCID_NAMESPACE, ghcid_string))


def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Return a UUID built from the first 16 bytes of the GHCID's SHA-256 digest.

    The version nibble is forced to 8 and the variant bits to ``10``.
    """
    hash_bytes = bytearray(hashlib.sha256(ghcid_string.encode('utf-8')).digest()[:16])
    hash_bytes[6] = (hash_bytes[6] & 0x0F) | 0x80  # version nibble -> 8
    hash_bytes[8] = (hash_bytes[8] & 0x3F) | 0x80  # variant bits -> 10xxxxxx
    return str(uuid.UUID(bytes=bytes(hash_bytes)))


def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return the first 8 bytes of the GHCID's SHA-256 digest as a big-endian int."""
    hash_bytes = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(hash_bytes[:8], 'big')


def lookup_city_geonames(city_name: str, db_path: Path) -> Optional[Dict]:
    """Look up a Czech city by name in the GeoNames SQLite database.

    Tries an exact match on ``name``/``ascii_name`` (or a case-insensitive
    match on ``name``) and then a prefix match. The most populous matching
    row wins.

    Returns:
        A dict with the GeoNames id, names, coordinates, population, feature
        code, admin1 code and the ``region_code`` mapped through
        ``CZECH_ADMIN1_MAP``; ``None`` when nothing matches or the database
        cannot be queried.
    """
    base_query = (
        "SELECT geonames_id, name, ascii_name, latitude, longitude, "
        "population, feature_code, admin1_code "
        "FROM cities "
        "WHERE country_code = 'CZ' "
        "AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC') "
        "AND ({name_filter}) "
        "ORDER BY population DESC "
        "LIMIT 1"
    )
    exact_query = base_query.format(
        name_filter="name = ? OR ascii_name = ? OR LOWER(name) = LOWER(?)"
    )
    prefix_query = base_query.format(name_filter="name LIKE ? OR ascii_name LIKE ?")

    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            cursor.execute(exact_query, (city_name, city_name, city_name))
            row = cursor.fetchone()

            if not row:
                # Try prefix match
                prefix = f"{city_name}%"
                cursor.execute(prefix_query, (prefix, prefix))
                row = cursor.fetchone()
        finally:
            # Always release the connection, including when a query raises.
            conn.close()
    except sqlite3.Error as e:
        print(f" GeoNames error: {e}")
        return None

    if not row:
        return None

    admin1_code = row[7]
    return {
        'geonames_id': row[0],
        'geonames_name': row[1],
        'ascii_name': row[2],
        'latitude': row[3],
        'longitude': row[4],
        'population': row[5],
        'feature_code': row[6],
        'admin1_code': admin1_code,
        'region_code': CZECH_ADMIN1_MAP.get(admin1_code),
    }
def process_file(file_path: Path, dry_run: bool = True) -> Dict:
    """Resolve the city of one Czech custodian file from its institution name.

    Only files whose current GHCID starts with ``CZ-`` and contains the
    ``-XXX-`` city placeholder are handled. The city is extracted from the
    institution name, validated against GeoNames, and used to build the new
    GHCID (city code plus region code when GeoNames provides one).

    Args:
        file_path: YAML custodian file to process.
        dry_run: When true, compute the new GHCID but do not touch the file.

    Returns:
        A dict with ``status``, ``old_ghcid``, ``new_ghcid``, ``city`` and
        ``error`` (plus ``renamed_to`` after a rename). ``status`` is one of
        ``skipped``, ``unchanged``, ``no_city_in_name``,
        ``city_not_in_geonames``, ``collision``, ``would_update``,
        ``updated`` or ``error``.
    """
    result = {'status': 'unchanged', 'old_ghcid': None, 'new_ghcid': None, 'city': None, 'error': None}

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            result['status'] = 'error'
            result['error'] = 'Empty file'
            return result

        ghcid_current = (data.get('ghcid') or {}).get('ghcid_current') or ''
        if not ghcid_current.startswith('CZ-') or '-XXX-' not in ghcid_current:
            result['status'] = 'skipped'
            return result

        result['old_ghcid'] = ghcid_current

        # Get institution name
        inst_name = (data.get('original_entry') or {}).get('name', '')
        if not inst_name:
            inst_name = (data.get('custodian_name') or {}).get('claim_value', '')

        # Try to extract city from name
        extracted_city = extract_city_from_name(inst_name)
        if not extracted_city:
            result['status'] = 'no_city_in_name'
            return result

        # Validate against GeoNames
        geonames_data = lookup_city_geonames(extracted_city, GEONAMES_DB)
        if not geonames_data:
            result['status'] = 'city_not_in_geonames'
            result['error'] = f'City not found in GeoNames: {extracted_city}'
            return result

        city_name = geonames_data['geonames_name']
        city_code = generate_city_code(city_name)
        region_code = geonames_data.get('region_code')

        result['city'] = city_name

        # Build new GHCID
        parts = ghcid_current.split('-')
        if len(parts) >= 5:
            parts[2] = city_code
            if region_code:
                parts[1] = region_code
            new_ghcid = '-'.join(parts)
        else:
            new_ghcid = ghcid_current.replace('-XXX-', f'-{city_code}-')

        result['new_ghcid'] = new_ghcid

        if new_ghcid == ghcid_current:
            result['status'] = 'unchanged'
            return result

        # Refuse to proceed when another file already owns the new GHCID.
        # Otherwise this file would be rewritten with the new identifier but
        # keep its old name, leaving two files that claim the same GHCID.
        new_path = file_path.parent / f"{new_ghcid}.yaml"
        if new_path != file_path and new_path.exists():
            result['status'] = 'collision'
            result['error'] = f'Target file already exists: {new_path.name}'
            return result

        if dry_run:
            result['status'] = 'would_update'
            return result

        # Update the data
        now = datetime.now(timezone.utc).isoformat()
        ghcid_section = data['ghcid']

        ghcid_section['ghcid_current'] = new_ghcid
        ghcid_section['ghcid_uuid'] = generate_ghcid_uuid(new_ghcid)
        ghcid_section['ghcid_uuid_sha256'] = generate_ghcid_uuid_sha256(new_ghcid)
        ghcid_section['ghcid_numeric'] = generate_ghcid_numeric(new_ghcid)

        ghcid_section['location_resolution'] = {
            'method': 'EXTRACTED_FROM_NAME',
            'city_name': city_name,
            'city_code': city_code,
            'region_code': region_code,
            'country_code': 'CZ',
            'enrichment_date': now,
            'geonames_id': geonames_data['geonames_id'],
            'geonames_name': geonames_data['geonames_name'],
            'latitude': geonames_data['latitude'],
            'longitude': geonames_data['longitude'],
        }

        # Add history entry (newest first), closing the previous one
        history = ghcid_section.get('ghcid_history')
        if not isinstance(history, list):
            history = []
        if history and isinstance(history[0], dict):
            history[0]['valid_to'] = now
        history.insert(0, {
            'ghcid': new_ghcid,
            'ghcid_numeric': ghcid_section['ghcid_numeric'],
            'valid_from': now,
            'reason': f'City extracted from name: {city_name} -> {city_code}'
        })
        ghcid_section['ghcid_history'] = history

        # Update identifiers
        for ident in data.get('identifiers') or []:
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
                ident['identifier_value'] = new_ghcid

        # Write updated file
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Rename file to match the new GHCID (collision already ruled out above)
        if new_path != file_path:
            shutil.move(file_path, new_path)
            result['renamed_to'] = str(new_path.name)

        result['status'] = 'updated'
        return result

    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
        return result
def main():
    """Run the fast (name-based) Czech city enrichment over ``CZ-*-XXX-*.yaml`` files.

    Command-line flags: ``--dry-run``, ``--limit N`` and ``--verbose``/``-v``.
    Prints a summary and writes a Markdown report into ``REPORTS_DIR``.
    """
    parser = argparse.ArgumentParser(description='Fast Czech city enrichment from names')
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int)
    parser.add_argument('--verbose', '-v', action='store_true')
    args = parser.parse_args()

    print("=" * 60)
    print("CZECH CITY ENRICHMENT (Fast Mode)")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN MODE")

    # Sorted so that --limit selects the same subset on every run
    # (glob order is arbitrary).
    czech_xxx_files = sorted(CUSTODIAN_DIR.glob("CZ-*-XXX-*.yaml"))
    if args.limit:
        czech_xxx_files = czech_xxx_files[:args.limit]

    total = len(czech_xxx_files)
    print(f"Found {total} Czech files with XXX placeholder")

    stats = {}
    cities_found = {}

    for i, file_path in enumerate(czech_xxx_files, 1):
        if i % 50 == 0:
            print(f"Progress: {i}/{total}")

        result = process_file(file_path, dry_run=args.dry_run)
        status = result['status']
        stats[status] = stats.get(status, 0) + 1

        city = result.get('city')
        if city:
            cities_found[city] = cities_found.get(city, 0) + 1

        if args.verbose and status in ('updated', 'would_update'):
            print(f" {result['old_ghcid']} -> {result['new_ghcid']} ({result['city']})")

    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total processed: {total}")
    for status, count in sorted(stats.items()):
        if count > 0:
            print(f" {status}: {count}")

    if cities_found:
        print(f"\nCities found: {len(cities_found)} unique")
        print("Top 10:")
        for city, count in sorted(cities_found.items(), key=lambda x: -x[1])[:10]:
            print(f" {city}: {count}")

    # Save report
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    generated_at = datetime.now()
    report_file = REPORTS_DIR / f"CZECH_CITY_FAST_{generated_at.strftime('%Y%m%d_%H%M%S')}.md"
    # Explicit encoding so the report does not depend on the platform default.
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("# Czech City Enrichment (Fast Mode)\n\n")
        f.write(f"**Date**: {generated_at.isoformat()}\n")
        f.write(f"**Mode**: {'Dry Run' if args.dry_run else 'Live'}\n\n")
        f.write("## Results\n")
        for status, count in sorted(stats.items()):
            f.write(f"- {status}: {count}\n")

    print(f"\nReport: {report_file}")
Adds Google Maps enrichment data + +Usage: + python scripts/enrich_japanese_cities.py [--dry-run] [--limit N] + +Environment Variables: + GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API enabled +""" + +import os +import sys +import time +import sqlite3 +import re +import argparse +from pathlib import Path +from datetime import datetime, timezone +from typing import Optional + +import yaml +import httpx +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Configuration +GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "") +GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db") +CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian") + +# Google Places API +TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText" +REQUEST_DELAY = 0.3 # Rate limiting + +# Japanese prefecture GeoNames admin1_code to ISO 3166-2:JP mapping +ADMIN1_TO_ISO = { + '01': 'AI', # Aichi + '02': 'AK', # Akita + '03': 'AO', # Aomori + '04': 'CH', # Chiba + '05': 'EH', # Ehime + '06': 'FI', # Fukui + '07': 'FO', # Fukuoka + '08': 'FS', # Fukushima + '09': 'GI', # Gifu + '10': 'GU', # Gunma + '11': 'HS', # Hiroshima + '12': 'HO', # Hokkaido + '13': 'HG', # Hyogo + '14': 'IB', # Ibaraki + '15': 'IS', # Ishikawa + '16': 'IW', # Iwate + '17': 'KA', # Kagawa + '18': 'KS', # Kagoshima + '19': 'KN', # Kanagawa + '20': 'KC', # Kochi + '21': 'KM', # Kumamoto + '22': 'KY', # Kyoto + '23': 'ME', # Mie + '24': 'MG', # Miyagi + '25': 'MZ', # Miyazaki + '26': 'NN', # Nagano + '27': 'NS', # Nagasaki + '28': 'NR', # Nara + '29': 'NI', # Niigata + '30': 'OT', # Oita + '31': 'OK', # Okayama + '32': 'OS', # Osaka + '33': 'SG', # Saga + '34': 'ST', # Saitama + '35': 'SI', # Shiga + '36': 'SM', # Shimane + '37': 'SZ', # Shizuoka + '38': 'TC', # Tochigi + '39': 'TS', # Tokushima + '40': 'TK', # Tokyo + '41': 'TT', # Tottori + '42': 'TY', # Toyama + '43': 'WK', # Wakayama + '44': 'YG', # Yamagata + '45': 'YM', # 
def get_city_code(city_name: str) -> str:
    """Generate an upper-case city code from a (romanised) Japanese city name.

    Administrative suffixes such as " Shi", " Ku" or "-shi" are removed
    first. One word yields its first three letters, two words yield the first
    letter of the first plus the first two of the second, and longer names
    yield the initials of the first three words. Empty or whitespace-only
    input yields the placeholder ``'XXX'``.
    """
    if not city_name or not city_name.strip():
        # No words at all would otherwise produce an empty code.
        return 'XXX'

    # Clean suffixes common in Japanese city names
    name = city_name.strip()
    for suffix in [' Shi', ' Ku', ' Cho', ' Machi', ' Mura', ' Gun', ' City', '-shi', '-ku']:
        if name.endswith(suffix):
            name = name[:-len(suffix)]

    words = name.split()

    if len(words) == 1:
        return name[:3].upper()
    if len(words) == 2:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()


def search_google_places(query: str, api_key: str, country_bias: str = "JP") -> Optional[dict]:
    """Run a Google Places text search and return the first result.

    Args:
        query: Free-text query sent as ``textQuery``.
        api_key: Key sent in the ``X-Goog-Api-Key`` header.
        country_bias: Accepted for interface compatibility.
            NOTE(review): this value is not sent with the request at present.

    Returns:
        The first entry of the response's ``places`` list, or ``None`` when
        there are no results or the request fails (the error is printed).
    """
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": api_key,
        "X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.location,places.addressComponents,places.types,places.id,places.websiteUri"
    }

    payload = {
        "textQuery": query,
        "languageCode": "en"
    }

    try:
        response = httpx.post(TEXT_SEARCH_URL, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        places = response.json().get("places")
        return places[0] if places else None
    except Exception as e:
        print(f" Error searching Google Places: {e}")
        return None


def extract_location_from_google(place: dict) -> dict:
    """Extract location information from a Google Places result.

    Returns a dict with the keys ``city``, ``prefecture``,
    ``prefecture_code``, ``latitude``, ``longitude``, ``formatted_address``,
    ``place_id`` and ``website``; every value is ``None`` when the result
    does not provide it. A ``locality`` component always sets the city; a
    ``sublocality_level_1`` component is used only while no city is set.
    """
    result = {
        'city': None,
        'prefecture': None,
        'prefecture_code': None,
        'latitude': None,
        'longitude': None,
        'formatted_address': None,
        'place_id': None,
        'website': None,
    }

    if not place:
        return result

    result['place_id'] = place.get('id')
    result['formatted_address'] = place.get('formattedAddress')
    result['website'] = place.get('websiteUri')

    # Get coordinates ("or {}" also covers an explicit null in the response)
    location = place.get('location') or {}
    result['latitude'] = location.get('latitude')
    result['longitude'] = location.get('longitude')

    # Parse address components
    for comp in place.get('addressComponents') or []:
        types = comp.get('types') or []
        long_name = comp.get('longText', '')

        if 'locality' in types:
            result['city'] = long_name
        elif 'administrative_area_level_1' in types:
            result['prefecture'] = long_name
            # Try to get the prefecture code from its name
            result['prefecture_code'] = PREFECTURE_TO_ISO.get(long_name)
        elif 'sublocality_level_1' in types and not result['city']:
            # Use ward/sublocality as city if no locality
            result['city'] = long_name

    return result


def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float) -> Optional[dict]:
    """Find the Japanese populated place in GeoNames nearest to a coordinate.

    Distance is approximated on a flat projection: the longitude difference
    is scaled by ``cos(latitude)`` because a degree of longitude covers less
    ground away from the equator. Without that scaling, east-west neighbours
    look farther away than they are and the wrong city can be chosen.

    Args:
        conn: Open connection to a database containing the ``cities`` table.
        lat: Latitude of the query point in degrees.
        lon: Longitude of the query point in degrees.

    Returns:
        A dict describing the nearest matching row, or ``None`` when the
        table has no matching row.
    """
    import math

    # Squared because it is applied to the squared longitude difference.
    lon_weight = math.cos(math.radians(lat)) ** 2

    cursor = conn.cursor()
    cursor.execute("""
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code,
               ((latitude - ?) * (latitude - ?)
                + (longitude - ?) * (longitude - ?) * ?) AS dist_sq
        FROM cities
        WHERE country_code = 'JP'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY dist_sq
        LIMIT 1
    """, (lat, lat, lon, lon, lon_weight))

    row = cursor.fetchone()
    if not row:
        return None

    return {
        'name': row[0],
        'ascii_name': row[1],
        'admin1_code': row[2],
        'admin1_name': row[3],
        'geonames_id': row[4],
        'latitude': row[5],
        'longitude': row[6],
        'population': row[7],
        'feature_code': row[8],
    }
def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str, dry_run: bool = False) -> dict:
    """Resolve the location of one Japanese ``JP-XX-XXX-*`` custodian file.

    The institution name is searched in Google Places, the returned
    coordinates are reverse geocoded against GeoNames, and the placeholder
    GHCID is rebuilt from the resolved region and city codes.  Unless
    ``dry_run`` is set, the YAML file is rewritten and renamed to match.

    Args:
        filepath: Custodian YAML file to process.
        conn: Open connection to the GeoNames SQLite database.
        api_key: Google Places API key.
        dry_run: When True, compute the new GHCID but modify nothing.

    Returns:
        A dict with ``file``, ``status`` (``skipped``, ``error``,
        ``collision``, ``would_update`` or ``updated``), ``old_ghcid``,
        ``new_ghcid``, ``city``, ``prefecture`` and ``error``; ``new_file``
        is added when the file was renamed.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'prefecture': None,
        'error': None,
    }

    def finish(status: str, error: Optional[str] = None) -> dict:
        # Single exit helper so every early return fills the same fields.
        result['status'] = status
        if error is not None:
            result['error'] = error
        return result

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        return finish('error', f'Failed to load YAML: {e}')

    if not data:
        return finish('error', 'Empty YAML file')

    # Get current GHCID (``or`` guards against explicit YAML nulls).
    ghcid_data = data.get('ghcid') or {}
    old_ghcid = ghcid_data.get('ghcid_current') or ''
    result['old_ghcid'] = old_ghcid

    if not old_ghcid.startswith('JP-XX-XXX-'):
        return finish('skipped', 'Not a JP-XX-XXX file')

    # Get institution name for search
    name = (data.get('custodian_name') or {}).get('claim_value') or ''
    if not name:
        name = (data.get('original_entry') or {}).get('name') or ''
    if not name:
        return finish('error', 'No institution name found')

    # Search Google Places
    print(f" Searching: {name[:50]}...")
    place = search_google_places(f"{name} Japan", api_key)
    time.sleep(REQUEST_DELAY)

    if not place:
        return finish('error', 'Not found in Google Places')

    # Extract location
    location_info = extract_location_from_google(place)
    latitude = location_info['latitude']
    longitude = location_info['longitude']

    # Compare against None: 0.0 is a valid coordinate, not a missing one.
    if latitude is None or longitude is None:
        return finish('error', 'No coordinates from Google')

    # Lookup in GeoNames for city code
    city_info = lookup_city_geonames(conn, latitude, longitude)
    if not city_info:
        return finish('error', 'City not found in GeoNames')

    # Determine region code; fall back to the prefecture reported by Google.
    region_code = ADMIN1_TO_ISO.get(city_info['admin1_code'], 'XX')
    if region_code == 'XX':
        # ``prefecture_code`` may be present but None, so a ``.get`` default
        # alone is not enough to keep the 'XX' placeholder.
        region_code = location_info.get('prefecture_code') or 'XX'

    # Generate city code
    city_code = get_city_code(city_info['ascii_name'])

    result['city'] = city_info['ascii_name']
    result['prefecture'] = city_info['admin1_name']

    # Build new GHCID: keep institution type and abbreviation, swap location.
    parts = old_ghcid.split('-')
    if len(parts) < 5:
        return finish('error', f'Invalid GHCID format: {old_ghcid}')
    inst_type = parts[3]
    abbreviation = '-'.join(parts[4:])

    new_ghcid = f'JP-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    # Detect a filename collision BEFORE touching the file, so a collision
    # never leaves rewritten content under the old filename.
    new_filepath = filepath.parent / f'{new_ghcid}.yaml'
    if new_filepath != filepath and new_filepath.exists():
        return finish('collision', f'Target file exists: {new_filepath.name}')

    if dry_run:
        return finish('would_update')

    # Update the data
    timestamp = datetime.now(timezone.utc).isoformat()

    # Update ghcid section
    ghcid_data['ghcid_current'] = new_ghcid
    ghcid_data['location_resolution'] = {
        'method': 'GOOGLE_PLACES_GEONAMES',
        'country_code': 'JP',
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'google_place_id': location_info.get('place_id'),
        'latitude': latitude,
        'longitude': longitude,
        'resolution_date': timestamp,
    }

    # Add Google Maps enrichment
    data['google_maps_enrichment'] = {
        'place_id': location_info.get('place_id'),
        'formatted_address': location_info.get('formatted_address'),
        'website': location_info.get('website'),
        'latitude': latitude,
        'longitude': longitude,
        'enriched_at': timestamp,
        'source': 'Google Places API (New)',
    }

    # Update location in original_entry (first location only)
    locations = (data.get('original_entry') or {}).get('locations')
    if locations:
        first_location = locations[0]
        first_location['city'] = city_info['ascii_name']
        first_location['region'] = city_info['admin1_name']
        first_location['latitude'] = latitude
        first_location['longitude'] = longitude

    # Add to GHCID history: close the open entry for the old GHCID first.
    history = ghcid_data.get('ghcid_history') or []
    ghcid_data['ghcid_history'] = history
    for entry in history:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp

    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': ghcid_data.get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'Location resolved via Google Places + GeoNames: {city_info["ascii_name"]} ({region_code})',
    })

    # Update identifiers
    for identifier in data.get('identifiers') or []:
        if isinstance(identifier, dict) and identifier.get('identifier_scheme') == 'GHCID':
            identifier['identifier_value'] = new_ghcid

    # Write updated data
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file (the target was verified free above)
    if filepath != new_filepath:
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)

    return finish('updated')
print(f" Errors: {stats.get('error', 0)}") + print(f" Collisions: {stats.get('collision', 0)}") + print(f" Skipped: {stats.get('skipped', 0)}") + + if errors: + print() + print('Files with errors (may need manual research):') + for err in errors[:10]: + print(f" - {Path(err['file']).name}: {err['error']}") + + +if __name__ == '__main__': + main() diff --git a/scripts/enrich_swiss_isil_cities.py b/scripts/enrich_swiss_isil_cities.py new file mode 100644 index 0000000000..6448bbba14 --- /dev/null +++ b/scripts/enrich_swiss_isil_cities.py @@ -0,0 +1,559 @@ +#!/usr/bin/env python3 +""" +Enrich Swiss ISIL custodian files with city data from the Swiss ISIL website. + +For Swiss custodian files with XXX city placeholder, this script: +1. Loads the source CH-Annotator file to get ISIL URLs by institution name +2. Fetches the institution page from isil.nb.admin.ch +3. Extracts city (Location) and address data +4. Reverse geocodes using GeoNames to get proper city code +5. Updates the GHCID with correct city code +6. 
Renames the file if GHCID changes + +Usage: + python scripts/enrich_swiss_isil_cities.py [--dry-run] [--limit N] +""" + +import argparse +import hashlib +import os +import re +import shutil +import sqlite3 +import time +import uuid +import yaml +import requests +from bs4 import BeautifulSoup +from datetime import datetime, timezone +from pathlib import Path +from typing import Dict, List, Optional, Tuple + + +# Paths +PROJECT_ROOT = Path(__file__).parent.parent +CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian" +GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db" +REPORTS_DIR = PROJECT_ROOT / "reports" +SWISS_CH_ANNOTATOR_FILE = PROJECT_ROOT / "data" / "instances" / "switzerland_isil_ch_annotator.yaml" + +# GHCID namespace for UUID generation +GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8') + +# Rate limiting +REQUEST_DELAY = 1.0 # seconds between requests + +# Swiss canton codes (already ISO 3166-2) +SWISS_CANTON_CODES = { + 'Aargau': 'AG', 'Appenzell Ausserrhoden': 'AR', 'Appenzell Innerrhoden': 'AI', + 'Basel-Landschaft': 'BL', 'Basel-Stadt': 'BS', 'Bern': 'BE', 'Fribourg': 'FR', + 'Geneva': 'GE', 'Glarus': 'GL', 'Graubünden': 'GR', 'Jura': 'JU', 'Lucerne': 'LU', + 'Neuchâtel': 'NE', 'Nidwalden': 'NW', 'Obwalden': 'OW', 'Schaffhausen': 'SH', + 'Schwyz': 'SZ', 'Solothurn': 'SO', 'St. 
def load_swiss_isil_lookup() -> Dict[str, str]:
    """Build an institution-name -> ISIL page URL lookup.

    Reads ``SWISS_CH_ANNOTATOR_FILE`` and, for every named entry, records the
    first ``digital_platforms`` URL that points at ``isil.nb.admin.ch``.

    Returns:
        Mapping of institution name to ISIL URL.  Empty when the source file
        is missing, empty, or not a list of entries.
    """
    lookup: Dict[str, str] = {}

    if not SWISS_CH_ANNOTATOR_FILE.exists():
        print(f"Warning: Swiss CH-Annotator file not found: {SWISS_CH_ANNOTATOR_FILE}")
        return lookup

    print("Loading Swiss CH-Annotator source file...")
    with open(SWISS_CH_ANNOTATOR_FILE, 'r', encoding='utf-8') as f:
        entries = yaml.safe_load(f)

    # A non-list root (e.g. a mapping) has no entries to walk.
    if not entries or not isinstance(entries, list):
        return lookup

    for entry in entries:
        if not isinstance(entry, dict):
            continue

        name = entry.get('name') or ''
        if not name:
            continue

        # Look for ISIL URL in digital_platforms.  ``or`` covers keys that
        # are present but null in the YAML, which ``.get(key, default)``
        # would pass through as None.
        for platform in entry.get('digital_platforms') or []:
            if not isinstance(platform, dict):
                continue
            url = platform.get('platform_url') or ''
            if isinstance(url, str) and 'isil.nb.admin.ch' in url:
                lookup[name] = url
                break

    print(f" Loaded {len(lookup)} institutions with ISIL URLs")
    return lookup
def reverse_geocode_city(city_name: str, region_code: str, country_code: str, db_path: Path) -> Optional[Dict]:
    """Look up a city by name in the GeoNames ``cities`` table.

    An exact (case-insensitive) name match is tried first, restricted to the
    canton when ``region_code`` is a known Swiss canton.  If that finds
    nothing, a prefix match without canton restriction is tried.  In both
    cases the most populous candidate wins.

    Args:
        city_name: City name as found in the source data.
        region_code: ISO 3166-2 canton abbreviation (e.g. ``'ZH'``); unknown
            values such as ``'XX'`` disable the canton filter.
        country_code: ISO country code stored in the ``country_code`` column.
        db_path: Path to the GeoNames SQLite database.

    Returns:
        Dict with ``geonames_id``, ``geonames_name``, ``ascii_name``,
        ``latitude``, ``longitude``, ``population``, ``feature_code``,
        ``admin1_code`` and ``admin1_name``; ``None`` when nothing matches or
        the lookup fails (the error is printed, not raised).
    """
    # An empty name would turn the prefix search into "match everything".
    if not city_name:
        return None

    # Numeric admin1 codes for the Swiss cantons, as used by this script.
    swiss_admin1_map = {
        'AG': '01', 'AR': '15', 'AI': '16', 'BL': '06', 'BS': '05',
        'BE': '02', 'FR': '04', 'GE': '07', 'GL': '08', 'GR': '03',
        'JU': '26', 'LU': '09', 'NE': '10', 'NW': '11', 'OW': '12',
        'SH': '14', 'SZ': '17', 'SO': '13', 'SG': '18', 'TG': '20',
        'TI': '21', 'UR': '19', 'VS': '22', 'VD': '23', 'ZG': '25', 'ZH': '24'
    }

    # Result keys, in the order of the selected columns.
    result_keys = (
        'geonames_id', 'geonames_name', 'ascii_name', 'latitude', 'longitude',
        'population', 'feature_code', 'admin1_code', 'admin1_name',
    )
    base_query = """
        SELECT geonames_id, name, ascii_name, latitude, longitude,
               population, feature_code, admin1_code, admin1_name
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
    """
    ranking = " ORDER BY population DESC LIMIT 1"

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Try exact match first
        exact_query = base_query + " AND (name = ? OR ascii_name = ? OR LOWER(name) = LOWER(?))"
        exact_params = [country_code, city_name, city_name, city_name]

        admin1_code = swiss_admin1_map.get(region_code)
        if admin1_code:
            # Accept the canton abbreviation as well as the numeric code:
            # the database may store admin1 either way, and a filter on
            # the numeric form alone would then never match.
            exact_query += " AND admin1_code IN (?, ?)"
            exact_params += [admin1_code, region_code]

        cursor.execute(exact_query + ranking, exact_params)
        row = cursor.fetchone()

        if not row:
            # Try fuzzy (prefix) match, deliberately without canton filter
            cursor.execute(
                base_query + " AND (name LIKE ? OR ascii_name LIKE ?)" + ranking,
                (country_code, f"{city_name}%", f"{city_name}%"),
            )
            row = cursor.fetchone()

        return dict(zip(result_keys, row)) if row else None

    except Exception as e:
        print(f" GeoNames lookup error: {e}")
        return None
    finally:
        # Close on every path: success, no match, and error.
        if conn is not None:
            conn.close()
isinstance(ident, dict) and ident.get('identifier_scheme') == 'ISIL': + url = ident.get('identifier_url', '') + if 'isil.nb.admin.ch' in url: + isil_url = url + break + + if not isil_url: + result['status'] = 'no_isil_url' + result['error'] = f'No ISIL URL found for: {inst_name}' + return result + + # Convert to proper page URL format + if '?isil=' in isil_url: + isil_code = isil_url.split('?isil=')[-1] + # Convert to institution page URL + isil_url = f"https://www.isil.nb.admin.ch/en/?isil={isil_code}" + + # Fetch city data from ISIL website + time.sleep(REQUEST_DELAY) + isil_data = fetch_isil_page(isil_url, session) + + if not isil_data or not isil_data.get('city'): + result['status'] = 'no_city_found' + return result + + city_name = isil_data['city'] + result['city'] = city_name + + # Get region from GHCID or ISIL data + parts = ghcid_current.split('-') + region_code = parts[1] if len(parts) > 1 else isil_data.get('region', 'XX') + + # Generate city code + city_code = generate_city_code(city_name) + + # Try to get GeoNames data for coordinates + geonames_data = reverse_geocode_city(city_name, region_code, 'CH', GEONAMES_DB) + + # Build new GHCID + # Format: CH-{region}-{city}-{type}-{abbrev}[-{suffix}] + new_ghcid = ghcid_current.replace('-XXX-', f'-{city_code}-') + result['new_ghcid'] = new_ghcid + + if new_ghcid == ghcid_current: + result['status'] = 'unchanged' + return result + + if dry_run: + result['status'] = 'would_update' + return result + + # Update the data + now = datetime.now(timezone.utc).isoformat() + + # Update GHCID + data['ghcid']['ghcid_current'] = new_ghcid + data['ghcid']['ghcid_uuid'] = generate_ghcid_uuid(new_ghcid) + data['ghcid']['ghcid_uuid_sha256'] = generate_ghcid_uuid_sha256(new_ghcid) + data['ghcid']['ghcid_numeric'] = generate_ghcid_numeric(new_ghcid) + + # Update location_resolution + location_resolution = { + 'method': 'SWISS_ISIL_ENRICHMENT', + 'city_name': city_name, + 'city_code': city_code, + 'region_code': region_code, + 
'country_code': 'CH', + 'enrichment_date': now, + 'source_url': isil_url + } + + if geonames_data: + location_resolution.update({ + 'geonames_id': geonames_data['geonames_id'], + 'geonames_name': geonames_data['geonames_name'], + 'feature_code': geonames_data['feature_code'], + 'population': geonames_data['population'], + 'latitude': geonames_data['latitude'], + 'longitude': geonames_data['longitude'] + }) + + data['ghcid']['location_resolution'] = location_resolution + + # Add GHCID history entry + history = data['ghcid'].get('ghcid_history', []) + if history: + # Close previous entry + history[0]['valid_to'] = now + + history.insert(0, { + 'ghcid': new_ghcid, + 'ghcid_numeric': data['ghcid']['ghcid_numeric'], + 'valid_from': now, + 'valid_to': None, + 'reason': f'City code updated from Swiss ISIL enrichment: {city_name} -> {city_code}' + }) + data['ghcid']['ghcid_history'] = history + + # Update location in original_entry if exists + if 'locations' in data.get('original_entry', {}): + for loc in data['original_entry']['locations']: + if isinstance(loc, dict) and not loc.get('city'): + loc['city'] = city_name + if isil_data.get('postal_code'): + loc['postal_code'] = isil_data['postal_code'] + if isil_data.get('street_address'): + loc['street_address'] = isil_data['street_address'] + + # Update identifiers + for ident in data.get('identifiers', []): + if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID': + ident['identifier_value'] = new_ghcid + + # Write updated file + with open(file_path, 'w', encoding='utf-8') as f: + yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + # Rename file if GHCID changed + new_filename = f"{new_ghcid}.yaml" + new_path = file_path.parent / new_filename + + if new_path != file_path and not new_path.exists(): + shutil.move(file_path, new_path) + result['renamed_to'] = str(new_path.name) + + result['status'] = 'updated' + return result + + except Exception as e: + result['status'] = 
def main():
    """Command-line entry point for the Swiss ISIL city enrichment.

    Finds custodian files matching ``CH-*-XXX-*.yaml``, runs ``process_file``
    on each one, prints a summary, and writes a Markdown report into
    ``REPORTS_DIR``.

    Flags: ``--dry-run`` (no files are modified), ``--limit N`` (process at
    most N files), ``--verbose`` / ``-v`` (per-file output).
    """
    parser = argparse.ArgumentParser(description='Enrich Swiss ISIL custodian files with city data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show verbose output')
    args = parser.parse_args()

    print("=" * 60)
    print("SWISS ISIL CITY ENRICHMENT")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")

    # Find Swiss files with XXX city placeholder.  Sorted so that --limit
    # selects the same files on every run (glob order is not guaranteed).
    swiss_xxx_files = sorted(CUSTODIAN_DIR.glob("CH-*-XXX-*.yaml"))

    if args.limit:
        swiss_xxx_files = swiss_xxx_files[:args.limit]
        print(f"Limited to {args.limit} files")

    print(f"Found {len(swiss_xxx_files)} Swiss files with XXX city placeholder")
    print()

    # Load Swiss ISIL lookup from CH-Annotator source file
    isil_lookup = load_swiss_isil_lookup()

    stats = {
        'updated': 0,
        'would_update': 0,
        'unchanged': 0,
        'skipped': 0,
        'no_isil_url': 0,
        'no_city_found': 0,
        'error': 0
    }

    cities_found = {}
    errors = []

    # Process files; the context manager releases pooled connections.
    with requests.Session() as session:
        session.headers['User-Agent'] = 'GLAMDataExtractor/1.0 (heritage-data-enrichment)'

        for i, file_path in enumerate(swiss_xxx_files, 1):
            if i % 100 == 0 or args.verbose:
                print(f"Progress: {i}/{len(swiss_xxx_files)}")

            result = process_file(file_path, session, isil_lookup, dry_run=args.dry_run)
            stats[result['status']] = stats.get(result['status'], 0) + 1

            if result.get('city'):
                cities_found[result['city']] = cities_found.get(result['city'], 0) + 1

            if result.get('error'):
                errors.append(f"{file_path.name}: {result['error']}")

            if args.verbose and result['status'] in ('updated', 'would_update'):
                print(f" {file_path.name}")
                print(f" City: {result.get('city')}")
                print(f" {result['old_ghcid']} -> {result['new_ghcid']}")

    # Shared by the console summary and the report.
    status_counts = [(status, count) for status, count in sorted(stats.items()) if count > 0]
    cities_by_count = sorted(cities_found.items(), key=lambda x: -x[1])

    # Print summary
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {len(swiss_xxx_files)}")
    print()
    print("Results:")
    for status, count in status_counts:
        print(f" {status}: {count}")

    if cities_found:
        print()
        print(f"Cities found: {len(cities_found)} unique")
        print("Top 10 cities:")
        for city, count in cities_by_count[:10]:
            print(f" {city}: {count}")

    if errors:
        print()
        print(f"Errors ({len(errors)}):")
        for err in errors[:10]:
            print(f" {err}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")

    # Save report.  One timestamp is used for both the filename and the body.
    generated_at = datetime.now()
    REPORTS_DIR.mkdir(exist_ok=True)
    report_file = REPORTS_DIR / f"SWISS_ISIL_ENRICHMENT_{generated_at.strftime('%Y%m%d_%H%M%S')}.md"

    # Explicit UTF-8: city names contain umlauts and accents, which the
    # platform default encoding may be unable to write.
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("# Swiss ISIL City Enrichment Report\n\n")
        f.write(f"**Date**: {generated_at.isoformat()}\n")
        f.write(f"**Mode**: {'Dry Run' if args.dry_run else 'Live'}\n\n")
        f.write("## Summary\n\n")
        f.write(f"- Total files processed: {len(swiss_xxx_files)}\n")
        for status, count in status_counts:
            f.write(f"- {status}: {count}\n")

        if cities_found:
            f.write(f"\n## Cities Found ({len(cities_found)} unique)\n\n")
            for city, count in cities_by_count:
                f.write(f"- {city}: {count}\n")

    print()
    print(f"Report saved to: {report_file}")
+ +This script follows CH-Annotator v1.7.0 TOPONYM (TOP) hypernym for: +- TOP.SET: Settlements (cities, towns, villages) +- TOP.REG: Regions (provinces, states) +- TOP.CTY: Countries + +Following AGENTS.md Rules: +- Rule 5: Additive only - never delete existing data +- Rule 10: CH-Annotator is the entity annotation convention +- GHCID settlement standardization: GeoNames is authoritative +""" + +import os +import sys +import yaml +import sqlite3 +import re +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional, Dict, Any, List, Tuple + +# GeoNames database path +GEONAMES_DB = Path(__file__).parent.parent / "data/reference/geonames.db" + +# Feature codes for proper settlements (EXCLUDE PPLX neighborhoods) +SETTLEMENT_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') + +# Admin1 to ISO 3166-2 mappings by country +ADMIN1_TO_ISO = { + 'BE': { + 'BRU': 'BRU', # Brussels-Capital + 'VLG': 'VLG', # Flanders + 'WAL': 'WAL', # Wallonia + 'VAN': 'VAN', # Antwerp + 'VBR': 'VBR', # Flemish Brabant + 'VLI': 'VLI', # Limburg + 'VOV': 'VOV', # East Flanders + 'VWV': 'VWV', # West Flanders + 'WBR': 'WBR', # Walloon Brabant + 'WHT': 'WHT', # Hainaut + 'WLG': 'WLG', # Liège + 'WLX': 'WLX', # Luxembourg + 'WNA': 'WNA', # Namur + }, + 'AT': { + '01': '1', # Burgenland + '02': '2', # Kärnten + '03': '3', # Niederösterreich + '04': '4', # Oberösterreich + '05': '5', # Salzburg + '06': '6', # Steiermark + '07': '7', # Tirol + '08': '8', # Vorarlberg + '09': '9', # Wien + }, + 'BG': { + '42': '22', # Sofia City + '41': '23', # Sofia Province + '01': '01', # Blagoevgrad + '02': '02', # Burgas + '03': '03', # Varna + '04': '04', # Veliko Tarnovo + '05': '05', # Vidin + '06': '06', # Vratsa + '07': '07', # Gabrovo + '08': '08', # Dobrich + '09': '09', # Kardzhali + '10': '10', # Kyustendil + '11': '11', # Lovech + '12': '12', # Montana + '13': '13', # Pazardzhik + '14': '14', # Pernik + '15': '15', # Pleven + '16': 
def extract_toponym_from_name(name: str, country: str) -> Optional[str]:
    """
    Extract TOPONYM (TOP.SET) from institution name using CH-Annotator patterns.

    The heuristics are tried in order and the first hit wins:

    1. The word following an institution keyword ("bibliotheek", "museum",
       "archief", ...), unless it is a generic filler word.
    2. Parenthetical content: "(Bib CityName)" yields the city; otherwise
       content of at most three words starting with a capital is returned.
    3. A hyphenated compound that is one of the known Belgian compound
       city names.
    4. The last word, when capitalized and not a legal form or
       institution keyword.

    Args:
        name: Institution name.
        country: Country code; accepted for interface compatibility and
            currently not used by any heuristic.

    Returns extracted city name or None.
    """
    if not name:
        return None

    # Normalize
    name_lower = name.lower()

    # Pattern 1: Explicit city indicators
    # "bibliotheek [CityName]", "museum [CityName]", etc.
    city_patterns = [
        r'bibliotheek\s+(\w+)',
        r'bibliothek\s+(\w+)',
        r'museum\s+(\w+)',
        r'archief\s+(\w+)',
        r'archiv\s+(\w+)',
        r'archive\s+(\w+)',
        r'openbare\s+bibliotheek\s+(\w+)',
        r'gemeentelijke.*bibliotheek\s+(\w+)',
        r'stedelijke.*bibliotheek\s+(\w+)',
        r'stadsarchief\s+(\w+)',
    ]
    generic_words = ('van', 'de', 'het', 'der', 'voor', 'en', 'vzw', 'bv', 'nv')

    for pattern in city_patterns:
        match = re.search(pattern, name_lower)
        if match:
            city = match.group(1)
            # Filter out generic words
            if city not in generic_words:
                return city.title()

    # Pattern 2: Parenthetical city names
    # "Institution Name (City)" or "City Name (Alias)"
    paren_match = re.search(r'\(([^)]+)\)', name)
    if paren_match:
        paren_content = paren_match.group(1).strip()
        # Check for "(Bib CityName)" pattern - extract the word after it
        bib_match = re.match(r'(?:Bib|OB|POB|Bibliotheek)\s+(\w+)', paren_content, re.IGNORECASE)
        if bib_match:
            return bib_match.group(1).title()
        # Check if it looks like a city name (capitalized, not too long).
        # ``words`` is empty for blank parentheses such as "( )", so it
        # must be tested before indexing.
        words = paren_content.split()
        if words and len(words) <= 3 and words[0][0].isupper():
            return paren_content

    # Pattern 3: Hyphenated city names (Belgian pattern)
    # "Brussel-Stad", "Sint-Niklaas"
    known_compounds = ('sint-niklaas', 'sint-truiden', 'brussel-stad',
                       'la-louvière', 'molenbeek-saint-jean')
    # Check every compound, not only the first one, and allow more than two
    # parts so that three-part names like "molenbeek-saint-jean" can match.
    for compound in re.findall(r'\w+(?:-\w+)+', name):
        if compound.lower() in known_compounds:
            return compound.title()

    # Pattern 4: Last word as city (common pattern)
    # "Historisch Museum [CityName]"
    words = name.split()
    if len(words) >= 2:
        last_word = words[-1].strip('()')
        # ``last_word`` is empty when the token was only parentheses.
        if (last_word and last_word[0].isupper() and
                last_word.lower() not in ('vzw', 'bv', 'nv', 'asbl', 'bibliotheek',
                                          'museum', 'archief', 'archiv')):
            return last_word

    return None
country: str, conn: sqlite3.Connection) -> Optional[Dict]: + """ + Look up a city name in GeoNames database. + + Returns dict with: + - geonames_id + - name (ascii_name) + - admin1_code + - region_code (ISO 3166-2) + - latitude, longitude + """ + cursor = conn.cursor() + + # Try exact match first - include admin2_code for countries that use it (Belgium) + cursor.execute(""" + SELECT geonames_id, name, ascii_name, admin1_code, admin2_code, latitude, longitude, feature_code, population + FROM cities + WHERE country_code = ? + AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') + AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?)) + ORDER BY population DESC + LIMIT 1 + """, (country, city_name, city_name)) + + row = cursor.fetchone() + + if not row: + # Try partial match - but require minimum 4 chars to avoid false positives + if len(city_name) >= 4: + cursor.execute(""" + SELECT geonames_id, name, ascii_name, admin1_code, admin2_code, latitude, longitude, feature_code, population + FROM cities + WHERE country_code = ? + AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') + AND (LOWER(name) LIKE LOWER(?) 
def generate_city_code(city_name: str) -> str:
    """Generate an upper-case city code of up to three letters from a name.

    A single-word name yields its first three letters; a multi-word name
    yields the initials of its first three words.  Diacritics are removed so
    the code is plain ASCII for Latin-script names.  Empty or whitespace-only
    input yields the ``'XXX'`` placeholder.

    Args:
        city_name: City name, e.g. a GeoNames ``ascii_name``.

    Returns:
        The city code, e.g. ``'GEN'`` for "Gent" or ``'DH'`` for "Den Haag".
    """
    # Function-scope import keeps this block self-contained.
    import unicodedata

    if not city_name:
        return 'XXX'

    # Decompose accented letters and drop the combining marks (category Mn).
    decomposed = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

    words = ascii_name.split()
    if not words:
        return 'XXX'

    if len(words) == 1:
        # Slice the word itself, not the raw input, so surrounding
        # whitespace never ends up in the code.
        return words[0][:3].upper()

    # Use initials for multi-word names
    return ''.join(w[0] for w in words[:3]).upper()
'XX') + old_city = loc_res.get('city_code', 'XXX') + + if not country_code: + return False, None + + # Only update if we have XX or XXX to resolve + if old_region != 'XX' and old_city != 'XXX': + return False, None + + region_code = location_data['region_code'] + city_code = generate_city_code(location_data['geonames_name']) + + # Update location resolution with CH-Annotator provenance + if old_region == 'XX': + loc_res['region_code'] = region_code + if old_city == 'XXX': + loc_res['city_code'] = city_code + loc_res['city_name'] = location_data['geonames_name'] + + loc_res['geonames_id'] = location_data['geonames_id'] + loc_res['feature_code'] = location_data['feature_code'] + loc_res['method'] = 'CH_ANNOTATOR_TOP_SET' + loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat() + loc_res['extracted_toponym'] = city_name + + if location_data.get('latitude'): + loc_res['latitude'] = location_data['latitude'] + loc_res['longitude'] = location_data['longitude'] + + # Update GHCID string + old_ghcid = ghcid.get('ghcid_current', '') + new_ghcid = old_ghcid + + if old_region == 'XX': + new_ghcid = new_ghcid.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-') + if old_city == 'XXX': + new_ghcid = new_ghcid.replace(f'-XXX-', f'-{city_code}-') + + if new_ghcid != old_ghcid: + ghcid['ghcid_current'] = new_ghcid + + if 'ghcid_history' not in ghcid: + ghcid['ghcid_history'] = [] + + ghcid['ghcid_history'].append({ + 'ghcid': new_ghcid, + 'valid_from': datetime.now(timezone.utc).isoformat(), + 'reason': f"Location resolved via CH-Annotator TOP.SET extraction: {city_name} -> {location_data['geonames_name']} (GeoNames:{location_data['geonames_id']})" + }) + + # Add CH-Annotator entity claim for location + if 'ch_annotator' not in data: + data['ch_annotator'] = {} + + if 'entity_claims' not in data['ch_annotator']: + data['ch_annotator']['entity_claims'] = [] + + # Add TOP.SET claim + data['ch_annotator']['entity_claims'].append({ + 'claim_type': 
'location_settlement', + 'claim_value': location_data['geonames_name'], + 'property_uri': 'schema:location', + 'hypernym_code': 'TOP.SET', + 'hypernym_label': 'SETTLEMENT', + 'provenance': { + 'namespace': 'geonames', + 'path': f"/geonames/{location_data['geonames_id']}", + 'timestamp': datetime.now(timezone.utc).isoformat(), + 'agent': 'extract_locations_ch_annotator.py', + 'context_convention': 'ch_annotator-v1_7_0', + }, + 'confidence': 0.85, + 'extraction_source': { + 'field': 'institution_name', + 'extracted_text': city_name, + 'method': 'pattern_matching', + }, + }) + + # Add provenance note + if 'provenance' not in data: + data['provenance'] = {} + if 'notes' not in data['provenance']: + data['provenance']['notes'] = [] + elif isinstance(data['provenance']['notes'], str): + data['provenance']['notes'] = [data['provenance']['notes']] + + data['provenance']['notes'].append( + f"Location resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: " + f"CH-Annotator TOP.SET extraction '{city_name}' -> {location_data['geonames_name']} " + f"(GeoNames:{location_data['geonames_id']}, Region:{region_code})" + ) + + # Determine new filename + new_filename = filepath.name + if old_region == 'XX': + new_filename = new_filename.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-') + if old_city == 'XXX': + new_filename = new_filename.replace(f'-XXX-', f'-{city_code}-') + + new_filepath = filepath.parent / new_filename + + if not dry_run: + with open(filepath, 'w', encoding='utf-8') as f: + yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + if new_filepath != filepath and not new_filepath.exists(): + filepath.rename(new_filepath) + + return True, new_filepath if new_filepath != filepath else None + + +def main(): + """Main entry point.""" + import argparse + + parser = argparse.ArgumentParser( + description='Extract locations using CH-Annotator TOPONYM convention' + ) + parser.add_argument('--apply', 
action='store_true', + help='Actually apply the fixes (default: dry run)') + parser.add_argument('--path', type=str, default='data/custodian', + help='Path to custodian files directory') + parser.add_argument('--limit', type=int, default=100, + help='Limit number of files to process') + parser.add_argument('--country', type=str, + help='Only process files for a specific country') + + args = parser.parse_args() + + custodian_dir = Path(args.path) + if not custodian_dir.exists(): + print(f"Error: Directory {custodian_dir} does not exist") + sys.exit(1) + + # Connect to GeoNames + conn = connect_geonames() + if not conn: + sys.exit(1) + + dry_run = not args.apply + + print("=" * 70) + print("CH-ANNOTATOR TOPONYM (TOP.SET) LOCATION EXTRACTION") + print("=" * 70) + print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}") + print(f"Convention: ch_annotator-v1_7_0") + print() + + # Find files with XX region codes or XXX city codes + files_to_process = [] + + for filepath in custodian_dir.glob('*-XX-*.yaml'): + files_to_process.append(filepath) + for filepath in custodian_dir.glob('*-XXX-*.yaml'): + if filepath not in files_to_process: + files_to_process.append(filepath) + + print(f"Found {len(files_to_process)} files with XX/XXX codes") + + # Process files + file_data = [] + files_processed = 0 + for filepath in files_to_process: + # Apply limit AFTER country filtering + if len(file_data) >= args.limit: + break + + try: + with open(filepath, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + + # Get country code + country = None + if 'ghcid' in data and 'location_resolution' in data['ghcid']: + country = data['ghcid']['location_resolution'].get('country_code') + + if not country: + continue + + if args.country and country != args.country: + continue + + # Get institution name + name = None + if 'custodian_name' in data: + name = data['custodian_name'].get('claim_value') + if not name and 'original_entry' in data: + name = data['original_entry'].get('name') + 
+ if not name: + continue + + file_data.append({ + 'filepath': filepath, + 'data': data, + 'country': country, + 'name': name, + }) + except Exception as e: + print(f"Error loading {filepath}: {e}") + + print(f"Processing {len(file_data)} files") + print() + + # Process each file + resolved = 0 + renamed = 0 + no_toponym = 0 + no_geonames = 0 + + for f in file_data: + filepath = f['filepath'] + name = f['name'] + country = f['country'] + + # Extract toponym using CH-Annotator patterns + toponym = extract_toponym_from_name(name, country) + + if not toponym: + no_toponym += 1 + continue + + # Look up in GeoNames + location = lookup_city_in_geonames(toponym, country, conn) + + if not location: + no_geonames += 1 + print(f" No GeoNames match for '{toponym}' in {country}") + continue + + print(f"Processing {filepath.name}...") + print(f" Name: {name}") + print(f" TOP.SET: {toponym} -> {location['geonames_name']} (Region: {location['region_code']})") + + # Update file + success, new_path = update_file_with_location(filepath, location, toponym, dry_run=dry_run) + + if success: + resolved += 1 + if new_path: + renamed += 1 + print(f" Renamed: {filepath.name} -> {new_path.name}") + + conn.close() + + print() + print("=" * 70) + print("SUMMARY") + print("=" * 70) + print(f"Files processed: {len(file_data)}") + print(f"Resolved: {resolved}") + print(f"Renamed: {renamed}") + print(f"No toponym extracted: {no_toponym}") + print(f"No GeoNames match: {no_geonames}") + + if dry_run: + print() + print("This was a DRY RUN. Use --apply to make changes.") + + +if __name__ == '__main__': + main() diff --git a/scripts/fix_belgian_cities.py b/scripts/fix_belgian_cities.py new file mode 100644 index 0000000000..6e9619d303 --- /dev/null +++ b/scripts/fix_belgian_cities.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +""" +Fix remaining Belgian XXX files by re-scraping ISIL website with correct city extraction. 
def scrape_isil_city(isil_code):
    """Scrape the city and postal code for an ISIL code from the Belgian ISIL site.

    Fetches ``https://isil.kbr.be/<isil_code>`` and searches the page for a
    four-digit postal code followed by a city name, first in the
    "Street 123, POSTCODE City" form and then without the leading comma.

    Args:
        isil_code: ISIL identifier appended to the site URL.

    Returns:
        ``(city, postal_code)`` on a match, otherwise ``(None, None)``. Any
        error while fetching or decoding is printed and also yields
        ``(None, None)``.
    """
    url = f"https://isil.kbr.be/{isil_code}"
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0 GLAM-Scraper/1.0'})
        with urlopen(req, timeout=10) as response:
            # Honour the charset the server declares; fall back to UTF-8 only
            # when none is given, so accented city names are decoded correctly.
            charset = response.headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset)

        # Look for address pattern: "Street 123, POSTCODE City"
        match = re.search(r',\s*(\d{4})\s+([A-Za-zÀ-ÿ\s\-\']+)', html)
        if match:
            postal_code = match.group(1)
            city = match.group(2).strip()
            return city, postal_code

        # Alternative pattern: postal code and city without a leading comma
        match = re.search(r'(\d{4})\s+([A-Za-zÀ-ÿ\s\-\']+)', html)
        if match:
            return match.group(2).strip(), match.group(1)

    except Exception as e:
        # Best-effort scrape: report and let the caller record the miss.
        print(f"  Error scraping {isil_code}: {e}")

    return None, None
def generate_city_code(city_name):
    """Generate an upper-case city code (up to 3 letters) from a city name.

    Diacritics are folded to ASCII and everything except letters is dropped.
    A single-word name yields its first three letters; a name starting with
    an article yields the article's initial plus the first two letters of the
    next word; any other multi-word name yields the initials of its first
    three words.

    Returns ``'XXX'`` (the unresolved-city placeholder) when the name
    contains no usable letters.
    """
    normalized = unicodedata.normalize('NFD', city_name or '')
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name)

    # Hyphens stay inside a word while splitting (so "Sint-Niklaas" is one
    # word) but must not reach the code itself, which is embedded in a
    # hyphen-delimited identifier.
    words = [w for w in (part.replace('-', '') for part in clean.split()) if w]

    articles = {'de', 'het', 'le', 'la', 'les', 'den', 'der', 'des'}

    if not words:
        return 'XXX'
    if len(words) == 1:
        # Slice the word, not the raw string, so padding never ends up in the code.
        return words[0][:3].upper()
    if words[0].lower() in articles:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()
f'region_code: {region_code}', content) + content = re.sub(r'city_code:\s*XXX', f'city_code: {city_code}', content) + + # Add resolution details + timestamp = datetime.now(timezone.utc).isoformat() + history_entry = f""" + - ghcid: {new_ghcid} + valid_from: '{timestamp}' + reason: City resolved via {method} - {geo_data['name']} (GeoNames ID {geo_data['geonames_id']})""" + + history_match = re.search(r'(ghcid_history:\s*\n)', content) + if history_match: + insert_pos = history_match.end() + content = content[:insert_pos] + history_entry + content[insert_pos:] + + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + + # Rename file + old_filename = file_path.name + new_filename = old_filename.replace('BE-XX-XXX-', f'BE-{region_code}-{city_code}-') + if new_filename != old_filename: + new_path = file_path.parent / new_filename + file_path.rename(new_path) + + return True + +def main(): + import sys + dry_run = '--dry-run' in sys.argv + + base_dir = Path(__file__).parent.parent + custodian_dir = base_dir / 'data' / 'custodian' + geonames_db = base_dir / 'data' / 'reference' / 'geonames.db' + + print("Belgian City Fix Script") + print("=" * 50) + if dry_run: + print("DRY RUN MODE\n") + + conn = sqlite3.connect(str(geonames_db)) + + xxx_files = list(custodian_dir.glob('BE-*-XXX-*.yaml')) + print(f"Found {len(xxx_files)} Belgian XXX files\n") + + updated = 0 + not_found = [] + + for file_path in xxx_files: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Get ISIL code + isil_match = re.search(r'identifier_value:\s*(BE-\w+)', content) + if not isil_match: + continue + + isil_code = isil_match.group(1) + + # Scrape city from website + city, postal = scrape_isil_city(isil_code) + if not city: + print(f"✗ {file_path.name}: No city found for {isil_code}") + not_found.append((file_path.name, isil_code, 'scrape failed')) + time.sleep(1) + continue + + # Lookup in GeoNames + geo_data = lookup_city(city, conn) + if not geo_data: + 
def extract_city_from_name(name):
    """Map an institution name to an Egyptian city by keyword.

    The name is lower-cased and checked against the keyword groups below in
    order; the first group with a keyword contained in the name decides the
    city. Names matching no group fall back to ``'Cairo'``.
    """
    lowered = name.lower()

    # Order matters: earlier groups win when a name matches several.
    keyword_groups = (
        (('cairo', 'ain shams', 'helwan'), 'Cairo'),
        (('alexandria',), 'Alexandria'),
        (('assiut', 'asyut'), 'Assiut'),
        (('giza', 'october'), 'Giza'),
        # Most Egyptian institutions without an explicit city are in Cairo
        (('nile', 'maadi'), 'Cairo'),
        # National institutions default to Cairo
        (('egypt', 'egyptian'), 'Cairo'),
    )

    for keywords, city in keyword_groups:
        if any(keyword in lowered for keyword in keywords):
            return city

    return 'Cairo'  # Default
def main():
    """Migrate Egyptian institutions filed under CH-XX-XXX to the EG namespace.

    Scans ``data/custodian/CH-XX-XXX-*.yaml``, takes the first ``claim_value``
    in each file as the institution name, and migrates the file when that name
    looks Egyptian. Pass ``--dry-run`` on the command line to only print the
    GHCID changes.
    """
    import sys
    dry_run = '--dry-run' in sys.argv

    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'

    print("Egyptian Institution Migration (CH → EG)")
    print("=" * 50)
    if dry_run:
        print("DRY RUN MODE\n")

    # Find CH-XX-XXX files that are actually Egyptian
    xxx_files = list(custodian_dir.glob('CH-XX-XXX-*.yaml'))
    print(f"Found {len(xxx_files)} CH-XX-XXX files\n")

    migrated = 0

    # Distinctive names are matched as substrings (so 'egypt' also covers
    # 'egyptian'). Short or common fragments must match as whole words,
    # otherwise unrelated Swiss names such as "juvénile" (contains 'nile')
    # would be migrated to Egypt.
    substring_keywords = ['egypt', 'cairo', 'alexandria', 'ain shams', 'helwan', 'assiut',
                          'giza', 'al-azhar', 'dar al-kutub']
    whole_word_keywords = ['nile', 'guc', 'auc', 'bue']
    egyptian_pattern = re.compile(
        '|'.join(re.escape(keyword) for keyword in substring_keywords)
        + r'|\b(?:' + '|'.join(re.escape(keyword) for keyword in whole_word_keywords) + r')\b'
    )

    for file_path in xxx_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Check if this is an Egyptian institution
        name_match = re.search(r'claim_value:\s*(.+)', content)
        if not name_match:
            continue

        inst_name = name_match.group(1).strip().lower()

        if not egyptian_pattern.search(inst_name):
            continue

        city = extract_city_from_name(inst_name)
        success, ghcid_change = update_file(file_path, city, dry_run)

        if success:
            if dry_run:
                print(f"  {file_path.name}")
                print(f"    → {ghcid_change[0]} → {ghcid_change[1]}")
            else:
                print(f"✓ Migrated: {file_path.name} → {city}")
            migrated += 1

    print(f"\n{'=' * 50}")
    print(f"Migrated: {migrated}")
b/scripts/migrate_web_archives.py new file mode 100644 index 0000000000..2dd5987362 --- /dev/null +++ b/scripts/migrate_web_archives.py @@ -0,0 +1,426 @@ +#!/usr/bin/env python3 +""" +Migrate web archives from /data/nde/enriched/entries/web/ to /data/custodian/{GHCID}/web/ + +This script: +1. Builds a mapping from entry_index -> GHCID by scanning custodian files +2. Moves (or symlinks) web archive folders to the appropriate custodian folder +3. Creates a DuckDB database with web archive metadata for DuckLake ingestion + +Usage: + python scripts/migrate_web_archives.py --dry-run # Preview changes + python scripts/migrate_web_archives.py --execute # Actually migrate + python scripts/migrate_web_archives.py --build-ducklake # Create DuckDB tables +""" + +import os +import sys +import re +import yaml +import shutil +import argparse +import logging +from pathlib import Path +from datetime import datetime +from typing import Dict, Optional, List, Any +import json + +# Try to import duckdb for DuckLake ingestion +try: + import duckdb + HAS_DUCKDB = True +except ImportError: + HAS_DUCKDB = False + print("Warning: duckdb not installed. DuckLake ingestion disabled.") + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Paths +BASE_DIR = Path("/Users/kempersc/apps/glam") +CUSTODIAN_DIR = BASE_DIR / "data" / "custodian" +WEB_ARCHIVE_SOURCE = BASE_DIR / "data" / "nde" / "enriched" / "entries" / "web" +DUCKLAKE_DB = BASE_DIR / "data" / "ducklake" / "web_archives.duckdb" +MAPPING_FILE = WEB_ARCHIVE_SOURCE / "_entry_to_ghcid.txt" + + +def build_entry_index_to_ghcid_mapping() -> Dict[int, str]: + """ + Load mapping from pre-built file (created via ripgrep for speed). + Falls back to scanning YAML files if file doesn't exist. 
def migrate_web_archive(source_folder: Path, ghcid: str, dry_run: bool = True) -> bool:
    """
    Copy a web archive's domain folders into the custodian's web/ folder.

    Each domain subfolder of ``source_folder`` is copied to
    ``CUSTODIAN_DIR / ghcid / "web" / <domain>``. The source is left in place.
    A domain whose target already exists is skipped with a warning.

    Each copy is made into a hidden ``.<domain>.partial`` staging folder and
    renamed to its final name only once complete, so an interrupted or failed
    copy is never mistaken for a finished archive on a later run.

    Args:
        source_folder: Path to source web archive (e.g., .../web/0183/)
        ghcid: Target GHCID (e.g., "NL-GE-GEN-S-HKG")
        dry_run: If True, only preview changes

    Returns:
        True if every domain folder was handled without error; False if there
        are no domain folders or a copy failed.
    """
    target_dir = CUSTODIAN_DIR / ghcid / "web"

    # Find domain subfolder(s)
    domain_folders = [d for d in source_folder.iterdir() if d.is_dir()]

    if not domain_folders:
        logger.warning(f"No domain folders in {source_folder}")
        return False

    for domain_folder in domain_folders:
        domain_name = domain_folder.name
        target_path = target_dir / domain_name

        if dry_run:
            logger.info(f"[DRY-RUN] Would migrate: {domain_folder} -> {target_path}")
            continue

        staging_path = target_dir / f".{domain_name}.partial"
        try:
            target_dir.mkdir(parents=True, exist_ok=True)
            if target_path.exists():
                logger.warning(f"Target already exists: {target_path}")
                continue
            # Drop leftovers of an earlier interrupted run before copying.
            shutil.rmtree(staging_path, ignore_errors=True)
            shutil.copytree(domain_folder, staging_path)
            staging_path.rename(target_path)
            logger.info(f"Migrated: {domain_folder} -> {target_path}")
        except Exception as e:
            logger.error(f"Failed to migrate {domain_folder}: {e}")
            # Only the staging folder is removed; a final target is never touched.
            shutil.rmtree(staging_path, ignore_errors=True)
            return False

    return True
Cannot build DuckLake database.") + return + + DUCKLAKE_DB.parent.mkdir(parents=True, exist_ok=True) + + con = duckdb.connect(str(DUCKLAKE_DB)) + + # Create tables + con.execute(""" + CREATE TABLE IF NOT EXISTS web_archives ( + ghcid VARCHAR PRIMARY KEY, + entry_index INTEGER, + domain VARCHAR, + url VARCHAR, + archive_timestamp TIMESTAMP, + archive_method VARCHAR, + total_pages INTEGER, + processed_pages INTEGER, + warc_file VARCHAR, + warc_size_bytes BIGINT, + has_annotations BOOLEAN DEFAULT FALSE + ) + """) + + con.execute(""" + CREATE TABLE IF NOT EXISTS web_pages ( + id INTEGER PRIMARY KEY, + ghcid VARCHAR, + page_title VARCHAR, + source_path VARCHAR, + archived_file VARCHAR, + extractions_count INTEGER, + FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid) + ) + """) + + con.execute(""" + CREATE TABLE IF NOT EXISTS web_claims ( + id INTEGER PRIMARY KEY, + ghcid VARCHAR, + claim_id VARCHAR, + claim_type VARCHAR, + text_content VARCHAR, + hypernym VARCHAR, + hyponym VARCHAR, + class_uri VARCHAR, + xpath VARCHAR, + recognition_confidence FLOAT, + linking_confidence FLOAT, + wikidata_id VARCHAR, + FOREIGN KEY (ghcid) REFERENCES web_archives(ghcid) + ) + """) + + # Clear existing data + con.execute("DELETE FROM web_claims") + con.execute("DELETE FROM web_pages") + con.execute("DELETE FROM web_archives") + + page_id = 0 + claim_id_counter = 0 + + web_folders = get_web_archive_folders() + logger.info(f"Processing {len(web_folders)} web archive folders for DuckLake...") + + for folder in web_folders: + entry_index = int(folder.name) + ghcid = mapping.get(entry_index) + + if not ghcid: + logger.debug(f"No GHCID mapping for entry {entry_index}") + continue + + # Find domain folder + domain_folders = [d for d in folder.iterdir() if d.is_dir()] + + for domain_folder in domain_folders: + metadata_path = domain_folder / "metadata.yaml" + if not metadata_path.exists(): + continue + + metadata = parse_metadata(metadata_path) + if not metadata: + continue + + # Check for 
annotations + annotations_path = domain_folder / "annotations_v1.7.0.yaml" + has_annotations = annotations_path.exists() + + # Parse warc info + warc_info = metadata.get('warc', {}) + + # Insert archive record + try: + archive_ts = metadata.get('archive_timestamp') + if archive_ts: + archive_ts = datetime.fromisoformat(archive_ts.replace('Z', '+00:00')) + + con.execute(""" + INSERT INTO web_archives VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, [ + ghcid, + entry_index, + domain_folder.name, + metadata.get('url'), + archive_ts, + metadata.get('archive_method'), + metadata.get('total_pages', 0), + metadata.get('processed_pages', 0), + warc_info.get('warc_file'), + warc_info.get('warc_size_bytes', 0), + has_annotations + ]) + except Exception as e: + logger.debug(f"Error inserting archive {ghcid}: {e}") + continue + + # Insert pages + for page in metadata.get('pages', []): + page_id += 1 + try: + con.execute(""" + INSERT INTO web_pages VALUES (?, ?, ?, ?, ?, ?) + """, [ + page_id, + ghcid, + page.get('title'), + page.get('source_path'), + page.get('archived_file'), + page.get('extractions_count', 0) + ]) + except Exception as e: + logger.debug(f"Error inserting page: {e}") + + # Insert claims from annotations + if has_annotations: + try: + with open(annotations_path, 'r', encoding='utf-8') as f: + annotations = yaml.safe_load(f) + + session = annotations.get('session', {}) + claims = session.get('claims', {}) + + # Process entity claims + for claim in claims.get('entity', []): + claim_id_counter += 1 + provenance = claim.get('provenance', {}) + con.execute(""" + INSERT INTO web_claims VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
def main():
    """Command-line entry point for the web archive migration.

    Modes (at least one flag is required, otherwise help is printed and the
    process exits with status 1):

    - ``--build-mapping``: print the size of the entry_index -> GHCID mapping
      and its first 20 entries.
    - ``--build-ducklake``: build the DuckDB database only.
    - ``--dry-run``: log what would be migrated without copying anything.
    - ``--execute``: actually migrate the archive folders.

    ``--dry-run`` and ``--execute`` contradict each other and are rejected
    when given together.
    """
    parser = argparse.ArgumentParser(description="Migrate web archives to custodian folders")
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without executing')
    parser.add_argument('--execute', action='store_true', help='Actually migrate files')
    parser.add_argument('--build-ducklake', action='store_true', help='Build DuckDB database only')
    parser.add_argument('--build-mapping', action='store_true', help='Just build and show mapping')
    args = parser.parse_args()

    if not any([args.dry_run, args.execute, args.build_ducklake, args.build_mapping]):
        parser.print_help()
        sys.exit(1)

    # Previously both flags together copied files for real while the summary
    # was labelled as a dry run; refuse the ambiguous combination instead.
    if args.dry_run and args.execute:
        parser.error("--dry-run and --execute cannot be used together")

    # Build the mapping
    mapping = build_entry_index_to_ghcid_mapping()

    if args.build_mapping:
        print(f"\nMapping has {len(mapping)} entries")
        print("\nSample entries:")
        for entry_idx, ghcid in sorted(mapping.items())[:20]:
            print(f"  {entry_idx:04d} -> {ghcid}")
        return

    if args.build_ducklake:
        build_ducklake_database(mapping)
        return

    # Migration mode: anything other than an explicit --execute is a dry run.
    dry_run = not args.execute

    web_folders = get_web_archive_folders()
    logger.info(f"Found {len(web_folders)} web archive folders")

    migrated = 0
    skipped = 0
    no_mapping = 0

    for folder in web_folders:
        entry_index = int(folder.name)
        ghcid = mapping.get(entry_index)

        if not ghcid:
            logger.debug(f"No GHCID for entry {entry_index}")
            no_mapping += 1
            continue

        success = migrate_web_archive(folder, ghcid, dry_run=dry_run)
        if success:
            migrated += 1
        else:
            skipped += 1

    print(f"\n{'[DRY-RUN] ' if dry_run else ''}Migration summary:")
    print(f"  - Migrated: {migrated}")
    print(f"  - Skipped: {skipped}")
    print(f"  - No mapping: {no_mapping}")
def generate_city_code(name: str) -> str:
    """Generate a short upper-case city code (at most 3 letters) from a city name.

    Rules, applied after stripping diacritics and non-letter characters:

    * no usable letters            -> ``'XXX'`` (the "unresolved" placeholder)
    * one word                     -> first three letters (``Amsterdam`` -> ``AMS``)
    * Dutch article + word         -> article initial + first two letters of
      the next word (``Den Haag`` -> ``DHA``, ``'s-Hertogenbosch`` -> ``SHE``)
    * several words                -> initials of the first three words

    Hyphens are treated as word separators. The code is embedded in a GHCID
    whose fields are themselves separated by ``-``, so a hyphen must never
    survive into the result.
    """
    import re
    import unicodedata

    # Decompose accented characters and drop the combining marks (e.g. u-umlaut -> u).
    decomposed = unicodedata.normalize('NFD', name)
    ascii_name = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Delete everything that is not a letter, whitespace or hyphen, then turn
    # hyphens into spaces so "'s-Hertogenbosch" splits into "s" + "Hertogenbosch".
    letters_only = re.sub(r'[^a-zA-Z\s-]', '', ascii_name).replace('-', ' ')
    words = letters_only.split()

    if not words:
        return 'XXX'

    # "'s" can never appear here (apostrophes are removed above); it is kept
    # in the set only to document the intent.
    dutch_articles = {'de', 'het', 'den', "'s", 's'}

    if len(words) == 1:
        return words[0][:3].upper()
    if words[0].lower() in dutch_articles:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(word[0] for word in words[:3]).upper()
def find_coords_in_file(data: Dict) -> Optional[tuple]:
    """Return the first usable ``(latitude, longitude, country)`` found in *data*.

    ``original_entry.locations`` is searched first, then the top-level
    ``locations`` list. A location is usable only when both coordinates are
    present and convertible to ``float``; entries with missing, null or
    non-numeric coordinates are skipped instead of being returned.

    The country comes from the location's own ``country`` key, falling back
    to ``ghcid.location_resolution.country_code`` and finally to ``'XX'``.

    Returns ``None`` when no usable location exists.
    """
    def _first_usable(locations, fallback_country):
        # ``locations`` may be None when the YAML key exists but is empty.
        for loc in locations or []:
            if not isinstance(loc, dict):
                continue
            try:
                lat = float(loc['latitude'])
                lon = float(loc['longitude'])
            except (KeyError, TypeError, ValueError):
                # Missing key, null value or unparsable string: try the next entry.
                continue
            return (lat, lon, loc.get('country') or fallback_country)
        return None

    ghcid = data.get('ghcid') or {}
    location_resolution = ghcid.get('location_resolution') or {}
    fallback_country = location_resolution.get('country_code') or 'XX'

    original_entry = data.get('original_entry') or {}

    # A result tuple is always truthy, so ``or`` selects the first hit.
    return (
        _first_usable(original_entry.get('locations'), fallback_country)
        or _first_usable(data.get('locations'), fallback_country)
    )
open(filepath, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + except Exception as e: + print(f" Error reading {filepath}: {e}") + return False + + if not data: + return False + + # Get coordinates from file + coords = find_coords_in_file(data) + if not coords: + return False + + lat, lon, country = coords + print(f" Coords: {lat:.4f}, {lon:.4f} ({country})") + + # Reverse geocode + city_data = reverse_geocode(lat, lon, country, conn) + if not city_data: + print(f" No GeoNames match for {country}") + return False + + city_code = generate_city_code(city_data['ascii_name']) + region_code = get_region_code(country, city_data['admin1_code'], city_data.get('admin2_code', '')) + + print(f" City: {city_data['name']} ({city_code}), Region: {region_code}") + + if not apply: + return True + + # Update GHCID + ghcid = data.get('ghcid', {}) + current = ghcid.get('ghcid_current', '') + + # Parse current GHCID + parts = current.split('-') + if len(parts) < 5: + print(f" Invalid GHCID format: {current}") + return False + + # Update city code (and region if still XX) + old_region = parts[1] + old_city = parts[2] + + if old_city != 'XXX': + print(f" City already resolved: {old_city}") + return False + + # Update parts + if old_region == 'XX' and region_code != 'XX': + parts[1] = region_code + parts[2] = city_code + + new_ghcid = '-'.join(parts) + + # Update data + ghcid['ghcid_current'] = new_ghcid + loc_res = ghcid.get('location_resolution', {}) + loc_res['city_code'] = city_code + loc_res['city_name'] = city_data['name'] + loc_res['geonames_id'] = city_data['geonames_id'] + loc_res['feature_code'] = city_data['feature_code'] + if old_region == 'XX' and region_code != 'XX': + loc_res['region_code'] = region_code + loc_res['method'] = 'REVERSE_GEOCODE_FROM_FILE_COORDS' + loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat() + ghcid['location_resolution'] = loc_res + + # Add to history + history = ghcid.get('ghcid_history', []) + history.append({ + 
def main():
    """Resolve ``XXX`` city codes for custodian files that already contain coordinates.

    Scans ``CUSTODIAN_DIR`` for ``*.yaml`` files whose name contains
    ``-XXX-`` (optionally restricted to ``--country``), keeps those whose
    text mentions both ``latitude:`` and ``longitude:``, and passes at most
    ``--limit`` of them to ``process_file``. Without ``--apply`` this is a
    dry run.

    Exits with status 1 when the GeoNames database file is missing.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Resolve XXX city codes using coordinates in files')
    parser.add_argument('--limit', type=int, default=100, help='Max files to process')
    parser.add_argument('--apply', action='store_true', help='Apply changes (default: dry run)')
    parser.add_argument('--country', help='Filter by country code')
    args = parser.parse_args()

    print("=" * 70)
    print("CITY RESOLUTION FROM FILE COORDINATES")
    print("=" * 70)
    print(f"Mode: {'APPLYING CHANGES' if args.apply else 'DRY RUN'}")
    print()

    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    # Sorted so that --limit selects the same files on every run; glob order
    # is otherwise filesystem-dependent.
    country_prefix = f'{args.country}-' if args.country else ''
    xxx_files = sorted(
        path for path in CUSTODIAN_DIR.glob('*.yaml')
        if '-XXX-' in path.name and path.name.startswith(country_prefix)
    )
    print(f"Found {len(xxx_files)} files with XXX codes")

    # Cheap textual pre-filter; process_file() does the real YAML parsing.
    files_with_coords = []
    for path in xxx_files:
        try:
            content = path.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as exc:
            # Still best-effort, but no longer silent.
            print(f" Skipping unreadable file {path.name}: {exc}")
            continue
        if 'latitude:' in content and 'longitude:' in content:
            files_with_coords.append(path)

    batch = files_with_coords[:args.limit]
    print(f"Processing {len(batch)} files with coordinates")
    print()

    resolved = 0
    renamed = 0

    conn = sqlite3.connect(str(GEONAMES_DB))
    try:
        for path in batch:
            print(f"Processing {path.name}...")
            if process_file(path, conn, args.apply):
                resolved += 1
                # process_file() only reports success, so every applied
                # resolution is counted as a rename.
                if args.apply:
                    renamed += 1
    finally:
        # Release the database handle even if a file blows up mid-run.
        conn.close()

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(batch)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
Reverse geocoding to GeoNames to find the nearest city + +Following AGENTS.md Rules: +- Rule 5: Additive only - never delete existing data +- GHCID settlement standardization: GeoNames is authoritative +""" + +import os +import sys +import yaml +import json +import time +import sqlite3 +import urllib.request +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional, Dict, Any, Tuple + +# GeoNames database +GEONAMES_DB = Path(__file__).parent.parent / "data/reference/geonames.db" + +# Feature codes for proper settlements (EXCLUDE PPLX neighborhoods) +SETTLEMENT_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') + + +def get_wikidata_location(wikidata_id: str) -> Optional[Tuple[float, float]]: + """Get coordinates from Wikidata entity using P625 or P159.""" + headers = {'User-Agent': 'GLAM-Extractor/1.0 (heritage research project)'} + url = f'https://www.wikidata.org/w/api.php?action=wbgetentities&ids={wikidata_id}&props=claims&format=json' + + try: + req = urllib.request.Request(url, headers=headers) + with urllib.request.urlopen(req, timeout=30) as response: + data = json.loads(response.read().decode('utf-8')) + + claims = data['entities'][wikidata_id]['claims'] + + # Try P625 (coordinates) first + if 'P625' in claims: + coords = claims['P625'][0]['mainsnak']['datavalue']['value'] + return (coords['latitude'], coords['longitude']) + + # Try P159 (headquarters location) + if 'P159' in claims: + loc_id = claims['P159'][0]['mainsnak']['datavalue']['value']['id'] + time.sleep(0.5) # Rate limiting + + # Get coordinates of headquarters + url2 = f'https://www.wikidata.org/w/api.php?action=wbgetentities&ids={loc_id}&props=claims&format=json' + req2 = urllib.request.Request(url2, headers=headers) + with urllib.request.urlopen(req2, timeout=30) as response2: + data2 = json.loads(response2.read().decode('utf-8')) + + claims2 = data2['entities'][loc_id]['claims'] + if 'P625' in claims2: + coords = 
def reverse_geocode(lat: float, lon: float, country: str, conn: sqlite3.Connection) -> Optional[Dict]:
    """Return the GeoNames settlement in *country* that is nearest to (*lat*, *lon*).

    Only rows of the ``cities`` table whose ``feature_code`` is listed in
    ``SETTLEMENT_FEATURE_CODES`` are considered. Distance is the squared
    planar distance in degrees, with the longitude difference scaled by
    ``cos(lat)``. Without that scaling an east-west offset is over-weighted
    away from the equator (by about 2.4x in the squared distance at 50
    degrees north), which can select the wrong settlement.

    Returns a dict with keys ``geonames_id``, ``name``, ``ascii_name``,
    ``admin1_code``, ``admin2_code``, ``latitude``, ``longitude``,
    ``feature_code``, ``population`` and ``distance_sq``, or ``None`` when
    the country has no matching settlement.
    """
    import math  # local import: the file header does not import math

    # One degree of longitude shrinks with the cosine of the latitude.
    lon_scale = math.cos(math.radians(lat))

    # Bind the feature codes as parameters instead of interpolating the
    # tuple's repr, which is invalid SQL for a one-element tuple: ('PPL',).
    code_placeholders = ', '.join('?' for _ in SETTLEMENT_FEATURE_CODES)

    cursor = conn.cursor()
    cursor.execute(f'''
        SELECT geonames_id, name, ascii_name, admin1_code, admin2_code,
               latitude, longitude, feature_code, population,
               ((latitude - ?) * (latitude - ?)
                + ((longitude - ?) * ?) * ((longitude - ?) * ?)) AS distance_sq
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ({code_placeholders})
        ORDER BY distance_sq
        LIMIT 1
    ''', (lat, lat, lon, lon_scale, lon, lon_scale, country, *SETTLEMENT_FEATURE_CODES))

    row = cursor.fetchone()
    if row is None:
        return None

    columns = (
        'geonames_id', 'name', 'ascii_name', 'admin1_code', 'admin2_code',
        'latitude', 'longitude', 'feature_code', 'population', 'distance_sq',
    )
    return dict(zip(columns, row))
Wikidata ID + wikidata_id = None + if 'original_entry' in data and 'wikidata_id' in data['original_entry']: + wikidata_id = data['original_entry']['wikidata_id'] + elif 'wikidata_enrichment' in data and 'wikidata_entity_id' in data['wikidata_enrichment']: + wikidata_id = data['wikidata_enrichment']['wikidata_entity_id'] + + if not wikidata_id: + return False, None + + # Get coordinates from Wikidata + coords = get_wikidata_location(wikidata_id) + if not coords: + print(f" No coordinates for {wikidata_id}") + return False, None + + lat, lon = coords + print(f" Coords: {lat:.4f}, {lon:.4f}") + + # Reverse geocode + city_data = reverse_geocode(lat, lon, country, conn) + if not city_data: + print(f" No GeoNames match in {country}") + return False, None + + city_name = city_data['ascii_name'] or city_data['name'] + city_code = generate_city_code(city_name) + + print(f" City: {city_name} ({city_code})") + + # Update file + old_city_code = loc_res.get('city_code', 'XXX') + loc_res['city_code'] = city_code + loc_res['city_label'] = city_name + loc_res['geonames_id'] = city_data['geonames_id'] + loc_res['method'] = 'WIKIDATA_COORDS_REVERSE_GEOCODE' + loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat() + + # Update GHCID string + old_ghcid = ghcid.get('ghcid_current', '') + new_ghcid = old_ghcid.replace(f'-XXX-', f'-{city_code}-') + ghcid['ghcid_current'] = new_ghcid + + # Add to history + if 'ghcid_history' not in ghcid: + ghcid['ghcid_history'] = [] + ghcid['ghcid_history'].append({ + 'ghcid': new_ghcid, + 'valid_from': datetime.now(timezone.utc).isoformat(), + 'reason': f"City resolved via Wikidata {wikidata_id} coordinates: XXX->{city_code} ({city_name})" + }) + + # Add provenance note + if 'provenance' not in data: + data['provenance'] = {} + if 'notes' not in data['provenance']: + data['provenance']['notes'] = [] + elif isinstance(data['provenance']['notes'], str): + data['provenance']['notes'] = [data['provenance']['notes']] + + 
def main():
    """Resolve ``XXX`` city codes by looking up coordinates on Wikidata.

    Collects up to ``--limit`` files matching ``*-XXX-*.yaml`` under
    ``--path`` that carry a Wikidata ID (``original_entry.wikidata_id`` or
    ``wikidata_enrichment.wikidata_entity_id``), optionally restricted to
    ``--country``, and passes each one to ``process_file``. Without
    ``--apply`` nothing is written.

    Exits with status 1 when the custodian directory or the GeoNames
    database is missing.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Resolve XXX city codes using Wikidata coordinates')
    parser.add_argument('--apply', action='store_true', help='Actually apply the fixes')
    parser.add_argument('--path', type=str, default='data/custodian', help='Path to custodian files')
    parser.add_argument('--limit', type=int, default=50, help='Limit number of files to process')
    parser.add_argument('--country', type=str, help='Only process files for a specific country')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    if not GEONAMES_DB.exists():
        print(f"Error: GeoNames database not found at {GEONAMES_DB}")
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("WIKIDATA COORDINATES CITY RESOLUTION")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Sorted so that --limit selects the same files on every run.
    files_to_process = sorted(custodian_dir.glob('*-XXX-*.yaml'))
    print(f"Found {len(files_to_process)} files with XXX codes")

    # Keep only files that carry a Wikidata ID (and match --country).
    file_data = []
    for filepath in files_to_process:
        if len(file_data) >= args.limit:
            break

        try:
            with open(filepath, 'r', encoding='utf-8') as fh:
                data = yaml.safe_load(fh)

            if not isinstance(data, dict):
                # Empty or non-mapping YAML document: nothing to resolve.
                continue

            location_resolution = (data.get('ghcid') or {}).get('location_resolution') or {}
            country = location_resolution.get('country_code', '')
            if args.country and country != args.country:
                continue

            original_entry = data.get('original_entry') or {}
            enrichment = data.get('wikidata_enrichment') or {}
            if 'wikidata_id' in original_entry:
                wikidata_id = original_entry['wikidata_id']
            else:
                wikidata_id = enrichment.get('wikidata_entity_id')

            if not wikidata_id:
                continue

            file_data.append({
                'filepath': filepath,
                'wikidata_id': wikidata_id,
                'country': country,
            })
        except Exception as exc:
            # Per-file boundary: one unreadable or malformed file must not
            # stop the scan, but it is reported instead of silently dropped.
            print(f" Skipping {filepath.name}: {exc}")

    print(f"Processing {len(file_data)} files with Wikidata IDs")
    print()

    resolved = 0
    renamed = 0
    rename_label = 'Would rename' if dry_run else 'Renamed'

    conn = sqlite3.connect(GEONAMES_DB)
    try:
        for entry in file_data:
            filepath = entry['filepath']
            print(f"Processing {filepath.name}...")
            print(f" Wikidata: {entry['wikidata_id']}")

            success, new_path = process_file(filepath, conn, dry_run=dry_run)

            if success:
                resolved += 1
                if new_path:
                    renamed += 1
                    # process_file() returns the target path in dry-run mode
                    # too, so the label says what actually happened.
                    print(f" {rename_label}: {filepath.name} -> {new_path.name}")

            time.sleep(0.5)  # Rate limiting for the Wikidata API
    finally:
        # Release the database handle even if a file blows up mid-run.
        conn.close()

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")
Queries Wikidata P17 to get country +4. Updates files with resolved country code +5. Renames files to match new GHCID + +Following AGENTS.md Rules: +- Rule 5: Additive only - never delete existing data +""" + +import os +import sys +import yaml +import json +import re +import urllib.request +import urllib.parse +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional, Dict, Any, List, Tuple + + +# Wikidata entity ID to ISO 3166-1 alpha-2 country code mapping +WIKIDATA_COUNTRY_TO_ISO = { + 'Q213': 'CZ', # Czechia + 'Q40': 'AT', # Austria + 'Q183': 'DE', # Germany + 'Q36': 'PL', # Poland + 'Q39': 'CH', # Switzerland + 'Q31': 'BE', # Belgium + 'Q142': 'FR', # France + 'Q145': 'GB', # United Kingdom + 'Q38': 'IT', # Italy + 'Q29': 'ES', # Spain + 'Q55': 'NL', # Netherlands + 'Q30': 'US', # United States + 'Q17': 'JP', # Japan + 'Q884': 'KR', # South Korea + 'Q148': 'CN', # China + 'Q668': 'IN', # India + 'Q155': 'BR', # Brazil + 'Q96': 'MX', # Mexico + 'Q414': 'AR', # Argentina + 'Q298': 'CL', # Chile + 'Q45': 'PT', # Portugal + 'Q27': 'IE', # Ireland + 'Q20': 'NO', # Norway + 'Q35': 'DK', # Denmark + 'Q34': 'SE', # Sweden + 'Q33': 'FI', # Finland + 'Q211': 'LV', # Latvia + 'Q37': 'LT', # Lithuania + 'Q191': 'EE', # Estonia + 'Q159': 'RU', # Russia + 'Q212': 'UA', # Ukraine + 'Q184': 'BY', # Belarus + 'Q219': 'BG', # Bulgaria + 'Q218': 'RO', # Romania + 'Q28': 'HU', # Hungary + 'Q214': 'SK', # Slovakia + 'Q215': 'SI', # Slovenia + 'Q224': 'HR', # Croatia + 'Q225': 'BA', # Bosnia and Herzegovina + 'Q117': 'GH', # Ghana + 'Q115': 'ET', # Ethiopia + 'Q1033': 'NG', # Nigeria + 'Q258': 'ZA', # South Africa + 'Q916': 'AO', # Angola + 'Q1008': 'CI', # Ivory Coast + 'Q114': 'KE', # Kenya + 'Q1044': 'SN', # Senegal + 'Q262': 'DZ', # Algeria + 'Q1028': 'MA', # Morocco + 'Q948': 'TN', # Tunisia + 'Q79': 'EG', # Egypt + 'Q1030': 'LY', # Libya + 'Q265': 'UZ', # Uzbekistan + 'Q232': 'KZ', # Kazakhstan + 'Q863': 'TJ', # Tajikistan + 'Q874': 
def extract_wikidata_ids(data: Dict[str, Any]) -> List[str]:
    """Collect the distinct Wikidata QIDs referenced by a custodian record.

    Sources are read in this order, and the result keeps first-seen order:

    1. ``identifiers[*]`` entries whose ``identifier_scheme`` is ``'Wikidata'``
    2. ``original_entry.identifiers[*]`` with the same scheme
    3. ``wikidata_enrichment.wikidata_entity_id``

    Only well-formed QIDs (``Q`` followed by a positive integer, surrounding
    whitespace ignored) are returned. The IDs are later interpolated into a
    SPARQL ``VALUES`` clause, where a single malformed value would make the
    whole batch query fail. Missing or null sections and non-string values
    are ignored.
    """
    qid_pattern = re.compile(r'Q[1-9]\d*')
    found: List[str] = []

    def _collect(value):
        # Accept only clean, not-yet-seen QIDs.
        if not isinstance(value, str):
            return
        qid = value.strip()
        if qid_pattern.fullmatch(qid) and qid not in found:
            found.append(qid)

    def _collect_from_identifiers(identifiers):
        # ``identifiers`` may be None when the YAML key exists but is empty.
        for ident in identifiers or []:
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Wikidata':
                _collect(ident.get('identifier_value'))

    _collect_from_identifiers(data.get('identifiers'))
    _collect_from_identifiers((data.get('original_entry') or {}).get('identifiers'))
    _collect((data.get('wikidata_enrichment') or {}).get('wikidata_entity_id'))

    return found
reading {filepath}: {e}") + return False, None + + if 'ghcid' not in data: + return False, None + + ghcid = data['ghcid'] + if 'location_resolution' not in ghcid: + ghcid['location_resolution'] = {} + + loc_res = ghcid['location_resolution'] + + # Check if country code is XX + old_country = loc_res.get('country_code', 'XX') + if old_country != 'XX': + return False, None + + # Update country code + loc_res['country_code'] = country_code + loc_res['method'] = 'WIKIDATA_P17' + loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat() + + # Update GHCID string + old_ghcid = ghcid.get('ghcid_current', '') + new_ghcid = old_ghcid.replace('XX-XX-', f'{country_code}-XX-') + + if new_ghcid != old_ghcid: + ghcid['ghcid_current'] = new_ghcid + + # Add to history + if 'ghcid_history' not in ghcid: + ghcid['ghcid_history'] = [] + + ghcid['ghcid_history'].append({ + 'ghcid': new_ghcid, + 'valid_from': datetime.now(timezone.utc).isoformat(), + 'reason': f"Country resolved via Wikidata P17: XX→{country_code}" + }) + + # Add provenance note + if 'provenance' not in data: + data['provenance'] = {} + if 'notes' not in data['provenance']: + data['provenance']['notes'] = [] + elif isinstance(data['provenance']['notes'], str): + data['provenance']['notes'] = [data['provenance']['notes']] + + data['provenance']['notes'].append( + f"Country resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: " + f"XX→{country_code} via Wikidata P17" + ) + + # Determine new filename + old_filename = filepath.name + new_filename = old_filename.replace('XX-XX-', f'{country_code}-XX-') + new_filepath = filepath.parent / new_filename + + if not dry_run: + # Write updated file + with open(filepath, 'w', encoding='utf-8') as f: + yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + # Rename if needed + if new_filepath != filepath and not new_filepath.exists(): + filepath.rename(new_filepath) + + return True, new_filepath if new_filepath != 
filepath else None + + +def main(): + """Main entry point.""" + import argparse + + parser = argparse.ArgumentParser( + description='Resolve XX country codes using Wikidata P17 lookup' + ) + parser.add_argument('--apply', action='store_true', + help='Actually apply the fixes (default: dry run)') + parser.add_argument('--path', type=str, default='data/custodian', + help='Path to custodian files directory') + parser.add_argument('--limit', type=int, default=100, + help='Limit number of files to process') + + args = parser.parse_args() + + custodian_dir = Path(args.path) + if not custodian_dir.exists(): + print(f"Error: Directory {custodian_dir} does not exist") + sys.exit(1) + + dry_run = not args.apply + + print("=" * 70) + print("COUNTRY CODE RESOLUTION VIA WIKIDATA P17") + print("=" * 70) + print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}") + print() + + # Find files with XX country code + files_to_process = list(custodian_dir.glob('XX-*.yaml'))[:args.limit] + + print(f"Found {len(files_to_process)} files with XX country code") + print() + + # Load files and extract Wikidata IDs + file_data = [] + for filepath in files_to_process: + try: + with open(filepath, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + + wikidata_ids = extract_wikidata_ids(data) + + file_data.append({ + 'filepath': filepath, + 'data': data, + 'wikidata_ids': wikidata_ids + }) + except Exception as e: + print(f"Error loading {filepath}: {e}") + + print(f"Loaded {len(file_data)} files") + + # Count files with Wikidata IDs + with_wikidata = [f for f in file_data if f['wikidata_ids']] + without_wikidata = [f for f in file_data if not f['wikidata_ids']] + + print(f" With Wikidata IDs: {len(with_wikidata)}") + print(f" Without Wikidata IDs: {len(without_wikidata)}") + print() + + # Query Wikidata for countries in batch + all_wikidata_ids = [] + for f in with_wikidata: + all_wikidata_ids.extend(f['wikidata_ids']) + all_wikidata_ids = list(set(all_wikidata_ids)) + + 
print(f"Querying Wikidata for {len(all_wikidata_ids)} entities...") + + # Batch in groups of 50 + all_countries = {} + for i in range(0, len(all_wikidata_ids), 50): + batch = all_wikidata_ids[i:i+50] + countries = query_wikidata_countries(batch) + all_countries.update(countries) + if i + 50 < len(all_wikidata_ids): + import time + time.sleep(1) # Rate limiting + + print(f" Retrieved country for {len(all_countries)} entities") + print() + + # Process files + resolved = 0 + renamed = 0 + no_country = [] + + # First process files with Wikidata IDs + for f in with_wikidata: + filepath = f['filepath'] + wikidata_ids = f['wikidata_ids'] + + # Find country from any Wikidata ID + country_code = None + for wid in wikidata_ids: + if wid in all_countries: + country_code = all_countries[wid] + break + + if not country_code: + no_country.append(filepath.name) + continue + + # Update file + success, new_path = update_custodian_file(filepath, country_code, dry_run=dry_run) + + if success: + resolved += 1 + if new_path: + renamed += 1 + print(f" {filepath.name} → {new_path.name}") + else: + print(f" Updated: {filepath.name}") + + # Now process files without Wikidata IDs using source-based inference + source_resolved = 0 + for f in without_wikidata: + filepath = f['filepath'] + data = f['data'] + + # Try to infer country from source file + country_code = None + source = data.get('original_entry', {}).get('source', '') + + # Czech source patterns + if 'czech' in source.lower() or 'cz_' in source.lower(): + country_code = 'CZ' + # Austrian source patterns + elif 'austria' in source.lower() or 'at_' in source.lower(): + country_code = 'AT' + # German source patterns + elif 'german' in source.lower() or 'de_' in source.lower(): + country_code = 'DE' + # Swiss source patterns + elif 'swiss' in source.lower() or 'switzerland' in source.lower() or 'ch_' in source.lower(): + country_code = 'CH' + # Belgian source patterns + elif 'belgium' in source.lower() or 'belgian' in source.lower() or 
'be_' in source.lower(): + country_code = 'BE' + # Dutch source patterns + elif 'dutch' in source.lower() or 'netherlands' in source.lower() or 'nl_' in source.lower(): + country_code = 'NL' + # Japanese source patterns + elif 'japan' in source.lower() or 'jp_' in source.lower(): + country_code = 'JP' + + if country_code: + success, new_path = update_custodian_file(filepath, country_code, dry_run=dry_run) + if success: + source_resolved += 1 + resolved += 1 + if new_path: + renamed += 1 + print(f" [source-inferred] {filepath.name} → {new_path.name}") + else: + no_country.append(filepath.name) + + print() + print("=" * 70) + print("SUMMARY") + print("=" * 70) + print(f"Files processed: {len(file_data)}") + print(f"With Wikidata IDs: {len(with_wikidata)}") + print(f"Source-inferred: {source_resolved}") + print(f"Resolved: {resolved}") + print(f"Renamed: {renamed}") + print(f"No country found: {len(no_country)}") + print(f"Without Wikidata IDs: {len(without_wikidata)}") + + if no_country and len(no_country) <= 20: + print() + print("Files without country resolution:") + for name in no_country: + print(f" - {name}") + + if dry_run: + print() + print("This was a DRY RUN. Use --apply to make changes.") + + +if __name__ == '__main__': + main() diff --git a/scripts/resolve_cz_xx_regions.py b/scripts/resolve_cz_xx_regions.py new file mode 100644 index 0000000000..bfac40c805 --- /dev/null +++ b/scripts/resolve_cz_xx_regions.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python3 +""" +Resolve CZ-XX (unknown region) files to proper ISO 3166-2:CZ region codes. + +This script updates 36 Czech institution files that have placeholder XX region codes +to their correct ISO 3166-2:CZ region codes based on researched location data. + +Research completed 2025-12-07 via GeoNames database and web searches. 
+""" + +import os +import re +import yaml +from datetime import datetime, timezone +from pathlib import Path + +# GeoNames Admin1 → ISO 3166-2:CZ region code mapping +ADMIN1_TO_ISO = { + '52': '10', # Prague + '78': '64', # South Moravian (Jihomoravský) + '79': '31', # South Bohemian (Jihočeský) + '80': '63', # Vysočina + '81': '41', # Karlovy Vary + '82': '52', # Hradec Králové + '83': '51', # Liberec + '84': '71', # Olomouc + '85': '80', # Moravian-Silesian (Moravskoslezský) + '86': '53', # Pardubice + '87': '32', # Plzeň + '88': '20', # Central Bohemian (Středočeský) + '89': '42', # Ústí nad Labem + '90': '72', # Zlín +} + +# Research results: mapping from old filename suffix to resolution data +# Format: (new_region_code, new_city_code, city_name, geonames_id, admin1_code) +RESOLUTIONS = { + # Archives (A) + 'A-SAČTÚ': ('10', 'PRA', 'Prague', 3067696, '52'), + 'A-SAČÚZK': ('10', 'PRA', 'Prague', 3067696, '52'), + 'A-SAERÚ': ('63', 'JIH', 'Jihlava', 3074199, '80'), + 'A-SAÚPOHS': ('64', 'BRN', 'Brno', 3078610, '78'), + 'A-BSS': ('51', 'MAS', 'Malá Skála', 3071223, '83'), + 'A-PSJAK': ('53', 'BNO', 'Brandýs nad Orlicí', 3078836, '86'), + 'A-ZI': ('10', 'PRA', 'Prague', 3067696, '52'), # Admin location + + # Galleries (G) + 'G-GAU': ('52', 'HOS', 'Hostinné', 3075058, '82'), + 'G-GVP': ('20', 'MLB', 'Mladá Boleslav', 3070544, '88'), + + # Libraries (L) - Many are research institutes in Prague/Brno + 'L-SÚPRO': ('10', 'PRA', 'Prague', 3067696, '52'), # ABE064 + 'L-ÚRB': ('10', 'PRA', 'Prague', 3067696, '52'), # ABE444 + 'L-ÚSLOZ': ('10', 'PRA', 'Prague', 3067696, '52'), # ABE215 + 'L-VŠZFA': ('10', 'PRA', 'Prague', 3067696, '52'), + 'L-VŠZR': ('10', 'PRA', 'Prague', 3067696, '52'), + 'L-VÚB': ('64', 'BRN', 'Brno', 3078610, '78'), # BOC006 + 'L-VÚI': ('10', 'PRA', 'Prague', 3067696, '52'), # ABC043 + 'L-VÚP': ('10', 'PRA', 'Prague', 3067696, '52'), # ABC066 + 'L-VÚRV': ('10', 'PRA', 'Prague', 3067696, '52'), # ABC162 + 'L-VUTÚTMŘP': ('64', 'BRN', 'Brno', 3078610, 
def generate_city_code(city_name: str) -> str:
    """Derive a short upper-case city code from a city name.

    Diacritics are stripped and the Czech prepositions ``nad``, ``pod``,
    ``v``, ``u`` and ``na`` are ignored.  A single remaining word yields its
    first three letters; two or more remaining words yield the initials of
    the first three of them, so the result can be as short as two letters.
    If no word remains, the first three characters of the ASCII name are used.
    """
    import unicodedata

    # NFD splits base letters from combining marks; dropping category 'Mn'
    # removes the marks and keeps the base letters.
    decomposed = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

    skip_words = {'nad', 'pod', 'v', 'u', 'na'}
    significant_words = [w for w in ascii_name.split() if w.lower() not in skip_words]

    if not significant_words:
        return ascii_name[:3].upper()
    if len(significant_words) == 1:
        # Single word: first 3 letters
        return significant_words[0][:3].upper()
    # Multi-word: initials of at most three words
    return ''.join(w[0].upper() for w in significant_words[:3])


def update_yaml_file(filepath: Path, resolution: tuple) -> tuple:
    """
    Update a YAML file with resolved region/city data.

    The file's ``ghcid.ghcid_current`` must look like
    ``CZ-XX-XXX-{TYPE}-{ABBREV}``; it is rewritten to
    ``CZ-{REGION}-{CITY}-{TYPE}-{ABBREV}``.  Location resolution, GHCID
    history, provenance notes and the ``location`` section are updated, the
    result is written to ``{new_ghcid}.yaml`` next to the original, and the
    original file is removed.

    resolution: (region_code, city_code, city_name, geonames_id, admin1_code)

    Returns: (old_ghcid, new_ghcid, new_filepath), or (None, None, None) when
    the file holds no parsable CZ-XX-XXX GHCID.

    Raises: FileExistsError if the target filename is already taken by another
    file (nothing is written in that case).
    """
    region_code, city_code, city_name, geonames_id, admin1_code = resolution

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Parse YAML
    data = yaml.safe_load(content)

    # An empty document parses to None and a malformed one may lack a ghcid
    # mapping; both are reported through the "could not parse" path below.
    ghcid = data.get('ghcid') if isinstance(data, dict) else None
    old_ghcid = ghcid.get('ghcid_current') if isinstance(ghcid, dict) else None
    if not isinstance(old_ghcid, str):
        old_ghcid = ''

    # Build new GHCID
    # Pattern: CZ-XX-XXX-{TYPE}-{ABBREV} -> CZ-{REGION}-{CITY}-{TYPE}-{ABBREV}
    match = re.match(r'CZ-XX-XXX-([A-Z])-(.+)$', old_ghcid)
    if not match:
        print(f" WARNING: Could not parse GHCID: {old_ghcid}")
        return None, None, None

    inst_type, abbrev = match.groups()
    new_ghcid = f"CZ-{region_code}-{city_code}-{inst_type}-{abbrev}"

    # Refuse to clobber a different record that already owns the target name;
    # writing first and unlinking afterwards would silently destroy it.
    new_filepath = filepath.parent / f"{new_ghcid}.yaml"
    if new_filepath != filepath and new_filepath.exists():
        raise FileExistsError(f"target file already exists: {new_filepath.name}")

    timestamp = datetime.now(timezone.utc).isoformat()
    region_name = get_region_name(region_code)

    # Update ghcid section
    ghcid['ghcid_current'] = new_ghcid
    ghcid['location_resolution'] = {
        'method': 'GEONAMES_RESEARCH',
        'country_code': 'CZ',
        'region_code': region_code,
        'region_name': region_name,
        'city_code': city_code,
        'city_name': city_name,
        'geonames_id': geonames_id,
        'admin1_code': admin1_code,
        'resolution_timestamp': timestamp,
        'research_date': '2025-12-07',
        'research_method': 'GeoNames database + web search verification'
    }

    # Add history entry
    if 'ghcid_history' not in ghcid:
        ghcid['ghcid_history'] = []

    ghcid['ghcid_history'].append({
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f'Region resolved via GeoNames research: XX→{region_code}, city: {city_name} (GeoNames ID: {geonames_id})'
    })

    # Update provenance notes.  A missing/null section becomes a mapping and
    # a single string note is promoted to a list so the existing note is kept.
    if data.get('provenance') is None:
        data['provenance'] = {}
    notes = data['provenance'].get('notes')
    if notes is None:
        notes = []
    elif isinstance(notes, str):
        notes = [notes]
    data['provenance']['notes'] = notes
    notes.append(
        f'Region resolved {timestamp[:10]}: XX→CZ-{region_code} ({city_name}) via GeoNames research'
    )

    # Update location (created when missing or null)
    if data.get('location') is None:
        data['location'] = {}
    data['location']['city'] = city_name
    data['location']['country'] = 'CZ'
    data['location']['region'] = region_name
    data['location']['geonames_id'] = geonames_id

    # Write updated YAML
    with open(new_filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Remove old file if different
    if new_filepath != filepath:
        filepath.unlink()

    return old_ghcid, new_ghcid, new_filepath


# ISO 3166-2:CZ region code -> English region name.
_CZ_REGION_NAMES = {
    '10': 'Prague',
    '20': 'Central Bohemian',
    '31': 'South Bohemian',
    '32': 'Plzeň',
    '41': 'Karlovy Vary',
    '42': 'Ústí nad Labem',
    '51': 'Liberec',
    '52': 'Hradec Králové',
    '53': 'Pardubice',
    '63': 'Vysočina',
    '64': 'South Moravian',
    '71': 'Olomouc',
    '72': 'Zlín',
    '80': 'Moravian-Silesian',
}


def get_region_name(region_code: str) -> str:
    """Get region name from ISO 3166-2:CZ code ('Unknown' if not listed)."""
    return _CZ_REGION_NAMES.get(region_code, 'Unknown')


def main():
    """Resolve every CZ-XX-XXX-*.yaml custodian file that has a RESOLUTIONS entry.

    Prints one line per file, a summary, and finally any CZ-XX files that
    are still present in the custodian directory.
    """
    import unicodedata

    custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')

    # Compare suffixes in NFC on both sides: a filesystem may hand back
    # decomposed (NFD) filenames, which would never equal a composed key
    # such as 'A-SAČTÚ'.
    resolutions = {
        unicodedata.normalize('NFC', key): value
        for key, value in RESOLUTIONS.items()
    }

    # Find all CZ-XX-XXX files
    xx_files = list(custodian_dir.glob('CZ-XX-XXX-*.yaml'))
    print(f"Found {len(xx_files)} CZ-XX-XXX files to resolve")

    resolved = 0
    failed = 0

    for filepath in sorted(xx_files):
        filename = filepath.stem
        # Extract suffix (e.g., "A-SAČTÚ" from "CZ-XX-XXX-A-SAČTÚ")
        suffix_match = re.match(r'CZ-XX-XXX-(.+)$', filename)
        if not suffix_match:
            print(f" SKIP: Could not parse filename: {filename}")
            failed += 1
            continue

        suffix = suffix_match.group(1)
        resolution = resolutions.get(unicodedata.normalize('NFC', suffix))

        if resolution is None:
            print(f" SKIP: No resolution for: {suffix}")
            failed += 1
            continue

        try:
            old_ghcid, new_ghcid, new_filepath = update_yaml_file(filepath, resolution)
            if old_ghcid and new_ghcid:
                print(f" ✓ {old_ghcid} → {new_ghcid}")
                resolved += 1
            else:
                print(f" ✗ Failed to update: {filepath.name}")
                failed += 1
        except Exception as e:
            # Top-level boundary: report the file and carry on with the rest.
            print(f" ✗ Error processing {filepath.name}: {e}")
            failed += 1

    print(f"\n{'='*60}")
    print(f"SUMMARY: Resolved {resolved}/{len(xx_files)} files")
    if failed:
        print(f" Failed: {failed}")

    # Verify no CZ-XX files remain
    remaining = list(custodian_dir.glob('CZ-XX-*.yaml'))
    print(f"\nRemaining CZ-XX files: {len(remaining)}")
    if remaining:
        for f in remaining:
            print(f" - {f.name}")


if __name__ == '__main__':
    main()
+ 'roeselare': 'VWV', + 'mouscron': 'WHT', 'moeskroen': 'WHT', + 'tienen': 'VBR', 'tirlemont': 'VBR', + 'ieper': 'VWV', 'ypres': 'VWV', + 'turnhout': 'VAN', + 'waregem': 'VWV', + 'lokeren': 'VOV', + 'beveren': 'VOV', + 'vilvoorde': 'VBR', + 'dilbeek': 'VBR', + 'schoten': 'VAN', + 'brasschaat': 'VAN', + 'boom': 'VAN', + 'mortsel': 'VAN', + 'temse': 'VOV', + 'herzele': 'VOV', + 'brecht': 'VAN', + 'oudenaarde': 'VOV', + 'rotselaar': 'VBR', + 'niel': 'VAN', + 'lint': 'VAN', + 'ravels': 'VAN', + 'bree': 'VLI', + 'peer': 'VLI', + 'meeuwen': 'VLI', + 'gruitrode': 'VLI', + 'arlon': 'WLX', 'aarlen': 'WLX', + 'bastogne': 'WLX', 'bastenaken': 'WLX', +} + +# Austrian state codes +AUSTRIAN_STATES = { + 'wien': '9', 'vienna': '9', + 'salzburg': '5', + 'tirol': '7', 'tyrol': '7', 'innsbruck': '7', + 'vorarlberg': '8', 'bregenz': '8', + 'kärnten': '2', 'carinthia': '2', 'klagenfurt': '2', + 'steiermark': '6', 'styria': '6', 'graz': '6', + 'oberösterreich': '4', 'upper austria': '4', 'linz': '4', + 'niederösterreich': '3', 'lower austria': '3', 'st. pölten': '3', + 'burgenland': '1', 'eisenstadt': '1', +} + +# Bulgarian province codes +BULGARIAN_PROVINCES = { + 'sofia': '22', 'софія': '22', + 'plovdiv': '16', 'пловдив': '16', + 'varna': '03', 'варна': '03', + 'burgas': '02', 'бургас': '02', + 'ruse': '18', 'русе': '18', + 'stara zagora': '24', + 'pleven': '15', 'плевен': '15', +} + +# Swiss canton codes (abbreviated) +SWISS_CANTONS = { + 'zürich': 'ZH', 'zurich': 'ZH', + 'bern': 'BE', 'berne': 'BE', + 'luzern': 'LU', 'lucerne': 'LU', + 'genève': 'GE', 'geneva': 'GE', 'genf': 'GE', + 'basel': 'BS', + 'lausanne': 'VD', + 'winterthur': 'ZH', + 'st. 
def extract_city_from_name(name: str, country: str) -> Optional[Tuple[str, str]]:
    """
    Extract city name from institution name.

    The lookup table for `country` (BE, AT, BG or CH) is scanned in table
    order and the first key found at the start of a word in the lower-cased
    name wins.  Anchoring at a word start keeps derived forms such as
    "Wiener" (wien) while no longer firing inside unrelated words such as
    "regent" (gent) or "Kommission" (sion).

    Returns (city_name, region_code) or None.
    """
    if country == 'BE':
        table = BELGIAN_CITIES
    elif country == 'AT':
        table = AUSTRIAN_STATES
    elif country == 'BG':
        table = BULGARIAN_PROVINCES
    elif country == 'CH':
        table = SWISS_CANTONS
    else:
        return None

    name_lower = name.lower()
    for city, region in table.items():
        # (?<!\w): the key must not be preceded by a letter/digit, i.e. it
        # has to begin a word; re.escape protects keys like 'st. gallen'.
        if re.search(r'(?<!\w)' + re.escape(city), name_lower):
            return (city.title(), region)

    return None


def update_file_with_region(filepath: Path, region_code: str, city_name: str,
                            dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """Update a custodian file with resolved region code.

    Only files whose ``ghcid.location_resolution`` has a country code and an
    unresolved ('XX' or missing) region code are touched.  The region is
    written into the location resolution, the GHCID string, the GHCID
    history and a provenance note.  Unless `dry_run` is set the file is
    rewritten and renamed from ``{CC}-XX-...`` to ``{CC}-{region}-...``.

    Returns (True, new_path) when the file was (or, in a dry run, would be)
    updated; new_path is None when the filename does not change.  Returns
    (False, None) when the file is unreadable, not eligible, or the target
    filename is already taken by another file; nothing is written then.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False, None

    # An empty document parses to None; without a ghcid mapping there is
    # nothing to resolve.
    if not isinstance(data, dict) or not isinstance(data.get('ghcid'), dict):
        return False, None

    ghcid = data['ghcid']
    loc_res = ghcid.get('location_resolution')
    if not isinstance(loc_res, dict):
        # No location resolution means no country code to work with.
        return False, None

    country_code = loc_res.get('country_code', '')
    if not country_code:
        return False, None

    old_region = loc_res.get('region_code', 'XX')
    if old_region != 'XX':
        return False, None

    # Determine new filename up front so a collision is detected before the
    # file is modified; otherwise the content would carry the new GHCID
    # while the file keeps its old name and a rename is wrongly reported.
    new_filename = filepath.name.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-')
    new_filepath = filepath.parent / new_filename
    will_rename = new_filepath != filepath
    if will_rename and new_filepath.exists():
        print(f" SKIP: target already exists: {new_filepath.name}")
        return False, None

    # One instant for every timestamp written by this update.
    now = datetime.now(timezone.utc)

    # Update location resolution
    loc_res['region_code'] = region_code
    loc_res['region_name'] = city_name
    loc_res['method'] = 'NAME_LOOKUP'
    loc_res['resolution_timestamp'] = now.isoformat()

    # Update GHCID string
    old_ghcid = ghcid.get('ghcid_current', '')
    new_ghcid = old_ghcid.replace(f'{country_code}-XX-', f'{country_code}-{region_code}-')

    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid

        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []

        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': now.isoformat(),
            'reason': f"Region resolved via name lookup: XX->{region_code} (city: {city_name})"
        })

    # Add provenance note (a single string note is promoted to a list)
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    elif isinstance(data['provenance']['notes'], str):
        data['provenance']['notes'] = [data['provenance']['notes']]

    data['provenance']['notes'].append(
        f"Region resolved {now.strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX->{region_code} via name lookup (city: {city_name})"
    )

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        if will_rename:
            filepath.rename(new_filepath)

    return True, new_filepath if will_rename else None


def main():
    """Main entry point.

    Scans ``--path`` for ``*-XX-*.yaml`` files in sorted order, keeps those
    that have a country code (optionally equal to ``--country``) and an
    institution name, and resolves the region of each from its name.
    ``--limit`` caps the number of files kept after filtering, so it cannot
    be used up by files of other countries.  Runs as a dry run unless
    ``--apply`` is given.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using city names from institution names'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')

    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("REGION RESOLUTION VIA NAME LOOKUP")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Find files with XX region codes (sorted so repeated runs agree)
    files_to_process = sorted(custodian_dir.glob('*-XX-*.yaml'))

    print(f"Found {len(files_to_process)} files with XX region codes")

    # Load files and extract institution names
    file_data = []
    for filepath in files_to_process:
        if len(file_data) >= args.limit:
            break
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            # Get country code
            country = None
            if 'ghcid' in data and 'location_resolution' in data['ghcid']:
                country = data['ghcid']['location_resolution'].get('country_code')

            if not country:
                continue

            if args.country and country != args.country:
                continue

            # Get institution name
            name = None
            if 'custodian_name' in data:
                name = data['custodian_name'].get('claim_value')
            if not name and 'original_entry' in data:
                name = data['original_entry'].get('name')

            if not name:
                continue

            file_data.append({
                'filepath': filepath,
                'data': data,
                'country': country,
                'name': name
            })
        except Exception as e:
            # Best effort: report the unreadable/malformed file and move on.
            print(f"Error loading {filepath}: {e}")

    print(f"Processing {len(file_data)} files with institution names")
    print()

    # Process each file
    resolved = 0
    renamed = 0
    no_match = 0

    for entry in file_data:
        filepath = entry['filepath']
        name = entry['name']
        country = entry['country']

        # Try to extract city from name
        result = extract_city_from_name(name, country)

        if not result:
            no_match += 1
            continue

        city_name, region_code = result

        print(f"Processing {filepath.name}...")
        print(f" Name: {name}")
        print(f" City: {city_name} -> Region: {region_code}")

        # Update file
        success, new_path = update_file_with_region(filepath, region_code, city_name, dry_run=dry_run)

        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f" {filepath.name} -> {new_path.name}")

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"No city match: {no_match}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")


if __name__ == '__main__':
    main()
+ +Following AGENTS.md Rules: +- Rule 5: Additive only - never delete existing data +- GHCID settlement standardization: GeoNames is authoritative +""" + +import os +import sys +import yaml +import sqlite3 +import re +import unicodedata +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional, Dict, Any, List, Tuple + +# GeoNames database +GEONAMES_DB = Path(__file__).parent.parent / "data/reference/geonames.db" +CUSTODIAN_DIR = Path(__file__).parent.parent / "data/custodian" + +# Feature codes for proper settlements (EXCLUDE PPLX neighborhoods) +SETTLEMENT_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') + +# Country-specific region code mappings (GeoNames admin1 → ISO 3166-2) +COUNTRY_ADMIN_MAPS = { + 'NL': { + '01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR', '05': 'LI', + '06': 'NB', '07': 'NH', '09': 'UT', '10': 'ZE', '11': 'ZH', + '15': 'OV', '16': 'FL' + }, + 'BE': { + 'VAN': 'VAN', 'VBR': 'VBR', 'VLI': 'VLI', 'VOV': 'VOV', 'VWV': 'VWV', + 'WBR': 'WBR', 'WHT': 'WHT', 'WLG': 'WLG', 'WLX': 'WLX', 'WNA': 'WNA', + 'BRU': 'BRU' + }, + # Georgia: GeoNames admin1 → ISO 3166-2:GE + 'GE': { + '51': 'TB', # Tbilisi + '04': 'AJ', # Adjara + '67': 'KA', # Kakheti + '66': 'IM', # Imereti + '68': 'KK', # Kvemo Kartli + '69': 'MM', # Mtskheta-Mtianeti + '70': 'RL', # Racha-Lechkhumi and Kvemo Svaneti + '71': 'SZ', # Samegrelo and Zemo Svaneti + '72': 'SJ', # Samtskhe-Javakheti + '73': 'SK', # Shida Kartli + '65': 'GU', # Guria + }, + # Czech Republic: GeoNames admin1 → ISO 3166-2:CZ (2-digit NUTS codes) + # Source: https://en.wikipedia.org/wiki/ISO_3166-2:CZ + 'CZ': { + '52': '10', # Prague (Praha) + '88': '20', # Central Bohemian (Středočeský kraj) + '79': '31', # South Bohemian (Jihočeský kraj) + '87': '32', # Plzeň Region (Plzeňský kraj) + '81': '41', # Karlovy Vary Region (Karlovarský kraj) + '89': '42', # Ústí nad Labem Region (Ústecký kraj) + '83': '51', # Liberec Region (Liberecký kraj) + '82': 
'52', # Hradec Králové Region (Královéhradecký kraj) + '86': '53', # Pardubice Region (Pardubický kraj) + '80': '63', # Vysočina Region + '78': '64', # South Moravian (Jihomoravský kraj) + '84': '71', # Olomouc Region (Olomoucký kraj) + '90': '72', # Zlín Region (Zlínský kraj) + '85': '80', # Moravian-Silesian (Moravskoslezský kraj) + }, + # Austria: GeoNames admin1 → ISO 3166-2:AT + 'AT': { + '01': '1', # Burgenland + '02': '2', # Kärnten (Carinthia) + '03': '3', # Niederösterreich (Lower Austria) + '04': '4', # Oberösterreich (Upper Austria) + '05': '5', # Salzburg + '06': '6', # Steiermark (Styria) + '07': '7', # Tirol (Tyrol) + '08': '8', # Vorarlberg + '09': '9', # Wien (Vienna) + }, + # Bulgaria: GeoNames admin1 → ISO 3166-2:BG (2-letter province codes) + 'BG': { + '38': '01', # Blagoevgrad + '39': '02', # Burgas + '40': '08', # Dobrich + '41': '07', # Gabrovo + '42': '26', # Haskovo + '43': '09', # Kardzhali (Kurdzhali) + '44': '10', # Kyustendil + '45': '11', # Lovech + '46': '12', # Montana + '47': '13', # Pazardzhik + '48': '14', # Pernik + '49': '15', # Pleven + '50': '16', # Plovdiv + '51': '17', # Razgrad + '52': '18', # Ruse + '53': '27', # Shumen + '54': '19', # Silistra + '55': '20', # Sliven + '56': '21', # Smolyan + '57': '23', # Sofia (Sofiya-Grad) + '58': '22', # Sofia Province (Sofiya) + '59': '24', # Stara Zagora + '60': '25', # Targovishte + '61': '03', # Varna + '62': '04', # Veliko Tarnovo + '63': '05', # Vidin + '64': '06', # Vratsa + '65': '28', # Yambol + }, + # Switzerland: GeoNames already uses ISO 3166-2:CH canton codes + 'CH': { + 'AG': 'AG', 'AI': 'AI', 'AR': 'AR', 'BE': 'BE', 'BL': 'BL', + 'BS': 'BS', 'FR': 'FR', 'GE': 'GE', 'GL': 'GL', 'GR': 'GR', + 'JU': 'JU', 'LU': 'LU', 'NE': 'NE', 'NW': 'NW', 'OW': 'OW', + 'SG': 'SG', 'SH': 'SH', 'SO': 'SO', 'SZ': 'SZ', 'TG': 'TG', + 'TI': 'TI', 'UR': 'UR', 'VD': 'VD', 'VS': 'VS', 'ZG': 'ZG', + 'ZH': 'ZH', + }, + # Vietnam: GeoNames admin1 codes are the ISO 3166-2:VN codes (use directly) + # 
GeoNames uses 2-digit codes that match ISO 3166-2:VN province codes + 'VN': { + '01': 'HN', # Hanoi (Ha Noi) + '31': 'HP', # Hai Phong + '48': 'DN', # Da Nang (Đà Nẵng) + '79': 'SG', # Ho Chi Minh City (Saigon) + '92': 'CT', # Can Tho + '75': 'DNa', # Dong Nai + '24': 'BN', # Bac Ninh + '22': 'QN', # Quang Ninh (Quảng Ninh) + '38': 'TH', # Thanh Hoa (Thanh Hóa) + '46': 'TTH', # Thua Thien-Hue (Thừa Thiên Huế) + '40': 'NA', # Nghe An (Nghệ An) + '04': 'CB', # Cao Bang + '37': 'NB', # Ninh Binh + '56': 'KH', # Khanh Hoa + '66': 'DLK', # Dak Lak + '68': 'LDG', # Lam Dong + '91': 'AG', # An Giang + '86': 'VL', # Vinh Long + '82': 'DTP', # Dong Thap + '80': 'TNi', # Tay Ninh + '96': 'CMa', # Ca Mau + '51': 'QNg', # Quang Ngai + '52': 'GL', # Gia Lai + '19': 'TN', # Thai Nguyen + '25': 'PT', # Phu Tho + }, + # Japan: GeoNames admin1 → ISO 3166-2:JP (2-digit prefecture codes) + # See: https://en.wikipedia.org/wiki/ISO_3166-2:JP + 'JP': { + '01': '23', # Aichi + '02': '05', # Akita + '03': '02', # Aomori + '04': '12', # Chiba + '05': '38', # Ehime + '06': '18', # Fukui + '07': '40', # Fukuoka + '08': '07', # Fukushima + '09': '21', # Gifu + '10': '10', # Gunma + '11': '34', # Hiroshima + '12': '01', # Hokkaido + '13': '28', # Hyogo + '14': '08', # Ibaraki + '15': '17', # Ishikawa + '16': '03', # Iwate + '17': '37', # Kagawa + '18': '46', # Kagoshima + '19': '14', # Kanagawa + '20': '39', # Kochi + '21': '43', # Kumamoto + '22': '26', # Kyoto + '23': '24', # Mie + '24': '04', # Miyagi + '25': '45', # Miyazaki + '26': '20', # Nagano + '27': '42', # Nagasaki + '28': '29', # Nara + '29': '15', # Niigata + '30': '44', # Oita + '31': '33', # Okayama + '32': '27', # Osaka + '33': '41', # Saga + '34': '11', # Saitama + '35': '25', # Shiga + '36': '32', # Shimane + '37': '22', # Shizuoka + '38': '09', # Tochigi + '39': '36', # Tokushima + '40': '13', # Tokyo + '41': '31', # Tottori + '42': '16', # Toyama + '43': '30', # Wakayama + '44': '06', # Yamagata + '45': '35', # Yamaguchi + 
'46': '19', # Yamanashi + '47': '47', # Okinawa + }, + # Egypt: GeoNames admin1 → ISO 3166-2:EG + # See: https://en.wikipedia.org/wiki/ISO_3166-2:EG + 'EG': { + '01': 'DK', # Dakahlia + '02': 'BA', # Red Sea (Al Bahr al Ahmar) + '03': 'BH', # Beheira + '04': 'FYM', # Faiyum + '05': 'GH', # Gharbia + '06': 'ALX', # Alexandria + '07': 'IS', # Ismailia + '08': 'GZ', # Giza + '09': 'MNF', # Monufia + '10': 'MN', # Minya + '11': 'C', # Cairo + '12': 'KB', # Qalyubia + '13': 'WAD', # New Valley (Al Wadi al Jadid) + '14': 'SHR', # Sharqia + '15': 'SUZ', # Suez + '16': 'ASN', # Aswan + '17': 'AST', # Asyut + '18': 'BNS', # Beni Suweif + '19': 'PTS', # Port Said + '20': 'DT', # Damietta + '21': 'KFS', # Kafr el-Sheikh + '22': 'MT', # Matruh + '23': 'KN', # Qena + '24': 'SHG', # Sohag + '26': 'JS', # South Sinai + '27': 'SIN', # North Sinai + '28': 'LX', # Luxor + }, +} + +# City name translations (native → GeoNames ASCII name) +# Many cities in GeoNames use English/anglicized names +CITY_NAME_TRANSLATIONS = { + # German → English + 'wien': 'vienna', + 'munchen': 'munich', + 'koln': 'cologne', + 'nurnberg': 'nuremberg', + 'braunschweig': 'brunswick', + # Czech → GeoNames (use normalized/ASCII keys) + 'praha': 'prague', + 'plzen': 'pilsen', # Plzeň → plzen after normalization + 'brno': 'brno', + 'ostrava': 'ostrava', + # Swiss cities + 'geneve': 'geneva', + 'zurich': 'zurich', + 'bern': 'berne', + 'basel': 'basle', + # Italian cities + 'roma': 'rome', + 'milano': 'milan', + 'napoli': 'naples', + 'firenze': 'florence', + 'venezia': 'venice', + 'torino': 'turin', + # Austrian special cases (use normalized keys after diacritics removal) + # GeoNames uses 'oe' for ö, so 'Sankt Poelten' + 'st. 
polten': 'sankt poelten', + 'st polten': 'sankt poelten', + 'sankt polten': 'sankt poelten', + # Japanese cities - complex administrative format to GeoNames + # Format: "District Gun City Machi/Cho" → just the city name + 'haga gun motegi machi': 'motegi', + 'motegi machi': 'motegi', + # Egyptian landmarks → Cairo + 'nile corniche': 'cairo', +} + + +def normalize_city_name(name: str) -> str: + """Normalize city name for matching.""" + # NFD normalization to separate diacritics + normalized = unicodedata.normalize('NFD', name) + # Remove diacritics + ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn') + # Lowercase + return ascii_name.lower().strip() + + +def clean_city_name(city: str) -> str: + """Extract base city name from complex strings like 'Praha 1' or 'Zlín - Louky'.""" + # Remove district numbers like "Praha 1", "Praha 9 - Běchovice" + city = re.sub(r'\s+\d+.*$', '', city) + # Remove parts after dash + city = re.sub(r'\s*-\s*.*$', '', city) + # Remove postal code patterns + city = re.sub(r'\s+\d{3}\s*\d{2}.*$', '', city) + return city.strip() + + +def lookup_city_region(city_name: str, country: str, conn: sqlite3.Connection) -> Optional[Dict]: + """Look up city in GeoNames and return region info.""" + cursor = conn.cursor() + + # Clean city name + base_city = clean_city_name(city_name) + normalized = normalize_city_name(base_city) + + # Check for translated name (native → GeoNames) + if normalized in CITY_NAME_TRANSLATIONS: + translated = CITY_NAME_TRANSLATIONS[normalized] + else: + translated = normalized + + # Try translated name first, then normalized + row = None + for search_name in [translated, normalized]: + cursor.execute(f''' + SELECT geonames_id, name, ascii_name, admin1_code, admin2_code, + latitude, longitude, feature_code, population + FROM cities + WHERE country_code = ? + AND feature_code IN {SETTLEMENT_FEATURE_CODES} + AND LOWER(ascii_name) = ? 
+ ORDER BY population DESC + LIMIT 1 + ''', (country, search_name)) + + row = cursor.fetchone() + if row: + break + + # If no match, try LIKE search with normalized name + if not row: + cursor.execute(f''' + SELECT geonames_id, name, ascii_name, admin1_code, admin2_code, + latitude, longitude, feature_code, population + FROM cities + WHERE country_code = ? + AND feature_code IN {SETTLEMENT_FEATURE_CODES} + AND LOWER(ascii_name) LIKE ? + ORDER BY population DESC + LIMIT 1 + ''', (country, f'{normalized}%')) + row = cursor.fetchone() + + if not row: + return None + + return { + 'geonames_id': row[0], + 'name': row[1], + 'ascii_name': row[2], + 'admin1_code': row[3], + 'admin2_code': row[4], + 'latitude': row[5], + 'longitude': row[6], + 'feature_code': row[7], + 'population': row[8], + } + + +def get_region_code(country: str, admin1_code: Optional[str], admin2_code: Optional[str] = None) -> str: + """Convert GeoNames admin codes to ISO 3166-2 region codes.""" + if country in COUNTRY_ADMIN_MAPS: + country_map = COUNTRY_ADMIN_MAPS[country] + if country == 'BE' and admin2_code: + return country_map.get(admin2_code, admin1_code or 'XX') + if admin1_code: + return country_map.get(admin1_code, admin1_code) + return 'XX' + return admin1_code if admin1_code else 'XX' + + +def find_city_in_file(data: Dict) -> Optional[Tuple[str, str]]: + """Find city name and country from file data.""" + country = None + city = None + + # Get country from ghcid + ghcid = data.get('ghcid', {}) + loc_res = ghcid.get('location_resolution', {}) + country = loc_res.get('country_code') + + # Check original_entry.locations + if 'original_entry' in data: + locations = data['original_entry'].get('locations', []) + for loc in locations: + if 'city' in loc and loc['city']: + city = loc['city'] + if not country and 'country' in loc: + country = loc['country'] + break + + # Check top-level locations + if not city: + locations = data.get('locations', []) + for loc in locations: + if 'city' in loc and 
loc['city']: + city = loc['city'] + if not country and 'country' in loc: + country = loc['country'] + break + + if city and country: + return (city, country) + return None + + +def process_file(filepath: Path, conn: sqlite3.Connection, apply: bool) -> bool: + """Process a single file with XX region code.""" + try: + with open(filepath, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + except Exception as e: + print(f" Error reading {filepath}: {e}") + return False + + if not data: + return False + + # Check if region is already resolved + ghcid = data.get('ghcid', {}) + loc_res = ghcid.get('location_resolution', {}) + if loc_res.get('region_code', 'XX') != 'XX': + return False + + # Find city name + city_info = find_city_in_file(data) + if not city_info: + return False + + city_name, country = city_info + print(f" City: {city_name} ({country})") + + # Look up in GeoNames + city_data = lookup_city_region(city_name, country, conn) + if not city_data: + print(f" No GeoNames match for '{city_name}'") + return False + + region_code = get_region_code(country, city_data['admin1_code'], city_data.get('admin2_code')) + if region_code == 'XX': + print(f" Could not determine region for admin1={city_data['admin1_code']}") + return False + + print(f" Found: {city_data['name']} -> Region {region_code}") + + if not apply: + return True + + # Update GHCID + current = ghcid.get('ghcid_current', '') + parts = current.split('-') + if len(parts) < 5: + print(f" Invalid GHCID format: {current}") + return False + + old_region = parts[1] + if old_region != 'XX': + print(f" Region already set: {old_region}") + return False + + parts[1] = region_code + new_ghcid = '-'.join(parts) + + # Update data + ghcid['ghcid_current'] = new_ghcid + loc_res['region_code'] = region_code + loc_res['region_name'] = f"{country}-{region_code}" + loc_res['geonames_id'] = city_data['geonames_id'] + loc_res['method'] = 'GEONAMES_CITY_LOOKUP' + loc_res['resolution_timestamp'] = 
def main():
    """Command-line entry point: resolve XX region codes from city names.

    Collects the YAML files in ``CUSTODIAN_DIR`` whose name contains ``-XX-``
    (optionally restricted to ``--country``), keeps those whose text contains
    ``city:``, and passes at most ``--limit`` of them to ``process_file``.
    Without ``--apply`` the run is a dry run.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Resolve XX region codes using city names in files')
    parser.add_argument('--limit', type=int, default=100, help='Max files to process')
    parser.add_argument('--apply', action='store_true', help='Apply changes (default: dry run)')
    parser.add_argument('--country', help='Filter by country code')
    args = parser.parse_args()

    print("=" * 70)
    print("REGION RESOLUTION FROM FILE CITY NAMES")
    print("=" * 70)
    print(f"Mode: {'APPLYING CHANGES' if args.apply else 'DRY RUN'}")
    print()

    # Connect to GeoNames
    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    conn = sqlite3.connect(str(GEONAMES_DB))
    try:
        # Find XX files. Sorted because glob order is arbitrary and --limit
        # would otherwise select an unpredictable subset on each run.
        xx_files = sorted(
            f for f in CUSTODIAN_DIR.glob('*.yaml')
            if '-XX-' in f.name
            and (not args.country or f.name.startswith(f'{args.country}-'))
        )

        print(f"Found {len(xx_files)} files with XX region codes")

        # Filter to files with city names
        files_with_cities = []
        for f in xx_files:
            try:
                with open(f, 'r', encoding='utf-8') as fp:
                    content = fp.read()
            except (OSError, UnicodeDecodeError) as e:
                # Report unreadable files instead of dropping them silently.
                print(f"  Error reading {f}: {e}")
                continue
            if 'city:' in content:
                files_with_cities.append(f)

        print(f"Processing {min(len(files_with_cities), args.limit)} files with city names")
        print()

        resolved = 0
        renamed = 0

        for f in files_with_cities[:args.limit]:
            print(f"Processing {f.name}...")
            if process_file(f, conn, args.apply):
                resolved += 1
                # process_file only returns a bool, so in apply mode every
                # resolved file is counted as renamed.
                if args.apply:
                    renamed += 1
    finally:
        # Release the database handle even if processing raises.
        conn.close()

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {min(len(files_with_cities), args.limit)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
def reverse_geocode(lat: float, lon: float, country_code: str, conn: sqlite3.Connection) -> Optional[Dict]:
    """
    Find nearest GeoNames city for given coordinates.

    Ranks candidates by an equirectangular squared distance: the longitude
    difference is scaled by cos(latitude) of the query point, because a degree
    of longitude covers less ground than a degree of latitude away from the
    equator. Comparing raw degrees would favour cities to the north or south
    over closer ones to the east or west.

    Filters by feature_code to exclude neighborhoods (PPLX).

    Args:
        lat: Latitude of the query point, in degrees.
        lon: Longitude of the query point, in degrees.
        country_code: Only rows of ``cities`` with this ``country_code`` are considered.
        conn: Open connection to the GeoNames SQLite database.

    Returns:
        A dict describing the nearest matching row, or None when the country
        has no row with an accepted feature code. ``distance_sq`` is expressed
        in squared latitude-degrees.
    """
    import math

    # Computed in Python and bound as a parameter: SQLite's math functions
    # (cos, radians) are an optional compile-time feature.
    lon_scale = math.cos(math.radians(lat))
    lon_scale_sq = lon_scale * lon_scale

    # Query for nearest city, excluding PPLX (neighborhoods)
    cursor = conn.execute("""
        SELECT
            geonames_id, name, ascii_name, admin1_code, admin1_name,
            latitude, longitude, feature_code, population,
            ((latitude - ?) * (latitude - ?)
             + (longitude - ?) * (longitude - ?) * ?) as distance_sq
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY distance_sq
        LIMIT 1
    """, (lat, lat, lon, lon, lon_scale_sq, country_code))

    row = cursor.fetchone()
    if row is None:
        return None

    return {
        'geonames_id': row[0],
        'city_name': row[1],
        'ascii_name': row[2],
        'admin1_code': row[3],
        'admin1_name': row[4],
        'latitude': row[5],
        'longitude': row[6],
        'feature_code': row[7],
        'population': row[8],
        'distance_sq': row[9],
    }
def _coords_from_mapping(entry) -> Optional[Tuple[float, float]]:
    """Return ``(lat, lon)`` floats from a mapping's latitude/longitude keys, else None."""
    if not isinstance(entry, dict):
        return None
    lat = entry.get('latitude')
    lon = entry.get('longitude')
    if lat is None or lon is None:
        return None
    try:
        return (float(lat), float(lon))
    except (TypeError, ValueError):
        # Unparseable value (e.g. 'n/a'): treat this source as having no coordinates.
        return None


def extract_coordinates(data: Dict) -> Optional[Tuple[float, float]]:
    """Extract latitude/longitude from custodian data.

    Sources are tried in order, and the first one yielding a usable pair wins:

    1. the first entry of ``original_entry.locations``
    2. the first entry of top-level ``locations``
    3. ``google_maps_enrichment``

    A source that is missing, null, not a mapping, or holds non-numeric
    values is skipped so that later sources are still consulted.

    Returns:
        ``(latitude, longitude)`` as floats, or ``None`` if no source has both.
    """
    original_entry = data.get('original_entry')
    if not isinstance(original_entry, dict):
        original_entry = {}

    candidates = []
    for locations in (original_entry.get('locations'), data.get('locations')):
        # Only the first location entry of each list is considered.
        if locations and isinstance(locations, list):
            candidates.append(locations[0])
    candidates.append(data.get('google_maps_enrichment'))

    for candidate in candidates:
        coords = _coords_from_mapping(candidate)
        if coords is not None:
            return coords

    return None
def extract_abbreviation_from_ghcid(ghcid: str) -> str:
    """Return the fifth hyphen-separated component of a GHCID.

    Falls back to ``"UNK"`` when the identifier has fewer than five components.
    """
    components = ghcid.split('-')
    try:
        return components[4]
    except IndexError:
        return "UNK"
def process_file(filepath: Path, conn: sqlite3.Connection, dry_run: bool = False,
                 require_ch_annotator: bool = False) -> Dict:
    """
    Process a single custodian file.

    Reverse geocodes the file's coordinates, derives new region and city
    codes, rebuilds the GHCID and, unless ``dry_run`` is set, rewrites the
    file under its new GHCID-based name in ``CUSTODIAN_DIR``.

    Args:
        filepath: Path to custodian YAML file
        conn: GeoNames database connection
        dry_run: If True, don't write changes
        require_ch_annotator: If True, skip files without valid CH-Annotator entity profile

    Returns dict with processing results. ``status`` is one of 'updated',
    'would_update', 'unchanged', 'already_geocoded', 'no_coordinates',
    'no_country', 'geocode_failed', 'invalid_entity_profile' or 'error'.
    Any exception is caught and reported as status 'error'. A new GHCID whose
    target file already exists is also reported as 'error' (in dry-run mode
    too) rather than overwriting that other record.
    """
    result = {
        'file': filepath.name,
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'geonames_match': None,
        'entity_profile': None,
        'error': None,
    }

    try:
        # Explicit UTF-8: the file is written with allow_unicode=True, so the
        # platform's locale encoding must not be relied on.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            result['status'] = 'error'
            result['error'] = 'Empty file'
            return result

        # Validate CH-Annotator entity profile
        is_valid_entity, entity_subtype = validate_ch_annotator_entity(data)
        result['entity_profile'] = entity_subtype

        if require_ch_annotator and not is_valid_entity:
            result['status'] = 'invalid_entity_profile'
            result['error'] = 'No valid CH-Annotator GRP.HER.* entity profile'
            return result

        # Get current GHCID
        current_ghcid = data.get('ghcid', {}).get('ghcid_current')
        if not current_ghcid:
            result['status'] = 'error'
            result['error'] = 'No GHCID found'
            return result

        result['old_ghcid'] = current_ghcid

        # Check if already has proper GeoNames resolution
        resolution = data.get('ghcid', {}).get('location_resolution', {})
        if resolution.get('method') == 'REVERSE_GEOCODE' and resolution.get('geonames_id'):
            result['status'] = 'already_geocoded'
            return result

        # Extract coordinates
        coords = extract_coordinates(data)
        if not coords:
            result['status'] = 'no_coordinates'
            return result

        lat, lon = coords
        country_code = extract_country_code(data)

        if country_code == 'XX':
            result['status'] = 'no_country'
            return result

        # Reverse geocode
        geo_result = reverse_geocode(lat, lon, country_code, conn)
        if not geo_result:
            result['status'] = 'geocode_failed'
            return result

        result['geonames_match'] = {
            'city': geo_result['city_name'],
            'admin1': geo_result['admin1_name'],
            'geonames_id': geo_result['geonames_id'],
        }

        # Generate new codes
        new_region_code = get_region_code(country_code, geo_result['admin1_code'])
        new_city_code = generate_city_code(geo_result['ascii_name'])

        # Extract existing abbreviation and name suffix
        abbreviation = extract_abbreviation_from_ghcid(current_ghcid)
        name_suffix = extract_name_suffix_from_ghcid(current_ghcid)

        # Get institution type
        inst_type = data.get('original_entry', {}).get('institution_type', 'UNKNOWN')

        # Generate new GHCID
        new_ghcid = generate_ghcid(country_code, new_region_code, new_city_code,
                                   inst_type, abbreviation, name_suffix)

        result['new_ghcid'] = new_ghcid

        # Check if GHCID changed
        if new_ghcid == current_ghcid:
            result['status'] = 'unchanged'
            return result

        # Refuse to clobber a different custodian record that already owns
        # the target file name. Checked before the dry-run return so the
        # preview reports the same outcome as a real run.
        new_filename = f"{new_ghcid}.yaml"
        new_filepath = CUSTODIAN_DIR / new_filename
        if new_filepath != filepath and new_filepath.exists():
            result['status'] = 'error'
            result['error'] = f'Target file already exists: {new_filename}'
            return result

        if dry_run:
            result['status'] = 'would_update'
            return result

        # Update the data
        timestamp = datetime.now(timezone.utc).isoformat()
        new_uuid = generate_ghcid_uuid(new_ghcid)
        new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
        new_numeric = generate_ghcid_numeric(new_ghcid)

        # Update GHCID section
        data['ghcid']['ghcid_current'] = new_ghcid
        data['ghcid']['ghcid_uuid'] = new_uuid
        data['ghcid']['ghcid_uuid_sha256'] = new_uuid_sha256
        data['ghcid']['ghcid_numeric'] = new_numeric

        # Update location_resolution
        data['ghcid']['location_resolution'] = {
            'method': 'REVERSE_GEOCODE',
            'country_code': country_code,
            'region_code': new_region_code,
            'region_name': geo_result['admin1_name'],
            'city_code': new_city_code,
            'city_name': geo_result['city_name'],
            'geonames_id': geo_result['geonames_id'],
            'feature_code': geo_result['feature_code'],
            'resolution_date': timestamp,
        }

        # Add to GHCID history (a null value in the YAML counts as empty)
        history = data['ghcid'].get('ghcid_history') or []

        # Mark old GHCID as superseded; history is kept newest-first
        if history:
            history[0]['valid_to'] = timestamp
            history[0]['superseded_by'] = new_ghcid

        # Add new GHCID entry
        history.insert(0, {
            'ghcid': new_ghcid,
            'ghcid_numeric': new_numeric,
            'valid_from': timestamp,
            'reason': f'Updated via GeoNames reverse geocoding (matched {geo_result["city_name"]}, geonames:{geo_result["geonames_id"]})',
        })

        data['ghcid']['ghcid_history'] = history

        # Update identifiers
        for ident in data.get('identifiers') or []:
            scheme = ident.get('identifier_scheme')
            if scheme == 'GHCID':
                ident['identifier_value'] = new_ghcid
            elif scheme == 'GHCID_UUID':
                ident['identifier_value'] = new_uuid
            elif scheme == 'GHCID_UUID_SHA256':
                ident['identifier_value'] = new_uuid_sha256
            elif scheme == 'GHCID_NUMERIC':
                ident['identifier_value'] = str(new_numeric)

        # Write updated data
        with open(new_filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Remove old file if different
        if filepath != new_filepath:
            os.remove(filepath)

        result['status'] = 'updated'
        return result

    except Exception as e:
        # Per-file boundary: one bad file must not abort the batch run.
        result['status'] = 'error'
        result['error'] = str(e)
        return result
print(f"Connected to GeoNames database") + + # Get list of files + files = list(CUSTODIAN_DIR.glob("*.yaml")) + print(f"Found {len(files)} custodian files") + + # Filter by country if specified + if args.country: + files = [f for f in files if f.name.startswith(f"{args.country}-")] + print(f"Filtered to {len(files)} files for country {args.country}") + + # Apply limit + if args.limit: + files = files[:args.limit] + print(f"Limited to {args.limit} files") + + print() + + # Process files + stats = { + 'updated': 0, + 'unchanged': 0, + 'already_geocoded': 0, + 'no_coordinates': 0, + 'no_country': 0, + 'geocode_failed': 0, + 'would_update': 0, + 'invalid_entity_profile': 0, + 'error': 0, + } + + updates = [] + entity_profiles_seen = {} + + for i, filepath in enumerate(files): + if (i + 1) % 500 == 0: + print(f"Progress: {i + 1}/{len(files)}") + + result = process_file(filepath, conn, args.dry_run, args.require_ch_annotator) + stats[result['status']] = stats.get(result['status'], 0) + 1 + + # Track entity profiles + profile = result.get('entity_profile', 'NONE') + entity_profiles_seen[profile] = entity_profiles_seen.get(profile, 0) + 1 + + if result['status'] in ('updated', 'would_update'): + updates.append(result) + if args.verbose: + print(f" {result['old_ghcid']} -> {result['new_ghcid']}") + print(f" Matched: {result['geonames_match']}") + print(f" Entity: {result.get('entity_profile', 'N/A')}") + + conn.close() + + # Print summary + print() + print("=" * 60) + print("SUMMARY") + print("=" * 60) + print(f"Total files processed: {len(files)}") + print() + print("Results:") + print(f" Updated: {stats.get('updated', 0)}") + print(f" Would update (dry-run): {stats.get('would_update', 0)}") + print(f" Unchanged: {stats.get('unchanged', 0)}") + print(f" Already geocoded: {stats.get('already_geocoded', 0)}") + print(f" No coordinates: {stats.get('no_coordinates', 0)}") + print(f" No country code: {stats.get('no_country', 0)}") + print(f" Geocode failed: 
{stats.get('geocode_failed', 0)}") + print(f" Invalid entity profile: {stats.get('invalid_entity_profile', 0)}") + print(f" Errors: {stats.get('error', 0)}") + + # Print entity profile breakdown + if entity_profiles_seen: + print() + print("CH-Annotator Entity Profiles:") + for profile, count in sorted(entity_profiles_seen.items(), key=lambda x: -x[1])[:10]: + print(f" {profile}: {count}") + + # Save report + timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S') + report_file = REPORTS_DIR / f"GEONAMES_UPDATE_REPORT_{timestamp}.md" + + with open(report_file, 'w') as f: + f.write("# GeoNames GHCID Update Report\n\n") + f.write(f"Generated: {datetime.now(timezone.utc).isoformat()}\n\n") + f.write("## Summary\n\n") + f.write(f"| Metric | Count |\n") + f.write(f"|--------|-------|\n") + f.write(f"| Files processed | {len(files)} |\n") + f.write(f"| Updated | {stats.get('updated', 0)} |\n") + f.write(f"| Would update | {stats.get('would_update', 0)} |\n") + f.write(f"| Unchanged | {stats.get('unchanged', 0)} |\n") + f.write(f"| Already geocoded | {stats.get('already_geocoded', 0)} |\n") + f.write(f"| No coordinates | {stats.get('no_coordinates', 0)} |\n") + f.write(f"| Geocode failed | {stats.get('geocode_failed', 0)} |\n") + f.write(f"| Invalid entity profile | {stats.get('invalid_entity_profile', 0)} |\n") + f.write(f"| Errors | {stats.get('error', 0)} |\n") + + # Entity profile breakdown + if entity_profiles_seen: + f.write("\n## CH-Annotator Entity Profiles\n\n") + f.write("| Entity Profile | Count |\n") + f.write("|---------------|-------|\n") + for profile, count in sorted(entity_profiles_seen.items(), key=lambda x: -x[1]): + f.write(f"| {profile} | {count} |\n") + + if updates: + f.write("\n## Updates\n\n") + f.write("| Old GHCID | New GHCID | Matched City | Entity Profile |\n") + f.write("|-----------|-----------|-------------|----------------|\n") + for u in updates[:100]: # Limit to first 100 + city = u.get('geonames_match', {}).get('city', 'N/A') + 
profile = u.get('entity_profile', 'N/A') + f.write(f"| {u['old_ghcid']} | {u['new_ghcid']} | {city} | {profile} |\n") + + if len(updates) > 100: + f.write(f"\n*... and {len(updates) - 100} more updates*\n") + + print() + print(f"Report saved to: {report_file}") + + +if __name__ == '__main__': + main()