#!/usr/bin/env python3
"""
Resolve location data for NL-XX-XXX-PENDING files that have city names
in their filename.

This script:
1. Scans PENDING files for Dutch city names in their filename
2. Looks up the city in GeoNames database
3. Updates the YAML with location data
4. Generates proper GHCID
5. Renames files to match new GHCID

Usage:
    python scripts/resolve_pending_locations.py --dry-run  # Preview changes
    python scripts/resolve_pending_locations.py            # Apply changes
"""

import argparse
import hashlib
import re
import sqlite3
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional

import yaml

# GHCID namespace UUID (RFC 4122 DNS namespace)
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')

# Netherlands admin1 code to ISO 3166-2 province code mapping
ADMIN1_TO_PROVINCE = {
    '01': 'DR',  # Drenthe
    '02': 'FR',  # Friesland
    '03': 'GE',  # Gelderland
    '04': 'GR',  # Groningen
    '05': 'LI',  # Limburg
    '06': 'NB',  # Noord-Brabant
    '07': 'NH',  # Noord-Holland
    '09': 'UT',  # Utrecht
    '10': 'ZE',  # Zeeland
    '11': 'ZH',  # Zuid-Holland
    '15': 'OV',  # Overijssel
    '16': 'FL',  # Flevoland
}

# Dutch cities to search for in filenames (lowercase for matching).
# NOTE: matched in list order, so multi-word forms like 'den-haag' must
# appear before any shorter form they contain.
DUTCH_CITIES = [
    'amsterdam', 'rotterdam', 'den-haag', 'the-hague', 'utrecht',
    'eindhoven', 'groningen', 'tilburg', 'almere', 'breda',
    'nijmegen', 'apeldoorn', 'haarlem', 'arnhem', 'enschede',
    'amersfoort', 'zaanstad', 'haarlemmermeer', 's-hertogenbosch',
    'hertogenbosch', 'den-bosch', 'zwolle', 'zoetermeer', 'leiden',
    'maastricht', 'dordrecht', 'ede', 'delft', 'alkmaar', 'venlo',
    'deventer', 'hilversum', 'heerlen', 'leeuwarden', 'lelystad',
    'roosendaal', 'middelburg', 'oss', 'helmond', 'almelo', 'gouda',
    'vlissingen', 'hoorn'
]

# Map filename city patterns to GeoNames search names
CITY_FILENAME_MAP = {
    'den-haag': 'The Hague',
    'the-hague': 'The Hague',
    's-hertogenbosch': "'s-Hertogenbosch",
    'hertogenbosch': "'s-Hertogenbosch",
    'den-bosch': "'s-Hertogenbosch",
}

# Institution type mapping from institution_type field
INST_TYPE_MAP = {
    'ARCHIVE': 'A',
    'BOTANICAL_ZOO': 'B',
    'CORPORATION': 'C',
    'DIGITAL_PLATFORM': 'D',
    'EDUCATION_PROVIDER': 'E',
    'FEATURES': 'F',
    'GALLERY': 'G',
    'HOLY_SITES': 'H',
    'INTANGIBLE_HERITAGE_GROUP': 'I',
    'LIBRARY': 'L',
    'MUSEUM': 'M',
    'NGO': 'N',
    'OFFICIAL_INSTITUTION': 'O',
    'PERSONAL_COLLECTION': 'P',
    'RESEARCH_CENTER': 'R',
    'COLLECTING_SOCIETY': 'S',
    'TASTE_SMELL': 'T',
    'UNKNOWN': 'U',
    'MIXED': 'X',
}

# Valid feature codes for settlements (not neighborhoods)
VALID_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC',
                       'PPLS', 'PPLG')


def extract_city_from_filename(filename: str) -> Optional[str]:
    """Extract a Dutch city name from a PENDING filename.

    Returns the GeoNames-style city name (e.g. "The Hague") if one of the
    known DUTCH_CITIES appears as a hyphen-delimited token in the filename,
    otherwise None.
    """
    # Remove extension and prefix
    name = filename.replace('.yaml', '').replace('NL-XX-XXX-PENDING-', '')
    name_lower = name.lower()

    for city in DUTCH_CITIES:
        # The city must be bounded by '-' or the string edges so that e.g.
        # 'ede' does not match inside 'deventer'.
        pattern = rf'(^|-)({re.escape(city)})(-|$)'
        if re.search(pattern, name_lower):
            # Map to proper GeoNames name where the filename form differs
            if city in CITY_FILENAME_MAP:
                return CITY_FILENAME_MAP[city]
            # Otherwise capitalize properly ('den-haag' -> 'Den Haag' style)
            return city.replace('-', ' ').title()
    return None


def lookup_city_geonames(db_path: str, city_name: str,
                         country_code: str = 'NL') -> Optional[dict]:
    """Look up a city in the GeoNames SQLite database.

    Tries, in order: exact (case-insensitive) name match, exact match on a
    normalized name (apostrophes stripped, "'s-" -> "s-"), then a fuzzy
    LIKE match. Returns a dict of location fields for the most populous
    match, or None if nothing matched.
    """
    exact_sql = """
        SELECT geonames_id, name, ascii_name, admin1_code, admin1_name,
               latitude, longitude, population, feature_code
        FROM cities
        WHERE country_code = ?
          AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
          AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
        ORDER BY population DESC
        LIMIT 1
    """
    fuzzy_sql = """
        SELECT geonames_id, name, ascii_name, admin1_code, admin1_name,
               latitude, longitude, population, feature_code
        FROM cities
        WHERE country_code = ?
          AND (LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?))
          AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
        ORDER BY population DESC
        LIMIT 1
    """

    # Normalize city name for the second search attempt
    search_name = city_name.replace("'s-", "s-").replace("'", "")

    attempts = [
        (exact_sql, city_name),
        (exact_sql, search_name),
        (fuzzy_sql, f"%{city_name}%"),
    ]

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        row = None
        for sql, name_param in attempts:
            cursor.execute(
                sql,
                (country_code, name_param, name_param) + VALID_FEATURE_CODES)
            row = cursor.fetchone()
            if row:
                break
    finally:
        # Always release the connection, even if a query raises.
        conn.close()

    if row:
        admin1_code = row[3] or ''
        # 'XX' marks provinces we have no ISO mapping for
        province_code = ADMIN1_TO_PROVINCE.get(admin1_code, 'XX')
        return {
            'geonames_id': row[0],
            'name': row[1],
            'ascii_name': row[2],
            'admin1_code': admin1_code,
            'admin1_name': row[4],
            'province_code': province_code,
            'latitude': row[5],
            'longitude': row[6],
            'population': row[7],
            'feature_code': row[8],
        }
    return None


def generate_city_code(city_name: str) -> str:
    """Generate a 3-letter city code from a city name.

    A few irregular Dutch names get hard-coded codes; otherwise the code is
    derived from the first letters/words of the name.
    """
    # Special mappings for names whose mechanical code would be ambiguous
    special_codes = {
        "'s-Hertogenbosch": "SHE",
        "The Hague": "DHA",
        "'s-Gravenhage": "SGR",
    }
    if city_name in special_codes:
        return special_codes[city_name]

    # Handle Dutch articles and prefixes
    name = city_name.replace("'", "").replace("-", " ")
    words = name.split()
    if len(words) == 1:
        return words[0][:3].upper()
    elif len(words) >= 2:
        dutch_articles = ['de', 'het', 'den', "'s"]
        if words[0].lower() in dutch_articles:
            # Article initial + first two letters of the next word
            return (words[0][0] + words[1][:2]).upper()
        else:
            # NOTE(review): for exactly two non-article words this yields a
            # 2-letter code, not 3 — presumably accepted; confirm intended.
            initials = ''.join(w[0] for w in words[:3])
            return initials.upper()
    # Fallback (only reachable when the name contains no word characters)
    return city_name[:3].upper()


def generate_abbreviation(emic_name: str) -> str:
    """Generate an abbreviation (max 10 chars) from an emic name.

    Takes the first letter of each significant word, skipping Dutch and
    English articles/prepositions.
    """
    # Skip words (articles, prepositions)
    skip_words = {'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te',
                  'den', 'der', 'des', "'s", 'aan', 'bij', 'met', 'naar',
                  'om', 'tot', 'uit', 'over', 'onder', 'door', 'en', 'of',
                  'the', 'a', 'an', 'and', 'or', 'for', 'to', 'at', 'by',
                  'with', 'from'}

    # Strip punctuation, keep word characters and whitespace
    name = re.sub(r'[^\w\s]', '', emic_name)
    words = name.split()

    # Take the first letter of each significant word
    initials = []
    for word in words:
        if word.lower() not in skip_words and word:
            initials.append(word[0].upper())

    abbrev = ''.join(initials[:10])  # Max 10 chars
    # If everything was skipped, fall back to the raw name's first 3 chars
    return abbrev if abbrev else emic_name[:3].upper()


def generate_ghcid_identifiers(ghcid_string: str) -> dict:
    """Generate all GHCID identifier formats for a GHCID string.

    Returns a dict of string values: a UUIDv5 (primary), a UUIDv8 built
    from SHA-256 (secondary), and a 64-bit numeric id.
    """
    # UUID v5 (SHA-1) - PRIMARY
    uuid_v5 = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)

    # Hash once; both the UUIDv8 and the numeric id are slices of it.
    full_hash = hashlib.sha256(ghcid_string.encode()).digest()

    # UUID v8 (SHA-256) - Secondary
    sha256_hash = bytearray(full_hash[:16])
    sha256_hash[6] = (sha256_hash[6] & 0x0F) | 0x80  # Version 8
    sha256_hash[8] = (sha256_hash[8] & 0x3F) | 0x80  # Variant
    uuid_sha256 = uuid.UUID(bytes=bytes(sha256_hash))

    # Numeric (64-bit from SHA-256)
    numeric = int.from_bytes(full_hash[:8], 'big')

    return {
        'ghcid_uuid': str(uuid_v5),
        'ghcid_uuid_sha256': str(uuid_sha256),
        'ghcid_numeric': str(numeric),
    }


def update_yaml_with_location(data: dict, geonames_data: dict,
                              new_ghcid: str, old_ghcid: str,
                              timestamp: str) -> dict:
    """Update YAML data in place with location and GHCID information.

    Adds/updates the locations array, the ghcid section (identifiers,
    location_resolution, history), the top-level ghcid_current, and a
    provenance note. Returns the same (mutated) dict.
    """
    identifiers = generate_ghcid_identifiers(new_ghcid)

    # Add locations array if missing
    if 'locations' not in data or not data['locations']:
        data['locations'] = []

    location_entry = {
        'city': geonames_data['name'],
        'region_code': geonames_data['province_code'],
        'country': 'NL',
        'geonames_id': geonames_data['geonames_id'],
        'latitude': geonames_data['latitude'],
        'longitude': geonames_data['longitude'],
    }

    # Only add if not already present; new entry goes first
    existing_cities = [loc.get('city') for loc in data['locations']]
    if geonames_data['name'] not in existing_cities:
        data['locations'].insert(0, location_entry)

    # Update ghcid section
    if 'ghcid' not in data:
        data['ghcid'] = {}
    ghcid_section = data['ghcid']
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = identifiers['ghcid_uuid']
    ghcid_section['ghcid_uuid_sha256'] = identifiers['ghcid_uuid_sha256']
    ghcid_section['ghcid_numeric'] = int(identifiers['ghcid_numeric'])
    ghcid_section['generation_timestamp'] = timestamp

    # Record how the location was resolved
    ghcid_section['location_resolution'] = {
        'method': 'FILENAME_CITY_EXTRACTION',
        'geonames_id': geonames_data['geonames_id'],
        'geonames_name': geonames_data['name'],
        'feature_code': geonames_data['feature_code'],
        'population': geonames_data['population'],
        'admin1_code': geonames_data['admin1_code'],
        'region_code': geonames_data['province_code'],
        'country_code': 'NL',
    }

    # Append to ghcid_history
    if 'ghcid_history' not in ghcid_section:
        ghcid_section['ghcid_history'] = []
    ghcid_section['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': int(identifiers['ghcid_numeric']),
        'valid_from': timestamp,
        'valid_to': None,
        'reason': f"Location resolved from filename: {old_ghcid} -> {new_ghcid}",
    })

    # Update top-level ghcid_current
    data['ghcid_current'] = new_ghcid

    # Add provenance note (only when notes is a plain string)
    if 'provenance' in data:
        data['provenance'].setdefault('notes', '')
        notes = data['provenance'].get('notes', '')
        if isinstance(notes, str):
            data['provenance']['notes'] = notes + f"\nLocation resolved from filename on {timestamp}."
    return data


def check_collision(custodian_dir: Path, new_ghcid: str,
                    old_filepath: Path) -> bool:
    """Check if the new GHCID would collide with an existing file."""
    new_filepath = custodian_dir / f"{new_ghcid}.yaml"
    return new_filepath.exists() and new_filepath != old_filepath


def find_resolvable_pending_files(custodian_dir: Path,
                                  db_path: str) -> List[dict]:
    """Find PENDING files that can be resolved via filename city extraction.

    Returns a list of dicts describing each resolvable file: its path, old
    and proposed new GHCIDs, the extracted city, GeoNames data, the parsed
    YAML data, and the emic name used for the abbreviation.
    """
    resolvable = []

    for filepath in sorted(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml')):
        filename = filepath.name

        # Try to extract city from filename
        city = extract_city_from_filename(filename)
        if not city:
            continue

        # Look up city in GeoNames
        geonames_data = lookup_city_geonames(db_path, city)
        if not geonames_data:
            # FIX: report the actual filename instead of a literal
            # "(unknown)" placeholder.
            print(f"WARNING: Could not find GeoNames data for '{city}' extracted from {filename}")
            continue

        # Load YAML to get institution type and emic name
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as e:
            print(f"Error loading {filepath}: {e}")
            continue

        # Get institution type code (defaults to 'U' for unknown)
        inst_type_str = data.get('institution_type', 'UNKNOWN')
        inst_type_code = INST_TYPE_MAP.get(inst_type_str, 'U')

        # Get emic name for abbreviation; fall back to the filename itself
        emic_name = data.get('custodian_name', {}).get('emic_name', '')
        if not emic_name:
            emic_name = filename.replace('NL-XX-XXX-PENDING-', '').replace('.yaml', '').replace('-', ' ')

        abbrev = generate_abbreviation(emic_name)

        # Build new GHCID: NL-<province>-<city>-<type>-<abbrev>
        city_code = generate_city_code(geonames_data['name'])
        new_ghcid = f"NL-{geonames_data['province_code']}-{city_code}-{inst_type_code}-{abbrev}"

        resolvable.append({
            'filepath': filepath,
            'old_ghcid': filename.replace('.yaml', ''),
            'new_ghcid': new_ghcid,
            'city': city,
            'geonames_data': geonames_data,
            'data': data,
            'emic_name': emic_name,
        })

    return resolvable


def main():
    """CLI entry point. Returns 0 on success, 1 on errors."""
    parser = argparse.ArgumentParser(
        description='Resolve location data for PENDING custodian files')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview changes without applying')
    parser.add_argument('--custodian-dir', default='data/custodian',
                        help='Path to custodian directory')
    parser.add_argument('--geonames-db', default='data/reference/geonames.db',
                        help='Path to GeoNames database')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit number of files to process (0 = no limit)')
    args = parser.parse_args()

    # Resolve paths relative to the repository root (scripts/..)
    script_dir = Path(__file__).parent.parent
    custodian_dir = script_dir / args.custodian_dir
    db_path = script_dir / args.geonames_db

    if not custodian_dir.exists():
        print(f"ERROR: Custodian directory not found: {custodian_dir}")
        return 1
    if not db_path.exists():
        print(f"ERROR: GeoNames database not found: {db_path}")
        return 1

    print("=" * 80)
    print("PENDING File Location Resolver")
    print("=" * 80)
    print(f"Custodian directory: {custodian_dir}")
    print(f"GeoNames database: {db_path}")
    print(f"Mode: {'DRY RUN (preview only)' if args.dry_run else 'LIVE (applying changes)'}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print()

    # Find resolvable files
    print("Scanning for PENDING files with city names in filename...")
    resolvable = find_resolvable_pending_files(custodian_dir, str(db_path))
    if args.limit:
        resolvable = resolvable[:args.limit]

    print(f"Found {len(resolvable)} files that can be resolved")
    print()

    if not resolvable:
        print("No resolvable files found. Exiting.")
        return 0

    # One timestamp shared by all updates in this run
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    resolved_count = 0
    skipped_count = 0
    errors = []

    for item in resolvable:
        old_ghcid = item['old_ghcid']
        new_ghcid = item['new_ghcid']
        city = item['city']
        filepath = item['filepath']
        geonames_data = item['geonames_data']
        data = item['data']
        emic_name = item['emic_name']

        print(f"{'[DRY RUN] ' if args.dry_run else ''}Processing: {old_ghcid}")
        print(f"  Emic name: {emic_name}")
        print(f"  City extracted: {city}")
        print(f"  Province: {geonames_data['province_code']} ({geonames_data.get('admin1_name', 'Unknown')})")
        print(f"  New GHCID: {new_ghcid}")

        # Skip if the target filename already exists
        if check_collision(custodian_dir, new_ghcid, filepath):
            print(f"  SKIPPED: Collision - {new_ghcid}.yaml already exists")
            skipped_count += 1
            print()
            continue

        # Generate identifiers for display
        identifiers = generate_ghcid_identifiers(new_ghcid)
        print(f"  UUID v5: {identifiers['ghcid_uuid']}")

        if args.dry_run:
            print(f"  Would rename: {filepath.name} -> {new_ghcid}.yaml")
            print()
            resolved_count += 1
            continue

        try:
            # Update YAML data
            updated_data = update_yaml_with_location(
                data, geonames_data, new_ghcid, old_ghcid, timestamp)

            # Write updated YAML to the new file, then remove the old one
            new_filepath = filepath.parent / f"{new_ghcid}.yaml"
            with open(new_filepath, 'w', encoding='utf-8') as f:
                yaml.dump(updated_data, f, default_flow_style=False,
                          allow_unicode=True, sort_keys=False)
            filepath.unlink()
            print(f"  Renamed: {filepath.name} -> {new_filepath.name}")
            resolved_count += 1
        except Exception as e:
            error_msg = f"Error processing {filepath}: {e}"
            print(f"  ERROR: {e}")
            errors.append(error_msg)
        print()

    # Summary
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total resolvable files: {len(resolvable)}")
    print(f"Successfully {'would resolve' if args.dry_run else 'resolved'}: {resolved_count}")
    print(f"Skipped (collisions): {skipped_count}")
    print(f"Errors: {len(errors)}")
    if errors:
        print("\nErrors:")
        for error in errors:
            print(f"  - {error}")
    if args.dry_run:
        print("\nThis was a dry run. Run without --dry-run to apply changes.")

    return 0 if not errors else 1


if __name__ == '__main__':
    sys.exit(main())