#!/usr/bin/env python3
"""
Generate GHCIDs for KIEN intangible heritage custodian entries.

This script is a targeted version of enrich_nde_entries_ghcid.py that only
processes KIEN entries (entry_index 1674-1860) to avoid processing the
entire NDE dataset.

Pipeline (see process_kien_entries):
  1. Load KIEN YAML entries and resolve each one's location via GeoNames.
  2. Build a base GHCID string (COUNTRY-REGION-CITY-TYPE-ABBREV) per entry.
  3. Detect and resolve base-GHCID collisions with a snake_case name suffix.
  4. Derive the four identifier formats (string, UUIDv5, UUIDv8, numeric)
     plus a UUIDv7 record id, and splice them into each entry.
  5. Write the updated YAML files and a JSON collision report
     (skipped under --dry-run).

Usage:
    python scripts/enrich_kien_ghcid.py [--dry-run]
"""

import argparse
import hashlib
import json
import math
import os
import re
import sqlite3
import sys
import time
import unicodedata
import uuid
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple

import yaml

# Project root (scripts/ lives one level below it)
PROJECT_ROOT = Path(__file__).parent.parent

# GHCID UUID v5 Namespace (this is the RFC 4122 DNS namespace UUID,
# i.e. uuid.NAMESPACE_DNS; kept as an explicit literal for clarity)
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')

# GeoNames admin1 code to ISO 3166-2 NL mapping.
# NOTE: GeoNames has no "08" for NL; unknown codes fall back to "00".
GEONAMES_ADMIN1_TO_ISO_NL = {
    "01": "DR",  # Drenthe
    "02": "FR",  # Friesland
    "03": "GE",  # Gelderland
    "04": "GR",  # Groningen
    "05": "LI",  # Limburg
    "06": "NB",  # Noord-Brabant
    "07": "NH",  # Noord-Holland
    "09": "UT",  # Utrecht
    "10": "ZE",  # Zeeland
    "11": "ZH",  # Zuid-Holland
    "15": "OV",  # Overijssel
    "16": "FL",  # Flevoland
}

# Dutch articles/prepositions (plus generic org words) to skip when
# generating abbreviations from institution names
DUTCH_SKIP_WORDS = {
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
    "'s", 'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder',
    'door', 'en', 'of', 'stichting', 'vereniging', 'foundation', 'the', 'a', 'an'
}

# Valid GeoNames feature codes (settlements, not neighborhoods)
VALID_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')


def generate_uuid_v7() -> uuid.UUID:
    """Generate a UUID v7 (time-ordered, for database records).

    Layout per RFC 9562: 48-bit Unix-epoch millisecond timestamp,
    4-bit version (7), 12 random bits, 2-bit variant (0b10), 62 random bits.
    """
    # First 6 bytes: current time in milliseconds (48 bits, big-endian)
    timestamp_ms = int(time.time() * 1000)
    uuid_bytes = bytearray(16)
    uuid_bytes[0:6] = timestamp_ms.to_bytes(6, byteorder='big')

    # Byte 6: version nibble (0x7) + top 4 of 12 random bits;
    # byte 7: remaining 8 random bits
    random_a = int.from_bytes(os.urandom(2), byteorder='big')
    uuid_bytes[6] = 0x70 | ((random_a >> 8) & 0x0F)
    uuid_bytes[7] = random_a & 0xFF

    # Byte 8: variant bits (0b10) + top 6 random bits;
    # bytes 9-15: remaining 56 random bits
    random_b = int.from_bytes(os.urandom(8), byteorder='big')
    uuid_bytes[8] = 0x80 | ((random_b >> 56) & 0x3F)
    uuid_bytes[9:16] = random_b.to_bytes(8, byteorder='big')[1:]

    return uuid.UUID(bytes=bytes(uuid_bytes))


def normalize_city_name(city_name: str) -> str:
    """Strip accents and apostrophes from a city name for code generation."""
    # NFD decomposition separates base characters from combining accents,
    # then category 'Mn' (combining marks) are dropped
    normalized = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    # Remove apostrophes and backticks (e.g. 's-Gravenhage variants)
    ascii_name = re.sub(r"[''`]", '', ascii_name)
    return ascii_name


def get_city_code(city_name: str) -> str:
    """Generate a 3-letter uppercase city code from a city name.

    Single word -> first 3 letters; article + name -> article initial plus
    first 2 of the next word; otherwise initials of the first 3 words.
    Always returns exactly 3 chars in [A-Z], padding/substituting with 'X'.
    """
    if not city_name:
        return "XXX"
    normalized = normalize_city_name(city_name)
    words = normalized.split()
    if not words:
        return "XXX"

    articles = {'de', 'het', 'den', "'s", 'op', 'aan', 'bij', 'ter'}
    if len(words) == 1:
        code = words[0][:3].upper()
    elif words[0].lower() in articles:
        # e.g. "Den Haag" -> "DHA" (len(words) > 1 is guaranteed here)
        code = (words[0][0] + words[1][:2]).upper()
    else:
        code = ''.join(w[0] for w in words[:3]).upper()

    # Force exactly 3 characters, then replace any non-letter with 'X'
    if len(code) < 3:
        code = code.ljust(3, 'X')
    elif len(code) > 3:
        code = code[:3]
    code = re.sub(r'[^A-Z]', 'X', code)
    return code


def extract_abbreviation_from_name(name: str) -> str:
    """Extract abbreviation from institution name using first letters of significant words.

    Words in DUTCH_SKIP_WORDS and pure digits are ignored; if nothing is
    left, falls back to the first 3 words. At most 10 initials are used.
    """
    if not name:
        return "INST"

    # Accent-strip as in normalize_city_name
    normalized = unicodedata.normalize('NFD', name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    # Remove punctuation except hyphens and apostrophes-within-words
    cleaned = re.sub(r"[''`\",.:;!?()[\]{}]", '', ascii_name)
    words = cleaned.split()

    significant = []
    for word in words:
        word_lower = word.lower()
        if word_lower not in DUTCH_SKIP_WORDS and not word.isdigit():
            significant.append(word)
    if not significant:
        significant = words[:3]  # Fallback to first 3 words

    abbrev = ''.join(w[0].upper() for w in significant[:10] if w)
    return abbrev if abbrev else "INST"


def generate_name_suffix(institution_name: str) -> str:
    """Generate a snake_case name suffix (max 50 chars) for collision resolution."""
    if not institution_name:
        return "unknown"

    # Accent-strip, lowercase, drop punctuation
    normalized = unicodedata.normalize('NFD', institution_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    lowercase = ascii_name.lower()
    no_punct = re.sub(r"[''`\",.:;!?()[\]{}]", '', lowercase)
    # Spaces, hyphens and slashes become underscores; everything else
    # non-alphanumeric is removed, then runs of '_' are collapsed
    underscored = re.sub(r'[\s\-/]+', '_', no_punct)
    clean = re.sub(r'[^a-z0-9_]', '', underscored)
    final = re.sub(r'_+', '_', clean).strip('_')

    if len(final) > 50:
        final = final[:50].rstrip('_')
    return final if final else "unknown"


def reverse_geocode(lat: float, lon: float, db_path: Path) -> Optional[dict]:
    """Reverse geocode coordinates to the nearest NL settlement in GeoNames.

    Uses an equirectangular approximation: the squared longitude delta is
    scaled by cos^2(lat) so east-west degrees are not overweighted (at ~52N
    one degree of longitude is ~68 km vs ~111 km per degree of latitude).
    Returns a dict with city/region codes and resolution metadata, or None
    if no settlement with population >= 100 matches.
    """
    # Scale factor making distance_sq isotropic in "latitude-degree" units,
    # so sqrt(distance_sq) * 111 approximates kilometers in any direction.
    cos_lat_sq = math.cos(math.radians(lat)) ** 2

    conn = sqlite3.connect(str(db_path))
    cursor = conn.cursor()
    try:
        query = """
            SELECT name, ascii_name, admin1_code, geonames_id, population,
                   feature_code,
                   ((latitude - ?) * (latitude - ?) +
                    (longitude - ?) * (longitude - ?) * ?) as distance_sq
            FROM cities
            WHERE country_code = 'NL'
              AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
              AND population >= 100
            ORDER BY distance_sq
            LIMIT 1
        """
        cursor.execute(query, (lat, lat, lon, lon, cos_lat_sq, *VALID_FEATURE_CODES))
        row = cursor.fetchone()
        if row:
            name, ascii_name, admin1_code, geonames_id, population, feature_code, dist_sq = row
            region_code = GEONAMES_ADMIN1_TO_ISO_NL.get(admin1_code, "00")
            return {
                'city': name,
                'city_code': get_city_code(name),
                'region_code': region_code,
                'admin1_code': admin1_code,
                'geonames_id': geonames_id,
                'feature_code': feature_code,
                'population': population,
                # ~111 km per degree of latitude
                'distance_km': (dist_sq ** 0.5) * 111,
            }
    finally:
        conn.close()
    return None


def lookup_city_by_name(city_name: str, db_path: Path) -> Optional[dict]:
    """Look up an NL settlement in GeoNames by exact (or ASCII) name.

    Ties are broken by highest population. Returns the same dict shape as
    reverse_geocode (minus distance_km), or None if not found.
    """
    conn = sqlite3.connect(str(db_path))
    cursor = conn.cursor()
    try:
        query = """
            SELECT name, admin1_code, geonames_id, population, feature_code
            FROM cities
            WHERE country_code = 'NL'
              AND (name = ? OR ascii_name = ?)
              AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
            ORDER BY population DESC
            LIMIT 1
        """
        cursor.execute(query, (city_name, city_name, *VALID_FEATURE_CODES))
        row = cursor.fetchone()
        if row:
            name, admin1_code, geonames_id, population, feature_code = row
            region_code = GEONAMES_ADMIN1_TO_ISO_NL.get(admin1_code, "00")
            return {
                'city': name,
                'city_code': get_city_code(name),
                'region_code': region_code,
                'admin1_code': admin1_code,
                'geonames_id': geonames_id,
                'feature_code': feature_code,
                'population': population,
            }
    finally:
        conn.close()
    return None


def extract_entry_data(entry: dict, db_path: Path) -> dict:
    """Extract data for GHCID generation from a KIEN entry.

    Resolves the institution name (custodian_name > original_entry.organisatie
    > kien_enrichment.kien_name), the type code, and the location. Location
    resolution is tried in order: reverse geocode from coordinates, name
    lookup in GeoNames, and finally a TEXT_FALLBACK flagged for review.
    """
    # Name: first non-empty of the three known sources
    name = None
    if 'custodian_name' in entry and entry['custodian_name'].get('claim_value'):
        name = entry['custodian_name']['claim_value']
    if not name and 'original_entry' in entry:
        name = entry['original_entry'].get('organisatie')
    if not name and 'kien_enrichment' in entry:
        name = entry['kien_enrichment'].get('kien_name')
    if not name:
        name = "Unknown Institution"

    # Type code - KIEN entries are type I (Intangible Heritage) or T (Taste/Smell)
    type_code = 'I'  # Default for KIEN
    if 'original_entry' in entry and 'type' in entry['original_entry']:
        types = entry['original_entry']['type']
        if isinstance(types, list) and types:
            type_code = types[0]
        elif isinstance(types, str):
            type_code = types

    # Location data
    city = None
    region_code = "00"
    geonames_id = None
    location_resolution = None

    # Coordinates (and a possible city name) come from the first location
    lat, lon = None, None
    if 'locations' in entry and entry['locations']:
        loc = entry['locations'][0]
        lat = loc.get('latitude')
        lon = loc.get('longitude')
        city = loc.get('city')

    # Preferred: reverse geocode from coordinates
    if lat is not None and lon is not None:
        geo_result = reverse_geocode(lat, lon, db_path)
        if geo_result:
            city = geo_result['city']
            region_code = geo_result['region_code']
            geonames_id = geo_result['geonames_id']
            location_resolution = {
                'method': 'REVERSE_GEOCODE',
                'geonames_id': geonames_id,
                'geonames_name': city,
                'feature_code': geo_result['feature_code'],
                'population': geo_result['population'],
                'admin1_code': geo_result['admin1_code'],
                'region_code': region_code,
                'country_code': 'NL',
                'source_coordinates': {'latitude': lat, 'longitude': lon},
                'distance_km': geo_result['distance_km'],
            }

    # Fallback: look the city name up in GeoNames; if that also fails,
    # keep the raw text and flag the entry for manual review
    if city and not location_resolution:
        geo_result = lookup_city_by_name(city, db_path)
        if geo_result:
            region_code = geo_result['region_code']
            geonames_id = geo_result['geonames_id']
            location_resolution = {
                'method': 'NAME_LOOKUP',
                'geonames_id': geonames_id,
                'geonames_name': geo_result['city'],
                'feature_code': geo_result['feature_code'],
                'population': geo_result['population'],
                'admin1_code': geo_result['admin1_code'],
                'region_code': region_code,
                'country_code': 'NL',
            }
        else:
            location_resolution = {
                'method': 'TEXT_FALLBACK',
                'city_name': city,
                'needs_review': True,
            }

    return {
        'name': name,
        'type_code': type_code,
        'city': city,
        'city_code': get_city_code(city) if city else "XXX",
        'region_code': region_code,
        'country_code': 'NL',
        'geonames_id': geonames_id,
        'location_resolution': location_resolution,
    }


def generate_ghcid(data: dict) -> Tuple[str, dict]:
    """Build the base GHCID string (COUNTRY-REGION-CITY-TYPE-ABBREV).

    Returns the string plus a dict of its components.
    """
    country = data['country_code']
    region = data['region_code']
    city = data['city_code']
    inst_type = data['type_code']
    abbrev = extract_abbreviation_from_name(data['name'])

    base_ghcid = f"{country}-{region}-{city}-{inst_type}-{abbrev}"
    return base_ghcid, {
        'country': country,
        'region': region,
        'city': city,
        'type': inst_type,
        'abbrev': abbrev,
    }


def generate_identifier_formats(final_ghcid: str) -> dict:
    """Generate all 4 identifier formats from the final GHCID string.

    - ghcid_uuid: deterministic UUID v5 (SHA-1) in the GHCID namespace
    - ghcid_uuid_sha256: deterministic UUID v8 carved out of SHA-256
    - ghcid_numeric: unsigned 64-bit int from the first 8 SHA-256 bytes
    - record_id: fresh (non-deterministic) UUID v7
    """
    ghcid_uuid = uuid.uuid5(GHCID_NAMESPACE, final_ghcid)

    # UUID v8: first 16 bytes of SHA-256 with version/variant bits forced
    hash_bytes = hashlib.sha256(final_ghcid.encode('utf-8')).digest()
    uuid_bytes = bytearray(hash_bytes[:16])
    uuid_bytes[6] = (uuid_bytes[6] & 0x0F) | 0x80  # Version 8
    uuid_bytes[8] = (uuid_bytes[8] & 0x3F) | 0x80  # Variant RFC 4122
    ghcid_uuid_sha256 = uuid.UUID(bytes=bytes(uuid_bytes))

    ghcid_numeric = int.from_bytes(hash_bytes[:8], byteorder='big', signed=False)
    record_id = generate_uuid_v7()

    return {
        'ghcid_uuid': str(ghcid_uuid),
        'ghcid_uuid_sha256': str(ghcid_uuid_sha256),
        'ghcid_numeric': ghcid_numeric,
        'record_id': str(record_id),
    }


def process_kien_entries(entries_dir: Path, db_path: Path, dry_run: bool = False) -> dict:
    """Process KIEN entries (index 1674-1860) and generate GHCIDs.

    Runs the five phases described in the module docstring and returns a
    stats dict. When dry_run is True nothing is written to disk.
    """
    stats = {
        'total': 0,
        'processed': 0,
        'with_location': 0,
        'without_location': 0,
        'already_has_ghcid': 0,
        'collisions': 0,
        'collision_groups': 0,
        'files_updated': 0,
        'errors': [],
    }
    timestamp = datetime.now(timezone.utc).isoformat()

    # Find KIEN entries (1674-1860) by the numeric prefix of the filename
    kien_files = []
    for f in entries_dir.glob("*.yaml"):
        match = re.match(r'^(\d+)_', f.name)
        if match:
            idx = int(match.group(1))
            if 1674 <= idx <= 1860:
                kien_files.append(f)

    def get_entry_index(filepath: Path) -> int:
        match = re.match(r'^(\d+)_', filepath.name)
        return int(match.group(1)) if match else 0

    kien_files.sort(key=get_entry_index)
    stats['total'] = len(kien_files)
    print(f"Found {len(kien_files)} KIEN entries")

    # Phase 1: Load entries and extract data
    print("\nPhase 1: Loading entries and extracting location data...")
    entries_data = []
    for filepath in kien_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)
            if not entry:
                continue

            # Entries that already carry a GHCID are left untouched
            if 'ghcid' in entry and entry['ghcid'].get('ghcid_current'):
                stats['already_has_ghcid'] += 1
                continue

            data = extract_entry_data(entry, db_path)
            # A city is required to build the CITY component; skip otherwise
            if not data['city']:
                stats['without_location'] += 1
                continue
            stats['with_location'] += 1

            base_ghcid, components = generate_ghcid(data)
            entries_data.append({
                'filepath': filepath,
                'entry': entry,
                'data': data,
                'base_ghcid': base_ghcid,
                'components': components,
            })
        except Exception as e:
            # Per-file failures are recorded, not fatal
            stats['errors'].append(f"{filepath.name}: {str(e)}")

    print(f"  Entries with location: {stats['with_location']}")
    print(f"  Entries without location: {stats['without_location']}")
    print(f"  Already have GHCID: {stats['already_has_ghcid']}")

    # Phase 2: Detect collisions (entries sharing the same base GHCID)
    print("\nPhase 2: Detecting collisions...")
    collision_groups = defaultdict(list)
    for ed in entries_data:
        collision_groups[ed['base_ghcid']].append(ed)
    for base_ghcid, group in collision_groups.items():
        if len(group) > 1:
            stats['collision_groups'] += 1
            stats['collisions'] += len(group)
    print(f"  Collision groups: {stats['collision_groups']}")
    print(f"  Entries with collisions: {stats['collisions']}")

    # Phase 3: Resolve collisions — every member of a colliding group gets
    # a snake_case name suffix (no member keeps the bare base GHCID)
    print("\nPhase 3: Resolving collisions...")
    collision_report = []
    for base_ghcid, group in collision_groups.items():
        if len(group) > 1:
            collision_report.append({
                'base_ghcid': base_ghcid,
                'count': len(group),
                'institutions': [ed['data']['name'] for ed in group],
            })
            for ed in group:
                name_suffix = generate_name_suffix(ed['data']['name'])
                ed['final_ghcid'] = f"{base_ghcid}-{name_suffix}"
                ed['had_collision'] = True
        else:
            ed = group[0]
            ed['final_ghcid'] = base_ghcid
            ed['had_collision'] = False

    # Phase 4: Generate identifiers and update entries in memory
    print("\nPhase 4: Generating identifiers and updating entries...")
    for ed in entries_data:
        final_ghcid = ed['final_ghcid']
        ids = generate_identifier_formats(final_ghcid)

        ghcid_block = {
            'ghcid_current': final_ghcid,
            'ghcid_original': final_ghcid,
            'ghcid_uuid': ids['ghcid_uuid'],
            'ghcid_uuid_sha256': ids['ghcid_uuid_sha256'],
            'ghcid_numeric': ids['ghcid_numeric'],
            'record_id': ids['record_id'],
            'generation_timestamp': timestamp,
            'ghcid_history': [
                {
                    'ghcid': final_ghcid,
                    'ghcid_numeric': ids['ghcid_numeric'],
                    'valid_from': timestamp,
                    'valid_to': None,
                    'reason': 'Initial GHCID assignment (KIEN batch import December 2025)' + (' - name suffix added to resolve collision' if ed.get('had_collision') else ''),
                }
            ],
        }

        # Attach location-resolution provenance when available
        if ed['data'].get('location_resolution'):
            ghcid_block['location_resolution'] = ed['data']['location_resolution']
        if ed['data'].get('geonames_id'):
            ghcid_block['geonames_id'] = ed['data']['geonames_id']
        if ed.get('had_collision'):
            ghcid_block['collision_resolved'] = True
            ghcid_block['base_ghcid_before_collision'] = ed['base_ghcid']

        entry = ed['entry']
        entry['ghcid'] = ghcid_block

        # Replace any stale GHCID-family identifiers, then add the new set
        if 'identifiers' not in entry:
            entry['identifiers'] = []
        entry['identifiers'] = [
            i for i in entry['identifiers']
            if i.get('identifier_scheme') not in ['GHCID', 'GHCID_NUMERIC', 'GHCID_UUID', 'GHCID_UUID_SHA256', 'RECORD_ID']
        ]
        entry['identifiers'].extend([
            {'identifier_scheme': 'GHCID', 'identifier_value': final_ghcid},
            {'identifier_scheme': 'GHCID_UUID', 'identifier_value': ids['ghcid_uuid'], 'identifier_url': f"urn:uuid:{ids['ghcid_uuid']}"},
            {'identifier_scheme': 'GHCID_UUID_SHA256', 'identifier_value': ids['ghcid_uuid_sha256'], 'identifier_url': f"urn:uuid:{ids['ghcid_uuid_sha256']}"},
            {'identifier_scheme': 'GHCID_NUMERIC', 'identifier_value': str(ids['ghcid_numeric'])},
            {'identifier_scheme': 'RECORD_ID', 'identifier_value': ids['record_id'], 'identifier_url': f"urn:uuid:{ids['record_id']}"},
        ])

        ed['entry'] = entry
        stats['processed'] += 1

    # Phase 5: Write updated entries (and collision report) unless dry-run
    if not dry_run:
        print("\nPhase 5: Writing updated entries...")
        for ed in entries_data:
            try:
                with open(ed['filepath'], 'w', encoding='utf-8') as f:
                    yaml.dump(ed['entry'], f, default_flow_style=False, allow_unicode=True, sort_keys=False)
                stats['files_updated'] += 1
            except Exception as e:
                stats['errors'].append(f"Write error {ed['filepath'].name}: {str(e)}")
        print(f"  Updated {stats['files_updated']} files")

        if collision_report:
            report_path = entries_dir.parent / "kien_ghcid_collision_report.json"
            report = {
                'generation_timestamp': timestamp,
                'total_kien_entries': stats['total'],
                'entries_with_ghcid': stats['processed'],
                'collision_groups': stats['collision_groups'],
                'entries_with_collisions': stats['collisions'],
                'collisions': collision_report,
            }
            with open(report_path, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, ensure_ascii=False)
            print(f"  Collision report: {report_path}")
    else:
        print("\nPhase 5: DRY RUN - no files written")

    return stats


def main():
    """CLI entry point: validate paths, run the pipeline, print a summary."""
    parser = argparse.ArgumentParser(description="Generate GHCIDs for KIEN entries")
    parser.add_argument('--dry-run', action='store_true', help="Preview changes without writing")
    args = parser.parse_args()

    entries_dir = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
    db_path = PROJECT_ROOT / "data" / "reference" / "geonames.db"

    print("="*70)
    print("KIEN HERITAGE CUSTODIAN GHCID GENERATION")
    print("="*70)
    print(f"Entries directory: {entries_dir}")
    print(f"GeoNames database: {db_path}")
    print(f"Dry run: {args.dry_run}")
    print()

    if not entries_dir.exists():
        print(f"ERROR: Entries directory not found: {entries_dir}")
        sys.exit(1)
    if not db_path.exists():
        print(f"ERROR: GeoNames database not found: {db_path}")
        sys.exit(1)

    stats = process_kien_entries(entries_dir, db_path, dry_run=args.dry_run)

    print()
    print("="*70)
    print("SUMMARY")
    print("="*70)
    print(f"Total KIEN entries: {stats['total']}")
    print(f"Already have GHCID: {stats['already_has_ghcid']}")
    print(f"Entries with location: {stats['with_location']}")
    print(f"Entries without location: {stats['without_location']}")
    print(f"GHCIDs generated: {stats['processed']}")
    print(f"Collision groups: {stats['collision_groups']}")
    print(f"Entries with collisions: {stats['collisions']}")
    print(f"Files updated: {stats['files_updated']}")

    if stats['errors']:
        print(f"\nErrors ({len(stats['errors'])}):")
        for err in stats['errors'][:5]:
            print(f"  - {err}")
        if len(stats['errors']) > 5:
            print(f"  ... and {len(stats['errors']) - 5} more")

    print()
    if args.dry_run:
        print("DRY RUN COMPLETE - No files modified")
    else:
        print("GHCID GENERATION COMPLETE")


if __name__ == "__main__":
    main()