glam/scripts/enrich_kien_ghcid.py
2025-12-05 15:30:23 +01:00

674 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Generate GHCIDs for KIEN intangible heritage custodian entries.
This script is a targeted version of enrich_nde_entries_ghcid.py that only
processes KIEN entries (entry_index 1674-1860) to avoid processing the
entire NDE dataset.
Usage:
python scripts/enrich_kien_ghcid.py [--dry-run]
"""
import argparse
import hashlib
import json
import re
import sqlite3
import sys
import unicodedata
import uuid
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple
import yaml
# Project root
PROJECT_ROOT = Path(__file__).parent.parent
# GHCID UUID v5 Namespace (DNS namespace from RFC 4122)
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
# GeoNames admin1 code to ISO 3166-2 NL mapping
GEONAMES_ADMIN1_TO_ISO_NL = {
"01": "DR", # Drenthe
"02": "FR", # Friesland
"03": "GE", # Gelderland
"04": "GR", # Groningen
"05": "LI", # Limburg
"06": "NB", # Noord-Brabant
"07": "NH", # Noord-Holland
"09": "UT", # Utrecht
"10": "ZE", # Zeeland
"11": "ZH", # Zuid-Holland
"15": "OV", # Overijssel
"16": "FL", # Flevoland
}
# Dutch articles/prepositions to skip in abbreviation generation
DUTCH_SKIP_WORDS = {
'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
"'s", 'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder',
'door', 'en', 'of', 'stichting', 'vereniging', 'foundation', 'the', 'a', 'an'
}
# Valid GeoNames feature codes (settlements, not neighborhoods)
VALID_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
def generate_uuid_v7() -> uuid.UUID:
    """Generate a UUID v7 (time-ordered, for database records).

    Bit layout: 48-bit Unix-epoch millisecond timestamp, 4-bit version
    field (7) plus 12 random bits, 2-bit variant field (10) plus 62
    random bits.

    Returns:
        A freshly generated, time-sortable :class:`uuid.UUID`.
    """
    import os
    import time

    millis = int(time.time() * 1000)
    rand_hi = int.from_bytes(os.urandom(2), byteorder='big')
    rand_lo = int.from_bytes(os.urandom(8), byteorder='big')

    raw = bytearray(16)
    raw[0:6] = millis.to_bytes(6, byteorder='big')        # 48-bit timestamp
    raw[6] = 0x70 | ((rand_hi >> 8) & 0x0F)               # version 7 + 4 random bits
    raw[7] = rand_hi & 0xFF                               # 8 more random bits
    raw[8] = 0x80 | ((rand_lo >> 56) & 0x3F)              # variant 10 + 6 random bits
    raw[9:16] = rand_lo.to_bytes(8, byteorder='big')[1:]  # remaining 56 random bits
    return uuid.UUID(bytes=bytes(raw))
def normalize_city_name(city_name: str) -> str:
    """Strip accents and apostrophe-like characters from a city name."""
    # NFD decomposition splits diacritics into separate combining marks,
    # which are then dropped ('Mn' = nonspacing mark category).
    decomposed = unicodedata.normalize('NFD', city_name)
    without_marks = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )
    # Remove apostrophe-like characters (straight, curly, backtick).
    return re.sub(r"[''`]", '', without_marks)
def get_city_code(city_name: str) -> str:
    """Generate 3-letter city code from city name.

    Returns "XXX" when no usable name is given; otherwise builds a code
    from the first word's prefix, an article + next word, or word
    initials, padded/truncated to exactly three A-Z characters.
    """
    if not city_name:
        return "XXX"
    words = normalize_city_name(city_name).split()
    if not words:
        return "XXX"
    articles = {'de', 'het', 'den', "'s", 'op', 'aan', 'bij', 'ter'}
    head = words[0]
    if len(words) == 1:
        # Single word: its first three characters.
        raw = head[:3]
    elif head.lower() in articles:
        # Leading article: article initial + two letters of the next word.
        raw = head[0] + words[1][:2]
    else:
        # Multi-word name: initials of up to three words.
        raw = ''.join(w[0] for w in words[:3])
    code = raw.upper()
    # Force exactly three characters.
    code = code[:3] if len(code) > 3 else code.ljust(3, 'X')
    # Replace anything that is not A-Z (digits, hyphens) with 'X'.
    return re.sub(r'[^A-Z]', 'X', code)
def extract_abbreviation_from_name(name: str) -> str:
    """Extract abbreviation from institution name using first letters of significant words."""
    if not name:
        return "INST"
    # Accent-fold via NFD decomposition, dropping combining marks.
    decomposed = unicodedata.normalize('NFD', name)
    flattened = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    # Strip punctuation, keeping hyphens and in-word apostrophes intact.
    cleaned = re.sub(r"[''`\",.:;!?()[\]{}]", '', flattened)
    words = cleaned.split()
    # Keep words that are neither Dutch stop words nor pure digits.
    significant = [
        w for w in words
        if w.lower() not in DUTCH_SKIP_WORDS and not w.isdigit()
    ]
    if not significant:
        # Everything was filtered out: fall back to the first 3 raw words.
        significant = words[:3]
    # Initials of at most ten significant words.
    initials = ''.join(w[0].upper() for w in significant[:10] if w)
    return initials or "INST"
def generate_name_suffix(institution_name: str) -> str:
    """Generate snake_case name suffix for collision resolution.

    Accent-folds and lowercases the name, strips punctuation, converts
    separators to underscores and caps the result at 50 characters.
    Returns "unknown" when nothing usable remains.
    """
    if not institution_name:
        return "unknown"
    # Accent-fold (NFD + drop combining marks), then lowercase.
    folded = ''.join(
        ch
        for ch in unicodedata.normalize('NFD', institution_name)
        if unicodedata.category(ch) != 'Mn'
    ).lower()
    # Strip punctuation, then snake_case: separators -> '_', keep [a-z0-9_],
    # collapse runs of underscores and trim the ends.
    snake = re.sub(r"[''`\",.:;!?()[\]{}]", '', folded)
    snake = re.sub(r'[\s\-/]+', '_', snake)
    snake = re.sub(r'[^a-z0-9_]', '', snake)
    snake = re.sub(r'_+', '_', snake).strip('_')
    # Cap at 50 characters without ending on an underscore.
    if len(snake) > 50:
        snake = snake[:50].rstrip('_')
    return snake or "unknown"
def reverse_geocode(lat: float, lon: float, db_path: Path) -> Optional[dict]:
    """Reverse geocode coordinates to find nearest city using GeoNames.

    Nearest-neighbour search over Dutch settlements (population >= 100)
    using an equirectangular approximation: the longitude difference is
    scaled by cos(latitude) before squaring so east-west and north-south
    offsets are comparable.  At Dutch latitudes (~52 N) comparing raw
    degree differences would overweight longitude by roughly 40% and
    could select the wrong "nearest" settlement.

    Args:
        lat: WGS84 latitude of the point to resolve.
        lon: WGS84 longitude of the point to resolve.
        db_path: Path to the GeoNames SQLite database (table ``cities``).

    Returns:
        Dict with city name, derived city/region codes, GeoNames metadata
        and an approximate ``distance_km``, or None when nothing matches.
    """
    import math
    # Longitude-degree scale factor at this latitude.
    cos_lat = math.cos(math.radians(lat))
    conn = sqlite3.connect(str(db_path))
    cursor = conn.cursor()
    try:
        query = """
            SELECT
                name, ascii_name, admin1_code, geonames_id, population, feature_code,
                ((latitude - ?) * (latitude - ?)
                 + (longitude - ?) * (longitude - ?) * ? * ?) as distance_sq
            FROM cities
            WHERE country_code = 'NL'
              AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
              AND population >= 100
            ORDER BY distance_sq
            LIMIT 1
        """
        cursor.execute(query, (lat, lat, lon, lon, cos_lat, cos_lat, *VALID_FEATURE_CODES))
        row = cursor.fetchone()
        if row:
            name, ascii_name, admin1_code, geonames_id, population, feature_code, dist_sq = row
            region_code = GEONAMES_ADMIN1_TO_ISO_NL.get(admin1_code, "00")
            return {
                'city': name,
                'city_code': get_city_code(name),
                'region_code': region_code,
                'admin1_code': admin1_code,
                'geonames_id': geonames_id,
                'feature_code': feature_code,
                'population': population,
                # dist_sq is now in "equivalent latitude degrees" squared;
                # one latitude degree is ~111 km.
                'distance_km': (dist_sq ** 0.5) * 111,
            }
    finally:
        conn.close()
    return None
def lookup_city_by_name(city_name: str, db_path: Path) -> Optional[dict]:
    """Look up city in GeoNames by name.

    Matches either the native or the ASCII spelling among Dutch
    settlements, preferring the most populous hit.  Returns None when no
    settlement matches.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        sql = """
            SELECT
                name, admin1_code, geonames_id, population, feature_code
            FROM cities
            WHERE country_code = 'NL'
              AND (name = ? OR ascii_name = ?)
              AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
            ORDER BY population DESC
            LIMIT 1
        """
        params = (city_name, city_name) + tuple(VALID_FEATURE_CODES)
        row = conn.cursor().execute(sql, params).fetchone()
        if row is None:
            return None
        name, admin1_code, geonames_id, population, feature_code = row
        return {
            'city': name,
            'city_code': get_city_code(name),
            'region_code': GEONAMES_ADMIN1_TO_ISO_NL.get(admin1_code, "00"),
            'admin1_code': admin1_code,
            'geonames_id': geonames_id,
            'feature_code': feature_code,
            'population': population,
        }
    finally:
        conn.close()
def extract_entry_data(entry: dict, db_path: Path) -> dict:
    """Extract data for GHCID generation from a KIEN entry.

    Resolves the institution name, type code and location (city, region,
    GeoNames id) from the entry dict, preferring structured claims over
    raw import fields and coordinates over city-name lookup.

    Args:
        entry: Parsed YAML entry dict (KIEN schema — assumed keys:
            custodian_name, original_entry, kien_enrichment, locations;
            TODO confirm against the entry schema).
        db_path: Path to the GeoNames SQLite database.

    Returns:
        Dict with keys: name, type_code, city, city_code, region_code,
        country_code (always 'NL'), geonames_id, location_resolution.
    """
    # Get name from custodian_name or original_entry
    # Resolution order: custodian_name claim -> original_entry
    # 'organisatie' -> kien_enrichment 'kien_name' -> placeholder.
    name = None
    if 'custodian_name' in entry and entry['custodian_name'].get('claim_value'):
        name = entry['custodian_name']['claim_value']
    if not name and 'original_entry' in entry:
        name = entry['original_entry'].get('organisatie')
    if not name and 'kien_enrichment' in entry:
        name = entry['kien_enrichment'].get('kien_name')
    if not name:
        name = "Unknown Institution"
    # Get type code - KIEN entries are type I (Intangible Heritage) or T (Taste/Smell)
    type_code = 'I'  # Default for KIEN
    if 'original_entry' in entry and 'type' in entry['original_entry']:
        types = entry['original_entry']['type']
        # 'type' may be a non-empty list (first element wins) or a bare string.
        if isinstance(types, list) and types:
            type_code = types[0]
        elif isinstance(types, str):
            type_code = types
    # Get location data; region "00" means "unresolved".
    city = None
    region_code = "00"
    geonames_id = None
    location_resolution = None
    # Try coordinates first
    # NOTE(review): only the first location entry is considered — confirm
    # entries never carry multiple meaningful locations.
    lat, lon = None, None
    if 'locations' in entry and entry['locations']:
        loc = entry['locations'][0]
        lat = loc.get('latitude')
        lon = loc.get('longitude')
        city = loc.get('city')
    # Reverse geocode if we have coordinates
    if lat is not None and lon is not None:
        geo_result = reverse_geocode(lat, lon, db_path)
        if geo_result:
            # The geocoded settlement name overrides any city string
            # present on the entry itself.
            city = geo_result['city']
            region_code = geo_result['region_code']
            geonames_id = geo_result['geonames_id']
            location_resolution = {
                'method': 'REVERSE_GEOCODE',
                'geonames_id': geonames_id,
                'geonames_name': city,
                'feature_code': geo_result['feature_code'],
                'population': geo_result['population'],
                'admin1_code': geo_result['admin1_code'],
                'region_code': region_code,
                'country_code': 'NL',
                'source_coordinates': {'latitude': lat, 'longitude': lon},
                'distance_km': geo_result['distance_km'],
            }
    # If we have a city name but no geocode result, look it up
    if city and not location_resolution:
        geo_result = lookup_city_by_name(city, db_path)
        if geo_result:
            region_code = geo_result['region_code']
            geonames_id = geo_result['geonames_id']
            location_resolution = {
                'method': 'NAME_LOOKUP',
                'geonames_id': geonames_id,
                'geonames_name': geo_result['city'],
                'feature_code': geo_result['feature_code'],
                'population': geo_result['population'],
                'admin1_code': geo_result['admin1_code'],
                'region_code': region_code,
                'country_code': 'NL',
            }
        else:
            # City string could not be matched in GeoNames: keep the raw
            # text and flag the entry for manual review.
            location_resolution = {
                'method': 'TEXT_FALLBACK',
                'city_name': city,
                'needs_review': True,
            }
    return {
        'name': name,
        'type_code': type_code,
        'city': city,
        'city_code': get_city_code(city) if city else "XXX",
        'region_code': region_code,
        'country_code': 'NL',
        'geonames_id': geonames_id,
        'location_resolution': location_resolution,
    }
def generate_ghcid(data: dict) -> Tuple[str, dict]:
    """Generate base GHCID and all identifier formats.

    The base GHCID is the human-readable string
    COUNTRY-REGION-CITY-TYPE-ABBREV; the returned dict holds the
    individual components.
    """
    abbrev = extract_abbreviation_from_name(data['name'])
    parts = (
        data['country_code'],
        data['region_code'],
        data['city_code'],
        data['type_code'],
        abbrev,
    )
    base_ghcid = "{}-{}-{}-{}-{}".format(*parts)
    components = {
        'country': parts[0],
        'region': parts[1],
        'city': parts[2],
        'type': parts[3],
        'abbrev': abbrev,
    }
    return base_ghcid, components
def generate_identifier_formats(final_ghcid: str) -> dict:
    """Generate all 4 identifier formats from final GHCID string.

    Produces two deterministic UUIDs (v5/SHA-1 and v8/SHA-256), a 64-bit
    unsigned numeric id, and a random time-ordered UUID v7 record id.
    """
    # Deterministic UUID v5 (SHA-1) in the GHCID namespace.
    name_uuid = uuid.uuid5(GHCID_NAMESPACE, final_ghcid)
    # Deterministic UUID v8 carrying the first 16 bytes of a SHA-256 digest.
    digest = hashlib.sha256(final_ghcid.encode('utf-8')).digest()
    v8_bytes = bytearray(digest[:16])
    v8_bytes[6] = (v8_bytes[6] & 0x0F) | 0x80  # set version nibble to 8
    v8_bytes[8] = (v8_bytes[8] & 0x3F) | 0x80  # set RFC 4122 variant bits
    sha256_uuid = uuid.UUID(bytes=bytes(v8_bytes))
    return {
        'ghcid_uuid': str(name_uuid),
        'ghcid_uuid_sha256': str(sha256_uuid),
        # Unsigned 64-bit integer from the first 8 digest bytes.
        'ghcid_numeric': int.from_bytes(digest[:8], byteorder='big', signed=False),
        # Per-record id: random and time-ordered, NOT derived from the GHCID.
        'record_id': str(generate_uuid_v7()),
    }
def process_kien_entries(entries_dir: Path, db_path: Path, dry_run: bool = False) -> dict:
    """Process KIEN entries and generate GHCIDs.

    Five phases: (1) load YAML entries 1674-1860 and resolve locations,
    (2) detect base-GHCID collisions, (3) resolve collisions with name
    suffixes, (4) build GHCID blocks and identifier lists in memory,
    (5) write the updated YAML files and a collision report (skipped
    entirely in dry-run mode).

    Args:
        entries_dir: Directory containing ``<index>_*.yaml`` entry files.
        db_path: Path to the GeoNames SQLite database.
        dry_run: When True, compute everything but write no files.

    Returns:
        Stats dict (counts per phase plus a list of error strings).
    """
    stats = {
        'total': 0,
        'processed': 0,
        'with_location': 0,
        'without_location': 0,
        'already_has_ghcid': 0,
        'collisions': 0,
        'collision_groups': 0,
        'files_updated': 0,
        'errors': [],
    }
    # Single timestamp shared by every record written in this run.
    timestamp = datetime.now(timezone.utc).isoformat()
    # Find KIEN entries (1674-1860)
    kien_files = []
    for f in entries_dir.glob("*.yaml"):
        # Extract entry index from filename
        match = re.match(r'^(\d+)_', f.name)
        if match:
            idx = int(match.group(1))
            if 1674 <= idx <= 1860:
                kien_files.append(f)

    def get_entry_index(filepath: Path) -> int:
        # Sort key: numeric filename prefix (0 when the pattern is absent).
        match = re.match(r'^(\d+)_', filepath.name)
        return int(match.group(1)) if match else 0

    kien_files.sort(key=get_entry_index)
    stats['total'] = len(kien_files)
    print(f"Found {len(kien_files)} KIEN entries")
    # Phase 1: Load entries and extract data
    print("\nPhase 1: Loading entries and extracting location data...")
    entries_data = []
    for filepath in kien_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)
            if not entry:
                continue
            # Check if already has GHCID — such entries are skipped, not
            # regenerated.
            if 'ghcid' in entry and entry['ghcid'].get('ghcid_current'):
                stats['already_has_ghcid'] += 1
                continue
            # Extract data
            data = extract_entry_data(entry, db_path)
            # Entries without a resolvable city cannot get a GHCID.
            if not data['city']:
                stats['without_location'] += 1
                continue
            stats['with_location'] += 1
            # Generate base GHCID
            base_ghcid, components = generate_ghcid(data)
            entries_data.append({
                'filepath': filepath,
                'entry': entry,
                'data': data,
                'base_ghcid': base_ghcid,
                'components': components,
            })
        except Exception as e:
            # Collect per-file errors; one bad file must not abort the batch.
            stats['errors'].append(f"{filepath.name}: {str(e)}")
    print(f" Entries with location: {stats['with_location']}")
    print(f" Entries without location: {stats['without_location']}")
    print(f" Already have GHCID: {stats['already_has_ghcid']}")
    # Phase 2: Detect collisions
    # NOTE(review): collisions are only detected within this KIEN batch,
    # not against GHCIDs already assigned elsewhere in the dataset.
    print("\nPhase 2: Detecting collisions...")
    collision_groups = defaultdict(list)
    for ed in entries_data:
        collision_groups[ed['base_ghcid']].append(ed)
    for base_ghcid, group in collision_groups.items():
        if len(group) > 1:
            stats['collision_groups'] += 1
            stats['collisions'] += len(group)
    print(f" Collision groups: {stats['collision_groups']}")
    print(f" Entries with collisions: {stats['collisions']}")
    # Phase 3: Resolve collisions
    print("\nPhase 3: Resolving collisions...")
    collision_report = []
    for base_ghcid, group in collision_groups.items():
        if len(group) > 1:
            # All get name suffixes
            # NOTE(review): two institutions with identical names would
            # still collide after suffixing — confirm this cannot occur.
            collision_report.append({
                'base_ghcid': base_ghcid,
                'count': len(group),
                'institutions': [ed['data']['name'] for ed in group],
            })
            for ed in group:
                name_suffix = generate_name_suffix(ed['data']['name'])
                ed['final_ghcid'] = f"{base_ghcid}-{name_suffix}"
                ed['had_collision'] = True
        else:
            ed = group[0]
            ed['final_ghcid'] = base_ghcid
            ed['had_collision'] = False
    # Phase 4: Generate identifiers and update entries
    print("\nPhase 4: Generating identifiers and updating entries...")
    for ed in entries_data:
        final_ghcid = ed['final_ghcid']
        ids = generate_identifier_formats(final_ghcid)
        # Create GHCID block
        ghcid_block = {
            'ghcid_current': final_ghcid,
            'ghcid_original': final_ghcid,
            'ghcid_uuid': ids['ghcid_uuid'],
            'ghcid_uuid_sha256': ids['ghcid_uuid_sha256'],
            'ghcid_numeric': ids['ghcid_numeric'],
            'record_id': ids['record_id'],
            'generation_timestamp': timestamp,
            'ghcid_history': [
                {
                    'ghcid': final_ghcid,
                    'ghcid_numeric': ids['ghcid_numeric'],
                    'valid_from': timestamp,
                    'valid_to': None,
                    'reason': 'Initial GHCID assignment (KIEN batch import December 2025)'
                    + (' - name suffix added to resolve collision' if ed.get('had_collision') else ''),
                }
            ],
        }
        # Add location resolution metadata
        if ed['data'].get('location_resolution'):
            ghcid_block['location_resolution'] = ed['data']['location_resolution']
        if ed['data'].get('geonames_id'):
            ghcid_block['geonames_id'] = ed['data']['geonames_id']
        if ed.get('had_collision'):
            ghcid_block['collision_resolved'] = True
            ghcid_block['base_ghcid_before_collision'] = ed['base_ghcid']
        # Update entry
        entry = ed['entry']
        entry['ghcid'] = ghcid_block
        # Add to identifiers list
        if 'identifiers' not in entry:
            entry['identifiers'] = []
        # Remove existing GHCID identifiers so re-runs do not duplicate them.
        entry['identifiers'] = [
            i for i in entry['identifiers']
            if i.get('identifier_scheme') not in ['GHCID', 'GHCID_NUMERIC', 'GHCID_UUID', 'GHCID_UUID_SHA256', 'RECORD_ID']
        ]
        # Add new identifiers
        entry['identifiers'].extend([
            {'identifier_scheme': 'GHCID', 'identifier_value': final_ghcid},
            {'identifier_scheme': 'GHCID_UUID', 'identifier_value': ids['ghcid_uuid'], 'identifier_url': f"urn:uuid:{ids['ghcid_uuid']}"},
            {'identifier_scheme': 'GHCID_UUID_SHA256', 'identifier_value': ids['ghcid_uuid_sha256'], 'identifier_url': f"urn:uuid:{ids['ghcid_uuid_sha256']}"},
            {'identifier_scheme': 'GHCID_NUMERIC', 'identifier_value': str(ids['ghcid_numeric'])},
            {'identifier_scheme': 'RECORD_ID', 'identifier_value': ids['record_id'], 'identifier_url': f"urn:uuid:{ids['record_id']}"},
        ])
        ed['entry'] = entry
        stats['processed'] += 1
    # Phase 5: Write updated entries
    if not dry_run:
        print("\nPhase 5: Writing updated entries...")
        for ed in entries_data:
            try:
                with open(ed['filepath'], 'w', encoding='utf-8') as f:
                    yaml.dump(ed['entry'], f, default_flow_style=False, allow_unicode=True, sort_keys=False)
                stats['files_updated'] += 1
            except Exception as e:
                stats['errors'].append(f"Write error {ed['filepath'].name}: {str(e)}")
        print(f" Updated {stats['files_updated']} files")
        # Write collision report (sibling of the entries directory).
        if collision_report:
            report_path = entries_dir.parent / "kien_ghcid_collision_report.json"
            report = {
                'generation_timestamp': timestamp,
                'total_kien_entries': stats['total'],
                'entries_with_ghcid': stats['processed'],
                'collision_groups': stats['collision_groups'],
                'entries_with_collisions': stats['collisions'],
                'collisions': collision_report,
            }
            with open(report_path, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, ensure_ascii=False)
            print(f" Collision report: {report_path}")
    else:
        print("\nPhase 5: DRY RUN - no files written")
    return stats
def main():
    """CLI entry point: generate GHCIDs for all KIEN entries."""
    parser = argparse.ArgumentParser(description="Generate GHCIDs for KIEN entries")
    parser.add_argument('--dry-run', action='store_true', help="Preview changes without writing")
    args = parser.parse_args()

    entries_dir = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
    db_path = PROJECT_ROOT / "data" / "reference" / "geonames.db"

    banner = "=" * 70
    print(banner)
    print("KIEN HERITAGE CUSTODIAN GHCID GENERATION")
    print(banner)
    print(f"Entries directory: {entries_dir}")
    print(f"GeoNames database: {db_path}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Fail fast when a required input is missing.
    for path, label in ((entries_dir, "Entries directory"), (db_path, "GeoNames database")):
        if not path.exists():
            print(f"ERROR: {label} not found: {path}")
            sys.exit(1)

    stats = process_kien_entries(entries_dir, db_path, dry_run=args.dry_run)

    print()
    print(banner)
    print("SUMMARY")
    print(banner)
    for label, key in (
        ("Total KIEN entries", 'total'),
        ("Already have GHCID", 'already_has_ghcid'),
        ("Entries with location", 'with_location'),
        ("Entries without location", 'without_location'),
        ("GHCIDs generated", 'processed'),
        ("Collision groups", 'collision_groups'),
        ("Entries with collisions", 'collisions'),
        ("Files updated", 'files_updated'),
    ):
        print(f"{label}: {stats[key]}")

    errors = stats['errors']
    if errors:
        # Show at most five errors, then a count of the remainder.
        print(f"\nErrors ({len(errors)}):")
        for err in errors[:5]:
            print(f" - {err}")
        if len(errors) > 5:
            print(f" ... and {len(errors) - 5} more")

    print()
    if args.dry_run:
        print("DRY RUN COMPLETE - No files modified")
    else:
        print("GHCID GENERATION COMPLETE")
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()