#!/usr/bin/env python3
"""
Generate GHCIDs for KIEN intangible heritage custodian entries.

This script is a targeted version of enrich_nde_entries_ghcid.py that only
processes KIEN entries (entry_index 1674-1860) to avoid processing the
entire NDE dataset.

Usage:
    python scripts/enrich_kien_ghcid.py [--dry-run]
"""
# Standard library
import argparse
import hashlib
import json
import re
import sqlite3
import sys
import unicodedata
import uuid
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple

# Third-party
import yaml
# Project root (this file is expected to live in <root>/scripts/, per the
# usage line in the module docstring)
PROJECT_ROOT = Path(__file__).parent.parent

# GHCID UUID v5 Namespace (DNS namespace from RFC 4122)
GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')

# GeoNames admin1 code to ISO 3166-2 NL mapping.
# Codes absent here fall back to the placeholder region "00" at the
# lookup sites (see reverse_geocode / lookup_city_by_name).
GEONAMES_ADMIN1_TO_ISO_NL = {
    "01": "DR",  # Drenthe
    "02": "FR",  # Friesland
    "03": "GE",  # Gelderland
    "04": "GR",  # Groningen
    "05": "LI",  # Limburg
    "06": "NB",  # Noord-Brabant
    "07": "NH",  # Noord-Holland
    "09": "UT",  # Utrecht
    "10": "ZE",  # Zeeland
    "11": "ZH",  # Zuid-Holland
    "15": "OV",  # Overijssel
    "16": "FL",  # Flevoland
}

# Dutch articles/prepositions to skip in abbreviation generation
DUTCH_SKIP_WORDS = {
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
    "'s", 'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder',
    'door', 'en', 'of', 'stichting', 'vereniging', 'foundation', 'the', 'a', 'an'
}

# Valid GeoNames feature codes (settlements, not neighborhoods)
VALID_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
def generate_uuid_v7() -> uuid.UUID:
    """Build a time-ordered UUID v7 for use as a database record id.

    Layout (RFC 9562): a 48-bit big-endian Unix timestamp in
    milliseconds, then the version nibble (7) plus 12 random bits, then
    the variant bits (0b10) plus 62 random bits.
    """
    import os
    import time

    millis = int(time.time() * 1000)

    # First six bytes: the 48-bit millisecond timestamp.
    raw = bytearray(millis.to_bytes(6, byteorder='big'))

    # Bytes 6-7: version nibble 0x7 followed by 12 random bits.
    rand_hi = int.from_bytes(os.urandom(2), byteorder='big')
    raw.append(0x70 | ((rand_hi >> 8) & 0x0F))
    raw.append(rand_hi & 0xFF)

    # Bytes 8-15: variant bits 0b10 followed by 62 random bits.
    rand_lo = int.from_bytes(os.urandom(8), byteorder='big')
    raw.append(0x80 | ((rand_lo >> 56) & 0x3F))
    raw.extend(rand_lo.to_bytes(8, byteorder='big')[1:])

    return uuid.UUID(bytes=bytes(raw))
def normalize_city_name(city_name: str) -> str:
    """Return *city_name* with diacritics and apostrophes removed.

    NFD decomposition splits each accented character into a base
    character plus combining marks; the marks (category 'Mn') are
    dropped, then apostrophe-like characters and backticks are stripped
    so downstream codes stay plain ASCII where possible.
    """
    decomposed = unicodedata.normalize('NFD', city_name)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return re.sub(r"[''`]", '', ''.join(base_chars))
def get_city_code(city_name: str) -> str:
    """Derive the 3-letter uppercase city code used inside a GHCID.

    Single-word names contribute their first three letters; a leading
    article (e.g. "Den Haag") contributes its initial plus two letters
    of the following word; otherwise the initials of the first three
    words are used. The result is always exactly three characters,
    padded with 'X' and with any non-letter replaced by 'X'. Empty or
    unusable input yields "XXX".
    """
    if not city_name:
        return "XXX"

    parts = normalize_city_name(city_name).split()
    if not parts:
        return "XXX"

    leading_articles = {'de', 'het', 'den', "'s", 'op', 'aan', 'bij', 'ter'}

    if len(parts) == 1:
        raw = parts[0][:3].upper()
    elif parts[0].lower() in leading_articles:
        # e.g. "Den Haag" -> "DHA": article initial + two letters of
        # the next word.
        raw = (parts[0][0] + parts[1][:2]).upper()
    else:
        raw = ''.join(word[0] for word in parts[:3]).upper()

    # Force exactly three characters: pad short codes, trim long ones.
    raw = raw.ljust(3, 'X')[:3]

    return re.sub(r'[^A-Z]', 'X', raw)
def extract_abbreviation_from_name(name: str) -> str:
    """Build an acronym from the significant words of an institution name.

    Accents are stripped via NFD decomposition, punctuation is removed
    (hyphens are kept), Dutch articles/prepositions in DUTCH_SKIP_WORDS
    and pure-digit tokens are skipped, and the first letter of up to ten
    remaining words is joined in uppercase. Returns "INST" when nothing
    usable remains.
    """
    if not name:
        return "INST"

    # Strip diacritics: decompose, then drop combining marks.
    decomposed = unicodedata.normalize('NFD', name)
    plain = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

    # Drop punctuation, then split on whitespace.
    tokens = re.sub(r"[''`\",.:;!?()[\]{}]", '', plain).split()

    # Keep words that are neither stop-words nor pure digits.
    keep = [t for t in tokens if t.lower() not in DUTCH_SKIP_WORDS and not t.isdigit()]
    if not keep:
        keep = tokens[:3]  # Fallback to first 3 words

    initials = ''.join(t[0].upper() for t in keep[:10] if t)
    return initials if initials else "INST"
def generate_name_suffix(institution_name: str) -> str:
    """Turn an institution name into a snake_case suffix (max 50 chars).

    Used to disambiguate colliding base GHCIDs. Accents and punctuation
    are removed, whitespace/hyphens/slashes collapse to single
    underscores, and anything outside [a-z0-9_] is dropped. Returns
    "unknown" for empty or fully-stripped input.
    """
    if not institution_name:
        return "unknown"

    # Accent removal via NFD decomposition, then lowercase.
    decomposed = unicodedata.normalize('NFD', institution_name)
    plain = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn').lower()

    # Strip punctuation, turn separators into underscores, keep only
    # [a-z0-9_].
    plain = re.sub(r"[''`\",.:;!?()[\]{}]", '', plain)
    plain = re.sub(r'[\s\-/]+', '_', plain)
    plain = re.sub(r'[^a-z0-9_]', '', plain)

    # Collapse runs of underscores and trim the ends.
    suffix = re.sub(r'_+', '_', plain).strip('_')

    # Truncate, never ending on a dangling underscore.
    if len(suffix) > 50:
        suffix = suffix[:50].rstrip('_')

    return suffix if suffix else "unknown"
def reverse_geocode(lat: float, lon: float, db_path: Path) -> Optional[dict]:
    """Reverse geocode WGS84 coordinates to the nearest Dutch settlement.

    Searches the GeoNames ``cities`` table for the closest NL populated
    place (feature codes in VALID_FEATURE_CODES, population >= 100).

    The distance metric is an equirectangular approximation: the
    longitude delta is weighted by cos(latitude) so east-west and
    north-south separations are comparable. (Raw squared degrees — the
    previous behavior — overweight east-west differences by ~1.6x at
    Dutch latitudes, which could select the wrong "nearest" city and
    overstated ``distance_km``.)

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.
        db_path: Path to the GeoNames SQLite database.

    Returns:
        A dict with city name/code, ISO region code, GeoNames metadata
        and an approximate ``distance_km``, or None when nothing matches.
    """
    import math

    conn = sqlite3.connect(str(db_path))
    cursor = conn.cursor()

    # One degree of longitude spans cos(lat) times the ground distance of
    # one degree of latitude; square it for the squared-distance metric.
    cos_lat_sq = math.cos(math.radians(lat)) ** 2

    try:
        query = """
            SELECT
                name, ascii_name, admin1_code, geonames_id, population, feature_code,
                ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?) * ?) as distance_sq
            FROM cities
            WHERE country_code = 'NL'
              AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
              AND population >= 100
            ORDER BY distance_sq
            LIMIT 1
        """
        cursor.execute(query, (lat, lat, lon, lon, cos_lat_sq, *VALID_FEATURE_CODES))
        row = cursor.fetchone()

        if row:
            name, ascii_name, admin1_code, geonames_id, population, feature_code, dist_sq = row
            # Unmapped admin1 codes fall back to placeholder region "00".
            region_code = GEONAMES_ADMIN1_TO_ISO_NL.get(admin1_code, "00")

            return {
                'city': name,
                'city_code': get_city_code(name),
                'region_code': region_code,
                'admin1_code': admin1_code,
                'geonames_id': geonames_id,
                'feature_code': feature_code,
                'population': population,
                # ~111 km per degree of latitude; valid here because the
                # longitude term is already cos(lat)-scaled above.
                'distance_km': (dist_sq ** 0.5) * 111,
            }
    finally:
        conn.close()

    return None
def lookup_city_by_name(city_name: str, db_path: Path) -> Optional[dict]:
    """Resolve a Dutch city name to GeoNames metadata.

    Matches either ``name`` or ``ascii_name`` exactly, restricted to
    settlement feature codes, preferring the most populous match.
    Returns None when the name is unknown.
    """
    connection = sqlite3.connect(str(db_path))
    cur = connection.cursor()

    sql = """
        SELECT
            name, admin1_code, geonames_id, population, feature_code
        FROM cities
        WHERE country_code = 'NL'
          AND (name = ? OR ascii_name = ?)
          AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
        ORDER BY population DESC
        LIMIT 1
    """

    try:
        cur.execute(sql, (city_name, city_name, *VALID_FEATURE_CODES))
        match = cur.fetchone()
        if match is None:
            return None

        name, admin1_code, geonames_id, population, feature_code = match
        return {
            'city': name,
            'city_code': get_city_code(name),
            # Unmapped admin1 codes collapse to placeholder region "00".
            'region_code': GEONAMES_ADMIN1_TO_ISO_NL.get(admin1_code, "00"),
            'admin1_code': admin1_code,
            'geonames_id': geonames_id,
            'feature_code': feature_code,
            'population': population,
        }
    finally:
        connection.close()
def extract_entry_data(entry: dict, db_path: Path) -> dict:
    """Extract the data needed for GHCID generation from one KIEN entry.

    Resolves three things from the parsed YAML mapping: the institution
    name (fallback chain over several entry sections), the heritage type
    code, and the location (coordinates first, then city-name lookup,
    then a flagged text-only fallback).

    Args:
        entry: Parsed YAML entry mapping. Optional sections read here:
            'custodian_name', 'original_entry', 'kien_enrichment',
            'locations'. Assumes 'custodian_name', when present, is a
            mapping -- TODO confirm a None value cannot occur here.
        db_path: Path to the GeoNames SQLite database.

    Returns:
        dict with keys 'name', 'type_code', 'city', 'city_code',
        'region_code', 'country_code', 'geonames_id' and
        'location_resolution' (None when no location data at all).
    """
    # Name fallback chain: custodian_name claim -> original_entry
    # 'organisatie' -> KIEN enrichment name -> placeholder.
    name = None
    if 'custodian_name' in entry and entry['custodian_name'].get('claim_value'):
        name = entry['custodian_name']['claim_value']
    if not name and 'original_entry' in entry:
        name = entry['original_entry'].get('organisatie')
    if not name and 'kien_enrichment' in entry:
        name = entry['kien_enrichment'].get('kien_name')
    if not name:
        name = "Unknown Institution"

    # Get type code - KIEN entries are type I (Intangible Heritage) or T (Taste/Smell)
    type_code = 'I'  # Default for KIEN
    if 'original_entry' in entry and 'type' in entry['original_entry']:
        types = entry['original_entry']['type']
        if isinstance(types, list) and types:
            # First listed type wins when several are present.
            type_code = types[0]
        elif isinstance(types, str):
            type_code = types

    # Location defaults; "00" is the placeholder region code.
    city = None
    region_code = "00"
    geonames_id = None
    location_resolution = None

    # Try coordinates first (only the first location entry is used).
    lat, lon = None, None
    if 'locations' in entry and entry['locations']:
        loc = entry['locations'][0]
        lat = loc.get('latitude')
        lon = loc.get('longitude')
        city = loc.get('city')

    # Reverse geocode if we have coordinates; the GeoNames city name
    # then overrides any city string carried by the entry itself.
    if lat is not None and lon is not None:
        geo_result = reverse_geocode(lat, lon, db_path)
        if geo_result:
            city = geo_result['city']
            region_code = geo_result['region_code']
            geonames_id = geo_result['geonames_id']
            location_resolution = {
                'method': 'REVERSE_GEOCODE',
                'geonames_id': geonames_id,
                'geonames_name': city,
                'feature_code': geo_result['feature_code'],
                'population': geo_result['population'],
                'admin1_code': geo_result['admin1_code'],
                'region_code': region_code,
                'country_code': 'NL',
                'source_coordinates': {'latitude': lat, 'longitude': lon},
                'distance_km': geo_result['distance_km'],
            }

    # If we have a city name but no geocode result, look it up
    if city and not location_resolution:
        geo_result = lookup_city_by_name(city, db_path)
        if geo_result:
            region_code = geo_result['region_code']
            geonames_id = geo_result['geonames_id']
            location_resolution = {
                'method': 'NAME_LOOKUP',
                'geonames_id': geonames_id,
                'geonames_name': geo_result['city'],
                'feature_code': geo_result['feature_code'],
                'population': geo_result['population'],
                'admin1_code': geo_result['admin1_code'],
                'region_code': region_code,
                'country_code': 'NL',
            }
        else:
            # Unresolvable city string: keep it, but flag for review.
            location_resolution = {
                'method': 'TEXT_FALLBACK',
                'city_name': city,
                'needs_review': True,
            }

    return {
        'name': name,
        'type_code': type_code,
        'city': city,
        'city_code': get_city_code(city) if city else "XXX",
        'region_code': region_code,
        'country_code': 'NL',
        'geonames_id': geonames_id,
        'location_resolution': location_resolution,
    }
def generate_ghcid(data: dict) -> Tuple[str, dict]:
    """Compose the base GHCID string and return it with its components.

    Format: COUNTRY-REGION-CITY-TYPE-ABBREV, where ABBREV is derived
    from the institution name via extract_abbreviation_from_name.
    """
    components = {
        'country': data['country_code'],
        'region': data['region_code'],
        'city': data['city_code'],
        'type': data['type_code'],
        'abbrev': extract_abbreviation_from_name(data['name']),
    }

    base_ghcid = "{country}-{region}-{city}-{type}-{abbrev}".format(**components)

    return base_ghcid, components
def generate_identifier_formats(final_ghcid: str) -> dict:
    """Derive all four identifier formats from a final GHCID string.

    Produces: a deterministic UUID v5 (SHA-1 over the GHCID in the
    shared namespace), a deterministic SHA-256-based UUID v8, a 64-bit
    unsigned numeric id (first eight digest bytes), and a fresh
    time-ordered UUID v7 record id.
    """
    # Deterministic UUID v5 in the shared GHCID namespace.
    name_uuid = uuid.uuid5(GHCID_NAMESPACE, final_ghcid)

    # UUID v8 built from the SHA-256 digest: take the first 16 bytes and
    # stamp in the version and variant bits.
    digest = hashlib.sha256(final_ghcid.encode('utf-8')).digest()
    v8_bytes = bytearray(digest[:16])
    v8_bytes[6] = (v8_bytes[6] & 0x0F) | 0x80  # version nibble -> 8
    v8_bytes[8] = (v8_bytes[8] & 0x3F) | 0x80  # variant -> RFC 4122
    sha256_uuid = uuid.UUID(bytes=bytes(v8_bytes))

    # 64-bit unsigned integer from the leading digest bytes.
    numeric_id = int.from_bytes(digest[:8], byteorder='big', signed=False)

    return {
        'ghcid_uuid': str(name_uuid),
        'ghcid_uuid_sha256': str(sha256_uuid),
        'ghcid_numeric': numeric_id,
        'record_id': str(generate_uuid_v7()),
    }
def process_kien_entries(entries_dir: Path, db_path: Path, dry_run: bool = False) -> dict:
    """Process KIEN entries and generate GHCIDs.

    Five phases: (1) load KIEN YAML entries (filename index 1674-1860)
    and resolve their locations, (2) group entries by base GHCID to
    detect collisions, (3) append a snake_case name suffix to every
    member of a colliding group, (4) generate the four identifier
    formats and splice the ``ghcid`` block and ``identifiers`` list into
    each entry, (5) write entries back to disk plus a JSON collision
    report (both skipped when ``dry_run``).

    Args:
        entries_dir: Directory containing ``<index>_*.yaml`` entry files.
        db_path: Path to the GeoNames SQLite database.
        dry_run: When True, compute everything but write no files.

    Returns:
        A stats dict: counts of processed/skipped entries, collisions,
        files written, and a list of error strings.
    """
    stats = {
        'total': 0,
        'processed': 0,
        'with_location': 0,
        'without_location': 0,
        'already_has_ghcid': 0,
        'collisions': 0,
        'collision_groups': 0,
        'files_updated': 0,
        'errors': [],
    }

    # Single timestamp shared by every record written in this run.
    timestamp = datetime.now(timezone.utc).isoformat()

    # Find KIEN entries (1674-1860)
    kien_files = []
    for f in entries_dir.glob("*.yaml"):
        # Extract entry index from filename
        match = re.match(r'^(\d+)_', f.name)
        if match:
            idx = int(match.group(1))
            if 1674 <= idx <= 1860:
                kien_files.append(f)

    def get_entry_index(filepath: Path) -> int:
        # Sort key: numeric prefix of the filename (0 when absent).
        match = re.match(r'^(\d+)_', filepath.name)
        return int(match.group(1)) if match else 0

    kien_files.sort(key=get_entry_index)
    stats['total'] = len(kien_files)

    print(f"Found {len(kien_files)} KIEN entries")

    # Phase 1: Load entries and extract data
    print("\nPhase 1: Loading entries and extracting location data...")
    entries_data = []

    for filepath in kien_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            if not entry:
                continue

            # Check if already has GHCID (re-runs skip finished entries)
            if 'ghcid' in entry and entry['ghcid'].get('ghcid_current'):
                stats['already_has_ghcid'] += 1
                continue

            # Extract data
            data = extract_entry_data(entry, db_path)

            # Entries with no resolvable city cannot receive a GHCID.
            if not data['city']:
                stats['without_location'] += 1
                continue

            stats['with_location'] += 1

            # Generate base GHCID
            base_ghcid, components = generate_ghcid(data)

            entries_data.append({
                'filepath': filepath,
                'entry': entry,
                'data': data,
                'base_ghcid': base_ghcid,
                'components': components,
            })

        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            stats['errors'].append(f"{filepath.name}: {str(e)}")

    print(f"  Entries with location: {stats['with_location']}")
    print(f"  Entries without location: {stats['without_location']}")
    print(f"  Already have GHCID: {stats['already_has_ghcid']}")

    # Phase 2: Detect collisions
    print("\nPhase 2: Detecting collisions...")
    collision_groups = defaultdict(list)

    for ed in entries_data:
        collision_groups[ed['base_ghcid']].append(ed)

    for base_ghcid, group in collision_groups.items():
        if len(group) > 1:
            stats['collision_groups'] += 1
            stats['collisions'] += len(group)

    print(f"  Collision groups: {stats['collision_groups']}")
    print(f"  Entries with collisions: {stats['collisions']}")

    # Phase 3: Resolve collisions
    print("\nPhase 3: Resolving collisions...")
    collision_report = []

    for base_ghcid, group in collision_groups.items():
        if len(group) > 1:
            # All get name suffixes
            collision_report.append({
                'base_ghcid': base_ghcid,
                'count': len(group),
                'institutions': [ed['data']['name'] for ed in group],
            })

            # NOTE(review): suffixes are not themselves checked for
            # uniqueness -- identically-named institutions in the same
            # group would still end up with duplicate final GHCIDs.
            for ed in group:
                name_suffix = generate_name_suffix(ed['data']['name'])
                ed['final_ghcid'] = f"{base_ghcid}-{name_suffix}"
                ed['had_collision'] = True
        else:
            ed = group[0]
            ed['final_ghcid'] = base_ghcid
            ed['had_collision'] = False

    # Phase 4: Generate identifiers and update entries
    print("\nPhase 4: Generating identifiers and updating entries...")

    for ed in entries_data:
        final_ghcid = ed['final_ghcid']
        ids = generate_identifier_formats(final_ghcid)

        # Create GHCID block
        ghcid_block = {
            'ghcid_current': final_ghcid,
            'ghcid_original': final_ghcid,
            'ghcid_uuid': ids['ghcid_uuid'],
            'ghcid_uuid_sha256': ids['ghcid_uuid_sha256'],
            'ghcid_numeric': ids['ghcid_numeric'],
            'record_id': ids['record_id'],
            'generation_timestamp': timestamp,
            'ghcid_history': [
                {
                    'ghcid': final_ghcid,
                    'ghcid_numeric': ids['ghcid_numeric'],
                    'valid_from': timestamp,
                    'valid_to': None,
                    'reason': 'Initial GHCID assignment (KIEN batch import December 2025)'
                    + (' - name suffix added to resolve collision' if ed.get('had_collision') else ''),
                }
            ],
        }

        # Add location resolution metadata
        if ed['data'].get('location_resolution'):
            ghcid_block['location_resolution'] = ed['data']['location_resolution']

        if ed['data'].get('geonames_id'):
            ghcid_block['geonames_id'] = ed['data']['geonames_id']

        if ed.get('had_collision'):
            ghcid_block['collision_resolved'] = True
            ghcid_block['base_ghcid_before_collision'] = ed['base_ghcid']

        # Update entry
        entry = ed['entry']
        entry['ghcid'] = ghcid_block

        # Add to identifiers list
        if 'identifiers' not in entry:
            entry['identifiers'] = []

        # Remove existing GHCID identifiers so re-runs stay idempotent
        entry['identifiers'] = [
            i for i in entry['identifiers']
            if i.get('identifier_scheme') not in ['GHCID', 'GHCID_NUMERIC', 'GHCID_UUID', 'GHCID_UUID_SHA256', 'RECORD_ID']
        ]

        # Add new identifiers
        entry['identifiers'].extend([
            {'identifier_scheme': 'GHCID', 'identifier_value': final_ghcid},
            {'identifier_scheme': 'GHCID_UUID', 'identifier_value': ids['ghcid_uuid'], 'identifier_url': f"urn:uuid:{ids['ghcid_uuid']}"},
            {'identifier_scheme': 'GHCID_UUID_SHA256', 'identifier_value': ids['ghcid_uuid_sha256'], 'identifier_url': f"urn:uuid:{ids['ghcid_uuid_sha256']}"},
            {'identifier_scheme': 'GHCID_NUMERIC', 'identifier_value': str(ids['ghcid_numeric'])},
            {'identifier_scheme': 'RECORD_ID', 'identifier_value': ids['record_id'], 'identifier_url': f"urn:uuid:{ids['record_id']}"},
        ])

        ed['entry'] = entry
        stats['processed'] += 1

    # Phase 5: Write updated entries
    if not dry_run:
        print("\nPhase 5: Writing updated entries...")

        for ed in entries_data:
            try:
                with open(ed['filepath'], 'w', encoding='utf-8') as f:
                    yaml.dump(ed['entry'], f, default_flow_style=False, allow_unicode=True, sort_keys=False)
                stats['files_updated'] += 1
            except Exception as e:
                stats['errors'].append(f"Write error {ed['filepath'].name}: {str(e)}")

        print(f"  Updated {stats['files_updated']} files")

        # Write collision report (next to the entries directory)
        if collision_report:
            report_path = entries_dir.parent / "kien_ghcid_collision_report.json"
            report = {
                'generation_timestamp': timestamp,
                'total_kien_entries': stats['total'],
                'entries_with_ghcid': stats['processed'],
                'collision_groups': stats['collision_groups'],
                'entries_with_collisions': stats['collisions'],
                'collisions': collision_report,
            }
            with open(report_path, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, ensure_ascii=False)
            print(f"  Collision report: {report_path}")
    else:
        print("\nPhase 5: DRY RUN - no files written")

    return stats
def main():
    """CLI entry point: validate inputs, run the batch, print a summary."""
    parser = argparse.ArgumentParser(description="Generate GHCIDs for KIEN entries")
    parser.add_argument('--dry-run', action='store_true', help="Preview changes without writing")
    args = parser.parse_args()

    entries_dir = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
    db_path = PROJECT_ROOT / "data" / "reference" / "geonames.db"

    separator = "=" * 70
    print(separator)
    print("KIEN HERITAGE CUSTODIAN GHCID GENERATION")
    print(separator)
    print(f"Entries directory: {entries_dir}")
    print(f"GeoNames database: {db_path}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Guard clauses: both inputs must exist before we start.
    if not entries_dir.exists():
        print(f"ERROR: Entries directory not found: {entries_dir}")
        sys.exit(1)
    if not db_path.exists():
        print(f"ERROR: GeoNames database not found: {db_path}")
        sys.exit(1)

    stats = process_kien_entries(entries_dir, db_path, dry_run=args.dry_run)

    print()
    print(separator)
    print("SUMMARY")
    print(separator)
    print(f"Total KIEN entries: {stats['total']}")
    print(f"Already have GHCID: {stats['already_has_ghcid']}")
    print(f"Entries with location: {stats['with_location']}")
    print(f"Entries without location: {stats['without_location']}")
    print(f"GHCIDs generated: {stats['processed']}")
    print(f"Collision groups: {stats['collision_groups']}")
    print(f"Entries with collisions: {stats['collisions']}")
    print(f"Files updated: {stats['files_updated']}")

    errors = stats['errors']
    if errors:
        # Show at most the first five errors, then a count of the rest.
        print(f"\nErrors ({len(errors)}):")
        for err in errors[:5]:
            print(f"  - {err}")
        if len(errors) > 5:
            print(f"  ... and {len(errors) - 5} more")

    print()
    print("DRY RUN COMPLETE - No files modified" if args.dry_run else "GHCID GENERATION COMPLETE")


if __name__ == "__main__":
    main()
|