diff --git a/scripts/lib/__init__.py b/scripts/lib/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/lib/safe_yaml_update.py b/scripts/lib/safe_yaml_update.py new file mode 100644 index 0000000000..3afaa20380 --- /dev/null +++ b/scripts/lib/safe_yaml_update.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python3 +""" +Safe YAML update utilities for custodian files. + +CRITICAL: This module implements AGENTS.md Rule 5 - NEVER Delete Enriched Data. +All enrichment scripts MUST use these functions to prevent data loss. + +Usage: + from lib.safe_yaml_update import safe_update_custodian, PROTECTED_KEYS + + # Update a custodian file safely + success = safe_update_custodian( + yaml_path="/path/to/custodian.yaml", + updates={"new_field": "value", "wikidata_enrichment": {...}}, + script_name="my_enrichment_script.py" + ) +""" + +import logging +import yaml +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, Optional, Set + +logger = logging.getLogger(__name__) + +# Keys that must NEVER be deleted during enrichment +# Per AGENTS.md Rule 5: NEVER Delete Enriched Data - Additive Only +PROTECTED_KEYS: Set[str] = { + # Core structural keys + 'original_entry', + 'ghcid', + 'custodian_name', + 'identifiers', + 'provenance', + + # Normalized location data (expensive to regenerate) + 'location', + + # Annotation data + 'ch_annotator', + + # Enrichment sources (expensive API calls to collect) + 'google_maps_enrichment', + 'osm_enrichment', + 'wikidata_enrichment', + 'unesco_mow_enrichment', + 'web_enrichment', + 'youtube_enrichment', + 'viaf_enrichment', + + # Platform data + 'digital_platforms', +} + + +def safe_update_custodian( + yaml_path: Path | str, + updates: Dict[str, Any], + script_name: str = "unknown_script", + add_provenance_note: bool = True +) -> bool: + """ + Safely update a custodian YAML file without losing existing data. + + This function: + 1. Reads the current file content FRESH (not from cache) + 2. Verifies all protected keys are preserved after update + 3. Adds a provenance note documenting the change + 4. Writes back atomically + + Args: + yaml_path: Path to the custodian YAML file + updates: Dictionary of key-value pairs to add/update + script_name: Name of the calling script (for provenance) + add_provenance_note: Whether to add a note to provenance.notes + + Returns: + True if successful, False if update was blocked to prevent data loss + """ + yaml_path = Path(yaml_path) + + if not yaml_path.exists(): + logger.error(f"File not found: {yaml_path}") + return False + + try: + # Read FRESH from disk (not from cache) + with open(yaml_path, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + + if data is None: + logger.error(f"Empty or invalid YAML: {yaml_path}") + return False + + # Record protected keys BEFORE modification + keys_before = set(data.keys()) + protected_before = keys_before & PROTECTED_KEYS + + # Apply updates + for key, value in updates.items(): + data[key] = value + + # Verify protected keys AFTER modification + keys_after = set(data.keys()) + protected_after = keys_after & PROTECTED_KEYS + + lost_keys = protected_before - protected_after + if lost_keys: + logger.error( + f"BLOCKED: Update to {yaml_path.name} would delete protected keys: {lost_keys}" + ) + return False + + # Add provenance note + if add_provenance_note: + if 'provenance' not in data: + data['provenance'] = {} + if 'notes' not in data['provenance'] or not isinstance(data['provenance'].get('notes'), list): + data['provenance']['notes'] = [] + + timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') + note = f"Updated by {script_name} on {timestamp}" + data['provenance']['notes'].append(note) + + # Write back + with open(yaml_path, 'w', encoding='utf-8') as f: + yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=100) + + return True + + except Exception as e: + logger.error(f"Error updating {yaml_path}: {e}") + return False + + +def merge_enrichment( + yaml_path: Path | str, + enrichment_key: str, + enrichment_data: Dict[str, Any], + script_name: str = "unknown_script" +) -> bool: + """ + Merge new enrichment data with existing enrichment data. + + Unlike simple updates, this MERGES nested dictionaries rather than + replacing them entirely. Useful for adding to wikidata_enrichment, etc. + + Args: + yaml_path: Path to the custodian YAML file + enrichment_key: Key for the enrichment (e.g., 'wikidata_enrichment') + enrichment_data: New enrichment data to merge + script_name: Name of the calling script (for provenance) + + Returns: + True if successful, False if blocked to prevent data loss + """ + yaml_path = Path(yaml_path) + + if not yaml_path.exists(): + logger.error(f"File not found: {yaml_path}") + return False + + try: + with open(yaml_path, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + + if data is None: + data = {} + + # Record protected keys BEFORE modification + keys_before = set(data.keys()) + protected_before = keys_before & PROTECTED_KEYS + + # Merge enrichment data + if enrichment_key not in data: + data[enrichment_key] = {} + + if isinstance(data[enrichment_key], dict) and isinstance(enrichment_data, dict): + # Deep merge for dict values + _deep_merge(data[enrichment_key], enrichment_data) + else: + # Replace for non-dict values + data[enrichment_key] = enrichment_data + + # Verify protected keys AFTER modification + keys_after = set(data.keys()) + protected_after = keys_after & PROTECTED_KEYS + + lost_keys = protected_before - protected_after + if lost_keys: + logger.error( + f"BLOCKED: Merge to {yaml_path.name} would delete protected keys: {lost_keys}" + ) + return False + + # Add provenance note + if 'provenance' not in data: + data['provenance'] = {} + if 'notes' not in data['provenance'] or not isinstance(data['provenance'].get('notes'), list): + data['provenance']['notes'] = [] + + timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') + note = f"Enrichment merged ({enrichment_key}) by {script_name} on {timestamp}" + data['provenance']['notes'].append(note) + + with open(yaml_path, 'w', encoding='utf-8') as f: + yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=100) + + return True + + except Exception as e: + logger.error(f"Error merging enrichment to {yaml_path}: {e}") + return False + + +def _deep_merge(base: Dict, updates: Dict) -> None: + """ + Deep merge updates into base dict, modifying base in-place. + + For nested dicts, recursively merges. For other types, updates replace. + """ + for key, value in updates.items(): + if key in base and isinstance(base[key], dict) and isinstance(value, dict): + _deep_merge(base[key], value) + else: + base[key] = value