#!/usr/bin/env python3
"""Safe YAML update utilities for custodian files.

CRITICAL: This module implements AGENTS.md Rule 5 - NEVER Delete Enriched Data.
All enrichment scripts MUST use these functions to prevent data loss.

Usage:
    from lib.safe_yaml_update import safe_update_custodian, PROTECTED_KEYS

    # Update a custodian file safely
    success = safe_update_custodian(
        yaml_path="/path/to/custodian.yaml",
        updates={"new_field": "value", "wikidata_enrichment": {...}},
        script_name="my_enrichment_script.py"
    )
"""

import logging
import os
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional, Set

import yaml

logger = logging.getLogger(__name__)

# Keys that must NEVER be deleted during enrichment
# Per AGENTS.md Rule 5: NEVER Delete Enriched Data - Additive Only
PROTECTED_KEYS: Set[str] = {
    # Core structural keys
    'original_entry',
    'ghcid',
    'custodian_name',
    'identifiers',
    'provenance',
    # Normalized location data (expensive to regenerate)
    'location',
    # Annotation data
    'ch_annotator',
    # Enrichment sources (expensive API calls to collect)
    'google_maps_enrichment',
    'osm_enrichment',
    'wikidata_enrichment',
    'unesco_mow_enrichment',
    'web_enrichment',
    'youtube_enrichment',
    'viaf_enrichment',
    # Platform data
    'digital_platforms',
}


def _utc_timestamp() -> str:
    """Return the current UTC time formatted as an ISO-8601 'Z' timestamp."""
    return datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')


def _append_provenance_note(data: Dict[str, Any], note: str) -> None:
    """Append *note* to data['provenance']['notes'], creating the structure if absent.

    An existing non-list 'notes' value is replaced with a fresh list (matching the
    original defensive behavior). Raises if data['provenance'] exists but is not a
    dict — that situation is surfaced to the caller rather than silently clobbered,
    since 'provenance' is a protected key.
    """
    if 'provenance' not in data:
        data['provenance'] = {}
    if not isinstance(data['provenance'].get('notes'), list):
        data['provenance']['notes'] = []
    data['provenance']['notes'].append(note)


def _atomic_write_yaml(yaml_path: Path, data: Dict[str, Any]) -> None:
    """Atomically write *data* as YAML to *yaml_path*.

    The document is dumped to a temporary file in the same directory (so the
    rename cannot cross filesystems), then moved into place with os.replace().
    A crash mid-write therefore never leaves a truncated custodian file —
    required by the module's "never lose enriched data" contract.

    Raises:
        OSError / yaml.YAMLError: propagated to the caller; the temp file is
        removed on failure.
    """
    fd, tmp_path = tempfile.mkstemp(
        dir=str(yaml_path.parent), prefix=yaml_path.name + '.', suffix='.tmp')
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            yaml.dump(
                data, f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=100,
            )
        os.replace(tmp_path, yaml_path)
    except Exception:
        # Best-effort cleanup of the orphaned temp file, then re-raise.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise


def safe_update_custodian(
    yaml_path: Path | str,
    updates: Dict[str, Any],
    script_name: str = "unknown_script",
    add_provenance_note: bool = True
) -> bool:
    """
    Safely update a custodian YAML file without losing existing data.

    This function:
    1. Reads the current file content FRESH (not from cache)
    2. Verifies all protected keys are preserved after update
    3. Adds a provenance note documenting the change
    4. Writes back atomically (temp file + rename)

    Args:
        yaml_path: Path to the custodian YAML file
        updates: Dictionary of key-value pairs to add/update
        script_name: Name of the calling script (for provenance)
        add_provenance_note: Whether to add a note to provenance.notes

    Returns:
        True if successful, False if update was blocked to prevent data loss
        (or on any I/O / parse error, which is logged).
    """
    yaml_path = Path(yaml_path)

    if not yaml_path.exists():
        logger.error(f"File not found: {yaml_path}")
        return False

    try:
        # Read FRESH from disk (not from cache)
        with open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if data is None:
            logger.error(f"Empty or invalid YAML: {yaml_path}")
            return False

        # Record protected keys BEFORE modification
        protected_before = set(data.keys()) & PROTECTED_KEYS

        # Apply updates (additive: existing keys are overwritten, never removed)
        for key, value in updates.items():
            data[key] = value

        # Verify protected keys AFTER modification
        protected_after = set(data.keys()) & PROTECTED_KEYS
        lost_keys = protected_before - protected_after
        if lost_keys:
            logger.error(
                f"BLOCKED: Update to {yaml_path.name} would delete protected keys: {lost_keys}"
            )
            return False

        # Add provenance note
        if add_provenance_note:
            _append_provenance_note(
                data, f"Updated by {script_name} on {_utc_timestamp()}")

        _atomic_write_yaml(yaml_path, data)
        return True

    except Exception as e:
        logger.error(f"Error updating {yaml_path}: {e}")
        return False


def merge_enrichment(
    yaml_path: Path | str,
    enrichment_key: str,
    enrichment_data: Dict[str, Any],
    script_name: str = "unknown_script"
) -> bool:
    """
    Merge new enrichment data with existing enrichment data.

    Unlike simple updates, this MERGES nested dictionaries rather than
    replacing them entirely. Useful for adding to wikidata_enrichment, etc.

    Args:
        yaml_path: Path to the custodian YAML file
        enrichment_key: Key for the enrichment (e.g., 'wikidata_enrichment')
        enrichment_data: New enrichment data to merge
        script_name: Name of the calling script (for provenance)

    Returns:
        True if successful, False if blocked to prevent data loss
        (or on any I/O / parse error, which is logged).
    """
    yaml_path = Path(yaml_path)

    if not yaml_path.exists():
        logger.error(f"File not found: {yaml_path}")
        return False

    try:
        with open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if data is None:
            data = {}

        # Record protected keys BEFORE modification
        protected_before = set(data.keys()) & PROTECTED_KEYS

        # Merge enrichment data
        if enrichment_key not in data:
            data[enrichment_key] = {}

        if isinstance(data[enrichment_key], dict) and isinstance(enrichment_data, dict):
            # Deep merge for dict values
            _deep_merge(data[enrichment_key], enrichment_data)
        else:
            # Replace for non-dict values
            data[enrichment_key] = enrichment_data

        # Verify protected keys AFTER modification
        protected_after = set(data.keys()) & PROTECTED_KEYS
        lost_keys = protected_before - protected_after
        if lost_keys:
            logger.error(
                f"BLOCKED: Merge to {yaml_path.name} would delete protected keys: {lost_keys}"
            )
            return False

        # Add provenance note (always, unlike safe_update_custodian)
        _append_provenance_note(
            data,
            f"Enrichment merged ({enrichment_key}) by {script_name} on {_utc_timestamp()}")

        _atomic_write_yaml(yaml_path, data)
        return True

    except Exception as e:
        logger.error(f"Error merging enrichment to {yaml_path}: {e}")
        return False


def _deep_merge(base: Dict, updates: Dict) -> None:
    """
    Deep merge updates into base dict, modifying base in-place.

    For nested dicts, recursively merges. For other types, updates replace.
    """
    for key, value in updates.items():
        if key in base and isinstance(base[key], dict) and isinstance(value, dict):
            _deep_merge(base[key], value)
        else:
            base[key] = value