glam/scripts/lib/safe_yaml_update.py
kempersc fade1ed5b3 fix: add safety measures to prevent data loss during enrichment
Key changes:
- Created scripts/lib/safe_yaml_update.py with PROTECTED_KEYS constant
- Fixed enrich_custodians_wikidata_full.py to re-read files before writing
  (prevents race conditions where another script modified the file)
- Added safety check to abort if protected keys would be lost
- Protected keys include: location, original_entry, ghcid, provenance,
  google_maps_enrichment, osm_enrichment, etc.

Root cause of data loss in 62fdd35321:
- Script loaded files into list, then processed them later
- If another script modified files between load and write, changes were lost
- Now files are re-read immediately before modification

Per AGENTS.md Rule 5: NEVER Delete Enriched Data - Additive Only
2025-12-09 12:27:09 +01:00

227 lines
7.3 KiB
Python

#!/usr/bin/env python3
"""
Safe YAML update utilities for custodian files.
CRITICAL: This module implements AGENTS.md Rule 5 - NEVER Delete Enriched Data.
All enrichment scripts MUST use these functions to prevent data loss.
Usage:
from lib.safe_yaml_update import safe_update_custodian, PROTECTED_KEYS
# Update a custodian file safely
success = safe_update_custodian(
    yaml_path="/path/to/custodian.yaml",
    updates={"new_field": "value", "wikidata_enrichment": {...}},
    script_name="my_enrichment_script.py"
)
"""
import logging
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional, Set
# Module-level logger; importing scripts are expected to configure handlers.
logger = logging.getLogger(__name__)

# Keys that must NEVER be deleted during enrichment.
# Per AGENTS.md Rule 5: NEVER Delete Enriched Data - Additive Only.
# Any update that would drop one of these keys is blocked by the
# functions in this module.
PROTECTED_KEYS: Set[str] = {
    # Core structural keys
    'original_entry',
    'ghcid',
    'custodian_name',
    'identifiers',
    'provenance',
    # Normalized location data (expensive to regenerate)
    'location',
    # Annotation data
    'ch_annotator',
    # Enrichment sources (expensive API calls to collect)
    'google_maps_enrichment',
    'osm_enrichment',
    'wikidata_enrichment',
    'unesco_mow_enrichment',
    'web_enrichment',
    'youtube_enrichment',
    'viaf_enrichment',
    # Platform data
    'digital_platforms',
}
def safe_update_custodian(
    yaml_path: Path | str,
    updates: Dict[str, Any],
    script_name: str = "unknown_script",
    add_provenance_note: bool = True
) -> bool:
    """
    Safely update a custodian YAML file without losing existing data.

    This function:
    1. Reads the current file content FRESH from disk (not from cache)
    2. Blocks updates that would delete a protected key, or overwrite a
       protected key's existing value with None
    3. Optionally adds a provenance note documenting the change
    4. Writes back atomically (temp file + rename), so a crash mid-write
       cannot leave a truncated file on disk

    Args:
        yaml_path: Path to the custodian YAML file
        updates: Dictionary of key-value pairs to add/update
        script_name: Name of the calling script (for provenance)
        add_provenance_note: Whether to add a note to provenance.notes

    Returns:
        True if successful, False if update was blocked to prevent data loss
        or any error occurred.
    """
    yaml_path = Path(yaml_path)
    if not yaml_path.exists():
        logger.error(f"File not found: {yaml_path}")
        return False
    try:
        # Read FRESH from disk (not from cache) so we do not clobber
        # changes another script made since our caller last read the file.
        with open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        if not isinstance(data, dict):
            # Covers empty files (None) and scalar/list top-level YAML.
            logger.error(f"Empty or invalid YAML: {yaml_path}")
            return False

        # Record protected keys present BEFORE modification.
        protected_before = set(data.keys()) & PROTECTED_KEYS

        # Refuse to null out a protected key that currently holds data.
        # NOTE: plain `data[key] = value` assignment below can never
        # *remove* a key, so this value check is the guard that actually
        # bites for dict-based updates; the key-presence check further
        # down is defense in depth.
        for key in protected_before & set(updates.keys()):
            if updates[key] is None and data.get(key) is not None:
                logger.error(
                    f"BLOCKED: Update to {yaml_path.name} would null protected key: {key}"
                )
                return False

        # Apply updates
        for key, value in updates.items():
            data[key] = value

        # Verify protected keys AFTER modification.
        lost_keys = protected_before - (set(data.keys()) & PROTECTED_KEYS)
        if lost_keys:
            logger.error(
                f"BLOCKED: Update to {yaml_path.name} would delete protected keys: {lost_keys}"
            )
            return False

        # Add provenance note documenting who changed the file and when.
        if add_provenance_note:
            if 'provenance' not in data:
                data['provenance'] = {}
            if not isinstance(data['provenance'].get('notes'), list):
                data['provenance']['notes'] = []
            timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
            data['provenance']['notes'].append(
                f"Updated by {script_name} on {timestamp}"
            )

        # Atomic write: dump to a sibling temp file, then rename over the
        # original. Path.replace is atomic on POSIX, so concurrent readers
        # never observe a half-written file.
        tmp_path = yaml_path.with_suffix(yaml_path.suffix + '.tmp')
        with open(tmp_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                      sort_keys=False, width=100)
        tmp_path.replace(yaml_path)
        return True
    except Exception as e:
        # Broad catch is deliberate: enrichment pipelines treat a failed
        # file as skippable rather than fatal.
        logger.error(f"Error updating {yaml_path}: {e}")
        return False
def merge_enrichment(
    yaml_path: Path | str,
    enrichment_key: str,
    enrichment_data: Dict[str, Any],
    script_name: str = "unknown_script",
    add_provenance_note: bool = True
) -> bool:
    """
    Merge new enrichment data with existing enrichment data.

    Unlike simple updates, this MERGES nested dictionaries rather than
    replacing them entirely. Useful for adding to wikidata_enrichment, etc.
    Writes back atomically (temp file + rename), matching
    safe_update_custodian.

    Args:
        yaml_path: Path to the custodian YAML file
        enrichment_key: Key for the enrichment (e.g., 'wikidata_enrichment')
        enrichment_data: New enrichment data to merge
        script_name: Name of the calling script (for provenance)
        add_provenance_note: Whether to add a note to provenance.notes

    Returns:
        True if successful, False if blocked to prevent data loss or any
        error occurred.
    """
    yaml_path = Path(yaml_path)
    if not yaml_path.exists():
        logger.error(f"File not found: {yaml_path}")
        return False
    try:
        # Read FRESH from disk to avoid clobbering concurrent changes.
        with open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        if data is None:
            data = {}
        if not isinstance(data, dict):
            # Scalar/list top-level YAML cannot be merged into.
            logger.error(f"Invalid YAML (not a mapping): {yaml_path}")
            return False

        # Record protected keys present BEFORE modification.
        protected_before = set(data.keys()) & PROTECTED_KEYS

        # Merge enrichment data: deep-merge dicts, replace anything else.
        if enrichment_key not in data:
            data[enrichment_key] = {}
        if isinstance(data[enrichment_key], dict) and isinstance(enrichment_data, dict):
            _deep_merge(data[enrichment_key], enrichment_data)
        else:
            data[enrichment_key] = enrichment_data

        # Verify protected keys AFTER modification (defense in depth).
        lost_keys = protected_before - (set(data.keys()) & PROTECTED_KEYS)
        if lost_keys:
            logger.error(
                f"BLOCKED: Merge to {yaml_path.name} would delete protected keys: {lost_keys}"
            )
            return False

        # Add provenance note documenting the merge.
        if add_provenance_note:
            if 'provenance' not in data:
                data['provenance'] = {}
            if not isinstance(data['provenance'].get('notes'), list):
                data['provenance']['notes'] = []
            timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
            data['provenance']['notes'].append(
                f"Enrichment merged ({enrichment_key}) by {script_name} on {timestamp}"
            )

        # Atomic write: temp file + rename so a crash mid-write cannot
        # leave a truncated custodian file.
        tmp_path = yaml_path.with_suffix(yaml_path.suffix + '.tmp')
        with open(tmp_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                      sort_keys=False, width=100)
        tmp_path.replace(yaml_path)
        return True
    except Exception as e:
        # Broad catch is deliberate: a failed file is skippable, not fatal.
        logger.error(f"Error merging enrichment to {yaml_path}: {e}")
        return False
def _deep_merge(base: Dict, updates: Dict) -> None:
"""
Deep merge updates into base dict, modifying base in-place.
For nested dicts, recursively merges. For other types, updates replace.
"""
for key, value in updates.items():
if key in base and isinstance(base[key], dict) and isinstance(value, dict):
_deep_merge(base[key], value)
else:
base[key] = value