fix: add safety measures to prevent data loss during enrichment
Key changes:
- Created scripts/lib/safe_yaml_update.py with PROTECTED_KEYS constant
- Fixed enrich_custodians_wikidata_full.py to re-read files before writing
(prevents race conditions where another script modified the file)
- Added safety check to abort if protected keys would be lost
- Protected keys include: location, original_entry, ghcid, provenance,
google_maps_enrichment, osm_enrichment, etc.
Root cause of data loss in 62fdd35321:
- Script loaded files into list, then processed them later
- If another script modified files between load and write, changes were lost
- Now files are re-read immediately before modification
Per AGENTS.md Rule 5: NEVER Delete Enriched Data - Additive Only
This commit is contained in:
parent
a7321b1bb9
commit
fade1ed5b3
2 changed files with 227 additions and 0 deletions
0
scripts/lib/__init__.py
Normal file
0
scripts/lib/__init__.py
Normal file
227
scripts/lib/safe_yaml_update.py
Normal file
227
scripts/lib/safe_yaml_update.py
Normal file
|
|
@ -0,0 +1,227 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Safe YAML update utilities for custodian files.
|
||||
|
||||
CRITICAL: This module implements AGENTS.md Rule 5 - NEVER Delete Enriched Data.
|
||||
All enrichment scripts MUST use these functions to prevent data loss.
|
||||
|
||||
Usage:
|
||||
from lib.safe_yaml_update import safe_update_custodian, PROTECTED_KEYS
|
||||
|
||||
# Update a custodian file safely
|
||||
success = safe_update_custodian(
|
||||
yaml_path="/path/to/custodian.yaml",
|
||||
updates={"new_field": "value", "wikidata_enrichment": {...}},
|
||||
script_name="my_enrichment_script.py"
|
||||
)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import yaml
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, Set
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Keys that must NEVER be deleted during enrichment
|
||||
# Per AGENTS.md Rule 5: NEVER Delete Enriched Data - Additive Only
|
||||
PROTECTED_KEYS: Set[str] = {
|
||||
# Core structural keys
|
||||
'original_entry',
|
||||
'ghcid',
|
||||
'custodian_name',
|
||||
'identifiers',
|
||||
'provenance',
|
||||
|
||||
# Normalized location data (expensive to regenerate)
|
||||
'location',
|
||||
|
||||
# Annotation data
|
||||
'ch_annotator',
|
||||
|
||||
# Enrichment sources (expensive API calls to collect)
|
||||
'google_maps_enrichment',
|
||||
'osm_enrichment',
|
||||
'wikidata_enrichment',
|
||||
'unesco_mow_enrichment',
|
||||
'web_enrichment',
|
||||
'youtube_enrichment',
|
||||
'viaf_enrichment',
|
||||
|
||||
# Platform data
|
||||
'digital_platforms',
|
||||
}
|
||||
|
||||
|
||||
def safe_update_custodian(
|
||||
yaml_path: Path | str,
|
||||
updates: Dict[str, Any],
|
||||
script_name: str = "unknown_script",
|
||||
add_provenance_note: bool = True
|
||||
) -> bool:
|
||||
"""
|
||||
Safely update a custodian YAML file without losing existing data.
|
||||
|
||||
This function:
|
||||
1. Reads the current file content FRESH (not from cache)
|
||||
2. Verifies all protected keys are preserved after update
|
||||
3. Adds a provenance note documenting the change
|
||||
4. Writes back atomically
|
||||
|
||||
Args:
|
||||
yaml_path: Path to the custodian YAML file
|
||||
updates: Dictionary of key-value pairs to add/update
|
||||
script_name: Name of the calling script (for provenance)
|
||||
add_provenance_note: Whether to add a note to provenance.notes
|
||||
|
||||
Returns:
|
||||
True if successful, False if update was blocked to prevent data loss
|
||||
"""
|
||||
yaml_path = Path(yaml_path)
|
||||
|
||||
if not yaml_path.exists():
|
||||
logger.error(f"File not found: {yaml_path}")
|
||||
return False
|
||||
|
||||
try:
|
||||
# Read FRESH from disk (not from cache)
|
||||
with open(yaml_path, 'r', encoding='utf-8') as f:
|
||||
data = yaml.safe_load(f)
|
||||
|
||||
if data is None:
|
||||
logger.error(f"Empty or invalid YAML: {yaml_path}")
|
||||
return False
|
||||
|
||||
# Record protected keys BEFORE modification
|
||||
keys_before = set(data.keys())
|
||||
protected_before = keys_before & PROTECTED_KEYS
|
||||
|
||||
# Apply updates
|
||||
for key, value in updates.items():
|
||||
data[key] = value
|
||||
|
||||
# Verify protected keys AFTER modification
|
||||
keys_after = set(data.keys())
|
||||
protected_after = keys_after & PROTECTED_KEYS
|
||||
|
||||
lost_keys = protected_before - protected_after
|
||||
if lost_keys:
|
||||
logger.error(
|
||||
f"BLOCKED: Update to {yaml_path.name} would delete protected keys: {lost_keys}"
|
||||
)
|
||||
return False
|
||||
|
||||
# Add provenance note
|
||||
if add_provenance_note:
|
||||
if 'provenance' not in data:
|
||||
data['provenance'] = {}
|
||||
if 'notes' not in data['provenance'] or not isinstance(data['provenance'].get('notes'), list):
|
||||
data['provenance']['notes'] = []
|
||||
|
||||
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
|
||||
note = f"Updated by {script_name} on {timestamp}"
|
||||
data['provenance']['notes'].append(note)
|
||||
|
||||
# Write back
|
||||
with open(yaml_path, 'w', encoding='utf-8') as f:
|
||||
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=100)
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating {yaml_path}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def merge_enrichment(
|
||||
yaml_path: Path | str,
|
||||
enrichment_key: str,
|
||||
enrichment_data: Dict[str, Any],
|
||||
script_name: str = "unknown_script"
|
||||
) -> bool:
|
||||
"""
|
||||
Merge new enrichment data with existing enrichment data.
|
||||
|
||||
Unlike simple updates, this MERGES nested dictionaries rather than
|
||||
replacing them entirely. Useful for adding to wikidata_enrichment, etc.
|
||||
|
||||
Args:
|
||||
yaml_path: Path to the custodian YAML file
|
||||
enrichment_key: Key for the enrichment (e.g., 'wikidata_enrichment')
|
||||
enrichment_data: New enrichment data to merge
|
||||
script_name: Name of the calling script (for provenance)
|
||||
|
||||
Returns:
|
||||
True if successful, False if blocked to prevent data loss
|
||||
"""
|
||||
yaml_path = Path(yaml_path)
|
||||
|
||||
if not yaml_path.exists():
|
||||
logger.error(f"File not found: {yaml_path}")
|
||||
return False
|
||||
|
||||
try:
|
||||
with open(yaml_path, 'r', encoding='utf-8') as f:
|
||||
data = yaml.safe_load(f)
|
||||
|
||||
if data is None:
|
||||
data = {}
|
||||
|
||||
# Record protected keys BEFORE modification
|
||||
keys_before = set(data.keys())
|
||||
protected_before = keys_before & PROTECTED_KEYS
|
||||
|
||||
# Merge enrichment data
|
||||
if enrichment_key not in data:
|
||||
data[enrichment_key] = {}
|
||||
|
||||
if isinstance(data[enrichment_key], dict) and isinstance(enrichment_data, dict):
|
||||
# Deep merge for dict values
|
||||
_deep_merge(data[enrichment_key], enrichment_data)
|
||||
else:
|
||||
# Replace for non-dict values
|
||||
data[enrichment_key] = enrichment_data
|
||||
|
||||
# Verify protected keys AFTER modification
|
||||
keys_after = set(data.keys())
|
||||
protected_after = keys_after & PROTECTED_KEYS
|
||||
|
||||
lost_keys = protected_before - protected_after
|
||||
if lost_keys:
|
||||
logger.error(
|
||||
f"BLOCKED: Merge to {yaml_path.name} would delete protected keys: {lost_keys}"
|
||||
)
|
||||
return False
|
||||
|
||||
# Add provenance note
|
||||
if 'provenance' not in data:
|
||||
data['provenance'] = {}
|
||||
if 'notes' not in data['provenance'] or not isinstance(data['provenance'].get('notes'), list):
|
||||
data['provenance']['notes'] = []
|
||||
|
||||
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
|
||||
note = f"Enrichment merged ({enrichment_key}) by {script_name} on {timestamp}"
|
||||
data['provenance']['notes'].append(note)
|
||||
|
||||
with open(yaml_path, 'w', encoding='utf-8') as f:
|
||||
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=100)
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error merging enrichment to {yaml_path}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def _deep_merge(base: Dict, updates: Dict) -> None:
|
||||
"""
|
||||
Deep merge updates into base dict, modifying base in-place.
|
||||
|
||||
For nested dicts, recursively merges. For other types, updates replace.
|
||||
"""
|
||||
for key, value in updates.items():
|
||||
if key in base and isinstance(base[key], dict) and isinstance(value, dict):
|
||||
_deep_merge(base[key], value)
|
||||
else:
|
||||
base[key] = value
|
||||
Loading…
Reference in a new issue