Key changes:
- Created scripts/lib/safe_yaml_update.py with PROTECTED_KEYS constant
- Fixed enrich_custodians_wikidata_full.py to re-read files before writing
(prevents race conditions where another script modified the file)
- Added safety check to abort if protected keys would be lost
- Protected keys include: location, original_entry, ghcid, provenance,
google_maps_enrichment, osm_enrichment, etc.
Root cause of data loss in 62fdd35321:
- Script loaded files into list, then processed them later
- If another script modified files between load and write, changes were lost
- Now files are re-read immediately before modification
Per AGENTS.md Rule 5: NEVER Delete Enriched Data - Additive Only
227 lines · 7.3 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Safe YAML update utilities for custodian files.
|
|
|
|
CRITICAL: This module implements AGENTS.md Rule 5 - NEVER Delete Enriched Data.
|
|
All enrichment scripts MUST use these functions to prevent data loss.
|
|
|
|
Usage:
|
|
from lib.safe_yaml_update import safe_update_custodian, PROTECTED_KEYS
|
|
|
|
# Update a custodian file safely
|
|
success = safe_update_custodian(
|
|
yaml_path="/path/to/custodian.yaml",
|
|
updates={"new_field": "value", "wikidata_enrichment": {...}},
|
|
script_name="my_enrichment_script.py"
|
|
)
|
|
"""
|
|
|
|
import logging
import os
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional, Set

import yaml
|
|
|
|
# Module-level logger; handlers/levels are configured by the calling script.
logger = logging.getLogger(__name__)

# Keys that must NEVER be deleted during enrichment.
# Per AGENTS.md Rule 5: NEVER Delete Enriched Data - Additive Only
PROTECTED_KEYS: Set[str] = {
    # Core structural keys
    'custodian_name',
    'ghcid',
    'identifiers',
    'original_entry',
    'provenance',

    # Normalized location data (expensive to regenerate)
    'location',

    # Annotation data
    'ch_annotator',

    # Enrichment sources (expensive API calls to collect)
    'google_maps_enrichment',
    'osm_enrichment',
    'unesco_mow_enrichment',
    'viaf_enrichment',
    'web_enrichment',
    'wikidata_enrichment',
    'youtube_enrichment',

    # Platform data
    'digital_platforms',
}
|
|
|
|
|
|
def safe_update_custodian(
    yaml_path: Path | str,
    updates: Dict[str, Any],
    script_name: str = "unknown_script",
    add_provenance_note: bool = True
) -> bool:
    """
    Safely update a custodian YAML file without losing existing data.

    This function:
    1. Reads the current file content FRESH (not from cache)
    2. Verifies all protected keys are preserved after update
    3. Adds a provenance note documenting the change
    4. Writes back atomically (temp file in the same directory, then
       ``os.replace``), so a crash or concurrent reader never sees a
       half-written file

    Args:
        yaml_path: Path to the custodian YAML file
        updates: Dictionary of key-value pairs to add/update
        script_name: Name of the calling script (for provenance)
        add_provenance_note: Whether to add a note to provenance.notes

    Returns:
        True if successful, False if update was blocked to prevent data loss
        or any error occurred (errors are logged, not raised).
    """
    yaml_path = Path(yaml_path)

    if not yaml_path.exists():
        logger.error(f"File not found: {yaml_path}")
        return False

    try:
        # Read FRESH from disk (not from cache) — prevents the load-then-
        # write-later race that caused the 62fdd35321 data loss.
        with open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if data is None:
            logger.error(f"Empty or invalid YAML: {yaml_path}")
            return False

        # Record protected keys BEFORE modification
        protected_before = set(data.keys()) & PROTECTED_KEYS

        # Apply updates (top-level keys are replaced; use merge_enrichment
        # for deep merging of nested enrichment dicts)
        data.update(updates)

        # Verify protected keys AFTER modification (AGENTS.md Rule 5)
        protected_after = set(data.keys()) & PROTECTED_KEYS
        lost_keys = protected_before - protected_after
        if lost_keys:
            logger.error(
                f"BLOCKED: Update to {yaml_path.name} would delete protected keys: {lost_keys}"
            )
            return False

        # Add provenance note
        if add_provenance_note:
            if 'provenance' not in data:
                data['provenance'] = {}
            if 'notes' not in data['provenance'] or not isinstance(data['provenance'].get('notes'), list):
                data['provenance']['notes'] = []

            timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
            data['provenance']['notes'].append(f"Updated by {script_name} on {timestamp}")

        # Atomic write: dump to a temp file beside the target (same
        # filesystem, so the rename is atomic), then replace the original.
        # The previous in-place open(path, 'w') truncated the file first,
        # meaning a crash mid-dump lost the whole record.
        fd, tmp_path = tempfile.mkstemp(
            dir=str(yaml_path.parent), prefix=yaml_path.name, suffix='.tmp'
        )
        try:
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                          sort_keys=False, width=100)
            os.replace(tmp_path, yaml_path)
        except Exception:
            # Best-effort cleanup of the orphaned temp file; re-raise so the
            # outer handler logs the failure and returns False.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise

        return True

    except Exception as e:
        logger.error(f"Error updating {yaml_path}: {e}")
        return False
|
|
|
|
|
|
def merge_enrichment(
    yaml_path: Path | str,
    enrichment_key: str,
    enrichment_data: Dict[str, Any],
    script_name: str = "unknown_script"
) -> bool:
    """
    Merge new enrichment data with existing enrichment data.

    Unlike simple updates, this MERGES nested dictionaries rather than
    replacing them entirely. Useful for adding to wikidata_enrichment, etc.

    The file is re-read immediately before modification and written back
    atomically (temp file + ``os.replace``), so concurrent scripts and
    crashes cannot leave a truncated or stale file behind.

    Args:
        yaml_path: Path to the custodian YAML file
        enrichment_key: Key for the enrichment (e.g., 'wikidata_enrichment')
        enrichment_data: New enrichment data to merge
        script_name: Name of the calling script (for provenance)

    Returns:
        True if successful, False if blocked to prevent data loss or any
        error occurred (errors are logged, not raised).
    """
    yaml_path = Path(yaml_path)

    if not yaml_path.exists():
        logger.error(f"File not found: {yaml_path}")
        return False

    try:
        # Read FRESH from disk immediately before modifying
        with open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # Unlike safe_update_custodian(), an empty file is treated as a
        # fresh document rather than an error.
        if data is None:
            data = {}

        # Record protected keys BEFORE modification
        protected_before = set(data.keys()) & PROTECTED_KEYS

        # Merge enrichment data
        if enrichment_key not in data:
            data[enrichment_key] = {}

        if isinstance(data[enrichment_key], dict) and isinstance(enrichment_data, dict):
            # Deep merge for dict values
            _deep_merge(data[enrichment_key], enrichment_data)
        else:
            # Replace for non-dict values
            data[enrichment_key] = enrichment_data

        # Verify protected keys AFTER modification (AGENTS.md Rule 5)
        protected_after = set(data.keys()) & PROTECTED_KEYS
        lost_keys = protected_before - protected_after
        if lost_keys:
            logger.error(
                f"BLOCKED: Merge to {yaml_path.name} would delete protected keys: {lost_keys}"
            )
            return False

        # Add provenance note (always, unlike safe_update_custodian which
        # makes it optional)
        if 'provenance' not in data:
            data['provenance'] = {}
        if 'notes' not in data['provenance'] or not isinstance(data['provenance'].get('notes'), list):
            data['provenance']['notes'] = []

        timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
        data['provenance']['notes'].append(
            f"Enrichment merged ({enrichment_key}) by {script_name} on {timestamp}"
        )

        # Atomic write: dump to a sibling temp file, then rename over the
        # original. The previous open(path, 'w') truncated first, so a
        # crash mid-dump could destroy the record.
        fd, tmp_path = tempfile.mkstemp(
            dir=str(yaml_path.parent), prefix=yaml_path.name, suffix='.tmp'
        )
        try:
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                          sort_keys=False, width=100)
            os.replace(tmp_path, yaml_path)
        except Exception:
            # Best-effort cleanup; re-raise so the outer handler logs it.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise

        return True

    except Exception as e:
        logger.error(f"Error merging enrichment to {yaml_path}: {e}")
        return False
|
|
|
|
|
|
def _deep_merge(base: Dict, updates: Dict) -> None:
|
|
"""
|
|
Deep merge updates into base dict, modifying base in-place.
|
|
|
|
For nested dicts, recursively merges. For other types, updates replace.
|
|
"""
|
|
for key, value in updates.items():
|
|
if key in base and isinstance(base[key], dict) and isinstance(value, dict):
|
|
_deep_merge(base[key], value)
|
|
else:
|
|
base[key] = value
|