glam/scripts/fix_yaml_history.py
kempersc 90a1f20271 chore: add YAML history fix scripts and update ducklake/deploy tooling
- Add fix_yaml_history.py and fix_yaml_history_v2.py for cleaning up
  malformed ghcid_history entries with duplicate/redundant data
- Update load_custodians_to_ducklake.py for DuckDB lakehouse loading
- Update migrate_web_archives.py for web archive management
- Update deploy.sh with improvements
- Ignore entire data/ducklake/ directory (generated databases)
2025-12-07 18:45:52 +01:00

90 lines
3 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Fix malformed ghcid_history entries in YAML files.
The issue: Some files have multiple history entries concatenated on a single line:
reason: Migrated from CH to EG namespace - Assiut - ghcid: XX-XX-XXX-L-AUL
This should be split into separate list items.
"""
import os
import re
import sys
from pathlib import Path
# Embedded list-item markers, in precedence order (``ghcid`` wins when both
# appear after ``reason:`` on the same line).
_EMBEDDED_MARKERS = (' - ghcid:', ' - valid_from:')


def _find_embedded_marker(line):
    """Return ``(marker, position)`` of the first marker after ``reason:``, or None."""
    reason_at = line.find('reason:')
    if reason_at == -1:
        return None
    for marker in _EMBEDDED_MARKERS:
        # Search only past ``reason:`` so the line's own leading "- ghcid:"
        # list marker is never mistaken for an embedded entry.
        marker_at = line.find(marker, reason_at)
        if marker_at != -1:
            return marker, marker_at
    return None


def _list_item_indent(lines, index):
    """Return the indentation of the ``-`` that opens the entry holding ``lines[index]``."""
    line = lines[index]
    stripped = line.lstrip()
    own_indent = line[:len(line) - len(stripped)]
    if stripped.startswith('- '):
        # The reason key sits on the list-marker line itself.
        return own_indent

    # Walk back to the nearest shallower line; if it is a list marker, that is
    # the entry this reason key belongs to.
    for position in range(index - 1, -1, -1):
        previous = lines[position]
        prev_stripped = previous.lstrip()
        if not prev_stripped or prev_stripped.startswith('#'):
            continue
        prev_indent = previous[:len(previous) - len(prev_stripped)]
        if len(prev_indent) >= len(own_indent):
            continue  # sibling key or nested content of the same entry
        if prev_stripped == '-' or prev_stripped.startswith('- '):
            return prev_indent
        break  # enclosing scope is not a list item

    # No enclosing list marker found: keep the reason line's own indentation.
    return own_indent


def fix_yaml_content(content: str) -> str:
    """Split ghcid_history entries that were fused onto a ``reason:`` line.

    A line is treated as malformed when ``reason:`` is followed, on the same
    line, by an embedded `` - ghcid:`` or `` - valid_from:`` marker, e.g.::

        reason: Migrated from CH to EG namespace - Assiut - ghcid: XX-XX-XXX-L-AUL

    The line is cut at the first such marker after ``reason:`` (`` - ghcid:``
    takes precedence over `` - valid_from:``).  The text before the marker
    stays on the reason line with trailing whitespace removed; the rest
    becomes a new ``- key: ...`` list item on the following line.  Every other
    line passes through unchanged, so the function is idempotent.

    The new item is indented to the column of the ``-`` that opened the entry
    the reason line belongs to, making it a sibling of that entry.  When no
    enclosing list marker can be found, the reason line's own indentation is
    used.

    Args:
        content: Full text of a YAML file.

    Returns:
        The text with each malformed line split in two; equal to ``content``
        when nothing matched.
    """
    lines = content.split('\n')
    fixed_lines = []
    for index, line in enumerate(lines):
        found = _find_embedded_marker(line)
        if found is None:
            fixed_lines.append(line)
            continue

        marker, marker_at = found
        head = line[:marker_at]
        tail = line[marker_at + len(marker):]
        fixed_lines.append(head.rstrip())
        # marker[1:] drops the separating space, leaving "- ghcid:" / "- valid_from:".
        fixed_lines.append(f"{_list_item_indent(lines, index)}{marker[1:]}{tail}")
    return '\n'.join(fixed_lines)
def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """Run ``fix_yaml_content`` over one file and report whether it changed.

    The file is read as UTF-8.  When the fixed text differs from what was
    read, a one-line status naming the file is printed and, unless
    ``dry_run`` is set, the file is overwritten in place.

    Args:
        filepath: YAML file to inspect.
        dry_run: If true, only announce the pending change; never write.

    Returns:
        True when the fixed text differs from the file's current text,
        False otherwise.
    """
    with open(filepath, 'r', encoding='utf-8') as source:
        before = source.read()

    after = fix_yaml_content(before)
    if after == before:
        return False

    if dry_run:
        print(f"Would fix: {filepath.name}")
        return True

    with open(filepath, 'w', encoding='utf-8') as target:
        target.write(after)
    print(f"Fixed: {filepath.name}")
    return True
def main():
    """Fix every ``BE-``/``EG-`` custodian YAML file and print a summary.

    Looks for ``data/custodian`` relative to the current working directory
    and exits with status 1 when it does not exist.  Passing ``--dry-run``
    anywhere on the command line reports what would change without writing.
    """
    dry_run = '--dry-run' in sys.argv

    custodian_dir = Path('data/custodian')
    if not custodian_dir.exists():
        print("Error: data/custodian directory not found")
        sys.exit(1)

    # Process files known to have issues (BE and EG prefixes).  The generator
    # is lazy, so each prefix is globbed only after the previous one is done.
    candidates = (
        yaml_file
        for prefix in ('BE-', 'EG-')
        for yaml_file in sorted(custodian_dir.glob(f'{prefix}*.yaml'))
    )
    fixed_count = sum(
        1 for yaml_file in candidates if process_file(yaml_file, dry_run)
    )

    print(f"\n{'Would fix' if dry_run else 'Fixed'}: {fixed_count} files")
# Run the fixer only when executed as a script, so importing this module
# (e.g. to reuse fix_yaml_content) has no side effects.
if __name__ == '__main__':
    main()