- Add fix_yaml_history.py and fix_yaml_history_v2.py for cleaning up malformed ghcid_history entries with duplicate/redundant data
- Update load_custodians_to_ducklake.py for DuckDB lakehouse loading
- Update migrate_web_archives.py for web archive management
- Update deploy.sh with improvements
- Ignore entire data/ducklake/ directory (generated databases)
90 lines
3 KiB
Python
Executable file
90 lines
3 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Fix malformed ghcid_history entries in YAML files.
|
|
|
|
The issue: Some files have multiple history entries concatenated on a single line:
|
|
reason: Migrated from CH to EG namespace - Assiut - ghcid: XX-XX-XXX-L-AUL
|
|
|
|
This should be split into separate list items.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
def fix_yaml_content(content: str) -> str:
    """Split history entries that were concatenated onto a ``reason:`` line.

    A malformed line looks like::

        reason: Migrated from CH to EG namespace - Assiut - ghcid: XX-XX-XXX-L-AUL

    Everything from the embedded `` - ghcid:`` (or `` - valid_from:``) marker
    onwards is moved to a new line that starts a separate list item. Only the
    first occurrence of the chosen marker is split, and `` - ghcid:`` takes
    precedence over `` - valid_from:`` when a line contains both.

    The dash of the new item is aligned with the dash of the list item that
    contains the ``reason:`` line, so the result is a sibling entry rather
    than a sequence marker placed at key level inside the current mapping.
    When no enclosing list item can be found, the indentation of the
    ``reason:`` line itself is used.

    Args:
        content: Full text of a YAML file.

    Returns:
        The text with malformed lines split; identical to ``content`` when
        no line matches.
    """
    # Checked in this order, so " - ghcid:" wins when a line has both.
    markers = (' - ghcid:', ' - valid_from:')
    fixed_lines = []

    for line in content.split('\n'):
        marker = None
        if 'reason:' in line:
            marker = next((m for m in markers if m in line), None)
        if marker is None:
            fixed_lines.append(line)
            continue

        head, tail = line.split(marker, 1)
        # Resolve the indent before appending, so only earlier lines are scanned.
        indent = _list_item_indent(line, fixed_lines)
        fixed_lines.append(head.rstrip())
        # marker[1:] drops the separating space: "- ghcid:" / "- valid_from:".
        fixed_lines.append(f"{indent}{marker[1:]}{tail}")

    return '\n'.join(fixed_lines)


def _list_item_indent(line: str, preceding: list) -> str:
    """Return the indentation of the list item that contains ``line``.

    ``preceding`` holds the lines that come before ``line`` in the document.
    """
    body = line.lstrip()
    own_indent = line[:len(line) - len(body)]
    if body.startswith('- '):
        # "- reason: ..." opens the list item itself.
        return own_indent

    for prev in reversed(preceding):
        prev_body = prev.lstrip()
        if not prev_body or prev_body.startswith('#'):
            continue  # blank lines and comments carry no structure
        prev_indent = prev[:len(prev) - len(prev_body)]
        if len(prev_indent) < len(own_indent):
            # The first shallower line is the parent of the current mapping;
            # it only gives the answer when it is the item's own dash line.
            if prev_body == '-' or prev_body.startswith('- '):
                return prev_indent
            break

    # No enclosing list item found: keep the indentation of the line itself.
    return own_indent
|
|
|
|
|
|
def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """Apply ``fix_yaml_content`` to one file and report whether it changed.

    Args:
        filepath: File to read as UTF-8 text.
        dry_run: When true, only print what would be fixed; the file is not
            written.

    Returns:
        True if the fixed text differs from the text on disk, else False.
    """
    with open(filepath, 'r', encoding='utf-8') as src:
        before = src.read()

    after = fix_yaml_content(before)

    # Nothing to repair: leave the file untouched and stay silent.
    if after == before:
        return False

    if dry_run:
        print(f"Would fix: {filepath.name}")
        return True

    with open(filepath, 'w', encoding='utf-8') as dst:
        dst.write(after)
    print(f"Fixed: {filepath.name}")
    return True
|
|
|
|
|
|
def main():
    """Fix every ``BE-*.yaml`` and ``EG-*.yaml`` file under ``data/custodian``.

    Passing ``--dry-run`` on the command line reports the files that would
    change without writing them. Exits with status 1 when the directory does
    not exist (the path is relative to the current working directory).
    """
    custodian_dir = Path('data/custodian')
    if not custodian_dir.exists():
        print("Error: data/custodian directory not found")
        sys.exit(1)

    dry_run = '--dry-run' in sys.argv

    # Only the prefixes known to have issues; the generator is lazy, so the
    # EG- glob runs after all BE- files have been processed.
    candidates = (
        path
        for prefix in ('BE-', 'EG-')
        for path in sorted(custodian_dir.glob(f'{prefix}*.yaml'))
    )
    fixed_count = sum(1 for path in candidates if process_file(path, dry_run))

    label = 'Would fix' if dry_run else 'Fixed'
    print(f"\n{label}: {fixed_count} files")
|
|
|
|
|
|
# Run the fixer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|