- Add fix_yaml_history.py and fix_yaml_history_v2.py for cleaning up malformed ghcid_history entries with duplicate/redundant data - Update load_custodians_to_ducklake.py for DuckDB lakehouse loading - Update migrate_web_archives.py for web archive management - Update deploy.sh with improvements - Ignore entire data/ducklake/ directory (generated databases)
154 lines
5.2 KiB
Python
#!/usr/bin/env python3
"""
Fix malformed ghcid_history entries in YAML files.

Version 2: More robust parsing and reconstruction.
"""
|
|
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def fix_ghcid_history_section(content: str) -> str:
    """Fix the ghcid_history section of a YAML file.

    Locates the ``ghcid_history:`` key, re-parses the (possibly mangled)
    list items beneath it, de-duplicates them by the
    ``(ghcid, valid_from)`` pair, and rewrites the section with
    consistent two-space list indentation.  Content before and after the
    section is preserved verbatim.

    Args:
        content: Full text of the YAML file.

    Returns:
        The repaired text, or *content* unchanged when no
        ``ghcid_history`` section exists.
    """
    # Anchor on the line start and match only horizontal whitespace.
    # The previous pattern used (\s*), which also swallowed the
    # preceding newline into base_indent, corrupting both the rebuilt
    # indentation and the end-of-section pattern below.
    history_match = re.search(r'^([ \t]*)ghcid_history:[ \t]*\n', content,
                              re.MULTILINE)
    if not history_match:
        return content  # No ghcid_history section

    base_indent = history_match.group(1)
    list_indent = base_indent + "  "
    item_indent = list_indent + "  "

    # Everything after the "ghcid_history:" line.
    remaining = content[history_match.end():]

    # The section ends at the next key sitting at the same indent level.
    end_pattern = re.compile(rf'^{re.escape(base_indent)}[a-z_]+:',
                             re.MULTILINE)
    end_match = end_pattern.search(remaining)

    if end_match:
        history_section = remaining[:end_match.start()]
        after_section = remaining[end_match.start():]
    else:
        history_section = remaining
        after_section = ""

    # Re-parse the entries line by line.  Items may be concatenated on
    # one line or split incorrectly, so reconstruct them from key/value
    # pairs rather than trusting the original list structure.
    entries = []
    current_entry = {}

    for line in history_section.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue

        if stripped.startswith('- ghcid:') or (
                stripped.startswith('ghcid:') and not current_entry):
            # Start of a new entry.  Split on the first colon instead of
            # str.replace so a value containing "ghcid:" is untouched.
            if current_entry:
                entries.append(current_entry)
            value = stripped.split(':', 1)[1].strip().strip("'\"")
            current_entry = {'ghcid': value}
        elif stripped.startswith('- ') and ':' in stripped[2:]:
            # A list item opening with some other key also starts a new
            # entry (its ghcid line was lost or merged elsewhere).
            if current_entry:
                entries.append(current_entry)
            key, val = stripped[2:].split(':', 1)
            current_entry = {key.strip(): val.strip().strip("'\"")}
        elif ':' in stripped:
            # Continuation key/value pair for the current entry.
            key, val = stripped.split(':', 1)
            key = key.strip().replace('- ', '')
            val = val.strip().strip("'\"")
            if key and val:
                current_entry[key] = val

    if current_entry:
        entries.append(current_entry)

    # De-duplicate by (ghcid, valid_from); drop entries with no ghcid.
    seen = set()
    unique_entries = []
    for entry in entries:
        key = (entry.get('ghcid', ''), entry.get('valid_from', ''))
        if key not in seen and entry.get('ghcid'):
            seen.add(key)
            unique_entries.append(entry)

    # Rebuild the section with canonical indentation and key order.
    new_history = f"{base_indent}ghcid_history:\n"
    for entry in unique_entries:
        new_history += f"{list_indent}- ghcid: {entry.get('ghcid', '')}\n"
        if 'valid_from' in entry:
            new_history += f"{item_indent}valid_from: '{entry['valid_from']}'\n"
        if 'ghcid_numeric' in entry:
            new_history += f"{item_indent}ghcid_numeric: {entry['ghcid_numeric']}\n"
        if 'reason' in entry:
            # Quote reasons containing colons so the YAML stays parseable.
            reason = entry['reason']
            if ':' in reason and not reason.startswith(('"', "'")):
                reason = f'"{reason}"'
            new_history += f"{item_indent}reason: {reason}\n"

    # new_history already contains the "ghcid_history:" header line, so
    # splice it in place of the whole original section.  (The previous
    # version re-emitted the matched header and duplicated it.)
    return content[:history_match.start()] + new_history + after_section
|
|
|
|
|
|
def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """Repair one YAML file in place.

    Args:
        filepath: Path to the YAML file to inspect.
        dry_run: When True, report what would change without writing.

    Returns:
        True when the file's content needed fixing, False otherwise.
    """
    original = filepath.read_text(encoding='utf-8')
    fixed = fix_ghcid_history_section(original)

    # Nothing to do when the fixer left the text untouched.
    if fixed == original:
        return False

    if dry_run:
        print(f"Would fix: {filepath.name}")
    else:
        filepath.write_text(fixed, encoding='utf-8')
        print(f"Fixed: {filepath.name}")
    return True
|
|
|
|
|
|
def main():
    """Fix every BE-/EG- custodian YAML file under data/custodian.

    Pass ``--dry-run`` on the command line to only report the files
    that would change.
    """
    dry_run = '--dry-run' in sys.argv
    custodian_dir = Path('data/custodian')

    # Count the files that actually needed fixing, prefix by prefix.
    fixed_count = sum(
        1
        for prefix in ('BE-', 'EG-')
        for yaml_file in sorted(custodian_dir.glob(f'{prefix}*.yaml'))
        if process_file(yaml_file, dry_run)
    )

    print(f"\n{'Would fix' if dry_run else 'Fixed'}: {fixed_count} files")
|
|
|
|
|
|
# Run the fixer only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|