glam/scripts/fix_ghcid_mismatches.py
2025-12-21 22:12:34 +01:00

175 lines
5.6 KiB
Python

#!/usr/bin/env python3
"""
Fix GHCID mismatches where ghcid_current doesn't match the filename.
Per AGENTS.md Rule on PID Stability, the ghcid_current MUST match the filename.
This script:
1. Finds all files where ghcid_current != filename
2. Updates ghcid_current to match filename
3. Updates ghcid_history to record the correction
4. Logs all changes for audit trail
"""
import os
import yaml
import re
from datetime import datetime, timezone
from pathlib import Path
def get_ghcid_from_file(filepath: Path) -> str | None:
"""Extract ghcid_current from a YAML file."""
with open(filepath, 'r', encoding='utf-8') as f:
content = f.read()
# Use regex to find ghcid_current value
match = re.search(r'ghcid_current:\s*[\'"]?([^\s\'"]+)[\'"]?', content)
if match:
return match.group(1)
return None
def fix_ghcid_mismatch(filepath: Path, correct_ghcid: str, old_ghcid: str) -> bool:
"""
Fix the ghcid_current in a file to match the filename.
Returns True if file was modified, False otherwise.
"""
with open(filepath, 'r', encoding='utf-8') as f:
content = f.read()
# Replace ghcid_current value
# Handle both quoted and unquoted values
patterns = [
(rf"ghcid_current:\s*'{re.escape(old_ghcid)}'", f"ghcid_current: '{correct_ghcid}'"),
(rf'ghcid_current:\s*"{re.escape(old_ghcid)}"', f'ghcid_current: "{correct_ghcid}"'),
(rf'ghcid_current:\s*{re.escape(old_ghcid)}(?=\s|$)', f'ghcid_current: {correct_ghcid}'),
]
new_content = content
for pattern, replacement in patterns:
new_content = re.sub(pattern, replacement, new_content, count=1)
if new_content != content:
break
if new_content == content:
print(f" WARNING: Could not find ghcid_current to replace in {filepath.name}")
return False
# Add correction note to ghcid_history if it exists, or add a comment
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
# Check if there's a ghcid_history section
if 'ghcid_history:' in new_content:
# Add entry to existing history
history_entry = f"""
- ghcid_value: {old_ghcid}
valid_from: null
valid_to: '{timestamp}'
reason: "Corrected: ghcid_current was incorrectly set to {old_ghcid}, should be {correct_ghcid} (filename mismatch fix)"
"""
# Find ghcid_history and add entry after it
new_content = re.sub(
r'(ghcid_history:\s*\n)',
r'\1' + history_entry,
new_content,
count=1
)
else:
# Add a comment about the correction
correction_comment = f"# GHCID Correction {timestamp}: Changed from {old_ghcid} to {correct_ghcid} (filename mismatch)\n"
# Add after ghcid_current line
new_content = re.sub(
rf'(ghcid_current:\s*{re.escape(correct_ghcid)}[^\n]*\n)',
r'\1' + correction_comment,
new_content,
count=1
)
with open(filepath, 'w', encoding='utf-8') as f:
f.write(new_content)
return True
def main(custodian_dir: Path | str | None = None) -> None:
    """Find and fix every ``*.yaml`` file whose ghcid_current differs from its filename.

    Progress is printed to stdout, and an audit log named
    ``ghcid_mismatch_fix_log.txt`` is written into the scanned directory
    whenever at least one mismatch was found.

    Args:
        custodian_dir: Directory containing the custodian YAML files. When
            omitted, the original hard-coded location is used.
    """
    if custodian_dir is None:
        custodian_dir = '/Users/kempersc/apps/glam/data/custodian'
    custodian_dir = Path(custodian_dir)

    print("=" * 70)
    print("GHCID Mismatch Fixer")
    print("=" * 70)
    print(f"Scanning: {custodian_dir}")
    print()

    # A missing directory would otherwise be reported as "no mismatches".
    if not custodian_dir.is_dir():
        print(f"ERROR: Directory not found: {custodian_dir}")
        return

    # Find all mismatches
    mismatches = []
    for filepath in sorted(custodian_dir.glob('*.yaml')):
        filename_ghcid = filepath.stem  # filename without .yaml
        file_ghcid = get_ghcid_from_file(filepath)
        if file_ghcid and file_ghcid != filename_ghcid:
            mismatches.append({
                'filepath': filepath,
                'filename_ghcid': filename_ghcid,
                'file_ghcid': file_ghcid,
                'fixed': False,
            })

    print(f"Found {len(mismatches)} files with GHCID mismatches")
    print()
    if not mismatches:
        print("No mismatches to fix!")
        return

    # Display mismatches
    print("Mismatches to fix:")
    print("-" * 70)
    for m in mismatches:
        print(f" {m['filepath'].name}")
        print(f" Current ghcid_current: {m['file_ghcid']}")
        print(f" Should be: {m['filename_ghcid']}")
        print()

    # Fix them
    print("Fixing mismatches...")
    print("-" * 70)
    for m in mismatches:
        print(f"Fixing: {m['filepath'].name}")
        try:
            m['fixed'] = fix_ghcid_mismatch(m['filepath'], m['filename_ghcid'], m['file_ghcid'])
        except (OSError, UnicodeError) as exc:
            # Keep going so the remaining files are processed and the audit
            # log still gets written.
            print(f" ERROR: {exc}")
            m['fixed'] = False
        if m['fixed']:
            print(f" OK: {m['file_ghcid']} -> {m['filename_ghcid']}")
        else:
            print(" FAILED")

    fixed_count = sum(1 for m in mismatches if m['fixed'])
    failed_count = len(mismatches) - fixed_count

    print()
    print("=" * 70)
    print(f"Summary: Fixed {fixed_count} files, Failed {failed_count} files")
    print("=" * 70)

    # Write audit log
    log_path = custodian_dir / 'ghcid_mismatch_fix_log.txt'
    with open(log_path, 'w', encoding='utf-8') as f:
        f.write("GHCID Mismatch Fix Log\n")
        f.write(f"Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write("=" * 70 + "\n\n")
        f.write(f"Total mismatches found: {len(mismatches)}\n")
        f.write(f"Fixed: {fixed_count}\n")
        f.write(f"Failed: {failed_count}\n\n")
        f.write("Details:\n")
        f.write("-" * 70 + "\n")
        for m in mismatches:
            f.write(f"File: {m['filepath'].name}\n")
            f.write(f" Old ghcid_current: {m['file_ghcid']}\n")
            f.write(f" New ghcid_current: {m['filename_ghcid']}\n")
            # Record the outcome so failed files are not logged as changed.
            f.write(f" Status: {'FIXED' if m['fixed'] else 'FAILED'}\n\n")
    print(f"\nAudit log written to: {log_path}")
# Script entry point: run the fixer only when executed directly, not on import.
if __name__ == '__main__':
    main()