#!/usr/bin/env python3 """ Fix GHCID mismatches where ghcid_current doesn't match the filename. Per AGENTS.md Rule on PID Stability, the ghcid_current MUST match the filename. This script: 1. Finds all files where ghcid_current != filename 2. Updates ghcid_current to match filename 3. Updates ghcid_history to record the correction 4. Logs all changes for audit trail """ import os import yaml import re from datetime import datetime, timezone from pathlib import Path def get_ghcid_from_file(filepath: Path) -> str | None: """Extract ghcid_current from a YAML file.""" with open(filepath, 'r', encoding='utf-8') as f: content = f.read() # Use regex to find ghcid_current value match = re.search(r'ghcid_current:\s*[\'"]?([^\s\'"]+)[\'"]?', content) if match: return match.group(1) return None def fix_ghcid_mismatch(filepath: Path, correct_ghcid: str, old_ghcid: str) -> bool: """ Fix the ghcid_current in a file to match the filename. Returns True if file was modified, False otherwise. """ with open(filepath, 'r', encoding='utf-8') as f: content = f.read() # Replace ghcid_current value # Handle both quoted and unquoted values patterns = [ (rf"ghcid_current:\s*'{re.escape(old_ghcid)}'", f"ghcid_current: '{correct_ghcid}'"), (rf'ghcid_current:\s*"{re.escape(old_ghcid)}"', f'ghcid_current: "{correct_ghcid}"'), (rf'ghcid_current:\s*{re.escape(old_ghcid)}(?=\s|$)', f'ghcid_current: {correct_ghcid}'), ] new_content = content for pattern, replacement in patterns: new_content = re.sub(pattern, replacement, new_content, count=1) if new_content != content: break if new_content == content: print(f" WARNING: Could not find ghcid_current to replace in {filepath.name}") return False # Add correction note to ghcid_history if it exists, or add a comment timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') # Check if there's a ghcid_history section if 'ghcid_history:' in new_content: # Add entry to existing history history_entry = f""" - ghcid_value: {old_ghcid} valid_from: null valid_to: '{timestamp}' reason: "Corrected: ghcid_current was incorrectly set to {old_ghcid}, should be {correct_ghcid} (filename mismatch fix)" """ # Find ghcid_history and add entry after it new_content = re.sub( r'(ghcid_history:\s*\n)', r'\1' + history_entry, new_content, count=1 ) else: # Add a comment about the correction correction_comment = f"# GHCID Correction {timestamp}: Changed from {old_ghcid} to {correct_ghcid} (filename mismatch)\n" # Add after ghcid_current line new_content = re.sub( rf'(ghcid_current:\s*{re.escape(correct_ghcid)}[^\n]*\n)', r'\1' + correction_comment, new_content, count=1 ) with open(filepath, 'w', encoding='utf-8') as f: f.write(new_content) return True def main(): custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian') print("=" * 70) print("GHCID Mismatch Fixer") print("=" * 70) print(f"Scanning: {custodian_dir}") print() mismatches = [] # Find all mismatches for filepath in sorted(custodian_dir.glob('*.yaml')): filename_ghcid = filepath.stem # filename without .yaml file_ghcid = get_ghcid_from_file(filepath) if file_ghcid and file_ghcid != filename_ghcid: mismatches.append({ 'filepath': filepath, 'filename_ghcid': filename_ghcid, 'file_ghcid': file_ghcid }) print(f"Found {len(mismatches)} files with GHCID mismatches") print() if not mismatches: print("No mismatches to fix!") return # Display mismatches print("Mismatches to fix:") print("-" * 70) for m in mismatches: print(f" {m['filepath'].name}") print(f" Current ghcid_current: {m['file_ghcid']}") print(f" Should be: {m['filename_ghcid']}") print() # Fix them print("Fixing mismatches...") print("-" * 70) fixed_count = 0 failed_count = 0 for m in mismatches: print(f"Fixing: {m['filepath'].name}") if fix_ghcid_mismatch(m['filepath'], m['filename_ghcid'], m['file_ghcid']): print(f" OK: {m['file_ghcid']} -> {m['filename_ghcid']}") fixed_count += 1 else: print(f" FAILED") failed_count += 1 print() print("=" * 70) print(f"Summary: Fixed {fixed_count} files, Failed {failed_count} files") print("=" * 70) # Write audit log log_path = custodian_dir / 'ghcid_mismatch_fix_log.txt' with open(log_path, 'w', encoding='utf-8') as f: f.write(f"GHCID Mismatch Fix Log\n") f.write(f"Generated: {datetime.now(timezone.utc).isoformat()}\n") f.write(f"=" * 70 + "\n\n") f.write(f"Total mismatches found: {len(mismatches)}\n") f.write(f"Fixed: {fixed_count}\n") f.write(f"Failed: {failed_count}\n\n") f.write("Details:\n") f.write("-" * 70 + "\n") for m in mismatches: f.write(f"File: {m['filepath'].name}\n") f.write(f" Old ghcid_current: {m['file_ghcid']}\n") f.write(f" New ghcid_current: {m['filename_ghcid']}\n\n") print(f"\nAudit log written to: {log_path}") if __name__ == '__main__': main()