#!/usr/bin/env python3
"""
Fix duplicate 'temporal_coverage:' keys caused by batch fix script.

This script removes the duplicate prefix pattern:
    temporal_coverage: temporal_coverage: 1500-01-01/1899-12-31
    →
    temporal_coverage: 1500-01-01/1899-12-31
"""

import re
from pathlib import Path

# Files with duplicate temporal_coverage keys (from rg search)
AFFECTED_FILES = [
    "data/instances/algeria/algerian_institutions.yaml",
    "data/instances/libya/libyan_institutions.yaml",
    "data/instances/libya/libyan_institutions_backup_20251111.yaml",
    "data/instances/georgia/georgian_institutions_enriched_batch3_final.yaml",
    "data/instances/georgia/georgian_institutions_enriched_batch1.yaml",
    "data/instances/georgia/georgian_institutions_enriched_batch2.yaml",
    "data/instances/georgia_glam_institutions.yaml",
    "data/instances/all/globalglam-20251111-batch16-pre-fix-20251111-230522.yaml",
    "data/instances/all/globalglam-20251111-batch16-fixed.yaml",
    "data/instances/brazil/brazilian_institutions_final.yaml",
    "data/instances/all/globalglam-20251111-batch16.yaml",
    "data/instances/brazil/brazilian_institutions_batch7_enriched.yaml",
    "data/instances/brazil/brazilian_institutions_batch6_enriched.yaml",
    "data/instances/all/globalglam-20251111.yaml",
    "data/instances/all/globalglam-20251111-pre-batch16-20251111-230249.yaml",
    "data/instances/all/globalglam-20251111_backup_20251111_144746.yaml",
    "data/instances/norway/museums_southern_norway.yaml",
    "data/instances/norway/museums_trondelag.yaml",
    "data/instances/norway/museums_eastern_norway.yaml",
    "data/instances/brazil/brazilian_institutions_batch8_enriched.yaml",
    "data/instances/norway/museums_northern_norway.yaml",
    "data/instances/norway/museums_oslo.yaml",
    "data/instances/norway/national_aggregators.yaml",
    "data/instances/norway/museums_western_norway.yaml",
    "data/instances/tunisia/tunisian_institutions.yaml",
    "data/instances/tunisia/tunisian_institutions_enhanced.yaml",
    "data/instances/tunisia/tunisian_institutions_enhanced.backup.yaml",
    "data/instances/georgia_glam_institutions_enriched.yaml",
    "data/instances/georgia_glam_institutions_enriched.pre_enrichment_backfill_20251111_100230.yaml",
    "data/instances/all/globalglam-20251111_backup_20251111_144624.yaml",
]

# Matches one line whose key has been repeated one or more times:
#   group 1 - leading indentation (possibly empty), plus an optional "- "
#             list-item marker
#   group 2 - the value that follows the last repeated key
# Compiled once here because it is applied to every line of every file.
_DUPLICATE_KEY_RE = re.compile(
    r'^(\s*(?:-\s+)?)temporal_coverage:(?:\s+temporal_coverage:)+\s+(.+)$'
)


def fix_duplicate_temporal_coverage(content: str) -> tuple[str, int]:
    """
    Remove duplicate 'temporal_coverage:' prefixes.

    The text is processed line by line (split on '\\n'). A line is rewritten
    when, after its indentation and an optional '- ' list marker, the key
    'temporal_coverage:' appears two or more times in a row followed by a
    value. All repetitions are collapsed to a single key in one pass, so a
    key that was tripled by repeated runs of the faulty batch script does
    not need a second run of this fixer. Every other line is kept verbatim.

    Args:
        content: Full text of a YAML file.

    Returns:
        (fixed_content, num_fixes), where num_fixes is the number of lines
        that were rewritten.
    """
    fixed_lines = []
    fix_count = 0

    for line in content.split('\n'):
        match = _DUPLICATE_KEY_RE.match(line)
        if match is None:
            fixed_lines.append(line)
            continue

        prefix, value = match.groups()
        fixed_lines.append(f"{prefix}temporal_coverage: {value}")
        fix_count += 1

    return '\n'.join(fixed_lines), fix_count


def main():
    """Fix duplicate temporal_coverage keys in all affected files.

    Paths in AFFECTED_FILES are resolved against the parent of the directory
    that contains this script. Missing files are reported and skipped; a file
    is rewritten only when at least one line was actually fixed.
    """
    repo_root = Path(__file__).parent.parent

    total_fixed = 0
    fixed_files = []

    for file_path_str in AFFECTED_FILES:
        file_path = repo_root / file_path_str

        if not file_path.exists():
            print(f"⚠️ File not found: {file_path_str}")
            continue

        # Read file
        content = file_path.read_text(encoding='utf-8')

        # Fix duplicates
        fixed_content, fix_count = fix_duplicate_temporal_coverage(content)

        if fix_count > 0:
            # Write back only when something changed, so untouched files
            # keep their original bytes and modification time.
            file_path.write_text(fixed_content, encoding='utf-8')
            print(f"✅ Fixed {fix_count} duplicate(s) in {file_path_str}")
            total_fixed += fix_count
            fixed_files.append(file_path_str)
        else:
            print(f"ℹ️ No duplicates found in {file_path_str}")

    print(f"\n{'='*70}")
    print(f"✅ Total fixes: {total_fixed} duplicates across {len(fixed_files)} files")
    print(f"{'='*70}")


if __name__ == "__main__":
    main()