#!/usr/bin/env python3
"""
Batch fix common validation errors across all YAML instance files.

Fixes:
1. extent → item_count (Collection field alias)
2. metadata_standards → implemented_standards (DigitalPlatform field alias)
3. Temporal coverage format (ISO 8601: YYYY-MM-DD/YYYY-MM-DD)
4. Invalid platform_type enum values (CATALOG → COLLECTION_MANAGEMENT,
   REPOSITORY → DIGITAL_REPOSITORY)
5. subject_areas → subjects (Collection field alias)

The files are edited as raw text with regular expressions (they are never
parsed as YAML) so that comments and formatting are preserved.
"""

import sys
from pathlib import Path
import re


# (alias, canonical) field names, in the order the changes are reported.
_ALIAS_FIELDS = (
    ("extent", "item_count"),
    ("metadata_standards", "implemented_standards"),
    ("subject_areas", "subjects"),
)

# (invalid, valid) platform_type enum values.
_PLATFORM_TYPE_FIXES = (
    ("CATALOG", "COLLECTION_MANAGEMENT"),
    ("REPOSITORY", "DIGITAL_REPOSITORY"),
)

# An indented "temporal_coverage: <start>/<end>" line.  Group 1 is the whole
# prefix *including the key*, group 2 the start and group 3 the end.
_TEMPORAL_COVERAGE = re.compile(r'(\n\s+temporal_coverage: )([^\n]+?)/([^\n]+)')
_YEAR_ONLY = re.compile(r'^\d{1,4}$')
_FULL_DATE = re.compile(r'^-?\d{1,5}-\d{2}-\d{2}$')

# Substrings that mark a path below data/instances as inactive.
_EXCLUDED_MARKERS = ("archive", "backups", "test_outputs", "cache")


def load_yaml_content(filepath):
    """Load YAML file as raw text (preserve formatting)."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()


def save_yaml_content(filepath, content):
    """Save YAML content to file."""
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)


def fix_alias_fields(content):
    """Rename alias field names to their canonical names.

    Only keys that start an indented line (newline, whitespace, then the key
    and a colon) are renamed.

    Returns:
        ``(content, changes)`` where ``changes`` holds one human-readable
        entry per alias that occurred at least once.
    """
    changes = []
    for alias, canonical in _ALIAS_FIELDS:
        pattern = r'(\n\s+)' + re.escape(alias) + ':'
        content, count = re.subn(pattern, r'\g<1>' + canonical + ':', content)
        if count:
            changes.append(f"{alias} → {canonical}")
    return content, changes


def _normalize_date(value, year_suffix):
    """Return *value* as a full date, or None if its format is not handled.

    A bare year of 1-4 digits is zero-padded to four digits and completed
    with *year_suffix*; a value that already is a full date is returned
    unchanged.
    """
    if _YEAR_ONLY.match(value):
        return f"{value.zfill(4)}{year_suffix}"
    if _FULL_DATE.match(value):
        return value
    return None


def fix_temporal_coverage(content):
    """Fix temporal coverage format to ISO 8601 with full dates.

    Examples:
        - 1800/1930 → 1800-01-01/1930-12-31
        - 800/1500 → 0800-01-01/1500-12-31
        - 1800/1930-12-31 → 1800-01-01/1930-12-31 (incomplete start)

    Lines whose start or end is in any other format, and lines that are
    already complete, are left exactly as they are.

    Returns:
        ``(content, changes)`` where ``changes`` has one entry per line that
        was actually rewritten.
    """
    changes = []

    def replace_temporal(match):
        prefix, start, end = match.groups()
        start_fixed = _normalize_date(start, "-01-01")
        end_fixed = _normalize_date(end, "-12-31")

        # Unknown format on either side: skip the line.
        if start_fixed is None or end_fixed is None:
            return match.group(0)
        # Already complete: nothing to rewrite and nothing to report.
        if start_fixed == start and end_fixed == end:
            return match.group(0)

        changes.append(f"temporal_coverage: {start}/{end} → {start_fixed}/{end_fixed}")
        # The prefix already contains "temporal_coverage: "; emitting the key
        # again would produce "temporal_coverage: temporal_coverage: ...".
        return f"{prefix}{start_fixed}/{end_fixed}"

    content = _TEMPORAL_COVERAGE.sub(replace_temporal, content)
    return content, changes


def fix_platform_types(content):
    """Fix invalid platform_type enum values.

    The value may be bare, single-quoted or double-quoted; the replacement is
    always written unquoted.  A bare value is only matched when it ends
    there, so a longer value that merely starts with the invalid name is not
    touched.

    Returns:
        ``(content, changes)`` where ``changes`` holds one entry per invalid
        value that occurred at least once.
    """
    changes = []
    for invalid, valid in _PLATFORM_TYPE_FIXES:
        name = re.escape(invalid)
        pattern = rf"""platform_type: (?:'{name}'|"{name}"|{name}(?![\w-]))"""
        content, count = re.subn(pattern, f"platform_type: {valid}", content)
        if count:
            changes.append(f"platform_type: {invalid} → {valid}")
    return content, changes


def process_file(filepath):
    """Process a single YAML file and apply all fixes.

    The file is rewritten only when at least one fix changed its content.

    Returns:
        ``(True, changes)`` when the file was rewritten, otherwise
        ``(False, [])``.  I/O and decoding errors are printed and also
        yield ``(False, [])`` so that one bad file does not stop the batch.
    """
    try:
        original_content = load_yaml_content(filepath)
        content = original_content
        all_changes = []

        for fix in (fix_alias_fields, fix_temporal_coverage, fix_platform_types):
            content, changes = fix(content)
            all_changes.extend(changes)

        # Only save if changes were made
        if content != original_content:
            save_yaml_content(filepath, content)
            return True, all_changes
        return False, []
    except (OSError, UnicodeError) as e:
        print(f" ❌ Error processing {filepath}: {e}")
        return False, []


def get_active_yaml_files():
    """Get all active YAML files (excluding archives, backups, tests, cache).

    Searches ``data/instances`` below the current working directory
    recursively for ``*.yaml`` files.  The exclusion markers are tested
    against the path *relative to that directory*, so the location of the
    checkout itself cannot exclude files.

    Returns:
        A sorted list of ``Path`` objects.
    """
    instance_dir = Path.cwd() / "data" / "instances"

    active_files = []
    for candidate in instance_dir.rglob("*.yaml"):
        relative = str(candidate.relative_to(instance_dir))
        if not any(marker in relative for marker in _EXCLUDED_MARKERS):
            active_files.append(candidate)
    return sorted(active_files)


def main():
    """Run the batch fix over all active files and print a summary.

    Returns:
        The process exit code (always 0).
    """
    print("🔧 Batch Fix Validation Errors")
    print("=" * 80)
    print()

    # Get all files
    files = get_active_yaml_files()
    print(f"📄 Found {len(files)} active YAML files")
    print()

    # Process each file
    files_modified = []
    files_unchanged = []
    cwd = Path.cwd()

    for i, filepath in enumerate(files, 1):
        relative_path = filepath.relative_to(cwd)
        print(f"[{i}/{len(files)}] Processing {relative_path}...", end=" ")

        modified, changes = process_file(filepath)
        if modified:
            print(f"✅ Fixed ({len(changes)} changes)")
            files_modified.append((str(relative_path), changes))
        else:
            print("⏭️ No changes needed")
            files_unchanged.append(str(relative_path))

    print()
    print("=" * 80)
    print(f"✅ Files modified: {len(files_modified)}")
    print(f"⏭️ Files unchanged: {len(files_unchanged)}")
    print()

    if files_modified:
        print("📝 CHANGES MADE:")
        print("-" * 80)
        for filepath, changes in files_modified:
            print(f"\n{filepath}:")
            for change in changes[:10]:  # Show first 10 changes
                print(f" - {change}")
            if len(changes) > 10:
                print(f" ... and {len(changes) - 10} more changes")
        print()

    print("✅ Batch fix complete!")
    return 0


if __name__ == "__main__":
    sys.exit(main())