#!/usr/bin/env python3
"""
Fix remaining validation errors in YAML files, processing by country directory.

This optimized version:
- Processes files by country directory
- Shows progress indicators
- Skips large consolidated files and metadata files
- Provides detailed reporting per country
"""

import re
from datetime import datetime
from pathlib import Path

# Metadata standards mapping: known alias -> canonical identifier.
# A value of None marks entries that are platforms/protocols rather than
# metadata standards; those are dropped outright.
STANDARDS_MAPPING = {
    'Dublin Core': 'DUBLIN_CORE',
    'dublin core': 'DUBLIN_CORE',
    'DUBLIN-CORE': 'DUBLIN_CORE',
    'DSpace': None,   # Platform, not a standard
    'OAI-PMH': None,  # Protocol, not a standard
    'Z39.50': None,   # Protocol, not a standard
    'UNIMARC': 'MARC21',
    'ISAD(G)': 'EAD',
}

# Files to skip
SKIP_FILES = {
    'DATASET_STATISTICS.yaml',
    'ENRICHMENT_CANDIDATES.yaml',
    'tunisian_institutions_enhanced.yaml',
    'tunisian_institutions_enhanced.backup.yaml',
    'globalglam-20251111.yaml',  # Large consolidated file
}

# Directories to skip
SKIP_DIRS = {
    'all',  # Contains large consolidated files
    'archive',
    'backups',
    'cache',
    'exports',
    'test_outputs',
}


def fix_metadata_standards(standards_list):
    """Map aliases to canonical metadata standards and drop invalid ones.

    Args:
        standards_list: List of standard names, or a falsy value.

    Returns:
        A deduplicated list (first-seen order preserved), None when nothing
        valid remains, or the input unchanged when it is falsy.
    """
    if not standards_list:
        return standards_list

    fixed = []
    for std in standards_list:
        if std in STANDARDS_MAPPING:
            mapped = STANDARDS_MAPPING[std]
            if mapped:
                fixed.append(mapped)
            # mapped is None -> platform/protocol, drop it
        else:
            fixed.append(std)

    # dict.fromkeys dedupes while keeping insertion order. The previous
    # list(set(...)) produced a nondeterministic order, which made unchanged
    # lists compare unequal and caused spurious file rewrites between runs.
    return list(dict.fromkeys(fixed)) if fixed else None


def fix_temporal_coverage(coverage_str):
    """Fix temporal coverage patterns like '1983-01-01/present' or empty strings.

    Empty/blank coverage becomes None; an open-ended '/present' suffix is
    replaced with December 31st of the current year.
    """
    if not coverage_str or coverage_str.strip() == '':
        return None

    if 'present' in coverage_str.lower():
        current_year = datetime.now().year
        coverage_str = re.sub(
            r'/present',
            f'/{current_year}-12-31',
            coverage_str,
            flags=re.IGNORECASE,
        )

    return coverage_str


def fix_enrichment_history(entry):
    """Add required fields to an enrichment_history entry, in place.

    Defaults 'enrichment_type' to 'MANUAL' and 'verified' to False when
    missing, and collapses a list-valued 'enrichment_source' to its first
    element (None when the list is empty).

    Returns:
        The same (mutated) entry dict, for convenience.
    """
    if 'enrichment_type' not in entry:
        entry['enrichment_type'] = 'MANUAL'
    if 'verified' not in entry:
        entry['verified'] = False
    if 'enrichment_source' in entry and isinstance(entry['enrichment_source'], list):
        source_list = entry['enrichment_source']
        entry['enrichment_source'] = source_list[0] if source_list else None
    return entry


def fix_institution(inst):
    """Fix a single institution record, in place.

    Returns:
        A list of change-tag strings describing what was modified (empty when
        the record needed no fixes).
    """
    changes = []

    # Move 'notes' from provenance to description
    if 'provenance' in inst and 'notes' in inst['provenance']:
        notes = inst['provenance'].pop('notes')
        if 'description' not in inst or not inst['description']:
            inst['description'] = notes
        else:
            inst['description'] += f"\n\n{notes}"
        changes.append("moved_provenance_notes")

    # Move enrichment_history from root to provenance
    if 'enrichment_history' in inst:
        if 'provenance' not in inst:
            inst['provenance'] = {}
        inst['provenance']['enrichment_history'] = inst.pop('enrichment_history')
        changes.append("moved_enrichment_history")

    # Fix enrichment_history entries. Only record a change when an entry was
    # actually modified; the previous unconditional append caused every file
    # with an enrichment history to be rewritten even when nothing changed.
    if 'provenance' in inst and 'enrichment_history' in inst['provenance']:
        for entry in inst['provenance']['enrichment_history']:
            before = dict(entry)
            fix_enrichment_history(entry)
            if entry != before:
                changes.append("fixed_enrichment_entries")

    # Fix digital platforms
    if 'digital_platforms' in inst:
        for platform in inst['digital_platforms']:
            if 'description' in platform:
                platform.pop('description')
                changes.append("removed_platform_description")
            if 'notes' in platform:
                platform.pop('notes')
                changes.append("removed_platform_notes")
            if 'implemented_standards' in platform:
                fixed_standards = fix_metadata_standards(platform['implemented_standards'])
                if fixed_standards != platform['implemented_standards']:
                    platform['implemented_standards'] = fixed_standards
                    changes.append("fixed_metadata_standards")

    # Fix collections
    if 'collections' in inst:
        for collection in inst['collections']:
            if 'description' in collection:
                collection.pop('description')
                changes.append("removed_collection_description")
            if 'temporal_coverage' in collection:
                fixed = fix_temporal_coverage(collection['temporal_coverage'])
                if fixed != collection['temporal_coverage']:
                    collection['temporal_coverage'] = fixed
                    changes.append("fixed_temporal_coverage")

    # Fix change_history
    if 'change_history' in inst:
        for event in inst['change_history']:
            if 'event_date' not in event:
                # Sentinel date for events with no recorded date
                event['event_date'] = '1900-01-01'
                changes.append("added_event_date")

    return changes


def process_file(file_path):
    """Process a single YAML file, rewriting it when fixes were applied.

    Returns:
        Tuple of (change_count, change_tags). On failure, (0, ['error: ...']).
        Files whose top-level dict has 'institutions' or 'metadata' keys are
        treated as metadata structures and skipped.
    """
    # Local import: PyYAML is the script's only third-party dependency, and
    # only this function needs it; keeping it here lets the fix helpers be
    # imported/tested without PyYAML installed.
    import yaml

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            return 0, []

        all_changes = []

        # Handle different YAML structures
        if isinstance(data, list):
            for inst in data:
                all_changes.extend(fix_institution(inst))
        elif isinstance(data, dict):
            if 'institutions' in data or 'metadata' in data:
                return 0, ['skipped_metadata_structure']
            all_changes.extend(fix_institution(data))

        if all_changes:
            with open(file_path, 'w', encoding='utf-8') as f:
                yaml.dump(
                    data,
                    f,
                    default_flow_style=False,
                    allow_unicode=True,
                    sort_keys=False,
                )

        return len(all_changes), all_changes
    except Exception as e:
        # Best-effort batch tool: report the failure as a change tag so the
        # caller can print it without aborting the whole run.
        return 0, [f'error: {str(e)}']


def process_country_directory(country_dir):
    """Process all YAML files in a country directory.

    Returns:
        Tuple of (total_changes, files_modified, skipped_file_names).
    """
    yaml_files = list(country_dir.glob('*.yaml'))
    if not yaml_files:
        return 0, 0, []

    print(f"\n{'='*70}")
    print(f"📁 Processing: {country_dir.name}")
    print(f"{'='*70}")

    total_changes = 0
    files_modified = 0
    skipped_files = []

    for i, yaml_file in enumerate(yaml_files, 1):
        if yaml_file.name in SKIP_FILES:
            print(f"  ⏭️  [{i}/{len(yaml_files)}] Skipping: {yaml_file.name}")
            skipped_files.append(yaml_file.name)
            continue

        changes_count, changes = process_file(yaml_file)

        if 'skipped_metadata_structure' in changes:
            print(f"  ⏭️  [{i}/{len(yaml_files)}] Skipping metadata: {yaml_file.name}")
            skipped_files.append(yaml_file.name)
        elif any(c.startswith('error:') for c in changes):
            print(f"  ❌ [{i}/{len(yaml_files)}] Error: {yaml_file.name}")
        elif changes_count > 0:
            print(f"  ✅ [{i}/{len(yaml_files)}] Fixed {changes_count} issues: {yaml_file.name}")
            total_changes += changes_count
            files_modified += 1
        else:
            print(f"  ✓  [{i}/{len(yaml_files)}] No changes: {yaml_file.name}")

    return total_changes, files_modified, skipped_files


def main():
    """Process YAML files by country directory and print a summary."""
    repo_root = Path(__file__).parent.parent
    instances_dir = repo_root / 'data' / 'instances'

    # Get all country directories
    country_dirs = sorted(
        d for d in instances_dir.iterdir()
        if d.is_dir() and d.name not in SKIP_DIRS
    )

    print(f"🎯 Processing {len(country_dirs)} country directories")
    print(f"⏭️  Skipping directories: {', '.join(sorted(SKIP_DIRS))}")

    grand_total_changes = 0
    grand_total_files = 0
    all_skipped = []

    for country_dir in country_dirs:
        changes, files, skipped = process_country_directory(country_dir)
        grand_total_changes += changes
        grand_total_files += files
        all_skipped.extend(skipped)
        if changes > 0:
            print(f"  📊 Summary: Fixed {changes} issues in {files} files")

    print(f"\n{'='*70}")
    print(f"🎉 GRAND TOTAL")
    print(f"{'='*70}")
    print(f"  ✅ Fixed {grand_total_changes} issues across {grand_total_files} files")
    print(f"  ⏭️  Skipped {len(all_skipped)} files")
    print(f"{'='*70}")


if __name__ == "__main__":
    main()