#!/usr/bin/env python3
"""Fix remaining validation errors in YAML files.

Common error patterns:
1. Invalid metadata standard enum values (Dublin Core -> DUBLIN_CORE,
   DSpace, OAI-PMH, Z39.50, UNIMARC, ISAD(G))
2. 'notes' field in provenance (should be in HeritageCustodian.description)
3. 'description' field in collections/digital_platforms (not allowed)
4. 'enrichment_history' at root level (should be in provenance)
5. Temporal coverage with "present" or empty strings
6. Missing required fields in enrichment_history
"""

import re
from datetime import datetime
from pathlib import Path

import yaml

# Maps known-invalid metadata-standard names to valid enum values.
# A value of None means the entry is not a metadata standard at all
# (it is a platform or protocol) and must be removed, not remapped.
STANDARDS_MAPPING = {
    'Dublin Core': 'DUBLIN_CORE',
    'dublin core': 'DUBLIN_CORE',
    'DUBLIN-CORE': 'DUBLIN_CORE',
    'DSpace': None,    # DSpace is a platform, not a metadata standard - remove
    'OAI-PMH': None,   # OAI-PMH is a protocol, not a metadata standard - remove
    'Z39.50': None,    # Z39.50 is a protocol, not a metadata standard - remove
    'UNIMARC': 'MARC21',  # Map UNIMARC to MARC21 (closest equivalent)
    'ISAD(G)': 'EAD',     # ISAD(G) archival standard maps to EAD
}


def fix_metadata_standards(standards_list):
    """Map or remove invalid metadata standards.

    Args:
        standards_list: list of standard names (or a falsy value).

    Returns:
        A deduplicated list of valid standard names, ``None`` when every
        entry was removed, or ``standards_list`` unchanged when falsy.
    """
    if not standards_list:
        return standards_list
    fixed = []
    for std in standards_list:
        if std in STANDARDS_MAPPING:
            mapped = STANDARDS_MAPPING[std]
            if mapped:  # Only add if there's a valid mapping
                fixed.append(mapped)
        else:
            fixed.append(std)  # Keep valid standards as-is
    # dict.fromkeys dedupes while preserving input order; list(set(...))
    # would reorder nondeterministically and cause spurious YAML diffs.
    return list(dict.fromkeys(fixed)) if fixed else None


def fix_temporal_coverage(coverage_str):
    """Fix temporal coverage like '1983-01-01/present' or empty strings.

    Returns ``None`` for empty/blank coverage (caller should drop the
    field), otherwise the string with a trailing '/present' replaced by
    the end of the current year.
    """
    if not coverage_str or coverage_str.strip() == '':
        return None  # Remove empty temporal coverage
    # Replace "present" with current year.
    # NOTE(review): only the '/present' form is rewritten; a bare
    # "present" or "1983-present" passes through unchanged - confirm
    # those forms do not occur in the data.
    if 'present' in coverage_str.lower():
        current_year = datetime.now().year
        coverage_str = re.sub(
            r'/present', f'/{current_year}-12-31', coverage_str,
            flags=re.IGNORECASE
        )
    return coverage_str


def fix_enrichment_history(entry):
    """Add required fields to an enrichment_history entry, in place.

    Defaults: enrichment_type='MANUAL', verified=False.  Also collapses
    a list-valued enrichment_source to its first element (the schema
    expects a string).  Returns the (mutated) entry.
    """
    if 'enrichment_type' not in entry:
        entry['enrichment_type'] = 'MANUAL'  # Default to manual enrichment
    if 'verified' not in entry:
        entry['verified'] = False  # Default to unverified
    # Fix enrichment_source if it's a list (should be a string)
    if 'enrichment_source' in entry and isinstance(entry['enrichment_source'], list):
        entry['enrichment_source'] = (
            entry['enrichment_source'][0] if entry['enrichment_source'] else None
        )
    return entry


def fix_institution(inst):
    """Fix a single institution record in place.

    Returns a list of human-readable change descriptions (empty when
    nothing was modified).
    """
    changes = []

    # Move 'notes' from provenance to description
    if 'provenance' in inst and 'notes' in inst['provenance']:
        notes = inst['provenance'].pop('notes')
        if 'description' not in inst or not inst['description']:
            inst['description'] = notes
        else:
            inst['description'] += f"\n\n{notes}"
        changes.append("Moved provenance.notes to description")

    # Move enrichment_history from root to provenance
    if 'enrichment_history' in inst:
        if 'provenance' not in inst:
            inst['provenance'] = {}
        inst['provenance']['enrichment_history'] = inst.pop('enrichment_history')
        changes.append("Moved enrichment_history to provenance")

    # Fix enrichment_history entries
    if 'provenance' in inst and 'enrichment_history' in inst['provenance']:
        for entry in inst['provenance']['enrichment_history']:
            fix_enrichment_history(entry)
        changes.append("Fixed enrichment_history entries")

    # Fix digital platforms
    if 'digital_platforms' in inst:
        for platform in inst['digital_platforms']:
            # Remove 'description' field (not allowed)
            if 'description' in platform:
                platform.pop('description')
                changes.append("Removed description from digital_platform")
            # Remove 'notes' field (not allowed)
            if 'notes' in platform:
                platform.pop('notes')
                changes.append("Removed notes from digital_platform")
            # Fix metadata standards
            if 'implemented_standards' in platform:
                fixed_standards = fix_metadata_standards(platform['implemented_standards'])
                if fixed_standards != platform['implemented_standards']:
                    if fixed_standards is None:
                        # Writing None back would leave
                        # 'implemented_standards: null' in the YAML, which
                        # fails validation just like the original value -
                        # drop the key entirely instead.
                        platform.pop('implemented_standards')
                    else:
                        platform['implemented_standards'] = fixed_standards
                    changes.append("Fixed implemented_standards")

    # Fix collections
    if 'collections' in inst:
        for collection in inst['collections']:
            # Remove 'description' field (not allowed)
            if 'description' in collection:
                collection.pop('description')
                changes.append("Removed description from collection")
            # Fix temporal coverage
            if 'temporal_coverage' in collection:
                fixed = fix_temporal_coverage(collection['temporal_coverage'])
                if fixed != collection['temporal_coverage']:
                    if fixed is None:
                        # Drop empty coverage rather than writing
                        # 'temporal_coverage: null'.
                        collection.pop('temporal_coverage')
                    else:
                        collection['temporal_coverage'] = fixed
                    changes.append("Fixed temporal_coverage")

    # Fix change_history
    if 'change_history' in inst:
        for event in inst['change_history']:
            # Add required event_date if missing
            if 'event_date' not in event:
                # Try to infer from event_description or use a placeholder
                event['event_date'] = '1900-01-01'  # Placeholder
                changes.append("Added missing event_date")

    return changes


def process_file(file_path):
    """Process a single YAML file; returns the number of fixes applied."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            return 0

        total_changes = 0

        # Handle different YAML structures
        if isinstance(data, list):
            # List of institutions
            for inst in data:
                changes = fix_institution(inst)
                total_changes += len(changes)
        elif isinstance(data, dict):
            if 'institutions' in data:
                # Wrapped structure with metadata
                print(f"⚠️  Skipping {file_path.name} - wrapped structure not a valid HeritageCustodian")
                return 0
            else:
                # Single institution
                changes = fix_institution(data)
                total_changes += len(changes)

        if total_changes > 0:
            # Write back
            with open(file_path, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False,
                          allow_unicode=True, sort_keys=False)
            print(f"✅ Fixed {total_changes} issues in {file_path.relative_to(file_path.parent.parent.parent)}")

        return total_changes

    except Exception as e:
        # Best-effort batch tool: report and continue with the next file.
        print(f"❌ Error processing {file_path.name}: {e}")
        return 0


def main():
    """Process all YAML files in data/instances."""
    repo_root = Path(__file__).parent.parent
    instances_dir = repo_root / 'data' / 'instances'

    # Skip metadata files
    skip_files = {
        'DATASET_STATISTICS.yaml',
        'ENRICHMENT_CANDIDATES.yaml',
        'tunisian_institutions_enhanced.yaml',
        'tunisian_institutions_enhanced.backup.yaml',
    }

    total_fixes = 0
    fixed_count = 0

    # Process all YAML files recursively
    for yaml_file in instances_dir.rglob('*.yaml'):
        if yaml_file.name in skip_files:
            print(f"⏭️  Skipping metadata file: {yaml_file.name}")
            continue
        changes = process_file(yaml_file)
        if changes > 0:
            total_fixes += changes
            fixed_count += 1

    print(f"\n{'='*70}")
    print(f"✅ Fixed {total_fixes} issues across {fixed_count} files")
    print(f"{'='*70}")


if __name__ == "__main__":
    main()