#!/usr/bin/env python3
"""
Fix alias field names in YAML instance files to use canonical schema names.

LinkML aliases don't work in data validation - they're only for documentation.
This script renames fields to use canonical names defined in the schema.

Mappings:
- description (in DigitalPlatform) → platform_description
- description (in Collection) → collection_description
- description (in Location) → location_description (remove for now)
- description (in Identifier) → identifier_description (remove for now)
- metadata_standards (in DigitalPlatform) → implemented_standards
- notes (in Provenance) → provenance_notes
- subject_areas (in Collection) → subjects
"""

import sys
from pathlib import Path

try:
    import yaml
except ImportError:
    # Defer the failure: the pure renaming helpers below do not need PyYAML,
    # and main() reports the missing dependency with a readable message.
    yaml = None


# Alias -> canonical field name, per record type.
_PLATFORM_RENAMES = {
    'description': 'platform_description',
    'metadata_standards': 'implemented_standards',
}
_COLLECTION_RENAMES = {
    'description': 'collection_description',
    'subject_areas': 'subjects',
}
_PROVENANCE_RENAMES = {
    'notes': 'provenance_notes',
}
# Fields dropped outright because the schema has no canonical slot for them yet.
_NOT_IN_SCHEMA_YET = ('description',)


def _apply_renames(record, renames, removals=()):
    """Rename and/or remove keys of ``record`` in place.

    Args:
        record: The mapping to fix. Anything that is not a ``dict`` (``None``,
            a plain string, a number, ...) is left untouched.
        renames: Mapping of alias key -> canonical key.
        removals: Keys to delete without replacement.

    Returns:
        True if at least one key was renamed or removed, False otherwise.

    A renamed key keeps the position the alias had, so files dumped with
    ``sort_keys=False`` do not get their fields reshuffled. If both the alias
    and its canonical key are present, the alias value replaces the canonical
    one.
    """
    if not isinstance(record, dict):
        return False

    present = [key for key in (*renames, *removals) if key in record]
    if not present:
        return False

    # Canonical keys that are about to be overwritten by their alias.
    superseded = {renames[key] for key in present if key in renames}

    # Rebuild the same dict object so callers' references stay valid and the
    # original key order is preserved.
    snapshot = list(record.items())
    record.clear()
    for key, value in snapshot:
        if key in removals or key in superseded:
            continue
        record[renames.get(key, key)] = value
    return True


def fix_digital_platform(platform):
    """Fix DigitalPlatform field names in place; return True if changed."""
    return _apply_renames(platform, _PLATFORM_RENAMES)


def fix_collection(collection):
    """Fix Collection field names in place; return True if changed."""
    return _apply_renames(collection, _COLLECTION_RENAMES)


def fix_location(location):
    """Drop the Location 'description' field (not in schema yet); return True if changed."""
    return _apply_renames(location, {}, _NOT_IN_SCHEMA_YET)


def fix_identifier(identifier):
    """Drop the Identifier 'description' field (not in schema yet); return True if changed."""
    return _apply_renames(identifier, {}, _NOT_IN_SCHEMA_YET)


def fix_provenance(provenance):
    """Fix Provenance field names in place; return True if changed."""
    return _apply_renames(provenance, _PROVENANCE_RENAMES)


# List-valued institution fields and the fixer applied to each of their entries.
_LIST_FIELD_FIXERS = (
    ('digital_platforms', fix_digital_platform),
    ('collections', fix_collection),
    ('locations', fix_location),
    ('identifiers', fix_identifier),
)


def fix_institution(institution):
    """Fix field names in a single institution record.

    Args:
        institution: The institution mapping. Non-dict values are ignored.

    Returns:
        True if any nested record was modified, False otherwise.

    Fields that are missing, ``None`` or not a list are skipped instead of
    raising, so one malformed field does not prevent the rest of the record
    from being fixed.
    """
    if not isinstance(institution, dict):
        return False

    changed = False
    for field, fixer in _LIST_FIELD_FIXERS:
        entries = institution.get(field)
        if not isinstance(entries, list):
            continue
        for entry in entries:
            # Call the fixer for every entry; do not short-circuit on `changed`.
            if fixer(entry):
                changed = True

    if fix_provenance(institution.get('provenance')):
        changed = True

    return changed


def fix_yaml_file(file_path):
    """Fix alias field names in a YAML file.

    Args:
        file_path: Path of the YAML file; it is rewritten in place when changed.

    Returns:
        A ``(changed, message)`` tuple. ``message`` is "Fixed",
        "No changes needed", "Empty file", or "Error: ..." when reading,
        parsing or writing failed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            return False, "Empty file"

        # Handle both a single institution and a list of institutions.
        records = data if isinstance(data, list) else [data]
        changed = False
        for record in records:
            if fix_institution(record):
                changed = True

        if not changed:
            return False, "No changes needed"

        # Serialize before opening the file for writing, so a failure while
        # dumping cannot leave a truncated file behind.
        text = yaml.dump(data, default_flow_style=False,
                         allow_unicode=True, sort_keys=False)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text)
        return True, "Fixed"

    except Exception as e:
        # Per-file boundary: report the problem and let main() continue.
        return False, f"Error: {e}"


def main():
    """Process all YAML instance files under ``data/instances/`` and print a summary."""
    if yaml is None:
        print("❌ PyYAML is required to run this script (pip install pyyaml)")
        sys.exit(1)

    # Find all YAML files in data/instances/
    base_dir = Path(__file__).parent.parent
    instances_dir = base_dir / 'data' / 'instances'

    if not instances_dir.exists():
        print(f"❌ Instances directory not found: {instances_dir}")
        sys.exit(1)

    yaml_files = sorted(
        path
        for pattern in ('*.yaml', '*.yml')
        for path in instances_dir.rglob(pattern)
    )

    if not yaml_files:
        print("⚠️ No YAML files found")
        sys.exit(0)

    print(f"🔍 Found {len(yaml_files)} YAML files\n")

    changed_count = 0
    unchanged_count = 0
    error_count = 0

    for i, yaml_file in enumerate(yaml_files, 1):
        rel_path = yaml_file.relative_to(base_dir)
        changed, msg = fix_yaml_file(yaml_file)

        if changed:
            print(f"{i:3d}. ✅ {rel_path}: {msg}")
            changed_count += 1
        elif msg.startswith("Error"):
            print(f"{i:3d}. ❌ {rel_path}: {msg}")
            error_count += 1
        else:
            unchanged_count += 1

    print(f"\n{'='*80}")
    print(f"✅ Modified: {changed_count} files")
    print(f"⏭️ Unchanged: {unchanged_count} files")
    print(f"❌ Errors: {error_count} files")
    print(f"📊 Total: {len(yaml_files)} files")


if __name__ == '__main__':
    main()