#!/usr/bin/env python3
"""
Fix alias field names in YAML instance files to use canonical schema names.

LinkML aliases don't work in data validation - they're only for documentation.
This script renames fields to use canonical names defined in the schema.

Mappings:
- description (in DigitalPlatform) → platform_description
- description (in Collection) → collection_description
- description (in Location) → location_description (needs to be added to schema)
- description (in Identifier) → identifier_description (needs to be added to schema)
- metadata_standards (in DigitalPlatform) → implemented_standards
- notes (in Provenance) → provenance_notes
- subject_areas (in Collection) → subjects
"""

import sys
from pathlib import Path
from typing import Any, Dict, List

try:
    import yaml
except ImportError:
    # PyYAML is third-party. The rename helpers below are pure dict
    # transformations and stay importable without it; the file-processing
    # entry points report the missing dependency explicitly.
    yaml = None

_PYYAML_MISSING = "PyYAML is required to process YAML files (pip install pyyaml)"


def _rename_key(record: Dict[str, Any], old: str, new: str) -> None:
    """Rename key ``old`` to ``new`` in ``record`` in place.

    The renamed key keeps its original position so that files written with
    ``sort_keys=False`` do not get their fields reshuffled. If ``new`` is
    already present the record is left untouched and a warning is printed:
    overwriting the canonical value with the alias would silently lose data.
    """
    if old not in record:
        return
    if new in record:
        print(f" ⚠️ Both '{old}' and '{new}' present; left unchanged for manual review")
        return
    renamed = [(new if key == old else key, value) for key, value in record.items()]
    # Mutate in place (rather than rebinding) so callers holding a reference
    # to the same dict observe the rename.
    record.clear()
    record.update(renamed)


def _drop_description(record: Dict[str, Any], kind: str) -> None:
    """Remove ``description`` from ``record`` and report what was dropped.

    ``kind`` is the human-readable record type used in the warning message.
    """
    if 'description' in record:
        desc = record.pop('description')
        # str() guards against non-string values (e.g. an empty YAML value
        # loads as None), which cannot be sliced.
        print(f" ⚠️ Removed {kind} description (not in schema yet): {str(desc)[:50]}...")


def fix_digital_platform(platform: Dict[str, Any]) -> Dict[str, Any]:
    """Fix DigitalPlatform field names.

    Renames ``description`` → ``platform_description`` and
    ``metadata_standards`` → ``implemented_standards``. The dict is modified
    in place and also returned.
    """
    _rename_key(platform, 'description', 'platform_description')
    _rename_key(platform, 'metadata_standards', 'implemented_standards')
    return platform


def fix_collection(collection: Dict[str, Any]) -> Dict[str, Any]:
    """Fix Collection field names.

    Renames ``description`` → ``collection_description`` and
    ``subject_areas`` → ``subjects``. The dict is modified in place and also
    returned.
    """
    _rename_key(collection, 'description', 'collection_description')
    _rename_key(collection, 'subject_areas', 'subjects')
    return collection


def fix_location(location: Dict[str, Any]) -> Dict[str, Any]:
    """Fix Location field names.

    Removes ``description`` (printing a warning) because the schema has no
    canonical field for it yet. The dict is modified in place and returned.
    """
    # TODO: Add location_description to schema
    _drop_description(location, 'location')
    return location


def fix_identifier(identifier: Dict[str, Any]) -> Dict[str, Any]:
    """Fix Identifier field names.

    Removes ``description`` (printing a warning) because the schema has no
    canonical field for it yet. The dict is modified in place and returned.
    """
    # TODO: Add identifier_description to schema
    _drop_description(identifier, 'identifier')
    return identifier


def fix_provenance(provenance: Dict[str, Any]) -> Dict[str, Any]:
    """Fix Provenance field names.

    Renames ``notes`` → ``provenance_notes``. The dict is modified in place
    and also returned.
    """
    _rename_key(provenance, 'notes', 'provenance_notes')
    return provenance


# List-valued institution fields and the fixer applied to each of their entries.
_LIST_FIXERS = (
    ('digital_platforms', fix_digital_platform),
    ('collections', fix_collection),
    ('locations', fix_location),
    ('identifiers', fix_identifier),
)


def fix_institution(institution: Dict[str, Any]) -> Dict[str, Any]:
    """Fix field names in a single institution record.

    Applies the per-type fixers to the entries of ``digital_platforms``,
    ``collections``, ``locations`` and ``identifiers``, and to the
    ``provenance`` mapping. Values that do not have the expected shape (a
    non-mapping record, a field that is not a list, a list entry that is not
    a mapping) are left untouched instead of raising, so one malformed field
    does not abort processing of the whole file.

    Returns the same object that was passed in.
    """
    if not isinstance(institution, dict):
        return institution

    for field, fixer in _LIST_FIXERS:
        entries = institution.get(field)
        if isinstance(entries, list):
            institution[field] = [
                fixer(entry) if isinstance(entry, dict) else entry
                for entry in entries
            ]

    provenance = institution.get('provenance')
    if isinstance(provenance, dict):
        institution['provenance'] = fix_provenance(provenance)

    return institution


def _display_path(file_path: Path) -> Path:
    """Return ``file_path`` relative to its third ancestor, for log output.

    Falls back to the path itself when it has fewer than three ancestors.
    """
    try:
        return file_path.relative_to(file_path.parents[2])
    except IndexError:
        return file_path


def fix_yaml_file(file_path: Path) -> bool:
    """
    Fix alias field names in a YAML file.

    The file is parsed with ``yaml.safe_load``; the top-level value may be a
    single institution or a list of institutions. The file is rewritten only
    when the fixes changed its serialised form.

    Returns True if changes were made, False otherwise (including when the
    file is empty or an error occurred; errors are printed, not raised).

    Raises ImportError if PyYAML is not installed.
    """
    if yaml is None:
        raise ImportError(_PYYAML_MISSING)

    print(f"\n📄 Processing: {_display_path(file_path)}")

    dump_options = dict(default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Broad catch is deliberate: this is the per-file boundary of a batch
    # run, and one unreadable file must not stop the remaining files.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            print(" ⚠️ Empty file, skipping")
            return False

        # Snapshot before fixing: the fixers mutate ``data`` in place.
        # Unsorted dumps avoid the TypeError that key sorting raises on
        # mappings with mixed-type keys.
        original_yaml = yaml.dump(data, **dump_options)

        # Handle both single institution and list of institutions
        if isinstance(data, list):
            fixed_data = [fix_institution(inst) for inst in data]
        else:
            fixed_data = fix_institution(data)

        # Check if anything changed
        new_yaml = yaml.dump(fixed_data, **dump_options)
        if original_yaml == new_yaml:
            print(" ✅ No changes needed")
            return False

        # Write back to file. The text is fully serialised before the file
        # is opened for writing, so a serialisation error cannot truncate it.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(new_yaml)

        print(" ✅ Fixed and saved")
        return True

    except Exception as e:
        print(f" ❌ Error: {e}")
        return False


def main():
    """Process all YAML instance files.

    Looks for ``*.yaml`` and ``*.yml`` files under ``data/instances/``
    relative to the parent of this script's directory, fixes each one and
    prints a summary. Exits with status 1 if PyYAML or the instances
    directory is missing, and with status 0 if no YAML files are found.
    """
    if yaml is None:
        print(f"❌ {_PYYAML_MISSING}")
        sys.exit(1)

    # Find all YAML files in data/instances/
    base_dir = Path(__file__).parent.parent
    instances_dir = base_dir / 'data' / 'instances'

    if not instances_dir.exists():
        print(f"❌ Instances directory not found: {instances_dir}")
        sys.exit(1)

    yaml_files: List[Path] = list(instances_dir.rglob('*.yaml'))
    yaml_files.extend(instances_dir.rglob('*.yml'))

    if not yaml_files:
        print("⚠️ No YAML files found")
        sys.exit(0)

    print(f"🔍 Found {len(yaml_files)} YAML files")
    print("=" * 80)

    changed_files = []
    for yaml_file in sorted(yaml_files):
        if fix_yaml_file(yaml_file):
            changed_files.append(yaml_file)

    print("\n" + "=" * 80)
    print(f"✅ Processed {len(yaml_files)} files")
    print(f"📝 Modified {len(changed_files)} files")

    if changed_files:
        print("\nChanged files:")
        for f in changed_files:
            print(f" - {f.relative_to(base_dir)}")

        print("\n⚠️ NOTE: Location and Identifier descriptions were removed.")
        print(" These fields need to be added to the schema before they can be used.")


if __name__ == '__main__':
    main()