#!/usr/bin/env python3
"""
Normalize field names from aliases to canonical names for LinkML validation.

This script converts intuitive field names (aliases) used in extracted data
to the canonical field names expected by the LinkML schema.

Field Mappings:
- Collection.description → collection_description
- Collection.extent → item_count
- Collection.subject_areas → subjects
- DigitalPlatform.description → platform_description
- DigitalPlatform.metadata_standards → implemented_standards
- Provenance.notes → provenance_notes

Usage:
    python normalize_field_names.py <input.yaml> [output.yaml]
"""

import sys
from pathlib import Path
from typing import Any, Callable, Dict, List

try:
    import yaml
except ImportError:
    # PyYAML is only needed for file I/O. Deferring the failure keeps the pure
    # normalize_* helpers importable (and testable) without it;
    # normalize_yaml_file() raises a clear error when it is actually required.
    yaml = None


# Alias → canonical field name, one table per schema class.
_COLLECTION_ALIASES: Dict[str, str] = {
    'description': 'collection_description',
    'extent': 'item_count',
    'subject_areas': 'subjects',
}

_DIGITAL_PLATFORM_ALIASES: Dict[str, str] = {
    'description': 'platform_description',
    'metadata_standards': 'implemented_standards',
}

_PROVENANCE_ALIASES: Dict[str, str] = {
    'notes': 'provenance_notes',
}


def _rename_aliases(record: Dict[str, Any], aliases: Dict[str, str]) -> Dict[str, Any]:
    """Return a shallow copy of *record* with alias keys renamed to canonical.

    Key order is preserved. If the canonical key is already present in
    *record*, the alias is left untouched instead of overwriting it, so no
    value is silently lost and the schema validator can flag the conflict.
    """
    renamed: Dict[str, Any] = {}
    for key, value in record.items():
        canonical = aliases.get(key)
        if canonical is None or canonical in record:
            # Not an alias, or the canonical field was set explicitly.
            renamed[key] = value
        else:
            renamed[canonical] = value
    return renamed


def _normalize_items(items: Any, normalize: Callable[[Dict[str, Any]], Dict[str, Any]]) -> Any:
    """Apply *normalize* to every mapping in a list of nested records.

    Anything that is not a list, and any list entry that is not a mapping, is
    returned unchanged: malformed data is left for the validator to report
    rather than crashing the normalizer.
    """
    if not isinstance(items, list):
        return items
    result: List[Any] = [
        normalize(item) if isinstance(item, dict) else item
        for item in items
    ]
    return result


def normalize_collection(collection: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize Collection field names from aliases to canonical.

    Returns a new dict; *collection* itself is not modified.
    """
    return _rename_aliases(collection, _COLLECTION_ALIASES)


def normalize_digital_platform(platform: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize DigitalPlatform field names from aliases to canonical.

    Returns a new dict; *platform* itself is not modified.
    """
    return _rename_aliases(platform, _DIGITAL_PLATFORM_ALIASES)


def normalize_provenance(provenance: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize Provenance field names from aliases to canonical.

    Returns a new dict; *provenance* itself is not modified.
    """
    return _rename_aliases(provenance, _PROVENANCE_ALIASES)


def normalize_heritage_custodian(custodian: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize HeritageCustodian and nested objects.

    The 'collections' and 'digital_platforms' lists and the 'provenance'
    mapping are normalized with their class-specific alias tables; every
    other field is copied as-is. The input is not modified.

    A value that is not a mapping (for example ``None`` from an empty YAML
    document) is returned unchanged.
    """
    if not isinstance(custodian, dict):
        return custodian

    normalized = custodian.copy()

    list_fields = (
        ('collections', normalize_collection),
        ('digital_platforms', normalize_digital_platform),
    )
    for field, normalizer in list_fields:
        if field in normalized:
            normalized[field] = _normalize_items(normalized[field], normalizer)

    provenance = normalized.get('provenance')
    if isinstance(provenance, dict):
        normalized['provenance'] = normalize_provenance(provenance)

    return normalized


def normalize_yaml_file(input_path: Path, output_path: Path) -> None:
    """
    Read YAML file, normalize field names, write to output file.

    The document may be a single institution or a list of institutions.

    Args:
        input_path: Path to input YAML file with alias field names
        output_path: Path to output YAML file with canonical field names

    Raises:
        ImportError: If PyYAML is not installed.
    """
    if yaml is None:
        raise ImportError(
            "PyYAML is required to read and write YAML files (pip install pyyaml)"
        )

    # Read input YAML
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Normalize each institution
    if isinstance(data, list):
        normalized_data = [normalize_heritage_custodian(inst) for inst in data]
    else:
        normalized_data = normalize_heritage_custodian(data)

    # Write output YAML; sort_keys=False keeps the fields in their input order
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            normalized_data,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120,
        )

    print(f"✅ Normalized {input_path} → {output_path}")


def main() -> None:
    """Main entry point.

    Reads the input path (required) and output path (optional) from
    ``sys.argv`` and exits with status 1 on a usage error or missing input.
    """
    if len(sys.argv) < 2:
        print(
            "Usage: python normalize_field_names.py <input.yaml> [output.yaml]",
            file=sys.stderr,
        )
        print(
            "\nNormalizes field names from aliases to canonical schema names.",
            file=sys.stderr,
        )
        print(
            "If output path not specified, writes to <input>_normalized.yaml",
            file=sys.stderr,
        )
        sys.exit(1)

    input_path = Path(sys.argv[1])

    if len(sys.argv) >= 3:
        output_path = Path(sys.argv[2])
    else:
        output_path = input_path.parent / f"{input_path.stem}_normalized.yaml"

    if not input_path.exists():
        print(f"❌ Error: Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    normalize_yaml_file(input_path, output_path)

    print(f"\n✅ Validation-ready file created: {output_path}")
    print("\nValidate with:")
    print(f" linkml-validate -s schemas/heritage_custodian.yaml -C HeritageCustodian {output_path}")


if __name__ == "__main__":
    main()