#!/usr/bin/env python3
"""
Fix validation errors in Norwegian heritage institution YAML files.

Fixes:
1. Temporal coverage format: "YYYY/YYYY" or "YYYY BCE/YYYY" → "YYYY-01-01/YYYY-12-31"
2. Remove access_rights from digital_platforms (move to collections if needed)
3. Remove description field from Location objects
4. Remove description field from Identifier objects
5. Add IDENTIFIER_SERVICE to PlatformTypeEnum (schema change needed)
6. Fix metadata_standards enum format: "Dublin Core" → "DUBLIN_CORE"

Usage:
    python3 scripts/fix_validation_errors.py <input.yaml> <output.yaml>
"""

import sys
import re
from pathlib import Path

# One side of an already-converted range. The optional leading "-" and the
# open-ended digit count accept the BCE dates this script itself emits
# (e.g. "-8000-01-01"), which keeps a second run over its own output a no-op.
_ISO_DATE_RE = re.compile(r'-?\d{4,}-\d{2}-\d{2}')

# Free-text metadata standard names and the enum spelling they are rewritten to.
_METADATA_STANDARD_ENUMS = {
    "Dublin Core": "DUBLIN_CORE",
    "MARC 21": "MARC21",
}


def _side_to_iso(side: str, month_day: str) -> str:
    """Convert one side of a coverage range to an ISO-style date.

    Args:
        side: Text such as "8000 BCE", "800 CE", "1950" or an ISO date.
        month_day: "MM-DD" suffix appended to a bare year ("01-01" for the
            start of a range, "12-31" for the end).

    Returns:
        The side unchanged (minus surrounding whitespace) when it is already
        an ISO date, otherwise the zero-padded year plus ``month_day``; BCE
        years get a leading "-".

    Raises:
        ValueError: If the year part is not an integer.
    """
    text = side.strip()
    if _ISO_DATE_RE.fullmatch(text):
        return text

    # 'BCE' must be tested first because it contains 'CE'.
    if 'BCE' in text:
        sign, digits = '-', text.replace('BCE', '')
    elif 'CE' in text:
        sign, digits = '', text.replace('CE', '')
    else:
        sign, digits = '', text
    return f"{sign}{int(digits.strip()):04d}-{month_day}"


def parse_temporal_coverage(coverage_str: str) -> str:
    """
    Convert simplified temporal coverage to ISO format.

    Values that cannot be parsed (not a string, not exactly one "/", or a
    non-numeric year) are returned unchanged after a warning on stderr, so a
    single odd record does not abort the whole run.

    Examples:
        "8000 BCE/2000" → "-8000-01-01/2000-12-31"
        "2000 BCE/1950" → "-2000-01-01/1950-12-31"
        "800/1950" → "0800-01-01/1950-12-31"
        "1850-01-01/2024-12-31" → "1850-01-01/2024-12-31" (unchanged)
        "-8000-01-01/2000-12-31" → "-8000-01-01/2000-12-31" (unchanged)
    """
    warning = f"Warning: Cannot parse temporal coverage '{coverage_str}'"

    # YAML may hand us a bare scalar such as the int 1900.
    if not isinstance(coverage_str, str):
        print(warning, file=sys.stderr)
        return coverage_str

    parts = coverage_str.split('/')
    if len(parts) != 2:
        print(warning, file=sys.stderr)
        return coverage_str

    try:
        start_iso = _side_to_iso(parts[0], '01-01')
        end_iso = _side_to_iso(parts[1], '12-31')
    except ValueError:
        print(warning, file=sys.stderr)
        return coverage_str

    return f"{start_iso}/{end_iso}"


def fix_metadata_standards_enum(standards_list: list) -> list:
    """
    Fix metadata standards enum format.

    "Dublin Core" → "DUBLIN_CORE"
    "MARC 21" → "MARC21"

    Any other entry is passed through untouched. An empty or missing list is
    returned as-is; otherwise a new list is returned and the input is not
    modified.
    """
    if not standards_list:
        return standards_list

    return [
        # Only strings are looked up so unhashable entries cannot raise.
        _METADATA_STANDARD_ENUMS.get(standard, standard)
        if isinstance(standard, str) else standard
        for standard in standards_list
    ]


def _dict_entries(institution: dict, key: str) -> list:
    """Return the mapping entries of the list stored under ``key``.

    A missing key, an empty YAML value (loaded as None), a non-list value and
    non-mapping list items all yield nothing instead of raising.
    """
    entries = institution.get(key) or []
    if not isinstance(entries, (list, tuple)):
        return []
    return [entry for entry in entries if isinstance(entry, dict)]


def _pop_field(entries: list, field: str, message_prefix: str, changes: list) -> None:
    """Remove ``field`` from every entry that has it and record each removal."""
    for entry in entries:
        if field in entry:
            removed = entry.pop(field)
            changes.append(f"{message_prefix}: {removed}")


def fix_institution(institution: dict) -> list:
    """Fix validation errors in a single institution record.

    The record is modified in place. Returns list of changes made, as
    human-readable strings in the order the fixes are applied: temporal
    coverage, access_rights, location descriptions, identifier descriptions,
    metadata standards. A record that is not a mapping yields an empty list.
    """
    changes_made = []
    if not isinstance(institution, dict):
        return changes_made

    # Fix temporal coverage in collections
    for collection in _dict_entries(institution, 'collections'):
        if 'temporal_coverage' in collection:
            old_coverage = collection['temporal_coverage']
            new_coverage = parse_temporal_coverage(old_coverage)
            if old_coverage != new_coverage:
                collection['temporal_coverage'] = new_coverage
                changes_made.append(f"Fixed temporal_coverage: {old_coverage} → {new_coverage}")

    platforms = _dict_entries(institution, 'digital_platforms')

    # Remove fields the schema does not allow on these objects
    _pop_field(platforms, 'access_rights',
               "Removed access_rights from digital_platform", changes_made)
    _pop_field(_dict_entries(institution, 'locations'), 'description',
               "Removed description from location", changes_made)
    _pop_field(_dict_entries(institution, 'identifiers'), 'description',
               "Removed description from identifier", changes_made)

    # Fix metadata_standards enum format
    for platform in platforms:
        if 'metadata_standards' in platform:
            old_standards = platform['metadata_standards']
            new_standards = fix_metadata_standards_enum(old_standards)
            if old_standards != new_standards:
                platform['metadata_standards'] = new_standards
                changes_made.append(f"Fixed metadata_standards enum: {old_standards} → {new_standards}")

    return changes_made


def main():
    """Read the input YAML, fix every institution record and write the result.

    Expects exactly two command-line arguments: the input path and the output
    path. Exits with status 1 on a usage error, a missing input file, or an
    input document that is not a list of institutions.
    """
    if len(sys.argv) != 3:
        print(__doc__)
        sys.exit(1)

    input_path = Path(sys.argv[1])
    output_path = Path(sys.argv[2])

    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    # Imported here so the pure fix-up helpers above stay importable without
    # the third-party YAML dependency installed.
    import ruamel.yaml

    # Load YAML with ruamel.yaml to preserve formatting
    yaml = ruamel.yaml.YAML()
    yaml.preserve_quotes = True
    yaml.default_flow_style = False
    yaml.width = 4096

    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.load(f)

    # An empty file loads as None and a top-level mapping would be iterated
    # by key; stop before writing a misleading output file.
    if not isinstance(data, list):
        print(f"Error: Expected a YAML list of institutions in: {input_path}", file=sys.stderr)
        sys.exit(1)

    # Process each institution
    total_changes = 0
    institutions_modified = 0

    for institution in data:
        changes = fix_institution(institution)
        if changes:
            institution_name = institution.get('name', 'Unknown')
            institutions_modified += 1
            total_changes += len(changes)
            print(f"\n✏️ {institution_name}:")
            for change in changes:
                print(f" - {change}")

    # Write output
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f)

    print(f"\n{'='*70}")
    print(f"✅ Fixed {total_changes} validation errors in {institutions_modified} institutions")
    print(f"✅ Output written to: {output_path}")
    print("\nValidate with:")
    print(f" python3 scripts/normalize_field_names.py {output_path} /tmp/test.yaml")
    print(" linkml-validate -s schemas/heritage_custodian.yaml -C HeritageCustodian /tmp/test.yaml")


if __name__ == '__main__':
    main()