#!/usr/bin/env python3
"""
Fix metadata_standards field values in Norwegian institution data files.

Issues to fix:
1. Technical formats (REST API, JSON, XML) → move to integration_method
2. Software names (Primus) → move to vendor or platform_name
3. Non-standard formats → convert to canonical enum values or remove
4. Add missing integration_method fields

Schema-compliant values for MetadataStandardEnum:
- DUBLIN_CORE, MARC21, EAD, BIBFRAME, LIDO, CIDOC_CRM, SCHEMA_ORG,
  RIC_O, MODS, PREMIS, SPECTRUM, DACS
"""

import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

try:
    import yaml
except ImportError:
    # PyYAML is only needed for file I/O.  Deferring the failure lets main()
    # report a readable error and keeps the pure helpers importable.
    yaml = None

# Valid metadata standard enum values (from schemas/enums.yaml)
VALID_STANDARDS: Set[str] = {
    'DUBLIN_CORE', 'MARC21', 'EAD', 'BIBFRAME', 'LIDO', 'CIDOC_CRM',
    'SCHEMA_ORG', 'RIC_O', 'MODS', 'PREMIS', 'SPECTRUM', 'DACS'
}

# Mapping from text to canonical enum values
STANDARD_MAPPINGS: Dict[str, str] = {
    'Dublin Core': 'DUBLIN_CORE',
    'DUBLIN CORE': 'DUBLIN_CORE',
    'dublin core': 'DUBLIN_CORE',
    'DC': 'DUBLIN_CORE',
    'MARC21': 'MARC21',
    'MARC 21': 'MARC21',
    'marc21': 'MARC21',
    'EAD': 'EAD',
    'EAD (Encoded Archival Description)': 'EAD',
    'Encoded Archival Description': 'EAD',
    'BIBFRAME': 'BIBFRAME',
    'LIDO': 'LIDO',
    'CIDOC-CRM': 'CIDOC_CRM',
    'CIDOC CRM': 'CIDOC_CRM',
    'Schema.org': 'SCHEMA_ORG',
    'schema.org': 'SCHEMA_ORG',
    'RiC-O': 'RIC_O',
    'RIC-O': 'RIC_O',
    'Records in Contexts': 'RIC_O',
    'MODS': 'MODS',
    'PREMIS': 'PREMIS',
    'SPECTRUM': 'SPECTRUM',
    'DACS': 'DACS',
}

# Standards that need to be added to the schema enum
# These are valid but not yet in MetadataStandardEnum
PENDING_SCHEMA_ADDITIONS: Dict[str, str] = {
    'Europeana Data Model (EDM)': 'EDM',
    'EDM': 'EDM',
    'IIIF': 'IIIF',
    'International Image Interoperability Framework': 'IIIF',
    'Noark 5': 'NOARK5',
    'NOARK 5': 'NOARK5',
    'Noark-5': 'NOARK5',
}

# Semantic web technologies (not metadata standards per se)
# Should be noted in platform description or removed
SEMANTIC_WEB_TECH: Set[str] = {
    'RDF', 'RDFS', 'OWL', 'SKOS', 'Linked Open Data', 'LOD',
    'Linked Data', 'SPARQL', 'Triple Store'
}

# Technical formats that should go in integration_method
TECHNICAL_FORMATS: Set[str] = {
    'REST API', 'REST', 'JSON', 'XML', 'OAI-PMH', 'SPARQL', 'GraphQL',
    'SOAP', 'CSV', 'API', 'HTTP'
}

# Software/vendor names
SOFTWARE_NAMES: Set[str] = {
    'Primus', 'CollectiveAccess', 'Adlib', 'TMS', 'Axiell', 'MAIS',
    'Atlantis', 'DSpace', 'EPrints', 'Fedora', 'Islandora'
}

# Standards to investigate (not in enum but might be valid)
UNKNOWN_STANDARDS: Set[str] = set()

# Case-insensitive lookup covering both the enum values themselves and every
# known spelling.  Built once so normalize_standard() stays a dict lookup.
_CANONICAL_BY_FOLDED: Dict[str, str] = {
    **{name.casefold(): name for name in VALID_STANDARDS},
    **{text.casefold(): enum for text, enum in STANDARD_MAPPINGS.items()},
}

# Matches any technical format as a whole token, ignoring case.  The
# look-arounds stop short names from matching inside longer words
# ('REST' in 'Forest', 'API' in 'Rapid').
_TECHNICAL_FORMAT_RE = re.compile(
    r'(?<![A-Za-z0-9])(?:'
    + '|'.join(
        re.escape(fmt)
        for fmt in sorted(TECHNICAL_FORMATS, key=lambda f: (-len(f), f))
    )
    + r')(?![A-Za-z0-9])',
    re.IGNORECASE,
)


def normalize_standard(value: str) -> Optional[str]:
    """
    Normalize a metadata standard value to canonical enum format.

    Surrounding whitespace is ignored and matching is case-insensitive.
    As a side effect, a value listed in PENDING_SCHEMA_ADDITIONS is recorded
    in UNKNOWN_STANDARDS (annotated with its proposed enum name) so it shows
    up in the end-of-run summary.

    Returns:
        Canonical enum value, or None if not a valid standard
    """
    value_clean = value.strip()

    # Check if already canonical
    if value_clean in VALID_STANDARDS:
        return value_clean

    # Check mappings (exact spelling first, then ignoring case)
    if value_clean in STANDARD_MAPPINGS:
        return STANDARD_MAPPINGS[value_clean]
    folded_match = _CANONICAL_BY_FOLDED.get(value_clean.casefold())
    if folded_match is not None:
        return folded_match

    # Check pending schema additions (valid but not yet in enum)
    if value_clean in PENDING_SCHEMA_ADDITIONS:
        mapped_value = PENDING_SCHEMA_ADDITIONS[value_clean]
        UNKNOWN_STANDARDS.add(
            f"{value_clean} [→ {mapped_value}, pending schema update]"
        )
        return None  # Will need schema update

    # Semantic web technologies and anything else are not metadata standards.
    return None


def is_technical_format(value: str) -> bool:
    """Check if value is a technical format (REST, JSON, etc.).

    The comparison ignores case and only matches whole tokens, so
    'graphql endpoint' and 'JSON-LD' match while 'Forest' does not.
    """
    return _TECHNICAL_FORMAT_RE.search(value) is not None


def is_software_name(value: str) -> bool:
    """Check if value is a software/vendor name (case-sensitive substring)."""
    return any(sw in value for sw in SOFTWARE_NAMES)


def _is_semantic_web_tech(value: str) -> bool:
    """Check if value mentions a semantic web technology (case-sensitive)."""
    return any(tech in value for tech in SEMANTIC_WEB_TECH)


def _merge_into_field(platform: Dict[str, Any], key: str, additions: List[str]) -> None:
    """Append *additions* to platform[key] without introducing duplicates.

    An existing string value is kept verbatim and new entries are appended
    after it, comma-separated.  An existing list value stays a list.  The
    caller's original value object is never mutated.
    """
    existing = platform.get(key)

    if isinstance(existing, list):
        merged = list(existing)
        for item in additions:
            if item not in merged:
                merged.append(item)
        platform[key] = merged
        return

    existing_text = str(existing).strip() if existing else ''
    # Tokens already present, used only for duplicate detection.
    seen = {part.strip() for part in existing_text.split(',') if part.strip()}
    parts = [existing_text] if existing_text else []
    for item in additions:
        if item not in seen:
            seen.add(item)
            parts.append(item)
    platform[key] = ', '.join(parts)


def fix_digital_platform(platform: Dict[str, Any], changes: List[str]) -> Dict[str, Any]:
    """
    Fix metadata_standards in a digital platform.

    Each entry of ``metadata_standards`` is classified in this order:
    valid standard (kept, normalized), pending schema addition (removed),
    technical format (moved to ``integration_method``), software name
    (moved to ``vendor``), semantic web technology (removed), otherwise
    unknown (removed and recorded in UNKNOWN_STANDARDS).

    Every modification appends a line to *changes*; callers use a non-empty
    *changes* list as the signal that the data must be written back.

    Args:
        platform: Platform mapping; it is not mutated.
        changes: List that receives one human-readable line per change.

    Returns:
        Fixed platform dict, with changes logged
    """
    fixed = platform.copy()

    if 'metadata_standards' not in fixed:
        return fixed

    raw_value = fixed['metadata_standards']
    if raw_value is None:
        entries: List[Any] = []
    elif isinstance(raw_value, (list, tuple)):
        entries = list(raw_value)
    else:
        # A scalar (e.g. ``metadata_standards: Dublin Core``) is one entry;
        # iterating a string would otherwise yield single characters.
        entries = [raw_value]

    valid_standards: List[str] = []
    integration_methods: List[str] = []
    vendors: List[str] = []

    for std in entries:
        if not isinstance(std, str):
            changes.append(f" - ⚠️ Removed non-string value {std!r} from metadata_standards")
            continue

        # Check if it's a valid standard
        canonical = normalize_standard(std)
        if canonical:
            if canonical in valid_standards:
                changes.append(f" - Removed duplicate '{std}' (already listed as '{canonical}')")
            else:
                valid_standards.append(canonical)
                if canonical != std:
                    # Logged so that pure normalizations are also persisted.
                    changes.append(f" - Normalized '{std}' to '{canonical}'")
            continue

        # Valid standard that the schema enum does not contain yet;
        # normalize_standard() has already recorded it for the summary.
        pending = PENDING_SCHEMA_ADDITIONS.get(std.strip())
        if pending:
            changes.append(f" - ⚠️ Removed '{std}' ('{pending}' is pending addition to the schema enum)")
            continue

        # Check if it's a technical format
        if is_technical_format(std):
            integration_methods.append(std.strip())
            changes.append(f" - Moved '{std}' from metadata_standards to integration_method")
            continue

        # Check if it's software
        if is_software_name(std):
            vendors.append(std.strip())
            changes.append(f" - Moved '{std}' from metadata_standards to vendor")
            continue

        # Check if it's semantic web technology (not a metadata standard)
        if _is_semantic_web_tech(std):
            changes.append(f" - Removed '{std}' (semantic web technology, not a metadata standard)")
            continue

        # Unknown - log it
        UNKNOWN_STANDARDS.add(std)
        changes.append(f" - ⚠️ Unknown standard '{std}' - needs manual review")

    # Update platform with fixed values
    if valid_standards:
        fixed['metadata_standards'] = valid_standards
        if not isinstance(raw_value, list):
            changes.append(" - Converted metadata_standards to a list")
    else:
        # Remove empty metadata_standards
        del fixed['metadata_standards']
        if not entries:
            changes.append(" - Removed empty metadata_standards field")

    # Add integration_method if we found technical formats
    if integration_methods:
        _merge_into_field(fixed, 'integration_method', integration_methods)

    # Add vendor if we found software names
    if vendors:
        _merge_into_field(fixed, 'vendor', vendors)

    return fixed


def fix_heritage_custodian(custodian: Dict[str, Any]) -> Tuple[Dict[str, Any], List[str]]:
    """
    Fix metadata_standards across all digital platforms in a custodian.

    Platform entries that are not mappings are passed through untouched, and
    a ``digital_platforms`` value that is not a list is left as it is.

    Returns:
        (fixed_custodian, list_of_changes)
    """
    fixed = custodian.copy()
    changes: List[str] = []

    platforms = fixed.get('digital_platforms')
    if not platforms or not isinstance(platforms, list):
        return fixed, changes

    fixed_platforms = []
    for i, platform in enumerate(platforms):
        if not isinstance(platform, dict):
            fixed_platforms.append(platform)
            continue

        platform_changes: List[str] = []
        fixed_platforms.append(fix_digital_platform(platform, platform_changes))

        if platform_changes:
            platform_name = platform.get('platform_name', f'Platform {i+1}')
            changes.append(f"\n Platform: {platform_name}")
            changes.extend(platform_changes)

    fixed['digital_platforms'] = fixed_platforms
    return fixed, changes


def fix_yaml_file(file_path: Path, dry_run: bool = False) -> Tuple[int, List[str]]:
    """
    Fix metadata_standards in a YAML file.

    The file may hold a single institution mapping or a list of them;
    entries that are not mappings are skipped.  When not a dry run and at
    least one institution changed, the file is rewritten with ``yaml.dump``
    (comments and original formatting of the file are not preserved).

    Raises:
        RuntimeError: If PyYAML is not installed.

    Returns:
        (number_of_institutions_changed, list_of_all_changes)
    """
    if yaml is None:
        raise RuntimeError("PyYAML is required to read and write YAML files")

    # Read file
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data:
        return 0, []

    # Process each institution
    all_changes: List[str] = []
    institutions_changed = 0

    is_list = isinstance(data, list)
    institutions = data if is_list else [data]

    for i, inst in enumerate(institutions):
        if not isinstance(inst, dict):
            continue

        fixed_inst, changes = fix_heritage_custodian(inst)
        if not changes:
            continue

        institutions_changed += 1
        inst_name = inst.get('name', f'Institution {i+1}')
        all_changes.append(f"\n📍 {inst_name}:")
        all_changes.extend(changes)

        # Update in place
        if is_list:
            data[i] = fixed_inst
        else:
            data = fixed_inst

    # Write back if not dry run
    if not dry_run and institutions_changed > 0:
        # Serialize before opening the file so that a dump failure cannot
        # leave a truncated file behind.
        serialized = yaml.dump(
            data,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120
        )
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

    return institutions_changed, all_changes


def main() -> int:
    """Main entry point.

    Returns:
        0 on success; 1 if no YAML files were found, PyYAML is missing, or
        any file failed to process.
    """
    import argparse
    import traceback

    parser = argparse.ArgumentParser(
        description='Fix metadata_standards field values in Norwegian institution YAML files'
    )
    parser.add_argument(
        'files',
        nargs='+',
        type=Path,
        help='YAML files to fix (or directory containing YAML files)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be changed without modifying files'
    )

    args = parser.parse_args()

    if yaml is None:
        print("❌ PyYAML is required (pip install pyyaml)")
        return 1

    # Collect all YAML files
    collected: List[Path] = []
    for path in args.files:
        if path.is_dir():
            collected.extend(path.glob('*.yaml'))
            collected.extend(path.glob('*.yml'))
        else:
            collected.append(path)

    # Sorted for stable output; the set drops paths given more than once.
    yaml_files = sorted(set(collected))

    if not yaml_files:
        print("❌ No YAML files found")
        return 1

    print(f"{'🔍 DRY RUN - ' if args.dry_run else ''}Processing {len(yaml_files)} files...\n")

    total_institutions_changed = 0
    total_files_changed = 0
    total_files_failed = 0

    for file_path in yaml_files:
        print(f"\n{'='*80}")
        print(f"📄 {file_path.name}")
        print('='*80)

        try:
            num_changed, changes = fix_yaml_file(file_path, dry_run=args.dry_run)
        except Exception as e:
            # Top-level boundary: report the failure and keep going with the
            # remaining files; the exit status reflects it below.
            total_files_failed += 1
            print(f"❌ Error processing {file_path}: {e}")
            traceback.print_exc()
            continue

        if num_changed > 0:
            total_files_changed += 1
            total_institutions_changed += num_changed

            print(f"\n✏️ {num_changed} institution(s) modified:")
            for change in changes:
                print(change)
        else:
            print("✅ No changes needed")

    # Summary
    print(f"\n{'='*80}")
    print("📊 SUMMARY")
    print('='*80)
    print(f"Files processed: {len(yaml_files)}")
    print(f"Files changed: {total_files_changed}")
    print(f"Institutions modified: {total_institutions_changed}")
    if total_files_failed:
        print(f"Files failed: {total_files_failed}")

    if UNKNOWN_STANDARDS:
        print("\n⚠️ Unknown standards found (need manual review or schema update):")
        for std in sorted(UNKNOWN_STANDARDS):
            print(f" - {std}")

    if args.dry_run:
        print("\n🔍 This was a dry run. Run without --dry-run to apply changes.")
    elif total_files_failed:
        print(f"\n❌ {total_files_failed} file(s) could not be processed.")
    else:
        print("\n✅ Changes applied successfully!")

    return 1 if total_files_failed else 0


if __name__ == "__main__":
    sys.exit(main())