#!/usr/bin/env python3
"""Deduplicate class definitions in LinkML schema.

When a class is defined in both:
1. A "container" file (e.g., StaffRoles.yaml, FindingAidTypes.yaml)
2. An individual file (e.g., Curator.yaml, Archivist.yaml)

This script:
1. Updates imports in all files to point to the container
2. Deletes the duplicate individual files

Rule: Container files are the authoritative source.
"""

import os
import re
import yaml
from collections import defaultdict
from datetime import datetime

CLASSES_DIR = 'schemas/20251121/linkml/modules/classes'

# Files that contain multiple class definitions (containers)
CONTAINER_PATTERNS = [
    'Types.yaml', 'Roles.yaml', 'FindingAid.yaml', 'LinkedInProfile.yaml',
    'VideoAudioAnnotation.yaml', 'DataLicensePolicy.yaml',
    'EncompassingBody.yaml', 'DataServiceEndpointTypes.yaml',
    'OAIPMHEndpoint.yaml', 'SearchAPI.yaml', 'OAIPMHSet.yaml',
    'SearchQueryParameter.yaml',
]

YAML_SUFFIX = '.yaml'


def _strip_yaml_suffix(filename):
    """Return *filename* without its trailing '.yaml' extension.

    Uses a suffix-safe slice instead of ``str.replace`` so a name that
    happens to contain '.yaml' in the middle is not mangled.
    """
    if filename.endswith(YAML_SUFFIX):
        return filename[:-len(YAML_SUFFIX)]
    return filename


def is_container_file(filename):
    """Return True if *filename* matches any known container pattern."""
    return any(p in filename for p in CONTAINER_PATTERNS)


def build_class_to_container_mapping(classes_dir):
    """Build a mapping of class names to their container files.

    Scans every container YAML file in *classes_dir* and records, for each
    class it declares, the container's name (filename without '.yaml').
    Files that fail to parse are reported and skipped.
    """
    class_to_container = {}
    for filename in sorted(os.listdir(classes_dir)):
        if not filename.endswith(YAML_SUFFIX) or not is_container_file(filename):
            continue
        filepath = os.path.join(classes_dir, filename)
        with open(filepath) as f:
            try:
                data = yaml.safe_load(f)
            except yaml.YAMLError as e:
                # BUG FIX: the original message printed the literal
                # "(unknown)" instead of the file that failed to parse.
                print(f"Error parsing {filepath}: {e}")
                continue
        if data and isinstance(data.get('classes'), dict):
            # Hoisted out of the per-class loop: the container name is
            # loop-invariant.
            container_name = _strip_yaml_suffix(filename)
            for class_name in data['classes']:
                class_to_container[class_name] = container_name
    return class_to_container


def find_duplicate_files(classes_dir, class_to_container):
    """Find individual files that duplicate container class definitions.

    Returns a list of dicts with keys 'file' (the duplicate's filename),
    'class' (the class name), and 'container' (the authoritative container).
    """
    duplicates = []
    for filename in sorted(os.listdir(classes_dir)):
        if not filename.endswith(YAML_SUFFIX) or is_container_file(filename):
            continue
        class_name = _strip_yaml_suffix(filename)
        if class_name in class_to_container:
            duplicates.append({
                'file': filename,
                'class': class_name,
                'container': class_to_container[class_name],
            })
    return duplicates


def _dedupe_imports_block(content):
    """Drop repeated entries inside the top-level ``imports:`` list.

    Redirecting several old imports to one container can leave the same
    import listed twice; keep only the first occurrence of each value.
    """
    seen_imports = set()
    out_lines = []
    in_imports = False
    for line in content.split('\n'):
        stripped = line.strip()
        if stripped.startswith('imports:'):
            in_imports = True
            out_lines.append(line)
            continue
        if in_imports:
            if stripped.startswith('- '):
                import_val = stripped[2:].strip().strip("'\"")
                if import_val in seen_imports:
                    continue  # Skip duplicate
                seen_imports.add(import_val)
            elif stripped and not stripped.startswith('#'):
                # First non-list, non-comment line ends the imports block.
                in_imports = False
        out_lines.append(line)
    return '\n'.join(out_lines)


def update_imports_in_file(filepath, import_mapping):
    """Update imports in a YAML file.

    Replaces each old import in *import_mapping* with its new value,
    handling plain, single-quoted, and double-quoted list items, then
    removes any duplicate entries the replacement introduced.

    Returns True if the file was modified on disk.
    """
    with open(filepath) as f:
        content = f.read()
    original = content

    updated = False
    for old_import, new_import in import_mapping.items():
        # Match the three quoting styles a YAML list item can use.
        for quote in ('', "'", '"'):
            old_pattern = f'- {quote}{old_import}{quote}\n'
            new_pattern = f'- {quote}{new_import}{quote}\n'
            if old_pattern in content:
                content = content.replace(old_pattern, new_pattern)
                updated = True

    if updated:
        content = _dedupe_imports_block(content)

    if content != original:
        with open(filepath, 'w') as f:
            f.write(content)
        return True
    return False


def main():
    # Run from the repository root (parent of this script's directory).
    os.chdir(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    print("Building class-to-container mapping...")
    class_to_container = build_class_to_container_mapping(CLASSES_DIR)
    print(f"Found {len(class_to_container)} classes in container files")

    print("\nFinding duplicate individual files...")
    duplicates = find_duplicate_files(CLASSES_DIR, class_to_container)
    print(f"Found {len(duplicates)} duplicate files to remove")

    # Map each duplicate's relative import to its container's import.
    import_mapping = {
        f"./{dup['class']}": f"./{dup['container']}" for dup in duplicates
    }

    print("\nUpdating imports in all schema files...")
    files_updated = 0
    schema_root = 'schemas/20251121/linkml'
    # PERF FIX: build the skip set once; the original rebuilt a list of
    # duplicate filenames for every file visited by os.walk.
    files_to_delete = {dup['file'] for dup in duplicates}
    for root, _dirs, files in os.walk(schema_root):
        for filename in files:
            if not filename.endswith(YAML_SUFFIX):
                continue
            if filename in files_to_delete:
                continue  # About to be deleted; no point rewriting it
            filepath = os.path.join(root, filename)
            if update_imports_in_file(filepath, import_mapping):
                files_updated += 1
                print(f" Updated: {filepath}")
    print(f"\nUpdated imports in {files_updated} files")

    print("\nDeleting duplicate files...")
    deleted = 0
    for dup in duplicates:
        filepath = os.path.join(CLASSES_DIR, dup['file'])
        if os.path.exists(filepath):
            os.remove(filepath)
            deleted += 1
            print(f" Deleted: {dup['file']} (class {dup['class']} defined in {dup['container']})")
    print(f"\nDeleted {deleted} duplicate files")
    print("\nDone!")


if __name__ == '__main__':
    main()