glam/scripts/deduplicate_classes.py

183 lines
6.1 KiB
Python

#!/usr/bin/env python3
"""
Deduplicate class definitions in LinkML schema.
When a class is defined in both:
1. A "container" file (e.g., StaffRoles.yaml, FindingAidTypes.yaml)
2. An individual file (e.g., Curator.yaml, Archivist.yaml)
This script:
1. Updates imports in all files to point to the container
2. Deletes the duplicate individual files
Rule: Container files are the authoritative source.
"""
import os
import re
import yaml
from collections import defaultdict
from datetime import datetime
# Directory holding the per-class LinkML YAML modules to deduplicate.
CLASSES_DIR = 'schemas/20251121/linkml/modules/classes'
# Files that contain multiple class definitions (containers).
# Matched by substring against each filename (see is_container_file), so e.g.
# 'Types.yaml' also matches 'FindingAidTypes.yaml'.
CONTAINER_PATTERNS = [
'Types.yaml', 'Roles.yaml', 'FindingAid.yaml', 'LinkedInProfile.yaml',
'VideoAudioAnnotation.yaml', 'DataLicensePolicy.yaml', 'EncompassingBody.yaml',
'DataServiceEndpointTypes.yaml', 'OAIPMHEndpoint.yaml', 'SearchAPI.yaml',
'OAIPMHSet.yaml', 'SearchQueryParameter.yaml'
]
def is_container_file(filename):
    """Return True if *filename* matches any known container-file pattern (substring match)."""
    for pattern in CONTAINER_PATTERNS:
        if pattern in filename:
            return True
    return False
def build_class_to_container_mapping(classes_dir):
    """Build a mapping of class names to their container files.

    Scans every container YAML file in *classes_dir* and records, for each
    class declared under its ``classes:`` key, the container's base name
    (filename without the ``.yaml`` suffix).

    Args:
        classes_dir: Directory containing the per-class YAML modules.

    Returns:
        dict[str, str]: class name -> container base name.
    """
    class_to_container = {}
    for filename in sorted(os.listdir(classes_dir)):
        if not filename.endswith('.yaml'):
            continue
        if not is_container_file(filename):
            continue
        filepath = os.path.join(classes_dir, filename)
        # Loop-invariant: compute once per file, not once per class.
        container_name = filename.replace('.yaml', '')
        with open(filepath) as f:
            try:
                data = yaml.safe_load(f)
            except yaml.YAMLError as e:
                # Name the failing file (the old message printed a literal
                # "(unknown)" placeholder), and only catch YAML parse errors.
                print(f"Error parsing {filename}: {e}")
                continue
        if data and isinstance(data.get('classes'), dict):
            for class_name in data['classes']:
                class_to_container[class_name] = container_name
    return class_to_container
def find_duplicate_files(classes_dir, class_to_container):
    """Find individual class files that duplicate a container's class definition.

    An individual file ``Foo.yaml`` is a duplicate when the class ``Foo`` is
    already defined by one of the container files.

    Returns:
        list[dict]: one entry per duplicate with keys 'file', 'class', 'container'.
    """
    dupes = []
    for entry in sorted(os.listdir(classes_dir)):
        if not entry.endswith('.yaml') or is_container_file(entry):
            continue
        cls = entry.replace('.yaml', '')
        container = class_to_container.get(cls)
        if container is not None:
            dupes.append({'file': entry, 'class': cls, 'container': container})
    return dupes
def update_imports_in_file(filepath, import_mapping):
    """Rewrite import references in a YAML file per *import_mapping*.

    Each old import (as a bare, single-quoted, or double-quoted list item)
    is replaced with its new target.  Any duplicate entries that this
    creates inside the ``imports:`` section are then dropped.  The file is
    rewritten on disk only when its content actually changes.

    Returns:
        bool: True if the file was modified, False otherwise.
    """
    with open(filepath) as fh:
        text = fh.read()
    before = text

    changed = False
    for src, dst in import_mapping.items():
        # Cover the bare, single-quoted and double-quoted list-item forms.
        for quote in ('', "'", '"'):
            needle = f'- {quote}{src}{quote}\n'
            if needle in text:
                text = text.replace(needle, f'- {quote}{dst}{quote}\n')
                changed = True

    if changed:
        # Remove duplicate entries within the imports: section.
        kept = []
        seen = set()
        inside = False
        for line in text.split('\n'):
            bare = line.strip()
            if bare.startswith('imports:'):
                inside = True
                kept.append(line)
                continue
            if inside:
                if bare.startswith('- '):
                    value = bare[2:].strip().strip("'\"")
                    if value in seen:
                        continue  # skip duplicate import entry
                    seen.add(value)
                elif bare and not bare.startswith('#'):
                    # First non-list, non-comment line ends the imports section.
                    inside = False
            kept.append(line)
        text = '\n'.join(kept)

    if text != before:
        with open(filepath, 'w') as fh:
            fh.write(text)
        return True
    return False
def main():
    """Deduplicate class definitions: repoint imports to containers, then delete duplicates."""
    # Run relative to the repository root (parent of this script's directory).
    os.chdir(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    print("Building class-to-container mapping...")
    class_to_container = build_class_to_container_mapping(CLASSES_DIR)
    print(f"Found {len(class_to_container)} classes in container files")

    print("\nFinding duplicate individual files...")
    duplicates = find_duplicate_files(CLASSES_DIR, class_to_container)
    print(f"Found {len(duplicates)} duplicate files to remove")

    # Map each individual-file import to its container import.
    import_mapping = {
        f"./{dup['class']}": f"./{dup['container']}" for dup in duplicates
    }

    print("\nUpdating imports in all schema files...")
    files_updated = 0
    # Update imports in all YAML files in the schema directory.
    schema_root = 'schemas/20251121/linkml'
    # Hoisted: the original rebuilt this filename list inside the walk loop
    # for every visited file (O(files * duplicates)); build the set once.
    doomed = {dup['file'] for dup in duplicates}
    for root, _dirs, files in os.walk(schema_root):
        for filename in files:
            if not filename.endswith('.yaml'):
                continue
            # Skip files we're about to delete.
            if filename in doomed:
                continue
            filepath = os.path.join(root, filename)
            if update_imports_in_file(filepath, import_mapping):
                files_updated += 1
                print(f" Updated: {filepath}")
    print(f"\nUpdated imports in {files_updated} files")

    print("\nDeleting duplicate files...")
    deleted = 0
    for dup in duplicates:
        filepath = os.path.join(CLASSES_DIR, dup['file'])
        if os.path.exists(filepath):
            os.remove(filepath)
            deleted += 1
            print(f" Deleted: {dup['file']} (class {dup['class']} defined in {dup['container']})")
    print(f"\nDeleted {deleted} duplicate files")
    print("\nDone!")
if __name__ == '__main__':
main()