#!/usr/bin/env python3
"""
Deduplicate class definitions in LinkML schema.

When a class is defined in both:
1. A "container" file (e.g., StaffRoles.yaml, FindingAidTypes.yaml)
2. An individual file (e.g., Curator.yaml, Archivist.yaml)

This script:
1. Updates imports in all files to point to the container
2. Deletes the duplicate individual files

Rule: Container files are the authoritative source.
"""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
from collections import defaultdict
|
|
from datetime import datetime
|
|
|
|
CLASSES_DIR = 'schemas/20251121/linkml/modules/classes'

# Files that contain multiple class definitions (containers)
CONTAINER_PATTERNS = [
    'Types.yaml', 'Roles.yaml', 'FindingAid.yaml', 'LinkedInProfile.yaml',
    'VideoAudioAnnotation.yaml', 'DataLicensePolicy.yaml', 'EncompassingBody.yaml',
    'DataServiceEndpointTypes.yaml', 'OAIPMHEndpoint.yaml', 'SearchAPI.yaml',
    'OAIPMHSet.yaml', 'SearchQueryParameter.yaml'
]


def is_container_file(filename):
    """Return True if *filename* matches a known container pattern.

    Matching is a plain substring test, so e.g. 'FindingAidTypes.yaml'
    matches the 'Types.yaml' pattern.
    """
    for pattern in CONTAINER_PATTERNS:
        if pattern in filename:
            return True
    return False
|
|
|
|
def build_class_to_container_mapping(classes_dir):
    """Build a mapping of class names to their container module names.

    Scans every container YAML file in *classes_dir* (as identified by
    is_container_file) and records, for each class declared under its
    top-level ``classes:`` key, the container file's base name (without
    the '.yaml' suffix). Files are scanned in sorted order, so on a
    duplicate class name the later container wins.

    Args:
        classes_dir: Directory containing the per-class YAML modules.

    Returns:
        dict mapping class name -> container module name (no extension).
    """
    class_to_container = {}

    for filename in sorted(os.listdir(classes_dir)):
        if not filename.endswith('.yaml'):
            continue
        if not is_container_file(filename):
            continue

        filepath = os.path.join(classes_dir, filename)
        # Hoisted: the container name depends only on the file, not on
        # which class within it we are recording.
        container_name = filename[:-len('.yaml')]

        try:
            with open(filepath) as f:
                data = yaml.safe_load(f)
        except Exception as e:
            # Bug fix: the original printed the literal text "(unknown)"
            # instead of naming the file that failed to parse.
            print(f"Error parsing {filename}: {e}")
            continue

        if data and isinstance(data.get('classes'), dict):
            for class_name in data['classes']:
                class_to_container[class_name] = container_name

    return class_to_container
|
|
|
|
def find_duplicate_files(classes_dir, class_to_container):
    """Find individual files that duplicate container class definitions.

    An individual (non-container) file ``Foo.yaml`` is a duplicate when
    the class ``Foo`` is already defined by some container file.

    Returns:
        list of dicts with keys 'file', 'class', and 'container'.
    """
    duplicates = []

    for filename in sorted(os.listdir(classes_dir)):
        # Only individual .yaml files are candidates for removal.
        if not filename.endswith('.yaml') or is_container_file(filename):
            continue

        class_name = filename.replace('.yaml', '')
        container = class_to_container.get(class_name)
        if container is not None:
            duplicates.append({
                'file': filename,
                'class': class_name,
                'container': container,
            })

    return duplicates
|
|
|
|
def _dedupe_imports(text):
    """Drop repeated entries inside the top-level ``imports:`` block.

    Keeps the first occurrence of each import value (quotes stripped for
    comparison); all other lines pass through untouched. The imports
    block ends at the first non-comment, non-entry, non-blank line.
    """
    kept = []
    seen = set()
    inside = False

    for line in text.split('\n'):
        stripped = line.strip()
        if stripped.startswith('imports:'):
            inside = True
            kept.append(line)
            continue

        if inside:
            if stripped.startswith('- '):
                value = stripped[2:].strip().strip("'\"")
                if value in seen:
                    continue  # skip duplicate entry
                seen.add(value)
            elif stripped and not stripped.startswith('#'):
                inside = False

        kept.append(line)

    return '\n'.join(kept)


def update_imports_in_file(filepath, import_mapping):
    """Rewrite imports in a YAML file according to *import_mapping*.

    Each old import target is replaced by its new one, handling bare,
    single-quoted, and double-quoted list entries. Any duplicate entries
    this introduces in the imports block are then removed. The file is
    rewritten only when its content actually changed.

    Returns:
        True if the file was modified on disk, False otherwise.
    """
    with open(filepath) as fh:
        text = fh.read()
    before = text

    changed = False
    for src, dst in import_mapping.items():
        # The same import may appear bare, single- or double-quoted.
        for quote in ('', "'", '"'):
            needle = f'- {quote}{src}{quote}\n'
            if needle in text:
                text = text.replace(needle, f'- {quote}{dst}{quote}\n')
                changed = True

    if changed:
        text = _dedupe_imports(text)

    if text == before:
        return False
    with open(filepath, 'w') as fh:
        fh.write(text)
    return True
|
|
|
|
def main():
    """Deduplicate class definitions: repoint imports, delete duplicates."""
    # Run relative to the repository root (the parent of this script's dir).
    os.chdir(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    print("Building class-to-container mapping...")
    class_to_container = build_class_to_container_mapping(CLASSES_DIR)
    print(f"Found {len(class_to_container)} classes in container files")

    print("\nFinding duplicate individual files...")
    duplicates = find_duplicate_files(CLASSES_DIR, class_to_container)
    print(f"Found {len(duplicates)} duplicate files to remove")

    # Map each duplicate's relative import to its container's import.
    import_mapping = {}
    for dup in duplicates:
        import_mapping[f"./{dup['class']}"] = f"./{dup['container']}"

    print("\nUpdating imports in all schema files...")
    files_updated = 0

    # Perf fix: build the skip-set once, instead of rebuilding the list of
    # doomed filenames on every file visited by os.walk (was O(files x dups)).
    doomed = {d['file'] for d in duplicates}

    # Update imports in all YAML files in the schema directory
    schema_root = 'schemas/20251121/linkml'
    for root, dirs, files in os.walk(schema_root):
        for filename in files:
            if not filename.endswith('.yaml'):
                continue
            # Skip files we're about to delete
            if filename in doomed:
                continue
            filepath = os.path.join(root, filename)
            if update_imports_in_file(filepath, import_mapping):
                files_updated += 1
                print(f"  Updated: {filepath}")

    print(f"\nUpdated imports in {files_updated} files")

    print("\nDeleting duplicate files...")
    deleted = 0
    for dup in duplicates:
        filepath = os.path.join(CLASSES_DIR, dup['file'])
        if os.path.exists(filepath):
            os.remove(filepath)
            deleted += 1
            print(f"  Deleted: {dup['file']} (class {dup['class']} defined in {dup['container']})")

    print(f"\nDeleted {deleted} duplicate files")
    print("\nDone!")


if __name__ == '__main__':
    main()
|