glam/remove_duplicates.py
kempersc fc405445c6 Refactor and update schema definitions
- Removed obsolete slots: `has_or_had_custodian_observation`, `provider`, and `specificity_annotation`.
- Updated `has_or_had_score` slot to use `SpecificityScore` class and modified its description and examples.
- Added new slots: `end_seconds`, `end_time`, `has_archive_path`, `has_or_had_custodian_name`, `protocol_name`, and `protocol_version`.
- Introduced a script `check_annotation_types.py` to validate the presence and structure of `custodian_types` in YAML files.
- Added a script `update_specificity.py` to automate the migration from `SpecificityAnnotation` to `SpecificityScore`.
2026-02-01 19:55:38 +01:00

78 lines
2.5 KiB
Python

import os
def remove_duplicates_in_file(path):
    """Remove repeated mapping keys (and their children) from a YAML file in place.

    The file is scanned line by line, without a YAML parser. Within one
    mapping (identified by indentation), the first occurrence of each of
    ``broad_mappings``, ``close_mappings``, ``related_mappings`` and
    ``exact_mappings`` is kept; every later occurrence at the same
    indentation is dropped together with the lines that belong to it.
    A message is printed for each removed key.

    The file is read and written as UTF-8 and is only rewritten when at
    least one duplicate was actually removed.

    Args:
        path: Path of the YAML file to clean up.
    """
    mapping_keys = frozenset(
        ("broad_mappings", "close_mappings", "related_mappings", "exact_mappings")
    )

    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    kept_lines = []
    # Maps an indentation width to the tracked keys already seen in the
    # mapping currently open at that width.
    seen_at_indent = {}
    # Indentation of the duplicate key whose children are being dropped,
    # or None when no block is being skipped.
    skip_indent = None
    removed = 0

    for lineno, line in enumerate(lines, start=1):
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no structure; always keep them.
            kept_lines.append(line)
            continue

        indent = len(line) - len(line.lstrip())
        is_comment = stripped.startswith("#")
        # "- " (or a bare "-") is a sequence entry; "---" is not.
        is_list_item = stripped == "-" or stripped.startswith("- ")

        if skip_indent is not None:
            # Children are either indented deeper than the removed key or,
            # for a list written without extra indentation, are list
            # items at the very same indentation.
            if indent > skip_indent or (indent == skip_indent and is_list_item):
                continue
            # A comment has no structural meaning, so it must not end the
            # skipped block; any other line at this level does.
            if not is_comment:
                skip_indent = None

        if is_comment:
            # Keep comments, but do not let them disturb indentation tracking.
            kept_lines.append(line)
            continue

        # Leaving deeper mappings: forget the keys recorded for them so a
        # sibling mapping may legitimately reuse the same key names.
        for level in [lvl for lvl in seen_at_indent if lvl > indent]:
            del seen_at_indent[level]
        seen_here = seen_at_indent.setdefault(indent, set())

        if ":" in stripped and not stripped.startswith("-"):
            key = stripped.partition(":")[0].strip()
            if key in mapping_keys:
                if key in seen_here:
                    print(f"Removing duplicate key '{key}' in {path} at line {lineno}")
                    skip_indent = indent
                    removed += 1
                    continue
                seen_here.add(key)

        kept_lines.append(line)

    # Avoid touching files (and their modification times) that had no duplicates.
    if removed:
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(kept_lines)
def process_directory(directory):
    """Run ``remove_duplicates_in_file`` on every ``.yaml`` file under *directory*.

    The tree is traversed recursively with ``os.walk``; files whose name
    does not end in ``.yaml`` are ignored.
    """
    for folder, _subdirs, filenames in os.walk(directory):
        yaml_paths = [
            os.path.join(folder, name)
            for name in filenames
            if name.endswith(".yaml")
        ]
        for yaml_path in yaml_paths:
            remove_duplicates_in_file(yaml_path)
if __name__ == "__main__":
    # Only rewrite the schema files when executed as a script, so that
    # importing this module has no side effects on the working tree.
    process_directory("schemas/20251121/linkml/modules/classes")