glam/check_duplicates.py
kempersc fc405445c6 Refactor and update schema definitions
- Removed obsolete slots: `has_or_had_custodian_observation`, `provider`, and `specificity_annotation`.
- Updated `has_or_had_score` slot to use `SpecificityScore` class and modified its description and examples.
- Added new slots: `end_seconds`, `end_time`, `has_archive_path`, `has_or_had_custodian_name`, `protocol_name`, and `protocol_version`.
- Introduced a script `check_annotation_types.py` to validate the presence and structure of `custodian_types` in YAML files.
- Added a script `update_specificity.py` to automate updates related to `SpecificityAnnotation` to `SpecificityScore`.
2026-02-01 19:55:38 +01:00

67 lines
No EOL
3.3 KiB
Python

import yaml
import os
# Structural LinkML/YAML keys worth checking; every other key is ignored to
# keep the report free of noise.  A frozenset gives O(1) membership tests and
# is built once instead of on every call.
_TARGET_KEYS = frozenset({
    "related_mappings", "close_mappings", "exact_mappings", "broad_mappings",
    "narrow_mappings", "slots", "slot_usage", "attributes", "annotations",
    "description", "class_uri", "id", "name", "title", "imports", "prefixes",
    "default_prefix", "default_range", "classes", "types", "enums", "subsets",
})


def _find_duplicate_keys(lines):
    """Return ``(key, index, previous_index)`` for each repeated target key.

    ``lines`` is the list of raw text lines of one YAML file.  Indices in the
    result are 0-based positions in ``lines``.

    This is a text heuristic, not a YAML parse: a key counts as a duplicate
    when the same target key appears twice at the same indentation with no
    non-blank, non-comment line of smaller indentation in between.  Because
    the file is never parsed, key-looking text inside block scalars can be
    reported as well.
    """
    last_seen = {}  # {indent: {key: index of most recent occurrence}}
    duplicates = []
    for index, line in enumerate(lines):
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        indent = len(line) - len(line.lstrip())

        # Any content line (list items included) closes every block nested
        # deeper than itself, so forget the keys recorded at those levels.
        # Doing this eagerly replaces a rescan of the intervening lines for
        # every repeated key, keeping the whole pass linear.
        for deeper in [level for level in last_seen if level > indent]:
            del last_seen[deeper]

        # List items and lines without a colon never introduce a checked key.
        if stripped.startswith("-") or ":" not in stripped:
            continue
        key = stripped.split(":", 1)[0].strip()
        if key not in _TARGET_KEYS:
            continue

        siblings = last_seen.setdefault(indent, {})
        if key in siblings:
            duplicates.append((key, index, siblings[key]))
        siblings[key] = index
    return duplicates


def check_dir(directory):
    """Report duplicated structural keys in every ``.yaml`` file under a tree.

    Walks ``directory`` recursively and prints one ``DUPLICATE KEY`` line per
    finding, with 1-based line numbers for the repeated key and its previous
    occurrence.  Nothing is returned.

    Args:
        directory: Root directory to scan.  ``os.walk`` ignores a missing
            directory, so in that case only the header line is printed.
    """
    print(f"Checking directory: {directory}")
    for root, dirs, files in os.walk(directory):
        # Sort in place so traversal (and therefore the report) is
        # deterministic instead of following filesystem order.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith(".yaml"):
                continue
            path = os.path.join(root, file_name)
            # Decode explicitly as UTF-8 rather than the platform default;
            # "utf-8-sig" also drops a leading BOM that would otherwise hide
            # the first key of the file.
            with open(path, "r", encoding="utf-8-sig") as handle:
                lines = handle.readlines()
            for key, index, previous in _find_duplicate_keys(lines):
                print(
                    f"DUPLICATE KEY '{key}' in {path} "
                    f"at line {index + 1} (previous at {previous + 1})"
                )
# Scan the class modules first, then the slot modules, of the 20251121
# LinkML schema (paths are relative to the current working directory).
for schema_dir in (
    "schemas/20251121/linkml/modules/classes",
    "schemas/20251121/linkml/modules/slots",
):
    check_dir(schema_dir)