- Added `fix_dual_class_link.py` to remove dual class link references from specified YAML files.
- Created `fix_specific_ghosts.py` to apply specific replacements in YAML files based on defined mappings.
- Introduced `migrate_staff_count.py` to migrate staff count references to a new structure in specified YAML files.
- Developed `migrate_type_slots.py` to replace type-related slots with new identifiers across YAML files.
- Implemented `scan_ghost_references.py` to identify and report ghost references to archived slots and classes in YAML files.
- Added `verify_ontology_terms.py` to verify the presence of ontology terms in specified ontology files against schema definitions.
121 lines
4.8 KiB
Python
121 lines
4.8 KiB
Python
import os
|
|
import re
|
|
|
|
# Locations of the schema modules to scan, plus the text files that list the
# archived slot / class filenames (relative paths, so they are resolved
# against the current working directory when opened).
SCHEMA_ROOT = "/Users/kempersc/apps/glam/schemas/20251121/linkml/modules"
CLASSES_DIR = os.path.join(SCHEMA_ROOT, "classes")
SLOTS_DIR = os.path.join(SCHEMA_ROOT, "slots")

ARCHIVED_SLOTS_FILE = "archived_slots.txt"
ARCHIVED_CLASSES_FILE = "archived_classes.txt"
|
|
|
|
def parse_archived_name(filename):
    """Derive the entity name from an archived file's name.

    A name that begins with ``<entity>_archived_<digits>.yaml`` yields
    ``<entity>`` (the capture is greedy, so an entity that itself contains
    ``_archived_<digits>`` keeps everything up to the last such suffix).
    Otherwise a trailing ``.yaml`` is removed; any other name is returned
    unchanged.
    """
    dated = re.match(r"(.+)_archived_\d+\.yaml", filename)
    if dated is not None:
        return dated.group(1)

    # No dated suffix: fall back to dropping a plain ".yaml" extension.
    stem, extension = filename[:-5], filename[-5:]
    return stem if extension == ".yaml" else filename
|
|
|
|
def load_archived_entities(filepath):
    """Read a listing of archived filenames and return the entity names.

    Each non-blank line of ``filepath`` is stripped and converted with
    ``parse_archived_name``; the distinct results are returned as a set.
    If the file does not exist a warning is printed and an empty set is
    returned.
    """
    try:
        with open(filepath, 'r') as listing:
            stripped_lines = (raw.strip() for raw in listing)
            return {
                parse_archived_name(entry)
                for entry in stripped_lines
                if entry
            }
    except FileNotFoundError:
        print(f"Warning: {filepath} not found.")
        return set()
|
|
|
|
def _compile_reference_patterns(names, prefix_regex):
    """Build ``(name, pattern)`` pairs matching ``<prefix><name>`` as a whole name.

    The negative lookahead rejects hits where the name is only the beginning
    of a longer identifier (e.g. ``has_type`` inside ``has_type_label``).
    Names are sorted so the report order is deterministic, and empty names
    are dropped because they would match every line containing the prefix.
    """
    return [
        (name, re.compile(rf"(?:{prefix_regex}){re.escape(name)}(?![\w-])"))
        for name in sorted(names)
        if name
    ]


def _line_issues(stripped, slot_patterns, class_patterns):
    """Return the issue descriptions (without location) for one stripped line."""
    found = []

    for slot, import_pattern in slot_patterns:
        # Cheap substring pre-filter: every check below needs the name present.
        if slot not in stripped:
            continue
        # Import such as "- ../slots/<slot>" or "- ./<slot>".
        if import_pattern.search(stripped):
            found.append(f"Import of archived slot '{slot}'")
        # List item in a ``slots:`` list (simple heuristic).
        if stripped == f"- {slot}":
            found.append(f"Usage of archived slot '{slot}'")
        # Mapping key, e.g. an entry under ``slot_usage:``.
        if stripped.startswith(f"{slot}:"):
            found.append(f"Slot usage key for archived slot '{slot}'")

    for cls, reference_pattern in class_patterns:
        # Path-style import ("/<Class>") or a ``range: <Class>`` reference.
        if cls in stripped and reference_pattern.search(stripped):
            found.append(f"Reference to archived class '{cls}'")

    return found


def scan_files(directory, archived_slots, archived_classes):
    """Scan the ``.yaml`` files under ``directory`` for ghost references.

    Args:
        directory: Root directory to walk recursively.
        archived_slots: Iterable of archived slot names.
        archived_classes: Iterable of archived class names.

    Returns:
        A list of strings of the form ``"<path>:<line>: <description>"``,
        one per suspicious line/name combination.

    Notes:
        * Sub-directories whose name contains ``"archive"`` are not entered.
          The check applies only below ``directory``, so a scan root whose
          own path happens to contain "archive" is still scanned.
        * Names are matched as whole identifiers: an archived ``has_type``
          does not flag ``../slots/has_type_label``, and an archived class
          ``Old`` does not flag ``range: OldThing``.
        * Files that cannot be read or decoded as UTF-8 are reported on
          stdout and skipped.
    """
    print(f"Scanning {directory}...")

    # Compile once per scan instead of rebuilding the needles for every line.
    slot_patterns = _compile_reference_patterns(
        archived_slots, r"\.\./slots/|\./"
    )
    class_patterns = _compile_reference_patterns(
        archived_classes, r"/|range: "
    )

    issues = []

    for root, dirs, files in os.walk(directory):
        # Prune archive directories in place so os.walk never descends into
        # them; sorting keeps the traversal (and the report) deterministic.
        dirs[:] = sorted(d for d in dirs if "archive" not in d)

        for file in sorted(files):
            if not file.endswith(".yaml"):
                continue

            filepath = os.path.join(root, file)
            try:
                with open(filepath, 'r', encoding="utf-8") as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error reading {filepath}: {e}")
                continue

            # Split on "\n" only so reported line numbers match the file;
            # strip() below also removes a trailing "\r".
            for lineno, line in enumerate(content.split('\n'), start=1):
                stripped = line.strip()
                if not stripped:
                    continue
                for description in _line_issues(
                    stripped, slot_patterns, class_patterns
                ):
                    issues.append(f"{filepath}:{lineno}: {description}")

    return issues
|
|
|
|
def main():
    """Load the archived slot/class names, scan both schema directories and
    print every potential ghost reference that was found."""
    archived_slots = load_archived_entities(ARCHIVED_SLOTS_FILE)
    archived_classes = load_archived_entities(ARCHIVED_CLASSES_FILE)

    # These names double as ordinary YAML metadata keys, so treating them as
    # archived slots would flag nearly every file.
    for metadata_key in ("description", "title", "name"):
        archived_slots.discard(metadata_key)

    print(f"Loaded {len(archived_slots)} archived slots.")
    print(f"Loaded {len(archived_classes)} archived classes.")

    # Classes first, then slots (cross-slot dependencies are less likely but
    # still worth checking).
    issues = []
    for schema_dir in (CLASSES_DIR, SLOTS_DIR):
        issues.extend(scan_files(schema_dir, archived_slots, archived_classes))

    if not issues:
        print("\nNo obvious ghost references found.")
        return

    print("\nFound potential ghost references:")
    for issue in issues:
        print(issue)


if __name__ == "__main__":
    main()
|