glam/scripts/scan_ghost_references.py
kempersc 7cf10084b4 Implement scripts for schema modifications and ontology verification
- Added `fix_dual_class_link.py` to remove dual class link references from specified YAML files.
- Created `fix_specific_ghosts.py` to apply specific replacements in YAML files based on defined mappings.
- Introduced `migrate_staff_count.py` to migrate staff count references to a new structure in specified YAML files.
- Developed `migrate_type_slots.py` to replace type-related slots with new identifiers across YAML files.
- Implemented `scan_ghost_references.py` to identify and report ghost references to archived slots and classes in YAML files.
- Added `verify_ontology_terms.py` to verify the presence of ontology terms in specified ontology files against schema definitions.
2026-01-29 17:10:25 +01:00

121 lines
4.8 KiB
Python

import os
import re
# Paths
# Root of the LinkML module tree; the class and slot directories scanned by
# this script are derived from it.
SCHEMA_ROOT = "/Users/kempersc/apps/glam/schemas/20251121/linkml/modules"
SLOTS_DIR = os.path.join(SCHEMA_ROOT, "slots")
CLASSES_DIR = os.path.join(SCHEMA_ROOT, "classes")

# Plain-text listings (one archived filename per line). These are bare
# relative names, so they are resolved against the current working directory.
ARCHIVED_CLASSES_FILE = "archived_classes.txt"
ARCHIVED_SLOTS_FILE = "archived_slots.txt"
def parse_archived_name(filename):
    """Return the entity name encoded in an archived file's name.

    Archived files are expected to be named ``<name>_archived_<digits>.yaml``;
    for those, ``<name>`` is returned. A name without that suffix has only a
    trailing ``.yaml`` removed, and anything else is returned unchanged.

    Any leading directory components are dropped first, so entries such as
    ``archive/foo_archived_20250101.yaml`` (as produced by ``find`` or
    ``ls dir/*``) resolve to ``foo`` rather than ``archive/foo``, which could
    never equal a slot or class name.

    Args:
        filename: A file name or path taken from an archive listing.

    Returns:
        The entity name as a string.
    """
    # Keep the original text if stripping the directory leaves nothing
    # (e.g. an entry that ends with a path separator).
    base = os.path.basename(filename) or filename

    match = re.match(r"(.+)_archived_\d+\.yaml", base)
    if match:
        return match.group(1)

    # Fallback if no date suffix is present: just drop the extension.
    if base.endswith(".yaml"):
        return base[:-5]
    return base
def load_archived_entities(filepath):
    """Read an archive listing and return the set of entity names it contains.

    Each non-blank line of the file is treated as an archived filename and is
    converted to an entity name with ``parse_archived_name``. If the file does
    not exist, a warning is printed and an empty set is returned.
    """
    try:
        with open(filepath, 'r') as listing:
            trimmed = (raw.strip() for raw in listing)
            return {parse_archived_name(entry) for entry in trimmed if entry}
    except FileNotFoundError:
        print(f"Warning: {filepath} not found.")
        return set()
def scan_files(directory, archived_slots, archived_classes):
    """Scan YAML files under *directory* for references to archived entities.

    Every ``.yaml`` file below ``directory`` is read line by line, skipping
    any sub-directory whose name contains ``archive``. Each stripped line is
    checked for:

    * an import of an archived slot (``../slots/<slot>`` or ``./<slot>``),
    * a list item that is exactly ``- <slot>``,
    * a mapping key starting with ``<slot>:``,
    * a reference to an archived class (``/<Class>`` or ``range: <Class>``).

    Import and class matches require that the name is not followed by another
    word character, so ``has_type`` is not reported for ``has_type_code`` and
    ``Person`` is not reported for ``PersonName``.

    Args:
        directory: Root directory to walk recursively.
        archived_slots: Iterable of archived slot names.
        archived_classes: Iterable of archived class names.

    Returns:
        A list of ``"<path>:<line>: <message>"`` strings, one per suspected
        ghost reference. Files that cannot be read are reported on stdout
        and skipped.
    """
    print(f"Scanning {directory}...")

    # Build the matchers once instead of re-formatting strings for every line.
    # The (?!\w) lookahead is the word boundary that rejects partial names.
    # Sorting makes the report order independent of set iteration order.
    slot_matchers = [
        (slot, re.compile(r"(?:\.\./slots/|\./)" + re.escape(slot) + r"(?!\w)"))
        for slot in sorted(archived_slots)
    ]
    class_matchers = [
        (cls, re.compile(r"(?:/|range: )" + re.escape(cls) + r"(?!\w)"))
        for cls in sorted(archived_classes)
    ]

    issues = []
    for root, dirs, files in os.walk(directory):
        # Prune archive directories by their own name. Testing the whole path
        # would silently skip everything whenever an ancestor of the scan
        # root happens to contain "archive".
        dirs[:] = sorted(d for d in dirs if "archive" not in d)

        for file in sorted(files):
            if not file.endswith(".yaml"):
                continue
            filepath = os.path.join(root, file)

            # Only the read itself is guarded, so a logic error in the
            # matching code below is not misreported as a read failure.
            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    lines = f.read().split("\n")
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error reading {filepath}: {e}")
                continue

            for lineno, line in enumerate(lines, start=1):
                stripped = line.strip()

                for slot, import_re in slot_matchers:
                    # Explicit import of the slot module.
                    if import_re.search(stripped):
                        issues.append(f"{filepath}:{lineno}: Import of archived slot '{slot}'")
                    # Usage check (simple heuristic): an item in a slots list.
                    if stripped == f"- {slot}":
                        issues.append(f"{filepath}:{lineno}: Usage of archived slot '{slot}'")
                    # Slot usage keys.
                    if stripped.startswith(f"{slot}:"):
                        issues.append(
                            f"{filepath}:{lineno}: Slot usage key for archived slot '{slot}'"
                        )

                for cls, reference_re in class_matchers:
                    # Imports (path ending in the class) or range references.
                    if reference_re.search(stripped):
                        issues.append(f"{filepath}:{lineno}: Reference to archived class '{cls}'")
    return issues
def main():
    """Load the archive listings, scan the schema directories, print a report."""
    archived_slots = load_archived_entities(ARCHIVED_SLOTS_FILE)
    archived_classes = load_archived_entities(ARCHIVED_CLASSES_FILE)

    # Remove common metadata fields that collide with archived slot names;
    # "name" might be a slot too, but it is also a metadata key.
    for metadata_key in ("description", "title", "name"):
        archived_slots.discard(metadata_key)

    print(f"Loaded {len(archived_slots)} archived slots.")
    print(f"Loaded {len(archived_classes)} archived classes.")

    # Classes first, then slots (less likely to have cross-slot dependencies
    # that rot, but scanned as well).
    findings = []
    for target_dir in (CLASSES_DIR, SLOTS_DIR):
        findings.extend(scan_files(target_dir, archived_slots, archived_classes))

    if not findings:
        print("\nNo obvious ghost references found.")
        return

    print("\nFound potential ghost references:")
    for finding in findings:
        print(finding)


if __name__ == "__main__":
    main()