glam/scripts/trace_dead_links.py
kempersc 2f44857028 Refactor LinkML schemas and slots for consistency and clarity
- Updated imports in FindingAid.yaml to remove unnecessary entries and added new slots for arrangement level and provenance path.
- Replaced 'full_name' with 'has_or_had_label' in LegalName.yaml and ProfileData.yaml for uniformity.
- Enhanced slot definitions in various YAML files, including ceases_or_ceased_through, has_or_had_arrangement_level, has_or_had_assessment, and others, to include metadata and improve structure.
- Removed the script fix_linkml_metadata.py as it is no longer needed.
- Added new script fix_specific_dead_links.py to handle specific mapping updates for extraction metadata and full name fields across multiple YAML files.
2026-01-29 18:17:47 +01:00

93 lines
3.7 KiB
Python

import os
import re
import glob
# Directory whose *.yaml entries name the archived slots; listed by
# get_archived_slot_names(). NOTE: hard-coded absolute path.
ARCHIVE_DIR = "/Users/kempersc/apps/glam/schemas/20251121/linkml/modules/slots/archive/"
# Directory whose *.yaml class files are scanned by find_references().
# NOTE: hard-coded absolute path.
CLASSES_DIR = "/Users/kempersc/apps/glam/schemas/20251121/linkml/modules/classes/"
def get_archived_slot_names(archive_dir=None):
    """Collect the names of archived slots from the archive directory.

    Every entry ending in ``.yaml`` contributes one slot name: the file name
    without the ``.yaml`` extension and, when present, without everything
    from ``_archived_`` onwards (``full_name_archived_20260101.yaml`` yields
    ``full_name``).

    Args:
        archive_dir: Directory to list. When omitted, the module-level
            ``ARCHIVE_DIR`` is used.

    Returns:
        A set of slot-name strings. Entries that would produce an empty
        name (for example a file called ``.yaml``) are skipped, because an
        empty name would match every line in a later substring/prefix scan.

    Raises:
        OSError: If the directory cannot be listed.
    """
    if archive_dir is None:
        archive_dir = ARCHIVE_DIR

    extension = ".yaml"
    slots = set()
    for filename in os.listdir(archive_dir):
        if not filename.endswith(extension):
            continue
        stem = filename[: -len(extension)]
        # Filename pattern: name.yaml or name_archived_YYYYMMDD.yaml;
        # partition() leaves the stem untouched when the marker is absent.
        name = stem.partition("_archived_")[0]
        if name:
            slots.add(name)
    return slots
def find_references(archived_slots, classes_dir=None):
    """Locate references to archived slots inside the class YAML files.

    Every ``*.yaml`` file directly inside the classes directory is read line
    by line. For a given slot, a line is reported once, using the first rule
    that applies:

    * ``Import`` - a non-comment line contains ``../slots/<slot>`` where
      ``<slot>`` is the *complete* import target (an optional ``.yaml``
      extension is ignored). ``../slots/full_name_variant`` is therefore
      not a reference to an archived ``full_name``.
    * ``Usage in slots list`` - the stripped line is exactly ``- <slot>``.
    * ``Usage as key`` - the stripped line starts with ``<slot>:`` and the
      slot is not one of the generic LinkML metadata keys listed below.
      Keys under ``slot_usage`` are deliberately reported: an archived slot
      should not be refined any more.

    Args:
        archived_slots: Iterable of archived slot names.
        classes_dir: Directory holding the class files. When omitted, the
            module-level ``CLASSES_DIR`` is used.

    Returns:
        A dict mapping each referenced slot name to a list of strings of
        the form ``"<path> (line <n>): <kind>"``, ordered by file path and
        line number. Slots without references are absent.

    Raises:
        OSError: If a class file cannot be read.
        UnicodeDecodeError: If a class file is not valid UTF-8.
    """
    if classes_dir is None:
        classes_dir = CLASSES_DIR

    # Set membership replaces the original per-line loop over every slot.
    archived = set(archived_slots)

    # Metadata keys that mimic slot names but are valid LinkML structure.
    # "Usage as key" is ignored for these.
    SAFE_METADATA_KEYS = {
        "title", "description", "name", "id", "status", "notes", "comments", "examples",
        "todos", "see_also", "range", "slot_usage", "required", "multivalued",
        "inlined", "identifier", "value", "unit", "prefixes", "imports", "classes",
        "slots", "attributes", "exact_mappings", "close_mappings", "related_mappings"
    }

    # Captures the whole import target after "../slots/", stopping at
    # whitespace, quotes, a trailing comment or flow-sequence punctuation.
    import_target = re.compile(r"""\.\./slots/([^\s"'#,\]]+)""")

    references = {}  # {slot_name: [location strings]}

    def record(slot, path, line_no, kind):
        references.setdefault(slot, []).append(f"{path} (line {line_no}): {kind}")

    # Sorted so the report is stable between runs (glob order is arbitrary).
    for cls_file in sorted(glob.glob(os.path.join(classes_dir, "*.yaml"))):
        # Explicit encoding: the default depends on the platform locale.
        with open(cls_file, "r", encoding="utf-8") as handle:
            for line_no, line in enumerate(handle, start=1):
                stripped = line.strip()
                if not stripped:
                    continue

                # Import check: "- ../slots/slotname"
                imported = set()
                if not stripped.startswith("#"):
                    for match in import_target.finditer(stripped):
                        target = match.group(1)
                        if target.endswith(".yaml"):
                            target = target[: -len(".yaml")]
                        if target in archived and target not in imported:
                            imported.add(target)
                            record(target, cls_file, line_no, "Import")

                # Usage in slots list: "- slotname" (exact match only).
                if stripped.startswith("- "):
                    item = stripped[2:]
                    if item in archived and item not in imported:
                        record(item, cls_file, line_no, "Usage in slots list")
                        continue

                # Usage as key: "slotname:"
                key, colon, _ = stripped.partition(":")
                if (
                    colon
                    and key in archived
                    and key not in imported
                    and key not in SAFE_METADATA_KEYS
                ):
                    record(key, cls_file, line_no, "Usage as key")

    return references
def main():
    """Print a report of archived slots that class files still reference.

    Gathers the archived slot names, scans the class files for them and
    prints either the per-slot list of locations or a message saying that
    no dead links were found.
    """
    print("Identifying archived slots...")
    archived = get_archived_slot_names()
    print(f"Found {len(archived)} archived slots.")

    # Nothing is filtered out before scanning: even a generic-looking name
    # such as 'description' is worth reporting when it was archived as a
    # slot (has_or_had_description should be used instead).
    print("Scanning class files for references...")
    dead_links = find_references(archived)

    if not dead_links:
        print("No dead links found.")
        return

    print(f"Found {len(dead_links)} archived slots still referenced in classes:")
    for slot_name in dead_links:
        print(f"\nSLOT: {slot_name}")
        for location in dead_links[slot_name]:
            print(f" {location}")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()