# glam/scripts/fix_specific_dead_links.py

import os
import re

# Directory containing the LinkML class modules that this script rewrites.
SCHEMA_DIR = "/Users/kempersc/apps/glam/schemas/20251121/linkml/modules/classes/"

# Old slot name -> replacement slot name, applied to ExtractionMetadata.yaml.
EXTRACTION_METADATA_MAP = {
    # How, when and by whom the data was retrieved
    "extraction_agent": "is_or_was_retrieved_by",
    "extraction_method": "has_or_had_method",
    "extraction_date": "retrieval_timestamp",
    # Cost and origin
    "cost_usd": "has_or_had_expense",
    "source_file": "has_or_had_source",
    # staff_id and request_id intentionally share one replacement slot
    "staff_id": "has_or_had_identifier",
    "linkedin_url": "has_or_had_url",
    "request_id": "has_or_had_identifier",
}

# Old -> replacement name for the generic full_name slot.
# NOTE(review): no function visible in this file reads this mapping;
# fix_full_name() hard-codes the same pair.
FULL_NAME_MAP = {
    "full_name": "has_or_had_label",
}

def fix_extraction_metadata():
    """Rename retired slot references in ``ExtractionMetadata.yaml`` in place.

    The file is processed line by line (it is not parsed as YAML). Using
    ``EXTRACTION_METADATA_MAP`` (old slot name -> replacement slot name),
    three kinds of lines are rewritten:

    1. Import lines of the form ``- ../slots/<old>``.
    2. Bare list items of the form ``- <old>`` (e.g. entries of a slots list).
    3. Mapping keys of the form ``<old>:`` (e.g. entries of slot_usage).

    Because several old names share one replacement, plain renaming would
    produce repeated lines. Therefore:

    * an import is emitted at most once per file, and is dropped entirely
      when the file already imports the replacement slot;
    * within one contiguous run of list items at the same indentation, each
      replacement slot is listed at most once.

    The file is read and written as UTF-8 and is only rewritten when its
    content actually changes. Returns ``None``; does nothing if the file
    does not exist.

    NOTE(review): mapping keys (case 3) are renamed but never merged. If two
    old keys that share a replacement both occur in the same mapping, the
    result contains that key twice and must be merged by hand.
    """
    filepath = os.path.join(SCHEMA_DIR, "ExtractionMetadata.yaml")
    if not os.path.exists(filepath):
        return

    print(f"Fixing {filepath}...")
    # Explicit encoding: the platform default is locale-dependent and could
    # corrupt non-ASCII text on a read/write round trip.
    with open(filepath, "r", encoding="utf-8") as f:
        lines = f.readlines()

    import_prefix = "- ../slots/"
    replacement_slots = set(EXTRACTION_METADATA_MAP.values())

    # Replacement slots the file already imports; renaming an old import to
    # one of these would only create a duplicate import line.
    imported = {
        text.split("/")[-1]
        for text in (raw.strip() for raw in lines)
        if text.startswith(import_prefix)
    } & replacement_slots

    new_lines = []
    list_indent = None  # indentation of the list run currently being read
    listed = set()      # replacement slots already emitted in that run

    for line in lines:
        stripped = line.strip()

        if not stripped.startswith("- "):
            # Any non-list line ends the current run of list items.
            list_indent, listed = None, set()

            # 3. Slot usage keys
            key = stripped[:-1] if stripped.endswith(":") else None
            if key in EXTRACTION_METADATA_MAP:
                line = line.replace(key, EXTRACTION_METADATA_MAP[key])
            new_lines.append(line)
            continue

        # 1. Imports
        if stripped.startswith(import_prefix):
            slot_name = stripped.split("/")[-1]
            new_slot = EXTRACTION_METADATA_MAP.get(slot_name)
            if new_slot is None:
                new_lines.append(line)
            elif new_slot not in imported:
                imported.add(new_slot)
                new_lines.append(line.replace(slot_name, new_slot))
            # else: the replacement is already imported -> drop this line
            continue

        # 2. Slots list
        indent = len(line) - len(line.lstrip())
        if indent != list_indent:
            list_indent, listed = indent, set()

        item = stripped[2:]
        new_item = EXTRACTION_METADATA_MAP.get(item, item)
        if new_item in replacement_slots:
            if new_item in listed:
                # Already present in this list (e.g. staff_id and request_id
                # both become has_or_had_identifier) -> skip the repeat.
                continue
            listed.add(new_item)

        if new_item != item:
            line = line.replace(item, new_item)
        new_lines.append(line)

    # Leave the file untouched when there is nothing to fix.
    if new_lines != lines:
        with open(filepath, "w", encoding="utf-8") as f:
            f.writelines(new_lines)

def fix_full_name(filename):
    """Replace the ``full_name`` slot with ``has_or_had_label`` in one class file.

    The file ``SCHEMA_DIR/filename`` is processed line by line (it is not
    parsed as YAML). Three exact line shapes are rewritten, ignoring
    surrounding whitespace:

    * ``- ../slots/full_name``  -> ``- ../slots/has_or_had_label`` (import)
    * ``- full_name``           -> ``- has_or_had_label`` (list item)
    * ``full_name:``            -> ``has_or_had_label:`` (mapping key)

    To avoid introducing repeats, the import is dropped when the file
    already imports ``has_or_had_label``, and within one contiguous run of
    list items at the same indentation ``- has_or_had_label`` is emitted at
    most once.

    The file is read and written as UTF-8 and is only rewritten when its
    content actually changes.

    NOTE(review): a renamed ``full_name:`` key is not merged with an existing
    ``has_or_had_label:`` key in the same mapping; check such files by hand.

    Args:
        filename: Name of the YAML file inside ``SCHEMA_DIR``.

    Returns:
        None. Does nothing if the file does not exist.
    """
    filepath = os.path.join(SCHEMA_DIR, filename)
    if not os.path.exists(filepath):
        return

    print(f"Fixing {filepath}...")
    # Explicit encoding: the platform default is locale-dependent and could
    # corrupt non-ASCII text on a read/write round trip.
    with open(filepath, "r", encoding="utf-8") as f:
        lines = f.readlines()

    old_import = "- ../slots/full_name"
    new_import = "- ../slots/has_or_had_label"
    label_items = ("- full_name", "- has_or_had_label")

    # True once the replacement import is known to be in the output.
    label_imported = any(raw.strip() == new_import for raw in lines)

    new_lines = []
    list_indent = None    # indentation of the list run currently being read
    label_listed = False  # has_or_had_label already emitted in that run?

    for line in lines:
        stripped = line.strip()

        if not stripped.startswith("- "):
            # Any non-list line ends the current run of list items.
            list_indent, label_listed = None, False

            # Slot usage key
            if stripped == "full_name:":
                line = line.replace("full_name:", "has_or_had_label:")
            new_lines.append(line)
            continue

        # Imports
        if stripped == old_import:
            if not label_imported:
                label_imported = True
                new_lines.append(line.replace("full_name", "has_or_had_label"))
            # else: the replacement is already imported -> drop this line
            continue

        # Slots list
        indent = len(line) - len(line.lstrip())
        if indent != list_indent:
            list_indent, label_listed = indent, False

        if stripped in label_items:
            if label_listed:
                continue  # replacement already present in this list
            label_listed = True
            # No-op for a line that already reads "- has_or_had_label".
            line = line.replace("full_name", "has_or_had_label")
        new_lines.append(line)

    # Leave the file untouched when there is nothing to fix.
    if new_lines != lines:
        with open(filepath, "w", encoding="utf-8") as f:
            f.writelines(new_lines)

def main():
    """Run the ExtractionMetadata fixer, then the full_name fixer on each listed file."""
    fix_extraction_metadata()

    # Class files in which the full_name slot is replaced, in processing order.
    full_name_targets = (
        "FindingAid.yaml",
        "OrganizationBranch.yaml",
        "DigitalPlatformV2OrganizationStatus.yaml",  # Also flagged
        "LegalName.yaml",  # Also flagged
        "CustodianLegalStatus.yaml",  # Also flagged
        "ProfileData.yaml",  # Also flagged
    )
    for target in full_name_targets:
        fix_full_name(target)


if __name__ == "__main__":
    main()