"""Rename legacy slot references in LinkML class-module YAML files.

The script performs a line-based rewrite (no YAML parsing) of three kinds of
lines:

* slot imports      ``- ../slots/<old_name>``
* slot list entries ``- <old_name>``
* slot_usage keys   ``<old_name>:``

Only lines whose stripped text matches one of these shapes exactly are
touched; everything else is copied through unchanged.
"""
import os
import re  # NOTE(review): unused by the code visible here; kept on purpose.

SCHEMA_DIR = "/Users/kempersc/apps/glam/schemas/20251121/linkml/modules/classes/"

# Mappings for ExtractionMetadata.yaml
EXTRACTION_METADATA_MAP = {
    "extraction_agent": "is_or_was_retrieved_by",
    "extraction_method": "has_or_had_method",
    "extraction_date": "retrieval_timestamp",
    "cost_usd": "has_or_had_expense",
    "source_file": "has_or_had_source",
    "staff_id": "has_or_had_identifier",
    "linkedin_url": "has_or_had_url",
    "request_id": "has_or_had_identifier",  # request_id also maps to identifier
}

# General mapping for full_name
FULL_NAME_MAP = {
    "full_name": "has_or_had_label",
}

# Prefix (after stripping) that identifies a slot import line.
_SLOT_IMPORT_PREFIX = "- ../slots/"


def _rename_slot_lines(lines, slot_map):
    """Return a copy of *lines* with old slot names replaced per *slot_map*.

    Args:
        lines: Lines of a YAML file, each still carrying its line ending.
        slot_map: Mapping of old slot name -> new slot name.

    Returns:
        A new list of lines. Import lines that would duplicate an import
        that is already present (or was already emitted) are dropped.
    """
    # Seed with every slot import already in the file, so that renaming an old
    # import never produces a second copy of an import that already exists.
    imported = {
        text.split("/")[-1]
        for text in (line.strip() for line in lines)
        if text.startswith(_SLOT_IMPORT_PREFIX)
    }

    result = []
    for line in lines:
        stripped = line.strip()

        # 1. Imports
        if stripped.startswith(_SLOT_IMPORT_PREFIX):
            old_name = stripped.split("/")[-1]
            new_name = slot_map.get(old_name)
            if new_name is None:
                result.append(line)
            elif new_name not in imported:
                result.append(line.replace(old_name, new_name))
                imported.add(new_name)
            # Otherwise the new slot is already imported: drop this line.
            continue

        # 2. Slots list entries ("- name") and 3. slot_usage keys ("name:").
        candidates = []
        if stripped.startswith("- "):
            candidates.append(stripped[2:])
        if stripped.endswith(":"):
            candidates.append(stripped[:-1])
        old_name = next((c for c in candidates if c in slot_map), None)

        if old_name is None:
            result.append(line)
        else:
            # Several old slots may map to the same new slot (staff_id and
            # request_id -> has_or_had_identifier), so a slots list can end up
            # with a repeated entry. Plain replacement is kept deliberately:
            # it is safer than deleting list entries without parsing the YAML.
            result.append(line.replace(old_name, slot_map[old_name]))

    return result


def _rewrite_slot_file(filepath, slot_map):
    """Apply *slot_map* to the file at *filepath* in place.

    A missing file is skipped silently (best effort). The file is read and
    written as UTF-8 with ``newline=""`` so existing line endings survive,
    and it is only rewritten when at least one line changed.
    """
    if not os.path.exists(filepath):
        return

    print(f"Fixing {filepath}...")
    with open(filepath, "r", encoding="utf-8", newline="") as f:
        lines = f.readlines()

    new_lines = _rename_slot_lines(lines, slot_map)

    if new_lines != lines:
        with open(filepath, "w", encoding="utf-8", newline="") as f:
            f.writelines(new_lines)


def fix_extraction_metadata(schema_dir=None):
    """Rename the legacy slots referenced by ``ExtractionMetadata.yaml``.

    Args:
        schema_dir: Directory containing the class modules. Defaults to the
            module-level ``SCHEMA_DIR`` (resolved at call time).
    """
    base_dir = SCHEMA_DIR if schema_dir is None else schema_dir
    filepath = os.path.join(base_dir, "ExtractionMetadata.yaml")
    _rewrite_slot_file(filepath, EXTRACTION_METADATA_MAP)


def fix_full_name(filename, schema_dir=None):
    """Rename ``full_name`` to ``has_or_had_label`` in one class module.

    Args:
        filename: File name of the class module, relative to the schema dir.
        schema_dir: Directory containing the class modules. Defaults to the
            module-level ``SCHEMA_DIR`` (resolved at call time).
    """
    base_dir = SCHEMA_DIR if schema_dir is None else schema_dir
    filepath = os.path.join(base_dir, filename)
    _rewrite_slot_file(filepath, FULL_NAME_MAP)


def main():
    """Run every slot rename against the default schema directory."""
    fix_extraction_metadata()
    fix_full_name("FindingAid.yaml")
    fix_full_name("OrganizationBranch.yaml")
    fix_full_name("DigitalPlatformV2OrganizationStatus.yaml")  # Also flagged
    fix_full_name("LegalName.yaml")  # Also flagged
    fix_full_name("CustodianLegalStatus.yaml")  # Also flagged
    fix_full_name("ProfileData.yaml")  # Also flagged


if __name__ == "__main__":
    main()