glam/scripts/fix_specific_dead_links.py
kempersc 2f44857028 Refactor LinkML schemas and slots for consistency and clarity
- Updated imports in FindingAid.yaml to remove unnecessary entries and added new slots for arrangement level and provenance path.
- Replaced 'full_name' with 'has_or_had_label' in LegalName.yaml and ProfileData.yaml for uniformity.
- Enhanced slot definitions in various YAML files, including ceases_or_ceased_through, has_or_had_arrangement_level, has_or_had_assessment, and others, to include metadata and improve structure.
- Removed the script fix_linkml_metadata.py as it is no longer needed.
- Added new script fix_specific_dead_links.py to handle specific mapping updates for extraction metadata and full name fields across multiple YAML files.
2026-01-29 18:17:47 +01:00

119 lines
4 KiB
Python

import os
import re
# Directory that the fix functions below join with a YAML filename to locate
# the class file they rewrite in place.
SCHEMA_DIR = "/Users/kempersc/apps/glam/schemas/20251121/linkml/modules/classes/"

# Old slot name -> replacement slot name, applied to ExtractionMetadata.yaml.
# NOTE: "staff_id" and "request_id" deliberately share one replacement slot.
EXTRACTION_METADATA_MAP = {
    "extraction_agent": "is_or_was_retrieved_by",
    "extraction_method": "has_or_had_method",
    "extraction_date": "retrieval_timestamp",
    "cost_usd": "has_or_had_expense",
    "source_file": "has_or_had_source",
    "staff_id": "has_or_had_identifier",
    "linkedin_url": "has_or_had_url",
    "request_id": "has_or_had_identifier",
}

# Old slot name -> replacement slot name for the `full_name` rename.
FULL_NAME_MAP = {"full_name": "has_or_had_label"}
def _rewrite_extraction_metadata_lines(lines, slot_map):
    """Return a copy of *lines* with old slot names renamed via *slot_map*.

    A line is rewritten only when, after stripping surrounding whitespace,
    it has one of three shapes:

    * ``- ../slots/<old>``  (import entry)
    * ``- <old>``           (slot list entry)
    * ``<old>:``            (slot usage key)

    Indentation and the original line ending are preserved because only the
    slot name itself is substituted inside the untouched line.

    An import entry whose replacement is already imported (either because
    the file had it from the start or because an earlier entry was renamed
    to it) is dropped instead of being emitted twice. Slot list entries are
    NOT de-duplicated: if two old names map to the same new name, the list
    may end up containing that name twice.
    """
    import_prefix = "- ../slots/"

    # Seed with every slot the file already imports so that a rename never
    # duplicates an import that was present before this script ran.
    imported = {
        text.split("/")[-1]
        for text in (raw.strip() for raw in lines)
        if text.startswith(import_prefix)
    }

    rewritten = []
    for line in lines:
        stripped = line.strip()

        # 1. Import entries.
        if stripped.startswith(import_prefix):
            old_slot = stripped.split("/")[-1]
            new_slot = slot_map.get(old_slot)
            if new_slot is None:
                rewritten.append(line)
            elif new_slot not in imported:
                rewritten.append(line.replace(old_slot, new_slot))
                imported.add(new_slot)
            # Otherwise the replacement is already imported: drop the line.
            continue

        # 2. Slot list entries.
        if stripped.startswith("- ") and stripped[2:] in slot_map:
            old_slot = stripped[2:]
            rewritten.append(line.replace(old_slot, slot_map[old_slot]))
            continue

        # 3. Slot usage keys.
        if stripped.endswith(":") and stripped[:-1] in slot_map:
            old_slot = stripped[:-1]
            rewritten.append(line.replace(old_slot, slot_map[old_slot]))
            continue

        rewritten.append(line)

    return rewritten


def fix_extraction_metadata(*, schema_dir=None, slot_map=None):
    """Rename outdated slot references in ``ExtractionMetadata.yaml`` in place.

    Args:
        schema_dir: Directory containing ``ExtractionMetadata.yaml``.
            Defaults to the module-level ``SCHEMA_DIR``.
        slot_map: Mapping of old slot name to replacement slot name.
            Defaults to the module-level ``EXTRACTION_METADATA_MAP``.

    Returns:
        None. Does nothing when the file does not exist, and leaves the file
        untouched when no line needed rewriting.
    """
    if schema_dir is None:
        schema_dir = SCHEMA_DIR
    if slot_map is None:
        slot_map = EXTRACTION_METADATA_MAP

    filepath = os.path.join(schema_dir, "ExtractionMetadata.yaml")
    if not os.path.exists(filepath):
        return

    print(f"Fixing {filepath}...")

    # Explicit UTF-8 keeps the result independent of the platform locale;
    # newline="" disables newline translation so existing line endings
    # survive the round trip unchanged.
    with open(filepath, "r", encoding="utf-8", newline="") as f:
        lines = f.readlines()

    new_lines = _rewrite_extraction_metadata_lines(lines, slot_map)

    # Skip the write when nothing changed so untouched files are not rewritten.
    if new_lines != lines:
        with open(filepath, "w", encoding="utf-8", newline="") as f:
            f.writelines(new_lines)
def fix_full_name(filename, *, schema_dir=None):
    """Rename the ``full_name`` slot to ``has_or_had_label`` in one YAML file.

    A line is rewritten only when, after stripping surrounding whitespace,
    it is exactly one of:

    * ``- ../slots/full_name``  (import entry)
    * ``- full_name``           (slot list entry)
    * ``full_name:``            (slot usage key)

    Any other mention of ``full_name`` (longer slot names, free text) is
    left alone.

    Args:
        filename: Name of the YAML file inside *schema_dir*.
        schema_dir: Directory containing the file. Defaults to the
            module-level ``SCHEMA_DIR``.

    Returns:
        None. Does nothing when the file does not exist, and leaves the file
        untouched when no line needed rewriting.
    """
    if schema_dir is None:
        schema_dir = SCHEMA_DIR

    filepath = os.path.join(schema_dir, filename)
    if not os.path.exists(filepath):
        return

    print(f"Fixing {filepath}...")

    # Explicit UTF-8 keeps the result independent of the platform locale;
    # newline="" disables newline translation so existing line endings
    # survive the round trip unchanged.
    with open(filepath, "r", encoding="utf-8", newline="") as f:
        lines = f.readlines()

    # The three exact stripped-line shapes that reference the old slot.
    targets = {"- ../slots/full_name", "- full_name", "full_name:"}
    new_lines = [
        # Substituting inside the original line keeps its indentation and
        # line ending intact.
        line.replace("full_name", "has_or_had_label")
        if line.strip() in targets
        else line
        for line in lines
    ]

    # Skip the write when nothing changed so untouched files are not rewritten.
    if new_lines != lines:
        with open(filepath, "w", encoding="utf-8", newline="") as f:
            f.writelines(new_lines)
def main():
    """Run the ExtractionMetadata fix, then the full_name rename on each listed file."""
    fix_extraction_metadata()

    # Class files processed by the full_name rename, in call order.
    full_name_targets = (
        "FindingAid.yaml",
        "OrganizationBranch.yaml",
        "DigitalPlatformV2OrganizationStatus.yaml",  # Also flagged
        "LegalName.yaml",  # Also flagged
        "CustodianLegalStatus.yaml",  # Also flagged
        "ProfileData.yaml",  # Also flagged
    )
    for yaml_name in full_name_targets:
        fix_full_name(yaml_name)


if __name__ == "__main__":
    main()