- Updated imports in FindingAid.yaml to remove unnecessary entries and added new slots for arrangement level and provenance path. - Replaced 'full_name' with 'has_or_had_label' in LegalName.yaml and ProfileData.yaml for uniformity. - Enhanced slot definitions in various YAML files, including ceases_or_ceased_through, has_or_had_arrangement_level, has_or_had_assessment, and others, to include metadata and improve structure. - Removed the script fix_linkml_metadata.py as it is no longer needed. - Added new script fix_specific_dead_links.py to handle specific mapping updates for extraction metadata and full name fields across multiple YAML files.
119 lines
4 KiB
Python
119 lines
4 KiB
Python
import os
|
|
import re
|
|
|
|
# Directory containing the YAML class files that this script rewrites in place.
SCHEMA_DIR = "/Users/kempersc/apps/glam/schemas/20251121/linkml/modules/classes/"

# Legacy slot name -> replacement slot name, applied to ExtractionMetadata.yaml.
EXTRACTION_METADATA_MAP = {
    "extraction_agent": "is_or_was_retrieved_by",
    "extraction_method": "has_or_had_method",
    "extraction_date": "retrieval_timestamp",
    "cost_usd": "has_or_had_expense",
    "source_file": "has_or_had_source",
    "staff_id": "has_or_had_identifier",
    "linkedin_url": "has_or_had_url",
    # request_id shares its replacement with staff_id.
    "request_id": "has_or_had_identifier",
}

# Legacy slot name -> replacement slot name for the full_name slot.
FULL_NAME_MAP = {
    "full_name": "has_or_had_label",
}
|
|
|
|
def _rename_extraction_slots(lines, mapping):
    """Return a copy of *lines* with legacy slot references renamed.

    Three kinds of line are rewritten, matched on their stripped text:

    * import entries    ``- ../slots/<old>``
    * slot-list entries ``- <old>``
    * mapping keys      ``<old>:``

    Every other line is passed through unchanged. Leading whitespace and the
    line terminator survive because only the slot name itself is replaced.
    """
    import_prefix = "- ../slots/"

    # Seed with the imports the file already contains, so that renaming a
    # legacy import never duplicates a target that is already imported.
    seen_imports = {
        text.split("/")[-1]
        for text in (raw.strip() for raw in lines)
        if text.startswith(import_prefix)
    }

    renamed = []
    for line in lines:
        stripped = line.strip()
        old_slot = None

        if stripped.startswith(import_prefix):
            candidate = stripped.split("/")[-1]
            # An identity entry (old == new) must not be treated as a rename,
            # otherwise the pre-seeded set would make the line drop itself.
            if candidate in mapping and mapping[candidate] != candidate:
                if mapping[candidate] in seen_imports:
                    # Target already imported (either originally, or by an
                    # earlier rename such as staff_id/request_id): drop it.
                    continue
                seen_imports.add(mapping[candidate])
                old_slot = candidate
        elif stripped.startswith("- ") and stripped[2:] in mapping:
            # Slot-list entry. Two legacy names with the same target yield
            # duplicate entries; they are intentionally left in place rather
            # than deleting lines from a list we cannot fully parse here.
            old_slot = stripped[2:]
        elif stripped.endswith(":") and stripped[:-1] in mapping:
            # Mapping key (e.g. under slot_usage).
            # NOTE(review): two legacy keys that share a target under the same
            # parent become duplicate YAML keys -- confirm the loader in use
            # tolerates that, or merge those entries by hand.
            old_slot = stripped[:-1]

        if old_slot is None:
            renamed.append(line)
        else:
            renamed.append(line.replace(old_slot, mapping[old_slot]))

    return renamed


def fix_extraction_metadata(schema_dir=None, mapping=None):
    """Rename legacy slot references inside ``ExtractionMetadata.yaml``.

    The file is edited in place, line by line, without parsing the YAML.

    Args:
        schema_dir: Directory containing ``ExtractionMetadata.yaml``.
            Defaults to the module-level ``SCHEMA_DIR``.
        mapping: Dict of legacy slot name -> replacement slot name.
            Defaults to the module-level ``EXTRACTION_METADATA_MAP``.

    Returns:
        None. If the file does not exist a notice is printed and nothing
        else happens.
    """
    if schema_dir is None:
        schema_dir = SCHEMA_DIR
    if mapping is None:
        mapping = EXTRACTION_METADATA_MAP

    filepath = os.path.join(schema_dir, "ExtractionMetadata.yaml")
    if not os.path.exists(filepath):
        # Say so explicitly: a silent return hides a wrong schema directory.
        print(f"Skipping {filepath} (not found)")
        return

    print(f"Fixing {filepath}...")
    # Explicit UTF-8 avoids depending on the platform's locale encoding;
    # newline="" keeps the file's own line endings (LF or CRLF) untouched.
    with open(filepath, "r", encoding="utf-8", newline="") as f:
        lines = f.readlines()

    new_lines = _rename_extraction_slots(lines, mapping)

    # Leave the file alone when there is nothing to rename.
    if new_lines != lines:
        with open(filepath, "w", encoding="utf-8", newline="") as f:
            f.writelines(new_lines)
|
|
|
|
def _rename_full_name_lines(lines, mapping):
    """Return a copy of *lines* with exact-match slot references renamed.

    A line is rewritten only when its stripped text is exactly one of
    ``- ../slots/<old>``, ``- <old>`` or ``<old>:`` for some ``<old>`` in
    *mapping*; longer names that merely start with ``<old>`` are untouched.
    """
    import_prefix = "- ../slots/"

    # Imports already present in the file; renaming onto one of these would
    # only create a duplicate import line.
    seen_imports = {
        text[len(import_prefix):]
        for text in (raw.strip() for raw in lines)
        if text.startswith(import_prefix)
    }

    renamed = []
    for line in lines:
        stripped = line.strip()

        if stripped.startswith(import_prefix):
            old = stripped[len(import_prefix):]
            # Identity entries (old == new) are left alone so the pre-seeded
            # set cannot make a line drop itself.
            if old in mapping and mapping[old] != old:
                new = mapping[old]
                if new not in seen_imports:
                    seen_imports.add(new)
                    renamed.append(line.replace(old, new))
                # else: target already imported -> drop the legacy import.
                continue
        elif stripped.startswith("- ") and stripped[2:] in mapping:
            # Slot-list entry.
            old = stripped[2:]
            renamed.append(line.replace(old, mapping[old]))
            continue
        elif stripped.endswith(":") and stripped[:-1] in mapping:
            # Mapping key (e.g. under slot_usage); replace including the
            # colon so only the key itself is touched.
            old = stripped[:-1]
            renamed.append(line.replace(f"{old}:", f"{mapping[old]}:"))
            continue

        renamed.append(line)

    return renamed


def fix_full_name(filename, schema_dir=None, mapping=None):
    """Replace references to the legacy ``full_name`` slot in one class file.

    The file is edited in place, line by line, without parsing the YAML.

    Args:
        filename: Name of the YAML file, relative to the schema directory.
        schema_dir: Directory containing *filename*. Defaults to the
            module-level ``SCHEMA_DIR``.
        mapping: Dict of legacy slot name -> replacement slot name.
            Defaults to the module-level ``FULL_NAME_MAP``.

    Returns:
        None. If the file does not exist a notice is printed and nothing
        else happens.
    """
    if schema_dir is None:
        schema_dir = SCHEMA_DIR
    if mapping is None:
        mapping = FULL_NAME_MAP

    filepath = os.path.join(schema_dir, filename)
    if not os.path.exists(filepath):
        # Say so explicitly: a silent return hides a mistyped file name.
        print(f"Skipping {filepath} (not found)")
        return

    print(f"Fixing {filepath}...")
    # Explicit UTF-8 avoids depending on the platform's locale encoding;
    # newline="" keeps the file's own line endings (LF or CRLF) untouched.
    with open(filepath, "r", encoding="utf-8", newline="") as f:
        lines = f.readlines()

    new_lines = _rename_full_name_lines(lines, mapping)

    # Leave the file alone when there is nothing to rename.
    if new_lines != lines:
        with open(filepath, "w", encoding="utf-8", newline="") as f:
            f.writelines(new_lines)
|
|
|
|
def main():
    """Apply the ExtractionMetadata renames, then the full_name rename to each listed class file."""
    fix_extraction_metadata()

    full_name_targets = (
        "FindingAid.yaml",
        "OrganizationBranch.yaml",
        "DigitalPlatformV2OrganizationStatus.yaml",  # Also flagged
        "LegalName.yaml",  # Also flagged
        "CustodianLegalStatus.yaml",  # Also flagged
        "ProfileData.yaml",  # Also flagged
    )
    # Same order as the original one-call-per-file sequence.
    for yaml_name in full_name_targets:
        fix_full_name(yaml_name)


if __name__ == "__main__":
    main()
|