glam/scripts/update_class_slot_references.py
kempersc 0845d9f30e feat(scripts): add person enrichment and slot mapping utilities
Person Enrichment Scripts:
- enrich_person_comprehensive.py: Full-featured web search enrichment via Linkup
  with Rule 6/21/26/34/35 compliance (dual timestamps, no fabrication)
- enrich_ppids_linkup.py: Batch PPID enrichment pipeline
- extract_persons_with_provenance.py: Extract person data from LinkedIn HTML
  with XPath provenance tracking

LinkML Slot Management:
- update_slot_mappings.py: Update slots for RiC-O naming (Rule 39) and
  semantic URI requirements (Rule 38)
- update_class_slot_references.py: Update class files referencing renamed slots
- validate_slot_mappings.py: Validate slot definitions against ontology rules

All scripts follow established project conventions for provenance and
ontology alignment.
2026-01-10 13:32:32 +01:00

315 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Update LinkML class files to reference renamed slots.
This script updates class files to use the new RiC-O style slot names.
Usage:
python scripts/update_class_slot_references.py --dry-run # Preview changes
python scripts/update_class_slot_references.py # Apply changes
"""
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple
# Mapping from old slot names to new slot names.
# Keys are the slot names as currently written in class files; values are the
# names they are rewritten to. The mapping is iterated in insertion order by
# update_file_content().
SLOT_RENAMES: Dict[str, str] = {
"abbreviation": "has_or_had_abbreviation",
"about_digital_presence": "is_or_was_about_digital_presence",
"about_text": "has_or_had_about_text",
"academic_affiliation": "has_or_had_academic_affiliation",
"academic_programs": "has_or_had_academic_program",
"accepts_external_work": "accepts_or_accepted_external_work",
"accepts_payment_methods": "accepts_or_accepted_payment_method",
"accepts_visiting_scholars": "accepts_or_accepted_visiting_scholar",
"access": "has_or_had_access_condition",
"access_application_url": "has_access_application_url",
"access_control": "has_or_had_access_control",
"access_description": "has_or_had_access_description",
"access_frequency": "has_or_had_access_frequency",
"access_interface_url": "has_access_interface_url",
"access_level": "has_or_had_access_level",
"access_management": "has_or_had_access_management",
"access_policy": "has_or_had_access_policy",
"access_policy_ref": "has_access_policy_reference",
"access_restricted": "is_or_was_access_restricted",
# NOTE(review): the next two keys (singular and plural) map to the same new name.
"access_restriction": "has_or_had_access_restriction",
"access_restrictions": "has_or_had_access_restriction",
"access_rights": "has_or_had_access_right",
"access_trigger_events": "has_or_had_access_trigger_event",
"accessibility_features": "has_or_had_accessibility_feature",
"accession_date": "has_accession_date",
"accession_number": "has_accession_number",
"account_id": "has_account_identifier",
"account_name": "has_or_had_account_name",
"account_status": "has_or_had_account_status",
"accreditation": "has_or_had_accreditation",
"accreditation_body": "has_or_had_accreditation_body",
"accumulation_date_end": "has_accumulation_end_date",
"accumulation_date_start": "has_accumulation_start_date",
"accuracy_meters": "has_accuracy_in_meters",
"acquisition_budget": "has_or_had_acquisition_budget",
"acquisition_date": "has_acquisition_date",
"acquisition_history": "has_acquisition_history",
"acquisition_method": "has_acquisition_method",
"acquisition_source": "has_acquisition_source",
"active_since": "has_active_since_date",
"activities_societies": "has_or_had_activity_or_society_membership",
"activity_description": "has_activity_description",
"activity_id": "has_activity_identifier",
"activity_name": "has_activity_name",
"activity_timespan": "has_activity_timespan",
"activity_type": "has_activity_type",
"actual_end": "has_actual_end_date",
"actual_return_date": "has_actual_return_date",
"actual_start": "has_actual_start_date",
"admin_office_description": "has_admin_office_description",
"admin_office_id": "has_admin_office_identifier",
"admin_office_name": "has_admin_office_name",
"admin_staff_count": "has_or_had_admin_staff_count",
"administration_description": "has_administration_description",
"administration_name": "has_administration_name",
"administrative_expenses": "has_or_had_administrative_expense",
"administrative_functions": "has_or_had_administrative_function",
"administrative_level": "has_administrative_level",
"admission_fee": "has_or_had_admission_fee",
"adoption_context": "has_adoption_context",
"affected_by_event": "is_or_was_affected_by_event",
"affected_territory": "has_or_had_affected_territory",
"affected_units": "has_or_had_affected_unit",
"affects_organization": "affects_or_affected_organization",
"affiliated_universities": "has_or_had_affiliated_university",
"affiliation": "has_or_had_affiliation",
"age": "has_age",
"agenda_description": "has_agenda_description",
"agenda_document_url": "has_agenda_document_url",
"agenda_id": "has_agenda_identifier",
"agenda_short_name": "has_agenda_short_name",
"agenda_title": "has_agenda_title",
"agenda_url": "has_agenda_url",
"agent_name": "has_agent_name",
"agent_type": "has_agent_type",
"aggregated_by": "is_or_was_aggregated_by",
"aggregates_from": "aggregates_or_aggregated_from",
"agreement_signed_date": "has_agreement_signed_date",
"air_changes_per_hour": "has_air_changes_per_hour",
"all_data_real": "has_all_data_real_flag",
"all_links": "has_link",
"allocated_by": "is_or_was_allocated_by",
"allocates": "allocates_or_allocated",
"allocation_date": "has_allocation_date",
"allows_laptops": "allows_or_allowed_laptop",
"allows_photography": "allows_or_allowed_photography",
"alpha_2": "has_alpha_2_code",
"alpha_3": "has_alpha_3_code",
"also_allocation_agency": "is_or_was_also_allocation_agency",
# NOTE(review): identity mapping - old and new names are equal.
"also_identifies_name": "also_identifies_name",
"alternative_names": "has_or_had_alternative_name",
"alternative_observed_names": "has_or_had_alternative_observed_name",
"altitude": "has_altitude",
"amendment_history": "has_amendment_history",
"animal_species_count": "has_or_had_animal_species_count",
"annex_description": "has_annex_description",
"annex_id": "has_annex_identifier",
"annex_name": "has_annex_name",
"annex_reason": "has_annex_reason",
"annotation_motivation": "has_annotation_motivation",
"annotation_segments": "has_annotation_segment",
"annotation_type": "has_annotation_type",
"annotations_by": "has_annotation_by",
"annual_participants": "has_or_had_annual_participant_count",
"annual_revenue": "has_or_had_annual_revenue",
"api_available": "has_api_available_flag",
"api_documentation": "has_api_documentation_url",
"api_endpoint": "has_api_endpoint",
"api_version": "has_api_version",
"appellation_language": "has_appellation_language",
"appellation_type": "has_appellation_type",
"appellation_value": "has_appellation_value",
"appellations": "has_or_had_appellation",
"applicable_countries": "has_applicable_country",
"application_deadline": "has_application_deadline",
"application_opening_date": "has_application_opening_date",
# NOTE(review): identity mapping - old and new names are equal.
"applies_to_call": "applies_to_call",
"appointment_required": "has_appointment_required_flag",
"appraisal_notes": "has_appraisal_note",
"appraisal_policy": "has_or_had_appraisal_policy",
"approval_date": "has_approval_date",
"approved_by": "was_approved_by",
"approximate": "is_approximate",
"archdiocese_name": "has_archdiocese_name",
"architect": "has_or_had_architect",
"architectural_style": "has_architectural_style",
"archival_reference": "has_archival_reference",
"archival_status": "has_or_had_archival_status",
"archive_branches": "has_or_had_archive_branch",
"archive_department_of": "is_or_was_archive_department_of",
"archive_description": "has_archive_description",
"archive_memento_uri": "has_archive_memento_uri",
"archive_name": "has_archive_name",
"archive_path": "has_archive_path",
"archive_scope": "has_or_had_archive_scope",
"archive_search_score": "has_archive_search_score",
"archive_series": "is_or_was_part_of_archive_series",
"archive_subtype": "has_archive_subtype",
"archived_at": "was_archived_at",
"archived_in": "is_or_was_archived_in",
"area_hectares": "has_area_in_hectares",
"area_served": "has_or_had_area_served",
"arrangement": "has_arrangement",
"arrangement_level": "has_arrangement_level",
"arrangement_notes": "has_arrangement_note",
"arrangement_system": "has_or_had_arrangement_system",
"articles_archival_stage": "has_articles_archival_stage",
"articles_document_format": "has_articles_document_format",
"articles_document_url": "has_articles_document_url",
"artist_representation": "has_or_had_artist_representation",
"artwork_count": "has_or_had_artwork_count",
"aspect_ratio": "has_aspect_ratio",
"asserted_by": "was_asserted_by",
"assertion_date": "has_assertion_date",
"assertion_id": "has_assertion_identifier",
"assertion_rationale": "has_assertion_rationale",
"assertion_value": "has_assertion_value",
"assessment_category": "has_assessment_category",
"assessment_date": "has_assessment_date",
"assigned_processor": "has_or_had_assigned_processor",
"associated_auxiliary_platform": "has_or_had_associated_auxiliary_platform",
"associated_custodian": "has_or_had_associated_custodian",
"associated_digital_platform": "has_or_had_associated_digital_platform",
"associated_encompassing_bodies": "has_or_had_associated_encompassing_body",
"associated_taxa": "has_associated_taxon",
"auction_house": "has_auction_house",
"auction_sale_name": "has_auction_sale_name",
"audience_size": "has_or_had_audience_size",
"audience_type": "has_audience_type",
"audio_event_segments": "has_audio_event_segment",
"audio_quality_score": "has_audio_quality_score",
"audit_date": "has_audit_date",
"audit_opinion": "has_audit_opinion",
"audit_status": "has_or_had_audit_status",
"auditor_name": "has_auditor_name",
"authentication_required": "has_authentication_required_flag",
"authority_file_abbreviation": "has_authority_file_abbreviation",
"authority_file_name": "has_authority_file_name",
"authority_file_url": "has_authority_file_url",
"authors": "has_author",
"auto_generated": "is_auto_generated",
"auxiliary_place_id": "has_auxiliary_place_identifier",
"auxiliary_place_type": "has_auxiliary_place_type",
"auxiliary_places": "has_auxiliary_place",
"auxiliary_platform_id": "has_auxiliary_platform_identifier",
"auxiliary_platform_type": "has_auxiliary_platform_type",
"auxiliary_platforms": "has_auxiliary_platform",
"availability_timespan": "has_availability_timespan",
"available_caption_languages": "has_available_caption_language",
"average_entry_duration_seconds": "has_average_entry_duration_seconds",
"average_scene_duration_seconds": "has_average_scene_duration_seconds",
}
def find_class_files(classes_dir: Path) -> List[Path]:
    """Collect every ``*.yaml`` path found at any depth below ``classes_dir``.

    The result is a list in whatever order the filesystem walk yields;
    callers that need a stable order must sort it themselves.
    """
    # rglob("*.yaml") is the recursive form of glob("**/*.yaml").
    yaml_paths = classes_dir.rglob("*.yaml")
    return list(yaml_paths)
def update_file_content(content: str, renames: Dict[str, str]) -> Tuple[str, List[str]]:
    """Rename bare YAML mapping keys in ``content`` according to ``renames``.

    A line is rewritten only when it consists of leading whitespace, an old
    slot name, a colon and optional trailing whitespace (i.e. a key whose
    value starts on a following line). Keys with an inline value
    (``old_name: something``) and list items (``- old_name``) are left as-is.

    NOTE(review): class files that reference slots as list items under a
    ``slots:`` key would not be touched by this function - confirm whether
    that is intended.

    Args:
        content: Full text of a class file.
        renames: Mapping of old slot name to new slot name. Renames are
            applied one after another in the mapping's iteration order.

    Returns:
        A tuple ``(updated_content, changes)`` where ``changes`` holds one
        ``"old -> new"`` entry per rename that matched at least one line.
    """
    changes: List[str] = []
    updated_content = content
    for old_name, new_name in renames.items():
        # An identity mapping cannot change the text; reporting it as a
        # change would trigger a pointless rewrite of the file.
        if old_name == new_name:
            continue
        # re.escape keeps regex metacharacters in a slot name literal.
        # Note that \s also matches newlines, so the leading group may span
        # preceding blank lines; the captured text is restored verbatim.
        pattern = re.compile(rf'^(\s+){re.escape(old_name)}:(\s*)$', re.MULTILINE)
        # A callable replacement inserts new_name literally; a template such
        # as r'\1' + new_name misparses names starting with a digit (group
        # "\13...") or containing backslashes.
        updated_content, match_count = pattern.subn(
            lambda m: f"{m.group(1)}{new_name}:{m.group(2)}",
            updated_content,
        )
        if match_count:
            changes.append(f"{old_name} -> {new_name}")
    return updated_content, changes
def process_file(file_path: Path, renames: Dict[str, str], dry_run: bool = False) -> Tuple[bool, List[str]]:
    """Apply slot renames to one class file.

    Args:
        file_path: File to read and, unless ``dry_run`` is set, rewrite.
        renames: Mapping of old slot name to new slot name, passed through
            to ``update_file_content``.
        dry_run: When true, compute the changes but do not write the file.

    Returns:
        ``(True, changes)`` on success, where ``changes`` is the list
        reported by ``update_file_content`` (empty when the text would not
        change). ``(False, [message])`` when reading or writing failed.
    """
    try:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        content = file_path.read_text(encoding="utf-8")
    except (OSError, ValueError) as e:  # ValueError covers UnicodeDecodeError
        return False, [f"Error reading {file_path}: {e}"]
    updated_content, changes = update_file_content(content, renames)
    # Nothing to do when no rename matched or the text is unchanged; skipping
    # here avoids rewriting the file for no reason.
    if not changes or updated_content == content:
        return True, []
    if not dry_run:
        try:
            file_path.write_text(updated_content, encoding="utf-8")
        except (OSError, ValueError) as e:
            return False, [f"Error writing {file_path}: {e}"]
    return True, changes
def main(argv=None) -> int:
    """Command-line entry point: rename slot references in all class files.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None``, argparse reads ``sys.argv[1:]``.

    Returns:
        Process exit code: ``0`` on success, ``1`` when the classes
        directory is missing or at least one file could not be read or
        written.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Update class files with new slot names")
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing files")
    parser.add_argument("--classes-dir", default="schemas/20251121/linkml/modules/classes",
                        help="Path to classes directory")
    args = parser.parse_args(argv)

    classes_dir = Path(args.classes_dir)
    # is_dir() also rejects a path that exists but is a regular file.
    if not classes_dir.is_dir():
        print(f"Classes directory not found: {classes_dir}")
        return 1

    class_files = find_class_files(classes_dir)
    print(f"Found {len(class_files)} class files")
    print(f"Checking for {len(SLOT_RENAMES)} slot renames")
    print(f"Dry run: {args.dry_run}")
    print()

    files_updated = 0
    files_failed = 0
    total_changes = 0
    for file_path in sorted(class_files):
        success, changes = process_file(file_path, SLOT_RENAMES, args.dry_run)
        if not success:
            # On failure the list carries error messages, not renames; keep
            # them out of the update counters and report them on stderr.
            files_failed += 1
            for message in changes:
                print(message, file=sys.stderr)
            continue
        if not changes:
            continue
        files_updated += 1
        total_changes += len(changes)
        rel_path = file_path.relative_to(classes_dir)
        action = "Would update" if args.dry_run else "Updated"
        print(f"{action} {rel_path}:")
        for change in changes:
            print(f" {change}")
        print()

    print(f"Files updated: {files_updated}")
    print(f"Total slot renames: {total_changes}")
    if files_failed:
        print(f"Files failed: {files_failed}", file=sys.stderr)
        return 1
    return 0
if __name__ == "__main__":
    # exit() is a helper injected by the site module for interactive use and
    # is absent when site is disabled (e.g. python -S); raising SystemExit
    # passes main()'s return code to the interpreter without that dependency.
    raise SystemExit(main())