Person Enrichment Scripts:
- enrich_person_comprehensive.py: Full-featured web search enrichment via Linkup with Rule 6/21/26/34/35 compliance (dual timestamps, no fabrication)
- enrich_ppids_linkup.py: Batch PPID enrichment pipeline
- extract_persons_with_provenance.py: Extract person data from LinkedIn HTML with XPath provenance tracking

LinkML Slot Management:
- update_slot_mappings.py: Update slots for RiC-O naming (Rule 39) and semantic URI requirements (Rule 38)
- update_class_slot_references.py: Update class files referencing renamed slots
- validate_slot_mappings.py: Validate slot definitions against ontology rules

All scripts follow established project conventions for provenance and ontology alignment.
315 lines
14 KiB
Python
315 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Update LinkML class files to reference renamed slots.
|
|
|
|
This script updates class files to use the new RiC-O style slot names.
|
|
|
|
Usage:
|
|
python scripts/update_class_slot_references.py --dry-run # Preview changes
|
|
python scripts/update_class_slot_references.py # Apply changes
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, List, Tuple
|
|
|
|
# Mapping from old slot names (keys) to their new RiC-O style slot names
# (values). Each key is searched for in the class files and replaced by its
# value.
#
# Things worth knowing when editing this table:
#   * Two entries map a name to itself ("also_identifies_name" and
#     "applies_to_call"), i.e. they describe no actual rename.
#   * "access_restriction" and "access_restrictions" both map to the same
#     new name, "has_or_had_access_restriction".
SLOT_RENAMES: Dict[str, str] = {
    "abbreviation": "has_or_had_abbreviation",
    "about_digital_presence": "is_or_was_about_digital_presence",
    "about_text": "has_or_had_about_text",
    "academic_affiliation": "has_or_had_academic_affiliation",
    "academic_programs": "has_or_had_academic_program",
    "accepts_external_work": "accepts_or_accepted_external_work",
    "accepts_payment_methods": "accepts_or_accepted_payment_method",
    "accepts_visiting_scholars": "accepts_or_accepted_visiting_scholar",
    "access": "has_or_had_access_condition",
    "access_application_url": "has_access_application_url",
    "access_control": "has_or_had_access_control",
    "access_description": "has_or_had_access_description",
    "access_frequency": "has_or_had_access_frequency",
    "access_interface_url": "has_access_interface_url",
    "access_level": "has_or_had_access_level",
    "access_management": "has_or_had_access_management",
    "access_policy": "has_or_had_access_policy",
    "access_policy_ref": "has_access_policy_reference",
    "access_restricted": "is_or_was_access_restricted",
    "access_restriction": "has_or_had_access_restriction",
    "access_restrictions": "has_or_had_access_restriction",
    "access_rights": "has_or_had_access_right",
    "access_trigger_events": "has_or_had_access_trigger_event",
    "accessibility_features": "has_or_had_accessibility_feature",
    "accession_date": "has_accession_date",
    "accession_number": "has_accession_number",
    "account_id": "has_account_identifier",
    "account_name": "has_or_had_account_name",
    "account_status": "has_or_had_account_status",
    "accreditation": "has_or_had_accreditation",
    "accreditation_body": "has_or_had_accreditation_body",
    "accumulation_date_end": "has_accumulation_end_date",
    "accumulation_date_start": "has_accumulation_start_date",
    "accuracy_meters": "has_accuracy_in_meters",
    "acquisition_budget": "has_or_had_acquisition_budget",
    "acquisition_date": "has_acquisition_date",
    "acquisition_history": "has_acquisition_history",
    "acquisition_method": "has_acquisition_method",
    "acquisition_source": "has_acquisition_source",
    "active_since": "has_active_since_date",
    "activities_societies": "has_or_had_activity_or_society_membership",
    "activity_description": "has_activity_description",
    "activity_id": "has_activity_identifier",
    "activity_name": "has_activity_name",
    "activity_timespan": "has_activity_timespan",
    "activity_type": "has_activity_type",
    "actual_end": "has_actual_end_date",
    "actual_return_date": "has_actual_return_date",
    "actual_start": "has_actual_start_date",
    "admin_office_description": "has_admin_office_description",
    "admin_office_id": "has_admin_office_identifier",
    "admin_office_name": "has_admin_office_name",
    "admin_staff_count": "has_or_had_admin_staff_count",
    "administration_description": "has_administration_description",
    "administration_name": "has_administration_name",
    "administrative_expenses": "has_or_had_administrative_expense",
    "administrative_functions": "has_or_had_administrative_function",
    "administrative_level": "has_administrative_level",
    "admission_fee": "has_or_had_admission_fee",
    "adoption_context": "has_adoption_context",
    "affected_by_event": "is_or_was_affected_by_event",
    "affected_territory": "has_or_had_affected_territory",
    "affected_units": "has_or_had_affected_unit",
    "affects_organization": "affects_or_affected_organization",
    "affiliated_universities": "has_or_had_affiliated_university",
    "affiliation": "has_or_had_affiliation",
    "age": "has_age",
    "agenda_description": "has_agenda_description",
    "agenda_document_url": "has_agenda_document_url",
    "agenda_id": "has_agenda_identifier",
    "agenda_short_name": "has_agenda_short_name",
    "agenda_title": "has_agenda_title",
    "agenda_url": "has_agenda_url",
    "agent_name": "has_agent_name",
    "agent_type": "has_agent_type",
    "aggregated_by": "is_or_was_aggregated_by",
    "aggregates_from": "aggregates_or_aggregated_from",
    "agreement_signed_date": "has_agreement_signed_date",
    "air_changes_per_hour": "has_air_changes_per_hour",
    "all_data_real": "has_all_data_real_flag",
    "all_links": "has_link",
    "allocated_by": "is_or_was_allocated_by",
    "allocates": "allocates_or_allocated",
    "allocation_date": "has_allocation_date",
    "allows_laptops": "allows_or_allowed_laptop",
    "allows_photography": "allows_or_allowed_photography",
    "alpha_2": "has_alpha_2_code",
    "alpha_3": "has_alpha_3_code",
    "also_allocation_agency": "is_or_was_also_allocation_agency",
    "also_identifies_name": "also_identifies_name",  # identity: old and new name are equal
    "alternative_names": "has_or_had_alternative_name",
    "alternative_observed_names": "has_or_had_alternative_observed_name",
    "altitude": "has_altitude",
    "amendment_history": "has_amendment_history",
    "animal_species_count": "has_or_had_animal_species_count",
    "annex_description": "has_annex_description",
    "annex_id": "has_annex_identifier",
    "annex_name": "has_annex_name",
    "annex_reason": "has_annex_reason",
    "annotation_motivation": "has_annotation_motivation",
    "annotation_segments": "has_annotation_segment",
    "annotation_type": "has_annotation_type",
    "annotations_by": "has_annotation_by",
    "annual_participants": "has_or_had_annual_participant_count",
    "annual_revenue": "has_or_had_annual_revenue",
    "api_available": "has_api_available_flag",
    "api_documentation": "has_api_documentation_url",
    "api_endpoint": "has_api_endpoint",
    "api_version": "has_api_version",
    "appellation_language": "has_appellation_language",
    "appellation_type": "has_appellation_type",
    "appellation_value": "has_appellation_value",
    "appellations": "has_or_had_appellation",
    "applicable_countries": "has_applicable_country",
    "application_deadline": "has_application_deadline",
    "application_opening_date": "has_application_opening_date",
    "applies_to_call": "applies_to_call",  # identity: old and new name are equal
    "appointment_required": "has_appointment_required_flag",
    "appraisal_notes": "has_appraisal_note",
    "appraisal_policy": "has_or_had_appraisal_policy",
    "approval_date": "has_approval_date",
    "approved_by": "was_approved_by",
    "approximate": "is_approximate",
    "archdiocese_name": "has_archdiocese_name",
    "architect": "has_or_had_architect",
    "architectural_style": "has_architectural_style",
    "archival_reference": "has_archival_reference",
    "archival_status": "has_or_had_archival_status",
    "archive_branches": "has_or_had_archive_branch",
    "archive_department_of": "is_or_was_archive_department_of",
    "archive_description": "has_archive_description",
    "archive_memento_uri": "has_archive_memento_uri",
    "archive_name": "has_archive_name",
    "archive_path": "has_archive_path",
    "archive_scope": "has_or_had_archive_scope",
    "archive_search_score": "has_archive_search_score",
    "archive_series": "is_or_was_part_of_archive_series",
    "archive_subtype": "has_archive_subtype",
    "archived_at": "was_archived_at",
    "archived_in": "is_or_was_archived_in",
    "area_hectares": "has_area_in_hectares",
    "area_served": "has_or_had_area_served",
    "arrangement": "has_arrangement",
    "arrangement_level": "has_arrangement_level",
    "arrangement_notes": "has_arrangement_note",
    "arrangement_system": "has_or_had_arrangement_system",
    "articles_archival_stage": "has_articles_archival_stage",
    "articles_document_format": "has_articles_document_format",
    "articles_document_url": "has_articles_document_url",
    "artist_representation": "has_or_had_artist_representation",
    "artwork_count": "has_or_had_artwork_count",
    "aspect_ratio": "has_aspect_ratio",
    "asserted_by": "was_asserted_by",
    "assertion_date": "has_assertion_date",
    "assertion_id": "has_assertion_identifier",
    "assertion_rationale": "has_assertion_rationale",
    "assertion_value": "has_assertion_value",
    "assessment_category": "has_assessment_category",
    "assessment_date": "has_assessment_date",
    "assigned_processor": "has_or_had_assigned_processor",
    "associated_auxiliary_platform": "has_or_had_associated_auxiliary_platform",
    "associated_custodian": "has_or_had_associated_custodian",
    "associated_digital_platform": "has_or_had_associated_digital_platform",
    "associated_encompassing_bodies": "has_or_had_associated_encompassing_body",
    "associated_taxa": "has_associated_taxon",
    "auction_house": "has_auction_house",
    "auction_sale_name": "has_auction_sale_name",
    "audience_size": "has_or_had_audience_size",
    "audience_type": "has_audience_type",
    "audio_event_segments": "has_audio_event_segment",
    "audio_quality_score": "has_audio_quality_score",
    "audit_date": "has_audit_date",
    "audit_opinion": "has_audit_opinion",
    "audit_status": "has_or_had_audit_status",
    "auditor_name": "has_auditor_name",
    "authentication_required": "has_authentication_required_flag",
    "authority_file_abbreviation": "has_authority_file_abbreviation",
    "authority_file_name": "has_authority_file_name",
    "authority_file_url": "has_authority_file_url",
    "authors": "has_author",
    "auto_generated": "is_auto_generated",
    "auxiliary_place_id": "has_auxiliary_place_identifier",
    "auxiliary_place_type": "has_auxiliary_place_type",
    "auxiliary_places": "has_auxiliary_place",
    "auxiliary_platform_id": "has_auxiliary_platform_identifier",
    "auxiliary_platform_type": "has_auxiliary_platform_type",
    "auxiliary_platforms": "has_auxiliary_platform",
    "availability_timespan": "has_availability_timespan",
    "available_caption_languages": "has_available_caption_language",
    "average_entry_duration_seconds": "has_average_entry_duration_seconds",
    "average_scene_duration_seconds": "has_average_scene_duration_seconds",
}
|
|
|
|
|
|
def find_class_files(classes_dir: Path) -> List[Path]:
    """Collect every ``*.yaml`` path located anywhere beneath ``classes_dir``.

    The search is recursive; the result order is whatever the glob yields.
    """
    yaml_paths: List[Path] = []
    yaml_paths.extend(classes_dir.glob("**/*.yaml"))
    return yaml_paths
|
|
|
|
|
|
def update_file_content(content: str, renames: Dict[str, str]) -> Tuple[str, List[str]]:
    """Rename slot keys in YAML text and report which renames were applied.

    A slot reference is recognised only when it is an *indented* mapping key
    with nothing but optional whitespace after the colon, for example::

        slot_usage:
          abbreviation:

    Other forms, such as list items (``- abbreviation``), keys followed by an
    inline value or comment, and keys at column 0, are left untouched.

    Args:
        content: Full text of a class file.
        renames: Mapping of old slot name to new slot name. Renames are
            applied one after another in the mapping's iteration order.

    Returns:
        A ``(updated_content, changes)`` tuple. ``changes`` contains one
        ``"old -> new"`` string for each rename that matched at least once,
        in the order the renames were applied. Entries whose old and new
        names are equal are skipped, because they cannot alter the text.
    """
    changes: List[str] = []
    updated_content = content

    for old_name, new_name in renames.items():
        # An identity mapping is a no-op; reporting it would make callers
        # believe the file needs rewriting.
        if old_name == new_name:
            continue

        # ``[^\S\n]`` means "whitespace other than a newline". It keeps the
        # indentation and trailing-space groups on the key's own line, so a
        # preceding blank line can never stand in for indentation. The name
        # is escaped so it is always matched literally.
        pattern = re.compile(
            rf"^([^\S\n]+){re.escape(old_name)}:([^\S\n]*)$",
            re.MULTILINE,
        )

        # A callable replacement inserts the new name verbatim; a template
        # string would interpret backslashes and group references in it.
        def _rename(match, replacement=new_name):
            return f"{match.group(1)}{replacement}:{match.group(2)}"

        updated_content, hits = pattern.subn(_rename, updated_content)
        if hits:
            changes.append(f"{old_name} -> {new_name}")

    return updated_content, changes
|
|
|
|
|
|
def process_file(file_path: Path, renames: Dict[str, str], dry_run: bool = False) -> Tuple[bool, List[str]]:
    """Apply slot renames to a single class file.

    Args:
        file_path: Path of the file to process.
        renames: Mapping of old slot name to new slot name, passed through to
            ``update_file_content``.
        dry_run: When true, compute the changes but do not write the file.

    Returns:
        A ``(success, messages)`` tuple:

        * ``(True, [])`` when no rename applied to the file,
        * ``(True, changes)`` with the ``"old -> new"`` entries otherwise,
        * ``(False, [error_message])`` when reading or writing failed.
    """
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform
        # default, and pass newline="" so existing line endings are read
        # unchanged rather than being normalised.
        with file_path.open("r", encoding="utf-8", newline="") as handle:
            content = handle.read()
    except (OSError, UnicodeError) as e:
        return False, [f"Error reading {file_path}: {e}"]

    updated_content, changes = update_file_content(content, renames)

    if not changes:
        return True, []

    # Skip the write when nothing actually differs, so the file is not
    # rewritten needlessly.
    if not dry_run and updated_content != content:
        try:
            with file_path.open("w", encoding="utf-8", newline="") as handle:
                handle.write(updated_content)
        except (OSError, UnicodeError) as e:
            return False, [f"Error writing {file_path}: {e}"]

    return True, changes
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point: rename slot references in all class files.

    Parses ``--dry-run`` and ``--classes-dir``, processes every YAML file
    found under the classes directory in sorted order, and prints a per-file
    report followed by a summary.

    Returns:
        ``0`` on success; ``1`` when the classes directory does not exist or
        when at least one file could not be read or written.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Update class files with new slot names")
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing files")
    parser.add_argument("--classes-dir", default="schemas/20251121/linkml/modules/classes",
                        help="Path to classes directory")
    args = parser.parse_args()

    classes_dir = Path(args.classes_dir)
    if not classes_dir.exists():
        print(f"Classes directory not found: {classes_dir}")
        return 1

    class_files = find_class_files(classes_dir)
    print(f"Found {len(class_files)} class files")
    print(f"Checking for {len(SLOT_RENAMES)} slot renames")
    print(f"Dry run: {args.dry_run}")
    print()

    files_updated = 0
    total_changes = 0
    files_failed = 0

    for file_path in sorted(class_files):
        success, messages = process_file(file_path, SLOT_RENAMES, args.dry_run)

        if not success:
            # On failure process_file returns error text, not rename entries;
            # report it as an error instead of counting it as an update.
            files_failed += 1
            for message in messages:
                print(f"✗ {message}", file=sys.stderr)
            continue

        if not messages:
            continue

        files_updated += 1
        total_changes += len(messages)
        rel_path = file_path.relative_to(classes_dir)
        action = "Would update" if args.dry_run else "Updated"
        print(f"✓ {action} {rel_path}:")
        for change in messages:
            print(f" {change}")

    print()
    print(f"Files updated: {files_updated}")
    print(f"Total slot renames: {total_changes}")

    if files_failed:
        print(f"Files with errors: {files_failed}", file=sys.stderr)
        return 1

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Raise SystemExit directly instead of calling exit(): that helper is
    # installed by the ``site`` module for interactive use and is missing
    # when Python runs with -S.
    raise SystemExit(main())
|