glam/scripts/fix_archive_class_files.py
kempersc 98c42bf272 Fix LinkML URI conflicts and generate RDF outputs
- Fix scope_note → finding_aid_scope_note in FindingAid.yaml
- Remove duplicate wikidata_entity slot from CustodianType.yaml (import instead)
- Remove duplicate rico_record_set_type from class_metadata_slots.yaml
- Fix range types for equals_string compatibility (uriorcurie → string)
- Move class names from close_mappings to see_also in 10 RecordSetTypes files
- Generate all RDF formats: OWL, N-Triples, RDF/XML, N3, JSON-LD context
- Sync schemas to frontend/public/schemas/

Files: 1,151 changed (includes prior CustodianType migration)
2026-01-07 12:32:59 +01:00

192 lines
6.9 KiB
Python

#!/usr/bin/env python3
"""
Fix archive class files to properly integrate with RecordSetTypes.
This script:
1. Removes duplicate 'slots:' sections
2. Adds the RecordSetType base class if missing
3. Fixes imports
"""
import os
import re
from pathlib import Path
from datetime import datetime
# Root of the LinkML schema tree. Hard-coded absolute path; adjust it when
# running the script from a different checkout location.
SCHEMA_BASE = Path("/Users/kempersc/apps/glam/schemas/20251121/linkml")
# Directory containing the class YAML files that this script globs and rewrites.
CLASSES_DIR = SCHEMA_BASE / "modules" / "classes"
# Archive classes that already have proper RecordSetType base class.
# main() skips these names without touching their files.
ALREADY_PROPER = {"AcademicArchive"}
# Skip these - not custodian classes.
# Each entry is matched as a substring of the file stem in get_archive_classes().
SKIP_PATTERNS = ["RecordSetTypes", "Association", "Network", "OrganizationType"]
def get_archive_classes(
    classes_dir: "Path | None" = None,
    skip_patterns: "list[str] | None" = None,
) -> "list[str]":
    """Return the sorted names of archive custodian class files.

    A class file is any ``*Archive*.yaml`` file directly inside the classes
    directory whose stem does not contain one of the skip patterns.

    Args:
        classes_dir: Directory to scan. Defaults to the module-level
            ``CLASSES_DIR``.
        skip_patterns: Substrings that exclude a file when found in its stem.
            Defaults to the module-level ``SKIP_PATTERNS``.

    Returns:
        File stems (names without the ``.yaml`` suffix), sorted alphabetically.
    """
    directory = CLASSES_DIR if classes_dir is None else Path(classes_dir)
    # Materialise once so a one-shot iterable can be checked against every file.
    excluded = tuple(SKIP_PATTERNS if skip_patterns is None else skip_patterns)

    return sorted(
        path.stem
        for path in directory.glob("*Archive*.yaml")
        if not any(pattern in path.stem for pattern in excluded)
    )
def _add_missing_imports(content: str, archive_name: str, has_rst_type: bool) -> str:
    """Return *content* with any missing import entries appended to ``imports:``."""
    needed = []
    if f"./{archive_name}RecordSetTypes" not in content:
        needed.append(f" - ./{archive_name}RecordSetTypes # Imports concrete subclasses")
    if "../slots/holds_record_set_types" not in content:
        needed.append(" - ../slots/holds_record_set_types # Links custodian to record set types")
    # These two are only required by the RecordSetType class that gets appended
    # when the file does not define one yet.
    if not has_rst_type:
        if "./CollectionType" not in content:
            needed.append(" - ./CollectionType")
        if "../slots/type_scope" not in content:
            needed.append(" - ../slots/type_scope")
    if not needed:
        return content

    match = re.search(r'(imports:\n(?: - [^\n]+\n)+)', content)
    if match is None:
        print(f" Warning: {archive_name} - no imports section found, imports not added")
        return content

    extended = match.group(1).rstrip('\n') + '\n' + '\n'.join(needed) + '\n'
    # Splice by span so that only the matched section is rewritten.
    return content[:match.start()] + extended + content[match.end():]


def _ensure_holds_slot(content: str, archive_name: str) -> str:
    """Return *content* with one ``slots:`` section listing holds_record_set_types."""
    sections = list(re.finditer(r'^ slots:\n((?: - [^\n]+\n)+)', content, re.MULTILINE))

    if len(sections) == 1:
        section = sections[0]
        if "holds_record_set_types" in section.group(0):
            return content
        extended = section.group(0).rstrip('\n') + '\n - holds_record_set_types\n'
        return content[:section.start()] + extended + content[section.end():]

    # Zero or several sections: build a single consolidated section.
    # NOTE(review): every matched `slots:` section in the file is merged, no
    # matter which class it belongs to - confirm that is intended for files
    # that already contain more than one class.
    slots_by_name = {}
    stripped = content
    for section in sections:
        for item in re.findall(r' - ([^\n]+)', section.group(0)):
            # Key on the bare slot name so "name" and "name # comment" are
            # treated as the same slot; the first spelling seen is kept.
            slots_by_name.setdefault(item.split('#', 1)[0].strip(), item)
    for section in reversed(sections):
        # Remove back to front so earlier match offsets stay valid.
        stripped = stripped[:section.start()] + stripped[section.end():]
    slots_by_name.setdefault('holds_record_set_types', 'holds_record_set_types')

    anchor = re.search(
        rf'( {re.escape(archive_name)}:.*?class_uri: [^\n]+\n)', stripped, re.DOTALL
    )
    if anchor is None:
        # Without an insertion point the existing sections must stay in place,
        # otherwise the slots they list would be lost.
        print(f" Warning: {archive_name} - class_uri not found, slots left unchanged")
        return content

    slots_text = "\n # Slots linking custodian to record set types\n slots:\n"
    slots_text += "".join(f" - {slots_by_name[name]}\n" for name in sorted(slots_by_name))
    return stripped[:anchor.end()] + slots_text + stripped[anchor.end():]


def _record_set_type_yaml(archive_name: str, rst_type_name: str) -> str:
    """Render the YAML text of the RecordSetType base class for *archive_name*."""
    from datetime import timezone

    # The template appends a literal "Z" (UTC designator), so the timestamp
    # itself has to be taken in UTC rather than in local time.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return f'''
# rico:RecordSetType base class for collection classification
{rst_type_name}:
description: |
A rico:RecordSetType for classifying collections held by {archive_name} custodians.
**Dual-Class Pattern**:
This class represents the COLLECTION type (rico:RecordSetType).
For the custodian organization type, see `{archive_name}`.
is_a: CollectionType
class_uri: rico:RecordSetType
slots:
- type_scope
see_also:
- {archive_name}
- rico:RecordSetType
annotations:
custodian_types: '["A"]'
custodian_types_rationale: "{rst_type_name} classifies collections held by ARCHIVE (A) type custodians"
linked_custodian_type: {archive_name}
dual_class_pattern: collection_type
specificity_score: 0.7
specificity_rationale: Type taxonomy class.
specificity_annotation_timestamp: '{timestamp}Z'
specificity_annotation_agent: opencode-claude-sonnet-4
template_specificity:
archive_search: 0.2
museum_search: 0.75
library_search: 0.75
collection_discovery: 0.75
person_research: 0.75
location_browse: 0.75
identifier_lookup: 0.75
organizational_change: 0.75
digital_platform: 0.75
general_heritage: 0.75
'''


def fix_archive_file(archive_name: str, classes_dir: "Path | None" = None) -> None:
    """Rewrite ``<archive_name>.yaml`` so it integrates with its RecordSetTypes file.

    The file is edited as plain text in three steps:

    1. Missing entries are appended to the ``imports:`` section.
    2. The ``slots:`` sections are reduced to one section that contains
       ``holds_record_set_types``.
    3. A ``<archive_name>RecordSetType`` class is appended when the file does
       not define one yet.

    The file is written back only when its text actually changed. The regexes
    and the inserted snippets depend on the exact leading whitespace used in
    the class YAML files.

    Args:
        archive_name: Stem of the class file, e.g. ``"MunicipalArchive"``.
        classes_dir: Directory holding the class files. Defaults to the
            module-level ``CLASSES_DIR``.

    Returns:
        None. Progress and skip reasons are reported with ``print``.
    """
    base_dir = CLASSES_DIR if classes_dir is None else Path(classes_dir)
    file_path = base_dir / f"{archive_name}.yaml"
    rst_file = base_dir / f"{archive_name}RecordSetTypes.yaml"

    if not file_path.exists():
        print(f" Skipping {archive_name} - file not found")
        return
    if not rst_file.exists():
        print(f" Skipping {archive_name} - no RecordSetTypes file")
        return

    # Explicit encoding keeps the result independent of the platform locale.
    original_content = file_path.read_text(encoding="utf-8")

    # Decided on the untouched text, before any step adds the class itself.
    rst_type_name = f"{archive_name}RecordSetType"
    has_rst_type = f"{rst_type_name}:" in original_content

    content = _add_missing_imports(original_content, archive_name, has_rst_type)
    content = _ensure_holds_slot(content, archive_name)
    if not has_rst_type:
        content = content.rstrip() + '\n' + _record_set_type_yaml(archive_name, rst_type_name)

    if content != original_content:
        file_path.write_text(content, encoding="utf-8")
        print(f" Fixed {archive_name}.yaml")
    else:
        print(f" {archive_name}.yaml - no changes needed")
def main():
    """Run the fixer over every archive class file and report progress.

    Prints a banner, discovers the archive classes, skips the ones listed in
    ALREADY_PROPER and passes each remaining name to fix_archive_file().
    """
    rule = "=" * 70
    for banner_line in (rule, "Archive Class File Fixer", rule):
        print(banner_line)

    class_names = get_archive_classes()
    print(f"\nProcessing {len(class_names)} archive class files\n")

    for class_name in class_names:
        if class_name not in ALREADY_PROPER:
            fix_archive_file(class_name)
        else:
            print(f" Skipping {class_name} - already properly configured")

    print("\nDone!")
# Run the fixer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()