- Fix scope_note → finding_aid_scope_note in FindingAid.yaml - Remove duplicate wikidata_entity slot from CustodianType.yaml (import instead) - Remove duplicate rico_record_set_type from class_metadata_slots.yaml - Fix range types for equals_string compatibility (uriorcurie → string) - Move class names from close_mappings to see_also in 10 RecordSetTypes files - Generate all RDF formats: OWL, N-Triples, RDF/XML, N3, JSON-LD context - Sync schemas to frontend/public/schemas/ Files: 1,151 changed (includes prior CustodianType migration)
192 lines
6.9 KiB
Python
192 lines
6.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix archive class files to properly integrate with RecordSetTypes.
|
|
|
|
This script:
|
|
1. Removes duplicate 'slots:' sections
|
|
2. Adds the RecordSetType base class if missing
|
|
3. Fixes imports
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Root of the LinkML schema tree this script operates on.
# NOTE(review): absolute, machine-specific path - confirm before running on
# another checkout or host.
SCHEMA_BASE = Path("/Users/kempersc/apps/glam/schemas/20251121/linkml")
# Directory scanned for "*Archive*.yaml" class files and their
# "<Name>RecordSetTypes.yaml" companions.
CLASSES_DIR = SCHEMA_BASE / "modules" / "classes"

# Archive classes that already have proper RecordSetType base class
# (main() skips these without opening the file).
ALREADY_PROPER = {"AcademicArchive"}

# Skip these - not custodian classes.
# A file is ignored when any of these substrings occurs in its stem.
SKIP_PATTERNS = ["RecordSetTypes", "Association", "Network", "OrganizationType"]
|
|
|
|
|
|
def get_archive_classes() -> list:
    """Return the sorted stems of the archive custodian class files.

    Every ``*Archive*.yaml`` file directly inside ``CLASSES_DIR`` is a
    candidate; a candidate is dropped when its stem contains any of the
    substrings listed in ``SKIP_PATTERNS``.
    """
    stems = (path.stem for path in CLASSES_DIR.glob("*Archive*.yaml"))
    return sorted(
        stem
        for stem in stems
        if not any(pattern in stem for pattern in SKIP_PATTERNS)
    )
|
|
|
|
|
|
# Slot that links a custodian class to the record set types it holds.
_RECORD_SET_SLOT = "holds_record_set_types"

# An ``imports:`` key followed by its run of ``- item`` lines.  The list
# indentation is matched loosely so it can be detected instead of assumed.
_IMPORTS_SECTION = re.compile(
    r'^[ \t]*imports:[ \t]*\n((?:[ \t]*- [^\n]+\n)+)', re.MULTILINE
)

# One ``- item`` line of a YAML block list; group 1 is the item text.
_LIST_ITEM = re.compile(r'^[ \t]*- ([^\n]+)$', re.MULTILINE)


def fix_archive_file(archive_name: str) -> None:
    """Fix one archive class file so it integrates with its RecordSetTypes.

    Operates on ``CLASSES_DIR/<archive_name>.yaml`` and performs three steps:

    1. Appends missing entries to the ``imports:`` list.
    2. Ensures the custodian class named ``archive_name`` has exactly one
       ``slots:`` list and that it contains ``holds_record_set_types``
       (duplicate ``slots:`` lists of that class are merged).
    3. Appends a ``<archive_name>RecordSetType`` class at the end of the file
       when the file does not define one yet.

    The file is rewritten only when its content actually changed, so running
    the function twice on the same file is a no-op the second time.

    Args:
        archive_name: Stem of the class file, which is also expected to be
            the name of the custodian class defined in it.

    Returns:
        None.  Progress is reported on stdout.  The file is skipped when it,
        or its ``<archive_name>RecordSetTypes.yaml`` companion, is missing.
    """
    file_path = CLASSES_DIR / f"{archive_name}.yaml"
    rst_file = CLASSES_DIR / f"{archive_name}RecordSetTypes.yaml"

    if not file_path.exists():
        print(f" Skipping {archive_name} - file not found")
        return

    if not rst_file.exists():
        print(f" Skipping {archive_name} - no RecordSetTypes file")
        return

    # Explicit encoding: schema files must not depend on the locale default.
    original = file_path.read_text(encoding="utf-8")

    rst_type_name = f"{archive_name}RecordSetType"
    has_rst_type = f"{rst_type_name}:" in original

    content = _add_missing_imports(original, archive_name, has_rst_type)
    content = _ensure_record_set_slot(content, archive_name)

    if not has_rst_type:
        # Appending at the end assumes ``classes:`` is the last top-level
        # section of the file, as the appended text is a class definition.
        located = _find_class_block(content, archive_name)
        class_indent = located[2] if located else "  "
        content = (
            content.rstrip()
            + "\n"
            + _render_record_set_type(archive_name, rst_type_name, class_indent)
        )

    if content == original:
        print(f" {archive_name}.yaml - no changes needed")
        return

    file_path.write_text(content, encoding="utf-8")
    print(f" Fixed {archive_name}.yaml")


def _list_indent(items_text: str) -> str:
    """Return the leading whitespace of the last line of a block of list items."""
    last_line = items_text.rstrip("\n").rsplit("\n", 1)[-1]
    return last_line[: len(last_line) - len(last_line.lstrip(" \t"))]


def _add_missing_imports(content: str, archive_name: str, has_rst_type: bool) -> str:
    """Append the imports the archive class needs to the ``imports:`` list.

    Presence is tested by substring on the whole file.  New entries reuse the
    indentation of the existing list.  ``content`` is returned unchanged when
    nothing is missing or when the file has no ``imports:`` list.
    """
    wanted = []
    if f"./{archive_name}RecordSetTypes" not in content:
        wanted.append(f"./{archive_name}RecordSetTypes # Imports concrete subclasses")
    if "../slots/holds_record_set_types" not in content:
        wanted.append("../slots/holds_record_set_types # Links custodian to record set types")
    if not has_rst_type:
        # Only needed by the RecordSetType class that will be appended.
        if "./CollectionType" not in content:
            wanted.append("./CollectionType")
        if "../slots/type_scope" not in content:
            wanted.append("../slots/type_scope")

    if not wanted:
        return content

    section = _IMPORTS_SECTION.search(content)
    if section is None:
        print(f" Warning: {archive_name}.yaml has no imports list - imports not added")
        return content

    indent = _list_indent(section.group(1))
    addition = "".join(f"{indent}- {entry}\n" for entry in wanted)
    # Splice at the matched position; str.replace would also rewrite any
    # other identical run of text elsewhere in the file.
    return content[: section.end()] + addition + content[section.end():]


def _find_class_block(content: str, class_name: str):
    """Locate the YAML block of ``class_name``.

    Returns ``(start, end, indent)`` where ``content[start:end]`` runs from
    the ``<class_name>:`` header line up to (not including) the next
    non-blank line indented no deeper than the header, and ``indent`` is the
    header's leading whitespace.  Returns ``None`` when no header is found.
    """
    header = re.search(
        rf'^([ \t]*){re.escape(class_name)}:[ \t]*(?:#[^\n]*)?$',
        content,
        re.MULTILINE,
    )
    if header is None:
        return None

    indent = header.group(1)
    # First later line whose first non-blank character sits at or left of the
    # header's column: that line belongs to whatever follows the class.
    sibling = re.compile(rf'^[ \t]{{0,{len(indent)}}}\S', re.MULTILINE)
    following = sibling.search(content, header.end())
    end = following.start() if following else len(content)
    return header.start(), end, indent


def _ensure_record_set_slot(content: str, archive_name: str) -> str:
    """Give the custodian class a single ``slots:`` list with the link slot.

    Only the block of the class named ``archive_name`` is inspected, so the
    ``slots:`` lists of other classes in the same file are never touched.
    """
    located = _find_class_block(content, archive_name)
    if located is None:
        print(f" Warning: class {archive_name} not found in {archive_name}.yaml - slots not updated")
        return content

    start, end, class_indent = located
    updated = _rewrite_class_slots(content[start:end], class_indent)
    if updated is None:
        # No class_uri line to anchor the new list on; keep the block as is
        # rather than dropping the slot lists that were found.
        print(f" Warning: {archive_name} has no class_uri line - slots not updated")
        return content
    return content[:start] + updated + content[end:]


def _rewrite_class_slots(block: str, class_indent: str):
    """Return ``block`` with one ``slots:`` list containing the link slot.

    ``block`` is the text of a single class, header line first.  Returns
    ``None`` when a new list has to be inserted but the class has no
    ``class_uri`` line to insert it after.
    """
    # Indentation of the class's own properties, taken from its first
    # property line; two spaces deeper than the header when the class is empty.
    first_property = re.search(r'^([ \t]+)\S', block.partition("\n")[2], re.MULTILINE)
    prop_indent = first_property.group(1) if first_property else class_indent + "  "

    slots_section = re.compile(
        rf'^{re.escape(prop_indent)}slots:[ \t]*\n((?:[ \t]*- [^\n]+\n)+)',
        re.MULTILINE,
    )
    sections = list(slots_section.finditer(block))

    if len(sections) == 1:
        only = sections[0]
        if _RECORD_SET_SLOT in only.group(0):
            return block
        addition = f"{_list_indent(only.group(1))}- {_RECORD_SET_SLOT}\n"
        return block[: only.end()] + addition + block[only.end():]

    # Zero or several lists: build one consolidated list.  Items are keyed by
    # the slot name before any trailing comment so that "name" and
    # "name # comment" are not emitted twice.
    slots = {}
    for section in sections:
        for item in _LIST_ITEM.findall(section.group(1)):
            slots.setdefault(item.split("#", 1)[0].strip(), item.rstrip())
    slots.setdefault(_RECORD_SET_SLOT, _RECORD_SET_SLOT)

    remainder = block
    for section in reversed(sections):
        remainder = remainder[: section.start()] + remainder[section.end():]

    anchor = re.search(
        rf'^{re.escape(prop_indent)}class_uri:[^\n]*\n?', remainder, re.MULTILINE
    )
    if anchor is None:
        return None

    item_indent = _list_indent(sections[0].group(1)) if sections else prop_indent + "  "
    items = "".join(f"{item_indent}- {slots[name]}\n" for name in sorted(slots))
    # A class_uri line that ends the file has no newline of its own.
    line_break = "" if anchor.group(0).endswith("\n") else "\n"
    insertion = (
        f"{line_break}\n{prop_indent}# Slots linking custodian to record set types\n"
        f"{prop_indent}slots:\n{items}"
    )
    return remainder[: anchor.end()] + insertion + remainder[anchor.end():]


def _render_record_set_type(archive_name: str, rst_type_name: str, class_indent: str) -> str:
    """Render the YAML text of the RecordSetType class for ``archive_name``.

    The text starts with a blank line and every non-blank line is prefixed
    with ``class_indent`` so the class lines up with the custodian class.
    """
    from datetime import timezone

    # Real UTC time: a "Z" suffix on a naive local time would be wrong.
    stamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    template = f'''# rico:RecordSetType base class for collection classification
{rst_type_name}:
  description: |
    A rico:RecordSetType for classifying collections held by {archive_name} custodians.

    **Dual-Class Pattern**:
    This class represents the COLLECTION type (rico:RecordSetType).
    For the custodian organization type, see `{archive_name}`.
  is_a: CollectionType
  class_uri: rico:RecordSetType
  slots:
    - type_scope
  see_also:
    - {archive_name}
    - rico:RecordSetType
  annotations:
    custodian_types: '["A"]'
    custodian_types_rationale: "{rst_type_name} classifies collections held by ARCHIVE (A) type custodians"
    linked_custodian_type: {archive_name}
    dual_class_pattern: collection_type
    specificity_score: 0.7
    specificity_rationale: Type taxonomy class.
    specificity_annotation_timestamp: '{stamp}'
    specificity_annotation_agent: opencode-claude-sonnet-4
    template_specificity:
      archive_search: 0.2
      museum_search: 0.75
      library_search: 0.75
      collection_discovery: 0.75
      person_research: 0.75
      location_browse: 0.75
      identifier_lookup: 0.75
      organizational_change: 0.75
      digital_platform: 0.75
      general_heritage: 0.75
'''
    indented = "".join(
        class_indent + line if line.strip() else line
        for line in template.splitlines(keepends=True)
    )
    return "\n" + indented
|
|
|
|
|
|
def main():
    """Print a banner, then fix every discovered archive class file.

    Names listed in ``ALREADY_PROPER`` are reported and left untouched;
    every other name is handed to ``fix_archive_file``.
    """
    rule = "=" * 70
    print(rule)
    print("Archive Class File Fixer")
    print(rule)

    names = get_archive_classes()
    print(f"\nProcessing {len(names)} archive class files\n")

    for name in names:
        if archive_is_done := name in ALREADY_PROPER:
            print(f" Skipping {name} - already properly configured")
        if not archive_is_done:
            fix_archive_file(name)

    print("\nDone!")
|
|
|
|
|
|
# Run the fixer only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()
|