- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework. - Added detailed mapping of SPARQL templates to context templates for improved specificity filtering. - Implemented wrapper patterns around existing classifiers to extend functionality without duplication. - Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality. - Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
755 lines
26 KiB
Python
755 lines
26 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Add specificity score annotations to all LinkML class files.
|
|
|
|
This script adds Rule 37-compliant specificity_score annotations with proper
|
|
provenance to enable intelligent RAG retrieval filtering.
|
|
|
|
Provenance Statement:
|
|
- statement_created_at: When this annotation was added
|
|
- source_archived_at: N/A (schema files, not web content)
|
|
- annotation_agent: opencode-claude-sonnet-4
|
|
- annotation_rationale: Rule 37 compliance for RAG filtering
|
|
|
|
Usage:
|
|
python scripts/add_specificity_annotations.py [--dry-run]
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
import yaml
|
|
|
|
# Provenance metadata stamped onto every annotation block this script writes.
# The timestamp is taken once, at import time, in UTC (ISO-8601, "Z" suffix).
ANNOTATION_TIMESTAMP = f"{datetime.now(tz=timezone.utc):%Y-%m-%dT%H:%M:%SZ}"
ANNOTATION_AGENT = "opencode-claude-sonnet-4"

# Directory holding the LinkML class modules; resolved relative to the
# current working directory when the script runs.
SCHEMA_DIR = Path("schemas", "20251121", "linkml", "modules", "classes")
|
|
|
|
# Specificity score tables.
#
# A lower score means the class is more broadly relevant and therefore more
# likely to be included in RAG context. Every profile has the same shape:
#   {"specificity_score": float,
#    "rationale": str,
#    "template_specificity": {<template name>: float, ...}}

# Template names, in the order they are emitted into the YAML annotations.
_TEMPLATE_NAMES = (
    "archive_search",
    "museum_search",
    "library_search",
    "collection_discovery",
    "person_research",
    "location_browse",
    "identifier_lookup",
    "organizational_change",
    "digital_platform",
    "general_heritage",
)


def _profile(score, rationale, per_template):
    """Build one score profile; *per_template* follows ``_TEMPLATE_NAMES`` order."""
    if len(per_template) != len(_TEMPLATE_NAMES):
        raise ValueError(
            f"expected {len(_TEMPLATE_NAMES)} template scores, got {len(per_template)}"
        )
    return {
        "specificity_score": score,
        "rationale": rationale,
        "template_specificity": dict(zip(_TEMPLATE_NAMES, per_template)),
    }


# Default scores keyed by the literal text of the custodian_types annotation.
# Column order of each tuple: archive, museum, library, collection, person,
# location, identifier, organizational change, digital platform, general.
CUSTODIAN_TYPE_SCORES = {
    # Universal classes (apply to all types)
    '["*"]': _profile(
        0.2, "Core class applicable to all custodian types.",
        (0.2, 0.2, 0.2, 0.3, 0.4, 0.3, 0.2, 0.3, 0.4, 0.1),
    ),
    # Archive-specific classes
    '["A"]': _profile(
        0.7, "Archive-specific class - highly relevant to archival contexts.",
        (0.2, 0.9, 0.8, 0.5, 0.7, 0.8, 0.5, 0.5, 0.8, 0.6),
    ),
    # Museum-specific classes
    '["M"]': _profile(
        0.7, "Museum-specific class - highly relevant to museum contexts.",
        (0.9, 0.2, 0.9, 0.4, 0.7, 0.5, 0.5, 0.6, 0.7, 0.5),
    ),
    # Library-specific classes
    '["L"]': _profile(
        0.7, "Library-specific class - highly relevant to library contexts.",
        (0.8, 0.9, 0.2, 0.4, 0.7, 0.6, 0.4, 0.6, 0.6, 0.5),
    ),
    # Research-specific classes
    '["R"]': _profile(
        0.7, "Research organization-specific class.",
        (0.7, 0.8, 0.4, 0.5, 0.4, 0.7, 0.5, 0.6, 0.5, 0.5),
    ),
    # Digital platform classes
    '["D"]': _profile(
        0.75, "Digital platform-specific class.",
        (0.8, 0.8, 0.7, 0.5, 0.8, 0.9, 0.6, 0.7, 0.2, 0.6),
    ),
    # Society/association classes
    '["S"]': _profile(
        0.75, "Heritage society/association-specific class.",
        (0.6, 0.7, 0.7, 0.5, 0.5, 0.6, 0.6, 0.4, 0.7, 0.4),
    ),
    # Intangible heritage classes
    '["I"]': _profile(
        0.8, "Intangible heritage-specific class.",
        (0.7, 0.6, 0.7, 0.4, 0.5, 0.6, 0.7, 0.6, 0.7, 0.4),
    ),
    # Holy sites classes
    '["H"]': _profile(
        0.8, "Religious heritage/holy sites-specific class.",
        (0.5, 0.6, 0.6, 0.5, 0.6, 0.4, 0.6, 0.6, 0.8, 0.4),
    ),
    # Feature/monument classes
    '["F"]': _profile(
        0.8, "Physical feature/monument-specific class.",
        (0.8, 0.7, 0.9, 0.6, 0.8, 0.2, 0.6, 0.7, 0.8, 0.5),
    ),
    # Corporation/commercial classes
    '["C"]': _profile(
        0.8, "Corporate heritage-specific class.",
        (0.5, 0.7, 0.7, 0.5, 0.6, 0.7, 0.5, 0.4, 0.7, 0.5),
    ),
}

# Per-class overrides, keyed by exact class name. These take precedence over
# the custodian-type defaults above.
CLASS_SPECIFIC_SCORES = {
    # Core hub classes - very broadly relevant
    "Custodian": _profile(
        0.15, "Central hub class - relevant to virtually all queries.",
        (0.15, 0.15, 0.15, 0.2, 0.3, 0.2, 0.15, 0.2, 0.3, 0.1),
    ),
    # Identifier classes - very important for lookups
    "Identifier": _profile(
        0.25, "Core identifier class - essential for lookup queries.",
        (0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.1, 0.4, 0.4, 0.3),
    ),
    "CustodianIdentifier": _profile(
        0.25, "Core identifier class - essential for lookup queries.",
        (0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.1, 0.4, 0.4, 0.3),
    ),
    # Location/Place classes
    "CustodianPlace": _profile(
        0.3, "Place aspect class - important for location queries.",
        (0.5, 0.5, 0.5, 0.5, 0.6, 0.1, 0.5, 0.4, 0.6, 0.3),
    ),
    "GeoSpatialPlace": _profile(
        0.35, "Geographic coordinates class - essential for location queries.",
        (0.6, 0.6, 0.6, 0.5, 0.7, 0.1, 0.6, 0.5, 0.7, 0.4),
    ),
    "Settlement": _profile(
        0.35, "Settlement class - important for location context.",
        (0.5, 0.5, 0.5, 0.5, 0.6, 0.15, 0.5, 0.5, 0.6, 0.4),
    ),
    "Country": _profile(
        0.3, "Country class - fundamental geographic context.",
        (0.4, 0.4, 0.4, 0.4, 0.5, 0.2, 0.4, 0.4, 0.5, 0.3),
    ),
    # Collection classes
    "Collection": _profile(
        0.3, "Collection class - broadly relevant for collection queries.",
        (0.3, 0.3, 0.3, 0.1, 0.5, 0.5, 0.4, 0.5, 0.4, 0.3),
    ),
    "CustodianCollection": _profile(
        0.3, "Custodian collection aspect - broadly relevant.",
        (0.3, 0.3, 0.3, 0.1, 0.5, 0.5, 0.4, 0.5, 0.4, 0.3),
    ),
    # Person/Staff classes
    "PersonObservation": _profile(
        0.5, "Person observation class - important for person research.",
        (0.6, 0.6, 0.6, 0.6, 0.1, 0.7, 0.5, 0.4, 0.7, 0.5),
    ),
    "PersonName": _profile(
        0.5, "Person name class - important for person research.",
        (0.6, 0.6, 0.6, 0.7, 0.15, 0.8, 0.4, 0.5, 0.7, 0.5),
    ),
    "StaffRole": _profile(
        0.55, "Staff role class - relevant for person/organizational queries.",
        (0.6, 0.6, 0.6, 0.7, 0.2, 0.8, 0.6, 0.3, 0.7, 0.5),
    ),
    "WorkExperience": _profile(
        0.6, "Work experience class - specific to person research.",
        (0.7, 0.7, 0.7, 0.8, 0.2, 0.8, 0.7, 0.4, 0.8, 0.6),
    ),
    # Observation/Provenance classes
    "CustodianObservation": _profile(
        0.4, "Observation class - important for provenance tracking.",
        (0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.4, 0.3, 0.5, 0.4),
    ),
    "ReconstructedEntity": _profile(
        0.45, "Base reconstructed entity class - foundational for provenance.",
        (0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.4, 0.4, 0.5, 0.4),
    ),
    # Digital platform classes
    "DigitalPlatformType": _profile(
        0.6, "Digital platform type taxonomy - specific to digital contexts.",
        (0.7, 0.7, 0.6, 0.5, 0.8, 0.9, 0.6, 0.7, 0.15, 0.5),
    ),
    "SocialMediaProfile": _profile(
        0.65, "Social media profile class - specific to digital presence.",
        (0.8, 0.8, 0.8, 0.7, 0.4, 0.9, 0.5, 0.7, 0.2, 0.6),
    ),
    # Organizational change classes
    "CustodianLegalStatus": _profile(
        0.5, "Legal status class - important for organizational queries.",
        (0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.4, 0.15, 0.7, 0.4),
    ),
    # Access/Policy classes
    "AccessPolicy": _profile(
        0.55, "Access policy class - moderately specific.",
        (0.4, 0.5, 0.4, 0.4, 0.7, 0.7, 0.6, 0.5, 0.5, 0.5),
    ),
    # Standard/metadata classes
    "Standard": _profile(
        0.6, "Standard class - specific to technical/metadata contexts.",
        (0.5, 0.6, 0.5, 0.5, 0.8, 0.8, 0.4, 0.7, 0.4, 0.5),
    ),
}

# Fallback profile for classes matched by neither table above.
DEFAULT_SCORES = _profile(
    0.5, "General heritage class with moderate specificity.",
    (0.5,) * len(_TEMPLATE_NAMES),
)
|
|
|
|
|
|
def get_scores_for_class(class_name: str, custodian_types: Optional[str]) -> dict:
    """Determine the specificity score profile for a class.

    Resolution order:
      1. An exact ``class_name`` entry in ``CLASS_SPECIFIC_SCORES``.
      2. An exact ``custodian_types`` entry in ``CUSTODIAN_TYPE_SCORES``.
      3. A generic "multiple custodian types" profile when the value lists
         more than one type.
      4. ``DEFAULT_SCORES``.

    Args:
        class_name: Name of the LinkML class.
        custodian_types: Text of the ``custodian_types`` annotation, e.g.
            ``'["A"]'``. Surrounding YAML quotes and whitespace are tolerated
            and stripped before the lookup. ``None`` or an empty string means
            the class declares no custodian types.

    Returns:
        A dict with ``specificity_score``, ``rationale`` and
        ``template_specificity`` keys. For cases 1, 2 and 4 this is the shared
        table entry itself (not a copy), so callers must not mutate it.
    """
    # Class-specific overrides win over everything else.
    if class_name in CLASS_SPECIFIC_SCORES:
        return CLASS_SPECIFIC_SCORES[class_name]

    # The table keys are the bare list text (``["A"]``). Callers may hand over
    # the value still wrapped in its YAML quotes (``'["A"]'``), which would
    # never match, so normalise before looking anything up.
    normalized = custodian_types.strip().strip("'\"").strip() if custodian_types else ""

    if normalized in CUSTODIAN_TYPE_SCORES:
        return CUSTODIAN_TYPE_SCORES[normalized]

    # Several custodian types listed: the class is relevant to more than one
    # kind of institution, so it gets a moderately broad profile.
    if normalized.startswith('["') and "," in normalized:
        return {
            "specificity_score": 0.4,
            "rationale": "Class applies to multiple custodian types.",
            "template_specificity": {
                "archive_search": 0.4,
                "museum_search": 0.4,
                "library_search": 0.4,
                "collection_discovery": 0.4,
                "person_research": 0.5,
                "location_browse": 0.5,
                "identifier_lookup": 0.4,
                "organizational_change": 0.4,
                "digital_platform": 0.5,
                "general_heritage": 0.4,
            },
        }

    # Nothing matched: fall back to the neutral default profile.
    return DEFAULT_SCORES
|
|
|
|
|
|
def format_template_specificity(template_scores: dict, indent: str = " ") -> str:
    """Format per-template scores as YAML mapping lines.

    Args:
        template_scores: Mapping of template name to score. Entries are
            emitted in the mapping's iteration order.
        indent: Whitespace prefixed to every emitted line. The default keeps
            the previous hard-coded prefix; pass the nesting depth of the
            target YAML block when a different depth is required.
            NOTE(review): the hard-coded prefix looked whitespace-collapsed in
            the reviewed copy - confirm the default against the real file.

    Returns:
        One ``<indent><template>: <score>`` line per entry, joined with
        newlines and without a trailing newline. An empty mapping yields an
        empty string.
    """
    return "\n".join(
        f"{indent}{template}: {score}" for template, score in template_scores.items()
    )
|
|
|
|
|
|
def _detect_indent_unit(content: str) -> str:
    """Return the shallowest leading whitespace used in *content* (two spaces if none)."""
    indents = re.findall(r"^([ \t]+)\S", content, re.MULTILINE)
    return min(indents, key=len) if indents else "  "


def _find_class_header(content: str):
    """Locate the class header line; returns a match with ``indent``/``name`` groups or None.

    The first key nested under a top-level ``classes:`` is preferred; failing
    that, the first indented key that carries no inline value is used.
    """
    candidates = (
        r"^classes:[ \t]*\n(?:[ \t]*(?:#[^\n]*)?\n)*(?P<indent>[ \t]+)(?P<name>\w+):[ \t]*\n",
        r"^(?P<indent>[ \t]+)(?P<name>\w+):[ \t]*\n",
    )
    for pattern in candidates:
        found = re.search(pattern, content, re.MULTILINE)
        if found:
            return found
    return None


def _render_specificity_entries(scores: dict, entry_indent: str, nested_indent: str) -> str:
    """Render the specificity annotation entries as newline-terminated YAML text."""
    # Re-indent whatever the formatter produced so the nested mapping always
    # sits exactly one level below ``template_specificity:``.
    formatted = format_template_specificity(scores["template_specificity"])
    template_lines = [f"{nested_indent}{line.strip()}" for line in formatted.splitlines()]
    lines = [
        f"{entry_indent}specificity_score: {scores['specificity_score']}",
        f'{entry_indent}specificity_rationale: "{scores["rationale"]}"',
        f'{entry_indent}specificity_annotation_timestamp: "{ANNOTATION_TIMESTAMP}"',
        f'{entry_indent}specificity_annotation_agent: "{ANNOTATION_AGENT}"',
        f"{entry_indent}template_specificity:",
        *template_lines,
    ]
    return "\n".join(lines) + "\n"


def add_specificity_annotations(file_path: Path, dry_run: bool = False) -> bool:
    """Add specificity annotations to a LinkML class file.

    If the file already has an ``annotations:`` block, the new entries are
    appended to the end of that block, and its ``custodian_types`` value (when
    present) selects the score profile. Otherwise a new ``annotations:`` block
    is inserted after the first ``class_uri`` line, else after the first
    ``is_a`` line, else directly under the class header.

    Indentation of the inserted text is derived from the file itself (the
    existing annotation entries, or the anchor line plus the file's smallest
    indent step) rather than being hard-coded.

    Args:
        file_path: Path of the YAML file to update.
        dry_run: When true, report what would change without writing.

    Returns:
        True if the file was modified (or would be, in dry-run mode); False if
        it already contains ``specificity_score:`` or no insertion point was
        found.
    """
    # Explicit encoding: the locale default is platform-dependent.
    content = file_path.read_text(encoding="utf-8")

    # Skip files that are already annotated.
    if "specificity_score:" in content:
        return False

    # The line-based patterns below expect every line to end with a newline.
    if content and not content.endswith("\n"):
        content += "\n"

    unit = _detect_indent_unit(content)
    header = _find_class_header(content)
    class_name = header.group("name") if header else file_path.stem

    # An annotations block is the ``annotations:`` line plus all directly
    # following lines that are indented deeper than it.
    block = re.search(
        r"^([ \t]+)annotations:[ \t]*\n((?:\1[ \t]+[^\n]*\n)*)",
        content,
        re.MULTILINE,
    )

    if block:
        children = block.group(2)
        types_match = re.search(r"custodian_types:\s*'(\[.*?\])'", children)
        # Pass the bare list text: the score tables are keyed without quotes.
        custodian_types = types_match.group(1) if types_match else None
        scores = get_scores_for_class(class_name, custodian_types)

        # Align with the existing entries; an empty block gets one step more
        # than the ``annotations:`` key.
        first_child = re.match(r"[ \t]+", children)
        entry_indent = first_child.group(0) if first_child else block.group(1) + unit
        insert_at = block.end()
        addition = _render_specificity_entries(scores, entry_indent, entry_indent + unit)
    else:
        scores = get_scores_for_class(class_name, None)

        # Preferred anchors are sibling keys of the new ``annotations:`` key,
        # so it reuses their indentation.
        anchor = None
        for pattern in (r"^([ \t]+)class_uri:[ \t]+[^\n]+\n", r"^([ \t]+)is_a:[ \t]+[^\n]+\n"):
            anchor = re.search(pattern, content, re.MULTILINE)
            if anchor:
                break

        if anchor:
            key_indent = anchor.group(1)
            insert_at = anchor.end()
        elif header:
            # Directly under the class header: one step deeper than the header.
            key_indent = header.group("indent") + unit
            insert_at = header.end()
        else:
            print(f" WARNING: Could not find insertion point in {file_path}")
            return False

        entry_indent = key_indent + unit
        addition = f"{key_indent}annotations:\n" + _render_specificity_entries(
            scores, entry_indent, entry_indent + unit
        )

    new_content = content[:insert_at] + addition + content[insert_at:]

    if dry_run:
        print(f" Would update: {file_path}")
        return True

    file_path.write_text(new_content, encoding="utf-8")
    print(f" Updated: {file_path}")
    return True
|
|
|
|
|
|
def main():
    """Annotate every ``*.yaml`` file in ``SCHEMA_DIR`` and print a summary.

    Passing ``--dry-run`` anywhere on the command line reports what would be
    changed without writing. Exits with status 1 when ``SCHEMA_DIR`` does not
    exist. A failure on one file is reported and counted, and processing
    continues with the next file.
    """
    dry_run = "--dry-run" in sys.argv
    if dry_run:
        print("DRY RUN MODE - no files will be modified\n")

    print(f"Scanning {SCHEMA_DIR} for LinkML class files...\n")
    if not SCHEMA_DIR.exists():
        print(f"ERROR: Schema directory not found: {SCHEMA_DIR}")
        sys.exit(1)

    candidates = list(SCHEMA_DIR.glob("*.yaml"))
    print(f"Found {len(candidates)} YAML files\n")

    tally = {"modified": 0, "skipped": 0, "errors": 0}
    for path in sorted(candidates):
        try:
            changed = add_specificity_annotations(path, dry_run)
        except Exception as exc:
            # Top-level boundary: report and keep going so that one bad file
            # does not abort the whole run.
            print(f" ERROR processing {path}: {exc}")
            tally["errors"] += 1
            continue
        tally["modified" if changed else "skipped"] += 1

    print("\n" + "=" * 60)
    print("Summary:")
    print(f" Modified: {tally['modified']}")
    print(f" Skipped (already annotated): {tally['skipped']}")
    print(f" Errors: {tally['errors']}")

    if dry_run:
        print("\nRun without --dry-run to apply changes.")


if __name__ == "__main__":
    main()
|