glam/scripts/add_specificity_annotations.py
kempersc 11983014bb Enhance specificity scoring system integration with existing infrastructure
- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework.
- Added detailed mapping of SPARQL templates to context templates for improved specificity filtering.
- Implemented wrapper patterns around existing classifiers to extend functionality without duplication.
- Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality.
- Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
2026-01-05 17:37:49 +01:00

755 lines
26 KiB
Python

#!/usr/bin/env python3
"""
Add specificity score annotations to all LinkML class files.
This script adds Rule 37-compliant specificity_score annotations with proper
provenance to enable intelligent RAG retrieval filtering.
Provenance Statement:
- statement_created_at: When this annotation was added
- source_archived_at: N/A (schema files, not web content)
- annotation_agent: opencode-claude-sonnet-4
- annotation_rationale: Rule 37 compliance for RAG filtering
Usage:
python scripts/add_specificity_annotations.py [--dry-run]
"""
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import yaml
# Provenance metadata stamped onto every annotation written during this run.
ANNOTATION_TIMESTAMP = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
ANNOTATION_AGENT = "opencode-claude-sonnet-4"

# Directory holding the LinkML class modules (relative to the working directory).
SCHEMA_DIR = Path("schemas/20251121/linkml/modules/classes")

# Context templates, in the order their scores appear in every table row below
# and in which they end up in each entry's "template_specificity" mapping.
_TEMPLATE_NAMES = (
    "archive_search",
    "museum_search",
    "library_search",
    "collection_discovery",
    "person_research",
    "location_browse",
    "identifier_lookup",
    "organizational_change",
    "digital_platform",
    "general_heritage",
)


def _score_entry(specificity_score, rationale, *template_scores):
    """Build one score entry from a base score, a rationale and one score per template.

    ``template_scores`` must list exactly one value per name in
    ``_TEMPLATE_NAMES``, in that order.
    """
    if len(template_scores) != len(_TEMPLATE_NAMES):
        raise ValueError(
            f"expected {len(_TEMPLATE_NAMES)} template scores, got {len(template_scores)}"
        )
    return {
        "specificity_score": specificity_score,
        "rationale": rationale,
        "template_specificity": dict(zip(_TEMPLATE_NAMES, template_scores)),
    }


# Specificity score mappings based on class semantics.
# Lower scores = more broadly relevant = more likely included in RAG context.
#
# Row layout: base score, rationale, then the ten template scores in
# _TEMPLATE_NAMES order (archive, museum, library, collection, person,
# location, identifier, organizational change, digital platform, general).

# Default scores keyed by the class's custodian_types annotation value.
CUSTODIAN_TYPE_SCORES = {
    # Universal classes (apply to all types)
    '["*"]': _score_entry(
        0.2, "Core class applicable to all custodian types.",
        0.2, 0.2, 0.2, 0.3, 0.4, 0.3, 0.2, 0.3, 0.4, 0.1,
    ),
    # Archive-specific classes
    '["A"]': _score_entry(
        0.7, "Archive-specific class - highly relevant to archival contexts.",
        0.2, 0.9, 0.8, 0.5, 0.7, 0.8, 0.5, 0.5, 0.8, 0.6,
    ),
    # Museum-specific classes
    '["M"]': _score_entry(
        0.7, "Museum-specific class - highly relevant to museum contexts.",
        0.9, 0.2, 0.9, 0.4, 0.7, 0.5, 0.5, 0.6, 0.7, 0.5,
    ),
    # Library-specific classes
    '["L"]': _score_entry(
        0.7, "Library-specific class - highly relevant to library contexts.",
        0.8, 0.9, 0.2, 0.4, 0.7, 0.6, 0.4, 0.6, 0.6, 0.5,
    ),
    # Research-specific classes
    '["R"]': _score_entry(
        0.7, "Research organization-specific class.",
        0.7, 0.8, 0.4, 0.5, 0.4, 0.7, 0.5, 0.6, 0.5, 0.5,
    ),
    # Digital platform classes
    '["D"]': _score_entry(
        0.75, "Digital platform-specific class.",
        0.8, 0.8, 0.7, 0.5, 0.8, 0.9, 0.6, 0.7, 0.2, 0.6,
    ),
    # Society/association classes
    '["S"]': _score_entry(
        0.75, "Heritage society/association-specific class.",
        0.6, 0.7, 0.7, 0.5, 0.5, 0.6, 0.6, 0.4, 0.7, 0.4,
    ),
    # Intangible heritage classes
    '["I"]': _score_entry(
        0.8, "Intangible heritage-specific class.",
        0.7, 0.6, 0.7, 0.4, 0.5, 0.6, 0.7, 0.6, 0.7, 0.4,
    ),
    # Holy sites classes
    '["H"]': _score_entry(
        0.8, "Religious heritage/holy sites-specific class.",
        0.5, 0.6, 0.6, 0.5, 0.6, 0.4, 0.6, 0.6, 0.8, 0.4,
    ),
    # Feature/monument classes
    '["F"]': _score_entry(
        0.8, "Physical feature/monument-specific class.",
        0.8, 0.7, 0.9, 0.6, 0.8, 0.2, 0.6, 0.7, 0.8, 0.5,
    ),
    # Corporation/commercial classes
    '["C"]': _score_entry(
        0.8, "Corporate heritage-specific class.",
        0.5, 0.7, 0.7, 0.5, 0.6, 0.7, 0.5, 0.4, 0.7, 0.5,
    ),
}

# Special class-specific overrides, keyed by exact class name.
CLASS_SPECIFIC_SCORES = {
    # Core hub classes - very broadly relevant
    "Custodian": _score_entry(
        0.15, "Central hub class - relevant to virtually all queries.",
        0.15, 0.15, 0.15, 0.2, 0.3, 0.2, 0.15, 0.2, 0.3, 0.1,
    ),
    # Identifier classes - very important for lookups
    "Identifier": _score_entry(
        0.25, "Core identifier class - essential for lookup queries.",
        0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.1, 0.4, 0.4, 0.3,
    ),
    "CustodianIdentifier": _score_entry(
        0.25, "Core identifier class - essential for lookup queries.",
        0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.1, 0.4, 0.4, 0.3,
    ),
    # Location/Place classes
    "CustodianPlace": _score_entry(
        0.3, "Place aspect class - important for location queries.",
        0.5, 0.5, 0.5, 0.5, 0.6, 0.1, 0.5, 0.4, 0.6, 0.3,
    ),
    "GeoSpatialPlace": _score_entry(
        0.35, "Geographic coordinates class - essential for location queries.",
        0.6, 0.6, 0.6, 0.5, 0.7, 0.1, 0.6, 0.5, 0.7, 0.4,
    ),
    "Settlement": _score_entry(
        0.35, "Settlement class - important for location context.",
        0.5, 0.5, 0.5, 0.5, 0.6, 0.15, 0.5, 0.5, 0.6, 0.4,
    ),
    "Country": _score_entry(
        0.3, "Country class - fundamental geographic context.",
        0.4, 0.4, 0.4, 0.4, 0.5, 0.2, 0.4, 0.4, 0.5, 0.3,
    ),
    # Collection classes
    "Collection": _score_entry(
        0.3, "Collection class - broadly relevant for collection queries.",
        0.3, 0.3, 0.3, 0.1, 0.5, 0.5, 0.4, 0.5, 0.4, 0.3,
    ),
    "CustodianCollection": _score_entry(
        0.3, "Custodian collection aspect - broadly relevant.",
        0.3, 0.3, 0.3, 0.1, 0.5, 0.5, 0.4, 0.5, 0.4, 0.3,
    ),
    # Person/Staff classes
    "PersonObservation": _score_entry(
        0.5, "Person observation class - important for person research.",
        0.6, 0.6, 0.6, 0.6, 0.1, 0.7, 0.5, 0.4, 0.7, 0.5,
    ),
    "PersonName": _score_entry(
        0.5, "Person name class - important for person research.",
        0.6, 0.6, 0.6, 0.7, 0.15, 0.8, 0.4, 0.5, 0.7, 0.5,
    ),
    "StaffRole": _score_entry(
        0.55, "Staff role class - relevant for person/organizational queries.",
        0.6, 0.6, 0.6, 0.7, 0.2, 0.8, 0.6, 0.3, 0.7, 0.5,
    ),
    "WorkExperience": _score_entry(
        0.6, "Work experience class - specific to person research.",
        0.7, 0.7, 0.7, 0.8, 0.2, 0.8, 0.7, 0.4, 0.8, 0.6,
    ),
    # Observation/Provenance classes
    "CustodianObservation": _score_entry(
        0.4, "Observation class - important for provenance tracking.",
        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.4, 0.3, 0.5, 0.4,
    ),
    "ReconstructedEntity": _score_entry(
        0.45, "Base reconstructed entity class - foundational for provenance.",
        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.4, 0.4, 0.5, 0.4,
    ),
    # Digital platform classes
    "DigitalPlatformType": _score_entry(
        0.6, "Digital platform type taxonomy - specific to digital contexts.",
        0.7, 0.7, 0.6, 0.5, 0.8, 0.9, 0.6, 0.7, 0.15, 0.5,
    ),
    "SocialMediaProfile": _score_entry(
        0.65, "Social media profile class - specific to digital presence.",
        0.8, 0.8, 0.8, 0.7, 0.4, 0.9, 0.5, 0.7, 0.2, 0.6,
    ),
    # Organizational change classes
    "CustodianLegalStatus": _score_entry(
        0.5, "Legal status class - important for organizational queries.",
        0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.4, 0.15, 0.7, 0.4,
    ),
    # Access/Policy classes
    "AccessPolicy": _score_entry(
        0.55, "Access policy class - moderately specific.",
        0.4, 0.5, 0.4, 0.4, 0.7, 0.7, 0.6, 0.5, 0.5, 0.5,
    ),
    # Standard/metadata classes
    "Standard": _score_entry(
        0.6, "Standard class - specific to technical/metadata contexts.",
        0.5, 0.6, 0.5, 0.5, 0.8, 0.8, 0.4, 0.7, 0.4, 0.5,
    ),
}

# Fallback entry for classes matched by neither table above.
DEFAULT_SCORES = _score_entry(
    0.5, "General heritage class with moderate specificity.",
    0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
)
def get_scores_for_class(class_name: str, custodian_types: Optional[str]) -> dict:
    """Select the specificity score entry for a LinkML class.

    Resolution order:

    1. an exact ``class_name`` match in ``CLASS_SPECIFIC_SCORES``;
    2. a match of ``custodian_types`` against the keys of
       ``CUSTODIAN_TYPE_SCORES`` (for example ``["A"]``);
    3. a generic "multiple custodian types" entry when the value lists more
       than one type;
    4. ``DEFAULT_SCORES``.

    Args:
        class_name: Name of the class being annotated.
        custodian_types: Raw ``custodian_types`` annotation value, or ``None``
            when the class has none. Surrounding whitespace and quote
            characters are ignored, so ``["A"]`` and the YAML-quoted
            ``'["A"]'`` resolve to the same entry.

    Returns:
        A dict with the keys ``specificity_score``, ``rationale`` and
        ``template_specificity``. For cases 1, 2 and 4 this is the shared
        module-level dict itself, so callers must treat it as read-only.
    """
    # A class-specific override always wins.
    if class_name in CLASS_SPECIFIC_SCORES:
        return CLASS_SPECIFIC_SCORES[class_name]

    # Normalise the lookup key. Without this, a value that still carries its
    # YAML quoting never equals a table key and never starts with '["', so
    # every class silently falls through to the default entry.
    types_key = (custodian_types or "").strip().strip("'\"").strip()

    if types_key in CUSTODIAN_TYPE_SCORES:
        return CUSTODIAN_TYPE_SCORES[types_key]

    # Several custodian types: broader than any single-type class, so it gets
    # a lower (less specific) score than the single-type entries.
    if types_key.startswith('["') and "," in types_key:
        return {
            "specificity_score": 0.4,
            "rationale": "Class applies to multiple custodian types.",
            "template_specificity": {
                "archive_search": 0.4,
                "museum_search": 0.4,
                "library_search": 0.4,
                "collection_discovery": 0.4,
                "person_research": 0.5,
                "location_browse": 0.5,
                "identifier_lookup": 0.4,
                "organizational_change": 0.4,
                "digital_platform": 0.5,
                "general_heritage": 0.4,
            },
        }

    # Nothing matched.
    return DEFAULT_SCORES
def format_template_specificity(template_scores: dict, *, indent: str = " ") -> str:
    """Render per-template scores as ``key: value`` lines for a YAML mapping.

    Args:
        template_scores: Mapping of template name to score. Entries are
            emitted in the mapping's iteration order.
        indent: Whitespace placed in front of every line. The default keeps
            the prefix this function has always produced; pass the nesting
            depth of the target mapping to get correctly indented YAML.

    Returns:
        The lines joined with ``"\\n"`` and no trailing newline. An empty
        mapping yields an empty string.
    """
    return "\n".join(
        f"{indent}{template}: {score}" for template, score in template_scores.items()
    )
def _find_class_header(content: str) -> Optional["re.Match[str]"]:
    """Find the first indented bare ``Name:`` line, searching below ``classes:`` if present.

    Group 1 of the returned match is the line's indentation, group 2 the name.
    """
    classes_key = re.search(r"^classes:[ \t]*\n", content, re.MULTILINE)
    search_from = classes_key.end() if classes_key else 0
    header = re.compile(r"^([ \t]+)(\w+):[ \t]*(?:\n|\Z)", re.MULTILINE)
    return header.search(content, search_from)


def _indent_step(outer: Optional[str], inner: str) -> str:
    """Return the whitespace ``inner`` adds on top of ``outer`` (two spaces if unknown)."""
    if outer is not None and len(inner) > len(outer) and inner.startswith(outer):
        return inner[len(outer):]
    return "  "


def add_specificity_annotations(file_path: Path, dry_run: bool = False) -> bool:
    """Add specificity annotations to a LinkML class file.

    If the file already has an ``annotations:`` mapping, the new keys are
    appended after its existing entries and the class's ``custodian_types``
    value (when present) selects the scores. Otherwise a new ``annotations:``
    mapping is inserted after the ``class_uri`` line, else after the ``is_a``
    line, else directly under the class-name line.

    Indentation of the inserted lines is taken from the surrounding lines of
    the file rather than hard-coded, so the result nests correctly whatever
    indent width the file uses.

    NOTE(review): the first ``annotations:`` key in the file is used; this
    assumes it is the class-level one - confirm for files whose slot_usage
    carries annotations ahead of the class's own.

    Args:
        file_path: YAML file to update; read and written as UTF-8.
        dry_run: When true, report what would change without writing.

    Returns:
        True if the file was (or, in dry-run mode, would be) modified; False
        if it already contains ``specificity_score:`` or no insertion point
        could be found.
    """
    content = file_path.read_text(encoding="utf-8")

    # Idempotence guard: never annotate a file twice.
    if "specificity_score:" in content:
        return False

    class_header = _find_class_header(content)
    class_name = class_header.group(2) if class_header else file_path.stem
    class_indent = class_header.group(1) if class_header else None

    # An `annotations:` key followed by its child lines, i.e. the lines
    # indented deeper than the key itself (enforced by the \1 backreference).
    annotations = re.search(
        r"^([ \t]+)annotations:[ \t]*(?:\n|\Z)((?:\1[ \t]+[^\n]*(?:\n|\Z))*)",
        content,
        re.MULTILINE,
    )

    if annotations:
        key_indent = annotations.group(1)
        existing = annotations.group(2)
        types_match = re.search(r"custodian_types: '(\[.*?\])'", existing)
        custodian_types = types_match.group(1) if types_match else None
        # Line up with the entries already present, if there are any.
        first_child = re.match(r"[ \t]+", existing)
        if first_child:
            child_indent = first_child.group(0)
        else:
            child_indent = key_indent + _indent_step(class_indent, key_indent)
        insert_at = annotations.end()
        needs_header = False
    else:
        custodian_types = None
        needs_header = True
        sibling = None
        for key in ("class_uri", "is_a"):
            sibling = re.search(
                rf"^([ \t]+){key}:[ \t]+[^\n]+(?:\n|\Z)", content, re.MULTILINE
            )
            if sibling:
                break
        if sibling:
            # `annotations:` becomes a sibling of the matched key.
            key_indent = sibling.group(1)
            insert_at = sibling.end()
        elif class_header:
            # `annotations:` becomes the first child of the class; reuse the
            # indentation of the class's existing first child when it has one.
            insert_at = class_header.end()
            next_indent = re.compile(r"[ \t]*").match(content, insert_at).group(0)
            if len(next_indent) > len(class_indent) and next_indent.startswith(class_indent):
                key_indent = next_indent
            else:
                key_indent = class_indent + "  "
        else:
            print(f" WARNING: Could not find insertion point in {file_path}")
            return False
        child_indent = key_indent + _indent_step(class_indent, key_indent)

    # Pass the bracketed value exactly as extracted: the score tables are
    # keyed by that text, so adding quotes around it would defeat the lookup.
    scores = get_scores_for_class(class_name, custodian_types)

    nested_indent = child_indent + _indent_step(key_indent, child_indent)
    specificity_score = scores["specificity_score"]
    rationale = scores["rationale"]
    template_lines = format_template_specificity(scores["template_specificity"]).splitlines()

    block_lines = [f"{key_indent}annotations:"] if needs_header else []
    block_lines += [
        f"{child_indent}specificity_score: {specificity_score}",
        f'{child_indent}specificity_rationale: "{rationale}"',
        f'{child_indent}specificity_annotation_timestamp: "{ANNOTATION_TIMESTAMP}"',
        f'{child_indent}specificity_annotation_agent: "{ANNOTATION_AGENT}"',
        f"{child_indent}template_specificity:",
    ]
    # Re-indent the helper's lines so they nest under `template_specificity:`.
    block_lines += [nested_indent + line.lstrip() for line in template_lines]
    block = "\n".join(block_lines) + "\n"

    # If the insertion point sits at the end of a file with no trailing
    # newline, start the block on a line of its own.
    if insert_at > 0 and content[insert_at - 1] != "\n":
        block = "\n" + block

    new_content = content[:insert_at] + block + content[insert_at:]

    if dry_run:
        print(f" Would update: {file_path}")
        return True

    file_path.write_text(new_content, encoding="utf-8")
    print(f" Updated: {file_path}")
    return True
def main():
    """Annotate every ``*.yaml`` file in ``SCHEMA_DIR`` and print a summary.

    Passing ``--dry-run`` on the command line reports what would change
    without writing any file. Exits with status 1 when ``SCHEMA_DIR`` does
    not exist. A failure on one file is reported and counted; processing
    then continues with the next file.
    """
    dry_run = "--dry-run" in sys.argv
    if dry_run:
        print("DRY RUN MODE - no files will be modified\n")

    print(f"Scanning {SCHEMA_DIR} for LinkML class files...\n")
    if not SCHEMA_DIR.exists():
        print(f"ERROR: Schema directory not found: {SCHEMA_DIR}")
        sys.exit(1)

    candidates = list(SCHEMA_DIR.glob("*.yaml"))
    print(f"Found {len(candidates)} YAML files\n")

    tally = {"modified": 0, "skipped": 0, "errors": 0}
    for candidate in sorted(candidates):
        try:
            changed = add_specificity_annotations(candidate, dry_run)
        except Exception as e:
            # Top-level boundary: report the failure and keep going.
            print(f" ERROR processing {candidate}: {e}")
            tally["errors"] += 1
            continue
        tally["modified" if changed else "skipped"] += 1

    print("\n" + "=" * 60)
    print("Summary:")
    print(f" Modified: {tally['modified']}")
    print(f" Skipped (already annotated): {tally['skipped']}")
    print(f" Errors: {tally['errors']}")
    if dry_run:
        print("\nRun without --dry-run to apply changes.")


if __name__ == "__main__":
    main()