#!/usr/bin/env python3
"""
Add specificity score annotations to all LinkML class files.

This script adds Rule 37-compliant specificity_score annotations with proper
provenance to enable intelligent RAG retrieval filtering.

Provenance Statement:
- statement_created_at: When this annotation was added
- source_archived_at: N/A (schema files, not web content)
- annotation_agent: opencode-claude-sonnet-4
- annotation_rationale: Rule 37 compliance for RAG filtering

Usage:
    python scripts/add_specificity_annotations.py [--dry-run]
"""

import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

try:
    import yaml
except ImportError:  # PyYAML is optional here: nothing in this script calls it.
    yaml = None

# Provenance metadata (the timestamp is fixed once per run, at import time)
ANNOTATION_TIMESTAMP = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
ANNOTATION_AGENT = "opencode-claude-sonnet-4"

# Schema directory
SCHEMA_DIR = Path("schemas/20251121/linkml/modules/classes")

# Query templates, in the order used by every score tuple below.
TEMPLATE_NAMES = (
    "archive_search",
    "museum_search",
    "library_search",
    "collection_discovery",
    "person_research",
    "location_browse",
    "identifier_lookup",
    "organizational_change",
    "digital_platform",
    "general_heritage",
)

# One YAML nesting level in the files this script edits.
_INDENT_STEP = "  "


def _profile(score: float, rationale: str, template_scores: tuple) -> dict:
    """Build one score record; *template_scores* follows TEMPLATE_NAMES order."""
    if len(template_scores) != len(TEMPLATE_NAMES):
        raise ValueError(
            f"expected {len(TEMPLATE_NAMES)} template scores, "
            f"got {len(template_scores)}"
        )
    return {
        "specificity_score": score,
        "rationale": rationale,
        "template_specificity": dict(zip(TEMPLATE_NAMES, template_scores)),
    }


# Specificity score mappings based on class semantics
# Lower scores = more broadly relevant = more likely included in RAG context

# Default scores by custodian_types pattern
CUSTODIAN_TYPE_SCORES = {
    # Universal classes (apply to all types)
    '["*"]': _profile(
        0.2, "Core class applicable to all custodian types.",
        (0.2, 0.2, 0.2, 0.3, 0.4, 0.3, 0.2, 0.3, 0.4, 0.1),
    ),
    # Archive-specific classes
    '["A"]': _profile(
        0.7, "Archive-specific class - highly relevant to archival contexts.",
        (0.2, 0.9, 0.8, 0.5, 0.7, 0.8, 0.5, 0.5, 0.8, 0.6),
    ),
    # Museum-specific classes
    '["M"]': _profile(
        0.7, "Museum-specific class - highly relevant to museum contexts.",
        (0.9, 0.2, 0.9, 0.4, 0.7, 0.5, 0.5, 0.6, 0.7, 0.5),
    ),
    # Library-specific classes
    '["L"]': _profile(
        0.7, "Library-specific class - highly relevant to library contexts.",
        (0.8, 0.9, 0.2, 0.4, 0.7, 0.6, 0.4, 0.6, 0.6, 0.5),
    ),
    # Research-specific classes
    '["R"]': _profile(
        0.7, "Research organization-specific class.",
        (0.7, 0.8, 0.4, 0.5, 0.4, 0.7, 0.5, 0.6, 0.5, 0.5),
    ),
    # Digital platform classes
    '["D"]': _profile(
        0.75, "Digital platform-specific class.",
        (0.8, 0.8, 0.7, 0.5, 0.8, 0.9, 0.6, 0.7, 0.2, 0.6),
    ),
    # Society/association classes
    '["S"]': _profile(
        0.75, "Heritage society/association-specific class.",
        (0.6, 0.7, 0.7, 0.5, 0.5, 0.6, 0.6, 0.4, 0.7, 0.4),
    ),
    # Intangible heritage classes
    '["I"]': _profile(
        0.8, "Intangible heritage-specific class.",
        (0.7, 0.6, 0.7, 0.4, 0.5, 0.6, 0.7, 0.6, 0.7, 0.4),
    ),
    # Holy sites classes
    '["H"]': _profile(
        0.8, "Religious heritage/holy sites-specific class.",
        (0.5, 0.6, 0.6, 0.5, 0.6, 0.4, 0.6, 0.6, 0.8, 0.4),
    ),
    # Feature/monument classes
    '["F"]': _profile(
        0.8, "Physical feature/monument-specific class.",
        (0.8, 0.7, 0.9, 0.6, 0.8, 0.2, 0.6, 0.7, 0.8, 0.5),
    ),
    # Corporation/commercial classes
    '["C"]': _profile(
        0.8, "Corporate heritage-specific class.",
        (0.5, 0.7, 0.7, 0.5, 0.6, 0.7, 0.5, 0.4, 0.7, 0.5),
    ),
}

# Special class-specific overrides (by exact class name)
CLASS_SPECIFIC_SCORES = {
    # Core hub classes - very broadly relevant
    "Custodian": _profile(
        0.15, "Central hub class - relevant to virtually all queries.",
        (0.15, 0.15, 0.15, 0.2, 0.3, 0.2, 0.15, 0.2, 0.3, 0.1),
    ),
    # Identifier classes - very important for lookups
    "Identifier": _profile(
        0.25, "Core identifier class - essential for lookup queries.",
        (0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.1, 0.4, 0.4, 0.3),
    ),
    "CustodianIdentifier": _profile(
        0.25, "Core identifier class - essential for lookup queries.",
        (0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.1, 0.4, 0.4, 0.3),
    ),
    # Location/Place classes
    "CustodianPlace": _profile(
        0.3, "Place aspect class - important for location queries.",
        (0.5, 0.5, 0.5, 0.5, 0.6, 0.1, 0.5, 0.4, 0.6, 0.3),
    ),
    "GeoSpatialPlace": _profile(
        0.35, "Geographic coordinates class - essential for location queries.",
        (0.6, 0.6, 0.6, 0.5, 0.7, 0.1, 0.6, 0.5, 0.7, 0.4),
    ),
    "Settlement": _profile(
        0.35, "Settlement class - important for location context.",
        (0.5, 0.5, 0.5, 0.5, 0.6, 0.15, 0.5, 0.5, 0.6, 0.4),
    ),
    "Country": _profile(
        0.3, "Country class - fundamental geographic context.",
        (0.4, 0.4, 0.4, 0.4, 0.5, 0.2, 0.4, 0.4, 0.5, 0.3),
    ),
    # Collection classes
    "Collection": _profile(
        0.3, "Collection class - broadly relevant for collection queries.",
        (0.3, 0.3, 0.3, 0.1, 0.5, 0.5, 0.4, 0.5, 0.4, 0.3),
    ),
    "CustodianCollection": _profile(
        0.3, "Custodian collection aspect - broadly relevant.",
        (0.3, 0.3, 0.3, 0.1, 0.5, 0.5, 0.4, 0.5, 0.4, 0.3),
    ),
    # Person/Staff classes
    "PersonObservation": _profile(
        0.5, "Person observation class - important for person research.",
        (0.6, 0.6, 0.6, 0.6, 0.1, 0.7, 0.5, 0.4, 0.7, 0.5),
    ),
    "PersonName": _profile(
        0.5, "Person name class - important for person research.",
        (0.6, 0.6, 0.6, 0.7, 0.15, 0.8, 0.4, 0.5, 0.7, 0.5),
    ),
    "StaffRole": _profile(
        0.55, "Staff role class - relevant for person/organizational queries.",
        (0.6, 0.6, 0.6, 0.7, 0.2, 0.8, 0.6, 0.3, 0.7, 0.5),
    ),
    "WorkExperience": _profile(
        0.6, "Work experience class - specific to person research.",
        (0.7, 0.7, 0.7, 0.8, 0.2, 0.8, 0.7, 0.4, 0.8, 0.6),
    ),
    # Observation/Provenance classes
    "CustodianObservation": _profile(
        0.4, "Observation class - important for provenance tracking.",
        (0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.4, 0.3, 0.5, 0.4),
    ),
    "ReconstructedEntity": _profile(
        0.45, "Base reconstructed entity class - foundational for provenance.",
        (0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.4, 0.4, 0.5, 0.4),
    ),
    # Digital platform classes
    "DigitalPlatformType": _profile(
        0.6, "Digital platform type taxonomy - specific to digital contexts.",
        (0.7, 0.7, 0.6, 0.5, 0.8, 0.9, 0.6, 0.7, 0.15, 0.5),
    ),
    "SocialMediaProfile": _profile(
        0.65, "Social media profile class - specific to digital presence.",
        (0.8, 0.8, 0.8, 0.7, 0.4, 0.9, 0.5, 0.7, 0.2, 0.6),
    ),
    # Organizational change classes
    "CustodianLegalStatus": _profile(
        0.5, "Legal status class - important for organizational queries.",
        (0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.4, 0.15, 0.7, 0.4),
    ),
    # Access/Policy classes
    "AccessPolicy": _profile(
        0.55, "Access policy class - moderately specific.",
        (0.4, 0.5, 0.4, 0.4, 0.7, 0.7, 0.6, 0.5, 0.5, 0.5),
    ),
    # Standard/metadata classes
    "Standard": _profile(
        0.6, "Standard class - specific to technical/metadata contexts.",
        (0.5, 0.6, 0.5, 0.5, 0.8, 0.8, 0.4, 0.7, 0.4, 0.5),
    ),
}

# Default fallback scores for classes without specific mappings
DEFAULT_SCORES = _profile(
    0.5, "General heritage class with moderate specificity.",
    (0.5,) * len(TEMPLATE_NAMES),
)

# Scores for classes whose custodian_types lists more than one type
_MULTI_TYPE_SCORES = _profile(
    0.4, "Class applies to multiple custodian types.",
    (0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.4, 0.4, 0.5, 0.4),
)

# An indented "annotations:" key plus every directly following line that is
# indented deeper than the key itself (i.e. the existing annotation entries).
_ANNOTATIONS_RE = re.compile(
    r"^(?P<indent> +)annotations:[ \t]*\n(?P<body>(?:(?P=indent) +[^\n]*\n)*)",
    re.MULTILINE,
)
# Single-quoted JSON-style list, e.g. custodian_types: '["A"]'
_CUSTODIAN_TYPES_RE = re.compile(r"custodian_types: '(\[.*?\])'")
# First indented bare mapping key ("  ClassName:") is taken as the class name.
_CLASS_NAME_RE = re.compile(r"^ +(\w+):[ \t]*$", re.MULTILINE)
# Where to insert a new annotations block, in order of preference. The second
# item is the extra indent the "annotations:" key needs relative to the anchor:
# class_uri / is_a are siblings of annotations, the class-name line is its parent.
_INSERTION_ANCHORS = (
    (re.compile(r"^(?P<indent> +)class_uri: [^\n]+\n", re.MULTILINE), ""),
    (re.compile(r"^(?P<indent> +)is_a: [^\n]+\n", re.MULTILINE), ""),
    (re.compile(r"^(?P<indent> +)\w+:[ \t]*\n", re.MULTILINE), _INDENT_STEP),
)


def get_scores_for_class(class_name: str, custodian_types: Optional[str]) -> dict:
    """Determine appropriate specificity scores for a class.

    Lookup order: exact class-name override, exact custodian_types key,
    multi-type list, then DEFAULT_SCORES.

    Args:
        class_name: LinkML class name, matched exactly against
            CLASS_SPECIFIC_SCORES.
        custodian_types: The custodian_types annotation value such as
            '["A"]', or None when the class has none. Surrounding whitespace
            and YAML quote characters are ignored.

    Returns:
        A dict with "specificity_score", "rationale" and
        "template_specificity" keys. The dict is shared module state; callers
        must not mutate it.
    """
    if class_name in CLASS_SPECIFIC_SCORES:
        return CLASS_SPECIFIC_SCORES[class_name]

    if custodian_types:
        # Keys in CUSTODIAN_TYPE_SCORES are the bare list text, so drop any
        # quotes that were carried over from the YAML scalar.
        types_key = custodian_types.strip().strip("'\"").strip()
        if types_key in CUSTODIAN_TYPE_SCORES:
            return CUSTODIAN_TYPE_SCORES[types_key]
        # Multiple custodian types - moderately broad relevance
        if types_key.startswith('["') and "," in types_key:
            return _MULTI_TYPE_SCORES

    return DEFAULT_SCORES


def format_template_specificity(template_scores: dict, indent: str = "        ") -> str:
    """Format template scores as the lines of a nested YAML mapping.

    Args:
        template_scores: Mapping of template name to score.
        indent: Prefix for every line. The default (8 spaces) is two nesting
            levels below a class-level ``annotations:`` key in a file that
            uses two-space indentation.

    Returns:
        One "<indent><template>: <score>" line per entry, joined with
        newlines and without a trailing newline.
    """
    return "\n".join(
        f"{indent}{template}: {score}" for template, score in template_scores.items()
    )


def _render_annotation_entries(scores: dict, indent: str) -> str:
    """Render the specificity entries, each prefixed by *indent*, newline-terminated."""
    nested = format_template_specificity(
        scores["template_specificity"], indent + _INDENT_STEP
    )
    return (
        f'{indent}specificity_score: {scores["specificity_score"]}\n'
        f'{indent}specificity_rationale: "{scores["rationale"]}"\n'
        f'{indent}specificity_annotation_timestamp: "{ANNOTATION_TIMESTAMP}"\n'
        f'{indent}specificity_annotation_agent: "{ANNOTATION_AGENT}"\n'
        f"{indent}template_specificity:\n"
        f"{nested}\n"
    )


def _plan_insertion(content: str, class_name: str) -> Optional[tuple]:
    """Return (offset, text) describing the edit, or None if no anchor exists."""
    annotations = _ANNOTATIONS_RE.search(content)
    if annotations:
        existing = annotations.group("body")
        types_match = _CUSTODIAN_TYPES_RE.search(existing)
        custodian_types = types_match.group(1) if types_match else None
        # Reuse the indentation of the entries already present; fall back to
        # one level below the key when the section is empty.
        first_entry = re.match(r" +", existing)
        entry_indent = (
            first_entry.group(0)
            if first_entry
            else annotations.group("indent") + _INDENT_STEP
        )
        scores = get_scores_for_class(class_name, custodian_types)
        return annotations.end(), _render_annotation_entries(scores, entry_indent)

    # No annotations section - add a complete one after the best anchor line.
    scores = get_scores_for_class(class_name, None)
    for pattern, extra_indent in _INSERTION_ANCHORS:
        anchor = pattern.search(content)
        if anchor:
            key_indent = anchor.group("indent") + extra_indent
            block = f"{key_indent}annotations:\n" + _render_annotation_entries(
                scores, key_indent + _INDENT_STEP
            )
            return anchor.end(), block
    return None


def add_specificity_annotations(file_path: Path, dry_run: bool = False) -> bool:
    """Add specificity annotations to a LinkML class file.

    The file is edited as text (not re-serialised) so that comments and
    layout are preserved. New entries are appended to the first indented
    ``annotations:`` section; if there is none, a new section is inserted
    after the ``class_uri`` line, else the ``is_a`` line, else the class-name
    line.

    Args:
        file_path: Path of the YAML file, read and written as UTF-8.
        dry_run: When True, report what would change but do not write.

    Returns:
        True if the file was (or, in dry-run mode, would be) modified; False
        if it already contains "specificity_score:" or no insertion point
        was found.
    """
    content = file_path.read_text(encoding="utf-8")

    # Skip if already has specificity_score
    if "specificity_score:" in content:
        return False

    # The patterns are line-oriented; make sure the last line is terminated so
    # an unterminated final annotation line is still recognised.
    if content and not content.endswith("\n"):
        content += "\n"

    name_match = _CLASS_NAME_RE.search(content)
    class_name = name_match.group(1) if name_match else file_path.stem

    plan = _plan_insertion(content, class_name)
    if plan is None:
        print(f"  WARNING: Could not find insertion point in {file_path}")
        return False

    if dry_run:
        print(f"  Would update: {file_path}")
        return True

    offset, addition = plan
    file_path.write_text(content[:offset] + addition + content[offset:], encoding="utf-8")
    print(f"  Updated: {file_path}")
    return True


def main():
    """Annotate every ``*.yaml`` file in SCHEMA_DIR and print a summary.

    Pass ``--dry-run`` on the command line to list the files that would change
    without writing them. Exits with status 1 if SCHEMA_DIR does not exist.
    """
    dry_run = "--dry-run" in sys.argv

    if dry_run:
        print("DRY RUN MODE - no files will be modified\n")

    print(f"Scanning {SCHEMA_DIR} for LinkML class files...\n")

    if not SCHEMA_DIR.exists():
        print(f"ERROR: Schema directory not found: {SCHEMA_DIR}")
        sys.exit(1)

    yaml_files = sorted(SCHEMA_DIR.glob("*.yaml"))
    print(f"Found {len(yaml_files)} YAML files\n")

    modified_count = 0
    skipped_count = 0
    error_count = 0

    for yaml_file in yaml_files:
        # Top-level boundary: one unreadable file must not abort the batch.
        try:
            modified = add_specificity_annotations(yaml_file, dry_run)
        except Exception as e:
            print(f"  ERROR processing {yaml_file}: {e}")
            error_count += 1
        else:
            if modified:
                modified_count += 1
            else:
                skipped_count += 1

    print(f"\n{'=' * 60}")
    print("Summary:")
    print(f"  Modified: {modified_count}")
    print(f"  Skipped (already annotated): {skipped_count}")
    print(f"  Errors: {error_count}")

    if dry_run:
        print("\nRun without --dry-run to apply changes.")


if __name__ == "__main__":
    main()