# glam/scripts/apply_ch_annotator_algeria.py

#!/usr/bin/env python3
"""
Apply CH-Annotator Convention to Algeria GLAM Dataset

This script retroactively adds formal CH-Annotator (ch_annotator-v1_7_0)
provenance metadata to the Algerian heritage institutions dataset.

CH-Annotator v1.7.0 requirements:
1. Entity hypernym codes (GRP.HER for heritage institutions)
2. 5-component claim provenance model
3. Convention version reference
4. Ontology class mappings

Usage:
    python scripts/apply_ch_annotator_algeria.py
"""

import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


# =============================================================================
# CH-ANNOTATOR MAPPINGS
# =============================================================================

# Lookup table: GLAMORCUBESFIXPHDNT institution_type value -> CH-Annotator
# hypernym code.
#
# Most types map to a "GRP.HER.*" code. The exceptions visible in this table:
#   - CORPORATION, EDUCATION_PROVIDER and NGO use other "GRP.*" codes,
#   - FEATURES uses "TOP.MON", the only code here outside the "GRP" family,
#   - UNKNOWN maps to the bare "GRP.HER" code.
INSTITUTION_TYPE_TO_HYPERNYM = {
    "GALLERY": "GRP.HER.GAL",
    "LIBRARY": "GRP.HER.LIB",
    "ARCHIVE": "GRP.HER.ARC",
    "MUSEUM": "GRP.HER.MUS",
    "OFFICIAL_INSTITUTION": "GRP.HER.OFF",
    "RESEARCH_CENTER": "GRP.HER.RES",
    "CORPORATION": "GRP.COR",
    "UNKNOWN": "GRP.HER",
    "BOTANICAL_ZOO": "GRP.HER.BOT",
    "EDUCATION_PROVIDER": "GRP.EDU",
    "COLLECTING_SOCIETY": "GRP.HER.SOC",
    "FEATURES": "TOP.MON",  # Monuments are toponym-based
    "INTANGIBLE_HERITAGE_GROUP": "GRP.HER.INT",
    "MIXED": "GRP.HER.MIX",
    "PERSONAL_COLLECTION": "GRP.HER.PER",
    "HOLY_SITES": "GRP.HER.HOL",
    "DIGITAL_PLATFORM": "GRP.HER.DIG",
    "NGO": "GRP.ASS",
    "TASTE_SMELL": "GRP.HER.TAS",
}

# Lookup table: CH-Annotator hypernym code -> ontology class, written as a
# prefixed name (the prefixes in use are schema:, glam:, org: and crm:).
#
# The mapping is many-to-one: e.g. GRP.HER.GAL and GRP.HER.MUS both resolve to
# schema:Museum, and GRP.HER, GRP.HER.MIX and GRP.HER.TAS all resolve to
# glam:HeritageCustodian.
HYPERNYM_ONTOLOGY_CLASS = {
    "GRP.HER.GAL": "schema:Museum",
    "GRP.HER.LIB": "schema:Library",
    "GRP.HER.ARC": "schema:ArchiveOrganization",
    "GRP.HER.MUS": "schema:Museum",
    "GRP.HER.OFF": "schema:GovernmentOrganization",
    "GRP.HER.RES": "schema:ResearchOrganization",
    "GRP.COR": "schema:Corporation",
    "GRP.HER": "glam:HeritageCustodian",
    "GRP.HER.BOT": "schema:Zoo",
    "GRP.EDU": "schema:EducationalOrganization",
    "GRP.HER.SOC": "org:FormalOrganization",
    "TOP.MON": "crm:E22_Human-Made_Object",
    "GRP.HER.INT": "crm:E74_Group",
    "GRP.HER.MIX": "glam:HeritageCustodian",
    "GRP.HER.PER": "glam:PersonalCollection",
    "GRP.HER.HOL": "schema:PlaceOfWorship",
    "GRP.HER.DIG": "schema:WebSite",
    "GRP.ASS": "org:FormalOrganization",
    "GRP.HER.TAS": "glam:HeritageCustodian",
}


def get_hypernym_code(institution_type: str) -> str:
    """Translate an institution type into its CH-Annotator hypernym code.

    Types that are not keys of ``INSTITUTION_TYPE_TO_HYPERNYM`` resolve to
    the generic heritage code ``"GRP.HER"``.
    """
    if institution_type in INSTITUTION_TYPE_TO_HYPERNYM:
        return INSTITUTION_TYPE_TO_HYPERNYM[institution_type]
    return "GRP.HER"


def get_ontology_class(hypernym_code: str) -> str:
    """Resolve a hypernym code to its ontology class.

    Codes that are not keys of ``HYPERNYM_ONTOLOGY_CLASS`` resolve to
    ``"glam:HeritageCustodian"``.
    """
    if hypernym_code in HYPERNYM_ONTOLOGY_CLASS:
        return HYPERNYM_ONTOLOGY_CLASS[hypernym_code]
    return "glam:HeritageCustodian"


def create_ch_annotator_block(institution: dict, annotation_date: str) -> dict:
    """Create the CH-Annotator entity annotation block for an institution.

    Args:
        institution: One institution record. The keys read here are
            ``institution_type``, ``identifiers`` and ``provenance`` (whose
            ``conversation_id``, ``extraction_date``, ``confidence_score``
            and ``extraction_agent`` entries are used when present).
        annotation_date: Timestamp string for the moment the convention is
            applied; it is copied verbatim into the block.

    Returns:
        A new dict holding the convention identifiers plus the
        ``entity_classification``, ``extraction_provenance``,
        ``annotation_provenance`` and ``annotation_metadata`` sections.
        ``institution`` itself is not modified.
    """
    # A key that is present but null in the YAML loads as None, which the
    # .get() default does not cover -- hence the explicit `or` fallbacks.
    institution_type = institution.get("institution_type") or "UNKNOWN"
    hypernym_code = get_hypernym_code(institution_type)
    ontology_class = get_ontology_class(hypernym_code)

    # The top-level hypernym is the first dotted component of the code, so a
    # "TOP.MON" subtype is no longer reported under the "GRP" hypernym.
    # NOTE(review): the "TOPONYM" label is inferred, not taken from the
    # convention text -- confirm against the CH-Annotator hypernym list.
    hypernym_labels = {"GRP": "GROUP", "TOP": "TOPONYM"}
    hypernym = hypernym_code.split(".", 1)[0]
    hypernym_label = hypernym_labels.get(hypernym, hypernym)

    # Extract conversation_id from existing provenance
    provenance = institution.get("provenance") or {}
    conversation_id = provenance.get("conversation_id", "039a271a-f8e3-4bf3-9e89-b289ec80701d")
    extraction_date = provenance.get("extraction_date", "2025-11-09T00:00:00Z")
    confidence_score = provenance.get("confidence_score", 0.85)

    # rov:RegisteredOrganization is only offered when the record carries at
    # least one identifier; the relative order of the entries is preserved.
    alternative_classes = ["org:FormalOrganization"]
    if institution.get("identifiers"):
        alternative_classes.append("rov:RegisteredOrganization")
    alternative_classes.append("glam:HeritageCustodian")

    return {
        "convention_id": "ch_annotator-v1_7_0",
        "convention_version": "1.7.0",
        "annotation_date": annotation_date,

        # Entity classification
        "entity_classification": {
            "hypernym": hypernym,
            "hypernym_label": hypernym_label,
            "subtype": hypernym_code,
            "subtype_label": institution_type,
            "ontology_class": ontology_class,
            "alternative_classes": alternative_classes,
        },

        # Extraction provenance (5-component model)
        # Note: agent reflects the ORIGINAL extraction model from conversation
        # For Claude conversation exports, the specific model version is often unknown
        "extraction_provenance": {
            "namespace": "glam",
            "path": f"/conversations/{conversation_id}",
            "timestamp": extraction_date,
            "agent": provenance.get("extraction_agent", "claude-conversation"),  # Original extraction agent
            "context_convention": "ch_annotator-v1_7_0",
        },

        # CH-Annotator application provenance (separate from original extraction)
        "annotation_provenance": {
            "annotation_agent": "opencode-claude-sonnet-4",  # Model applying CH-Annotator NOW
            "annotation_date": annotation_date,
            "annotation_method": "retroactive CH-Annotator application via batch script",
        },

        # Confidence and verification
        "annotation_metadata": {
            "confidence_score": confidence_score,
            "verified": False,
            "verification_date": None,
            "verified_by": None,
            "annotation_notes": f"Retroactive CH-Annotator annotation applied {annotation_date}. "
                              f"Original extraction from Algerian GLAM conversation.",
        },
    }


def create_entity_claims(institution: dict) -> list:
    """Create CH-Annotator claims for key entity attributes.

    Up to four claims are produced, in this order and only when the
    underlying value is present and truthy: ``full_name``,
    ``institution_type``, ``located_in_city`` (city of the first location)
    and ``wikidata_id`` (first Wikidata identifier that has a value).

    Args:
        institution: One institution record. The keys read here are
            ``name``, ``institution_type``, ``locations``, ``identifiers``
            and ``provenance``. Keys that are missing or null are treated
            as absent.

    Returns:
        A list of claim dicts, each with ``claim_type``, ``claim_value``,
        ``property_uri``, ``provenance`` and ``confidence``. Every claim gets
        its own provenance dict. The list is empty when no attribute is
        available.
    """
    # A key that is present but null in the YAML loads as None, which the
    # .get() default does not cover -- hence the explicit `or` fallbacks.
    provenance = institution.get("provenance") or {}
    conversation_id = provenance.get("conversation_id", "039a271a-f8e3-4bf3-9e89-b289ec80701d")
    extraction_date = provenance.get("extraction_date", "2025-11-09T00:00:00Z")

    # Base provenance for all claims
    # Note: agent reflects original extraction, not CH-Annotator application
    base_provenance = {
        "namespace": "glam",
        "path": f"/conversations/{conversation_id}",
        "timestamp": extraction_date,
        "agent": provenance.get("extraction_agent", "claude-conversation"),
        "context_convention": "ch_annotator-v1_7_0",
    }

    claims = []

    # Claim 1: Institution name
    name = institution.get("name")
    if name:
        claims.append({
            "claim_type": "full_name",
            "claim_value": name,
            "property_uri": "skos:prefLabel",
            "provenance": base_provenance.copy(),
            "confidence": provenance.get("confidence_score", 0.9),
        })

    # Claim 2: Institution type
    institution_type = institution.get("institution_type")
    if institution_type:
        claims.append({
            "claim_type": "institution_type",
            "claim_value": institution_type,
            "property_uri": "rdf:type",
            "provenance": base_provenance.copy(),
            "confidence": 0.95,  # Type classification is usually reliable
        })

    # Claim 3: Location (city) -- only the first location is considered
    locations = institution.get("locations") or []
    first_location = locations[0] if isinstance(locations, (list, tuple)) and locations else None
    city = first_location.get("city") if isinstance(first_location, dict) else None
    if city:
        claims.append({
            "claim_type": "located_in_city",
            "claim_value": city,
            "property_uri": "schema:addressLocality",
            "provenance": base_provenance.copy(),
            "confidence": 0.9,
        })

    # Claim 4: Wikidata ID (if present). Entries that are malformed or lack a
    # value are skipped, so a later well-formed Wikidata entry can still match.
    for ident in institution.get("identifiers") or []:
        if not isinstance(ident, dict) or ident.get("identifier_scheme") != "Wikidata":
            continue
        wikidata_id = ident.get("identifier_value")
        if not wikidata_id:
            continue
        claims.append({
            "claim_type": "wikidata_id",
            "claim_value": wikidata_id,
            "property_uri": "owl:sameAs",
            "provenance": {
                **base_provenance,
                "namespace": "wikidata",
                "path": f"/entity/{wikidata_id}",
            },
            # NOTE(review): the original comment said these IDs were verified
            # via SPARQL; nothing in this function performs that check.
            "confidence": 0.98,
        })
        break  # at most one Wikidata claim per institution

    return claims


def apply_ch_annotator(input_path: Path, output_path: Path) -> dict:
    """Apply the CH-Annotator convention to a YAML dataset of institutions.

    Every institution in ``input_path`` receives a ``ch_annotator`` block
    (classification, provenance, metadata and entity claims). The annotated
    list is then written to ``output_path`` behind a comment header that
    summarises the run.

    Args:
        input_path: YAML file whose top-level value is a list of institution
            mappings. An empty file is treated as an empty list.
        output_path: Destination YAML file; it is overwritten if it exists.

    Returns:
        A stats dict with ``total_institutions``, ``annotations_added``,
        ``claims_created`` and ``by_hypernym`` (hypernym code -> count).

    Raises:
        TypeError: If the document is not a list, or an entry in it is not a
            mapping. Raised before the output file is touched.
    """
    # Load existing data
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load() yields None for an empty document; treat that as "no
    # institutions" instead of failing later while iterating over None.
    if data is None:
        data = []
    if not isinstance(data, list):
        raise TypeError(
            f"Expected a YAML list of institutions in {input_path}, "
            f"got {type(data).__name__}"
        )

    # One timestamp for the whole run so all annotations agree
    annotation_date = datetime.now(timezone.utc).isoformat()

    stats = {
        "total_institutions": 0,
        "annotations_added": 0,
        "claims_created": 0,
        "by_hypernym": {},
    }

    # Process each institution
    for index, institution in enumerate(data):
        if not isinstance(institution, dict):
            raise TypeError(
                f"Institution #{index} in {input_path} is not a mapping "
                f"(got {type(institution).__name__})"
            )
        stats["total_institutions"] += 1

        # Add CH-Annotator entity annotation block
        ch_annotation = create_ch_annotator_block(institution, annotation_date)
        institution["ch_annotator"] = ch_annotation
        stats["annotations_added"] += 1

        # Track hypernym distribution
        hypernym = ch_annotation["entity_classification"]["subtype"]
        stats["by_hypernym"][hypernym] = stats["by_hypernym"].get(hypernym, 0) + 1

        # Add entity claims
        claims = create_entity_claims(institution)
        ch_annotation["entity_claims"] = claims
        stats["claims_created"] += len(claims)

    # Build the header before opening the output so that a formatting failure
    # cannot leave a truncated file behind.
    header = """# Algerian GLAM Institutions - CH-Annotator Enhanced
# Last updated: {date}
# CH-Annotator Convention: ch_annotator-v1_7_0
#
# This file has been enhanced with formal CH-Annotator (Cultural Heritage Annotator)
# provenance metadata following the ch_annotator-v1_7_0 convention.
#
# CH-Annotator Features Applied:
# - Entity hypernym codes (GRP.HER.* for heritage institutions)
# - 5-component claim provenance model (namespace, path, timestamp, agent, convention)
# - Ontology class mappings (CIDOC-CRM, Schema.org, W3C Org)
# - Entity claims for key attributes (name, type, location, identifiers)
#
# Statistics:
# - Total institutions: {total}
# - Annotations added: {annotations}
# - Entity claims created: {claims}
# - Hypernym distribution: {hypernyms}
#
# Original Data Source: Algerian GLAM conversation (039a271a-f8e3-4bf3-9e89-b289ec80701d)
# Original Extraction Date: 2025-11-09
# Convention Applied: {date}

""".format(
        date=annotation_date,
        total=stats["total_institutions"],
        annotations=stats["annotations_added"],
        claims=stats["claims_created"],
        hypernyms=", ".join(f"{k}: {v}" for k, v in sorted(stats["by_hypernym"].items())),
    )

    # Write updated data
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return stats


def main():
    """Annotate the Algeria dataset and print a summary of the run.

    Reads ``data/instances/algeria/algerian_institutions_ghcid.yaml`` and
    writes ``algerian_institutions_ch_annotator.yaml`` next to it.
    """
    # Resolve the dataset relative to this script instead of through a
    # machine-specific absolute path.
    # NOTE(review): assumes the script stays in <repo>/scripts/, as the usage
    # line in the module docstring suggests.
    repo_root = Path(__file__).resolve().parent.parent
    data_dir = repo_root / "data" / "instances" / "algeria"
    input_path = data_dir / "algerian_institutions_ghcid.yaml"
    output_path = data_dir / "algerian_institutions_ch_annotator.yaml"

    print("=" * 70)
    print("CH-Annotator Convention Application")
    print("=" * 70)
    print(f"Input:  {input_path}")
    print(f"Output: {output_path}")
    print()

    stats = apply_ch_annotator(input_path, output_path)

    print("Results:")
    print(f"  Total institutions:    {stats['total_institutions']}")
    print(f"  Annotations added:     {stats['annotations_added']}")
    print(f"  Entity claims created: {stats['claims_created']}")
    print()
    print("Hypernym Distribution:")
    for hypernym, count in sorted(stats["by_hypernym"].items()):
        print(f"  {hypernym}: {count}")
    print()
    print(f"Output written to: {output_path}")
    print("=" * 70)


if __name__ == "__main__":
    main()