glam/scripts/apply_ch_annotator_batch.py
2025-12-07 00:26:01 +01:00

595 lines
21 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Batch Apply CH-Annotator Convention to GLAM Datasets
This script applies formal CH-Annotator (ch_annotator-v1_7_0) provenance metadata
to heritage institution datasets across multiple countries.
CH-Annotator v1.7.0 requirements:
1. Entity hypernym codes (GRP.HER for heritage institutions)
2. 5-component claim provenance model
3. Convention version reference
4. Ontology class mappings
Usage:
# Process all discovered datasets
python scripts/apply_ch_annotator_batch.py
# Process specific files
python scripts/apply_ch_annotator_batch.py --files data/instances/egypt_institutions_ghcid.yaml
# Dry run (show what would be processed)
python scripts/apply_ch_annotator_batch.py --dry-run
Author: OpenCode (Claude Sonnet 4)
Date: 2025-12-06
Convention: ch_annotator-v1_7_0
"""
import argparse
import json
import shutil
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

import yaml
# =============================================================================
# CH-ANNOTATOR MAPPINGS
# =============================================================================
# Map GLAMORCUBESFIXPHDNT institution_type to CH-Annotator hypernym codes.
# Unmapped types fall back to the generic heritage group "GRP.HER"
# (see get_hypernym_code below).
INSTITUTION_TYPE_TO_HYPERNYM = {
    "GALLERY": "GRP.HER.GAL",
    "LIBRARY": "GRP.HER.LIB",
    "ARCHIVE": "GRP.HER.ARC",
    "MUSEUM": "GRP.HER.MUS",
    "OFFICIAL_INSTITUTION": "GRP.HER.OFF",
    "RESEARCH_CENTER": "GRP.HER.RES",
    "CORPORATION": "GRP.COR",
    "UNKNOWN": "GRP.HER",
    "BOTANICAL_ZOO": "GRP.HER.BOT",
    "EDUCATION_PROVIDER": "GRP.EDU",
    "COLLECTING_SOCIETY": "GRP.HER.SOC",
    "FEATURES": "TOP.MON",  # Monuments are toponym-based
    "INTANGIBLE_HERITAGE_GROUP": "GRP.HER.INT",
    "MIXED": "GRP.HER.MIX",
    "PERSONAL_COLLECTION": "GRP.HER.PER",
    "HOLY_SITES": "GRP.HER.HOL",
    "DIGITAL_PLATFORM": "GRP.HER.DIG",
    "NGO": "GRP.ASS",
    "TASTE_SMELL": "GRP.HER.TAS",
}
# Ontology class mappings for each hypernym.
# Mixes Schema.org, W3C Org, CIDOC-CRM and project-local "glam:" URIs;
# unmapped codes fall back to glam:HeritageCustodian (see get_ontology_class).
HYPERNYM_ONTOLOGY_CLASS = {
    "GRP.HER.GAL": "schema:Museum",
    "GRP.HER.LIB": "schema:Library",
    "GRP.HER.ARC": "schema:ArchiveOrganization",
    "GRP.HER.MUS": "schema:Museum",
    "GRP.HER.OFF": "schema:GovernmentOrganization",
    "GRP.HER.RES": "schema:ResearchOrganization",
    "GRP.COR": "schema:Corporation",
    "GRP.HER": "glam:HeritageCustodian",
    "GRP.HER.BOT": "schema:Zoo",
    "GRP.EDU": "schema:EducationalOrganization",
    "GRP.HER.SOC": "org:FormalOrganization",
    "TOP.MON": "crm:E22_Human-Made_Object",
    "GRP.HER.INT": "crm:E74_Group",
    "GRP.HER.MIX": "glam:HeritageCustodian",
    "GRP.HER.PER": "glam:PersonalCollection",
    "GRP.HER.HOL": "schema:PlaceOfWorship",
    "GRP.HER.DIG": "schema:WebSite",
    "GRP.ASS": "org:FormalOrganization",
    "GRP.HER.TAS": "glam:HeritageCustodian",
}
# Hypernym labels, keyed by the leading code segment (before the first ".").
# NOTE(review): "AGT" is never produced by the mappings above — presumably
# reserved for person/agent entities; confirm against the convention spec.
HYPERNYM_LABELS = {
    "GRP": "GROUP",
    "TOP": "TOPONYM",
    "AGT": "AGENT",
}
# =============================================================================
# DATASET DISCOVERY
# =============================================================================
# Primary datasets to process (main YAML files with GHCID-enriched data).
# Paths are relative to the repository base path resolved in main().
# Excludes: backups, archives, cache files, test files.
PRIMARY_DATASETS = [
    # North Africa
    "data/instances/algeria/algerian_institutions_ghcid.yaml",
    "data/instances/egypt_institutions_ghcid.yaml",
    "data/instances/libya/libyan_institutions.yaml",
    "data/instances/morocco/moroccan_institutions.yaml",
    "data/instances/tunisia/tunisian_institutions_enhanced.yaml",
    # Europe
    "data/instances/austria_complete.yaml",
    "data/instances/belarus_complete.yaml",
    "data/instances/belgium_complete.yaml",
    "data/instances/bulgaria_complete.yaml",
    "data/instances/czech_unified.yaml",
    "data/instances/denmark_libraries.yaml",
    "data/instances/netherlands_complete.yaml",
    "data/instances/norway/city_archives.yaml",
    "data/instances/norway/county_archives.yaml",
    "data/instances/norway/museums_oslo.yaml",
    "data/instances/switzerland_isil.yaml",
    "data/instances/georgia_glam_institutions_enriched.yaml",
    "data/instances/great_britain/gb_institutions_enriched_manual.yaml",
    "data/instances/italy/it_institutions_enriched_manual.yaml",
    # Asia
    "data/instances/japan_complete.yaml",
    "data/instances/vietnamese_glam_institutions.yaml",
    "data/instances/palestinian_heritage_custodians.yaml",
    # Americas
    "data/instances/argentina_complete.yaml",
    "data/instances/latin_american_institutions_AUTHORITATIVE.yaml",
    "data/instances/mexico/mexican_institutions_curated.yaml",
    "data/instances/united_states/us_institutions_enriched_manual.yaml",
]
def discover_datasets(base_path: Path) -> list[Path]:
    """Return the PRIMARY_DATASETS files under *base_path* that exist.

    Entries that cannot be found on disk are reported to stdout and
    omitted from the result.
    """
    found: list[Path] = []
    for relative in PRIMARY_DATASETS:
        candidate = base_path / relative
        if candidate.exists():
            found.append(candidate)
        else:
            print(f" [SKIP] Not found: {relative}")
    return found
def already_has_ch_annotator(data: list | dict) -> bool:
"""Check if dataset already has CH-Annotator annotations."""
if isinstance(data, list) and len(data) > 0:
first_item = data[0]
return isinstance(first_item, dict) and "ch_annotator" in first_item
return False
# =============================================================================
# CH-ANNOTATOR APPLICATION
# =============================================================================
def get_hypernym_code(institution_type: str) -> str:
    """Translate an institution type into its CH-Annotator hypernym code.

    Types without an entry in INSTITUTION_TYPE_TO_HYPERNYM fall back to
    the generic heritage group code "GRP.HER".
    """
    try:
        return INSTITUTION_TYPE_TO_HYPERNYM[institution_type]
    except KeyError:
        return "GRP.HER"
def get_ontology_class(hypernym_code: str) -> str:
    """Resolve the ontology class URI mapped to *hypernym_code*.

    Codes absent from HYPERNYM_ONTOLOGY_CLASS fall back to the
    project-local glam:HeritageCustodian class.
    """
    fallback = "glam:HeritageCustodian"
    return HYPERNYM_ONTOLOGY_CLASS.get(hypernym_code, fallback)
def get_hypernym_label(hypernym_code: str) -> str:
    """Return the label for the code's leading segment.

    E.g. "GRP.HER.MUS" -> "GROUP". Codes without a dot are used whole;
    unknown prefixes default to "GROUP".
    """
    # str.partition yields the full string as the head when no "." exists,
    # so dotted and undotted codes are handled uniformly.
    prefix, _, _ = hypernym_code.partition(".")
    return HYPERNYM_LABELS.get(prefix, "GROUP")
def extract_conversation_id(institution: dict) -> str:
    """Pull the conversation ID out of an institution's provenance block.

    Returns "unknown" when the record carries no provenance or no
    conversation_id field.
    """
    return institution.get("provenance", {}).get("conversation_id", "unknown")
def create_ch_annotator_block(institution: dict, annotation_date: str, source_file: str) -> dict:
    """Build the CH-Annotator entity annotation block for one institution.

    The block bundles four sections: entity classification (hypernym codes
    and ontology classes), the ORIGINAL extraction provenance taken from the
    record, the provenance of this retroactive annotation run, and
    confidence/verification metadata.

    Args:
        institution: Raw institution record; its type may live under either
            "institution_type" or "custodian_type".
        annotation_date: ISO timestamp of this annotation run.
        source_file: Name of the YAML file the record came from; used as the
            provenance path when no conversation ID is recorded.
    """
    # Institution type may live under either field name depending on schema.
    institution_type = (
        institution.get("institution_type")
        or institution.get("custodian_type")
        or "UNKNOWN"
    )
    hypernym_code = get_hypernym_code(institution_type)

    # Original extraction provenance, as recorded on the record (if any).
    # Claude conversation exports don't specify a model version, so the
    # agent defaults to a generic "claude-conversation" identifier.
    provenance = institution.get("provenance", {})
    conversation_id = provenance.get("conversation_id", "unknown")
    extraction_agent = provenance.get("extraction_agent", "claude-conversation")

    if conversation_id != "unknown":
        provenance_path = f"/conversations/{conversation_id}"
    else:
        provenance_path = f"/files/{source_file}"

    # Alternative ontology classes; rov:RegisteredOrganization only applies
    # when the record carries external identifiers.
    alternative_classes = ["org:FormalOrganization"]
    if institution.get("identifiers"):
        alternative_classes.append("rov:RegisteredOrganization")
    alternative_classes.append("glam:HeritageCustodian")

    classification = {
        "hypernym": hypernym_code.partition(".")[0],
        "hypernym_label": get_hypernym_label(hypernym_code),
        "subtype": hypernym_code,
        "subtype_label": institution_type,
        "ontology_class": get_ontology_class(hypernym_code),
        "alternative_classes": alternative_classes,
    }

    # Extraction provenance (5-component model) - reflects ORIGINAL extraction.
    extraction_provenance = {
        "namespace": "glam",
        "path": provenance_path,
        "timestamp": provenance.get("extraction_date", "unknown"),
        "agent": extraction_agent,
        "context_convention": "ch_annotator-v1_7_0",
    }

    # CH-Annotator application provenance (separate from original extraction).
    annotation_provenance = {
        "annotation_agent": "opencode-claude-sonnet-4",
        "annotation_date": annotation_date,
        "annotation_method": "retroactive CH-Annotator application via batch script",
        "source_file": source_file,
    }

    annotation_metadata = {
        "confidence_score": provenance.get("confidence_score", 0.85),
        "verified": False,
        "verification_date": None,
        "verified_by": None,
    }

    return {
        "convention_id": "ch_annotator-v1_7_0",
        "convention_version": "1.7.0",
        "entity_classification": classification,
        "extraction_provenance": extraction_provenance,
        "annotation_provenance": annotation_provenance,
        "annotation_metadata": annotation_metadata,
    }
def create_entity_claims(institution: dict, source_file: str) -> list:
    """Create CH-Annotator claims for key entity attributes.

    Emits up to five claims per institution -- name, institution type,
    city, Wikidata ID and GHCID -- each carrying a copy of the
    5-component provenance model (namespace, path, timestamp, agent,
    context_convention). Attributes absent from the record are skipped.

    Args:
        institution: Raw institution record (field names vary by country
            schema; alternatives are handled below).
        source_file: Name of the YAML file the record came from; used as
            the provenance path when no conversation ID is recorded.

    Returns:
        List of claim dicts (possibly empty).
    """
    claims = []
    provenance = institution.get("provenance", {})
    conversation_id = provenance.get("conversation_id", "unknown")
    extraction_date = provenance.get("extraction_date", institution.get("created", "unknown"))
    extraction_agent = provenance.get("extraction_agent", "claude-conversation")
    # Base provenance for all claims (copied per claim so later edits to one
    # claim cannot leak into the others).
    base_provenance = {
        "namespace": "glam",
        "path": f"/conversations/{conversation_id}" if conversation_id != "unknown" else f"/files/{source_file}",
        "timestamp": extraction_date,
        "agent": extraction_agent,
        "context_convention": "ch_annotator-v1_7_0",
    }
    # Claim 1: Institution name (handle alternative field names)
    name = institution.get("name") or institution.get("preferred_label")
    if name:
        claims.append({
            "claim_type": "full_name",
            "claim_value": name,
            "property_uri": "skos:prefLabel",
            "provenance": base_provenance.copy(),
            "confidence": provenance.get("confidence_score", 0.9),
        })
    # Claim 2: Institution type (handle alternative field names)
    inst_type = institution.get("institution_type") or institution.get("custodian_type")
    if inst_type:
        claims.append({
            "claim_type": "institution_type",
            "claim_value": inst_type,
            "property_uri": "rdf:type",
            "provenance": base_provenance.copy(),
            "confidence": 0.95,
        })
    # Claim 3: Location (city) - handle different location structures
    locations = institution.get("locations", [])
    place_designation = institution.get("place_designation", {})
    city = None
    if locations and isinstance(locations, list):
        first_loc = locations[0]
        if isinstance(first_loc, dict):
            city = first_loc.get("city")
    elif place_designation:
        # Handle Palestinian schema structure
        city = place_designation.get("place_name")
        if not city:
            settlement = place_designation.get("settlement", {})
            city = settlement.get("settlement_name")
    if city:
        claims.append({
            "claim_type": "located_in_city",
            "claim_value": city,
            "property_uri": "schema:addressLocality",
            "provenance": base_provenance.copy(),
            "confidence": 0.9,
        })
    # Claim 4: Wikidata ID (if present). Use .get() for the value so a
    # malformed entry (scheme recorded without a value) is skipped instead
    # of raising KeyError; only the first usable Wikidata entry is claimed.
    identifiers = institution.get("identifiers", [])
    if isinstance(identifiers, list):
        for ident in identifiers:
            if not isinstance(ident, dict):
                continue
            if ident.get("identifier_scheme") != "Wikidata":
                continue
            wikidata_id = ident.get("identifier_value")
            if not wikidata_id:
                continue  # malformed entry: keep scanning for a usable one
            claims.append({
                "claim_type": "wikidata_id",
                "claim_value": wikidata_id,
                "property_uri": "owl:sameAs",
                "provenance": {
                    **base_provenance,
                    "namespace": "wikidata",
                    "path": f"/entity/{wikidata_id}",
                },
                "confidence": 0.98,
            })
            break
    # Claim 5: GHCID (if present)
    ghcid = institution.get("ghcid")
    if ghcid:
        claims.append({
            "claim_type": "ghcid",
            "claim_value": ghcid,
            "property_uri": "glam:ghcid",
            "provenance": base_provenance.copy(),
            "confidence": 1.0,  # GHCID is deterministically generated
        })
    return claims
def apply_ch_annotator(input_path: Path, output_path: Path) -> dict:
    """Apply the CH-Annotator convention to one dataset file.

    Loads the YAML at *input_path*, attaches a "ch_annotator" block plus
    entity claims to every institution record (mutating the records in
    place), and writes the annotated data to *output_path* behind a
    documentation header.

    Returns:
        Stats dict with counts per file. Special keys:
        "error" -- set when the file could not be processed;
        "skipped" -- True when the file already carries annotations.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # safe_load tolerates the leading "#" header comments some files carry.
    data = yaml.safe_load(content)
    if data is None:
        return {"error": "Empty or invalid YAML file", "total_institutions": 0}

    # Locate the institution list: either the document root is the list
    # itself, or it hangs off one of a few known root keys.
    if isinstance(data, dict):
        for key in ("institutions", "entries", "records", "data", "custodians"):
            if isinstance(data.get(key), list):
                institutions = data[key]
                break
        else:
            return {"error": "Unknown data structure (dict without institution list)", "total_institutions": 0}
    elif isinstance(data, list):
        institutions = data
    else:
        return {"error": f"Unexpected data type: {type(data)}", "total_institutions": 0}

    if not institutions:
        return {"error": "No institutions found", "total_institutions": 0}
    # Idempotency guard: never re-annotate an already-annotated file.
    if already_has_ch_annotator(institutions):
        return {"skipped": True, "reason": "Already has CH-Annotator annotations", "total_institutions": len(institutions)}

    annotation_date = datetime.now(timezone.utc).isoformat()
    source_file = input_path.name
    stats = {
        "total_institutions": 0,
        "annotations_added": 0,
        "claims_created": 0,
        "by_hypernym": {},
        "skipped": False,
    }

    # Annotate each institution record in place.
    for institution in institutions:
        if not isinstance(institution, dict):
            continue
        stats["total_institutions"] += 1
        ch_annotation = create_ch_annotator_block(institution, annotation_date, source_file)
        institution["ch_annotator"] = ch_annotation
        stats["annotations_added"] += 1
        # Track hypernym distribution for the header and the run summary.
        hypernym = ch_annotation["entity_classification"]["subtype"]
        stats["by_hypernym"][hypernym] = stats["by_hypernym"].get(hypernym, 0) + 1
        claims = create_entity_claims(institution, source_file)
        institution["ch_annotator"]["entity_claims"] = claims
        stats["claims_created"] += len(claims)

    # Because the records were annotated in place, dumping the original
    # top-level object preserves any root key the source file had.
    header = f"""# Heritage Institutions - CH-Annotator Enhanced
# Source file: {input_path.name}
# Last updated: {annotation_date}
# CH-Annotator Convention: ch_annotator-v1_7_0
#
# This file has been enhanced with formal CH-Annotator (Cultural Heritage Annotator)
# provenance metadata following the ch_annotator-v1_7_0 convention.
#
# CH-Annotator Features Applied:
# - Entity hypernym codes (GRP.HER.* for heritage institutions)
# - 5-component claim provenance model (namespace, path, timestamp, agent, convention)
# - Ontology class mappings (CIDOC-CRM, Schema.org, W3C Org)
# - Entity claims for key attributes (name, type, location, identifiers, GHCID)
#
# Statistics:
# - Total institutions: {stats['total_institutions']}
# - Annotations added: {stats['annotations_added']}
# - Entity claims created: {stats['claims_created']}
# - Hypernym distribution: {', '.join(f'{k}: {v}' for k, v in sorted(stats['by_hypernym'].items()))}
#
# Annotation Agent: opencode-claude-sonnet-4
# Annotation Date: {annotation_date}
"""
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(data, f,
                  default_flow_style=False, allow_unicode=True, sort_keys=False)
    return stats
# =============================================================================
# MAIN ENTRY POINT
# =============================================================================
def main():
    """CLI entry point: resolve the dataset list, annotate each file, and
    print a per-file log plus an aggregate summary."""
    parser = argparse.ArgumentParser(
        description="Batch apply CH-Annotator convention to GLAM datasets"
    )
    parser.add_argument(
        "--files",
        nargs="+",
        help="Specific files to process (default: discover all primary datasets)"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be processed without making changes"
    )
    parser.add_argument(
        "--output-suffix",
        default="_ch_annotator",
        help="Suffix for output files (default: _ch_annotator)"
    )
    parser.add_argument(
        "--in-place",
        action="store_true",
        help="Modify files in place (creates backup with .pre_ch_annotator suffix)"
    )
    # Previously hard-coded; kept as the default for backward compatibility
    # but now overridable so the script runs on other machines.
    parser.add_argument(
        "--base-path",
        default="/Users/kempersc/apps/glam",
        help="Repository root used to resolve relative dataset paths"
    )
    args = parser.parse_args()
    base_path = Path(args.base_path)

    def rel(p: Path) -> Path:
        """Display helper: show paths relative to base_path when possible.

        Paths outside base_path (e.g. absolute --files arguments) previously
        crashed relative_to(); show them unchanged instead.
        """
        try:
            return p.relative_to(base_path)
        except ValueError:
            return p

    print("=" * 70)
    print("CH-Annotator Batch Application")
    print("=" * 70)
    print("Convention: ch_annotator-v1_7_0")
    print("Annotation Agent: opencode-claude-sonnet-4")
    print()
    # Resolve the work list: explicit --files, or the curated primary set.
    if args.files:
        datasets = []
        for f in args.files:
            p = Path(f)
            if not p.is_absolute():
                p = base_path / p
            if p.exists():
                datasets.append(p)
            else:
                print(f" [SKIP] File not found: {f}")
    else:
        print("Discovering datasets...")
        datasets = discover_datasets(base_path)
    print(f"Found {len(datasets)} datasets to process")
    print()
    if args.dry_run:
        print("[DRY RUN] Would process:")
        for ds in datasets:
            print(f" - {rel(ds)}")
        return
    # Aggregate counters across all files.
    total_stats = {
        "files_processed": 0,
        "files_skipped": 0,
        "files_failed": 0,
        "total_institutions": 0,
        "total_claims": 0,
        "by_hypernym": {},
    }
    for dataset in datasets:
        print(f"Processing: {rel(dataset)}")
        # Determine output path.
        if args.in_place:
            output_path = dataset
            # Keep a pristine copy next to the original before overwriting.
            backup_path = dataset.with_suffix(f".pre_ch_annotator{dataset.suffix}")
            shutil.copy2(dataset, backup_path)
        else:
            # removesuffix (not replace) so only a TRAILING "_ghcid" is
            # dropped; replace() would also strip interior occurrences.
            stem = dataset.stem.removesuffix("_ghcid")
            output_path = dataset.parent / f"{stem}{args.output_suffix}.yaml"
        try:
            stats = apply_ch_annotator(dataset, output_path)
            if stats.get("skipped"):
                print(f" [SKIP] {stats.get('reason', 'Unknown reason')}")
                total_stats["files_skipped"] += 1
            elif stats.get("error"):
                print(f" [ERROR] {stats['error']}")
                total_stats["files_failed"] += 1
            else:
                print(f" [OK] {stats['total_institutions']} institutions, {stats['claims_created']} claims")
                print(f" Output: {rel(output_path)}")
                total_stats["files_processed"] += 1
                total_stats["total_institutions"] += stats["total_institutions"]
                total_stats["total_claims"] += stats["claims_created"]
                # Merge per-file hypernym stats into the run totals.
                for h, c in stats.get("by_hypernym", {}).items():
                    total_stats["by_hypernym"][h] = total_stats["by_hypernym"].get(h, 0) + c
        except Exception as e:
            # One bad file must not abort the whole batch; count and continue.
            print(f" [ERROR] {e}")
            total_stats["files_failed"] += 1
    # Print summary.
    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {total_stats['files_processed']}")
    print(f"Files skipped: {total_stats['files_skipped']}")
    print(f"Files failed: {total_stats['files_failed']}")
    print(f"Total institutions: {total_stats['total_institutions']}")
    print(f"Total claims: {total_stats['total_claims']}")
    print()
    print("Hypernym Distribution:")
    for hypernym, count in sorted(total_stats["by_hypernym"].items()):
        print(f" {hypernym}: {count}")
    print("=" * 70)
# Run the batch annotator only when executed as a script (allows import
# of the mappings and helpers without side effects).
if __name__ == "__main__":
    main()