glam/scripts/extract_with_agents.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

507 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Orchestration script for NLP extraction using OpenCode subagents.
This script coordinates multiple specialized OpenCode agents to extract
structured heritage institution data from conversation JSON files.
Usage:
python scripts/extract_with_agents.py <conversation_json_path>
Example:
python scripts/extract_with_agents.py conversations/Brazilian_GLAM_collection_inventories.json
The script will:
1. Parse the conversation JSON using ConversationParser
2. Extract text content from messages
3. Invoke specialized OpenCode agents:
- @institution-extractor: Extract institution names and types
- @location-extractor: Extract geographic information
- @identifier-extractor: Extract ISIL, Wikidata, VIAF, etc.
- @event-extractor: Extract organizational change events
4. Combine results into HeritageCustodian records
5. Validate with LinkML schema
6. Export to JSON-LD
Note: This script is designed to work WITH OpenCode agents, not independently.
Run this script from within an OpenCode session where you can invoke agents.
"""
import sys
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any, Optional
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.parsers.conversation import ConversationParser
from glam_extractor.models import (
HeritageCustodian,
Provenance,
DataSource,
DataTier,
InstitutionType,
OrganizationStatus,
Location,
Identifier,
ChangeEvent,
ChangeType,
)
class AgentOrchestrator:
    """
    Coordinates multiple OpenCode agents that extract heritage-institution data.

    The class never invokes agents itself — that happens inside an OpenCode
    session. Its job is to:
    - prepare input text / prompts for the agents,
    - offer helpers for processing agent responses,
    - merge the combined results into HeritageCustodian records.
    """

    def __init__(self, conversation_path: Path):
        # Parse the conversation once up front so every prompt builder
        # and helper can reuse the same parsed object.
        self.conversation_path = conversation_path
        self.parser = ConversationParser()
        self.conversation = self.parser.parse_file(conversation_path)
def extract_conversation_text(
    self,
    sender: Optional[str] = None,
    max_length: int = 50000
) -> str:
    """
    Collect message text from the parsed conversation for agent input.

    Args:
        sender: Restrict to 'human' or 'assistant'; None includes all messages.
        max_length: Hard cap on returned characters (agents have context limits).

    Returns:
        The combined message text, truncated with an explicit marker when
        it exceeds the cap.
    """
    full_text = self.conversation.extract_all_text(sender=sender)
    if len(full_text) <= max_length:
        return full_text
    # Over the limit: cut the text and make the truncation visible to the agent.
    return full_text[:max_length] + "\n\n[... text truncated ...]"
def prepare_institution_extraction_prompt(self) -> str:
    """
    Build the prompt for the @institution-extractor agent.

    Returns:
        The formatted prompt (leading/trailing whitespace stripped) containing
        the conversation text and the expected JSON output shape.
    """
    conv_text = self.extract_conversation_text(max_length=40000)
    # Conversation name/uuid are embedded directly in the f-string below.
    prompt = f"""
# Institution Extraction Task
**Conversation**: {self.conversation.name}
**Conversation ID**: {self.conversation.uuid}
**File**: {self.conversation_path.name}
## Instructions
Please extract all heritage institutions (museums, libraries, archives, galleries, etc.)
mentioned in the following conversation text.
Return your results as a JSON array of institution objects following the format
specified in your agent configuration.
## Conversation Text
{conv_text}
## Output Format
Return ONLY the JSON array, no additional commentary:
```json
{{
"institutions": [...]
}}
```
"""
    return prompt.strip()
def prepare_location_extraction_prompt(self, institution_context: str = "") -> str:
    """
    Build the prompt for the @location-extractor agent.

    Args:
        institution_context: Optional extra context about a specific
            institution, inserted verbatim into the prompt.

    Returns:
        The formatted prompt (leading/trailing whitespace stripped).
    """
    conv_text = self.extract_conversation_text(max_length=40000)
    prompt = f"""
# Location Extraction Task
**File**: {self.conversation_path.name}
## Instructions
Please extract all geographic locations (cities, addresses, regions, countries)
mentioned for heritage institutions in the following conversation text.
{institution_context}
Return your results as a JSON array following the format specified in your agent configuration.
## Conversation Text
{conv_text}
## Output Format
Return ONLY the JSON array, no additional commentary:
```json
{{
"locations": [...]
}}
```
"""
    return prompt.strip()
def prepare_identifier_extraction_prompt(self) -> str:
    """
    Build the prompt for the @identifier-extractor agent.

    Returns:
        The formatted prompt (leading/trailing whitespace stripped) listing
        the identifier patterns the agent should look for.
    """
    conv_text = self.extract_conversation_text(max_length=40000)
    prompt = f"""
# Identifier Extraction Task
**File**: {self.conversation_path.name}
## Instructions
Please extract all external identifiers (ISIL codes, Wikidata IDs, VIAF, KvK numbers, URLs, etc.)
mentioned for heritage institutions in the following conversation text.
Look for patterns like:
- ISIL: NL-AsdAM, US-DLC, etc.
- Wikidata: Q123456
- VIAF: 147143282
- KvK: 41231987
- URLs: https://www.example.org
Return your results as a JSON array following the format specified in your agent configuration.
## Conversation Text
{conv_text}
## Output Format
Return ONLY the JSON array, no additional commentary:
```json
{{
"identifiers": [...]
}}
```
"""
    return prompt.strip()
def prepare_event_extraction_prompt(self) -> str:
    """
    Build the prompt for the @event-extractor agent.

    Returns:
        The formatted prompt (leading/trailing whitespace stripped) listing
        the temporal phrasing patterns the agent should look for.
    """
    conv_text = self.extract_conversation_text(max_length=40000)
    prompt = f"""
# Change Event Extraction Task
**File**: {self.conversation_path.name}
## Instructions
Please extract all organizational change events (founding, mergers, relocations, name changes, etc.)
mentioned for heritage institutions in the following conversation text.
Look for temporal patterns like:
- "Founded in 1985"
- "Merged with X in 2001"
- "Relocated to Y in 2010"
- "Renamed from A to B"
Return your results as a JSON array following the format specified in your agent configuration.
## Conversation Text
{conv_text}
## Output Format
Return ONLY the JSON array, no additional commentary:
```json
{{
"change_events": [...]
}}
```
"""
    return prompt.strip()
def create_heritage_custodian_record(
    self,
    institution_data: Dict[str, Any],
    locations_data: List[Dict[str, Any]],
    identifiers_data: List[Dict[str, Any]],
    events_data: List[Dict[str, Any]]
) -> HeritageCustodian:
    """
    Combine agent extraction results into a single HeritageCustodian record.

    Args:
        institution_data: One institution object from @institution-extractor.
        locations_data: Location objects from @location-extractor.
        identifiers_data: Identifier objects from @identifier-extractor.
        events_data: Change-event objects from @event-extractor.

    Returns:
        Validated HeritageCustodian record with NLP-extraction provenance.
    """
    # Hoisted to function scope: was re-imported on every loop iteration.
    from datetime import date

    # Provenance marks this record as tier-4 (inferred) NLP-extracted data.
    provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="OpenCode multi-agent NLP extraction",
        confidence_score=institution_data.get("confidence_score", 0.8),
        conversation_id=self.conversation.uuid,
        source_url=None,
        verified_date=None,
        verified_by=None
    )

    # Map the agent's type string onto the enum; unknown/missing values
    # (including an explicit None, which would break .upper()) fall back.
    type_str = (institution_data.get("institution_type") or "").upper()
    institution_type = InstitutionType.__members__.get(type_str, InstitutionType.UNKNOWN)

    # Build Location objects; all fields are optional pass-throughs.
    locations = [
        Location(
            location_type=loc_data.get("location_type"),
            city=loc_data.get("city"),
            street_address=loc_data.get("street_address"),
            postal_code=loc_data.get("postal_code"),
            region=loc_data.get("region"),
            country=loc_data.get("country"),
            geonames_id=loc_data.get("geonames_id"),
            latitude=loc_data.get("latitude"),
            longitude=loc_data.get("longitude"),
            is_primary=loc_data.get("is_primary", False)
        )
        for loc_data in locations_data
    ]

    # Build Identifier objects, skipping entries missing the required fields.
    identifiers = [
        Identifier(
            identifier_scheme=id_data.get("identifier_scheme"),
            identifier_value=id_data.get("identifier_value"),
            identifier_url=id_data.get("identifier_uri"),  # agents emit 'identifier_uri'
            assigned_date=None  # not extracted by agents
        )
        for id_data in identifiers_data
        if id_data.get("identifier_scheme") and id_data.get("identifier_value")
    ]

    # Build ChangeEvent objects; events without a parseable date are dropped.
    change_history: List[ChangeEvent] = []
    for event_data in events_data:
        change_type_str = (event_data.get("change_type") or "").upper()
        change_type = ChangeType.__members__.get(change_type_str, ChangeType.OTHER)

        # Agents may return ISO date strings; normalize them to date objects.
        event_date = event_data.get("event_date")
        if isinstance(event_date, str):
            try:
                event_date = date.fromisoformat(event_date)
            except (ValueError, TypeError):
                continue  # unparseable date string -> skip this event
        if event_date is None:
            continue  # an event without a date cannot be recorded

        change_history.append(ChangeEvent(
            event_id=event_data.get("event_id", f"event-{len(change_history)}"),
            change_type=change_type,
            event_date=event_date,
            event_description=event_data.get("event_description"),
            affected_organization=event_data.get("affected_organization"),
            resulting_organization=event_data.get("resulting_organization"),
            related_organizations=event_data.get("related_organizations", []),
            source_documentation=event_data.get("source_documentation")
        ))

    # Assemble the final record. Fields marked "Not extracted" are either
    # filled in later (GHCID generation) or derived elsewhere (events).
    name = institution_data.get("name", "Unknown Institution")
    custodian = HeritageCustodian(
        id=f"heritage-custodian-{institution_data.get('institution_id', 'unknown')}",
        name=name,
        alternative_names=institution_data.get("alternative_names", []),
        institution_type=institution_type,
        organization_status=OrganizationStatus.UNKNOWN,  # Not extracted by agents
        description=institution_data.get("description"),
        parent_organization=None,  # Not extracted by agents
        founded_date=None,  # Extracted separately via events
        closed_date=None,  # Extracted separately via events
        homepage=None,  # Could be in identifiers as WEBSITE
        ghcid_numeric=None,  # Generated later
        ghcid_current=None,  # Generated later
        ghcid_original=None,  # Generated later
        ghcid_history=None,  # Generated later
        contact_info=None,  # Not extracted by agents
        locations=locations,
        identifiers=identifiers,
        change_history=change_history,
        provenance=provenance
    )
    return custodian
def export_to_jsonld(
    self,
    custodians: List[HeritageCustodian],
    output_path: Path
):
    """
    Serialize HeritageCustodian records as a JSON-LD graph document.

    Args:
        custodians: Validated records to export.
        output_path: Destination file for the JSON-LD output.
    """
    graph = []
    for custodian in custodians:
        graph.append(custodian.dict(exclude_none=True))
    document = {
        "@context": "https://w3id.org/heritage/custodian/context.jsonld",
        "@graph": graph,
    }
    with output_path.open('w', encoding='utf-8') as out_file:
        # default=str stringifies values json can't encode natively
        # (e.g. the datetime stored in provenance).
        json.dump(document, out_file, indent=2, ensure_ascii=False, default=str)
    print(f"✅ Exported {len(custodians)} records to {output_path}")
def main():
    """
    CLI entry point for the orchestration script.

    The script only PREPARES prompts and data structures; it never invokes
    agents directly (that happens through OpenCode). Workflow:
    1. Run it to print the prompt for each agent.
    2. Copy/paste each prompt to the matching @agent.
    3. Collect the agents' JSON responses.
    4. Use the orchestrator's helper methods to process the results.
    """
    # Require exactly one positional argument: the conversation JSON path.
    if len(sys.argv) < 2:
        print("Usage: python scripts/extract_with_agents.py <conversation_json_path>")
        print()
        print("Example:")
        print(" python scripts/extract_with_agents.py conversations/Brazilian_GLAM_collection_inventories.json")
        sys.exit(1)

    conversation_path = Path(sys.argv[1])
    if not conversation_path.exists():
        print(f"❌ Error: File not found: {conversation_path}")
        sys.exit(1)

    print(f"🔄 Processing conversation: {conversation_path.name}")
    print()

    orchestrator = AgentOrchestrator(conversation_path)

    print("=" * 80)
    print("AGENT INVOCATION PROMPTS")
    print("=" * 80)
    print()

    # Table-driven step output: one (agent name, prompt builder) pair per step.
    extraction_steps = [
        ("@institution-extractor", orchestrator.prepare_institution_extraction_prompt),
        ("@location-extractor", orchestrator.prepare_location_extraction_prompt),
        ("@identifier-extractor", orchestrator.prepare_identifier_extraction_prompt),
        ("@event-extractor", orchestrator.prepare_event_extraction_prompt),
    ]
    for step_number, (agent_name, build_prompt) in enumerate(extraction_steps, start=1):
        print(f"📋 STEP {step_number}: Invoke {agent_name}")
        print("-" * 80)
        print(build_prompt())
        print()
        print(f"⏸ Copy the above prompt and send to {agent_name}")
        print()
        print("=" * 80)

    print("📊 NEXT STEPS")
    print("=" * 80)
    print()
    print("After receiving responses from all agents:")
    print("1. Collect JSON responses from each agent")
    print("2. Use orchestrator.create_heritage_custodian_record() to combine results")
    print("3. Validate records with LinkML schema")
    print("4. Export to JSON-LD using orchestrator.export_to_jsonld()")
    print()
    print("Or continue working with the orchestrator object in your OpenCode session.")
    print()
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
main()