glam/scripts/llm_extract_archiveslab.py
kempersc 55e2cd2340 feat: implement LLM-based extraction for Archives Lab content
- Introduced `llm_extract_archiveslab.py` script for entity and relationship extraction using LLMAnnotator with GLAM-NER v1.7.0.
- Replaced regex-based extraction with generative LLM inference.
- Added functions for loading markdown content, converting annotation sessions to dictionaries, and generating extraction statistics.
- Implemented comprehensive logging of extraction results, including counts of entities, relationships, and specific types like heritage institutions and persons.
- Results and statistics are saved in JSON format for further analysis.
2025-12-05 23:16:21 +01:00

315 lines
11 KiB
Python

#!/usr/bin/env python3
"""
LLM-based extraction for Archives Lab content.
Uses LLMAnnotator with GLAM-NER v1.7.0 for comprehensive entity and relationship extraction.
This replaces the regex-based extraction with generative LLM inference.
Usage:
cd /Users/kempersc/apps/glam
PYTHONPATH=src python scripts/llm_extract_archiveslab.py
"""
import asyncio
import json
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
# Add src to path for imports, so the script also works when launched
# without PYTHONPATH=src (e.g. `python scripts/llm_extract_archiveslab.py`).
src_path = Path(__file__).parent.parent / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
# Now import from glam_extractor. The import is guarded so a wrong working
# directory produces an actionable hint instead of a bare traceback.
try:
    from glam_extractor.annotators.llm_annotator import (
        LLMAnnotator,
        LLMAnnotatorConfig,
        LLMProvider,
        create_llm_annotator,
    )
    from glam_extractor.annotators.base import AnnotationSession
except ImportError as e:
    print(f"Import error: {e}")
    print("Make sure you're running from the glam project root with PYTHONPATH=src")
    sys.exit(1)
def load_markdown_content(md_path: Path) -> str:
    """Return the full text of ``md_path``, decoded as UTF-8."""
    return md_path.read_text(encoding="utf-8")
def _provenance_to_dict(prov):
    """Serialize a provenance record; a falsy record maps to None."""
    if not prov:
        return None
    return {
        "namespace": prov.namespace,
        "path": prov.path,
        "timestamp": prov.timestamp,
        "agent": prov.agent,
        "context_convention": prov.context_convention,
        "confidence": prov.confidence,
    }


def _entity_ref_to_dict(ref):
    """Serialize a relationship subject/object reference; falsy maps to None."""
    if not ref:
        return None
    return {
        "entity_id": ref.entity_id,
        "entity_type": ref.entity_type,
        "span_text": ref.span_text,
        "uri": ref.uri,
    }


def _entity_claim_to_dict(c) -> dict:
    """Serialize one entity claim (hypernym enum flattened to its value)."""
    return {
        "claim_id": c.claim_id,
        "hypernym": c.hypernym.value if c.hypernym else None,
        "hyponym": c.hyponym,
        "text_content": c.text_content,
        "class_uri": c.class_uri,
        "recognition_confidence": c.recognition_confidence,
        "provenance": _provenance_to_dict(c.provenance),
    }


def _relationship_claim_to_dict(c) -> dict:
    """Serialize one relationship claim with its subject/predicate/object."""
    predicate = None
    if c.predicate:
        predicate = {
            "uri": c.predicate.uri,
            "label": c.predicate.label,
            "direction": c.predicate.direction,
        }
    return {
        "claim_id": c.claim_id,
        "relationship_hypernym": c.relationship_hypernym.value if c.relationship_hypernym else None,
        "relationship_hyponym": c.relationship_hyponym,
        "subject": _entity_ref_to_dict(c.subject),
        "predicate": predicate,
        "object": _entity_ref_to_dict(c.object),
        "extraction_confidence": c.extraction_confidence,
        "text_content": c.text_content,
    }


def _aggregate_claim_to_dict(c) -> dict:
    """Serialize one aggregate claim (only path/confidence of provenance)."""
    provenance = None
    if c.provenance:
        provenance = {
            "path": c.provenance.path,
            "confidence": c.provenance.confidence,
        }
    return {
        "claim_id": c.claim_id,
        "claim_type": c.claim_type,
        "claim_value": c.claim_value,
        "text_content": c.text_content,
        "provenance": provenance,
    }


def _layout_claim_to_dict(c) -> dict:
    """Serialize one layout claim; text is truncated to 100 chars."""
    return {
        "claim_id": c.claim_id,
        "region": c.region.value if c.region else None,
        "semantic_role": c.semantic_role.value if c.semantic_role else None,
        "xpath": c.xpath,
        # Layout text can be very long; keep only a preview for the JSON dump.
        "text_content": c.text_content[:100] if c.text_content else None,
    }


def _image_claim_to_dict(c) -> dict:
    """Serialize one image claim."""
    return {
        "image_url": c.image_url,
        "description": c.description,
        "detected_entities": c.detected_entities,
        "image_type": c.image_type,
        "heritage_relevance": c.heritage_relevance,
    }


# String annotation: avoids a hard dependency on AnnotationSession being
# resolvable at function-definition time (behavior for callers is unchanged).
def session_to_dict(session: "AnnotationSession") -> dict:
    """Convert AnnotationSession to serializable dict.

    Flattens every claim list (entity, relationship, aggregate, layout,
    image) into plain dicts suitable for ``json.dump``.
    """
    return {
        "session_id": session.session_id,
        "source_url": session.source_url,
        "source_file": session.source_file,
        "started_at": session.started_at,
        "completed_at": session.completed_at,
        "entity_claims": [_entity_claim_to_dict(c) for c in session.entity_claims],
        "relationship_claims": [_relationship_claim_to_dict(c) for c in session.relationship_claims],
        "aggregate_claims": [_aggregate_claim_to_dict(c) for c in session.aggregate_claims],
        "layout_claims": [_layout_claim_to_dict(c) for c in session.layout_claims],
        "image_claims": [_image_claim_to_dict(c) for c in session.image_claims],
        "errors": session.errors,
        "config": session.config,
    }
# String annotation: avoids a hard dependency on AnnotationSession being
# resolvable at function-definition time (behavior for callers is unchanged).
def generate_statistics(session: "AnnotationSession") -> dict:
    """Generate extraction statistics.

    Returns counts per claim category, frequency tables of entity and
    relationship type codes, and text/confidence summaries for heritage
    institutions, persons, and locations.
    """
    # Frequency of entity/relationship type codes; claims without a code
    # are bucketed under "unknown". Counter preserves first-seen order,
    # matching the previous manual-dict behavior.
    entity_type_counts = dict(
        Counter(c.hyponym or "unknown" for c in session.entity_claims)
    )
    rel_type_counts = dict(
        Counter(c.relationship_hyponym or "unknown" for c in session.relationship_claims)
    )

    def _entities_with_prefix(prefix: str) -> list:
        """Entity claims whose GLAM-NER hyponym code starts with *prefix*."""
        return [
            c for c in session.entity_claims
            if c.hyponym and c.hyponym.startswith(prefix)
        ]

    heritage_institutions = _entities_with_prefix("GRP.HER")  # heritage orgs
    persons = _entities_with_prefix("AGT.PER")  # speakers, panelists
    locations = _entities_with_prefix("TOP")  # toponyms / places

    def _summaries(claims: list) -> list:
        """Compact text+confidence view of a list of entity claims."""
        return [
            {"text": c.text_content, "confidence": c.recognition_confidence}
            for c in claims
        ]

    return {
        "total_entities": len(session.entity_claims),
        "total_relationships": len(session.relationship_claims),
        "total_aggregate_claims": len(session.aggregate_claims),
        "total_layout_claims": len(session.layout_claims),
        "total_image_claims": len(session.image_claims),
        "entity_type_counts": entity_type_counts,
        "relationship_type_counts": rel_type_counts,
        "heritage_institutions_count": len(heritage_institutions),
        "persons_count": len(persons),
        "locations_count": len(locations),
        "heritage_institutions": _summaries(heritage_institutions),
        "persons": _summaries(persons[:30]),  # Limit to first 30
        "locations": _summaries(locations),
        "errors": session.errors,
    }
async def main():
    """Run LLM extraction on archiveslab content.

    Loads the pre-crawled archiveslab page (HTML preferred, markdown as
    fallback), runs the LLM annotator over it, prints a human-readable
    summary, and writes full results plus summary statistics as JSON.

    Returns:
        0 on success, 1 on annotator-creation or annotation failure
        (used as the process exit code by the ``__main__`` guard).
    """
    # Paths — all relative to the repository root (parent of scripts/).
    base_dir = Path(__file__).parent.parent
    content_path = base_dir / "data/extracted/archiveslab/archiveslab.org/content.md"
    html_path = base_dir / "data/extracted/archiveslab/archiveslab.org/rendered.html"
    output_dir = base_dir / "data/extracted/archiveslab"
    # Load content
    print(f"Loading content from {content_path}")
    # Use HTML if available (has XPath), otherwise markdown
    if html_path.exists():
        print(f"Using HTML file for XPath provenance: {html_path}")
        source_file = html_path
        source_url = "https://www.archiveslab.org/events/resilient-communities-resilient-archives/english-program-resilient-communities-resilient-archives"
    else:
        source_file = content_path
        source_url = "https://www.archiveslab.org/events/resilient-communities-resilient-archives/english-program-resilient-communities-resilient-archives"
    # Create annotator. create_llm_annotator raises ValueError when the
    # provider credentials are missing — presumably ZAI_API_TOKEN; see hint.
    print("Creating LLM annotator (Z.AI provider)...")
    try:
        annotator = create_llm_annotator(
            provider="zai",
            model="glm-4.6",
            enable_fallback=True,
            max_retries=3,
        )
    except ValueError as e:
        print(f"Error creating annotator: {e}")
        print("Make sure ZAI_API_TOKEN environment variable is set")
        return 1
    # Run annotation — this performs the actual (network) LLM inference.
    print("Running LLM annotation (this may take a minute)...")
    print(" - Extracting entities (GRP.HER, AGT.PER, TOP.*, etc.)")
    print(" - Extracting relationships (REL.SPA.LOC, REL.SOC.*, REL.EVT.*, etc.)")
    print(" - Extracting aggregate claims (full_name, email, website, etc.)")
    try:
        session = await annotator.annotate(
            document=source_file,
            source_url=source_url,
        )
    except Exception as e:
        # Broad catch is deliberate here: any failure in the remote call
        # should be reported with a traceback rather than crash the script.
        print(f"Annotation failed: {e}")
        import traceback
        traceback.print_exc()
        return 1
    # Generate statistics and print a console summary (top-N per category).
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)
    stats = generate_statistics(session)
    print(f"\nTotal entities: {stats['total_entities']}")
    print(f"Total relationships: {stats['total_relationships']}")
    print(f"Total aggregate claims: {stats['total_aggregate_claims']}")
    print(f"Total layout claims: {stats['total_layout_claims']}")
    print(f"Total image claims: {stats['total_image_claims']}")
    print(f"\nHeritage institutions found: {stats['heritage_institutions_count']}")
    for inst in stats['heritage_institutions'][:10]:
        print(f" - {inst['text']} (conf: {inst['confidence']:.2f})")
    print(f"\nPersons (speakers/panelists) found: {stats['persons_count']}")
    for person in stats['persons'][:10]:
        print(f" - {person['text']} (conf: {person['confidence']:.2f})")
    print(f"\nLocations found: {stats['locations_count']}")
    for loc in stats['locations'][:10]:
        print(f" - {loc['text']} (conf: {loc['confidence']:.2f})")
    print("\nEntity type distribution:")
    # Sort descending by count; show only the most frequent types.
    for etype, count in sorted(stats['entity_type_counts'].items(), key=lambda x: -x[1])[:15]:
        print(f" {etype}: {count}")
    print("\nRelationship type distribution:")
    for rtype, count in sorted(stats['relationship_type_counts'].items(), key=lambda x: -x[1])[:10]:
        print(f" {rtype}: {count}")
    if stats['errors']:
        print(f"\nErrors ({len(stats['errors'])}):")
        for err in stats['errors'][:5]:
            print(f" - {err[:100]}...")
    # Save full results (statistics + serialized session) as JSON.
    output_file = output_dir / "archiveslab_llm_extraction.json"
    print(f"\nSaving results to {output_file}")
    result = {
        "extraction_method": "LLM (GLAM-NER v1.7.0)",
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "source_url": source_url,
        "provider": "zai",
        "model": "glm-4.6",
        "statistics": stats,
        "session": session_to_dict(session),
    }
    with open(output_file, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII entity names human-readable.
        json.dump(result, f, indent=2, ensure_ascii=False)
    print(f"Extraction complete! Results saved to {output_file}")
    # Also save a summary statistics file (smaller, easier to inspect).
    stats_file = output_dir / "archiveslab_llm_stats.json"
    with open(stats_file, "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"Statistics saved to {stats_file}")
    return 0
if __name__ == "__main__":
    # asyncio.run drives the async entry point; main()'s return value
    # becomes the process exit status.
    sys.exit(asyncio.run(main()))