glam/scripts/llm_extract_archiveslab.py
kempersc 55e2cd2340 feat: implement LLM-based extraction for Archives Lab content
- Introduced `llm_extract_archiveslab.py` script for entity and relationship extraction using LLMAnnotator with GLAM-NER v1.7.0.
- Replaced regex-based extraction with generative LLM inference.
- Added functions for loading markdown content, converting annotation sessions to dictionaries, and generating extraction statistics.
- Implemented comprehensive logging of extraction results, including counts of entities, relationships, and specific types like heritage institutions and persons.
- Results and statistics are saved in JSON format for further analysis.
2025-12-05 23:16:21 +01:00

315 lines
11 KiB
Python

#!/usr/bin/env python3
"""
LLM-based extraction for Archives Lab content.
Uses LLMAnnotator with GLAM-NER v1.7.0 for comprehensive entity and relationship extraction.
This replaces the regex-based extraction with generative LLM inference.
Usage:
cd /Users/kempersc/apps/glam
PYTHONPATH=src python scripts/llm_extract_archiveslab.py
"""
import asyncio
import json
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
# Add src to path for imports, so the script also works when launched
# without PYTHONPATH=src (e.g. `python scripts/llm_extract_archiveslab.py`).
src_path = Path(__file__).parent.parent / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
# Now import from glam_extractor. The import is guarded so a wrong working
# directory produces an actionable hint instead of a bare traceback.
try:
    from glam_extractor.annotators.llm_annotator import (
        LLMAnnotator,
        LLMAnnotatorConfig,
        LLMProvider,
        create_llm_annotator,
    )
    from glam_extractor.annotators.base import AnnotationSession
except ImportError as e:
    print(f"Import error: {e}")
    print("Make sure you're running from the glam project root with PYTHONPATH=src")
    sys.exit(1)
def load_markdown_content(md_path: Path) -> str:
    """Return the full text of ``md_path``, decoded as UTF-8."""
    return md_path.read_text(encoding="utf-8")
def _provenance_to_dict(prov):
    """Serialize a provenance record; a falsy record maps to None."""
    if not prov:
        return None
    return {
        "namespace": prov.namespace,
        "path": prov.path,
        "timestamp": prov.timestamp,
        "agent": prov.agent,
        "context_convention": prov.context_convention,
        "confidence": prov.confidence,
    }


def _entity_ref_to_dict(ref):
    """Serialize a relationship subject/object reference; falsy maps to None."""
    if not ref:
        return None
    return {
        "entity_id": ref.entity_id,
        "entity_type": ref.entity_type,
        "span_text": ref.span_text,
        "uri": ref.uri,
    }


def _entity_claim_to_dict(c) -> dict:
    """Serialize one entity claim (hypernym enum flattened to its value)."""
    return {
        "claim_id": c.claim_id,
        "hypernym": c.hypernym.value if c.hypernym else None,
        "hyponym": c.hyponym,
        "text_content": c.text_content,
        "class_uri": c.class_uri,
        "recognition_confidence": c.recognition_confidence,
        "provenance": _provenance_to_dict(c.provenance),
    }


def _relationship_claim_to_dict(c) -> dict:
    """Serialize one relationship claim with its subject/predicate/object."""
    predicate = None
    if c.predicate:
        predicate = {
            "uri": c.predicate.uri,
            "label": c.predicate.label,
            "direction": c.predicate.direction,
        }
    return {
        "claim_id": c.claim_id,
        "relationship_hypernym": c.relationship_hypernym.value if c.relationship_hypernym else None,
        "relationship_hyponym": c.relationship_hyponym,
        "subject": _entity_ref_to_dict(c.subject),
        "predicate": predicate,
        "object": _entity_ref_to_dict(c.object),
        "extraction_confidence": c.extraction_confidence,
        "text_content": c.text_content,
    }


def _aggregate_claim_to_dict(c) -> dict:
    """Serialize one aggregate claim (only path/confidence of provenance)."""
    provenance = None
    if c.provenance:
        provenance = {
            "path": c.provenance.path,
            "confidence": c.provenance.confidence,
        }
    return {
        "claim_id": c.claim_id,
        "claim_type": c.claim_type,
        "claim_value": c.claim_value,
        "text_content": c.text_content,
        "provenance": provenance,
    }


def _layout_claim_to_dict(c) -> dict:
    """Serialize one layout claim; text is truncated to 100 chars."""
    return {
        "claim_id": c.claim_id,
        "region": c.region.value if c.region else None,
        "semantic_role": c.semantic_role.value if c.semantic_role else None,
        "xpath": c.xpath,
        # Layout text can be very long; keep only a preview for the JSON dump.
        "text_content": c.text_content[:100] if c.text_content else None,
    }


def _image_claim_to_dict(c) -> dict:
    """Serialize one image claim."""
    return {
        "image_url": c.image_url,
        "description": c.description,
        "detected_entities": c.detected_entities,
        "image_type": c.image_type,
        "heritage_relevance": c.heritage_relevance,
    }


# String annotation: avoids a hard dependency on AnnotationSession being
# resolvable at function-definition time (behavior for callers is unchanged).
def session_to_dict(session: "AnnotationSession") -> dict:
    """Convert AnnotationSession to serializable dict.

    Flattens every claim list (entity, relationship, aggregate, layout,
    image) into plain dicts suitable for ``json.dump``.
    """
    return {
        "session_id": session.session_id,
        "source_url": session.source_url,
        "source_file": session.source_file,
        "started_at": session.started_at,
        "completed_at": session.completed_at,
        "entity_claims": [_entity_claim_to_dict(c) for c in session.entity_claims],
        "relationship_claims": [_relationship_claim_to_dict(c) for c in session.relationship_claims],
        "aggregate_claims": [_aggregate_claim_to_dict(c) for c in session.aggregate_claims],
        "layout_claims": [_layout_claim_to_dict(c) for c in session.layout_claims],
        "image_claims": [_image_claim_to_dict(c) for c in session.image_claims],
        "errors": session.errors,
        "config": session.config,
    }
# String annotation: avoids a hard dependency on AnnotationSession being
# resolvable at function-definition time (behavior for callers is unchanged).
def generate_statistics(session: "AnnotationSession") -> dict:
    """Generate extraction statistics.

    Returns counts per claim category, frequency tables of entity and
    relationship type codes, and text/confidence summaries for heritage
    institutions, persons, and locations.
    """
    # Frequency of entity/relationship type codes; claims without a code
    # are bucketed under "unknown". Counter preserves first-seen order,
    # matching the previous manual-dict behavior.
    entity_type_counts = dict(
        Counter(c.hyponym or "unknown" for c in session.entity_claims)
    )
    rel_type_counts = dict(
        Counter(c.relationship_hyponym or "unknown" for c in session.relationship_claims)
    )

    def _entities_with_prefix(prefix: str) -> list:
        """Entity claims whose GLAM-NER hyponym code starts with *prefix*."""
        return [
            c for c in session.entity_claims
            if c.hyponym and c.hyponym.startswith(prefix)
        ]

    heritage_institutions = _entities_with_prefix("GRP.HER")  # heritage orgs
    persons = _entities_with_prefix("AGT.PER")  # speakers, panelists
    locations = _entities_with_prefix("TOP")  # toponyms / places

    def _summaries(claims: list) -> list:
        """Compact text+confidence view of a list of entity claims."""
        return [
            {"text": c.text_content, "confidence": c.recognition_confidence}
            for c in claims
        ]

    return {
        "total_entities": len(session.entity_claims),
        "total_relationships": len(session.relationship_claims),
        "total_aggregate_claims": len(session.aggregate_claims),
        "total_layout_claims": len(session.layout_claims),
        "total_image_claims": len(session.image_claims),
        "entity_type_counts": entity_type_counts,
        "relationship_type_counts": rel_type_counts,
        "heritage_institutions_count": len(heritage_institutions),
        "persons_count": len(persons),
        "locations_count": len(locations),
        "heritage_institutions": _summaries(heritage_institutions),
        "persons": _summaries(persons[:30]),  # Limit to first 30
        "locations": _summaries(locations),
        "errors": session.errors,
    }
async def main():
    """Run LLM extraction on archiveslab content.

    Loads the pre-crawled archiveslab page (HTML preferred, markdown as
    fallback), runs the LLM annotator over it, prints a human-readable
    summary, and writes full results plus summary statistics as JSON.

    Returns:
        0 on success, 1 on annotator-creation or annotation failure
        (used as the process exit code by the ``__main__`` guard).
    """
    # Paths — all relative to the repository root (parent of scripts/).
    base_dir = Path(__file__).parent.parent
    content_path = base_dir / "data/extracted/archiveslab/archiveslab.org/content.md"
    html_path = base_dir / "data/extracted/archiveslab/archiveslab.org/rendered.html"
    output_dir = base_dir / "data/extracted/archiveslab"
    # Load content
    print(f"Loading content from {content_path}")
    # Use HTML if available (has XPath), otherwise markdown
    if html_path.exists():
        print(f"Using HTML file for XPath provenance: {html_path}")
        source_file = html_path
        source_url = "https://www.archiveslab.org/events/resilient-communities-resilient-archives/english-program-resilient-communities-resilient-archives"
    else:
        source_file = content_path
        source_url = "https://www.archiveslab.org/events/resilient-communities-resilient-archives/english-program-resilient-communities-resilient-archives"
    # Create annotator. create_llm_annotator raises ValueError when the
    # provider credentials are missing — presumably ZAI_API_TOKEN; see hint.
    print("Creating LLM annotator (Z.AI provider)...")
    try:
        annotator = create_llm_annotator(
            provider="zai",
            model="glm-4.6",
            enable_fallback=True,
            max_retries=3,
        )
    except ValueError as e:
        print(f"Error creating annotator: {e}")
        print("Make sure ZAI_API_TOKEN environment variable is set")
        return 1
    # Run annotation — this performs the actual (network) LLM inference.
    print("Running LLM annotation (this may take a minute)...")
    print(" - Extracting entities (GRP.HER, AGT.PER, TOP.*, etc.)")
    print(" - Extracting relationships (REL.SPA.LOC, REL.SOC.*, REL.EVT.*, etc.)")
    print(" - Extracting aggregate claims (full_name, email, website, etc.)")
    try:
        session = await annotator.annotate(
            document=source_file,
            source_url=source_url,
        )
    except Exception as e:
        # Broad catch is deliberate here: any failure in the remote call
        # should be reported with a traceback rather than crash the script.
        print(f"Annotation failed: {e}")
        import traceback
        traceback.print_exc()
        return 1
    # Generate statistics and print a console summary (top-N per category).
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)
    stats = generate_statistics(session)
    print(f"\nTotal entities: {stats['total_entities']}")
    print(f"Total relationships: {stats['total_relationships']}")
    print(f"Total aggregate claims: {stats['total_aggregate_claims']}")
    print(f"Total layout claims: {stats['total_layout_claims']}")
    print(f"Total image claims: {stats['total_image_claims']}")
    print(f"\nHeritage institutions found: {stats['heritage_institutions_count']}")
    for inst in stats['heritage_institutions'][:10]:
        print(f" - {inst['text']} (conf: {inst['confidence']:.2f})")
    print(f"\nPersons (speakers/panelists) found: {stats['persons_count']}")
    for person in stats['persons'][:10]:
        print(f" - {person['text']} (conf: {person['confidence']:.2f})")
    print(f"\nLocations found: {stats['locations_count']}")
    for loc in stats['locations'][:10]:
        print(f" - {loc['text']} (conf: {loc['confidence']:.2f})")
    print("\nEntity type distribution:")
    # Sort descending by count; show only the most frequent types.
    for etype, count in sorted(stats['entity_type_counts'].items(), key=lambda x: -x[1])[:15]:
        print(f" {etype}: {count}")
    print("\nRelationship type distribution:")
    for rtype, count in sorted(stats['relationship_type_counts'].items(), key=lambda x: -x[1])[:10]:
        print(f" {rtype}: {count}")
    if stats['errors']:
        print(f"\nErrors ({len(stats['errors'])}):")
        for err in stats['errors'][:5]:
            print(f" - {err[:100]}...")
    # Save full results (statistics + serialized session) as JSON.
    output_file = output_dir / "archiveslab_llm_extraction.json"
    print(f"\nSaving results to {output_file}")
    result = {
        "extraction_method": "LLM (GLAM-NER v1.7.0)",
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "source_url": source_url,
        "provider": "zai",
        "model": "glm-4.6",
        "statistics": stats,
        "session": session_to_dict(session),
    }
    with open(output_file, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII entity names human-readable.
        json.dump(result, f, indent=2, ensure_ascii=False)
    print(f"Extraction complete! Results saved to {output_file}")
    # Also save a summary statistics file (smaller, easier to inspect).
    stats_file = output_dir / "archiveslab_llm_stats.json"
    with open(stats_file, "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"Statistics saved to {stats_file}")
    return 0
if __name__ == "__main__":
    # asyncio.run drives the async entry point; main()'s return value
    # becomes the process exit status.
    sys.exit(asyncio.run(main()))