- Introduced `llm_extract_archiveslab.py` script for entity and relationship extraction using LLMAnnotator with GLAM-NER v1.7.0.
- Replaced regex-based extraction with generative LLM inference.
- Added functions for loading markdown content, converting annotation sessions to dictionaries, and generating extraction statistics.
- Implemented comprehensive logging of extraction results, including counts of entities, relationships, and specific types such as heritage institutions and persons.
- Results and statistics are saved in JSON format for further analysis.
315 lines · 11 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
LLM-based extraction for Archives Lab content.
|
|
|
|
Uses LLMAnnotator with GLAM-NER v1.7.0 for comprehensive entity and relationship extraction.
|
|
This replaces the regex-based extraction with generative LLM inference.
|
|
|
|
Usage:
|
|
cd /Users/kempersc/apps/glam
|
|
PYTHONPATH=src python scripts/llm_extract_archiveslab.py
|
|
"""
|
|
|
|
import asyncio
import json
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
|
|
|
|
# Add src to path for imports
|
|
src_path = Path(__file__).parent.parent / "src"
|
|
if str(src_path) not in sys.path:
|
|
sys.path.insert(0, str(src_path))
|
|
|
|
# Now import from glam_extractor
|
|
try:
|
|
from glam_extractor.annotators.llm_annotator import (
|
|
LLMAnnotator,
|
|
LLMAnnotatorConfig,
|
|
LLMProvider,
|
|
create_llm_annotator,
|
|
)
|
|
from glam_extractor.annotators.base import AnnotationSession
|
|
except ImportError as e:
|
|
print(f"Import error: {e}")
|
|
print("Make sure you're running from the glam project root with PYTHONPATH=src")
|
|
sys.exit(1)
|
|
|
|
|
|
def load_markdown_content(md_path: Path) -> str:
|
|
"""Load markdown content from file."""
|
|
with open(md_path, "r", encoding="utf-8") as f:
|
|
return f.read()
|
|
|
|
|
|
def session_to_dict(session: AnnotationSession) -> dict:
|
|
"""Convert AnnotationSession to serializable dict."""
|
|
return {
|
|
"session_id": session.session_id,
|
|
"source_url": session.source_url,
|
|
"source_file": session.source_file,
|
|
"started_at": session.started_at,
|
|
"completed_at": session.completed_at,
|
|
"entity_claims": [
|
|
{
|
|
"claim_id": c.claim_id,
|
|
"hypernym": c.hypernym.value if c.hypernym else None,
|
|
"hyponym": c.hyponym,
|
|
"text_content": c.text_content,
|
|
"class_uri": c.class_uri,
|
|
"recognition_confidence": c.recognition_confidence,
|
|
"provenance": {
|
|
"namespace": c.provenance.namespace,
|
|
"path": c.provenance.path,
|
|
"timestamp": c.provenance.timestamp,
|
|
"agent": c.provenance.agent,
|
|
"context_convention": c.provenance.context_convention,
|
|
"confidence": c.provenance.confidence,
|
|
} if c.provenance else None,
|
|
}
|
|
for c in session.entity_claims
|
|
],
|
|
"relationship_claims": [
|
|
{
|
|
"claim_id": c.claim_id,
|
|
"relationship_hypernym": c.relationship_hypernym.value if c.relationship_hypernym else None,
|
|
"relationship_hyponym": c.relationship_hyponym,
|
|
"subject": {
|
|
"entity_id": c.subject.entity_id,
|
|
"entity_type": c.subject.entity_type,
|
|
"span_text": c.subject.span_text,
|
|
"uri": c.subject.uri,
|
|
} if c.subject else None,
|
|
"predicate": {
|
|
"uri": c.predicate.uri,
|
|
"label": c.predicate.label,
|
|
"direction": c.predicate.direction,
|
|
} if c.predicate else None,
|
|
"object": {
|
|
"entity_id": c.object.entity_id,
|
|
"entity_type": c.object.entity_type,
|
|
"span_text": c.object.span_text,
|
|
"uri": c.object.uri,
|
|
} if c.object else None,
|
|
"extraction_confidence": c.extraction_confidence,
|
|
"text_content": c.text_content,
|
|
}
|
|
for c in session.relationship_claims
|
|
],
|
|
"aggregate_claims": [
|
|
{
|
|
"claim_id": c.claim_id,
|
|
"claim_type": c.claim_type,
|
|
"claim_value": c.claim_value,
|
|
"text_content": c.text_content,
|
|
"provenance": {
|
|
"path": c.provenance.path,
|
|
"confidence": c.provenance.confidence,
|
|
} if c.provenance else None,
|
|
}
|
|
for c in session.aggregate_claims
|
|
],
|
|
"layout_claims": [
|
|
{
|
|
"claim_id": c.claim_id,
|
|
"region": c.region.value if c.region else None,
|
|
"semantic_role": c.semantic_role.value if c.semantic_role else None,
|
|
"xpath": c.xpath,
|
|
"text_content": c.text_content[:100] if c.text_content else None,
|
|
}
|
|
for c in session.layout_claims
|
|
],
|
|
"image_claims": [
|
|
{
|
|
"image_url": c.image_url,
|
|
"description": c.description,
|
|
"detected_entities": c.detected_entities,
|
|
"image_type": c.image_type,
|
|
"heritage_relevance": c.heritage_relevance,
|
|
}
|
|
for c in session.image_claims
|
|
],
|
|
"errors": session.errors,
|
|
"config": session.config,
|
|
}
|
|
|
|
|
|
def generate_statistics(session: AnnotationSession) -> dict:
|
|
"""Generate extraction statistics."""
|
|
# Count entity types
|
|
entity_type_counts = {}
|
|
for claim in session.entity_claims:
|
|
hyponym = claim.hyponym or "unknown"
|
|
entity_type_counts[hyponym] = entity_type_counts.get(hyponym, 0) + 1
|
|
|
|
# Count relationship types
|
|
rel_type_counts = {}
|
|
for claim in session.relationship_claims:
|
|
rel_type = claim.relationship_hyponym or "unknown"
|
|
rel_type_counts[rel_type] = rel_type_counts.get(rel_type, 0) + 1
|
|
|
|
# Identify heritage institutions
|
|
heritage_institutions = [
|
|
c for c in session.entity_claims
|
|
if c.hyponym and c.hyponym.startswith("GRP.HER")
|
|
]
|
|
|
|
# Identify persons (speakers, panelists)
|
|
persons = [
|
|
c for c in session.entity_claims
|
|
if c.hyponym and c.hyponym.startswith("AGT.PER")
|
|
]
|
|
|
|
# Identify locations
|
|
locations = [
|
|
c for c in session.entity_claims
|
|
if c.hyponym and c.hyponym.startswith("TOP")
|
|
]
|
|
|
|
return {
|
|
"total_entities": len(session.entity_claims),
|
|
"total_relationships": len(session.relationship_claims),
|
|
"total_aggregate_claims": len(session.aggregate_claims),
|
|
"total_layout_claims": len(session.layout_claims),
|
|
"total_image_claims": len(session.image_claims),
|
|
"entity_type_counts": entity_type_counts,
|
|
"relationship_type_counts": rel_type_counts,
|
|
"heritage_institutions_count": len(heritage_institutions),
|
|
"persons_count": len(persons),
|
|
"locations_count": len(locations),
|
|
"heritage_institutions": [
|
|
{"text": c.text_content, "confidence": c.recognition_confidence}
|
|
for c in heritage_institutions
|
|
],
|
|
"persons": [
|
|
{"text": c.text_content, "confidence": c.recognition_confidence}
|
|
for c in persons[:30] # Limit to first 30
|
|
],
|
|
"locations": [
|
|
{"text": c.text_content, "confidence": c.recognition_confidence}
|
|
for c in locations
|
|
],
|
|
"errors": session.errors,
|
|
}
|
|
|
|
|
|
async def main():
|
|
"""Run LLM extraction on archiveslab content."""
|
|
# Paths
|
|
base_dir = Path(__file__).parent.parent
|
|
content_path = base_dir / "data/extracted/archiveslab/archiveslab.org/content.md"
|
|
html_path = base_dir / "data/extracted/archiveslab/archiveslab.org/rendered.html"
|
|
output_dir = base_dir / "data/extracted/archiveslab"
|
|
|
|
# Load content
|
|
print(f"Loading content from {content_path}")
|
|
|
|
# Use HTML if available (has XPath), otherwise markdown
|
|
if html_path.exists():
|
|
print(f"Using HTML file for XPath provenance: {html_path}")
|
|
source_file = html_path
|
|
source_url = "https://www.archiveslab.org/events/resilient-communities-resilient-archives/english-program-resilient-communities-resilient-archives"
|
|
else:
|
|
source_file = content_path
|
|
source_url = "https://www.archiveslab.org/events/resilient-communities-resilient-archives/english-program-resilient-communities-resilient-archives"
|
|
|
|
# Create annotator
|
|
print("Creating LLM annotator (Z.AI provider)...")
|
|
try:
|
|
annotator = create_llm_annotator(
|
|
provider="zai",
|
|
model="glm-4.6",
|
|
enable_fallback=True,
|
|
max_retries=3,
|
|
)
|
|
except ValueError as e:
|
|
print(f"Error creating annotator: {e}")
|
|
print("Make sure ZAI_API_TOKEN environment variable is set")
|
|
return 1
|
|
|
|
# Run annotation
|
|
print("Running LLM annotation (this may take a minute)...")
|
|
print(" - Extracting entities (GRP.HER, AGT.PER, TOP.*, etc.)")
|
|
print(" - Extracting relationships (REL.SPA.LOC, REL.SOC.*, REL.EVT.*, etc.)")
|
|
print(" - Extracting aggregate claims (full_name, email, website, etc.)")
|
|
|
|
try:
|
|
session = await annotator.annotate(
|
|
document=source_file,
|
|
source_url=source_url,
|
|
)
|
|
except Exception as e:
|
|
print(f"Annotation failed: {e}")
|
|
import traceback
|
|
traceback.print_exc()
|
|
return 1
|
|
|
|
# Generate statistics
|
|
print("\n" + "=" * 60)
|
|
print("EXTRACTION RESULTS")
|
|
print("=" * 60)
|
|
|
|
stats = generate_statistics(session)
|
|
|
|
print(f"\nTotal entities: {stats['total_entities']}")
|
|
print(f"Total relationships: {stats['total_relationships']}")
|
|
print(f"Total aggregate claims: {stats['total_aggregate_claims']}")
|
|
print(f"Total layout claims: {stats['total_layout_claims']}")
|
|
print(f"Total image claims: {stats['total_image_claims']}")
|
|
|
|
print(f"\nHeritage institutions found: {stats['heritage_institutions_count']}")
|
|
for inst in stats['heritage_institutions'][:10]:
|
|
print(f" - {inst['text']} (conf: {inst['confidence']:.2f})")
|
|
|
|
print(f"\nPersons (speakers/panelists) found: {stats['persons_count']}")
|
|
for person in stats['persons'][:10]:
|
|
print(f" - {person['text']} (conf: {person['confidence']:.2f})")
|
|
|
|
print(f"\nLocations found: {stats['locations_count']}")
|
|
for loc in stats['locations'][:10]:
|
|
print(f" - {loc['text']} (conf: {loc['confidence']:.2f})")
|
|
|
|
print("\nEntity type distribution:")
|
|
for etype, count in sorted(stats['entity_type_counts'].items(), key=lambda x: -x[1])[:15]:
|
|
print(f" {etype}: {count}")
|
|
|
|
print("\nRelationship type distribution:")
|
|
for rtype, count in sorted(stats['relationship_type_counts'].items(), key=lambda x: -x[1])[:10]:
|
|
print(f" {rtype}: {count}")
|
|
|
|
if stats['errors']:
|
|
print(f"\nErrors ({len(stats['errors'])}):")
|
|
for err in stats['errors'][:5]:
|
|
print(f" - {err[:100]}...")
|
|
|
|
# Save results
|
|
output_file = output_dir / "archiveslab_llm_extraction.json"
|
|
print(f"\nSaving results to {output_file}")
|
|
|
|
result = {
|
|
"extraction_method": "LLM (GLAM-NER v1.7.0)",
|
|
"extraction_date": datetime.now(timezone.utc).isoformat(),
|
|
"source_url": source_url,
|
|
"provider": "zai",
|
|
"model": "glm-4.6",
|
|
"statistics": stats,
|
|
"session": session_to_dict(session),
|
|
}
|
|
|
|
with open(output_file, "w", encoding="utf-8") as f:
|
|
json.dump(result, f, indent=2, ensure_ascii=False)
|
|
|
|
print(f"Extraction complete! Results saved to {output_file}")
|
|
|
|
# Also save a summary statistics file
|
|
stats_file = output_dir / "archiveslab_llm_stats.json"
|
|
with open(stats_file, "w", encoding="utf-8") as f:
|
|
json.dump(stats, f, indent=2, ensure_ascii=False)
|
|
print(f"Statistics saved to {stats_file}")
|
|
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
exit_code = asyncio.run(main())
|
|
sys.exit(exit_code)
|