#!/usr/bin/env python3
"""LLM-based extraction for Archives Lab content.

Uses LLMAnnotator with GLAM-NER v1.7.0 for comprehensive entity and
relationship extraction. This replaces the regex-based extraction with
generative LLM inference.

Usage:
    cd /Users/kempersc/apps/glam
    PYTHONPATH=src python scripts/llm_extract_archiveslab.py
"""

import asyncio
import json
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

# Add src to path for imports so the script runs without installing the package.
src_path = Path(__file__).parent.parent / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Project imports must come after the sys.path shim above.
try:
    from glam_extractor.annotators.llm_annotator import (
        LLMAnnotator,
        LLMAnnotatorConfig,
        LLMProvider,
        create_llm_annotator,
    )
    from glam_extractor.annotators.base import AnnotationSession
except ImportError as e:
    print(f"Import error: {e}")
    print("Make sure you're running from the glam project root with PYTHONPATH=src")
    sys.exit(1)

# Canonical URL of the extracted event page, used for provenance in the output.
# (Previously duplicated verbatim in both branches of main(); hoisted here.)
SOURCE_URL = (
    "https://www.archiveslab.org/events/resilient-communities-resilient-archives/"
    "english-program-resilient-communities-resilient-archives"
)


def load_markdown_content(md_path: Path) -> str:
    """Load markdown content from file.

    Args:
        md_path: Path to the markdown file.

    Returns:
        The file's full text content (UTF-8 decoded).
    """
    return md_path.read_text(encoding="utf-8")


def session_to_dict(session: AnnotationSession) -> dict:
    """Convert an AnnotationSession to a JSON-serializable dict.

    Flattens every claim list (entity, relationship, aggregate, layout,
    image) into plain dicts; enum-valued fields are serialized via
    ``.value`` and optional sub-objects (provenance, subject/predicate/
    object) become ``None`` when absent.

    Args:
        session: The completed annotation session to serialize.

    Returns:
        A nested dict safe to pass to ``json.dump``.
    """
    return {
        "session_id": session.session_id,
        "source_url": session.source_url,
        "source_file": session.source_file,
        "started_at": session.started_at,
        "completed_at": session.completed_at,
        "entity_claims": [
            {
                "claim_id": c.claim_id,
                "hypernym": c.hypernym.value if c.hypernym else None,
                "hyponym": c.hyponym,
                "text_content": c.text_content,
                "class_uri": c.class_uri,
                "recognition_confidence": c.recognition_confidence,
                "provenance": {
                    "namespace": c.provenance.namespace,
                    "path": c.provenance.path,
                    "timestamp": c.provenance.timestamp,
                    "agent": c.provenance.agent,
                    "context_convention": c.provenance.context_convention,
                    "confidence": c.provenance.confidence,
                }
                if c.provenance
                else None,
            }
            for c in session.entity_claims
        ],
        "relationship_claims": [
            {
                "claim_id": c.claim_id,
                "relationship_hypernym": c.relationship_hypernym.value
                if c.relationship_hypernym
                else None,
                "relationship_hyponym": c.relationship_hyponym,
                "subject": {
                    "entity_id": c.subject.entity_id,
                    "entity_type": c.subject.entity_type,
                    "span_text": c.subject.span_text,
                    "uri": c.subject.uri,
                }
                if c.subject
                else None,
                "predicate": {
                    "uri": c.predicate.uri,
                    "label": c.predicate.label,
                    "direction": c.predicate.direction,
                }
                if c.predicate
                else None,
                "object": {
                    "entity_id": c.object.entity_id,
                    "entity_type": c.object.entity_type,
                    "span_text": c.object.span_text,
                    "uri": c.object.uri,
                }
                if c.object
                else None,
                "extraction_confidence": c.extraction_confidence,
                "text_content": c.text_content,
            }
            for c in session.relationship_claims
        ],
        "aggregate_claims": [
            {
                "claim_id": c.claim_id,
                "claim_type": c.claim_type,
                "claim_value": c.claim_value,
                "text_content": c.text_content,
                "provenance": {
                    "path": c.provenance.path,
                    "confidence": c.provenance.confidence,
                }
                if c.provenance
                else None,
            }
            for c in session.aggregate_claims
        ],
        "layout_claims": [
            {
                "claim_id": c.claim_id,
                "region": c.region.value if c.region else None,
                "semantic_role": c.semantic_role.value if c.semantic_role else None,
                "xpath": c.xpath,
                # Truncate layout text to keep the output file small.
                "text_content": c.text_content[:100] if c.text_content else None,
            }
            for c in session.layout_claims
        ],
        "image_claims": [
            {
                "image_url": c.image_url,
                "description": c.description,
                "detected_entities": c.detected_entities,
                "image_type": c.image_type,
                "heritage_relevance": c.heritage_relevance,
            }
            for c in session.image_claims
        ],
        "errors": session.errors,
        "config": session.config,
    }


def generate_statistics(session: AnnotationSession) -> dict:
    """Generate extraction statistics from an annotation session.

    Computes per-type frequency counts and pulls out three entity
    categories by GLAM-NER hyponym prefix: heritage institutions
    (``GRP.HER``), persons (``AGT.PER``), and locations (``TOP``).

    Args:
        session: The completed annotation session to summarize.

    Returns:
        A JSON-serializable dict of totals, type distributions, and
        selected entity listings (persons capped at 30 entries).
    """
    # Frequency of entity and relationship types (Counter == dict.get loop).
    entity_type_counts = Counter(
        c.hyponym or "unknown" for c in session.entity_claims
    )
    rel_type_counts = Counter(
        c.relationship_hyponym or "unknown" for c in session.relationship_claims
    )

    # Identify heritage institutions
    heritage_institutions = [
        c
        for c in session.entity_claims
        if c.hyponym and c.hyponym.startswith("GRP.HER")
    ]
    # Identify persons (speakers, panelists)
    persons = [
        c
        for c in session.entity_claims
        if c.hyponym and c.hyponym.startswith("AGT.PER")
    ]
    # Identify locations
    locations = [
        c for c in session.entity_claims if c.hyponym and c.hyponym.startswith("TOP")
    ]

    return {
        "total_entities": len(session.entity_claims),
        "total_relationships": len(session.relationship_claims),
        "total_aggregate_claims": len(session.aggregate_claims),
        "total_layout_claims": len(session.layout_claims),
        "total_image_claims": len(session.image_claims),
        "entity_type_counts": dict(entity_type_counts),
        "relationship_type_counts": dict(rel_type_counts),
        "heritage_institutions_count": len(heritage_institutions),
        "persons_count": len(persons),
        "locations_count": len(locations),
        "heritage_institutions": [
            {"text": c.text_content, "confidence": c.recognition_confidence}
            for c in heritage_institutions
        ],
        "persons": [
            {"text": c.text_content, "confidence": c.recognition_confidence}
            for c in persons[:30]  # Limit to first 30
        ],
        "locations": [
            {"text": c.text_content, "confidence": c.recognition_confidence}
            for c in locations
        ],
        "errors": session.errors,
    }


async def main() -> int:
    """Run LLM extraction on archiveslab content.

    Returns:
        Process exit code: 0 on success, 1 on annotator-creation or
        annotation failure.
    """
    # Paths
    base_dir = Path(__file__).parent.parent
    content_path = base_dir / "data/extracted/archiveslab/archiveslab.org/content.md"
    html_path = base_dir / "data/extracted/archiveslab/archiveslab.org/rendered.html"
    output_dir = base_dir / "data/extracted/archiveslab"

    # Load content
    print(f"Loading content from {content_path}")

    # Use HTML if available (has XPath), otherwise markdown.
    if html_path.exists():
        print(f"Using HTML file for XPath provenance: {html_path}")
        source_file = html_path
    else:
        source_file = content_path
    source_url = SOURCE_URL

    # Create annotator
    print("Creating LLM annotator (Z.AI provider)...")
    try:
        annotator = create_llm_annotator(
            provider="zai",
            model="glm-4.6",
            enable_fallback=True,
            max_retries=3,
        )
    except ValueError as e:
        print(f"Error creating annotator: {e}")
        print("Make sure ZAI_API_TOKEN environment variable is set")
        return 1

    # Run annotation
    print("Running LLM annotation (this may take a minute)...")
    print("  - Extracting entities (GRP.HER, AGT.PER, TOP.*, etc.)")
    print("  - Extracting relationships (REL.SPA.LOC, REL.SOC.*, REL.EVT.*, etc.)")
    print("  - Extracting aggregate claims (full_name, email, website, etc.)")
    try:
        session = await annotator.annotate(
            document=source_file,
            source_url=source_url,
        )
    except Exception as e:
        # Broad catch is deliberate at this top-level boundary: report and exit.
        print(f"Annotation failed: {e}")
        import traceback

        traceback.print_exc()
        return 1

    # Generate statistics
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)

    stats = generate_statistics(session)

    print(f"\nTotal entities: {stats['total_entities']}")
    print(f"Total relationships: {stats['total_relationships']}")
    print(f"Total aggregate claims: {stats['total_aggregate_claims']}")
    print(f"Total layout claims: {stats['total_layout_claims']}")
    print(f"Total image claims: {stats['total_image_claims']}")

    print(f"\nHeritage institutions found: {stats['heritage_institutions_count']}")
    for inst in stats['heritage_institutions'][:10]:
        print(f"  - {inst['text']} (conf: {inst['confidence']:.2f})")

    print(f"\nPersons (speakers/panelists) found: {stats['persons_count']}")
    for person in stats['persons'][:10]:
        print(f"  - {person['text']} (conf: {person['confidence']:.2f})")

    print(f"\nLocations found: {stats['locations_count']}")
    for loc in stats['locations'][:10]:
        print(f"  - {loc['text']} (conf: {loc['confidence']:.2f})")

    print("\nEntity type distribution:")
    for etype, count in sorted(
        stats['entity_type_counts'].items(), key=lambda x: -x[1]
    )[:15]:
        print(f"  {etype}: {count}")

    print("\nRelationship type distribution:")
    for rtype, count in sorted(
        stats['relationship_type_counts'].items(), key=lambda x: -x[1]
    )[:10]:
        print(f"  {rtype}: {count}")

    if stats['errors']:
        print(f"\nErrors ({len(stats['errors'])}):")
        for err in stats['errors'][:5]:
            print(f"  - {err[:100]}...")

    # Save results
    output_file = output_dir / "archiveslab_llm_extraction.json"
    print(f"\nSaving results to {output_file}")

    result = {
        "extraction_method": "LLM (GLAM-NER v1.7.0)",
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "source_url": source_url,
        "provider": "zai",
        "model": "glm-4.6",
        "statistics": stats,
        "session": session_to_dict(session),
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    print(f"Extraction complete! Results saved to {output_file}")

    # Also save a summary statistics file
    stats_file = output_dir / "archiveslab_llm_stats.json"
    with open(stats_file, "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"Statistics saved to {stats_file}")

    return 0


if __name__ == "__main__":
    exit_code = asyncio.run(main())
    sys.exit(exit_code)