#!/usr/bin/env python3
"""
Test Schema-Driven LLM Annotation.

Tests the GLiNER2-style schema builder and LLM annotator integration.

The module can be collected by pytest or executed directly as a script
(``main`` runs every test in order). The two LLM-backed tests are skipped
when the annotator cannot be constructed.
"""

import asyncio
import sys
from pathlib import Path

import pytest

# Add src directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from dotenv import load_dotenv  # noqa: E402

# Load .env before importing the annotator modules, as the original file does.
load_dotenv()

from glam_extractor.annotators.llm_annotator import (  # noqa: E402
    LLMAnnotator,
    LLMAnnotatorConfig,
    LLMProvider,
)
from glam_extractor.annotators.schema_builder import (  # noqa: E402
    GLAMSchema,
    FieldSpec,
    heritage_custodian_schema,
)

# Sample HTML for testing
# NOTE(review): this sample contains only text, no tags -- the markup looks
# like it was stripped at some point; confirm against the intended fixture.
SAMPLE_HTML = """ Historische Kring Elden - Home

Historische Kring Elden

Welkom bij de Historische Kring Elden

De Historische Kring Elden is opgericht in 1985 en houdt zich bezig met het verzamelen en bewaren van historisch materiaal over Elden en omgeving.

Wij hebben een collectie van meer dan 5.000 foto's, documenten en objecten.

Contact

Email: info@historischekringelden.nl

Adres: Dorpsstraat 12, 6832 AA Elden

Telefoon: 026-1234567

Volg ons

"""

# Horizontal rule used by every banner printed in this module.
_RULE = "=" * 60


def _print_banner(title, *, trailing_blank=True):
    """Print *title* framed by two rules, preceded by a blank line.

    Args:
        title: Text printed between the two rules.
        trailing_blank: When true, a blank line follows the closing rule.
    """
    print("\n" + _RULE)
    print(title)
    print(_RULE + "\n" if trailing_blank else _RULE)


def _create_zai_annotator():
    """Build the ZAI ``glm-4.6`` annotator or skip the calling test.

    Returns:
        A configured ``LLMAnnotator``.

    Raises:
        pytest.skip.Exception: If ``LLMAnnotator`` raises ``ValueError``
            while being constructed (presumably a missing ``ZAI_API_TOKEN``;
            the notice printed below says so). Skipping makes pytest report
            the test as SKIPPED instead of a misleading PASSED.
    """
    try:
        return LLMAnnotator(LLMAnnotatorConfig(
            provider=LLMProvider.ZAI,
            model="glm-4.6",
        ))
    except ValueError as e:
        print(f"\n ⚠️ Skipping LLM test: {e}")
        print(" Set ZAI_API_TOKEN in .env to run this test\n")
        pytest.skip(f"LLM annotator unavailable: {e}")


def test_gliner2_field_syntax():
    """Test GLiNER2-style field specification parsing."""
    _print_banner("TEST 1: GLiNER2-style Field Syntax Parsing")

    # (spec string, expected attributes); only the keys present are checked.
    test_cases = [
        ("name::str::Institution name",
         {"name": "name", "dtype": "str", "description": "Institution name"}),
        ("type::[MUSEUM|ARCHIVE|LIBRARY]::str::Type",
         {"name": "type", "dtype": "str", "choices": ["MUSEUM", "ARCHIVE", "LIBRARY"]}),
        ("features::[indoor|outdoor]::list::Available features",
         {"name": "features", "dtype": "list", "choices": ["indoor", "outdoor"]}),
        ("price::str::Monthly cost",
         {"name": "price", "dtype": "str", "description": "Monthly cost"}),
        ("simple_field",
         {"name": "simple_field", "dtype": "str"}),
    ]

    for spec_string, expected in test_cases:
        print(f" Input: '{spec_string}'")
        parsed = FieldSpec.from_gliner2_syntax(spec_string)
        print(f" → name: {parsed.name}")
        print(f" → dtype: {parsed.dtype}")
        print(f" → choices: {parsed.choices}")
        print(f" → description: {parsed.description}")

        # Verify
        assert parsed.name == expected["name"], (
            f"Name mismatch: {parsed.name} != {expected['name']}"
        )
        assert parsed.dtype == expected["dtype"], (
            f"Dtype mismatch: {parsed.dtype} != {expected['dtype']}"
        )
        if "choices" in expected:
            assert parsed.choices == expected["choices"], "Choices mismatch"
        # The table lists expected descriptions; check them so that
        # description parsing is actually covered by this test.
        if "description" in expected:
            assert parsed.description == expected["description"], (
                f"Description mismatch: {parsed.description} != {expected['description']}"
            )
        print(" ✓ PASSED\n")

    print("All field syntax tests passed!\n")


def test_schema_builder():
    """Test schema builder with fluent API."""
    _print_banner("TEST 2: Schema Builder Fluent API")

    # Build schema using GLiNER2-style syntax
    schema = (
        GLAMSchema("test_institution")
        .entities("GRP", "TOP", "AGT")
        .classification("type", choices=["MUSEUM", "ARCHIVE", "COLLECTING_SOCIETY"])
        .structure()
        .field("name", dtype="str", required=True, description="Institution name")
        .field("email", dtype="str", pattern=r"^[^@]+@[^@]+\.[^@]+$")
        .field("city", dtype="str")
        .build()
    )

    print(f" Schema name: {schema.name}")
    print(f" Entity types: {schema.entity_types}")
    print(f" Classifications: {list(schema.classifications.keys())}")
    print(f" Fields: {[f.name for f in schema.fields]}")

    # Generate LLM prompt
    prompt = schema.to_llm_prompt()
    print(f"\n Prompt length: {len(prompt)} chars")
    print(f" Prompt preview:\n{prompt[:500]}...\n")

    # Generate JSON Schema
    json_schema = schema.to_json_schema()
    print(f" JSON Schema properties: {list(json_schema['properties'].keys())}")
    print(f" Required fields: {json_schema['required']}")

    print("\n ✓ Schema builder test passed!\n")


def test_json_schema_generation():
    """Test JSON Schema generation from FieldSpec."""
    _print_banner("TEST 3: JSON Schema Generation from FieldSpec")

    # Test various field types
    test_fields = [
        FieldSpec.from_gliner2_syntax("name::str::Institution name"),
        FieldSpec.from_gliner2_syntax("type::[MUSEUM|ARCHIVE]::str::Type"),
        FieldSpec.from_gliner2_syntax("tags::[indoor|outdoor|accessible]::list::Features"),
        FieldSpec(name="count", dtype="int", description="Item count"),
        FieldSpec(name="rating", dtype="float", description="Average rating"),
        FieldSpec(name="is_open", dtype="bool", description="Currently open"),
        FieldSpec(name="founded", dtype="date", description="Founding date"),
    ]

    for field in test_fields:
        prop = field.to_json_schema_property()
        print(f" {field.name} ({field.dtype}):")
        print(f" JSON Schema: {prop}")

    print("\n ✓ JSON Schema generation test passed!\n")


@pytest.mark.asyncio
async def test_schema_driven_annotation():
    """Test schema-driven LLM annotation.

    Skipped when the annotator cannot be constructed.
    """
    _print_banner("TEST 4: Schema-Driven LLM Annotation")

    # Create custom schema
    schema = (
        GLAMSchema("historical_society")
        .entities("GRP", "TOP", "TMP", "APP", "QTY")
        .classification(
            "institution_type",
            choices=["MUSEUM", "ARCHIVE", "COLLECTING_SOCIETY", "HISTORICAL_SOCIETY"],
        )
        .structure()
        .field("full_name", dtype="str", required=True, description="Official institution name")
        .field("description", dtype="str", description="Brief description")
        .field("email", dtype="str", description="Contact email")
        .field("phone", dtype="str", description="Contact phone")
        .field("address", dtype="str", description="Physical address")
        .field("city", dtype="str", description="City")
        .field("founding_year", dtype="str", description="Year founded")
        .field("collection_size", dtype="str", description="Size of collection")
        .field("kvk_number", dtype="str", description="Chamber of Commerce number")
        .field("social_facebook", dtype="str", description="Facebook URL")
        .field("social_instagram", dtype="str", description="Instagram URL")
        .build()
    )

    print(f" Schema: {schema.name}")
    print(f" Fields: {[f.name for f in schema.fields]}")

    # Create annotator (skips this test if it cannot be built)
    annotator = _create_zai_annotator()

    print("\n Calling LLM with schema-driven prompt...")

    # Run annotation
    session, structured_data = await annotator.annotate_with_schema(
        document=SAMPLE_HTML,
        schema=schema,
        source_url="https://historischekringelden.nl",
    )

    print(f"\n Session ID: {session.session_id}")
    print(f" Entities: {len(session.entity_claims)}")
    print(f" Layout regions: {len(session.layout_claims)}")
    print(f" Claims: {len(session.aggregate_claims)}")
    print(f" Errors: {session.errors}")

    print("\n Structured Data:")
    for key, value in structured_data.items():
        # Keys starting with an underscore are left out of the report.
        if not key.startswith('_'):
            print(f" {key}: {value}")

    # Show some entities
    if session.entity_claims:
        print("\n Sample Entities:")
        for claim in session.entity_claims[:5]:
            # Use .value when the hypernym exposes one, else its str() form.
            if hasattr(claim.hypernym, 'value'):
                hypernym = claim.hypernym.value
            else:
                hypernym = str(claim.hypernym)
            print(f" [{hypernym}] {claim.text_content} (conf: {claim.recognition_confidence:.2f})")

    print("\n ✓ Schema-driven annotation test passed!\n")


@pytest.mark.asyncio
async def test_quick_extraction():
    """Test the quick extraction method with GLiNER2 syntax.

    Skipped when the annotator cannot be constructed.
    """
    _print_banner("TEST 5: Quick Structured Extraction")

    # Create annotator (skips this test if it cannot be built)
    annotator = _create_zai_annotator()

    # Define fields using GLiNER2 syntax
    fields = [
        "name::str::Institution name",
        "email::str::Contact email",
        "phone::str::Phone number",
        "city::str::City",
        "type::[MUSEUM|ARCHIVE|HISTORICAL_SOCIETY]::str::Institution type",
    ]

    print(" Fields to extract:")
    for f in fields:
        print(f" • {f}")

    print("\n Running quick extraction...")

    result = await annotator.extract_structured(
        document=SAMPLE_HTML,
        fields=fields,
        source_url="https://historischekringelden.nl",
    )

    print("\n Results:")
    for key, value in result.items():
        # Keys starting with an underscore are left out of the report.
        if not key.startswith('_'):
            print(f" {key}: {value}")

    print("\n ✓ Quick extraction test passed!\n")


async def main():
    """Run all tests."""
    _print_banner("GLAM Schema-Driven Annotation Tests", trailing_blank=False)

    # Run sync tests
    test_gliner2_field_syntax()
    test_schema_builder()
    test_json_schema_generation()

    # Run async tests (require API key)
    for llm_test in (test_schema_driven_annotation, test_quick_extraction):
        try:
            await llm_test()
        except pytest.skip.Exception:
            # Outside pytest nothing handles the skip outcome; the skip
            # notice has already been printed, so move on to the next test.
            continue

    _print_banner("ALL TESTS COMPLETED")


if __name__ == "__main__":
    asyncio.run(main())