#!/usr/bin/env python3
"""
Test script for WebClaim Validator.

Tests the validator against real NDE entry files with web_claims.
"""

import sys
from pathlib import Path

# Make the packages under <project root>/src importable when this script is
# run directly. The project root is taken to be the parent of the directory
# that contains this file.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / "src"))

import yaml
from glam_extractor.annotators.webclaim_validator import (
    WebClaimValidator,
    WebClaim,
    ClaimType,
    ValidationStatus,
    ValidationResult,
    batch_validate_entry_files,
)


def test_single_claim_validation():
    """Validate the first web claim of a sample NDE entry file.

    Loads the entry YAML, builds a ``WebClaim`` from the first item under
    ``web_claims.claims``, validates it with a ``WebClaimValidator`` whose
    ``html_base_path`` is the entries directory, and prints the outcome.

    Returns:
        bool: True if the claim validates as ``ValidationStatus.VALID``;
        False if it does not, or if the entry contains no web claims.
    """
    print("=" * 60)
    print("TEST: Single Claim Validation")
    print("=" * 60)

    # Entry file path
    entry_path = project_root / "data/nde/enriched/entries/1667_historische_kring_wierden.yaml"
    web_base = project_root / "data/nde/enriched/entries"

    # Load the entry
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    # safe_load() yields None for an empty document, and a key written without
    # a value ("web_claims:") also loads as None. Normalise both so the
    # "no claims" branch below is reached instead of an AttributeError.
    web_claims = (entry or {}).get('web_claims') or {}
    claims_data = web_claims.get('claims') or []
    if not claims_data:
        print("No web_claims found in entry!")
        return False

    # Test first claim (org_name with XPath)
    claim_data = claims_data[0]
    print("\nTesting claim:")
    print(f"  Type: {claim_data.get('claim_type')}")
    print(f"  Value: {claim_data.get('claim_value')}")
    print(f"  XPath: {claim_data.get('xpath')}")
    print(f"  HTML File: {claim_data.get('html_file')}")

    # Create WebClaim object (use 'full_name' as fallback for custom claim types)
    claim_type_str = claim_data.get('claim_type', 'full_name')
    try:
        claim_type = ClaimType(claim_type_str)
    except ValueError:
        # Custom claim type - use FULL_NAME as fallback for validation
        print(f"  (Custom claim type '{claim_type_str}', using FULL_NAME for validation)")
        claim_type = ClaimType.FULL_NAME

    claim = WebClaim(
        claim_type=claim_type,
        claim_value=claim_data.get('claim_value', ''),
        source_url=claim_data.get('source_url', ''),
        retrieved_on=claim_data.get('retrieved_on', ''),
        xpath=claim_data.get('xpath', ''),
        html_file=claim_data.get('html_file', ''),
        xpath_match_score=claim_data.get('xpath_match_score', 0.0),
    )

    # Create validator with proper base path
    validator = WebClaimValidator(html_base_path=web_base)

    # Validate
    result = validator.validate_claim(claim)

    print("\nValidation Result:")
    print(f"  Status: {result.status.value}")
    print(f"  Computed Match Score: {result.computed_match_score:.2f}")
    if result.matched_text:
        # Only mark the preview as truncated when text was actually cut off.
        preview = result.matched_text
        if len(preview) > 100:
            preview = preview[:100] + "..."
        print(f"  Matched Text: {preview}")
    if result.error_message:
        print(f"  Error: {result.error_message}")

    return result.status == ValidationStatus.VALID


def test_all_claims_in_entry():
    """Validate every web claim of a sample NDE entry file and summarise.

    Loads the entry YAML, converts each item under ``web_claims.claims`` into
    a ``WebClaim`` (unknown claim types fall back to ``ClaimType.FULL_NAME``),
    validates them in one ``validate_claims`` call, and prints a per-status
    count plus details for up to five non-valid claims.

    Returns:
        bool: True if at least one claim validated as
        ``ValidationStatus.VALID``, False otherwise (including when the entry
        has no claims).
    """
    print("\n" + "=" * 60)
    print("TEST: All Claims in Entry File")
    print("=" * 60)

    entry_path = project_root / "data/nde/enriched/entries/1667_historische_kring_wierden.yaml"
    web_base = project_root / "data/nde/enriched/entries"

    # Load the entry
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    # safe_load() yields None for an empty document, and a key written without
    # a value also loads as None; treat both as "no claims" rather than
    # failing with AttributeError / TypeError.
    web_claims = (entry or {}).get('web_claims') or {}
    claims_data = web_claims.get('claims') or []
    print(f"\nTotal claims in entry: {len(claims_data)}")

    # Convert to WebClaim objects
    claims = []
    for cd in claims_data:
        claim_type_str = cd.get('claim_type', 'full_name')
        try:
            claim_type = ClaimType(claim_type_str)
        except ValueError:
            claim_type = ClaimType.FULL_NAME  # Fallback for custom types

        claims.append(WebClaim(
            claim_type=claim_type,
            claim_value=cd.get('claim_value', ''),
            source_url=cd.get('source_url', ''),
            retrieved_on=cd.get('retrieved_on', ''),
            xpath=cd.get('xpath', ''),
            html_file=cd.get('html_file', ''),
            xpath_match_score=cd.get('xpath_match_score', 0.0),
        ))

    # Validate all
    validator = WebClaimValidator(html_base_path=web_base)
    results = validator.validate_claims(claims)

    # Summarize results. The four statuses below are pre-seeded so they are
    # always printed, even with a zero count.
    summary = {
        ValidationStatus.VALID: 0,
        ValidationStatus.INVALID: 0,
        ValidationStatus.FABRICATED: 0,
        ValidationStatus.NEEDS_REVIEW: 0,
    }

    for result in results:
        # NOTE(review): ValidationStatus is defined elsewhere and may have
        # members beyond the four seeded above; count those too instead of
        # raising KeyError.
        summary[result.status] = summary.get(result.status, 0) + 1

    print("\nValidation Summary:")
    for status, count in summary.items():
        print(f"  {status.value}: {count}")

    # Show details for non-valid claims
    non_valid = [r for r in results if r.status != ValidationStatus.VALID]
    if non_valid:
        print(f"\nNon-valid claims ({len(non_valid)}):")
        for r in non_valid[:5]:  # Show first 5
            # YAML scalars are not necessarily strings (e.g. a bare year loads
            # as int), so stringify before slicing; add the ellipsis only
            # when the value was actually shortened.
            value = str(r.claim.claim_value)
            if len(value) > 30:
                value = value[:30] + "..."
            print(f"  - {r.claim.claim_type.value}: '{value}'")
            print(f"    Status: {r.status.value}, Score: {r.computed_match_score:.2f}")
            if r.error_message:
                print(f"    Error: {r.error_message}")

    return summary[ValidationStatus.VALID] > 0


def test_fabricated_claim_detection():
    """Check that a claim lacking an XPath is flagged as FABRICATED.

    Builds a ``WebClaim`` whose ``xpath`` is the empty string (but which
    carries a high ``xpath_match_score``), validates it with a
    default-constructed ``WebClaimValidator``, and asserts the resulting
    status.

    Returns:
        bool: Always True when the assertion holds.

    Raises:
        AssertionError: If the status is not ``ValidationStatus.FABRICATED``.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("TEST: Fabricated Claim Detection")
    print(rule)

    # An empty XPath means the claim has no provenance, i.e. it is treated
    # as fabricated regardless of the self-reported match score.
    claim_fields = {
        "claim_type": ClaimType.FULL_NAME,
        "claim_value": "Test Institution",
        "source_url": "https://example.com",
        "retrieved_on": "2025-12-01T00:00:00Z",
        "xpath": "",
        "html_file": "test.html",
        "xpath_match_score": 0.95,  # LLM might claim high confidence...
    }
    no_xpath_claim = WebClaim(**claim_fields)

    outcome = WebClaimValidator().validate_claim(no_xpath_claim)

    print("\nClaim with empty XPath:")
    print(f"  Status: {outcome.status.value}")
    print(f"  Error: {outcome.error_message}")

    # Should be detected as fabricated
    assert outcome.status == ValidationStatus.FABRICATED
    print("  ✓ Correctly identified as FABRICATED")

    return True


def test_xpath_not_found():
    """Check that a claim with a non-existent XPath is reported as INVALID.

    Builds a ``WebClaim`` pointing at the sample entry's rendered HTML file
    with an XPath that is not expected to match anything, validates it with a
    ``WebClaimValidator`` whose ``html_base_path`` is the entries directory,
    and asserts the resulting status.

    Returns:
        bool: Always True when the assertion holds.

    Raises:
        AssertionError: If the status is not ``ValidationStatus.INVALID``.
    """
    print("\n" + "=" * 60)
    print("TEST: XPath Not Found in HTML")
    print("=" * 60)

    # Only the HTML base directory is needed here; the claim is built by hand
    # rather than loaded from the entry YAML.
    web_base = project_root / "data/nde/enriched/entries"

    # Create a claim with non-existent XPath
    bad_xpath_claim = WebClaim(
        claim_type=ClaimType.FULL_NAME,
        claim_value="Test Value",
        source_url="https://historischekringwierden.nl/",
        retrieved_on="2025-12-01T00:00:00Z",
        xpath="/html/body/div[999]/nonexistent",  # Bad XPath
        html_file="web/1667/historischekringwierden.nl/rendered.html",
        xpath_match_score=1.0,
    )

    validator = WebClaimValidator(html_base_path=web_base)
    result = validator.validate_claim(bad_xpath_claim)

    print("\nClaim with invalid XPath:")
    print(f"  XPath: {bad_xpath_claim.xpath}")
    print(f"  Status: {result.status.value}")
    print(f"  Error: {result.error_message}")

    assert result.status == ValidationStatus.INVALID
    print("  ✓ Correctly identified as INVALID")

    return True


def main():
    """Execute every validator test and print a pass/fail summary.

    Each test callable is run in turn. An exception raised by a test is
    printed together with its traceback and recorded as a failure, so one
    broken test does not stop the remaining ones.

    Returns:
        bool: True when every test returned a truthy value, False otherwise.
    """
    import traceback

    rule = "=" * 60

    print("\n" + rule)
    print("WebClaim Validator Test Suite")
    print(rule)

    suite = [
        ("Single Claim Validation", test_single_claim_validation),
        ("All Claims in Entry", test_all_claims_in_entry),
        ("Fabricated Claim Detection", test_fabricated_claim_detection),
        ("XPath Not Found", test_xpath_not_found),
    ]

    outcomes = []
    for label, run_test in suite:
        try:
            ok = run_test()
        except Exception as exc:
            print(f"\n❌ Test '{label}' FAILED with exception: {exc}")
            traceback.print_exc()
            ok = False
        outcomes.append((label, ok))

    # Summary
    print("\n" + rule)
    print("TEST SUMMARY")
    print(rule)

    total = len(outcomes)
    pass_count = len([label for label, ok in outcomes if ok])

    for label, ok in outcomes:
        marker = "✓ PASSED" if ok else "❌ FAILED"
        print(f"  {marker}: {label}")

    print(f"\nTotal: {pass_count}/{total} tests passed")

    return pass_count == total


if __name__ == "__main__":
    # Translate the suite's boolean outcome into a process exit status:
    # 0 when every test passed, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)