"""
Layout Annotator for GLAM documents.

Analyzes document structure and creates layout region claims
following the DOC hypernym from GLAM-NER v1.7.0 Section 15.

Regions include:
- Primary: HDR, PAR, SEN, LST, TBL
- Media: GAL, MAP, AUD, VID, EMB
- Navigation: NAV, TOC, IDX
- Front/Back matter: TTP, DED, COL, BIB, APP, GLO
- Commercial: ADV, LOG

NO COMPUTER VISION - uses text/structure patterns only.
"""

import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Set, Tuple

from .base import (
    BaseAnnotator,
    AnnotationSession,
    LayoutClaim,
    LayoutRegion,
    SemanticRole,
    Provenance,
)
from .html_parser import HTMLDocument, HTMLElement


# =============================================================================
# LAYOUT DETECTION PATTERNS
# =============================================================================

@dataclass
class LayoutPattern:
    """Rule describing how to recognise one layout region.

    The tag, class, id and text criteria are alternatives: each one that
    matches adds to the element's score in ``LayoutAnnotator._match_pattern``.
    The text-length bounds are hard constraints; an element whose text
    length falls outside them never matches this pattern.

    Tags, CSS classes and CSS ids are stored lower-cased. The annotator
    lower-cases the element's tag, classes and id before comparing, so a
    mixed-case criterion would otherwise silently never match.
    """
    # Region and semantic role assigned to a claim created from this pattern.
    region: LayoutRegion
    semantic_role: SemanticRole

    # Detection criteria (any match triggers). ``None`` means "not used"
    # and is replaced by an empty container in __post_init__.
    html_tags: Optional[Set[str]] = None
    css_classes: Optional[Set[str]] = None
    css_ids: Optional[Set[str]] = None
    text_patterns: Optional[List[str]] = None  # regex source strings

    # Additional constraints on the length of the element's text content.
    min_text_length: int = 0
    max_text_length: int = 100000

    # Added to a non-zero match score as ``priority * 0.1`` by the annotator.
    priority: int = 5

    def __post_init__(self):
        """Replace ``None`` criteria with empty containers and fold case."""
        # Building new sets also means the caller's containers are not aliased.
        self.html_tags = {tag.lower() for tag in self.html_tags or ()}
        self.css_classes = {name.lower() for name in self.css_classes or ()}
        self.css_ids = {name.lower() for name in self.css_ids or ()}
        # Regex sources are kept verbatim: case handling is the regex's job.
        if self.text_patterns is None:
            self.text_patterns = []


# Layout patterns for heritage documents
LAYOUT_PATTERNS: List[LayoutPattern] = [
    # NOTE: list order is significant. LayoutAnnotator keeps the first
    # pattern that reaches the highest score, so earlier entries win ties.

    # --- Core text structure -------------------------------------------------
    # h1..h6 headings
    LayoutPattern(
        region=LayoutRegion.HDR,
        semantic_role=SemanticRole.STRC,
        priority=10,
        html_tags={"h1", "h2", "h3", "h4", "h5", "h6"},
    ),
    # <p> paragraphs with at least 10 characters of text
    LayoutPattern(
        region=LayoutRegion.PAR,
        semantic_role=SemanticRole.PRIM,
        priority=5,
        html_tags={"p"},
        min_text_length=10,
    ),
    # List containers
    LayoutPattern(
        region=LayoutRegion.LST,
        semantic_role=SemanticRole.PRIM,
        priority=7,
        html_tags={"ul", "ol", "dl"},
    ),
    # Individual list entries
    LayoutPattern(
        region=LayoutRegion.LIT,
        semantic_role=SemanticRole.PRIM,
        priority=6,
        html_tags={"li", "dt", "dd"},
    ),
    # Tables
    LayoutPattern(
        region=LayoutRegion.TBL,
        semantic_role=SemanticRole.PRIM,
        priority=8,
        html_tags={"table"},
    ),

    # --- Navigation ----------------------------------------------------------
    LayoutPattern(
        region=LayoutRegion.NAV,
        semantic_role=SemanticRole.NAV,
        priority=9,
        html_tags={"nav"},
        css_classes={"navigation", "nav", "menu", "navbar", "sidebar-nav"},
    ),

    # --- Figures, captions, quotations ---------------------------------------
    LayoutPattern(
        region=LayoutRegion.FIG,
        semantic_role=SemanticRole.VIS,
        priority=7,
        html_tags={"figure", "img"},
    ),
    LayoutPattern(
        region=LayoutRegion.CAP,
        semantic_role=SemanticRole.SUPP,
        priority=8,
        html_tags={"figcaption", "caption"},
    ),
    LayoutPattern(
        region=LayoutRegion.BLK,
        semantic_role=SemanticRole.PRIM,
        priority=7,
        html_tags={"blockquote", "q"},
    ),

    # --- Supplementary and reference material --------------------------------
    # Sidebars / info boxes
    LayoutPattern(
        region=LayoutRegion.SDB,
        semantic_role=SemanticRole.SUPP,
        priority=8,
        html_tags={"aside"},
        css_classes={"sidebar", "aside", "infobox", "callout"},
    ),
    # Footnotes and endnotes (class/id only, no tag criterion)
    LayoutPattern(
        region=LayoutRegion.FTN,
        semantic_role=SemanticRole.REF,
        priority=8,
        css_classes={"footnote", "footnotes", "endnote", "fn"},
        css_ids={"footnotes", "endnotes"},
    ),
    # Bibliography; the regex is an unanchored, case-insensitive search
    LayoutPattern(
        region=LayoutRegion.BIB,
        semantic_role=SemanticRole.BACK,
        priority=9,
        css_classes={"bibliography", "references", "literature", "literatuur", "bronnen"},
        text_patterns=[r"(?i)bibliografie|bibliography|references|literatuur|bronnen"],
    ),
    # Table of contents; the regex is an unanchored, case-insensitive search
    LayoutPattern(
        region=LayoutRegion.TOC,
        semantic_role=SemanticRole.NAV,
        priority=9,
        css_classes={"toc", "table-of-contents", "inhoud", "inhoudsopgave"},
        css_ids={"toc", "table-of-contents"},
        text_patterns=[r"(?i)inhoudsopgave|table of contents|contents"],
    ),

    # --- Media ---------------------------------------------------------------
    # Embedded third-party content
    LayoutPattern(
        region=LayoutRegion.EMB,
        semantic_role=SemanticRole.INT,
        priority=8,
        html_tags={"iframe", "embed", "object"},
    ),
    # Image galleries / carousels
    LayoutPattern(
        region=LayoutRegion.GAL,
        semantic_role=SemanticRole.VIS,
        priority=8,
        css_classes={"gallery", "image-gallery", "photo-gallery", "carousel", "slider"},
    ),
    # Map widgets
    LayoutPattern(
        region=LayoutRegion.MAP,
        semantic_role=SemanticRole.SPAT,
        priority=8,
        css_classes={"map", "leaflet", "mapbox", "google-map", "kaart"},
    ),

    # --- Page furniture ------------------------------------------------------
    # Page footer, treated as colophon
    LayoutPattern(
        region=LayoutRegion.COL,
        semantic_role=SemanticRole.BACK,
        priority=6,
        html_tags={"footer"},
        css_classes={"footer", "colophon", "site-info"},
    ),
    # Site/page header (the <header> element, not h1..h6 headings)
    LayoutPattern(
        region=LayoutRegion.TTP,
        semantic_role=SemanticRole.FRNT,
        priority=6,
        html_tags={"header"},
        css_classes={"site-header", "page-header", "masthead"},
    ),

    # --- Commercial ----------------------------------------------------------
    LayoutPattern(
        region=LayoutRegion.ADV,
        semantic_role=SemanticRole.COMM,
        priority=7,
        css_classes={"ad", "advertisement", "advertentie", "sponsor", "banner-ad"},
    ),
    LayoutPattern(
        region=LayoutRegion.LOG,
        semantic_role=SemanticRole.COMM,
        priority=7,
        css_classes={"logo", "site-logo", "brand"},
    ),

    # --- Metadata ------------------------------------------------------------
    LayoutPattern(
        region=LayoutRegion.MTD,
        semantic_role=SemanticRole.META,
        priority=7,
        css_classes={"metadata", "meta-info", "post-meta", "entry-meta"},
    ),
]


class LayoutAnnotator(BaseAnnotator):
    """
    Layout annotator for heritage documents.

    Analyzes document structure using HTML semantics and CSS patterns
    (NO computer vision). Creates layout region claims following
    GLAM-NER v1.7.0 DOC hypernym taxonomy.

    Each element of the document is scored against every pattern; the
    highest-scoring pattern (first one on ties) determines the region.
    """

    # Score contributions of the individual criteria in _match_pattern.
    _TAG_SCORE = 5.0
    _CLASS_SCORE = 3.0
    _ID_SCORE = 3.0
    _TEXT_SCORE = 2.0
    # Bonus per priority point, added only to scores that are already > 0.
    _PRIORITY_WEIGHT = 0.1
    # A raw score is divided by this and capped at 1.0 to yield confidence.
    _CONFIDENCE_SCALE = 10
    # Confidence assigned to fallback claims for unmatched block elements.
    _DEFAULT_CONFIDENCE = 0.3
    # Maximum number of characters of element text stored on a claim.
    _MAX_TEXT_EXCERPT = 500

    def __init__(
        self,
        patterns: Optional[List[LayoutPattern]] = None,
        include_all_elements: bool = False,
    ):
        """
        Initialize layout annotator.

        Args:
            patterns: Custom layout patterns (defaults to LAYOUT_PATTERNS;
                an empty list also selects the defaults)
            include_all_elements: Annotate all elements (not just matched ones)
        """
        super().__init__(
            agent_name="LayoutAnnotator",
            agent_version="1.0.0",
        )
        # Work on a private copy: add_pattern() appends to this list and must
        # not leak new patterns into the module-level defaults (shared by all
        # annotators) or into the caller's list.
        self.patterns: List[LayoutPattern] = list(patterns or LAYOUT_PATTERNS)
        self.include_all_elements = include_all_elements

        # Pre-compiled text regexes, keyed by id(pattern) because the
        # LayoutPattern dataclass is not hashable. The ids stay valid as long
        # as the pattern objects are referenced from self.patterns.
        self._compiled_text_patterns: Dict[int, List[re.Pattern]] = {}
        for pattern in self.patterns:
            self._compile_text_patterns(pattern)

    def _compile_text_patterns(self, pattern: LayoutPattern) -> None:
        """Compile and cache the text regexes of ``pattern``, if it has any."""
        if pattern.text_patterns:
            self._compiled_text_patterns[id(pattern)] = [
                re.compile(source) for source in pattern.text_patterns
            ]

    @classmethod
    def _excerpt(cls, text: Optional[str]) -> Optional[str]:
        """Return the leading part of ``text`` stored on claims, or None if empty."""
        return text[:cls._MAX_TEXT_EXCERPT] if text else None

    def annotate(
        self,
        document: Any,
        session: Optional[AnnotationSession] = None,
    ) -> AnnotationSession:
        """
        Annotate layout regions in a document.

        Args:
            document: HTMLDocument to annotate
            session: Existing session to add claims to; a new one is
                created from the document's source URL/file when omitted

        Returns:
            AnnotationSession with layout claims

        Raises:
            ValueError: If ``document`` is not an HTMLDocument
        """
        if not isinstance(document, HTMLDocument):
            raise ValueError("LayoutAnnotator requires HTMLDocument")

        if session is None:
            session = self.create_session(
                source_url=document.source_url,
                source_file=document.source_file,
            )

        # Claims created so far, keyed by element xpath, for parent linking.
        claim_map: Dict[str, LayoutClaim] = {}

        for element in document.elements:
            claim = self._classify_element(element, document)
            if not claim:
                continue

            # Link to the direct parent only, and only if that parent already
            # produced a claim earlier in the iteration. Children of
            # unclassified parents stay unlinked.
            parent_claim = (
                claim_map.get(element.parent_xpath)
                if element.parent_xpath
                else None
            )
            if parent_claim is not None:
                claim.parent_claim_id = parent_claim.claim_id
                parent_claim.child_claim_ids.append(claim.claim_id)

            claim_map[element.xpath] = claim
            session.add_layout_claim(claim)

        return session

    def _classify_element(
        self,
        element: HTMLElement,
        document: HTMLDocument,
    ) -> Optional[LayoutClaim]:
        """
        Classify an HTML element as a layout region.

        Args:
            element: HTML element to classify
            document: Parent document

        Returns:
            LayoutClaim if classified, None otherwise
        """
        best_pattern: Optional[LayoutPattern] = None
        best_score = 0.0

        # Strict ">" keeps the earliest pattern when several share the top
        # score, and ignores patterns that score 0 (no match).
        for pattern in self.patterns:
            score = self._match_pattern(element, pattern)
            if score > best_score:
                best_pattern, best_score = pattern, score

        if best_pattern is None:
            if self.include_all_elements:
                # Default to paragraph/heading for unmatched block elements
                return self._create_default_claim(element, document)
            return None

        provenance = self.create_provenance(
            namespace="glam-doc",
            path=element.xpath,
            # Normalize the raw score to the 0-1 range
            confidence=min(best_score / self._CONFIDENCE_SCALE, 1.0),
            source_url=document.source_url,
            source_file=document.source_file,
        )

        claim = LayoutClaim(
            region=best_pattern.region,
            semantic_role=best_pattern.semantic_role,
            xpath=element.xpath,
            text_content=self._excerpt(element.text_content),
            start_offset=element.start_offset,
            end_offset=element.end_offset,
            provenance=provenance,
        )

        # Add heading level if applicable
        if best_pattern.region == LayoutRegion.HDR and element.heading_level:
            claim.heading_level = element.heading_level

        return claim

    def _match_pattern(
        self,
        element: HTMLElement,
        pattern: LayoutPattern,
    ) -> float:
        """
        Calculate match score between element and pattern.

        The score is the sum of the contributions of every criterion that
        matches (tag, CSS class, CSS id, text regex) plus a small bonus
        proportional to the pattern's priority.

        Returns score >= 0 (0 = no match, higher = better match)
        """
        text = element.text_content or ""

        # Length bounds are hard constraints, so test them first and skip
        # the regex search entirely for elements that can never match.
        if not pattern.min_text_length <= len(text) <= pattern.max_text_length:
            return 0.0

        score = 0.0

        # HTML tag
        if pattern.html_tags and element.tag.lower() in pattern.html_tags:
            score += self._TAG_SCORE

        # CSS classes. "or ''" guards against an attribute that is present
        # but has no value (stored as None by some parsers - TODO confirm
        # against HTMLElement).
        attributes = element.attributes
        elem_classes = set((attributes.get('class') or '').lower().split())
        if pattern.css_classes and elem_classes & pattern.css_classes:
            score += self._CLASS_SCORE

        # CSS id
        elem_id = (attributes.get('id') or '').lower()
        if pattern.css_ids and elem_id in pattern.css_ids:
            score += self._ID_SCORE

        # Text regexes: one hit is enough, further hits add nothing
        compiled = self._compiled_text_patterns.get(id(pattern))
        if compiled and text and any(regex.search(text) for regex in compiled):
            score += self._TEXT_SCORE

        # Priority bonus (additive) for patterns that matched at all
        if score > 0:
            score += pattern.priority * self._PRIORITY_WEIGHT

        return score

    def _create_default_claim(
        self,
        element: HTMLElement,
        document: HTMLDocument,
    ) -> Optional[LayoutClaim]:
        """
        Create a low-confidence fallback claim for an unmatched element.

        Returns:
            A HDR claim for heading elements, a PAR claim for other block
            elements, or None for non-block elements.
        """
        if not element.is_block:
            return None

        provenance = self.create_provenance(
            namespace="glam-doc",
            path=element.xpath,
            confidence=self._DEFAULT_CONFIDENCE,  # Low confidence for defaults
            source_url=document.source_url,
            source_file=document.source_file,
        )

        # Determine default region
        if element.is_heading:
            region = LayoutRegion.HDR
            semantic_role = SemanticRole.STRC
        else:
            region = LayoutRegion.PAR
            semantic_role = SemanticRole.PRIM

        return LayoutClaim(
            region=region,
            semantic_role=semantic_role,
            xpath=element.xpath,
            text_content=self._excerpt(element.text_content),
            start_offset=element.start_offset,
            end_offset=element.end_offset,
            heading_level=element.heading_level,
            provenance=provenance,
        )

    def add_pattern(self, pattern: LayoutPattern) -> None:
        """Add a new layout pattern to this annotator and compile its regexes."""
        self.patterns.append(pattern)
        self._compile_text_patterns(pattern)


def create_heritage_layout_annotator() -> LayoutAnnotator:
    """
    Create a layout annotator optimized for heritage documents.

    Includes patterns for:
    - Standard document structure (headings, paragraphs, lists, tables)
    - Heritage-specific regions (galleries, maps, bibliographies)
    - Dutch language patterns (inhoudsopgave, literatuur, etc.)

    Returns:
        A LayoutAnnotator configured with the LAYOUT_PATTERNS defaults
        that annotates matched elements only.
    """
    return LayoutAnnotator(
        # Hand the annotator its own copy so later add_pattern() calls on
        # it cannot modify the module-level LAYOUT_PATTERNS defaults.
        patterns=list(LAYOUT_PATTERNS),
        include_all_elements=False,
    )