glam/src/glam_extractor/extractors/nlp_extractor.py
2025-12-05 15:30:23 +01:00

1514 lines
59 KiB
Python

"""
NLP-based extraction of heritage institutions from conversation text.
This module provides the InstitutionExtractor class that uses pattern matching,
NER, and heuristics to extract structured institution data from unstructured
conversation text. Extracted records include confidence scores and provenance metadata.
"""
import re
import uuid
from datetime import datetime, timezone
from typing import List, Optional, Tuple, Set, Dict, Any
from dataclasses import dataclass
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except ImportError:
RAPIDFUZZ_AVAILABLE = False
from glam_extractor.models import (
HeritageCustodian,
InstitutionType,
OrganizationStatus,
DataSource,
DataTier,
Provenance,
Location,
Identifier,
)
from glam_extractor.parsers.conversation import Conversation
# =============================================================================
# RESULT PATTERN FOR ERROR HANDLING
# =============================================================================
@dataclass
class Result:
    """Lightweight success/error container (Result pattern).

    Exactly one of ``value`` (on success) or ``error`` (on failure) is
    meaningful; construct instances via :meth:`ok` / :meth:`err` rather
    than calling the constructor directly.
    """
    success: bool
    value: Any = None
    error: Optional[str] = None

    @classmethod
    def ok(cls, value: Any) -> "Result":
        """Wrap *value* in a successful Result."""
        return cls(True, value, None)

    @classmethod
    def err(cls, error: str) -> "Result":
        """Wrap an *error* message in a failed Result."""
        return cls(False, None, error)
# =============================================================================
# EXTRACTION DATA STRUCTURES
# =============================================================================
@dataclass
class ExtractedEntity:
    """
    An entity extracted from text with metadata.

    Used as an intermediate representation before converting to HeritageCustodian.

    Note: institution_type is a string (e.g., 'MUSEUM', 'LIBRARY') since LinkML
    PermissibleValue enum objects are not hashable and can't be used as dict keys.
    """
    name: str
    institution_type: Optional[str] = None  # String key e.g. 'MUSEUM', 'LIBRARY'
    city: Optional[str] = None
    country: Optional[str] = None  # ISO 3166-1 alpha-2 code
    identifiers: Optional[List[Identifier]] = None
    confidence_score: float = 0.5  # 0.0-1.0, default mid confidence
    text_snippet: Optional[str] = None  # The text where this was found

    def __post_init__(self):
        # Materialize an empty list per instance; a mutable default on the
        # field itself would be shared across all instances.
        if self.identifiers is None:
            self.identifiers = []
# =============================================================================
# PATTERN DEFINITIONS
# =============================================================================
class ExtractionPatterns:
    """Regular expression patterns for extracting identifiers and keywords.

    Holds only class-level constants: identifier regexes, multilingual
    keyword tables, a country-name lookup, and blacklists used to
    suppress common false positives.
    """
    # -------------------------------------------------------------------------
    # Identifier patterns
    # -------------------------------------------------------------------------
    # Enhanced ISIL pattern (Nov 2025) - supports multiple context formats:
    # 1. Standard: "ISIL: NL-AsdAM" or "ISIL code: NL-AsdAM"
    # 2. Standalone: "NL-AsdAM" (basic word boundary)
    # 3. In context: "code NL-AsdAM" or "code: NL-AsdAM"
    # 4. Parenthetical: "(NL-AsdAM)"
    ISIL_PATTERN = re.compile(r'\b([A-Z]{2}-[A-Za-z0-9]+)\b')
    # Additional context-aware ISIL patterns (Nov 2025)
    # These catch ISIL codes that appear with common prefixes
    ISIL_CONTEXT_PATTERNS = [
        re.compile(r'\bISIL[:\s]+([A-Z]{2}-[A-Za-z0-9]+)\b', re.IGNORECASE),  # "ISIL: NL-..."
        re.compile(r'\bcode[:\s]+([A-Z]{2}-[A-Za-z0-9]+)\b', re.IGNORECASE),  # "code: NL-..."
        re.compile(r'\(([A-Z]{2}-[A-Za-z0-9]+)\)'),  # "(NL-AsdAM)"
    ]
    # Wikidata Q-identifier, e.g. "Q42"
    WIKIDATA_PATTERN = re.compile(r'\b(Q\d+)\b')
    # VIAF id captured from a viaf.org URL
    VIAF_URL_PATTERN = re.compile(r'viaf\.org/viaf/(\d+)')
    # NOTE(review): matches ANY standalone 8-digit number; presumably a Dutch
    # KvK company number -- confirm callers apply contextual filtering.
    KVK_PATTERN = re.compile(r'\b(\d{8})\b')
    # ISIL code blacklist (common false positives)
    # NOTE: Matching is case-insensitive, but stored in original case for reference
    ISIL_BLACKLIST = {
        'CD-ROM',  # CD-ROM is not Congo ISIL code
        'US-ASCII',
        'UTF-8',
        'ISO-8859',
        'AI-Powered',  # AI tool descriptions (English)
        'AI-processed',
        'AI-driven',
        'AI-based',
        'AI-enhanced',
        'AI-assisted',
        'AI-generated',
        'AI-gedreven',  # Dutch: AI-driven
        'AI-aangedreven',  # Dutch: AI-powered
        'AI-ondersteund',  # Dutch: AI-supported
        'MS-DOS',  # Operating systems
        'US-WEST',  # AWS regions
        'EU-GDPR',  # Legal frameworks
        'US-EN',  # Language codes
    }
    # -------------------------------------------------------------------------
    # Institution type keywords (multilingual)
    # -------------------------------------------------------------------------
    # Map keyword variants to canonical English form for normalization
    # Note: Use string keys (not PermissibleValue) because LinkML enum values are unhashable
    INSTITUTION_KEYWORDS = {
        'MUSEUM': [
            'museum', 'museo', 'museu', 'musée', 'muzeum', 'muzeul',
            'kunstmuseum', 'kunsthalle', 'muzej', 'μουσείο'
        ],
        'LIBRARY': [
            'library', 'biblioteca', 'bibliothek', 'bibliotheek', 'bibliothèque',
            'biblioteka', 'knihovna', 'βιβλιοθήκη', 'national library'
        ],
        'ARCHIVE': [
            'archive', 'archivo', 'archiv', 'archief', 'archives',
            'arkiv', 'arkivet', 'αρχείο', 'arquivos', 'national archive'
        ],
        'GALLERY': [
            'gallery', 'galerie', 'galerija', 'γκαλερί', 'art gallery',
            'kunstgalerie', 'galería'
        ],
        'RESEARCH_CENTER': [
            'research center', 'research centre', 'research institute',
            'onderzoekscentrum', 'forschungszentrum', 'centre de recherche'
        ],
        'BOTANICAL_ZOO': [
            'botanical garden', 'botanic garden', 'zoo', 'aquarium',
            'botanische tuin', 'jardin botanique', 'dierentuin'
        ],
        'EDUCATION_PROVIDER': [
            'university', 'universiteit', 'universidad', 'université',
            'college', 'school', 'educational institution'
        ],
    }
    # Keyword normalization map: variant -> canonical English form
    KEYWORD_NORMALIZATION = {
        'museo': 'Museum',
        'museu': 'Museum',
        'musée': 'Museum',
        'muzeum': 'Museum',
        'muzeul': 'Museum',
        'muzej': 'Museum',
        'μουσείο': 'Museum',
        'museum': 'Museum',
        'biblioteca': 'Library',
        'bibliothek': 'Library',
        'bibliotheek': 'Library',
        'bibliothèque': 'Library',
        'biblioteka': 'Library',
        'knihovna': 'Library',
        'βιβλιοθήκη': 'Library',
        'library': 'Library',
        'archivo': 'Archive',
        'archiv': 'Archive',
        'archief': 'Archive',
        'archives': 'Archives',
        'arkiv': 'Archive',
        'arkivet': 'Archive',
        'αρχείο': 'Archive',
        'arquivos': 'Archives',
        'archive': 'Archive',
        'galerie': 'Gallery',
        'galerija': 'Gallery',
        'γκαλερί': 'Gallery',
        'galería': 'Gallery',
        'gallery': 'Gallery',
    }
    # Location extraction patterns
    CITY_PATTERN = re.compile(
        r'\bin\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)'  # "in Amsterdam", "in New York"
    )
    # Country name to ISO 3166-1 alpha-2 code mapping
    # Used to infer country from conversation filename
    # (also includes demonyms like 'brazilian' and Dutch provinces -> 'NL')
    COUNTRY_NAME_TO_CODE = {
        'brazilian': 'BR', 'brazil': 'BR',
        'vietnamese': 'VN', 'vietnam': 'VN',
        'chilean': 'CL', 'chile': 'CL',
        'japanese': 'JP', 'japan': 'JP',
        'mexican': 'MX', 'mexico': 'MX',
        'canadian': 'CA', 'canada': 'CA',
        'polish': 'PL', 'poland': 'PL',
        'hungarian': 'HU', 'hungary': 'HU',
        'norwegian': 'NO', 'norway': 'NO',
        'portuguese': 'PT', 'portugal': 'PT',
        'thai': 'TH', 'thailand': 'TH',
        'taiwan': 'TW',
        'turkish': 'TR', 'turkey': 'TR',
        'belgian': 'BE', 'belgium': 'BE',
        'swedish': 'SE', 'sweden': 'SE',
        'azerbaijan': 'AZ',
        'estonian': 'EE', 'estonia': 'EE',
        'south african': 'ZA', 'south africa': 'ZA',
        'namibian': 'NA', 'namibia': 'NA',
        'iraqi': 'IQ', 'iraq': 'IQ',
        'algeria': 'DZ', 'algerian': 'DZ',
        'argentine': 'AR', 'argentina': 'AR',
        'moroccan': 'MA', 'morocco': 'MA',
        'tunisian': 'TN', 'tunisia': 'TN',
        'libyan': 'LY', 'libya': 'LY',
        'mali': 'ML',
        'senegal': 'SN',
        'mauritania': 'MR',
        'egyptian': 'EG', 'egypt': 'EG',
        'ghana': 'GH',
        'jordanian': 'JO', 'jordan': 'JO',
        'iranian': 'IR', 'iran': 'IR',
        'russian': 'RU', 'russia': 'RU',
        'uzbekistan': 'UZ',
        'armenian': 'AM', 'armenia': 'AM',
        'georgia': 'GE', 'georgian': 'GE',
        'croatian': 'HR', 'croatia': 'HR',
        'greece': 'GR', 'greek': 'GR',
        'nigerian': 'NG', 'nigeria': 'NG',
        'somali': 'SO', 'somalia': 'SO',
        'yemen': 'YE',
        'oman': 'OM',
        'korean': 'KR', 'korea': 'KR', 'south korea': 'KR',
        'north korean': 'KP', 'north korea': 'KP',
        'malaysian': 'MY', 'malaysia': 'MY',
        'colombian': 'CO', 'colombia': 'CO',
        'swiss': 'CH', 'switzerland': 'CH',
        'nepal': 'NP',
        'united states': 'US', 'american': 'US',
        'serbian': 'RS', 'serbia': 'RS',
        'moldavian': 'MD', 'moldova': 'MD',
        'bulgarian': 'BG', 'bulgaria': 'BG',
        'romanian': 'RO', 'romania': 'RO',
        'albanian': 'AL', 'albania': 'AL',
        'bosnian': 'BA', 'bosnia': 'BA',
        'india': 'IN', 'indian': 'IN',
        'bhutan': 'BT',
        'pakistan': 'PK',
        'suriname': 'SR',
        'nicaragua': 'NI',
        'congo': 'CG',
        'danish': 'DK', 'denmark': 'DK',
        'austrian': 'AT', 'austria': 'AT',
        'australian': 'AU', 'australia': 'AU',
        'burma': 'MM', 'myanmar': 'MM',
        'cambodian': 'KH', 'cambodia': 'KH',
        'afghan': 'AF', 'afghanistan': 'AF',
        'sri lankan': 'LK', 'sri lanka': 'LK',
        'laos': 'LA',
        'tajikistan': 'TJ',
        'turkmenistan': 'TM',
        'uruguay': 'UY',
        'philippine': 'PH', 'philippines': 'PH',
        'finnish': 'FI', 'finland': 'FI',
        'latvian': 'LV', 'latvia': 'LV',
        'israeli': 'IL', 'israel': 'IL',
        'palestinian': 'PS', 'palestine': 'PS',
        'cyprus': 'CY',
        # Dutch provinces map to NL (conversation names often use them)
        'overijssel': 'NL', 'limburg': 'NL', 'north brabant': 'NL',
        'zeeland': 'NL', 'zuid holland': 'NL', 'noord holland': 'NL',
        'gelderland': 'NL', 'drenthe': 'NL', 'groningen': 'NL',
        'friesland': 'NL', 'flevoland': 'NL',
        'dutch': 'NL', 'netherlands': 'NL',
        'slovak': 'SK', 'slovakia': 'SK',
        'slovenian': 'SI', 'slovenia': 'SI',
        'north macedonia': 'MK', 'macedonia': 'MK',
        'peruvian': 'PE', 'peru': 'PE',
        'ethiopian': 'ET', 'ethiopia': 'ET',
        'kenyan': 'KE', 'kenya': 'KE',
        'paraguay': 'PY',
        'honduran': 'HN', 'honduras': 'HN',
        'panamanian': 'PA', 'panama': 'PA',
        'madagascar': 'MG',
        'mozambique': 'MZ',
        'eritrean': 'ER', 'eritrea': 'ER',
        'sudan': 'SD',
        'rwandan': 'RW', 'rwanda': 'RW',
        'kiribati': 'KI',
        'new zealand': 'NZ',
        'haiti': 'HT',
        'jamaican': 'JM', 'jamaica': 'JM',
        'cuban': 'CU', 'cuba': 'CU',
        'indonesian': 'ID', 'indonesia': 'ID',
        'vatican': 'VA',
        'italian': 'IT', 'italy': 'IT',
        'zimbabwe': 'ZW',
        'east timor': 'TL',
        'qatar': 'QA',
        'arabic emirates': 'AE', 'uae': 'AE',
        'kuwait': 'KW',
        'lebanese': 'LB', 'lebanon': 'LB',
        'syrian': 'SY', 'syria': 'SY',
        'saudi arabian': 'SA', 'saudi arabia': 'SA',
        'maldives': 'MV',
        'burkina faso': 'BF',
        'togo': 'TG',
        'benin': 'BJ',
        'liberian': 'LR', 'liberia': 'LR',
    }
    # Common patterns for institution names
    # Match capitalized sequences that could be institution names
    INSTITUTION_NAME_PATTERN = re.compile(
        r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,5})\b'
    )
# =============================================================================
# INSTITUTION EXTRACTOR
# =============================================================================
class InstitutionExtractor:
"""
Extract heritage institution data from conversation text using NLP techniques.
This extractor uses pattern matching and heuristics to identify institutions,
classify their types, extract locations, and find associated identifiers.
Each extraction includes a confidence score for data quality tracking.
Usage:
extractor = InstitutionExtractor()
result = extractor.extract_from_conversation(conversation)
if result.success:
for institution in result.value:
print(f"{institution.name} - {institution.institution_type}")
"""
def __init__(self):
    """Initialize the extractor with the shared pattern definitions."""
    # All regexes and lookup tables live on ExtractionPatterns as class
    # constants; the instance only holds a reference to them.
    self.patterns = ExtractionPatterns()
def extract_from_conversation(
    self,
    conversation: Conversation
) -> Result:
    """
    Extract heritage institutions from a conversation.

    Only assistant messages are scanned, since those are the most likely
    to contain institution data.  The conversation name is used to infer
    a fallback country code.

    Args:
        conversation: Parsed Conversation object

    Returns:
        Result containing List[HeritageCustodian] on success, error
        message on failure
    """
    try:
        # Assistant messages are the richest source of institution data.
        text = conversation.extract_all_text(sender="assistant")
        if not text.strip():
            # Nothing to scan: an empty result, not an error.
            return Result.ok([])
        country_hint = self._infer_country_from_name(conversation.name)
        found = self._extract_entities(text, inferred_country=country_hint)
        records = [
            self._entity_to_custodian(
                entity,
                conversation_id=conversation.uuid,
                conversation_name=conversation.name,
            )
            for entity in found
        ]
        return Result.ok(records)
    except Exception as exc:
        # Surface failures through the Result pattern rather than raising.
        return Result.err(f"Extraction failed: {str(exc)}")
def extract_from_text(
    self,
    text: str,
    conversation_id: Optional[str] = None,
    conversation_name: Optional[str] = None
) -> Result:
    """
    Extract heritage institutions from raw text.

    Args:
        text: Text to extract from
        conversation_id: Optional conversation UUID for provenance
        conversation_name: Optional conversation name for context

    Returns:
        Result containing List[HeritageCustodian] on success, error
        message on failure
    """
    try:
        if not text.strip():
            # Empty input is a valid no-op, not a failure.
            return Result.ok([])
        records = [
            self._entity_to_custodian(
                entity,
                conversation_id=conversation_id,
                conversation_name=conversation_name,
            )
            for entity in self._extract_entities(text)
        ]
        return Result.ok(records)
    except Exception as exc:
        # Report failures through the Result pattern instead of raising.
        return Result.err(f"Extraction failed: {str(exc)}")
# =========================================================================
# HELPER METHODS
# =========================================================================
def _infer_country_from_name(self, conversation_name: Optional[str]) -> Optional[str]:
"""
Infer country code from conversation name.
Example: "Brazilian_GLAM_collection_inventories""BR"
Args:
conversation_name: Conversation name/title
Returns:
ISO 3166-1 alpha-2 country code if found, otherwise None
"""
if not conversation_name:
return None
name_lower = conversation_name.lower()
# Check against country name mapping
for country_name, country_code in self.patterns.COUNTRY_NAME_TO_CODE.items():
if country_name in name_lower:
return country_code
return None
# =========================================================================
# ENTITY EXTRACTION
# =========================================================================
def _extract_entities(
    self,
    text: str,
    inferred_country: Optional[str] = None
) -> List[ExtractedEntity]:
    """
    Extract institution entities from text.

    Per-sentence pipeline: keyword gate -> type classification -> name
    extraction -> location extraction -> country validation ->
    identifier extraction -> per-name filtering and scoring.  The
    inferred_country serves as a fallback when the sentence itself
    carries no location signal.

    Args:
        text: Text to extract from
        inferred_country: ISO country code inferred from conversation
            name (optional)

    Returns:
        Deduplicated list of extracted entities
    """
    found: List[ExtractedEntity] = []
    for sentence in self._split_sentences(text):
        if not self._is_institution_sentence(sentence):
            continue
        inst_type = self._classify_institution_type(sentence)
        candidate_names = self._extract_institution_names(sentence)
        city, country = self._extract_location(sentence)
        # V5: prefer an explicitly mentioned country over the inferred one.
        validated_country, country_source = self._validate_country_context(
            sentence, candidate_names[0] if candidate_names else "", inferred_country
        )
        if validated_country:
            country = validated_country
        identifiers = self._extract_identifiers(sentence)
        for candidate in candidate_names:
            # V5 filters: drop umbrella organizations and generic names.
            if self._is_organization_or_network(candidate, sentence):
                continue
            if not self._is_proper_institutional_name(candidate, sentence):
                continue
            score = self._calculate_confidence_v5(
                name=candidate,
                institution_type=inst_type,
                city=city,
                country=country,
                identifiers=identifiers,
                sentence=sentence,
                country_source=country_source,
            )
            found.append(
                ExtractedEntity(
                    name=candidate,
                    institution_type=inst_type,
                    city=city,
                    country=country,
                    identifiers=identifiers,
                    confidence_score=score,
                    text_snippet=sentence[:200],  # short provenance snippet
                )
            )
    # Collapse duplicate mentions of the same institution.
    return self._deduplicate_entities(found)
def _split_sentences(self, text: str) -> List[str]:
"""
Split text into sentences.
Simple sentence splitting on common delimiters.
Args:
text: Text to split
Returns:
List of sentences
"""
# Split on period, exclamation, question mark followed by space and capital
sentences = re.split(r'[.!?]\s+(?=[A-Z])', text)
return [s.strip() for s in sentences if s.strip()]
def _is_institution_sentence(self, sentence: str) -> bool:
"""
Check if sentence likely mentions a heritage institution.
Args:
sentence: Sentence to check
Returns:
True if sentence contains institution-related keywords
"""
sentence_lower = sentence.lower()
# Check for any institution type keyword
for keywords in self.patterns.INSTITUTION_KEYWORDS.values():
for keyword in keywords:
if keyword in sentence_lower:
return True
# Check for identifier patterns
if (self.patterns.ISIL_PATTERN.search(sentence) or
self.patterns.WIKIDATA_PATTERN.search(sentence) or
self.patterns.VIAF_URL_PATTERN.search(sentence)):
return True
return False
def _classify_institution_type(self, sentence: str) -> Optional[str]:
"""
Classify institution type based on keywords in sentence.
Args:
sentence: Sentence to classify
Returns:
Institution type string (e.g., 'MUSEUM', 'LIBRARY') if detected, otherwise None
"""
sentence_lower = sentence.lower()
# Check for type keywords, prioritize more specific types
for inst_type, keywords in self.patterns.INSTITUTION_KEYWORDS.items():
for keyword in keywords:
if keyword in sentence_lower:
return inst_type
return None
def _extract_institution_names(self, sentence: str) -> List[str]:
    """
    Extract potential institution names from sentence.

    Handles multiple patterns:
    1. "[Type] of [Name]": "Museum of Modern Art"
    2. "[Name] + [Type]": "British Museum", "National Library"
    3. Compound words: "Rijksmuseum" (contains "museum")
    4. ISIL-based: "NL-AsdAM for Amsterdam Museum"

    Normalizes multilingual keyword variants to canonical English forms.

    Args:
        sentence: Sentence to extract from

    Returns:
        List of potential institution names (normalized, deduplicated)
    """
    names = []
    sentence_lower = sentence.lower()
    # Pattern 1: Keyword appears AT THE START of institution name
    # "Museum of Modern Art", "Library of Congress"
    for inst_type, keywords in self.patterns.INSTITUTION_KEYWORDS.items():
        for keyword in keywords:
            if keyword not in sentence_lower:
                continue
            # Find all occurrences of the keyword, advancing idx manually.
            idx = 0
            while True:
                idx = sentence_lower.find(keyword, idx)
                if idx == -1:
                    break
                # Check if keyword is at word boundary (start of institution name)
                if idx > 0 and sentence_lower[idx-1].isalnum():
                    # Pattern 3: Compound word (e.g., "Rijksmuseum")
                    # Extract backward to find start of the capitalized compound word
                    start = idx
                    while start > 0 and sentence[start-1].isalpha():
                        start -= 1
                    # Extract forward to find end of compound word
                    end = idx + len(keyword)
                    while end < len(sentence) and sentence[end].isalpha():
                        end += 1
                    # Check if we found a capitalized word
                    if start < idx and sentence[start].isupper():
                        compound_word = sentence[start:end]
                        # Verify it's a proper noun (capitalized)
                        if compound_word[0].isupper():
                            # Check if there are capitalized words BEFORE this compound word
                            # (indicates multi-word name like "Van Abbemuseum")
                            prefix = sentence[max(0, start-50):start].strip()
                            if prefix:
                                words_before = prefix.split()
                                # Check last few words before compound
                                has_prefix_name = False
                                for word in words_before[-3:]:  # Check last 3 words
                                    if word and word[0].isupper() and word.lower() not in ['the', 'a', 'an', 'in', 'at']:
                                        has_prefix_name = True
                                        break
                                if has_prefix_name:
                                    # This is a multi-word name, use Pattern 2 logic below
                                    # Don't append compound_word, let Pattern 2 handle it
                                    # NOTE(review): idx is incremented here AND again
                                    # below before falling through -- confirm the
                                    # double advance is intentional.
                                    idx += 1
                                    # Fall through to Pattern 2 (don't continue)
                                else:
                                    # True compound word (single word like "Rijksmuseum")
                                    names.append(compound_word)
                                    idx += 1
                                    continue
                            else:
                                # No prefix, true compound word
                                names.append(compound_word)
                                idx += 1
                                continue
                    idx += 1
                    # Fall through to Pattern 2
                # Extract text AFTER keyword (for "Museum of Modern Art" and "Museu Nacional" patterns)
                text_after = sentence[idx:idx+100]
                # Pattern 1a: keyword + "of/for/and" + capitalized words (English)
                # "Museum of Modern Art", "Library of Congress"
                match = re.match(
                    r'(' + re.escape(keyword) + r'(?:\s+(?:of|for|and)\s+[A-Z][a-zA-Z]+)+)',
                    text_after,
                    re.IGNORECASE
                )
                if match:
                    full_name = match.group(1).strip()
                    # Normalize keyword to canonical English form
                    normalized_name = full_name
                    for variant, canonical in self.patterns.KEYWORD_NORMALIZATION.items():
                        if full_name.lower().startswith(variant):
                            normalized_name = canonical + full_name[len(variant):]
                            break
                    names.append(normalized_name)
                else:
                    # Pattern 1b: keyword + capitalized words (stop at location markers)
                    # "Museu Nacional", "Biblioteca Nacional"
                    # Match until we hit location markers (in, at, from) or lowercase word
                    parts = [keyword.title() if keyword.lower() in self.patterns.KEYWORD_NORMALIZATION else keyword]
                    words_after = text_after[len(keyword):].strip().split()
                    for word in words_after:
                        # Stop at location indicators
                        if word.lower() in ['in', 'at', 'from', 'located', 'on', 'near']:
                            break
                        # Stop at lowercase words (articles, prepositions)
                        # NOTE(review): 'de la' can never equal a single
                        # whitespace-split token -- dead entry, verify intent.
                        if word.lower() in ['the', 'a', 'an', 'de', 'do', 'da', 'del', 'de la', 'di']:
                            break
                        # Include capitalized words
                        if word[0].isupper():
                            parts.append(word)
                        else:
                            break
                    if len(parts) > 1:  # At least keyword + one name part
                        # Normalize keyword to canonical English form
                        normalized_keyword = self.patterns.KEYWORD_NORMALIZATION.get(
                            keyword.lower(),
                            keyword.title()
                        )
                        full_name = ' '.join([normalized_keyword] + parts[1:])
                        names.append(full_name)
                # Pattern 2: Keyword appears AFTER the name
                # "British Museum", "National Library"
                # Extract text BEFORE keyword (up to 50 chars)
                prefix = sentence[max(0, idx-50):idx].strip()
                if prefix:
                    words = prefix.split()
                    name_parts = []
                    for i, word in enumerate(reversed(words)):
                        # Skip leading articles only (nothing collected yet)
                        if word.lower() in ['the', 'a', 'an'] and len(name_parts) == 0:
                            continue  # Skip leading article
                        # Stop at location indicators
                        if word.lower() in ['in', 'at', 'from', 'located']:
                            break
                        # Include capitalized words or mid-name prepositions
                        if word[0].isupper() or word.lower() in ['of', 'for', 'and']:
                            name_parts.insert(0, word)
                        else:
                            break
                    if name_parts:
                        # Normalize the keyword to canonical English form
                        normalized_keyword = self.patterns.KEYWORD_NORMALIZATION.get(
                            keyword.lower(),
                            keyword.title()
                        )
                        full_name = ' '.join(name_parts + [normalized_keyword])
                        names.append(full_name)
                idx += 1
    # Pattern 4: Match ISIL pattern followed by institution name
    # Example: "NL-AsdAM for Amsterdam Museum"
    isil_matches = self.patterns.ISIL_PATTERN.finditer(sentence)
    for match in isil_matches:
        # Look for "for" or "identifies" after ISIL code
        text_after = sentence[match.end():match.end()+50]
        for_match = re.search(r'\s+(?:for|identifies)\s+([A-Z][a-zA-Z\s]+)', text_after)
        if for_match:
            name = for_match.group(1).strip()
            # Clean up name (stop at punctuation)
            name = re.split(r'[,.\(\)]', name)[0].strip()
            if name:
                names.append(name)
    return list(set(names))  # Remove duplicates (order not guaranteed)
def _extract_location(self, sentence: str) -> Tuple[Optional[str], Optional[str]]:
"""
Extract location information from sentence.
Args:
sentence: Sentence to extract from
Returns:
Tuple of (city, country) or (None, None)
"""
city = None
country = None
# Extract city using "in [City]" pattern
city_match = self.patterns.CITY_PATTERN.search(sentence)
if city_match:
city = city_match.group(1).strip()
# Extract country code if present (e.g., "NL-", "US-")
isil_match = self.patterns.ISIL_PATTERN.search(sentence)
if isil_match:
isil_code = isil_match.group(1)
country = isil_code.split('-')[0] # First part is country code
# If no country from ISIL, scan sentence for country names
if not country:
sentence_lower = sentence.lower()
# Sort by length (longest first) to match "United States" before "States"
sorted_countries = sorted(
self.patterns.COUNTRY_NAME_TO_CODE.items(),
key=lambda x: len(x[0]),
reverse=True
)
for country_name, country_code in sorted_countries:
# Use word boundary matching to avoid false positives
# e.g., "Austria" shouldn't match "Australian"
if f' {country_name} ' in f' {sentence_lower} ' or \
sentence_lower.startswith(f'{country_name} ') or \
sentence_lower.endswith(f' {country_name}'):
country = country_code
break
return city, country
# =========================================================================
# V5 VALIDATION METHODS
# =========================================================================
def _validate_country_context(
self,
sentence: str,
name: str,
inferred_country: Optional[str]
) -> Tuple[Optional[str], str]:
"""
Validate country assignment by checking for explicit mentions in context.
V5 enhancement to prevent geographic errors like assigning Malaysian
institutions to Netherlands based solely on conversation filename.
Args:
sentence: The sentence containing the institution mention
name: Institution name being validated
inferred_country: Country code inferred from conversation filename
Returns:
Tuple of (validated_country, country_source) where:
- validated_country: ISO 3166-1 alpha-2 country code or None
- country_source: 'explicit' | 'inferred' | 'none'
Examples:
>>> # Explicit country mention overrides inferred
>>> validate("University Malaysia in Kuala Lumpur", "University Malaysia", "NL")
('MY', 'explicit')
>>> # No explicit country, use inferred
>>> validate("Amsterdam Museum holds...", "Amsterdam Museum", "NL")
('NL', 'inferred')
>>> # Contradiction: reject extraction
>>> validate("Islamic University Malaysia", "Islamic University", "NL")
(None, 'none')
"""
# First, check if sentence explicitly mentions ANY country
sentence_lower = sentence.lower()
name_lower = name.lower()
# Look for explicit country mentions near the institution name
# Pattern 1: "[Name] in [Country]"
# Pattern 2: "[Name], [City], [Country]"
# Pattern 3: "[Country]'s [Name]" or "[Country] [Name]"
explicit_country = None
# Sort countries by length (longest first) to match multi-word names
sorted_countries = sorted(
self.patterns.COUNTRY_NAME_TO_CODE.items(),
key=lambda x: len(x[0]),
reverse=True
)
for country_name, country_code in sorted_countries:
# Check if country name appears in sentence
if f' {country_name} ' in f' {sentence_lower} ' or \
sentence_lower.startswith(f'{country_name} ') or \
sentence_lower.endswith(f' {country_name}'):
# Check if country mention is near institution name (within 50 chars)
name_pos = sentence_lower.find(name_lower)
country_pos = sentence_lower.find(country_name)
if name_pos != -1 and country_pos != -1:
distance = abs(name_pos - country_pos)
if distance < 50: # Within 50 characters = likely related
explicit_country = country_code
break
# Check for ISIL codes (strongest signal for country)
isil_match = self.patterns.ISIL_PATTERN.search(sentence)
if isil_match:
isil_code = isil_match.group(1)
explicit_country = isil_code.split('-')[0]
# Decision logic
if explicit_country:
# Found explicit country mention in text
if inferred_country and explicit_country != inferred_country:
# Contradiction: explicit country contradicts inferred country
# Example: sentence mentions "Malaysia" but inferred is "NL"
# REJECT: This institution doesn't belong to inferred country
return None, 'none'
else:
# Explicit country matches inferred (or no inferred country)
return explicit_country, 'explicit'
# No explicit country found
if inferred_country:
# Use inferred country, but mark as less confident
return inferred_country, 'inferred'
# No country information available
return None, 'none'
def _is_organization_or_network(self, name: str, sentence: str) -> bool:
"""
Check if extracted name is an organization/network rather than an institution.
V5 enhancement to filter out:
- International organizations (IFLA, UNESCO, ICOM)
- Networks and platforms (Archive Net, Museum Association)
- Federations and consortia
Args:
name: Institution name to validate
sentence: Context sentence
Returns:
True if name should be filtered (is an organization/network)
Examples:
>>> _is_organization_or_network("IFLA", "IFLA sets library standards")
True
>>> _is_organization_or_network("National Library", "The National Library...")
False
"""
name_lower = name.lower()
sentence_lower = sentence.lower()
# Organization blacklist (case-insensitive)
ORGANIZATION_BLACKLIST = {
# International organizations
'ifla', 'unesco', 'icom', 'icomos', 'ica',
'international federation of library associations',
'international council of museums',
'international council on archives',
# Networks and platforms
'archive net', 'netwerk oorlogsbronnen',
'museum association', 'archives association',
'library consortium', 'museum network',
# Generic organizational terms (when standalone)
'federation', 'consortium', 'network', 'association',
'platform', 'union', 'alliance',
}
# Check exact match against blacklist
if name_lower in ORGANIZATION_BLACKLIST:
return True
# Check if any blacklist term is contained in name
for org_term in ORGANIZATION_BLACKLIST:
if org_term in name_lower and len(name_lower) - len(org_term) < 5:
# Name is mostly just the blacklisted term
return True
# Pattern detection: "X is a network of Y"
NETWORK_PATTERNS = [
r'\b' + re.escape(name_lower) + r'\s+is\s+a\s+network',
r'\b' + re.escape(name_lower) + r'\s+is\s+an?\s+organization',
r'\b' + re.escape(name_lower) + r'\s+is\s+an?\s+association',
r'\b' + re.escape(name_lower) + r'\s+is\s+an?\s+federation',
r'\bnetwork\s+of\s+\d+\s+\w+\s+including\s+' + re.escape(name_lower),
]
for pattern in NETWORK_PATTERNS:
if re.search(pattern, sentence_lower):
return True
return False
def _is_proper_institutional_name(self, name: str, sentence: str) -> bool:
"""
Validate that extracted name is a proper institution name, not a generic term.
V5 enhancement to filter out:
- Generic descriptors (Library FabLab, Museum Café)
- Concepts/services rather than named institutions
- Academic departments without proper institutional context
Args:
name: Institution name to validate
sentence: Context sentence
Returns:
True if name is valid, False if should be filtered
Examples:
>>> _is_proper_institutional_name("Library FabLab", "The Library FabLab...")
False # Generic service descriptor
>>> _is_proper_institutional_name("Rijksmuseum", "Rijksmuseum in Amsterdam")
True # Valid compound name
>>> _is_proper_institutional_name("Southeast Asian Studies", "...")
False # Academic department without institution
"""
name_lower = name.lower()
# Generic descriptor blacklist
GENERIC_DESCRIPTORS = {
'library fablab', 'library makerspace', 'library café', 'library cafe',
'museum café', 'museum cafe', 'museum shop', 'museum store',
'archive reading room', 'archive portal',
'library services', 'museum services', 'archive services',
# Too generic (without specific qualifier)
'dutch museum', 'dutch library', 'dutch archive',
'local archive', 'local museum', 'local library',
'university library', 'university archive', 'university museum',
'city archive', 'city museum', 'city library',
}
# Check against generic descriptor blacklist
if name_lower in GENERIC_DESCRIPTORS:
return False
# Check for generic patterns
GENERIC_PATTERNS = [
r'^(library|museum|archive)\s+(fablab|makerspace|café|cafe|shop)$',
r'^(university|city|local|regional)\s+(library|museum|archive)$',
]
for pattern in GENERIC_PATTERNS:
if re.match(pattern, name_lower):
return False
# Check for academic department patterns (without institutional context)
# "Southeast Asian Studies" is too vague
# "Southeast Asian Studies Library, Leiden University" would be OK
if name_lower.endswith(' studies') and not any(
keyword in name_lower for keyword in ['library', 'museum', 'archive', 'center', 'centre']
):
return False
# Minimum name requirements
words = name.split()
# Check for compound words (Rijksmuseum, Tropenmuseum)
# These are allowed as single-word names
institution_keywords = [
'museum', 'museu', 'museo', 'musée', 'muzeum',
'library', 'biblioteca', 'bibliothek', 'bibliotheek',
'archive', 'archivo', 'archiv', 'archief',
'gallery', 'galerie',
]
is_compound_word = False
if len(words) == 1:
# Check if single word contains institution keyword as suffix
for keyword in institution_keywords:
if name_lower.endswith(keyword) and len(name) > len(keyword):
# Has prefix before keyword (Rijks-museum)
is_compound_word = True
break
# Single-word names only allowed if compound
if len(words) == 1 and not is_compound_word:
return False
# Multi-word names must have at least one word that's NOT just the keyword
if len(words) >= 2:
# At least one word should be a proper name (not just "National Museum")
has_specific_name = False
for word in words:
word_lower = word.lower()
# Skip articles, prepositions, and generic adjectives
if word_lower in ['the', 'of', 'for', 'and', 'national', 'state',
'public', 'royal', 'central', 'general']:
continue
# Skip institution keywords
if word_lower in institution_keywords:
continue
# Found a specific name component
has_specific_name = True
break
if not has_specific_name:
# Name is too generic (e.g., "National Museum" without specific qualifier)
return False
return True
def _calculate_confidence_v5(
self,
name: str,
institution_type: Optional[str],
city: Optional[str],
country: Optional[str],
identifiers: List[Identifier],
sentence: str,
country_source: str
) -> float:
"""
Calculate confidence score for extracted institution (V5 algorithm).
V5 changes from V4:
- Lower base score (0.2 vs 0.3) - more conservative
- Penalties for weak signals (single-word names, inferred country)
- Bonuses for strong signals (explicit location mentions, identifiers)
- Higher threshold (0.6 vs 0.5) for final filtering
Scoring algorithm:
Base: 0.2 (lower than v4's 0.3)
Positive signals:
- +0.3 Has institution type
- +0.2 Has explicit location (city OR country)
- +0.4 Has identifier (ISIL/Wikidata/VIAF)
- +0.2 Name is 2-6 words (optimal length)
- +0.2 Explicit "is a" pattern
- +0.1 Country from explicit mention (not inferred)
Negative signals:
- -0.2 Single-word name without compound validation
- -0.3 Generic descriptor pattern
- -0.2 Country only inferred
- -0.5 Organization/network blacklist match
Args:
name: Institution name
institution_type: Detected type (MUSEUM, LIBRARY, etc.)
city: Detected city
country: Validated country code
identifiers: List of extracted identifiers
sentence: Source sentence for context analysis
country_source: 'explicit', 'inferred', or 'none'
Returns:
Confidence score (0.0-1.0)
"""
score = 0.2 # Lower base than v4 (was 0.3)
# Positive signals
# +0.3 Has institution type
if institution_type:
score += 0.3
# +0.2 Has explicit location (city OR country with explicit source)
has_explicit_location = bool(city) or (country and country_source == 'explicit')
if has_explicit_location:
score += 0.2
# +0.4 Has identifier (strong signal of real institution)
if identifiers:
score += 0.4
# +0.2 Name is 2-6 words (optimal length, not too short/long)
word_count = len(name.split())
if 2 <= word_count <= 6:
score += 0.2
# +0.2 Explicit "is a" pattern in sentence
is_a_patterns = [
rf'\b{re.escape(name)}\b\s+is\s+a\s+(museum|library|archive|gallery)',
rf'\b{re.escape(name)}\b,\s+a\s+(museum|library|archive|gallery)',
]
if any(re.search(pattern, sentence, re.IGNORECASE) for pattern in is_a_patterns):
score += 0.2
# +0.1 Country from explicit mention (bonus beyond location bonus)
if country_source == 'explicit':
score += 0.1
# Negative signals
# -0.2 Single-word name (risky unless compound word)
if word_count == 1:
# Check if it's a compound word (e.g., Rijksmuseum)
is_compound = len(name) > 12 or any(
substring in name.lower()
for substring in ['museum', 'archief', 'bibliotheek', 'gallery']
)
if not is_compound:
score -= 0.2
# -0.3 Generic descriptor pattern
generic_patterns = [
r'\b(national|state|provincial|regional|local)\s+(museum|library|archive)\b',
r'\b(university|college|school)\s+(library|archive)\b',
r'\b(public|city|town)\s+library\b',
]
if any(re.search(pattern, name, re.IGNORECASE) for pattern in generic_patterns):
# But allow if it has a specific qualifier (e.g., "National Museum of Brazil")
has_specific = re.search(r'\b(of|in|for|at)\s+\w+', name, re.IGNORECASE)
if not has_specific:
score -= 0.3
# -0.2 Country only inferred (weak geographic signal)
if country_source == 'inferred':
score -= 0.2
# -0.5 Organization/network (should be filtered, but penalty if missed)
# Note: This should be caught by _is_organization_or_network() first
org_keywords = ['IFLA', 'UNESCO', 'ICOM', 'Network', 'Association', 'Foundation']
if any(keyword in name for keyword in org_keywords):
score -= 0.5
# Clamp to [0.0, 1.0]
return max(0.0, min(1.0, score))
def _extract_identifiers(self, sentence: str) -> List[Identifier]:
    """
    Extract external identifiers (ISIL, Wikidata, VIAF) from a sentence.

    ISIL codes are gathered from both the base pattern and the
    context-aware patterns (Nov 2025), filtered against a blacklist of
    common false positives (CD-ROM, UTF-8, etc.), and required to carry a
    2-character country prefix. All three schemes are deduplicated, so a
    code mentioned twice in one sentence yields a single Identifier
    (previously only ISIL codes were deduplicated; Wikidata/VIAF repeats
    produced duplicate records).

    Args:
        sentence: Sentence to extract from

    Returns:
        List of Identifier objects, unique per (scheme, value)
    """
    identifiers: List[Identifier] = []

    def _is_valid_isil(code: str) -> bool:
        # Reject blacklisted lookalikes (case-insensitive) and codes whose
        # country prefix is not exactly 2 characters.
        if any(code.lower() == blacklisted.lower()
               for blacklisted in self.patterns.ISIL_BLACKLIST):
            return False
        return len(code.split('-')[0]) == 2

    # ISIL: base pattern + context-aware patterns, collected into a set
    # to avoid duplicates across the two pattern families.
    isil_codes_found: Set[str] = set()
    for match in self.patterns.ISIL_PATTERN.finditer(sentence):
        isil_code = match.group(1)
        if _is_valid_isil(isil_code):
            isil_codes_found.add(isil_code)
    for context_pattern in self.patterns.ISIL_CONTEXT_PATTERNS:
        for match in context_pattern.finditer(sentence):
            isil_code = match.group(1)
            if _is_valid_isil(isil_code):
                isil_codes_found.add(isil_code)
    for isil_code in isil_codes_found:
        identifiers.append(Identifier(
            identifier_scheme="ISIL",
            identifier_value=isil_code,
            identifier_url=None,
            assigned_date=None
        ))

    # Wikidata IDs (deduplicated, first-occurrence order preserved)
    seen_wikidata: Set[str] = set()
    for match in self.patterns.WIKIDATA_PATTERN.finditer(sentence):
        wikidata_id = match.group(1)
        if wikidata_id in seen_wikidata:
            continue
        seen_wikidata.add(wikidata_id)
        identifiers.append(Identifier(
            identifier_scheme="Wikidata",
            identifier_value=wikidata_id,
            identifier_url=f"https://www.wikidata.org/entity/{wikidata_id}",  # type: ignore[arg-type]
            assigned_date=None
        ))

    # VIAF IDs (deduplicated, first-occurrence order preserved)
    seen_viaf: Set[str] = set()
    for match in self.patterns.VIAF_URL_PATTERN.finditer(sentence):
        viaf_id = match.group(1)
        if viaf_id in seen_viaf:
            continue
        seen_viaf.add(viaf_id)
        identifiers.append(Identifier(
            identifier_scheme="VIAF",
            identifier_value=viaf_id,
            identifier_url=f"https://viaf.org/viaf/{viaf_id}",  # type: ignore[arg-type]
            assigned_date=None
        ))

    return identifiers
def _calculate_confidence(
self,
name: str,
institution_type: Optional[InstitutionType],
city: Optional[str],
identifiers: List[Identifier],
sentence: str
) -> float:
"""
Calculate confidence score for extracted entity.
Scoring criteria:
- Has institution type: +0.2
- Has location: +0.1
- Has identifier: +0.3
- Name length appropriate (2-6 words): +0.2
- Explicit context ("The X is a museum"): +0.2
Base score: 0.3
Args:
name: Institution name
institution_type: Classified type
city: Extracted city
identifiers: Extracted identifiers
sentence: Source sentence
Returns:
Confidence score between 0.0 and 1.0
"""
score = 0.3 # Base score
# Has institution type
if institution_type:
score += 0.2
# Has location
if city:
score += 0.1
# Has identifiers (strong signal)
if identifiers:
score += 0.3
# Name length (2-6 words is typical)
word_count = len(name.split())
if 2 <= word_count <= 6:
score += 0.2
# Explicit "is a" pattern (strong signal)
sentence_lower = sentence.lower()
if f"{name.lower()} is a" in sentence_lower:
score += 0.2
# Cap at 1.0
return min(1.0, score)
def _deduplicate_entities(self, entities: List[ExtractedEntity]) -> List[ExtractedEntity]:
"""
Remove duplicate entities, keeping highest confidence version.
Uses fuzzy matching to detect near-duplicates like:
- "National Museum" vs "National Museu" (truncation)
- "Archive" vs "Archives" (pluralization)
- "Vietnam Museum" vs "Vietnamese Museum" (minor variations)
Args:
entities: List of extracted entities
Returns:
Deduplicated list
"""
if not entities:
return []
# Group by normalized name (case-insensitive + fuzzy matching)
deduplicated: List[ExtractedEntity] = []
for entity in entities:
# Check if this entity is similar to any already deduplicated
is_duplicate = False
for i, existing in enumerate(deduplicated):
# Use fuzzy matching if available, otherwise exact case-insensitive match
if RAPIDFUZZ_AVAILABLE:
similarity = fuzz.ratio(
entity.name.lower(),
existing.name.lower()
)
# 85% similarity threshold catches most duplicates
# "Vietnam Museum" vs "Vietnamese Museum" = 88.9%
# "Archive" vs "Archives" = 93.3%
# "National Museum" vs "National Museu" = 96.0%
if similarity >= 85:
is_duplicate = True
# Keep entity with higher confidence
if entity.confidence_score > existing.confidence_score:
deduplicated[i] = entity
break
else:
# Fallback: exact case-insensitive match
if entity.name.lower() == existing.name.lower():
is_duplicate = True
if entity.confidence_score > existing.confidence_score:
deduplicated[i] = entity
break
if not is_duplicate:
deduplicated.append(entity)
return deduplicated
# =========================================================================
# ENTITY TO HERITAGE CUSTODIAN CONVERSION
# =========================================================================
def _entity_to_custodian(
    self,
    entity: ExtractedEntity,
    conversation_id: Optional[str] = None,
    conversation_name: Optional[str] = None
) -> HeritageCustodian:
    """
    Convert an extracted entity to a HeritageCustodian record.

    Builds a primary Location from the entity's city/country when either
    is present, attaches Tier-4 (inferred) provenance pointing back at the
    source conversation, and defaults the type/status enums to UNKNOWN
    when the entity carries none.

    Args:
        entity: Extracted entity
        conversation_id: UUID of source conversation
        conversation_name: Name of source conversation

    Returns:
        HeritageCustodian record with provenance metadata
    """
    # Generate a unique, resolvable ID for this record
    record_id = f"https://w3id.org/heritage/custodian/{uuid.uuid4()}"

    # Build location only if city/country available
    locations = []
    if entity.city or entity.country:
        locations.append(Location(
            location_type=None,
            street_address=None,
            city=entity.city,
            postal_code=None,
            region=None,
            country=entity.country,
            latitude=None,
            longitude=None,
            geonames_id=None,
            is_primary=True
        ))

    # Provenance: NLP extraction from conversations is Tier-4 inferred data
    provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Pattern matching + heuristic NER",
        confidence_score=entity.confidence_score,
        conversation_id=conversation_id,
        source_url=None,
        verified_date=None,
        verified_by=None
    )

    # Build description from whatever context is available. Joining parts
    # fixes the prior failure mode where a text snippet without a
    # conversation name was appended to a None description (or dropped).
    description_parts = []
    if conversation_name:
        description_parts.append(f"Extracted from conversation: {conversation_name}")
    if entity.text_snippet:
        description_parts.append(f"Context: {entity.text_snippet}")
    description = "\n\n".join(description_parts) if description_parts else None

    # Create HeritageCustodian record
    custodian = HeritageCustodian(
        id=record_id,
        name=entity.name,
        institution_type=entity.institution_type or InstitutionType.UNKNOWN,
        organization_status=OrganizationStatus.UNKNOWN,
        description=description,
        parent_organization=None,
        founded_date=None,
        closed_date=None,
        homepage=None,
        ghcid_numeric=None,
        ghcid_current=None,
        ghcid_original=None,
        ghcid_history=None,
        contact_info=None,
        locations=locations,
        identifiers=entity.identifiers if entity.identifiers else [],
        provenance=provenance
    )
    return custodian