#!/usr/bin/env python3
|
|
"""
|
|
Extract typed entities from web archives using annotated patterns.
|
|
|
|
Loads patterns from dutch_web_patterns.yaml and processes web archive HTML
|
|
to extract entities with CH-Annotator types and relationship predicates.
|
|
|
|
This script:
|
|
1. Loads entity and discard patterns from dutch_web_patterns.yaml
|
|
2. Finds custodian files with web_enrichment.web_archives references
|
|
3. For each custodian, processes HTML from web archive mirror directories
|
|
4. Extracts text content from HTML
|
|
5. Matches against discard patterns first (filter out navigation, UI, etc.)
|
|
6. Matches against entity patterns (extract with types and relationships)
|
|
7. Applies capture groups to extract sub-entities
|
|
8. Generates relationship triples
|
|
9. Adds pattern_entity_claims section to custodian YAML files
|
|
|
|
Usage:
|
|
python scripts/extract_with_patterns.py [--dry-run] [--limit N] [--custodian GHCID]
|
|
python scripts/extract_with_patterns.py --verbose --limit 3
|
|
"""
|
|
|
|
import argparse
|
|
import glob
|
|
import os
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from html.parser import HTMLParser
|
|
from io import StringIO
|
|
|
|
import yaml
|
|
|
|
|
|
# ============================================================================
# DUTCH STOPWORD FILTER
# ============================================================================

# Common Dutch words that should NOT be extracted as place names, organization
# names, etc.  These cause false positives when patterns like "gemeente (\w+)"
# match "gemeente op de straat".
DUTCH_STOPWORDS = {
    # Articles
    'de', 'het', 'een', 'der', 'des', 'den',
    # Prepositions
    'op', 'in', 'van', 'aan', 'te', 'tot', 'bij', 'met', 'voor', 'na', 'naar',
    'om', 'uit', 'over', 'onder', 'door', 'tegen', 'tussen', 'zonder', 'binnen',
    'buiten', 'langs', 'sinds', 'tijdens', 'vanaf', 'volgens', 'wegens',
    # Pronouns
    'ik', 'je', 'jij', 'u', 'hij', 'zij', 'ze', 'wij', 'we', 'jullie', 'hen', 'hun',
    'mij', 'jou', 'hem', 'haar', 'ons', 'die', 'dat', 'dit', 'deze', 'wat', 'wie',
    'welke', 'welk', 'waar', 'wanneer', 'waarom', 'hoe', 'er', 'hier', 'daar',
    # Common verbs (conjugated forms that might appear after "gemeente", etc.)
    'is', 'zijn', 'was', 'waren', 'ben', 'bent', 'geweest', 'wordt', 'worden',
    'werd', 'werden', 'heeft', 'hebben', 'had', 'hadden', 'gehad', 'kan', 'kunnen',
    'kon', 'konden', 'gekund', 'mag', 'mogen', 'mocht', 'mochten', 'moet', 'moeten',
    'moest', 'moesten', 'zal', 'zullen', 'zou', 'zouden', 'wil', 'willen', 'wilde',
    'wilden', 'gewild', 'zien', 'ziet', 'zag', 'zagen', 'gezien', 'gaan', 'gaat',
    'ging', 'gingen', 'gegaan', 'komen', 'komt', 'kwam', 'kwamen', 'gekomen',
    'doen', 'doet', 'deed', 'deden', 'gedaan', 'maken', 'maakt', 'maakte', 'maakten',
    'gemaakt', 'zeggen', 'zegt', 'zei', 'zeiden', 'gezegd', 'staan', 'staat', 'stond',
    'stonden', 'gestaan', 'liggen', 'ligt', 'lag', 'lagen', 'gelegen', 'woonde', 'woont',
    # Common adjectives/adverbs
    'ook', 'nog', 'al', 'wel', 'niet', 'geen', 'meer', 'veel', 'weinig', 'erg',
    'heel', 'zeer', 'zo', 'nu', 'dan', 'toen', 'weer', 'vaak', 'altijd', 'nooit',
    'soms', 'reeds', 'steeds', 'pas', 'net', 'juist', 'precies', 'ongeveer',
    # Conjunctions
    'en', 'of', 'maar', 'want', 'dus', 'omdat', 'als', 'indien', 'hoewel', 'tenzij',
    'totdat', 'terwijl', 'voordat', 'nadat', 'zodat', 'opdat', 'mits', 'ofschoon',
    # Common nouns that aren't places
    'eigendom', 'bezit', 'gebied', 'plaats', 'deel', 'kant', 'zijde', 'wijze',
    'manier', 'vorm', 'soort', 'type', 'naam', 'titel', 'datum', 'tijd', 'jaar',
    'dag', 'week', 'maand', 'uur', 'minuut', 'eeuw', 'periode', 'men', 'iemand',
    # Short words that are likely false positives
    'aa', 'ab', 'ad', 'af', 'ag', 'ah', 'al', 'am', 'as', 'at', 'au', 'be', 'bi',
    'bo', 'bu', 'ca', 'co', 'da', 'do', 'du', 'ed', 'ee', 'eg', 'ei', 'el', 'em',
    'ex', 'fa', 'fe', 'fi', 'fo', 'fu', 'ga', 'ge', 'go', 'gu', 'ha', 'he', 'hi',
    'ho', 'hu', 'id', 'ie', 'ig', 'ij', 'il', 'im', 'io', 'ir', 'ja', 'je', 'jo',
    'ju', 'ka', 'ke', 'ki', 'ko', 'ku', 'la', 'le', 'li', 'lo', 'lu', 'ma', 'me',
    'mi', 'mo', 'mu', 'na', 'ne', 'ni', 'no', 'nu', 'ob', 'od', 'oe', 'og', 'oh',
    'ok', 'ol', 'om', 'on', 'oo', 'op', 'or', 'os', 'ot', 'ou', 'ow', 'oz', 'pa',
    'pe', 'pi', 'po', 'pu', 'ra', 're', 'ri', 'ro', 'ru', 'sa', 'se', 'si', 'so',
    'su', 'ta', 'te', 'ti', 'to', 'tu', 'ub', 'ue', 'ug', 'ui', 'uk', 'ul', 'um',
    'un', 'up', 'ur', 'us', 'ut', 'uu', 'va', 've', 'vi', 'vo', 'vu', 'wa', 'we',
    'wi', 'wo', 'wu', 'za', 'ze', 'zi', 'zo', 'zu',
}

# Generic organization words that by themselves don't make a valid entity,
# e.g. "de Stichting" without a name is too generic.
GENERIC_ORG_WORDS = {
    'stichting', 'vereniging', 'genootschap', 'organisatie', 'instelling',
    'instituut', 'centrum', 'bureau', 'dienst', 'raad', 'commissie',
    'archief', 'museum', 'bibliotheek', 'collectie', 'fonds',
}

# Entity types whose capture groups should be validated against stopwords.
# These are patterns where captured groups are expected to be proper nouns
# (places, names).
STOPWORD_FILTERED_ENTITY_TYPES = {
    'GRP.GOV',      # Government - municipality names should be proper nouns
    'GRP.GOV.MUN',  # Municipality
    'GRP.GOV.PRO',  # Province
    'GRP.HER',      # Heritage institutions - name parts should be proper nouns
    'GRP.HER.MUS',  # Museum
    'GRP.HER.ARC',  # Archive
    'GRP.HER.LIB',  # Library
    'GRP.ORG',      # Organizations - name parts should be proper nouns
    'TOP.SET',      # Settlement names
    'TOP.BLD',      # Building names
    'AGT.PER',      # Person names
}


def is_stopword_match(entity_result: dict) -> bool:
    """
    Check if an entity match is actually a false positive due to stopwords.

    Args:
        entity_result: Match dict as produced by ``PatternMatcher.match_entity``.
            Keys read here: 'entity_type', 'entity_subtype', 'captures',
            'matched_text'.

    Returns:
        True if the match should be REJECTED (is a false positive).
    """
    entity_type = entity_result.get('entity_type') or ''
    entity_subtype = entity_result.get('entity_subtype') or ''

    # Only proper-noun-bearing entity types are subject to stopword filtering.
    should_filter = (
        entity_type in STOPWORD_FILTERED_ENTITY_TYPES or
        entity_subtype in STOPWORD_FILTERED_ENTITY_TYPES
    )

    if not should_filter:
        return False

    # Check capture groups for stopwords.
    captures = entity_result.get('captures', {})
    # Fix: the capture index key was bound but never used; iterate values only.
    for cap in captures.values():
        value = cap.get('value', '').lower().strip()
        cap_type = cap.get('type', '')

        # Check if this capture group type should be validated.
        if cap_type in STOPWORD_FILTERED_ENTITY_TYPES or entity_type in STOPWORD_FILTERED_ENTITY_TYPES:
            if value in DUTCH_STOPWORDS:
                return True  # Reject this match

            # Also reject if captured value is too short (less than 3 chars).
            # NOTE(review): no allow-list of short Dutch place abbreviations
            # exists yet, so every sub-3-character capture is rejected.
            if len(value) < 3:
                return True

    # Check the matched text itself if no captures.
    if not captures:
        # Extract the last word (often the "name" part) from matched text.
        matched = entity_result.get('matched_text', '')
        words = matched.lower().split()
        if words:
            last_word = words[-1]
            if last_word in DUTCH_STOPWORDS:
                return True

    # Check for generic organization matches like "de Stichting" (without a real name).
    matched_text = entity_result.get('matched_text', '').lower().strip()
    words = matched_text.split()

    # Pattern: article + generic org word (e.g., "de stichting", "het archief").
    if len(words) == 2:
        if words[0] in {'de', 'het', 'een'} and words[1] in GENERIC_ORG_WORDS:
            return True  # Too generic, reject

    return False
|
|
|
|
|
|
# ============================================================================
|
|
# YAML HANDLING
|
|
# ============================================================================
|
|
|
|
class CustomDumper(yaml.SafeDumper):
    """SafeDumper subclass serving as a hook point for custom representers."""
|
|
|
|
|
|
def str_representer(dumper, data):
    """Represent multiline strings in YAML literal block style ('|').

    Single-line strings are emitted with the default (plain) style.
    """
    block_style = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=block_style)
|
|
|
|
|
|
# Register the multiline-aware string representer on the custom dumper.
CustomDumper.add_representer(str, str_representer)
|
|
|
|
|
|
def load_yaml(filepath: Path) -> dict:
    """Parse *filepath* as YAML; an empty document yields an empty dict."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        loaded = yaml.safe_load(handle)
    return loaded or {}
|
|
|
|
|
|
def save_yaml(filepath: Path, data: dict) -> None:
    """Serialize *data* to *filepath* using the project's CustomDumper settings."""
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            Dumper=CustomDumper,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )
|
|
|
|
|
|
# ============================================================================
|
|
# HTML TEXT EXTRACTION
|
|
# ============================================================================
|
|
|
|
class MLStripper(HTMLParser):
    """HTML parser that accumulates only visible text content.

    Character data inside <script>, <style>, and <noscript> elements is
    suppressed; everything else is appended to an internal buffer which
    callers drain via ``get_data()``.
    """

    # Elements whose text content must never appear in the extracted text.
    _SKIPPED_TAGS = ('script', 'style', 'noscript')

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()
        self.in_script_or_style = False

    def handle_starttag(self, tag, attrs):
        # Entering a non-visible element: suppress subsequent character data.
        if tag in self._SKIPPED_TAGS:
            self.in_script_or_style = True

    def handle_endtag(self, tag):
        # Leaving a non-visible element: resume collecting text.
        if tag in self._SKIPPED_TAGS:
            self.in_script_or_style = False

    def handle_data(self, data):
        if not self.in_script_or_style:
            self.text.write(data)

    def get_data(self):
        """Return all visible text collected so far."""
        return self.text.getvalue()
|
|
|
|
|
|
def strip_tags(html: str) -> str:
    """Return the visible text of *html* with all markup removed.

    Parses with MLStripper; if the parser raises on malformed input,
    degrades to a crude regex substitution of anything tag-shaped.
    """
    stripper = MLStripper()
    try:
        stripper.feed(html)
        return stripper.get_data()
    except Exception:
        # Fallback: simple regex-based stripping
        return re.sub(r'<[^>]+>', ' ', html)
|
|
|
|
|
|
def extract_text_from_html(html_content: str) -> list[str]:
    """
    Extract meaningful text segments from HTML content.

    Returns one cleaned string per non-trivial line of the stripped text.
    Lines shorter than 3 characters, or consisting entirely of whitespace
    and punctuation, are dropped.
    """
    stripped = strip_tags(html_content)

    segments: list[str] = []
    for raw_line in stripped.split('\n'):
        # Collapse runs of whitespace into single spaces.
        cleaned = ' '.join(raw_line.split())

        # Too short to hold an entity mention.
        if len(cleaned) < 3:
            continue

        # Nothing but whitespace/punctuation.
        if re.match(r'^[\s\W]+$', cleaned):
            continue

        segments.append(cleaned)

    return segments
|
|
|
|
|
|
# ============================================================================
|
|
# PATTERN LOADING AND COMPILATION
|
|
# ============================================================================
|
|
|
|
class PatternMatcher:
    """
    Loads and compiles patterns from dutch_web_patterns.yaml.
    Provides matching against discard and entity patterns.

    Discard patterns filter out navigation/UI noise; entity patterns carry
    CH-Annotator types, capture-group configs, and relationship templates.
    Pattern order in the YAML file is the matching priority order.
    """

    def __init__(self, pattern_file: Path, strip_anchors: bool = True):
        """Load and compile patterns from YAML file.

        Args:
            pattern_file: Path to YAML pattern file
            strip_anchors: If True, remove ^ and $ anchors from entity patterns
                to enable substring matching (default: True)
        """
        self.pattern_file = pattern_file
        self.raw_data = load_yaml(pattern_file)
        self.strip_anchors = strip_anchors

        # Compiled patterns
        self.discard_patterns: list[tuple[re.Pattern, str]] = []  # (regex, reason)
        self.entity_patterns: list[dict] = []  # Full pattern config with compiled regex

        self._compile_patterns()

    def _strip_regex_anchors(self, pattern: str) -> str:
        r"""Remove ^ and $ anchors from a regex pattern for substring matching.

        Replaces anchors with word boundary markers (\b) to prevent false
        positives from partial word matches. For example:
            ^gemeente\s+(\w+)$  becomes  \bgemeente\s+(\w+)\b

        This allows the pattern to match "... gemeente Assen ..."
        but not "... gemeentebestuur ..." or "... in gemeente op de ...".

        Preserves trailing anchors that are escaped (\$).
        """
        if not self.strip_anchors:
            return pattern

        # Replace leading ^ with word boundary \b (but not \^)
        if pattern.startswith('^'):
            pattern = r'\b' + pattern[1:]
        else:
            # Add word boundary at start if not present
            if not pattern.startswith(r'\b'):
                pattern = r'\b' + pattern

        # Replace trailing $ with word boundary \b (but not \$)
        if pattern.endswith('$') and not pattern.endswith('\\$'):
            pattern = pattern[:-1] + r'\b'
        else:
            # Add word boundary at end if not present
            if not pattern.endswith(r'\b'):
                pattern = pattern + r'\b'

        return pattern

    def _compile_patterns(self) -> None:
        """Compile all regex patterns for efficient matching."""
        # Compile discard patterns: discard_patterns -> {category: {patterns: [...]}}
        discard_section = self.raw_data.get('discard_patterns', {})
        for category, cat_data in discard_section.items():
            if isinstance(cat_data, dict) and 'patterns' in cat_data:
                for pat_item in cat_data['patterns']:
                    pattern_str = pat_item.get('pattern', '')
                    # Fall back to the category name when no explicit reason is given.
                    reason = pat_item.get('discard_reason', category)
                    if pattern_str:
                        try:
                            compiled = re.compile(pattern_str, re.IGNORECASE)
                            self.discard_patterns.append((compiled, reason))
                        except re.error as e:
                            # Invalid patterns are skipped with a warning, not fatal.
                            print(f"Warning: Invalid discard pattern '{pattern_str}': {e}")

        # Compile entity patterns (arbitrarily nested category tree)
        entity_section = self.raw_data.get('entity_patterns', {})
        self._compile_entity_section(entity_section)

    def _compile_entity_section(self, section: dict, parent_path: str = "") -> None:
        """Recursively compile entity patterns from nested structure."""
        for key, value in section.items():
            if isinstance(value, dict):
                if 'patterns' in value:
                    # This is a pattern category with actual patterns
                    for pat_item in value['patterns']:
                        pattern_str = pat_item.get('pattern', '')
                        if pattern_str:
                            try:
                                # Strip anchors for substring matching
                                pattern_for_compile = self._strip_regex_anchors(pattern_str)
                                compiled = re.compile(pattern_for_compile, re.IGNORECASE)
                                entity_config = {
                                    'regex': compiled,
                                    'pattern_str': pattern_str,  # Keep original for logging
                                    'pattern_compiled': pattern_for_compile,  # Actual compiled pattern
                                    'category': f"{parent_path}/{key}" if parent_path else key,
                                    'entity_type': pat_item.get('entity_type'),
                                    'entity_subtype': pat_item.get('entity_subtype'),
                                    'label_template': pat_item.get('label_template'),
                                    'capture_groups': pat_item.get('capture_groups', {}),
                                    'relationships': pat_item.get('relationships', []),
                                    'description': pat_item.get('description', ''),
                                }
                                self.entity_patterns.append(entity_config)
                            except re.error as e:
                                print(f"Warning: Invalid entity pattern '{pattern_str}': {e}")
                else:
                    # Nested category, recurse
                    new_path = f"{parent_path}/{key}" if parent_path else key
                    self._compile_entity_section(value, new_path)

    def should_discard(self, text: str) -> tuple[bool, Optional[str]]:
        """
        Check if text matches any discard pattern.

        Returns:
            Tuple of (should_discard, reason or None)
        """
        # Patterns are compiled IGNORECASE; lowering here additionally normalizes
        # the text that character classes in the patterns see.
        text_lower = text.lower().strip()

        for regex, reason in self.discard_patterns:
            if regex.search(text_lower):
                return True, reason

        return False, None

    def match_entity(self, text: str) -> Optional[dict]:
        """
        Match text against entity patterns.

        Uses search() instead of match() to find patterns anywhere in the text,
        not just at the beginning. This dramatically improves entity yield.
        First matching pattern wins (YAML order is priority order); matches
        rejected by the stopword filter fall through to later patterns.

        Returns:
            Dict with match info including entity_type, captures, relationships
            or None if no match
        """
        text_stripped = text.strip()

        for pattern in self.entity_patterns:
            match = pattern['regex'].search(text_stripped)
            if match:
                # Use the matched substring, not the full text
                matched_substring = match.group(0)

                result = {
                    'matched_text': matched_substring,
                    # Keep surrounding text only when the match is a proper substring.
                    'full_context': text_stripped if text_stripped != matched_substring else None,
                    'entity_type': pattern['entity_type'],
                    'entity_subtype': pattern['entity_subtype'],
                    'pattern_str': pattern['pattern_str'],
                    'category': pattern['category'],
                    'description': pattern['description'],
                    'captures': {},
                    'relationships': [],
                }

                # Extract capture groups (YAML keys are group numbers as strings)
                if pattern['capture_groups']:
                    for group_num, group_config in pattern['capture_groups'].items():
                        try:
                            group_idx = int(group_num)
                            if group_idx <= len(match.groups()):
                                captured_value = match.group(group_idx)
                                if captured_value:
                                    result['captures'][group_idx] = {
                                        'value': captured_value,
                                        'type': group_config.get('type'),
                                        'role': group_config.get('role'),
                                    }
                        except (ValueError, IndexError):
                            # Non-numeric keys / out-of-range groups are ignored.
                            pass

                # Generate relationships - use matched_substring as the entity
                if pattern['relationships']:
                    for rel in pattern['relationships']:
                        relationship = {
                            'predicate': rel.get('predicate'),
                            'subject': self._resolve_reference(rel.get('subject'), matched_substring, result['captures']),
                            'object': self._resolve_reference(rel.get('object'), matched_substring, result['captures']),
                            'confidence': rel.get('confidence', 0.8),
                        }

                        # Add type info if available
                        if rel.get('subject_type'):
                            relationship['subject_type'] = rel['subject_type']
                        if rel.get('object_type'):
                            relationship['object_type'] = rel['object_type']

                        result['relationships'].append(relationship)

                # Apply label template if exists ({N} placeholders -> capture N's value)
                if pattern['label_template'] and result['captures']:
                    try:
                        label = pattern['label_template']
                        for idx, cap in result['captures'].items():
                            label = label.replace(f'{{{idx}}}', cap['value'])
                        result['entity_label'] = label
                    except Exception:
                        result['entity_label'] = matched_substring
                else:
                    result['entity_label'] = matched_substring

                # Filter out false positives caused by stopwords in capture groups
                if is_stopword_match(result):
                    continue  # Try next pattern instead of returning this match

                return result

        return None

    def _resolve_reference(self, ref: Any, matched_text: str, captures: dict) -> Optional[str]:
        """Resolve a reference in a relationship definition.

        '$0' -> the full matched text; '$N' -> value of capture group N;
        'CUSTODIAN' -> placeholder substituted downstream; anything else
        is returned stringified as a literal.
        """
        if ref is None:
            return None
        if ref == '$0':
            return matched_text
        if isinstance(ref, str) and ref.startswith('$'):
            try:
                idx = int(ref[1:])
                if idx in captures:
                    return captures[idx]['value']
            except ValueError:
                # '$foo' with a non-numeric suffix falls through to str(ref).
                pass
        if ref == 'CUSTODIAN':
            return 'CUSTODIAN'  # Placeholder for the custodian being processed
        return str(ref)
|
|
|
|
|
|
# ============================================================================
|
|
# CUSTODIAN FILE PROCESSING
|
|
# ============================================================================
|
|
|
|
def find_html_files(archive_dir: Path) -> list[Path]:
    """Collect every ``*.html`` file under the archive's mirror/ and pages/ trees.

    Files from ``mirror/`` are listed before files from ``pages/``; either
    subdirectory may be absent.
    """
    found: list[Path] = []
    for subdir in ('mirror', 'pages'):
        candidate = archive_dir / subdir
        if candidate.exists():
            found.extend(candidate.rglob('*.html'))
    return found
|
|
|
|
|
|
def process_custodian_file(
    custodian_path: Path,
    base_path: Path,
    matcher: PatternMatcher,
    dry_run: bool = False,
    verbose: bool = False,
    show_entities: bool = False,
    show_unmatched: int = 0,
    min_length: int = 10
) -> dict:
    """
    Process a single custodian file to extract and add pattern-based entities.

    Args:
        custodian_path: Path to custodian YAML file
        base_path: Base path for web archives (data/custodian/)
        matcher: Compiled pattern matcher
        dry_run: If True, don't write changes
        verbose: If True, show detailed output
        show_entities: If True, print each entity as it's found
        show_unmatched: Number of unmatched segments to show (for debugging)
        min_length: Minimum text segment length to analyze

    Returns:
        Dict with processing stats: file, web_archives_found,
        html_files_processed, text_segments_analyzed, segments_discarded,
        entities_extracted, status ('skipped' | 'error' | 'no_web_archives' |
        'no_entities_found' | 'updated' | 'would_update'), error, and
        optionally unmatched_samples.
    """
    stats = {
        'file': str(custodian_path.name),
        'web_archives_found': 0,
        'html_files_processed': 0,
        'text_segments_analyzed': 0,
        'segments_discarded': 0,
        'entities_extracted': 0,
        'status': 'skipped',
        'error': None,
    }

    # Collect unmatched segments for debugging
    unmatched_samples = []

    try:
        custodian_data = load_yaml(custodian_path)
    except Exception as e:
        stats['status'] = 'error'
        stats['error'] = f"Failed to load YAML: {e}"
        return stats

    # Check for web_enrichment section
    web_enrichment = custodian_data.get('web_enrichment', {})
    web_archives = web_enrichment.get('web_archives', [])

    if not web_archives:
        stats['status'] = 'no_web_archives'
        return stats

    stats['web_archives_found'] = len(web_archives)

    all_claims = []
    # Per-reason discard tally; currently only accumulated, not reported.
    discard_counts = {}

    for archive in web_archives:
        archive_dir_str = archive.get('directory', '')
        if not archive_dir_str:
            continue

        # Archive directories are stored relative to the custodian base path.
        archive_dir = base_path / archive_dir_str
        if not archive_dir.exists():
            continue

        html_files = find_html_files(archive_dir)

        for html_file in html_files:
            stats['html_files_processed'] += 1

            try:
                # errors='replace' so undecodable bytes don't abort the file.
                with open(html_file, 'r', encoding='utf-8', errors='replace') as f:
                    html_content = f.read()
            except Exception as e:
                if verbose:
                    print(f" Warning: Could not read {html_file}: {e}")
                continue

            # Extract text segments
            text_segments = extract_text_from_html(html_content)

            for segment in text_segments:
                # Skip segments that are too short
                if len(segment) < min_length:
                    continue

                stats['text_segments_analyzed'] += 1

                # First check discard patterns (navigation, UI chrome, etc.)
                should_discard, discard_reason = matcher.should_discard(segment)
                if should_discard:
                    stats['segments_discarded'] += 1
                    discard_counts[discard_reason] = discard_counts.get(discard_reason, 0) + 1
                    continue

                # Try to match entity patterns
                entity_match = matcher.match_entity(segment)
                if entity_match:
                    stats['entities_extracted'] += 1

                    # Build claim record
                    claim = {
                        'entity': entity_match['entity_label'],
                        'matched_text': entity_match['matched_text'],
                        'entity_type': entity_match['entity_type'],
                    }

                    if entity_match.get('entity_subtype'):
                        claim['entity_subtype'] = entity_match['entity_subtype']

                    claim['matched_pattern'] = entity_match['pattern_str']
                    claim['pattern_category'] = entity_match['category']

                    # Add capture groups if any (keys stringified for YAML)
                    if entity_match['captures']:
                        claim['capture_groups'] = {
                            str(idx): cap for idx, cap in entity_match['captures'].items()
                        }

                    # Add relationships
                    if entity_match['relationships']:
                        claim['relationships'] = entity_match['relationships']

                    # Source file relative to custodian dir
                    try:
                        rel_path = html_file.relative_to(base_path)
                        claim['source_file'] = str(rel_path)
                    except ValueError:
                        # html_file lives outside base_path; keep the full path.
                        claim['source_file'] = str(html_file)

                    claim['confidence'] = 0.85  # Pattern-based extraction confidence

                    all_claims.append(claim)

                    # Show entity if flag is set
                    if show_entities:
                        print(f" → [{entity_match['entity_type']}] {entity_match['entity_label']}")
                else:
                    # Track unmatched segments for debugging
                    if show_unmatched > 0 and len(unmatched_samples) < show_unmatched:
                        # Only collect interesting segments (likely to contain entities):
                        # mid-length, mixed case, and not purely digits/punctuation.
                        if (len(segment) >= 15 and len(segment) <= 100 and
                                not segment.isupper() and
                                any(c.isupper() for c in segment[1:]) and
                                not re.match(r'^[\d\s\W]+$', segment)):
                            unmatched_samples.append(segment)

    # Add unmatched samples to stats for debugging
    if unmatched_samples:
        stats['unmatched_samples'] = unmatched_samples

    if not all_claims:
        stats['status'] = 'no_entities_found'
        return stats

    # Deduplicate claims by entity + type (first occurrence wins)
    seen = set()
    unique_claims = []
    for claim in all_claims:
        key = (claim['entity'], claim.get('entity_type', ''))
        if key not in seen:
            seen.add(key)
            unique_claims.append(claim)

    stats['entities_extracted'] = len(unique_claims)

    # Create pattern_entity_claims section
    pattern_entity_claims = {
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'extraction_method': 'pattern_based_extraction_v1',
        'pattern_file': 'dutch_web_patterns.yaml',
        'pattern_file_version': '1.0.0',
        'html_files_processed': stats['html_files_processed'],
        'text_segments_analyzed': stats['text_segments_analyzed'],
        'segments_discarded': stats['segments_discarded'],
        'entities_count': len(unique_claims),
        'claims': unique_claims,
    }

    # Add to custodian data (overwrites any previous extraction run)
    custodian_data['pattern_entity_claims'] = pattern_entity_claims

    if not dry_run:
        save_yaml(custodian_path, custodian_data)
        stats['status'] = 'updated'
    else:
        stats['status'] = 'would_update'

    return stats
|
|
|
|
|
|
def find_custodian_files_with_web_archives(custodian_dir: Path) -> list[Path]:
    """
    Find all custodian files that have web_enrichment.web_archives.

    Args:
        custodian_dir: Directory containing custodian YAML files

    Returns:
        Sorted list of paths to custodian files with web archives
    """
    matching: list[Path] = []

    for path in custodian_dir.glob("NL-*.yaml"):
        # Cheap substring probe instead of a full YAML parse.
        try:
            content = path.read_text(encoding='utf-8')
        except Exception:
            continue
        if 'web_archives:' in content:
            matching.append(path)

    return sorted(matching)
|
|
|
|
|
|
# ============================================================================
|
|
# MAIN
|
|
# ============================================================================
|
|
|
|
def main():
    """Command-line entry point.

    Parses arguments, loads the pattern file, selects custodian files, runs
    extraction on each, and prints a summary. Returns a process exit code
    (0 = success, 1 = setup error).
    """
    parser = argparse.ArgumentParser(
        description='Extract typed entities from web archives using annotated patterns'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=None,
        help='Limit number of files to process'
    )
    parser.add_argument(
        '--custodian',
        type=str,
        default=None,
        help='Process only a specific custodian GHCID (e.g., NL-DR-ASS-A-DA)'
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian'),
        help='Directory containing custodian YAML files'
    )
    parser.add_argument(
        '--pattern-file',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/entity_annotation/modules/processing/dutch_web_patterns.yaml'),
        help='Path to pattern definition file'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed output'
    )
    parser.add_argument(
        '--show-entities',
        action='store_true',
        help='Show each extracted entity as it is found'
    )
    parser.add_argument(
        '--show-unmatched',
        type=int,
        default=0,
        metavar='N',
        help='Show N sample unmatched text segments (for pattern development)'
    )
    parser.add_argument(
        '--min-length',
        type=int,
        default=10,
        help='Minimum text segment length to analyze (default: 10)'
    )

    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    # Archive directories in custodian files are resolved relative to the
    # custodian directory itself.
    base_path = custodian_dir

    # Load patterns
    print(f"Loading patterns from {args.pattern_file}...")
    try:
        matcher = PatternMatcher(args.pattern_file)
        print(f" Loaded {len(matcher.discard_patterns)} discard patterns")
        print(f" Loaded {len(matcher.entity_patterns)} entity patterns")
    except Exception as e:
        print(f"Error loading patterns: {e}")
        return 1

    # Find custodian files
    if args.custodian:
        # Process specific custodian
        specific_file = custodian_dir / f"{args.custodian}.yaml"
        if not specific_file.exists():
            print(f"Error: Custodian file not found: {specific_file}")
            return 1
        files = [specific_file]
        print(f"Processing specific custodian: {args.custodian}")
    else:
        print(f"Scanning for custodian files with web archives in {custodian_dir}...")
        files = find_custodian_files_with_web_archives(custodian_dir)
        print(f"Found {len(files)} custodian files with web_archives")

    if args.limit:
        files = files[:args.limit]
        print(f"Limited to {args.limit} files")

    if args.dry_run:
        print("\n*** DRY RUN - No changes will be made ***\n")

    # Process statistics (aggregated across all custodian files)
    total_processed = 0
    total_updated = 0
    total_entities = 0
    total_html_files = 0
    total_segments = 0
    total_discarded = 0
    all_unmatched = []

    for filepath in files:
        stats = process_custodian_file(
            filepath, base_path, matcher,
            dry_run=args.dry_run,
            verbose=args.verbose,
            show_entities=args.show_entities,
            show_unmatched=args.show_unmatched,
            min_length=args.min_length
        )
        total_processed += 1

        # Collect unmatched samples
        if 'unmatched_samples' in stats:
            all_unmatched.extend(stats['unmatched_samples'])

        if stats['status'] in ('updated', 'would_update'):
            total_updated += 1
            total_entities += stats['entities_extracted']
            total_html_files += stats['html_files_processed']
            total_segments += stats['text_segments_analyzed']
            total_discarded += stats['segments_discarded']

            if args.verbose or stats['entities_extracted'] > 0:
                msg = f"✓ {stats['file']}: {stats['entities_extracted']} entities"
                msg += f" ({stats['html_files_processed']} HTML files, {stats['segments_discarded']} discarded)"
                print(msg)

        elif stats['status'] == 'no_entities_found':
            # Still counts toward HTML/segment totals even though nothing matched.
            total_html_files += stats['html_files_processed']
            total_segments += stats['text_segments_analyzed']
            total_discarded += stats['segments_discarded']
            if args.verbose:
                print(f"○ {stats['file']}: no entities found ({stats['html_files_processed']} HTML files)")

        elif args.verbose:
            if stats['status'] == 'error':
                print(f"✗ {stats['file']}: {stats['error']}")
            elif stats['status'] == 'no_web_archives':
                print(f"○ {stats['file']}: no web_archives section")

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed: {total_processed}")
    print(f"Files with entities: {total_updated}")
    print(f"Total HTML files: {total_html_files}")
    print(f"Text segments analyzed: {total_segments}")
    print(f"Segments discarded: {total_discarded}")
    print(f"Total entities found: {total_entities}")

    # Show unmatched samples if requested
    if args.show_unmatched > 0 and all_unmatched:
        print("\n" + "-" * 60)
        print(f"UNMATCHED SAMPLES (showing up to {args.show_unmatched}):")
        print("-" * 60)
        for i, sample in enumerate(all_unmatched[:args.show_unmatched], 1):
            # Truncate long samples to keep the report readable.
            print(f" {i}. {sample[:80]}{'...' if len(sample) > 80 else ''}")

    if args.dry_run:
        print("\n*** DRY RUN - No changes were made ***")

    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # sys.exit() is the documented way to terminate with a status code; the
    # bare exit() builtin is injected by the site module and is not guaranteed
    # to exist (e.g. when running with `python -S`).
    import sys

    sys.exit(main())
|