707 lines
26 KiB
Python
707 lines
26 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Derive CustodianName by finding consensus across all enrichment sources.
|
|
|
|
APPROACH: Find the name that appears most consistently across sources.
|
|
Instead of a fixed priority, we compare all available names and pick
|
|
the one with the highest agreement (fuzzy matching).
|
|
|
|
Sources checked:
|
|
- wikidata_enrichment.wikidata_label_nl / wikidata_label_en
|
|
- google_maps_enrichment.name
|
|
- isil_enrichment.name / nan_isil_enrichment.nan_name
|
|
- original_entry.organisatie
|
|
- museum_register (if present)
|
|
- youtube_enrichment (if present)
|
|
- web_claims org_name (og:site_name, schema.org, h1, title)
|
|
|
|
The consensus approach automatically handles:
|
|
- Wrong Google Maps POIs (parking lots won't match other sources)
|
|
- Garbage web claims (exhibition titles won't match Wikidata)
|
|
- Outdated CSV names (if most sources agree on new name)
|
|
|
|
Usage:
|
|
python scripts/derive_custodian_name_v2.py [--limit N] [--entry ENTRY_NUM] [--dry-run] [--force]
|
|
"""
|
|
|
|
import argparse
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import yaml
|
|
|
|
try:
|
|
from rapidfuzz import fuzz
|
|
RAPIDFUZZ_AVAILABLE = True
|
|
except ImportError:
|
|
RAPIDFUZZ_AVAILABLE = False
|
|
|
|
|
|
# Directories
|
|
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
|
|
|
|
|
|
# Source weights for tie-breaking (not primary selection)
|
|
SOURCE_WEIGHTS = {
|
|
'wikidata': 1.0,
|
|
'google_maps': 0.9,
|
|
'isil': 0.85,
|
|
'original_entry': 0.8,
|
|
'museum_register': 0.75,
|
|
'youtube': 0.7,
|
|
'web_og_site_name': 0.6,
|
|
'web_schema_org': 0.55,
|
|
'web_h1_tag': 0.4,
|
|
'web_title_tag': 0.35,
|
|
}
|
|
|
|
|
|
# Patterns that indicate invalid/garbage names
|
|
INVALID_NAME_PATTERNS = [
|
|
# Navigation/UI elements
|
|
r'^(home|welkom|welcome|menu|nav|header|footer|sidebar)$',
|
|
r'^(contact|over ons|about|info|informatie)$',
|
|
r'^(nieuws|news|agenda|calendar|events?|activiteiten)$',
|
|
r'^(zoeken?|search|filter|sort|browse|bladeren)$',
|
|
r'^zoeken in', r'^doorzoek\s', r'^bekijk\s', r'^ontdek\s',
|
|
|
|
# Cookie/privacy/legal
|
|
r'cookie', r'privacy', r'gdpr', r'consent', r'waarom gebruiken wij',
|
|
|
|
# Generic page elements
|
|
r'^(default|untitled|index|main|pagina|page)\s*\d*$',
|
|
r'^(foto|image|picture|afbeelding)\s*\d+$',
|
|
r'^(oproep|call|melding|bericht|scroll)$',
|
|
r'^(openingstijden|tickets|reserveer|plan je bezoek)$',
|
|
r'^(main menu|hoofdmenu)$',
|
|
|
|
# Exhibition/event titles
|
|
r'tentoonstelling', r'expositie', r'exhibition', r'verlengd',
|
|
r'^nu te zien', r'^te zien:',
|
|
|
|
# Taglines/slogans
|
|
r'^op het kruispunt van', r'^het verhaal van\s', r'^de geschiedenis van\s',
|
|
r'^beleef je\s', r'^ontdek ook\s', r'^welkom bij\s',
|
|
r'^over het museum$', r'^over de\s', r'^over ons$',
|
|
r'binnen handbereik$', r'met een glimlach$',
|
|
|
|
# Newsletter/marketing
|
|
r'nieuwsbrief', r'newsletter', r'^schrijf je in', r'^sign up',
|
|
|
|
# Wrong websites
|
|
r'webdesign', r'libraries\.org', r'NLmapNew\.com', r'fotobeeldbank',
|
|
|
|
# Wrong POIs from Google Maps
|
|
r'^parkeerplaats$', r'^parking$', r'^bushalte$', r'^tramhalte$',
|
|
|
|
# Generic/ambiguous
|
|
r'^homepage\s', r'^homepagina\s', r'^chat$', r'^help$',
|
|
r'onder constructie', r"web server's default page",
|
|
]
|
|
|
|
|
|
# Patterns to extract actual institution name from greeting/wrapper text
|
|
# These patterns capture the institution name from common website title formats
|
|
# NOTE: Order matters! More specific patterns MUST come before more general ones.
|
|
NAME_EXTRACTION_PATTERNS = [
|
|
# "Welkom op de website van [het] [NAME]" - most specific
|
|
(r'^welkom\s+op\s+de\s+(?:website|site|pagina)\s+van\s+(?:het\s+|de\s+)?(.+?)(?:\s*[-–—|]\s*.*)?$', 1),
|
|
# "Welkom bij [het] [NAME]" - captures NAME
|
|
(r'^welkom\s+bij\s+(?:het\s+|de\s+)?(.+?)(?:\s*[-–—|]\s*.*)?$', 1),
|
|
# "Welkom in [het] [NAME]" - captures NAME
|
|
(r'^welkom\s+in\s+(?:het\s+|de\s+)?(.+?)(?:\s*[-–—|]\s*.*)?$', 1),
|
|
# "Welkom [het] [NAME]" - just "Welkom" followed by name (fallback)
|
|
(r'^welkom\s+(?:het\s+|de\s+)?(.+?)(?:\s*[-–—|]\s*.*)?$', 1),
|
|
# "[NAME] - Welkom" or "[NAME] | Home" etc.
|
|
(r'^(.+?)\s*[-–—|]\s*(?:welkom|home|homepage|start).*$', 1),
|
|
# "[NAME] | Official Website" etc.
|
|
(r'^(.+?)\s*[-–—|]\s*(?:official\s+)?(?:website|site).*$', 1),
|
|
]
|
|
|
|
|
|
# Dutch legal form prefixes that should be stripped for emic names
|
|
# These are formal legal designations, NOT part of the public-facing name
|
|
#
|
|
# NOTE: "Vereniging" is NOT in this list! It describes organizational purpose
|
|
# (a voluntary association of members), not just legal registration.
|
|
# "Historische Vereniging Nijeveen" is fundamentally different from
|
|
# "Stichting Rijksmuseum" - the former's identity IS being a vereniging.
|
|
# See AGENTS.md Rule 8 for full rationale.
|
|
DUTCH_LEGAL_PREFIXES = [
|
|
r'^stichting\s+', # Foundation (legal entity type)
|
|
r'^coöperatie\s+', # Cooperative
|
|
r'^coöperatieve\s+',
|
|
r'^naamloze\s+vennootschap\s+', # Public company (NV)
|
|
r'^besloten\s+vennootschap\s+', # Private company (BV)
|
|
r'^commanditaire\s+vennootschap\s+', # Limited partnership
|
|
r'^vennootschap\s+onder\s+firma\s+', # General partnership
|
|
r'^maatschap\s+', # Partnership
|
|
r'^eenmanszaak\s+', # Sole proprietorship
|
|
]
|
|
|
|
# Suffixes that indicate legal form
|
|
DUTCH_LEGAL_SUFFIXES = [
|
|
r'\s+b\.?v\.?\s*$', # B.V.
|
|
r'\s+n\.?v\.?\s*$', # N.V.
|
|
r'\s+v\.?o\.?f\.?\s*$', # V.O.F.
|
|
r'\s+c\.?v\.?\s*$', # C.V.
|
|
]
|
|
|
|
|
|
def normalize_name(name: str) -> str:
|
|
"""Normalize name for comparison."""
|
|
if not name:
|
|
return ""
|
|
return ' '.join(name.lower().split())
|
|
|
|
|
|
def extract_name_from_greeting(name: str) -> str:
|
|
"""
|
|
Extract the actual institution name from greeting/wrapper text.
|
|
|
|
Examples:
|
|
"Welkom op de website van het Zeister Historisch Genootschap (ZHG)"
|
|
-> "Zeister Historisch Genootschap (ZHG)"
|
|
"Welkom bij Oudheidkamer Texel"
|
|
-> "Oudheidkamer Texel"
|
|
"Rijksmuseum | Home"
|
|
-> "Rijksmuseum"
|
|
"""
|
|
if not name:
|
|
return ""
|
|
|
|
name = name.strip()
|
|
name_lower = name.lower()
|
|
|
|
# Try each extraction pattern
|
|
for pattern, group_idx in NAME_EXTRACTION_PATTERNS:
|
|
match = re.match(pattern, name_lower, re.IGNORECASE)
|
|
if match:
|
|
extracted = match.group(group_idx).strip()
|
|
# Preserve original case by finding the extracted part in original
|
|
start_pos = name_lower.find(extracted.lower())
|
|
if start_pos >= 0:
|
|
extracted = name[start_pos:start_pos + len(extracted)]
|
|
return extracted.strip(' -–—|:.')
|
|
|
|
return name
|
|
|
|
|
|
def extract_emic_name(name: str) -> str:
|
|
"""
|
|
Extract the emic (public-facing) name, stripping legal form prefixes/suffixes.
|
|
|
|
Per CustodianName.yaml:
|
|
- CustodianName = How custodian presents itself (emic, operational)
|
|
- LegalName = Formal registered name (in CustodianLegalStatus)
|
|
- Example: "Rijksmuseum" (emic) vs "Stichting Rijksmuseum" (legal)
|
|
|
|
Examples:
|
|
"Stichting Het Geld- En Bankmuseum" -> "Geldmuseum" (if Geldmuseum is the emic name)
|
|
"Stichting Rijksmuseum" -> "Rijksmuseum"
|
|
"Vereniging Oud-Utrecht" -> "Oud-Utrecht"
|
|
"Museum Boijmans Van Beuningen B.V." -> "Museum Boijmans Van Beuningen"
|
|
"""
|
|
if not name:
|
|
return ""
|
|
|
|
result = name.strip()
|
|
|
|
# Strip legal prefixes (case-insensitive)
|
|
for pattern in DUTCH_LEGAL_PREFIXES:
|
|
result = re.sub(pattern, '', result, flags=re.IGNORECASE).strip()
|
|
|
|
# Strip legal suffixes
|
|
for pattern in DUTCH_LEGAL_SUFFIXES:
|
|
result = re.sub(pattern, '', result, flags=re.IGNORECASE).strip()
|
|
|
|
# Clean up any double spaces or leading/trailing punctuation
|
|
result = ' '.join(result.split())
|
|
result = result.strip(' -–—|:.')
|
|
|
|
return result if result else name
|
|
|
|
|
|
def get_legal_name(name: str) -> Optional[str]:
|
|
"""
|
|
Check if the name contains a legal form indicator.
|
|
If so, return the full legal name; otherwise return None.
|
|
|
|
This is used to populate CustodianLegalStatus.legal_name when available.
|
|
|
|
NOTE: Also checks inside greeting text (e.g., "Welkom op de website van Vereniging X")
|
|
"""
|
|
if not name:
|
|
return None
|
|
|
|
# First extract from greeting if present
|
|
extracted = extract_name_from_greeting(name)
|
|
|
|
# Check both original and extracted
|
|
for check_name in [extracted, name]:
|
|
if not check_name:
|
|
continue
|
|
check_lower = check_name.lower()
|
|
|
|
# Check for legal prefixes
|
|
for pattern in DUTCH_LEGAL_PREFIXES:
|
|
if re.match(pattern, check_lower, re.IGNORECASE):
|
|
return check_name.strip()
|
|
|
|
# Check for legal suffixes
|
|
for pattern in DUTCH_LEGAL_SUFFIXES:
|
|
if re.search(pattern, check_lower, re.IGNORECASE):
|
|
return check_name.strip()
|
|
|
|
return None
|
|
|
|
|
|
def fuzzy_match_score(name1: str, name2: str) -> float:
|
|
"""Calculate fuzzy match score between two names (0-1)."""
|
|
if not name1 or not name2:
|
|
return 0.0
|
|
|
|
n1 = normalize_name(name1)
|
|
n2 = normalize_name(name2)
|
|
|
|
if n1 == n2:
|
|
return 1.0
|
|
|
|
if RAPIDFUZZ_AVAILABLE:
|
|
token_score = fuzz.token_set_ratio(n1, n2) / 100.0
|
|
partial_score = fuzz.partial_ratio(n1, n2) / 100.0
|
|
return max(token_score * 0.8 + partial_score * 0.2, token_score)
|
|
else:
|
|
if n1 in n2 or n2 in n1:
|
|
return min(len(n1), len(n2)) / max(len(n1), len(n2))
|
|
return 0.0
|
|
|
|
|
|
def is_obviously_invalid(name: str) -> bool:
|
|
"""Check if a name is obviously invalid."""
|
|
if not name or len(name.strip()) < 3:
|
|
return True
|
|
|
|
name_lower = name.lower().strip()
|
|
|
|
for pattern in INVALID_NAME_PATTERNS:
|
|
if re.search(pattern, name_lower, re.IGNORECASE):
|
|
return True
|
|
|
|
# Mostly numbers
|
|
if sum(1 for c in name if c.isdigit()) > len(name) * 0.5:
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def clean_name(name: str, extract_from_greeting: bool = True, to_emic: bool = False) -> str:
|
|
"""
|
|
Clean organization name.
|
|
|
|
Args:
|
|
name: Raw name string
|
|
extract_from_greeting: If True, extract name from "Welkom..." patterns
|
|
to_emic: If True, strip legal form prefixes to get emic name
|
|
"""
|
|
if not name:
|
|
return ""
|
|
name = ' '.join(name.split())
|
|
name = name.strip(' -–—|:.')
|
|
|
|
# Extract actual name from greeting text if present
|
|
if extract_from_greeting:
|
|
name = extract_name_from_greeting(name)
|
|
|
|
# Convert to emic name if requested
|
|
if to_emic:
|
|
name = extract_emic_name(name)
|
|
|
|
return name
|
|
|
|
|
|
def extract_all_names(entry_data: Dict) -> Tuple[List[Tuple[str, str, float]], Optional[str]]:
|
|
"""
|
|
Extract all candidate names from all enrichment sources.
|
|
|
|
Returns:
|
|
- List of (emic_name, source, weight) tuples for consensus matching
|
|
- Optional legal_name if a legal form was detected in any source
|
|
"""
|
|
candidates = []
|
|
legal_name = None
|
|
|
|
# Wikidata (usually has emic name, not legal name)
|
|
wikidata = entry_data.get('wikidata_enrichment', {})
|
|
for field in ['wikidata_label_nl', 'wikidata_label_en']:
|
|
if wikidata.get(field):
|
|
raw_name = wikidata[field]
|
|
# Extract greeting if present, convert to emic
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'wikidata', SOURCE_WEIGHTS['wikidata']))
|
|
# Check if raw name was a legal name
|
|
if not legal_name:
|
|
legal_name = get_legal_name(raw_name)
|
|
break # Only use one wikidata name
|
|
|
|
# Google Maps (usually has emic name)
|
|
google = entry_data.get('google_maps_enrichment', {})
|
|
if google.get('name'):
|
|
raw_name = google['name']
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'google_maps', SOURCE_WEIGHTS['google_maps']))
|
|
if not legal_name:
|
|
legal_name = get_legal_name(raw_name)
|
|
|
|
# ISIL registry (may have legal or emic name)
|
|
isil = entry_data.get('isil_enrichment', {})
|
|
if isil.get('name'):
|
|
raw_name = isil['name']
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'isil', SOURCE_WEIGHTS['isil']))
|
|
if not legal_name:
|
|
legal_name = get_legal_name(raw_name)
|
|
|
|
# NAN ISIL enrichment (authoritative source for legal names)
|
|
nan_isil = entry_data.get('nan_isil_enrichment', {})
|
|
if nan_isil.get('nan_name'):
|
|
raw_name = nan_isil['nan_name']
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'nan_isil', SOURCE_WEIGHTS.get('nan_isil', 0.85)))
|
|
if not legal_name:
|
|
legal_name = get_legal_name(raw_name)
|
|
|
|
# Original CSV entry (often has legal name with "Stichting" etc.)
|
|
original = entry_data.get('original_entry', {})
|
|
if original.get('organisatie'):
|
|
raw_name = original['organisatie']
|
|
# CSV often has legal names - extract emic version
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'original_entry', SOURCE_WEIGHTS['original_entry']))
|
|
# Original entry is good source for legal name
|
|
if not legal_name:
|
|
legal_name = get_legal_name(raw_name)
|
|
|
|
# Museum register (if present)
|
|
museum_reg = entry_data.get('museum_register_enrichment', {})
|
|
if museum_reg.get('name'):
|
|
raw_name = museum_reg['name']
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'museum_register', SOURCE_WEIGHTS['museum_register']))
|
|
if not legal_name:
|
|
legal_name = get_legal_name(raw_name)
|
|
|
|
# YouTube (if present)
|
|
youtube = entry_data.get('youtube_enrichment', {})
|
|
if youtube.get('channel_name'):
|
|
raw_name = youtube['channel_name']
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'youtube', SOURCE_WEIGHTS['youtube']))
|
|
|
|
# Web claims (title tags often have greetings, need extraction)
|
|
web_claims = entry_data.get('web_claims', {}).get('claims', [])
|
|
for claim in web_claims:
|
|
if claim.get('claim_type') == 'org_name':
|
|
raw_name = claim.get('claim_value', '')
|
|
# Web claims especially need greeting extraction
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
method = claim.get('extraction_method', '')
|
|
source_key = f'web_{method}'
|
|
weight = SOURCE_WEIGHTS.get(source_key, 0.3)
|
|
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, source_key, weight))
|
|
|
|
return candidates, legal_name
|
|
|
|
|
|
def find_consensus_name(candidates: List[Tuple[str, str, float]]) -> Tuple[Optional[str], str, float, Dict]:
|
|
"""
|
|
Find the name with highest consensus across sources.
|
|
|
|
For each candidate, calculate how well it matches all other candidates.
|
|
The name with highest total agreement wins.
|
|
|
|
Returns (best_name, best_source, confidence, match_details)
|
|
"""
|
|
if not candidates:
|
|
return None, 'none', 0.0, {}
|
|
|
|
if len(candidates) == 1:
|
|
name, source, weight = candidates[0]
|
|
return name, source, weight, {'single_source': True}
|
|
|
|
# Calculate agreement scores for each candidate
|
|
agreement_scores = []
|
|
|
|
for i, (name1, source1, weight1) in enumerate(candidates):
|
|
total_agreement = 0.0
|
|
matches = []
|
|
|
|
for j, (name2, source2, weight2) in enumerate(candidates):
|
|
if i == j:
|
|
continue
|
|
|
|
score = fuzzy_match_score(name1, name2)
|
|
# Weight the agreement by the source weight of the matching name
|
|
weighted_score = score * weight2
|
|
total_agreement += weighted_score
|
|
|
|
if score >= 0.6:
|
|
matches.append({
|
|
'source': source2,
|
|
'name': name2,
|
|
'score': score,
|
|
})
|
|
|
|
# Normalize by number of other sources
|
|
avg_agreement = total_agreement / (len(candidates) - 1) if len(candidates) > 1 else 0
|
|
|
|
# Boost by source weight
|
|
final_score = avg_agreement * 0.7 + weight1 * 0.3
|
|
|
|
agreement_scores.append({
|
|
'name': name1,
|
|
'source': source1,
|
|
'weight': weight1,
|
|
'avg_agreement': avg_agreement,
|
|
'final_score': final_score,
|
|
'matches': matches,
|
|
'match_count': len(matches),
|
|
})
|
|
|
|
# Sort by final score (highest first)
|
|
agreement_scores.sort(key=lambda x: (x['final_score'], x['match_count'], x['weight']), reverse=True)
|
|
|
|
best = agreement_scores[0]
|
|
|
|
# Calculate confidence based on agreement
|
|
confidence = best['final_score']
|
|
if best['match_count'] >= 2:
|
|
confidence = min(1.0, confidence + 0.1) # Boost for multiple matches
|
|
|
|
return best['name'], best['source'], confidence, {
|
|
'match_count': best['match_count'],
|
|
'matches': best['matches'],
|
|
'avg_agreement': best['avg_agreement'],
|
|
'all_candidates': [(c['name'], c['source'], c['final_score']) for c in agreement_scores],
|
|
}
|
|
|
|
|
|
def process_entry(filepath: Path, dry_run: bool = False) -> Dict[str, Any]:
|
|
"""
|
|
Process a single entry file to derive CustodianName by consensus.
|
|
"""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
if not data:
|
|
return {'status': 'error', 'message': 'Empty file', 'filepath': str(filepath)}
|
|
|
|
result = {
|
|
'filepath': str(filepath),
|
|
'filename': filepath.name,
|
|
'entry_index': data.get('entry_index', ''),
|
|
'status': 'ok',
|
|
'name': None,
|
|
'source': None,
|
|
'confidence': 0.0,
|
|
'match_count': 0,
|
|
'previous_name': None,
|
|
'previous_source': None,
|
|
}
|
|
|
|
# Get current custodian_name if exists
|
|
current = data.get('custodian_name', {})
|
|
if current.get('claim_value'):
|
|
result['previous_name'] = current.get('claim_value')
|
|
result['previous_source'] = current.get('source') or current.get('extraction_method', 'unknown')
|
|
|
|
# Extract all candidate names from all sources
|
|
candidates, legal_name = extract_all_names(data)
|
|
|
|
if not candidates:
|
|
result['status'] = 'no_source'
|
|
result['message'] = 'No valid names found in any source'
|
|
return result
|
|
|
|
# Store legal name in result for later use
|
|
result['legal_name'] = legal_name
|
|
|
|
# Find consensus name
|
|
best_name, best_source, confidence, details = find_consensus_name(candidates)
|
|
|
|
if not best_name:
|
|
result['status'] = 'no_consensus'
|
|
result['message'] = 'Could not find consensus among candidates'
|
|
return result
|
|
|
|
result['name'] = best_name
|
|
result['source'] = best_source
|
|
result['confidence'] = confidence
|
|
result['match_count'] = details.get('match_count', 0)
|
|
result['candidates'] = len(candidates)
|
|
|
|
# Build custodian_name record
|
|
custodian_name = {
|
|
'claim_type': 'custodian_name',
|
|
'claim_value': best_name,
|
|
'source': best_source,
|
|
'confidence': round(confidence, 3),
|
|
'consensus_method': True,
|
|
'sources_checked': len(candidates),
|
|
'sources_matched': details.get('match_count', 0) + 1, # +1 for self
|
|
'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
}
|
|
|
|
# Add match details
|
|
if details.get('matches'):
|
|
custodian_name['matching_sources'] = [
|
|
{'source': m['source'], 'name': m['name'], 'score': round(m['score'], 2)}
|
|
for m in details['matches']
|
|
]
|
|
|
|
# Track if changed
|
|
if result['previous_name'] and result['previous_name'] != best_name:
|
|
custodian_name['previous_value'] = result['previous_name']
|
|
custodian_name['previous_source'] = result['previous_source']
|
|
result['status'] = 'updated'
|
|
elif not result['previous_name']:
|
|
result['status'] = 'new'
|
|
|
|
# Write if not dry run
|
|
if not dry_run:
|
|
data['custodian_name'] = custodian_name
|
|
|
|
# Store legal_name separately if detected (for CustodianLegalStatus)
|
|
if legal_name:
|
|
data['custodian_legal_name'] = {
|
|
'claim_type': 'legal_name',
|
|
'claim_value': legal_name,
|
|
'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
'note': 'Legal form detected in source name (e.g., Stichting, B.V., N.V.)'
|
|
}
|
|
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
return result
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description='Derive CustodianName by consensus across sources')
|
|
parser.add_argument('--limit', type=int, default=None, help='Limit number of entries')
|
|
parser.add_argument('--entry', type=str, default=None, help='Process specific entry number')
|
|
parser.add_argument('--dry-run', action='store_true', help='Show what would be done without writing')
|
|
parser.add_argument('--force', action='store_true', help='Re-derive even if custodian_name exists')
|
|
parser.add_argument('--show-all', action='store_true', help='Show all entries, not just changes')
|
|
parser.add_argument('--verbose', action='store_true', help='Show candidate details')
|
|
args = parser.parse_args()
|
|
|
|
# Find entry files
|
|
if args.entry:
|
|
files = list(ENTRIES_DIR.glob(f'{args.entry}*.yaml'))
|
|
else:
|
|
files = sorted([f for f in ENTRIES_DIR.glob('*.yaml') if f.is_file() and not f.name.startswith('.')])
|
|
|
|
if args.limit:
|
|
files = files[:args.limit]
|
|
|
|
print(f"Processing {len(files)} entries...")
|
|
print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
|
|
print(f"Method: Consensus across all enrichment sources")
|
|
print()
|
|
|
|
# Track statistics
|
|
stats = defaultdict(int)
|
|
low_confidence = []
|
|
|
|
for filepath in files:
|
|
if filepath.is_dir():
|
|
continue
|
|
|
|
# Skip if already has custodian_name (unless --force)
|
|
if not args.force:
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
if data and data.get('custodian_name', {}).get('claim_value'):
|
|
stats['unchanged'] += 1
|
|
if args.show_all:
|
|
name = data['custodian_name']['claim_value']
|
|
source = data['custodian_name'].get('source', 'unknown')
|
|
print(f" = {filepath.name}: '{name}' [{source}]")
|
|
continue
|
|
|
|
result = process_entry(filepath, dry_run=args.dry_run)
|
|
|
|
# Update stats
|
|
if result['status'] == 'error':
|
|
stats['error'] += 1
|
|
print(f" ! {filepath.name}: ERROR - {result.get('message', 'Unknown')}")
|
|
elif result['status'] in ('no_source', 'no_consensus'):
|
|
stats['no_source'] += 1
|
|
print(f" - {filepath.name}: {result.get('message', 'No source')}")
|
|
else:
|
|
stats[result['source']] += 1
|
|
stats['total_derived'] += 1
|
|
|
|
# Track low confidence for review
|
|
if result['confidence'] < 0.5:
|
|
low_confidence.append(result)
|
|
|
|
if result['status'] == 'updated':
|
|
stats['updated'] += 1
|
|
match_info = f"[{result['match_count']+1}/{result['candidates']} sources]"
|
|
print(f" ~ {filepath.name}: '{result['previous_name']}' -> '{result['name']}' [{result['source']}] {match_info}")
|
|
elif result['status'] == 'new':
|
|
stats['new'] += 1
|
|
match_info = f"[{result['match_count']+1}/{result['candidates']} sources]"
|
|
print(f" + {filepath.name}: '{result['name']}' [{result['source']}] {match_info}")
|
|
elif args.show_all:
|
|
print(f" = {filepath.name}: '{result['name']}' [{result['source']}]")
|
|
|
|
# Summary
|
|
print()
|
|
print("=" * 70)
|
|
print(f"{'DRY RUN - ' if args.dry_run else ''}Summary:")
|
|
print()
|
|
print("Sources used:")
|
|
for source in ['wikidata', 'google_maps', 'isil', 'original_entry', 'museum_register',
|
|
'youtube', 'web_og_site_name', 'web_schema_org', 'web_h1_tag', 'web_title_tag']:
|
|
if stats[source] > 0:
|
|
print(f" {source:20s}: {stats[source]}")
|
|
print()
|
|
print(f" New names derived: {stats['new']}")
|
|
print(f" Names updated: {stats['updated']}")
|
|
print(f" Unchanged (skipped): {stats['unchanged']}")
|
|
print(f" No valid source: {stats['no_source']}")
|
|
print(f" Errors: {stats['error']}")
|
|
print()
|
|
print(f" TOTAL DERIVED: {stats['total_derived']}")
|
|
|
|
if low_confidence:
|
|
print()
|
|
print(f" Low confidence ({len(low_confidence)} entries) - may need review:")
|
|
for r in low_confidence[:10]:
|
|
print(f" {r['filename']}: '{r['name']}' (confidence: {r['confidence']:.2f})")
|
|
if len(low_confidence) > 10:
|
|
print(f" ... and {len(low_confidence) - 10} more")
|
|
|
|
print("=" * 70)
|
|
|
|
return 0
|
|
|
|
|
|
if __name__ == '__main__':
|
|
sys.exit(main())
|