# glam/scripts/detect_youtube_misattributions.py

#!/usr/bin/env python3
"""
Detect potential YouTube channel misattributions across all enriched entries.

This script scans all entries with youtube_enrichment and checks if the
YouTube channel name matches the institution's custodian_name using fuzzy matching.

A mismatch indicates the YouTube channel may belong to a third party (embedded video)
rather than being the institution's official channel.

Usage:
    python scripts/detect_youtube_misattributions.py
    python scripts/detect_youtube_misattributions.py --threshold 50  # Lower threshold = fewer entries flagged (only similarity < threshold is reported)
    python scripts/detect_youtube_misattributions.py --fix  # Reserved: automatic fixing is not implemented yet, only a notice is printed
"""

import argparse
import re
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional

import yaml
# rapidfuzz is an optional dependency. HAVE_RAPIDFUZZ records whether the
# import succeeded so that name_similarity() can choose between
# fuzz.token_set_ratio and its built-in substring / word-overlap scoring.
try:
    from rapidfuzz import fuzz
    HAVE_RAPIDFUZZ = True
except ImportError:
    HAVE_RAPIDFUZZ = False
    # Printed at import time so the user knows scores come from the fallback.
    print("Warning: rapidfuzz not available, using basic string matching")


def normalize_name(name: str) -> str:
    """Normalize a name for fuzzy comparison.

    Steps, applied in this order:

    - Lowercase
    - Remove accents/diacritics (NFD decomposition, then drop combining marks)
    - Remove legal-form terms and common stopwords (Dutch/German/English)
    - Remove punctuation
    - Collapse whitespace

    Args:
        name: Raw name. Falsy values (``None``, ``""``) are accepted.

    Returns:
        The normalized name, or ``""`` when ``name`` is falsy or nothing
        is left after normalization.
    """
    if not name:
        return ""

    # Lowercase
    name = name.lower()

    # Remove accents/diacritics: decompose, then drop combining marks (Mn).
    name = unicodedata.normalize('NFD', name)
    name = ''.join(c for c in name if unicodedata.category(c) != 'Mn')

    # Legal forms plus articles/conjunctions that carry no identifying value.
    removable_terms = [
        'stichting', 'vereniging', 'foundation', 'verein',
        'e.v.', 'ev', 'gmbh', 'bv', 'b.v.', 'nv', 'n.v.',
        'und', 'and', 'en', 'the', 'de', 'het', 'der', 'die', 'das'
    ]
    for term in removable_terms:
        # Lookarounds instead of \b...\b: a trailing \b after a literal "."
        # only matches when a word character follows, so dotted forms such
        # as "e.v." or "b.v." at the end of a name (or before a space) were
        # never removed and survived as "ev"/"bv" after punctuation
        # stripping. For terms made only of word characters the lookarounds
        # are equivalent to \b.
        name = re.sub(rf'(?<!\w){re.escape(term)}(?!\w)', '', name)

    # Remove punctuation (deleted, not replaced by a space).
    name = re.sub(r'[^\w\s]', '', name)

    # Collapse whitespace; split()/join also trims both ends.
    return ' '.join(name.split())


def name_similarity(name1: str, name2: str) -> float:
    """Score how alike two names are on a 0-100 scale.

    Both names are normalized first. If either normalizes to an empty
    string the score is 0. With rapidfuzz available the score is its
    ``token_set_ratio``; otherwise a built-in fallback is used: 80 when one
    normalized name contains the other, else the percentage of shared words
    over all distinct words.
    """
    left = normalize_name(name1)
    right = normalize_name(name2)

    if not (left and right):
        return 0.0

    if HAVE_RAPIDFUZZ:
        # token_set_ratio copes with reordered words.
        return fuzz.token_set_ratio(left, right)

    # Fallback 1: plain containment in either direction.
    if left in right or right in left:
        return 80.0

    # Fallback 2: word-set overlap (intersection over union).
    left_words = set(left.split())
    right_words = set(right.split())
    if not (left_words and right_words):
        return 0.0

    shared = left_words & right_words
    combined = left_words | right_words
    return (len(shared) / len(combined)) * 100


def get_custodian_name(entry: dict) -> Optional[str]:
    """Extract the custodian name from an entry.

    Lookup order:

    1. ``custodian_name``: for a mapping, ``claim_value`` and then
       ``name``; any other non-empty value is converted with ``str``.
    2. ``original_entry.organisatie``.

    A null, empty, or incomplete ``custodian_name`` falls through to step 2
    instead of producing the literal string ``"None"`` or giving up early.

    Args:
        entry: Parsed entry. Non-mapping values yield ``None``.

    Returns:
        The custodian name, or ``None`` when no usable name is present.
    """
    if not isinstance(entry, dict):
        return None

    # Try custodian_name field first
    custodian = entry.get('custodian_name')
    if isinstance(custodian, dict):
        custodian = custodian.get('claim_value') or custodian.get('name')
    # Only null/empty counts as "missing"; other scalars are stringified.
    if custodian is not None and custodian != '':
        return str(custodian)

    # Try original_entry.organisatie; original_entry may be null in the YAML.
    original = entry.get('original_entry')
    if isinstance(original, dict):
        organisation = original.get('organisatie')
        if organisation:
            return str(organisation)

    return None


def get_youtube_channel_title(entry: dict) -> Optional[str]:
    """Extract the YouTube channel title from enrichment data.

    Reads ``entry['youtube_enrichment']['channel']['title']``. Every level
    is checked to be a mapping, so a null or malformed enrichment yields
    ``None`` instead of raising ``TypeError``.

    Args:
        entry: Parsed entry.

    Returns:
        The channel title, or ``None`` when any level is missing or is not
        a mapping.
    """
    if not isinstance(entry, dict):
        return None

    enrichment = entry.get('youtube_enrichment')
    if not isinstance(enrichment, dict):
        return None

    channel = enrichment.get('channel')
    if not isinstance(channel, dict):
        return None

    return channel.get('title')


def analyze_entry(entry_path: Path, threshold: float = 60.0) -> Optional[dict]:
    """Analyze an entry for potential YouTube misattribution.

    The entry file is parsed with ``yaml.safe_load``. An entry is reported
    when it has a ``youtube_enrichment``, has not already been flagged under
    ``misattributed_enrichments`` with ``enrichment_type == 'youtube'``, and
    the similarity between its custodian name and channel title is strictly
    below ``threshold``.

    Args:
        entry_path: Path of the YAML entry file.
        threshold: Similarity (0-100) below which the entry is reported.

    Returns:
        Analysis dict if potential misattribution detected, None otherwise
        (including when either name is missing).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    # An empty document loads as None and a top-level list/scalar is not an
    # entry; neither can be analyzed.
    if not isinstance(entry, dict):
        return None

    # Skip entries without YouTube enrichment
    if 'youtube_enrichment' not in entry:
        return None

    # Skip already-flagged entries. The key may be present but null, or hold
    # non-mapping items; those must not abort the analysis of this entry.
    flagged = entry.get('misattributed_enrichments')
    if isinstance(flagged, list) and any(
        isinstance(item, dict) and item.get('enrichment_type') == 'youtube'
        for item in flagged
    ):
        return None  # Already handled

    custodian_name = get_custodian_name(entry)
    channel_title = get_youtube_channel_title(entry)

    if not custodian_name or not channel_title:
        return None

    similarity = name_similarity(custodian_name, channel_title)

    if similarity < threshold:
        youtube = entry['youtube_enrichment']
        channel = youtube.get('channel', {})
        return {
            'entry_path': str(entry_path),
            'entry_index': entry.get('entry_index'),
            'custodian_name': custodian_name,
            'channel_title': channel_title,
            'similarity': round(similarity, 1),
            'channel_url': youtube.get('source_url'),
            'subscriber_count': channel.get('subscriber_count'),
            'video_count': channel.get('video_count'),
        }

    return None


def main():
    """Scan the entries directory and print a misattribution report.

    Command-line options: ``--threshold`` (similarity cut-off, default 60),
    ``--fix`` (accepted, but automatic fixing is not implemented yet) and
    ``--entries-dir`` (directory holding the ``*.yaml`` entry files).

    Returns:
        Process exit code: 1 if the entries directory does not exist,
        otherwise 0.
    """
    parser = argparse.ArgumentParser(description='Detect YouTube channel misattributions')
    parser.add_argument('--threshold', type=float, default=60.0,
                        help='Similarity threshold below which to flag (0-100, default: 60)')
    parser.add_argument('--fix', action='store_true',
                        help='Apply fixes automatically (move to misattributed_enrichments)')
    # NOTE: the default is an absolute, machine-specific path; pass
    # --entries-dir when running from any other checkout.
    parser.add_argument('--entries-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/nde/enriched/entries'),
                        help='Directory containing entry files')
    args = parser.parse_args()

    entries_dir = args.entries_dir
    if not entries_dir.exists():
        print(f"Error: Entries directory not found: {entries_dir}")
        return 1

    # Find all entry files
    entry_files = sorted(entries_dir.glob('*.yaml'))
    print(f"Scanning {len(entry_files)} entries for YouTube misattributions...")
    print(f"Similarity threshold: {args.threshold}%")
    print()

    potential_misattributions = []
    entries_with_youtube = 0

    for entry_path in entry_files:
        # Per-file boundary: one unreadable or malformed entry is reported
        # and skipped so the rest of the scan still runs.
        try:
            # Parsed here only to count entries that carry a
            # youtube_enrichment; analyze_entry() reads the file again.
            with open(entry_path, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            if entry and 'youtube_enrichment' in entry:
                entries_with_youtube += 1

            result = analyze_entry(entry_path, args.threshold)
            if result:
                potential_misattributions.append(result)
        except Exception as e:
            print(f"Error processing {entry_path}: {e}")

    print(f"Entries with YouTube enrichment: {entries_with_youtube}")
    print(f"Potential misattributions detected: {len(potential_misattributions)}")
    print()

    if potential_misattributions:
        print("=" * 100)
        print("POTENTIAL MISATTRIBUTIONS")
        print("=" * 100)

        for i, m in enumerate(potential_misattributions, 1):
            # entry_index comes from entry.get('entry_index') and may be
            # missing or non-numeric; the ":04d" format only accepts
            # integers and would otherwise abort the whole report here.
            entry_index = m['entry_index']
            if isinstance(entry_index, int):
                index_label = f"{entry_index:04d}"
            elif entry_index is None:
                index_label = "n/a"
            else:
                index_label = str(entry_index)

            print(f"\n{i}. Entry {index_label}")
            print(f"   File: {Path(m['entry_path']).name}")
            print(f"   Custodian Name: {m['custodian_name']}")
            print(f"   Channel Title:  {m['channel_title']}")
            print(f"   Similarity:     {m['similarity']}% (threshold: {args.threshold}%)")
            print(f"   Channel URL:    {m['channel_url']}")
            print(f"   Subscribers:    {m['subscriber_count']}, Videos: {m['video_count']}")

        print()
        print("=" * 100)

        if args.fix:
            print("\nApplying fixes...")
            # TODO: Implement automatic fix (similar to fix_youtube_misattribution.py)
            print("Note: Automatic fix not yet implemented. Please review and fix manually.")
        else:
            print("\nTo apply fixes automatically, re-run with --fix flag")
            print("Or review each case manually using scripts/fix_youtube_misattribution.py")
    else:
        print("No potential misattributions detected!")

    return 0


if __name__ == '__main__':
    # The builtin ``exit`` is installed by the ``site`` module for interactive
    # use and is absent when Python runs without it (e.g. ``python -S``).
    # Raising SystemExit directly always works and passes main()'s return
    # value through as the process exit status.
    raise SystemExit(main())