glam/scripts/detect_youtube_misattributions_fast.py
2025-12-03 17:38:46 +01:00

131 lines
4.4 KiB
Python

#!/usr/bin/env python3
"""
Fast detection of potential YouTube channel misattributions.
Scans only entries that have youtube_enrichment and checks name similarity.
"""
import re
import unicodedata
from pathlib import Path
import yaml
def normalize_name(name: str) -> str:
    """Normalize an organisation/channel name for fuzzy comparison.

    Lowercases, strips diacritics (NFD decomposition + removal of
    combining marks), removes punctuation, and drops common legal-form
    and stop words (Dutch, German, English) so only distinctive words
    remain.

    Args:
        name: Raw name; may be empty or falsy.

    Returns:
        Single-space-joined remaining words, or '' for empty input.
    """
    if not name:
        return ""
    name = name.lower()
    # Strip diacritics: decompose, then drop combining marks (category 'Mn').
    name = unicodedata.normalize('NFD', name)
    name = ''.join(c for c in name if unicodedata.category(c) != 'Mn')
    # Remove punctuation BEFORE stop-term removal so dotted abbreviations
    # such as "b.v." / "e.v." / "n.v." collapse to "bv"/"ev"/"nv" and are
    # caught by the list below. (The previous order never stripped them:
    # \b after an escaped '.' does not match before a space or end-of-string,
    # and by the time punctuation was removed, term removal had already run.)
    name = re.sub(r'[^\w\s]', '', name)
    # Common legal forms and stop words in Dutch, German and English.
    for term in ['stichting', 'vereniging', 'foundation', 'verein', 'museum',
                 'archief', 'archive', 'bibliotheek', 'library',
                 'ev', 'gmbh', 'bv', 'nv',
                 'und', 'and', 'en', 'the', 'de', 'het', 'der', 'die', 'das', 'van']:
        name = re.sub(rf'\b{re.escape(term)}\b', '', name)
    return ' '.join(name.split()).strip()
def word_overlap_score(name1: str, name2: str) -> float:
    """Return the word overlap between two names as a percentage (0-100).

    Both names are normalized first; the score is the overlap coefficient
    |A ∩ B| / min(|A|, |B|) scaled to 0-100. Dividing by the SHORTER
    name's word count keeps e.g. "Museum X" vs "Museum X Official Channel"
    at 100.

    Args:
        name1: First name (e.g. custodian name).
        name2: Second name (e.g. YouTube channel title).

    Returns:
        Overlap percentage; 0.0 when either name normalizes to nothing.
    """
    words1 = set(normalize_name(name1).split())
    words2 = set(normalize_name(name2).split())
    if not words1 or not words2:
        return 0.0
    # Both sets are non-empty here, so min(...) >= 1 and the division is
    # safe (the previous extra min_len == 0 check was unreachable).
    return len(words1 & words2) / min(len(words1), len(words2)) * 100
def main():
    """Scan enriched entry YAML files and report YouTube enrichments whose
    channel title has low word overlap with the custodian name.

    Prints a summary followed by one block per suspected misattribution,
    sorted by ascending overlap score. Per-file errors are reported and
    skipped so one bad file cannot abort the whole scan.
    """
    entries_dir = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
    mismatches = []
    processed = 0
    for entry_path in entries_dir.glob('*.yaml'):
        try:
            with open(entry_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # Cheap textual pre-filter before paying for a YAML parse.
            if 'youtube_enrichment:' not in content:
                continue
            # Skip entries already flagged ('youtube' appearing shortly
            # after the misattributed_enrichments key).
            if ('misattributed_enrichments:' in content
                    and 'youtube' in content.split('misattributed_enrichments:')[1][:500]):
                continue  # Already flagged
            entry = yaml.safe_load(content)
            if not entry or 'youtube_enrichment' not in entry:
                continue
            processed += 1
            # Resolve custodian name: structured claim first, then the
            # original entry's 'organisatie' field. A present-but-None
            # custodian_name no longer turns into the literal "None".
            custodian_name = None
            cn = entry.get('custodian_name')
            if isinstance(cn, dict):
                custodian_name = cn.get('claim_value') or cn.get('name')
            elif cn is not None:
                custodian_name = str(cn)
            if not custodian_name and 'original_entry' in entry:
                custodian_name = entry['original_entry'].get('organisatie')
            # YouTube channel title, guarded against a non-dict channel.
            yt = entry['youtube_enrichment']
            channel = yt.get('channel')
            channel_title = channel.get('title') if isinstance(channel, dict) else None
            if not custodian_name or not channel_title:
                continue
            score = word_overlap_score(custodian_name, channel_title)
            if score < 30:  # Low overlap threshold
                mismatches.append({
                    'file': entry_path.name,
                    'index': entry.get('entry_index'),
                    'custodian': custodian_name,
                    'channel': channel_title,
                    'score': round(score, 1),
                    'channel_url': yt.get('source_url', yt.get('channel', {}).get('channel_url')),
                })
        except Exception as e:
            # Best-effort scan: report and move on to the next file.
            print(f"Error: {entry_path.name}: {e}")
    print(f"Scanned {processed} entries with YouTube enrichment")
    print(f"Found {len(mismatches)} potential misattributions (score < 30%)")
    print()
    if mismatches:
        print("=" * 100)
        for m in sorted(mismatches, key=lambda x: x['score']):
            # entry_index may be absent; None would crash the :04d format.
            index = f"{m['index']:04d}" if m['index'] is not None else '????'
            print(f"\nEntry {index} ({m['file']})")
            print(f" Custodian: {m['custodian']}")
            print(f" Channel: {m['channel']}")
            print(f" Overlap: {m['score']}%")
            print(f" URL: {m['channel_url']}")


if __name__ == '__main__':
    main()