#!/usr/bin/env python3
"""
Detect potential YouTube channel misattributions across all enriched entries.

This script scans all entries with youtube_enrichment and checks if the YouTube
channel name matches the institution's custodian_name using fuzzy matching.

A mismatch indicates the YouTube channel may belong to a third party (embedded
video) rather than being the institution's official channel.

An entry is flagged when its similarity score is *below* the threshold, so a
higher threshold flags more entries and a lower threshold flags fewer.

Usage:
    python scripts/detect_youtube_misattributions.py
    python scripts/detect_youtube_misattributions.py --threshold 50  # Flag fewer entries
    python scripts/detect_youtube_misattributions.py --fix  # Not implemented yet: prints a notice
"""

import argparse
import re
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional

try:
    import yaml
except ImportError:
    # Only reading entry files needs PyYAML; the name-matching helpers stay
    # importable without it. _load_entry() raises when it is actually needed.
    yaml = None

try:
    from rapidfuzz import fuzz
    HAVE_RAPIDFUZZ = True
except ImportError:
    HAVE_RAPIDFUZZ = False
    print("Warning: rapidfuzz not available, using basic string matching")


# Default location of the enriched entry files (overridable with --entries-dir).
DEFAULT_ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')

# Legal-form terms and stop words (Dutch/German/English) dropped before comparing.
_IGNORED_TERMS = (
    'stichting', 'vereniging', 'foundation', 'verein', 'e.v.', 'ev',
    'gmbh', 'bv', 'b.v.', 'nv', 'n.v.', 'und', 'and', 'en', 'the',
    'de', 'het', 'der', 'die', 'das',
)

# Lookarounds are used instead of \b: a term ending in "." (e.g. "e.v.") has no
# word boundary after it when followed by a space or the end of the string, so
# a \b-delimited pattern would never match it there.
_IGNORED_TERM_PATTERNS = tuple(
    re.compile(rf'(?<!\w){re.escape(term)}(?!\w)') for term in _IGNORED_TERMS
)

_PUNCTUATION = re.compile(r'[^\w\s]')


def normalize_name(name: str) -> str:
    """Normalize a name for comparison.

    Steps, in order: lowercase, strip accents/diacritics, remove stand-alone
    legal-form terms and stop words, remove punctuation, collapse whitespace.

    Args:
        name: Raw name; non-string values are converted with ``str()``.

    Returns:
        The normalized name, or "" when *name* is empty/None.
    """
    if not name:
        return ""

    text = str(name).lower()

    # Decompose accented characters and drop the combining marks.
    text = unicodedata.normalize('NFD', text)
    text = ''.join(c for c in text if unicodedata.category(c) != 'Mn')

    # Terms are removed before punctuation so dotted forms are still intact.
    for pattern in _IGNORED_TERM_PATTERNS:
        text = pattern.sub('', text)

    text = _PUNCTUATION.sub('', text)

    return ' '.join(text.split())


def name_similarity(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-100).

    Both names are normalized first. With rapidfuzz installed the score is
    ``fuzz.token_set_ratio``; otherwise a basic fallback is used: 80.0 when
    one normalized name contains the other, else the word-set overlap
    (intersection over union) as a percentage.

    Returns:
        0.0 when either name normalizes to an empty string.
    """
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)

    if not norm1 or not norm2:
        return 0.0

    if HAVE_RAPIDFUZZ:
        # token_set_ratio copes with reordered words.
        return fuzz.token_set_ratio(norm1, norm2)

    if norm1 in norm2 or norm2 in norm1:
        return 80.0

    # Both strings are non-empty here, so both word sets are non-empty too.
    words1 = set(norm1.split())
    words2 = set(norm2.split())
    return (len(words1 & words2) / len(words1 | words2)) * 100


def get_custodian_name(entry: dict) -> Optional[str]:
    """Extract the custodian name from an entry.

    Looks at ``custodian_name`` first (a plain value, or a dict holding
    ``claim_value`` / ``name``) and falls back to
    ``original_entry.organisatie`` when that yields nothing usable.

    Returns:
        The name as a string, or None when no name is available.
    """
    custodian = entry.get('custodian_name')
    if isinstance(custodian, dict):
        custodian = custodian.get('claim_value') or custodian.get('name')
    if custodian:
        return str(custodian)

    # An empty/null custodian_name must not block the fallback.
    original = entry.get('original_entry')
    if isinstance(original, dict):
        organisation = original.get('organisatie')
        if organisation:
            return str(organisation)

    return None


def get_youtube_channel_title(entry: dict) -> Optional[str]:
    """Extract the YouTube channel title from enrichment data.

    Returns:
        ``youtube_enrichment.channel.title``, or None when the enrichment or
        its ``channel`` mapping is missing or not a mapping.
    """
    enrichment = entry.get('youtube_enrichment')
    if not isinstance(enrichment, dict):
        return None

    channel = enrichment.get('channel')
    if not isinstance(channel, dict):
        return None

    return channel.get('title')


def analyze_entry_data(entry, entry_path, threshold: float = 60.0) -> Optional[dict]:
    """Analyze an already-loaded entry for potential YouTube misattribution.

    Args:
        entry: Parsed entry content (anything other than a dict is ignored).
        entry_path: Path of the entry file; only reported back in the result.
        threshold: Similarity (0-100) below which the entry is flagged.

    Returns:
        A dict describing the suspected misattribution, or None when the entry
        has no YouTube enrichment, is already flagged, lacks a custodian name
        or channel title, or scores at/above the threshold.
    """
    if not isinstance(entry, dict) or 'youtube_enrichment' not in entry:
        return None

    # Skip entries whose YouTube enrichment was already flagged.
    for flagged in entry.get('misattributed_enrichments') or []:
        if isinstance(flagged, dict) and flagged.get('enrichment_type') == 'youtube':
            return None

    custodian_name = get_custodian_name(entry)
    channel_title = get_youtube_channel_title(entry)
    if not custodian_name or not channel_title:
        return None

    similarity = name_similarity(custodian_name, channel_title)
    if similarity >= threshold:
        return None

    # A channel title exists, so the enrichment and its channel are dicts.
    enrichment = entry['youtube_enrichment']
    channel = enrichment['channel']
    return {
        'entry_path': str(entry_path),
        'entry_index': entry.get('entry_index'),
        'custodian_name': custodian_name,
        'channel_title': channel_title,
        'similarity': round(similarity, 1),
        'channel_url': enrichment.get('source_url'),
        'subscriber_count': channel.get('subscriber_count'),
        'video_count': channel.get('video_count'),
    }


def _load_entry(entry_path):
    """Parse one entry YAML file; raises ImportError if PyYAML is missing."""
    if yaml is None:
        raise ImportError("PyYAML is required to read entry files")
    with open(entry_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def analyze_entry(entry_path: Path, threshold: float = 60.0) -> Optional[dict]:
    """Analyze an entry file for potential YouTube misattribution.

    Loads the YAML file at *entry_path* and delegates to
    ``analyze_entry_data``.

    Returns analysis dict if potential misattribution detected, None otherwise.
    """
    return analyze_entry_data(_load_entry(entry_path), entry_path, threshold)


def _format_entry_index(index) -> str:
    """Render an entry index for the report without failing on odd values."""
    if isinstance(index, int):
        return f"{index:04d}"
    # Missing or non-integer indices cannot take the 04d format spec.
    return "unknown" if index is None else str(index)


def main(argv: Optional[list] = None) -> int:
    """Scan the entries directory and report suspected misattributions.

    Args:
        argv: Command-line arguments; defaults to ``sys.argv[1:]`` when None.

    Returns:
        Process exit code: 0 on success, 1 when the entries directory (or
        PyYAML) is unavailable.
    """
    parser = argparse.ArgumentParser(description='Detect YouTube channel misattributions')
    parser.add_argument('--threshold', type=float, default=60.0,
                        help='Similarity threshold below which to flag (0-100, default: 60)')
    parser.add_argument('--fix', action='store_true',
                        help='Apply fixes automatically (move to misattributed_enrichments)')
    parser.add_argument('--entries-dir', type=Path, default=DEFAULT_ENTRIES_DIR,
                        help='Directory containing entry files')
    args = parser.parse_args(argv)

    if yaml is None:
        print("Error: PyYAML is required to read entry files")
        return 1

    entries_dir = args.entries_dir
    if not entries_dir.exists():
        print(f"Error: Entries directory not found: {entries_dir}")
        return 1

    # Find all entry files
    entry_files = sorted(entries_dir.glob('*.yaml'))

    print(f"Scanning {len(entry_files)} entries for YouTube misattributions...")
    print(f"Similarity threshold: {args.threshold}%")
    print()

    potential_misattributions = []
    entries_with_youtube = 0

    for entry_path in entry_files:
        # Best-effort scan: one unreadable/malformed file must not stop the run.
        try:
            # Each file is parsed once and the parsed entry is reused.
            entry = _load_entry(entry_path)

            if isinstance(entry, dict) and 'youtube_enrichment' in entry:
                entries_with_youtube += 1

            result = analyze_entry_data(entry, entry_path, args.threshold)
            if result:
                potential_misattributions.append(result)
        except Exception as e:
            print(f"Error processing {entry_path}: {e}")

    print(f"Entries with YouTube enrichment: {entries_with_youtube}")
    print(f"Potential misattributions detected: {len(potential_misattributions)}")
    print()

    if not potential_misattributions:
        print("No potential misattributions detected!")
        return 0

    print("=" * 100)
    print("POTENTIAL MISATTRIBUTIONS")
    print("=" * 100)

    for i, m in enumerate(potential_misattributions, 1):
        print(f"\n{i}. Entry {_format_entry_index(m['entry_index'])}")
        print(f" File: {Path(m['entry_path']).name}")
        print(f" Custodian Name: {m['custodian_name']}")
        print(f" Channel Title: {m['channel_title']}")
        print(f" Similarity: {m['similarity']}% (threshold: {args.threshold}%)")
        print(f" Channel URL: {m['channel_url']}")
        print(f" Subscribers: {m['subscriber_count']}, Videos: {m['video_count']}")

    print()
    print("=" * 100)

    if args.fix:
        print("\nApplying fixes...")
        # TODO: Implement automatic fix (similar to fix_youtube_misattribution.py)
        print("Note: Automatic fix not yet implemented. Please review and fix manually.")
    else:
        print("\nTo apply fixes automatically, re-run with --fix flag")
        print("Or review each case manually using scripts/fix_youtube_misattribution.py")

    return 0


if __name__ == '__main__':
    # SystemExit works even where the site-provided exit() builtin is absent.
    raise SystemExit(main())