glam/scripts/detect_youtube_misattributions.py
2025-12-03 17:38:46 +01:00

243 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""
Detect potential YouTube channel misattributions across all enriched entries.
This script scans all entries with youtube_enrichment and checks if the
YouTube channel name matches the institution's custodian_name using fuzzy matching.
A mismatch indicates the YouTube channel may belong to a third party (embedded video)
rather than being the institution's official channel.
Usage:
python scripts/detect_youtube_misattributions.py
python scripts/detect_youtube_misattributions.py --threshold 50 # Lower threshold = fewer entries flagged (less sensitive)
python scripts/detect_youtube_misattributions.py --fix # Not yet implemented; currently only prints a notice
"""
import argparse
import re
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
import yaml
try:
from rapidfuzz import fuzz
HAVE_RAPIDFUZZ = True
except ImportError:
HAVE_RAPIDFUZZ = False
print("Warning: rapidfuzz not available, using basic string matching")
def normalize_name(name: str) -> str:
    """Normalize a name for fuzzy comparison.

    Steps, in order:
    - lowercase
    - strip accents/diacritics (NFD decomposition, then drop combining marks)
    - remove common legal-form terms and stop words (Dutch/German/English)
    - remove remaining punctuation
    - collapse whitespace

    Args:
        name: Raw institution or channel name. Falsy values yield "".

    Returns:
        The normalized name, which may be an empty string.
    """
    if not name:
        return ""

    name = name.lower()

    # Decompose accented characters, then drop the combining marks ('Mn').
    name = unicodedata.normalize('NFD', name)
    name = ''.join(c for c in name if unicodedata.category(c) != 'Mn')

    # Common legal form terms and stop words (Dutch/German/English).
    legal_forms = [
        'stichting', 'vereniging', 'foundation', 'verein',
        'e.v.', 'ev', 'gmbh', 'bv', 'b.v.', 'nv', 'n.v.',
        'und', 'and', 'en', 'the', 'de', 'het', 'der', 'die', 'das'
    ]
    for term in legal_forms:
        # `\b` only matches between a word and a non-word character, so a
        # trailing `\b` after the final dot of "e.v." can never match before
        # a space or at end of string. Anchor only on the sides where the
        # term itself begins/ends with a word character.
        leading = r'(?<!\w)' if term[0].isalnum() else ''
        trailing = r'(?!\w)' if term[-1].isalnum() else ''
        name = re.sub(f'{leading}{re.escape(term)}{trailing}', '', name)

    # Remove punctuation (anything that is neither a word char nor whitespace).
    name = re.sub(r'[^\w\s]', '', name)

    # Collapse runs of whitespace and trim the ends.
    return ' '.join(name.split())
def name_similarity(name1: str, name2: str) -> float:
    """Score how alike two names are on a 0-100 scale.

    Both names are normalized first. If either normalizes to an empty
    string the score is 0.0. With rapidfuzz available the score is its
    ``token_set_ratio``; otherwise a simple fallback is used: 80.0 when one
    normalized name contains the other, else the word-level Jaccard index
    expressed as a percentage.
    """
    left = normalize_name(name1)
    right = normalize_name(name2)
    if not (left and right):
        return 0.0

    if HAVE_RAPIDFUZZ:
        # token_set_ratio copes with reordered words.
        return fuzz.token_set_ratio(left, right)

    # Fallback 1: plain substring containment in either direction.
    if left in right or right in left:
        return 80.0

    # Fallback 2: overlap of the word sets.
    left_words = set(left.split())
    right_words = set(right.split())
    if not (left_words and right_words):
        return 0.0

    shared = left_words & right_words
    combined = left_words | right_words
    return (len(shared) / len(combined)) * 100
def get_custodian_name(entry: dict) -> Optional[str]:
    """Extract the custodian (institution) name from an entry.

    Lookup order:
    1. ``custodian_name``: if it is a dict, its ``claim_value`` or ``name``;
       otherwise the value converted to ``str``.
    2. ``original_entry.organisatie``.

    A missing, null, or empty ``custodian_name`` falls through to step 2
    instead of producing the string ``"None"`` or ending the lookup early.

    Args:
        entry: Parsed entry mapping.

    Returns:
        The first non-empty name found, or None if there is none.
    """
    cn = entry.get('custodian_name')
    if isinstance(cn, dict):
        claimed = cn.get('claim_value') or cn.get('name')
        if claimed:
            return claimed
    elif cn is not None:
        text = str(cn)
        if text:
            return text

    # Fall back to the original record; guard against a null/non-dict value.
    original = entry.get('original_entry')
    if isinstance(original, dict):
        org = original.get('organisatie')
        if org:
            return org

    return None
def get_youtube_channel_title(entry: dict) -> Optional[str]:
    """Extract the YouTube channel title from an entry's enrichment data.

    Reads ``entry['youtube_enrichment']['channel']['title']``.

    Args:
        entry: Parsed entry mapping.

    Returns:
        The channel title, or None when the enrichment or its ``channel``
        mapping is missing, null, or not a dict.
    """
    yt = entry.get('youtube_enrichment')
    # A null enrichment (`youtube_enrichment:` with no value) must not raise.
    if not isinstance(yt, dict):
        return None

    channel = yt.get('channel')
    if isinstance(channel, dict):
        return channel.get('title')
    return None
def analyze_entry(entry_path: Path, threshold: float = 60.0) -> Optional[dict]:
    """Check one entry file for a likely YouTube channel misattribution.

    The YAML file at ``entry_path`` is loaded and skipped (None returned)
    when it is empty, has no ``youtube_enrichment``, already lists a
    ``youtube`` item under ``misattributed_enrichments``, or lacks either a
    custodian name or a channel title.

    Args:
        entry_path: Path of the YAML entry file to inspect.
        threshold: Similarity score (0-100) below which the entry is flagged.

    Returns:
        A dict describing the suspected misattribution when the name
        similarity is below ``threshold``; otherwise None.
    """
    with open(entry_path, 'r', encoding='utf-8') as handle:
        record = yaml.safe_load(handle)

    # Nothing to do for empty files or entries without YouTube data.
    if not record or 'youtube_enrichment' not in record:
        return None

    # An entry whose YouTube enrichment was flagged before is already handled.
    if 'misattributed_enrichments' in record:
        if any(item.get('enrichment_type') == 'youtube'
               for item in record['misattributed_enrichments']):
            return None

    custodian = get_custodian_name(record)
    title = get_youtube_channel_title(record)
    if not (custodian and title):
        return None

    score = name_similarity(custodian, title)
    if not (score < threshold):
        return None

    enrichment = record['youtube_enrichment']
    report = {
        'entry_path': str(entry_path),
        'entry_index': record.get('entry_index'),
        'custodian_name': custodian,
        'channel_title': title,
        'similarity': round(score, 1),
        'channel_url': enrichment.get('source_url'),
        'subscriber_count': enrichment.get('channel', {}).get('subscriber_count'),
        'video_count': enrichment.get('channel', {}).get('video_count'),
    }
    return report
def main():
    """Command-line entry point: scan entry files and report suspects.

    Parses ``--threshold``, ``--fix`` and ``--entries-dir``, runs
    ``analyze_entry`` on every ``*.yaml`` file in the entries directory, and
    prints a summary plus one section per potential misattribution.

    Returns:
        1 if the entries directory does not exist, otherwise 0.
    """
    parser = argparse.ArgumentParser(description='Detect YouTube channel misattributions')
    parser.add_argument('--threshold', type=float, default=60.0,
                        help='Similarity threshold below which to flag (0-100, default: 60)')
    parser.add_argument('--fix', action='store_true',
                        help='Apply fixes automatically (move to misattributed_enrichments)')
    # NOTE(review): the default is an absolute path on one developer's
    # machine; pass --entries-dir when running elsewhere.
    parser.add_argument('--entries-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/nde/enriched/entries'),
                        help='Directory containing entry files')
    args = parser.parse_args()

    entries_dir = args.entries_dir
    if not entries_dir.exists():
        print(f"Error: Entries directory not found: {entries_dir}")
        return 1

    # Find all entry files
    entry_files = sorted(entries_dir.glob('*.yaml'))
    print(f"Scanning {len(entry_files)} entries for YouTube misattributions...")
    print(f"Similarity threshold: {args.threshold}%")
    print()

    potential_misattributions = []
    entries_with_youtube = 0
    for entry_path in entry_files:
        # Top-level boundary: one unreadable or malformed file must not stop
        # the scan, so report the error and carry on with the next file.
        try:
            # Parsed here only to count entries carrying YouTube data;
            # analyze_entry loads the file again itself.
            with open(entry_path, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)
            if entry and 'youtube_enrichment' in entry:
                entries_with_youtube += 1
                result = analyze_entry(entry_path, args.threshold)
                if result:
                    potential_misattributions.append(result)
        except Exception as e:
            print(f"Error processing {entry_path}: {e}")

    print(f"Entries with YouTube enrichment: {entries_with_youtube}")
    print(f"Potential misattributions detected: {len(potential_misattributions)}")
    print()

    if potential_misattributions:
        print("=" * 100)
        print("POTENTIAL MISATTRIBUTIONS")
        print("=" * 100)
        for i, m in enumerate(potential_misattributions, 1):
            # entry_index is read with entry.get(...) and may be missing or
            # non-integer; the ':04d' format spec would raise TypeError on
            # None and abort the whole report.
            entry_index = m['entry_index']
            if isinstance(entry_index, int):
                index_label = f"{entry_index:04d}"
            elif entry_index is None:
                index_label = "unknown"
            else:
                index_label = str(entry_index)
            print(f"\n{i}. Entry {index_label}")
            print(f" File: {Path(m['entry_path']).name}")
            print(f" Custodian Name: {m['custodian_name']}")
            print(f" Channel Title: {m['channel_title']}")
            print(f" Similarity: {m['similarity']}% (threshold: {args.threshold}%)")
            print(f" Channel URL: {m['channel_url']}")
            print(f" Subscribers: {m['subscriber_count']}, Videos: {m['video_count']}")
        print()
        print("=" * 100)
        if args.fix:
            print("\nApplying fixes...")
            # TODO: Implement automatic fix (similar to fix_youtube_misattribution.py)
            print("Note: Automatic fix not yet implemented. Please review and fix manually.")
        else:
            print("\nTo apply fixes automatically, re-run with --fix flag")
            print("Or review each case manually using scripts/fix_youtube_misattribution.py")
    else:
        print("No potential misattributions detected!")

    return 0
if __name__ == '__main__':
    # `exit()` is a helper injected by the `site` module for interactive use
    # and is absent under `python -S`; raising SystemExit always works and
    # passes main()'s return value through as the process exit status.
    raise SystemExit(main())