243 lines
8.1 KiB
Python
243 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Detect potential YouTube channel misattributions across all enriched entries.
|
|
|
|
This script scans all entries with youtube_enrichment and checks if the
|
|
YouTube channel name matches the institution's custodian_name using fuzzy matching.
|
|
|
|
A mismatch indicates the YouTube channel may belong to a third party (embedded video)
|
|
rather than being the institution's official channel.
|
|
|
|
Usage:
|
|
python scripts/detect_youtube_misattributions.py
|
|
python scripts/detect_youtube_misattributions.py --threshold 50 # Lower threshold = more sensitive
|
|
python scripts/detect_youtube_misattributions.py --fix # Apply fixes automatically
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
import unicodedata
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
import yaml
|
|
# rapidfuzz is an optional dependency: record whether it could be imported so
# the similarity code can pick between fuzzy matching and a basic fallback.
try:
    from rapidfuzz import fuzz
except ImportError:
    HAVE_RAPIDFUZZ = False
    print("Warning: rapidfuzz not available, using basic string matching")
else:
    HAVE_RAPIDFUZZ = True
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize a name for comparison.

    Steps, applied in this order:

    - Lowercase
    - Remove accents/diacritics (NFD decomposition, then drop combining marks)
    - Remove common legal-form terms and stop words (Dutch/German/English)
    - Remove punctuation
    - Collapse whitespace

    Args:
        name: The raw name. A falsy value (``None`` or ``""``) yields ``""``.

    Returns:
        The normalized name; may be empty if nothing but removable terms
        and punctuation was present.
    """
    if not name:
        return ""

    # Lowercase
    name = name.lower()

    # Remove accents/diacritics: decompose, then drop combining marks ('Mn').
    name = unicodedata.normalize('NFD', name)
    name = ''.join(c for c in name if unicodedata.category(c) != 'Mn')

    # Remove common legal form terms (Dutch/German/English)
    legal_forms = [
        'stichting', 'vereniging', 'foundation', 'verein',
        'e.v.', 'ev', 'gmbh', 'bv', 'b.v.', 'nv', 'n.v.',
        'und', 'and', 'en', 'the', 'de', 'het', 'der', 'die', 'das',
    ]
    for term in legal_forms:
        # A trailing \b after a '.' only matches when a word character
        # follows, so "e.v." at the end of a name (or before a space) would
        # never be removed. Only anchor the right edge with \b when the term
        # actually ends in a word character.
        tail = r'\b' if term[-1].isalnum() else ''
        name = re.sub(rf'\b{re.escape(term)}{tail}', '', name)

    # Remove punctuation
    name = re.sub(r'[^\w\s]', '', name)

    # Collapse whitespace (split/join also trims both ends)
    return ' '.join(name.split())
|
|
|
|
|
|
def name_similarity(name1: str, name2: str) -> float:
    """Score how alike two names are on a 0-100 scale.

    Both names are normalized first. If either normalizes to an empty
    string the score is 0.0. With rapidfuzz available the score is its
    ``token_set_ratio``; otherwise a basic fallback is used: 80.0 when one
    normalized name contains the other, else the word-set overlap
    (intersection over union) as a percentage.
    """
    left = normalize_name(name1)
    right = normalize_name(name2)

    if not (left and right):
        return 0.0

    if HAVE_RAPIDFUZZ:
        # token_set_ratio copes better with reordered words.
        return fuzz.token_set_ratio(left, right)

    # Fallback 1: substring containment in either direction.
    if left in right or right in left:
        return 80.0

    # Fallback 2: share of distinct words the two names have in common.
    left_words = set(left.split())
    right_words = set(right.split())
    if not (left_words and right_words):
        return 0.0

    shared = left_words & right_words
    combined = left_words | right_words
    return (len(shared) / len(combined)) * 100
|
|
|
|
|
|
def get_custodian_name(entry: dict) -> Optional[str]:
    """Extract the custodian name from an entry.

    Lookup order:

    1. ``entry['custodian_name']`` - either a mapping (``claim_value`` is
       preferred, then ``name``) or a plain scalar value.
    2. ``entry['original_entry']['organisatie']``.

    A missing, null or empty value at step 1 falls through to step 2
    instead of ending the lookup (or, for a null scalar, producing the
    literal string ``"None"``).

    Args:
        entry: Parsed entry mapping.

    Returns:
        The custodian name as a string, or ``None`` if no usable name exists.
    """
    # Try custodian_name field first
    cn = entry.get('custodian_name')
    if isinstance(cn, dict):
        name = cn.get('claim_value') or cn.get('name')
    else:
        name = cn
    if name:
        return str(name)

    # Try original_entry.organisatie; guard against a null/non-mapping value.
    original = entry.get('original_entry')
    if isinstance(original, dict):
        org = original.get('organisatie')
        if org:
            return str(org)

    return None
|
|
|
|
|
|
def get_youtube_channel_title(entry: dict) -> Optional[str]:
    """Extract the YouTube channel title from enrichment data.

    Reads ``entry['youtube_enrichment']['channel']['title']``.

    Args:
        entry: Parsed entry mapping.

    Returns:
        The channel title, or ``None`` when the enrichment or its
        ``channel`` section is missing, null, or not a mapping.
    """
    # A key that is present but null (or otherwise not a mapping) must not
    # raise on the membership test below; treat it like a missing enrichment.
    yt = entry.get('youtube_enrichment')
    if not isinstance(yt, dict):
        return None

    channel = yt.get('channel')
    if isinstance(channel, dict):
        return channel.get('title')

    return None
|
|
|
|
|
|
def analyze_entry(entry_path: Path, threshold: float = 60.0) -> Optional[dict]:
    """Check one entry file for a possibly misattributed YouTube channel.

    The YAML file is loaded and the custodian name is compared with the
    YouTube channel title. A report dict is returned only when the
    similarity is below ``threshold``.

    Returns ``None`` when the file is empty, has no ``youtube_enrichment``,
    already lists a ``youtube`` item under ``misattributed_enrichments``,
    lacks either name, or the names are similar enough.
    """
    with open(entry_path, 'r', encoding='utf-8') as handle:
        record = yaml.safe_load(handle)

    # Nothing to inspect: empty document or no YouTube enrichment.
    if not record or 'youtube_enrichment' not in record:
        return None

    # An existing youtube misattribution record means it was handled already.
    if 'misattributed_enrichments' in record:
        already_flagged = any(
            item.get('enrichment_type') == 'youtube'
            for item in record['misattributed_enrichments']
        )
        if already_flagged:
            return None

    custodian_name = get_custodian_name(record)
    channel_title = get_youtube_channel_title(record)
    if not (custodian_name and channel_title):
        return None

    similarity = name_similarity(custodian_name, channel_title)
    if similarity < threshold:
        enrichment = record['youtube_enrichment']
        report = {
            'entry_path': str(entry_path),
            'entry_index': record.get('entry_index'),
            'custodian_name': custodian_name,
            'channel_title': channel_title,
            'similarity': round(similarity, 1),
            'channel_url': enrichment.get('source_url'),
            'subscriber_count': enrichment.get('channel', {}).get('subscriber_count'),
            'video_count': enrichment.get('channel', {}).get('video_count'),
        }
        return report

    return None
|
|
|
|
|
|
def _format_entry_index(entry_index) -> str:
    """Render an entry index as a zero-padded number, tolerating odd values."""
    try:
        return f"{entry_index:04d}"
    except (TypeError, ValueError):
        # Missing (None) or non-integer indexes must not abort the report.
        return "unknown" if entry_index is None else str(entry_index)


def main():
    """Scan all entry files and report potential YouTube misattributions.

    Command-line options:
        --threshold    Similarity (0-100) below which an entry is flagged.
        --fix          Request automatic fixes (not yet implemented; only a
                       notice is printed).
        --entries-dir  Directory containing the ``*.yaml`` entry files.

    Returns:
        Process exit code: 1 if the entries directory does not exist,
        otherwise 0.
    """
    parser = argparse.ArgumentParser(description='Detect YouTube channel misattributions')
    parser.add_argument('--threshold', type=float, default=60.0,
                        help='Similarity threshold below which to flag (0-100, default: 60)')
    parser.add_argument('--fix', action='store_true',
                        help='Apply fixes automatically (move to misattributed_enrichments)')
    parser.add_argument('--entries-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/nde/enriched/entries'),
                        help='Directory containing entry files')
    args = parser.parse_args()

    entries_dir = args.entries_dir
    if not entries_dir.exists():
        print(f"Error: Entries directory not found: {entries_dir}")
        return 1

    # Find all entry files
    entry_files = sorted(entries_dir.glob('*.yaml'))
    print(f"Scanning {len(entry_files)} entries for YouTube misattributions...")
    print(f"Similarity threshold: {args.threshold}%")
    print()

    potential_misattributions = []
    entries_with_youtube = 0

    for entry_path in entry_files:
        # Per-file boundary: one unreadable or malformed entry is reported
        # and skipped so that the scan of the remaining files continues.
        try:
            with open(entry_path, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            if entry and 'youtube_enrichment' in entry:
                entries_with_youtube += 1

            result = analyze_entry(entry_path, args.threshold)
            if result:
                potential_misattributions.append(result)
        except Exception as e:
            print(f"Error processing {entry_path}: {e}")

    print(f"Entries with YouTube enrichment: {entries_with_youtube}")
    print(f"Potential misattributions detected: {len(potential_misattributions)}")
    print()

    if potential_misattributions:
        print("=" * 100)
        print("POTENTIAL MISATTRIBUTIONS")
        print("=" * 100)

        for i, m in enumerate(potential_misattributions, 1):
            # entry_index comes from entry.get(...) and may be None or a
            # non-integer; formatting it with ':04d' directly would raise.
            print(f"\n{i}. Entry {_format_entry_index(m['entry_index'])}")
            print(f" File: {Path(m['entry_path']).name}")
            print(f" Custodian Name: {m['custodian_name']}")
            print(f" Channel Title: {m['channel_title']}")
            print(f" Similarity: {m['similarity']}% (threshold: {args.threshold}%)")
            print(f" Channel URL: {m['channel_url']}")
            print(f" Subscribers: {m['subscriber_count']}, Videos: {m['video_count']}")

        print()
        print("=" * 100)

        if args.fix:
            print("\nApplying fixes...")
            # TODO: Implement automatic fix (similar to fix_youtube_misattribution.py)
            print("Note: Automatic fix not yet implemented. Please review and fix manually.")
        else:
            print("\nTo apply fixes automatically, re-run with --fix flag")
            print("Or review each case manually using scripts/fix_youtube_misattribution.py")
    else:
        print("No potential misattributions detected!")

    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # The bare `exit` name is injected by the `site` module for interactive
    # sessions and is absent under `python -S`; raising SystemExit directly
    # propagates main()'s return code without depending on it.
    raise SystemExit(main())
|