glam/scripts/detect_youtube_misattributions_fast.py
2025-12-03 17:38:46 +01:00

131 lines
4.4 KiB
Python

#!/usr/bin/env python3
"""
Fast detection of potential YouTube channel misattributions.
Scans only entries that have youtube_enrichment and checks name similarity.
"""
import re
import unicodedata
from pathlib import Path
import yaml
def normalize_name(name: str) -> str:
    """Normalize an organisation/channel name for fuzzy comparison.

    Lowercases, strips diacritics (NFD decomposition + removal of
    combining marks), removes punctuation, and drops common legal-form
    and stop words (Dutch, German, English) so only distinctive words
    remain.

    Args:
        name: Raw name; may be empty or falsy.

    Returns:
        Single-space-joined remaining words, or '' for empty input.
    """
    if not name:
        return ""
    name = name.lower()
    # Strip diacritics: decompose, then drop combining marks (category 'Mn').
    name = unicodedata.normalize('NFD', name)
    name = ''.join(c for c in name if unicodedata.category(c) != 'Mn')
    # Remove punctuation BEFORE stop-term removal so dotted abbreviations
    # such as "b.v." / "e.v." / "n.v." collapse to "bv"/"ev"/"nv" and are
    # caught by the list below. (The previous order never stripped them:
    # \b after an escaped '.' does not match before a space or end-of-string,
    # and by the time punctuation was removed, term removal had already run.)
    name = re.sub(r'[^\w\s]', '', name)
    # Common legal forms and stop words in Dutch, German and English.
    for term in ['stichting', 'vereniging', 'foundation', 'verein', 'museum',
                 'archief', 'archive', 'bibliotheek', 'library',
                 'ev', 'gmbh', 'bv', 'nv',
                 'und', 'and', 'en', 'the', 'de', 'het', 'der', 'die', 'das', 'van']:
        name = re.sub(rf'\b{re.escape(term)}\b', '', name)
    return ' '.join(name.split()).strip()
def word_overlap_score(name1: str, name2: str) -> float:
    """Return the word overlap between two names as a percentage (0-100).

    Both names are normalized first; the score is the overlap coefficient
    |A ∩ B| / min(|A|, |B|) scaled to 0-100. Dividing by the SHORTER
    name's word count keeps e.g. "Museum X" vs "Museum X Official Channel"
    at 100.

    Args:
        name1: First name (e.g. custodian name).
        name2: Second name (e.g. YouTube channel title).

    Returns:
        Overlap percentage; 0.0 when either name normalizes to nothing.
    """
    words1 = set(normalize_name(name1).split())
    words2 = set(normalize_name(name2).split())
    if not words1 or not words2:
        return 0.0
    # Both sets are non-empty here, so min(...) >= 1 and the division is
    # safe (the previous extra min_len == 0 check was unreachable).
    return len(words1 & words2) / min(len(words1), len(words2)) * 100
def main():
    """Scan enriched entry YAML files and report YouTube enrichments whose
    channel title has low word overlap with the custodian name.

    Prints a summary followed by one block per suspected misattribution,
    sorted by ascending overlap score. Per-file errors are reported and
    skipped so one bad file cannot abort the whole scan.
    """
    entries_dir = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
    mismatches = []
    processed = 0
    for entry_path in entries_dir.glob('*.yaml'):
        try:
            with open(entry_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # Cheap textual pre-filter before paying for a YAML parse.
            if 'youtube_enrichment:' not in content:
                continue
            # Skip entries already flagged ('youtube' appearing shortly
            # after the misattributed_enrichments key).
            if ('misattributed_enrichments:' in content
                    and 'youtube' in content.split('misattributed_enrichments:')[1][:500]):
                continue  # Already flagged
            entry = yaml.safe_load(content)
            if not entry or 'youtube_enrichment' not in entry:
                continue
            processed += 1
            # Resolve custodian name: structured claim first, then the
            # original entry's 'organisatie' field. A present-but-None
            # custodian_name no longer turns into the literal "None".
            custodian_name = None
            cn = entry.get('custodian_name')
            if isinstance(cn, dict):
                custodian_name = cn.get('claim_value') or cn.get('name')
            elif cn is not None:
                custodian_name = str(cn)
            if not custodian_name and 'original_entry' in entry:
                custodian_name = entry['original_entry'].get('organisatie')
            # YouTube channel title, guarded against a non-dict channel.
            yt = entry['youtube_enrichment']
            channel = yt.get('channel')
            channel_title = channel.get('title') if isinstance(channel, dict) else None
            if not custodian_name or not channel_title:
                continue
            score = word_overlap_score(custodian_name, channel_title)
            if score < 30:  # Low overlap threshold
                mismatches.append({
                    'file': entry_path.name,
                    'index': entry.get('entry_index'),
                    'custodian': custodian_name,
                    'channel': channel_title,
                    'score': round(score, 1),
                    'channel_url': yt.get('source_url', yt.get('channel', {}).get('channel_url')),
                })
        except Exception as e:
            # Best-effort scan: report and move on to the next file.
            print(f"Error: {entry_path.name}: {e}")
    print(f"Scanned {processed} entries with YouTube enrichment")
    print(f"Found {len(mismatches)} potential misattributions (score < 30%)")
    print()
    if mismatches:
        print("=" * 100)
        for m in sorted(mismatches, key=lambda x: x['score']):
            # entry_index may be absent; None would crash the :04d format.
            index = f"{m['index']:04d}" if m['index'] is not None else '????'
            print(f"\nEntry {index} ({m['file']})")
            print(f" Custodian: {m['custodian']}")
            print(f" Channel: {m['channel']}")
            print(f" Overlap: {m['score']}%")
            print(f" URL: {m['channel_url']}")


if __name__ == '__main__':
    main()