#!/usr/bin/env python3
"""
Fast detection of potential YouTube channel misattributions.
Scans only entries that have youtube_enrichment and checks name similarity.
"""
import re
import unicodedata
from pathlib import Path

# Default location of the enriched entry files; override via main(entries_dir=...).
DEFAULT_ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')

# Organisational boilerplate / stopwords stripped before comparing names.
# Order matters: dotted forms (e.g. 'e.v.') precede their bare forms ('ev').
_STOP_TERMS = [
    'stichting', 'vereniging', 'foundation', 'verein', 'museum',
    'archief', 'archive', 'bibliotheek', 'library', 'e.v.', 'ev',
    'gmbh', 'bv', 'b.v.', 'nv', 'n.v.', 'und', 'and', 'en', 'the',
    'de', 'het', 'der', 'die', 'das', 'van',
]
# Compiled once at import time; applied sequentially (same order/semantics as
# calling re.sub per term, but without per-call pattern construction).
_STOP_PATTERNS = [re.compile(rf'\b{re.escape(term)}\b') for term in _STOP_TERMS]
_PUNCT_PATTERN = re.compile(r'[^\w\s]')


def normalize_name(name: str) -> str:
    """Normalize a name for comparison.

    Lowercases, strips diacritics (NFD decomposition, then drops combining
    marks), removes organisational stopwords (_STOP_TERMS) and punctuation,
    and collapses runs of whitespace to single spaces.
    """
    if not name:
        return ""
    name = name.lower()
    # Decompose accents and drop combining marks: "café" -> "cafe".
    name = unicodedata.normalize('NFD', name)
    name = ''.join(c for c in name if unicodedata.category(c) != 'Mn')
    for pattern in _STOP_PATTERNS:
        name = pattern.sub('', name)
    name = _PUNCT_PATTERN.sub('', name)
    return ' '.join(name.split()).strip()


def word_overlap_score(name1: str, name2: str) -> float:
    """Calculate word overlap between two normalized names (0-100).

    Jaccard-like, but the intersection is divided by the word count of the
    *shorter* name, so a short channel title fully contained in a long
    custodian name still scores 100.
    """
    words1 = set(normalize_name(name1).split())
    words2 = set(normalize_name(name2).split())
    if not words1 or not words2:
        return 0.0
    intersection = words1 & words2
    min_len = min(len(words1), len(words2))
    if min_len == 0:
        return 0.0
    return (len(intersection) / min_len) * 100


def main(entries_dir: Path = DEFAULT_ENTRIES_DIR) -> None:
    """Scan per-entry YAML files and report low-overlap YouTube attributions.

    Args:
        entries_dir: Directory containing the ``*.yaml`` entry files.
            Defaults to DEFAULT_ENTRIES_DIR (previously hard-coded).
    """
    # Deferred third-party import: keeps the pure string helpers above
    # importable without PyYAML installed.
    import yaml

    mismatches = []
    processed = 0

    for entry_path in entries_dir.glob('*.yaml'):
        try:
            with open(entry_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Cheap substring checks before paying for a full YAML parse.
            if 'youtube_enrichment:' not in content:
                continue
            if ('misattributed_enrichments:' in content
                    and 'youtube' in content.split('misattributed_enrichments:')[1][:500]):
                continue  # Already flagged

            entry = yaml.safe_load(content)
            if not entry or 'youtube_enrichment' not in entry:
                continue

            processed += 1

            # Custodian name: prefer the structured claim, fall back to the
            # original source field ('organisatie').
            custodian_name = None
            if 'custodian_name' in entry:
                cn = entry['custodian_name']
                if isinstance(cn, dict):
                    custodian_name = cn.get('claim_value') or cn.get('name')
                else:
                    custodian_name = str(cn)
            if not custodian_name and 'original_entry' in entry:
                custodian_name = entry['original_entry'].get('organisatie')

            # YouTube channel title from the enrichment payload.
            yt = entry['youtube_enrichment']
            channel_title = None
            if 'channel' in yt and isinstance(yt['channel'], dict):
                channel_title = yt['channel'].get('title')

            if not custodian_name or not channel_title:
                continue

            score = word_overlap_score(custodian_name, channel_title)
            if score < 30:  # Low overlap threshold
                mismatches.append({
                    'file': entry_path.name,
                    'index': entry.get('entry_index'),
                    'custodian': custodian_name,
                    'channel': channel_title,
                    'score': round(score, 1),
                    'channel_url': yt.get('source_url',
                                          yt.get('channel', {}).get('channel_url')),
                })
        except Exception as e:
            # Best-effort scan: report the bad file and keep going.
            print(f"Error: {entry_path.name}: {e}")

    print(f"Scanned {processed} entries with YouTube enrichment")
    print(f"Found {len(mismatches)} potential misattributions (score < 30%)")
    print()
    if mismatches:
        print("=" * 100)
        for m in sorted(mismatches, key=lambda x: x['score']):
            # Bug fix: the original f"{m['index']:04d}" raised TypeError when
            # 'entry_index' was absent (None); format only when it's an int.
            idx = m['index']
            idx_str = f"{idx:04d}" if isinstance(idx, int) else str(idx)
            print(f"\nEntry {idx_str} ({m['file']})")
            print(f"  Custodian: {m['custodian']}")
            print(f"  Channel: {m['channel']}")
            print(f"  Overlap: {m['score']}%")
            print(f"  URL: {m['channel_url']}")


if __name__ == '__main__':
    main()