#!/usr/bin/env python3
"""
Fast detection of potential YouTube channel misattributions.

Scans only entries that have youtube_enrichment and checks name similarity.
"""
import re
import unicodedata
from pathlib import Path

import yaml
def normalize_name(name: str) -> str:
    """Normalize an organisation name for fuzzy comparison.

    Lowercases, strips diacritics (NFD decomposition + removal of
    combining marks), removes common legal forms / institution types /
    stop words, strips remaining punctuation, and collapses whitespace.

    Args:
        name: Raw organisation or channel name; may be empty.

    Returns:
        The normalized name — possibly the empty string if nothing
        meaningful remains after stripping.
    """
    if not name:
        return ""

    name = name.lower()
    # Strip diacritics: decompose, then drop combining marks (category Mn).
    name = unicodedata.normalize('NFD', name)
    name = ''.join(c for c in name if unicodedata.category(c) != 'Mn')

    # Remove common legal forms, institution types, and stop words.
    # NOTE: lookarounds instead of \b on both ends — \b after a trailing
    # '.' requires a following word character, so dotted terms like
    # 'b.v.' could never match; (?<!\w)/(?!\w) behave like \b for the
    # pure-word terms and also work for the dotted ones.
    for term in ['stichting', 'vereniging', 'foundation', 'verein', 'museum',
                 'archief', 'archive', 'bibliotheek', 'library',
                 'e.v.', 'ev', 'gmbh', 'bv', 'b.v.', 'nv', 'n.v.',
                 'und', 'and', 'en', 'the', 'de', 'het', 'der', 'die', 'das', 'van']:
        name = re.sub(rf'(?<!\w){re.escape(term)}(?!\w)', '', name)

    # Drop any remaining punctuation, then collapse runs of whitespace.
    name = re.sub(r'[^\w\s]', '', name)
    return ' '.join(name.split()).strip()
def word_overlap_score(name1: str, name2: str) -> float:
    """Score word overlap between two names on a 0–100 scale.

    Both names are normalized via ``normalize_name`` and split into word
    sets; the score is ``|intersection| / min(|set1|, |set2|) * 100`` —
    an overlap coefficient weighted towards the shorter name, so a short
    name fully contained in a longer one scores 100.

    Args:
        name1: First name (raw, un-normalized).
        name2: Second name (raw, un-normalized).

    Returns:
        Overlap percentage in [0.0, 100.0]; 0.0 when either name
        normalizes to the empty string.
    """
    words1 = set(normalize_name(name1).split())
    words2 = set(normalize_name(name2).split())

    # Either side empty after normalization -> no basis for comparison.
    # (This also makes min(len(...)) == 0 impossible below, so the
    # original redundant zero-division guard has been removed.)
    if not words1 or not words2:
        return 0.0

    shared = words1 & words2
    # Jaccard-like but weighted towards the shorter name.
    return (len(shared) / min(len(words1), len(words2))) * 100
def _custodian_name(entry: dict):
    """Best-effort extraction of the custodian name from an entry dict.

    Prefers the ``custodian_name`` field (either a claim-style dict with
    ``claim_value``/``name`` keys, or a plain scalar); falls back to
    ``original_entry.organisatie``. Returns None when nothing usable
    is present.
    """
    if 'custodian_name' in entry:
        cn = entry['custodian_name']
        if isinstance(cn, dict):
            name = cn.get('claim_value') or cn.get('name')
        else:
            name = str(cn)
        if name:
            return name
    if 'original_entry' in entry:
        return entry['original_entry'].get('organisatie')
    return None


def _channel_title(yt: dict):
    """Return the YouTube channel title from the enrichment, or None."""
    channel = yt.get('channel')
    if isinstance(channel, dict):
        return channel.get('title')
    return None


def main(entries_dir: Path = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')) -> None:
    """Scan enriched entry YAML files and report likely misattributions.

    An entry is reported when the word overlap between its custodian
    name and its YouTube channel title is below 30%.

    Args:
        entries_dir: Directory containing per-entry ``*.yaml`` files.
            Defaults to the project's enriched-entries location.
    """
    mismatches = []
    processed = 0

    for entry_path in entries_dir.glob('*.yaml'):
        try:
            with open(entry_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Cheap substring check before paying the YAML-parse cost.
            if 'youtube_enrichment:' not in content:
                continue

            # Skip entries already flagged for a youtube misattribution.
            # Heuristic: 'youtube' appearing shortly after the flag key.
            if ('misattributed_enrichments:' in content
                    and 'youtube' in content.split('misattributed_enrichments:')[1][:500]):
                continue

            entry = yaml.safe_load(content)
            if not entry or 'youtube_enrichment' not in entry:
                continue

            processed += 1

            custodian_name = _custodian_name(entry)
            yt = entry['youtube_enrichment']
            channel_title = _channel_title(yt)

            if not custodian_name or not channel_title:
                continue

            score = word_overlap_score(custodian_name, channel_title)
            if score < 30:  # low-overlap threshold
                mismatches.append({
                    'file': entry_path.name,
                    'index': entry.get('entry_index'),
                    'custodian': custodian_name,
                    'channel': channel_title,
                    'score': round(score, 1),
                    'channel_url': yt.get('source_url', yt.get('channel', {}).get('channel_url')),
                })
        except Exception as e:
            # Best-effort scan: report the bad file and keep going.
            print(f"Error: {entry_path.name}: {e}")

    print(f"Scanned {processed} entries with YouTube enrichment")
    print(f"Found {len(mismatches)} potential misattributions (score < 30%)")
    print()

    if mismatches:
        print("=" * 100)
        for m in sorted(mismatches, key=lambda x: x['score']):
            # entry_index may be absent (None); don't let :04d raise.
            idx = m['index']
            idx_label = f"{idx:04d}" if isinstance(idx, int) else str(idx)
            print(f"\nEntry {idx_label} ({m['file']})")
            print(f" Custodian: {m['custodian']}")
            print(f" Channel: {m['channel']}")
            print(f" Overlap: {m['score']}%")
            print(f" URL: {m['channel_url']}")
# Script entry point: run the scan only when executed directly.
if __name__ == "__main__":
    main()
|