#!/usr/bin/env python3
"""
Fix YouTube channel misattribution in entry 0386 (Heemkunde Arcen).

The scraper found an embedded YouTube video from a third-party channel
(Siemes Sand und Kiesbaggerei - a German sand/gravel company) and incorrectly
enriched the heritage institution entry with that channel's data.

This script:
1. Moves the youtube_enrichment to misattributed_enrichments with explanation
2. Updates the social_youtube claim to note it's third-party embedded content
3. Updates the provenance to reflect the correction
"""

from datetime import datetime, timezone
from pathlib import Path

import yaml


def _channel_title(youtube_enrichment) -> str:
    """Return the channel title stored in the enrichment, or 'Unknown'."""
    if not isinstance(youtube_enrichment, dict):
        return 'Unknown'
    # `channel` may be present but null in the YAML; treat that like "missing".
    channel = youtube_enrichment.get('channel') or {}
    if not isinstance(channel, dict):
        return 'Unknown'
    return channel.get('title', 'Unknown')


def _build_misattribution_record(youtube_enrichment, timestamp: str) -> dict:
    """Build the audit record that preserves the wrongly attributed data.

    Key order is deliberate: the file is dumped with ``sort_keys=False``, so
    the order here is the order that ends up in the YAML.
    """
    return {
        'enrichment_type': 'youtube',
        'original_data': youtube_enrichment,
        'misattribution_detected': timestamp,
        'reason': (
            "YouTube channel 'Siemes Sand und Kiesbaggerei' (German sand/gravel dredging company) "
            "was incorrectly attributed to Stichting Heemkunde Arcen. The video was embedded on "
            "the institution's website because it relates to 'Limburgse Maasschippers' (Maas river "
            "shipping history), but the channel itself belongs to the dredging company, not the "
            "heritage institution."
        ),
        'source_claim': {
            'claim_type': 'social_youtube',
            'claim_value': 'https://www.youtube.com/@siemessandundkiesbaggerei3693',
            'xpath': '/html/body/div[1]/div[4]/div[2]/div/div/div/div[11]/div/div/div/div[3]/div[17]/div/div/a'
        },
        'correct_interpretation': 'third_party_embedded_video',
        'correction_method': 'manual_review',
        'correction_timestamp': timestamp,
    }


def _flag_youtube_claim(entry: dict) -> bool:
    """Annotate the first ``social_youtube`` web claim; return True if one was found."""
    web_claims = entry.get('web_claims')
    if not isinstance(web_claims, dict):
        return False
    for claim in web_claims.get('claims') or []:
        if not isinstance(claim, dict):
            continue
        if claim.get('claim_type') == 'social_youtube':
            claim['misattribution_detected'] = True
            claim['misattribution_note'] = (
                "This YouTube link points to 'Siemes Sand und Kiesbaggerei' (third-party channel), "
                "not the institution's official channel. The video was embedded on the website "
                "for related content but does not represent the institution's social media presence."
            )
            claim['claim_type_corrected'] = 'embedded_video_third_party'
            # Only the first matching claim is annotated.
            return True
    return False


def _flag_youtube_provenance(entry: dict, timestamp: str) -> bool:
    """Annotate the first YouTube provenance source; return True if it was updated."""
    provenance = entry.get('provenance')
    if not isinstance(provenance, dict):
        return False
    sources = provenance.get('sources')
    if not isinstance(sources, dict):
        return False
    youtube_prov = sources.get('youtube')
    if not isinstance(youtube_prov, list) or not youtube_prov:
        return False
    first_source = youtube_prov[0]
    if not isinstance(first_source, dict):
        return False
    first_source['misattribution_corrected'] = True
    first_source['correction_timestamp'] = timestamp
    first_source['correction_note'] = (
        "Channel was third-party (Siemes Sand und Kiesbaggerei), not institution channel. "
        "Data moved to misattributed_enrichments section."
    )
    return True


def fix_entry(entry_path: Path) -> None:
    """Fix the YouTube misattribution in the given entry file.

    Loads the YAML entry, moves ``youtube_enrichment`` into a new record under
    ``misattributed_enrichments``, annotates the ``social_youtube`` web claim
    and the YouTube provenance source (when present), and writes the entry
    back to the same path. If the entry has no ``youtube_enrichment`` the file
    is left untouched.

    Args:
        entry_path: Path of the YAML entry file to correct in place.

    Raises:
        ValueError: If the file does not contain a YAML mapping at top level
            (for example, an empty file).
    """
    print(f"Loading {entry_path}...")
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    # safe_load returns None for an empty document and may return a list or
    # scalar; none of those can be corrected, so fail with a clear message.
    if not isinstance(entry, dict):
        raise ValueError(
            f"Expected a YAML mapping in {entry_path}, got {type(entry).__name__}"
        )

    # 1. Extract and remove youtube_enrichment
    youtube_enrichment = entry.pop('youtube_enrichment', None)
    if not youtube_enrichment:
        # Nothing is written back, so the pop above never reaches the file.
        print("No youtube_enrichment found - nothing to fix")
        return

    print(f"Found youtube_enrichment for channel: {_channel_title(youtube_enrichment)}")

    # One timestamp for the whole correction keeps every recorded field consistent.
    timestamp = datetime.now(timezone.utc).isoformat()

    # 2. Create misattributed_enrichments section
    if not isinstance(entry.get('misattributed_enrichments'), list):
        # NOTE(review): a present-but-non-list value (e.g. null) is replaced
        # with a fresh list; confirm no other shape is expected here.
        entry['misattributed_enrichments'] = []
    entry['misattributed_enrichments'].append(
        _build_misattribution_record(youtube_enrichment, timestamp)
    )

    # 3. Update the social_youtube claim in web_claims
    claim_updated = _flag_youtube_claim(entry)
    if claim_updated:
        print("Updated social_youtube claim with misattribution note")

    # 4. Update provenance to note the correction
    provenance_updated = _flag_youtube_provenance(entry, timestamp)
    if provenance_updated:
        print("Updated provenance with correction note")

    # 5. Write back the corrected entry
    print(f"Writing corrected entry to {entry_path}...")
    # Serialize BEFORE opening the file for writing: opening with 'w' truncates
    # it, so a failure inside yaml.dump would otherwise destroy the entry.
    serialized = yaml.dump(
        entry,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
        width=120,
    )
    with open(entry_path, 'w', encoding='utf-8') as f:
        f.write(serialized)

    # Report only the steps that actually happened.
    print("Done! YouTube misattribution has been corrected.")
    print(" - youtube_enrichment moved to misattributed_enrichments")
    if claim_updated:
        print(" - social_youtube claim updated with misattribution note")
    if provenance_updated:
        print(" - Provenance updated with correction metadata")


def main() -> int:
    """Run the correction on the hard-coded entry file.

    Returns:
        0 on success, 1 if the entry file is missing or is not a YAML mapping.
    """
    entry_path = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries/0386_heemkunde_arcen.yaml')

    if not entry_path.exists():
        print(f"Error: Entry file not found: {entry_path}")
        return 1

    try:
        fix_entry(entry_path)
    except ValueError as err:
        print(f"Error: {err}")
        return 1
    return 0


if __name__ == '__main__':
    # SystemExit works even when the site-provided `exit` builtin is unavailable.
    raise SystemExit(main())