glam/scripts/fix_youtube_misattribution.py
2025-12-03 17:38:46 +01:00

117 lines
5.1 KiB
Python

#!/usr/bin/env python3
"""
Fix YouTube channel misattribution in entry 0386 (Heemkunde Arcen).
The scraper found an embedded YouTube video from a third-party channel
(Siemes Sand und Kiesbaggerei - a German sand/gravel company) and incorrectly
enriched the heritage institution entry with that channel's data.
This script:
1. Moves the youtube_enrichment to misattributed_enrichments with explanation
2. Updates the social_youtube claim to note it's third-party embedded content
3. Updates the provenance to reflect the correction
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone
def _build_misattribution_record(youtube_enrichment, timestamp: str) -> dict:
    """Build the audit record that preserves the wrongly attributed YouTube data."""
    return {
        'enrichment_type': 'youtube',
        'original_data': youtube_enrichment,
        'misattribution_detected': timestamp,
        'reason': (
            "YouTube channel 'Siemes Sand und Kiesbaggerei' (German sand/gravel dredging company) "
            "was incorrectly attributed to Stichting Heemkunde Arcen. The video was embedded on "
            "the institution's website because it relates to 'Limburgse Maasschippers' (Maas river "
            "shipping history), but the channel itself belongs to the dredging company, not the "
            "heritage institution."
        ),
        'source_claim': {
            'claim_type': 'social_youtube',
            'claim_value': 'https://www.youtube.com/@siemessandundkiesbaggerei3693',
            'xpath': '/html/body/div[1]/div[4]/div[2]/div/div/div/div[11]/div/div/div/div[3]/div[17]/div/div/a',
        },
        'correct_interpretation': 'third_party_embedded_video',
        'correction_method': 'manual_review',
        'correction_timestamp': timestamp,
    }


def _flag_social_youtube_claim(entry: dict) -> bool:
    """Annotate the first social_youtube web claim; return True if one was annotated."""
    web_claims = entry.get('web_claims')
    claims = web_claims.get('claims') if isinstance(web_claims, dict) else None
    if not isinstance(claims, list):
        return False

    for claim in claims:
        if isinstance(claim, dict) and claim.get('claim_type') == 'social_youtube':
            claim['misattribution_detected'] = True
            claim['misattribution_note'] = (
                "This YouTube link points to 'Siemes Sand und Kiesbaggerei' (third-party channel), "
                "not the institution's official channel. The video was embedded on the website "
                "for related content but does not represent the institution's social media presence."
            )
            claim['claim_type_corrected'] = 'embedded_video_third_party'
            # Only the first matching claim is annotated.
            return True
    return False


def _flag_youtube_provenance(entry: dict, timestamp: str) -> bool:
    """Mark the first youtube provenance source as corrected; return True if it was marked."""
    provenance = entry.get('provenance')
    sources = provenance.get('sources') if isinstance(provenance, dict) else None
    youtube_prov = sources.get('youtube') if isinstance(sources, dict) else None
    if not (isinstance(youtube_prov, list) and youtube_prov and isinstance(youtube_prov[0], dict)):
        return False

    first_source = youtube_prov[0]
    first_source['misattribution_corrected'] = True
    first_source['correction_timestamp'] = timestamp
    first_source['correction_note'] = (
        "Channel was third-party (Siemes Sand und Kiesbaggerei), not institution channel. "
        "Data moved to misattributed_enrichments section."
    )
    return True


def fix_entry(entry_path: Path) -> None:
    """Fix the YouTube misattribution in the given entry file.

    The entry's ``youtube_enrichment`` section is moved into a new record under
    ``misattributed_enrichments``; the first ``social_youtube`` web claim and the
    first ``youtube`` provenance source are annotated when they are present. The
    file is rewritten in place. If the entry has no (or an empty)
    ``youtube_enrichment`` the file is left untouched.

    Args:
        entry_path: Path of the YAML entry file to correct.

    Raises:
        ValueError: If the file does not contain a YAML mapping at its top level
            (for example, when the file is empty).
    """
    print(f"Loading {entry_path}...")
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    # An empty file parses to None; fail with a clear message instead of an
    # AttributeError on `.pop` further down.
    if not isinstance(entry, dict):
        raise ValueError(f"Entry file does not contain a YAML mapping: {entry_path}")

    # 1. Extract and remove youtube_enrichment
    youtube_enrichment = entry.pop('youtube_enrichment', None)
    if not youtube_enrichment:
        print("No youtube_enrichment found - nothing to fix")
        return

    channel = youtube_enrichment.get('channel') if isinstance(youtube_enrichment, dict) else None
    channel_title = channel.get('title', 'Unknown') if isinstance(channel, dict) else 'Unknown'
    print(f"Found youtube_enrichment for channel: {channel_title}")

    # One timestamp for the whole correction so every field written below agrees.
    timestamp = datetime.now(timezone.utc).isoformat()

    # 2. Create misattributed_enrichments section (a present-but-null key is
    #    treated the same as a missing one).
    records = entry.get('misattributed_enrichments')
    if records is None:
        records = entry['misattributed_enrichments'] = []
    records.append(_build_misattribution_record(youtube_enrichment, timestamp))

    # 3. Update the social_youtube claim in web_claims
    claim_updated = _flag_social_youtube_claim(entry)
    if claim_updated:
        print("Updated social_youtube claim with misattribution note")

    # 4. Update provenance to note the correction
    provenance_updated = _flag_youtube_provenance(entry, timestamp)
    if provenance_updated:
        print("Updated provenance with correction note")

    # 5. Write back the corrected entry. Serialise first: opening the file in
    #    'w' mode truncates it, so a dump error must surface before that point.
    serialized = yaml.dump(entry, default_flow_style=False, allow_unicode=True, sort_keys=False, width=120)
    print(f"Writing corrected entry to {entry_path}...")
    with open(entry_path, 'w', encoding='utf-8') as f:
        f.write(serialized)

    # Report only what actually happened.
    print("Done! YouTube misattribution has been corrected.")
    print(" - youtube_enrichment moved to misattributed_enrichments")
    if claim_updated:
        print(" - social_youtube claim updated with misattribution note")
    else:
        print(" - No social_youtube claim found; web_claims left unchanged")
    if provenance_updated:
        print(" - Provenance updated with correction metadata")
    else:
        print(" - No youtube provenance entry found; provenance left unchanged")
def main():
    """Run the fix against the Heemkunde Arcen entry file.

    Returns 0 after the entry has been processed, or 1 when the entry file
    does not exist.
    """
    target = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries/0386_heemkunde_arcen.yaml')
    if target.exists():
        fix_entry(target)
        return 0

    print(f"Error: Entry file not found: {target}")
    return 1
if __name__ == '__main__':
    # The bare `exit` helper is injected by the `site` module for interactive
    # sessions and can be missing (e.g. under `python -S`). Raising SystemExit
    # propagates main()'s return code as the exit status without any import.
    raise SystemExit(main())