117 lines
5.1 KiB
Python
117 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix YouTube channel misattribution in entry 0386 (Heemkunde Arcen).
|
|
|
|
The scraper found an embedded YouTube video from a third-party channel
|
|
(Siemes Sand und Kiesbaggerei - a German sand/gravel company) and incorrectly
|
|
enriched the heritage institution entry with that channel's data.
|
|
|
|
This script:
|
|
1. Moves the youtube_enrichment to misattributed_enrichments with explanation
|
|
2. Updates the social_youtube claim to note it's third-party embedded content
|
|
3. Updates the provenance to reflect the correction
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
def _flag_social_youtube_claim(entry: dict) -> bool:
    """Annotate the first ``social_youtube`` web claim as third-party content.

    Returns True when a claim was annotated, False when ``web_claims.claims``
    is missing, malformed, or contains no ``social_youtube`` claim.
    """
    web_claims = entry.get('web_claims')
    claims = web_claims.get('claims') if isinstance(web_claims, dict) else None
    if not isinstance(claims, list):
        return False

    for claim in claims:
        if not isinstance(claim, dict) or claim.get('claim_type') != 'social_youtube':
            continue
        claim['misattribution_detected'] = True
        claim['misattribution_note'] = (
            "This YouTube link points to 'Siemes Sand und Kiesbaggerei' (third-party channel), "
            "not the institution's official channel. The video was embedded on the website "
            "for related content but does not represent the institution's social media presence."
        )
        claim['claim_type_corrected'] = 'embedded_video_third_party'
        print("Updated social_youtube claim with misattribution note")
        # Only the first matching claim is annotated.
        return True
    return False


def _annotate_youtube_provenance(entry: dict, timestamp: str) -> bool:
    """Record the correction on the first ``provenance.sources.youtube`` item.

    Returns True when the provenance record was annotated, False when the
    expected ``provenance -> sources -> youtube -> [0]`` structure is absent.
    """
    provenance = entry.get('provenance')
    sources = provenance.get('sources') if isinstance(provenance, dict) else None
    youtube_prov = sources.get('youtube') if isinstance(sources, dict) else None
    if not (isinstance(youtube_prov, list) and youtube_prov and isinstance(youtube_prov[0], dict)):
        return False

    first_source = youtube_prov[0]
    first_source['misattribution_corrected'] = True
    first_source['correction_timestamp'] = timestamp
    first_source['correction_note'] = (
        "Channel was third-party (Siemes Sand und Kiesbaggerei), not institution channel. "
        "Data moved to misattributed_enrichments section."
    )
    print("Updated provenance with correction note")
    return True


def fix_entry(entry_path: Path) -> None:
    """Fix the YouTube misattribution in the given entry file.

    Loads the YAML entry, moves its ``youtube_enrichment`` value into a new
    record appended to ``misattributed_enrichments``, annotates the first
    ``social_youtube`` web claim and the first YouTube provenance source when
    they are present, and rewrites the file in place. If the entry has no
    (or an empty) ``youtube_enrichment`` the file is left untouched.

    Args:
        entry_path: Path to the YAML entry file to correct.

    Raises:
        ValueError: If the file's top-level YAML value is not a mapping
            (for example, an empty file).
    """
    print(f"Loading {entry_path}...")
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    # safe_load returns None for an empty document; fail with a clear message
    # instead of an AttributeError on the first dict access.
    if not isinstance(entry, dict):
        raise ValueError(
            f"Expected a YAML mapping at the top level of {entry_path}, "
            f"got {type(entry).__name__}"
        )

    # 1. Extract and remove youtube_enrichment
    youtube_enrichment = entry.pop('youtube_enrichment', None)
    if not youtube_enrichment:
        print("No youtube_enrichment found - nothing to fix")
        return

    # Tolerate a missing or null 'channel' value when reporting the title.
    channel = youtube_enrichment.get('channel') if isinstance(youtube_enrichment, dict) else None
    channel_title = channel.get('title', 'Unknown') if isinstance(channel, dict) else 'Unknown'
    print(f"Found youtube_enrichment for channel: {channel_title}")

    # One timestamp for the whole correction so all records agree.
    corrected_at = datetime.now(timezone.utc).isoformat()

    # 2. Create misattributed_enrichments section
    misattribution_record = {
        'enrichment_type': 'youtube',
        'original_data': youtube_enrichment,
        'misattribution_detected': corrected_at,
        'reason': (
            "YouTube channel 'Siemes Sand und Kiesbaggerei' (German sand/gravel dredging company) "
            "was incorrectly attributed to Stichting Heemkunde Arcen. The video was embedded on "
            "the institution's website because it relates to 'Limburgse Maasschippers' (Maas river "
            "shipping history), but the channel itself belongs to the dredging company, not the "
            "heritage institution."
        ),
        'source_claim': {
            'claim_type': 'social_youtube',
            'claim_value': 'https://www.youtube.com/@siemessandundkiesbaggerei3693',
            'xpath': '/html/body/div[1]/div[4]/div[2]/div/div/div/div[11]/div/div/div/div[3]/div[17]/div/div/a',
        },
        'correct_interpretation': 'third_party_embedded_video',
        'correction_method': 'manual_review',
        'correction_timestamp': corrected_at,
    }

    # A key that is present but null (empty YAML value) is treated as missing.
    misattributed = entry.get('misattributed_enrichments')
    if misattributed is None:
        misattributed = entry['misattributed_enrichments'] = []
    misattributed.append(misattribution_record)

    # 3. Update the social_youtube claim in web_claims
    claim_updated = _flag_social_youtube_claim(entry)

    # 4. Update provenance to note the correction
    provenance_updated = _annotate_youtube_provenance(entry, corrected_at)

    # 5. Write back the corrected entry.
    # Serialise before opening the file: opening in 'w' mode truncates it, so
    # a failure inside yaml.dump would otherwise destroy the existing entry.
    serialized = yaml.dump(
        entry,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
        width=120,
    )
    print(f"Writing corrected entry to {entry_path}...")
    with open(entry_path, 'w', encoding='utf-8') as f:
        f.write(serialized)

    # Report only what was actually changed.
    print("Done! YouTube misattribution has been corrected.")
    print(" - youtube_enrichment moved to misattributed_enrichments")
    if claim_updated:
        print(" - social_youtube claim updated with misattribution note")
    else:
        print(" - No social_youtube claim found in web_claims; claim left unchanged")
    if provenance_updated:
        print(" - Provenance updated with correction metadata")
    else:
        print(" - No YouTube provenance source found; provenance left unchanged")
|
|
|
|
|
|
def main(argv=None):
    """Run the misattribution fix on one entry file.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When omitted, ``sys.argv[1:]`` is used. The first argument,
            if any, is the path of the entry file to fix; with no arguments
            the original hard-coded 0386 entry path is used.

    Returns:
        0 after ``fix_entry`` has run, 1 if the entry path is not an existing
        regular file.
    """
    import sys

    default_path = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries/0386_heemkunde_arcen.yaml')

    if argv is None:
        argv = sys.argv[1:]
    entry_path = Path(argv[0]) if argv else default_path

    # is_file() also rejects directories, which exists() would let through
    # only to fail later when the path is opened.
    if not entry_path.is_file():
        print(f"Error: Entry file not found: {entry_path}", file=sys.stderr)
        return 1

    fix_entry(entry_path)
    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # The bare `exit` helper is installed by the `site` module for interactive
    # use and is unavailable under `python -S`; raising SystemExit always works.
    raise SystemExit(main())
|