#!/usr/bin/env python3
"""
Fast patch for missing wasDerivedFrom fields in YAML enrichment sections.

Uses regex-based detection for speed, then ruamel.yaml only for files that
need changes.

Usage:
    python scripts/patch_derived_from_fast.py
"""

import io
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

try:
    from ruamel.yaml import YAML  # type: ignore
    from ruamel.yaml.comments import CommentedMap  # type: ignore
except ImportError:
    # Do not exit at import time: the regex/dict helpers below do not need
    # ruamel.yaml.  main() reports the missing dependency and exits with
    # status 1 before any file is touched.
    YAML = None
    CommentedMap = dict


# Text of an enrichment section: from its key up to (not including) the next
# line that starts with a word character, or the end of the input.
_YOUTUBE_SECTION_RE = re.compile(r'youtube_enrichment:.*?(?=\n\w|\Z)', re.DOTALL)
_ZCBS_SECTION_RE = re.compile(r'zcbs_enrichment:.*?(?=\n\w|\Z)', re.DOTALL)

# Text from `_provenance:` through its `prov:` entry.
# NOTE(review): the two `\n \w` alternatives are identical as written, so one
# of them is redundant; possibly they were meant to cover two different
# indentation depths - confirm before changing the pattern.
_PROV_BLOCK_RE = re.compile(r'_provenance:.*?prov:.*?(?=\n \w|\n \w|\Z)', re.DOTALL)


def _section_lacks_derived_from(content: str, section_re) -> bool:
    """Return True if the section matched by *section_re* has a prov block without wasDerivedFrom.

    This is a text-level heuristic only; callers confirm the result by
    parsing the YAML before modifying anything.
    """
    section_match = section_re.search(content)
    if not section_match:
        return False
    section_text = section_match.group()
    if '_provenance:' not in section_text:
        return False
    prov_match = _PROV_BLOCK_RE.search(section_text)
    if not prov_match:
        return False
    return 'wasDerivedFrom:' not in prov_match.group()


def needs_youtube_patch(content: str) -> bool:
    """Check if file has youtube_enrichment without wasDerivedFrom but with channel_url.

    Args:
        content: Raw text of a YAML file.

    Returns:
        True when the text contains a ``youtube_enrichment:`` section, a
        ``channel_url:`` or ``channel_id:`` key somewhere in the file, and a
        ``_provenance: ... prov:`` block in that section that does not
        mention ``wasDerivedFrom:``.
    """
    if 'youtube_enrichment:' not in content:
        return False
    if 'channel_url:' not in content and 'channel_id:' not in content:
        return False
    return _section_lacks_derived_from(content, _YOUTUBE_SECTION_RE)


def needs_zcbs_patch(content: str) -> bool:
    """Check if file has zcbs_enrichment without wasDerivedFrom.

    Args:
        content: Raw text of a YAML file.

    Returns:
        True when the text contains a ``zcbs_enrichment:`` section whose
        ``_provenance: ... prov:`` block does not mention ``wasDerivedFrom:``.
    """
    if 'zcbs_enrichment:' not in content:
        return False
    return _section_lacks_derived_from(content, _ZCBS_SECTION_RE)


def _youtube_source_url(section: dict):
    """Return the source URL for a youtube_enrichment section, or None."""
    url = section.get('channel_url') or section.get('source_url')
    if not url and section.get('channel_id'):
        url = f"https://www.youtube.com/channel/{section['channel_id']}"
    return url


def _zcbs_source_url(section: dict):
    """Return the source URL for a zcbs_enrichment section, or None."""
    url = section.get('source')
    if not url:
        platform_urls = section.get('platform_urls')
        if isinstance(platform_urls, dict) and platform_urls:
            # Preference order: website, catalog, then whichever value is first.
            url = (
                platform_urls.get('website')
                or platform_urls.get('catalog')
                or next(iter(platform_urls.values()), None)
            )
    if not url and section.get('zcbs_id'):
        url = f"https://www.zcbs.nl/organisatie/{section['zcbs_id']}"
    return url


def _add_derived_from(section, resolve_url, timestamp_key, activity_name, activity_used) -> bool:
    """Fill in ``_provenance.prov.wasDerivedFrom`` (and missing companions) on *section*.

    The section is left completely untouched whenever False is returned.

    Args:
        section: Parsed enrichment mapping (modified in place on success).
        resolve_url: Callable taking the section and returning a URL or None.
        timestamp_key: Section key used as ``generatedAtTime`` when present.
        activity_name: ``name`` of the generated ``wasGeneratedBy`` activity.
        activity_used: ``used`` of the generated ``wasGeneratedBy`` activity.

    Returns:
        True if ``wasDerivedFrom`` was added, False otherwise.
    """
    if not isinstance(section, dict):
        return False
    provenance = section.get('_provenance')
    if not provenance or not isinstance(provenance, dict):
        return False

    prov = provenance.get('prov')
    if prov:
        # A non-mapping prov cannot be patched; an existing value is kept.
        if not isinstance(prov, dict) or prov.get('wasDerivedFrom'):
            return False

    # Resolve the URL before creating anything, so an unpatchable section
    # does not end up with a stray empty `prov` mapping.
    url = resolve_url(section)
    if not url:
        return False

    if not prov:
        prov = CommentedMap()
        provenance['prov'] = prov

    prov['wasDerivedFrom'] = url

    if not prov.get('generatedAtTime'):
        prov['generatedAtTime'] = (
            section.get(timestamp_key) or datetime.now(timezone.utc).isoformat()
        )

    if not prov.get('wasGeneratedBy'):
        generated_by = CommentedMap()
        generated_by['@type'] = 'prov:Activity'
        generated_by['name'] = activity_name
        generated_by['used'] = activity_used
        prov['wasGeneratedBy'] = generated_by

    return True


def patch_youtube_section(section: dict) -> bool:
    """Add wasDerivedFrom to youtube_enrichment section.

    The URL is taken from ``channel_url``, then ``source_url``, then built
    from ``channel_id``.  ``generatedAtTime`` (from ``fetch_timestamp`` or
    the current UTC time) and ``wasGeneratedBy`` are added only if missing.

    Args:
        section: Parsed ``youtube_enrichment`` mapping; modified in place.

    Returns:
        True if the section was patched; False if it has no ``_provenance``,
        already has ``wasDerivedFrom``, or no URL could be determined (the
        section is not modified in those cases).
    """
    return _add_derived_from(
        section,
        _youtube_source_url,
        'fetch_timestamp',
        'youtube_api_fetch',
        'https://www.googleapis.com/youtube/v3',
    )


def patch_zcbs_section(section: dict) -> bool:
    """Add wasDerivedFrom to zcbs_enrichment section.

    The URL is taken from ``source``, then from ``platform_urls`` (website,
    catalog, or its first value), then built from ``zcbs_id``.
    ``generatedAtTime`` (from ``enrichment_timestamp`` or the current UTC
    time) and ``wasGeneratedBy`` are added only if missing.

    Args:
        section: Parsed ``zcbs_enrichment`` mapping; modified in place.

    Returns:
        True if the section was patched; False if it has no ``_provenance``,
        already has ``wasDerivedFrom``, or no URL could be determined (the
        section is not modified in those cases).
    """
    return _add_derived_from(
        section,
        _zcbs_source_url,
        'enrichment_timestamp',
        'zcbs_registry_fetch',
        'https://www.zcbs.nl',
    )


def process_file(filepath: Path, yaml: "YAML") -> dict:
    """Process a single file.

    Args:
        filepath: YAML file to inspect and, if needed, rewrite in place.
        yaml: Configured ruamel.yaml ``YAML`` instance used to load and dump.

    Returns:
        A dict with keys ``modified``, ``youtube`` and ``zcbs`` (bools) and
        ``error`` (message string, or None when nothing went wrong).
    """
    result = {'modified': False, 'youtube': False, 'zcbs': False, 'error': None}

    try:
        content = filepath.read_text(encoding='utf-8')
    except (OSError, UnicodeError) as e:
        result['error'] = str(e)
        return result

    # Quick check if file needs patching
    needs_yt = needs_youtube_patch(content)
    needs_zc = needs_zcbs_patch(content)
    if not (needs_yt or needs_zc):
        return result

    # Parse the text we already read instead of opening the file again.
    try:
        data = yaml.load(content)
    except Exception as e:  # parser errors are reported per file, not raised
        result['error'] = str(e)
        return result

    # The regex scan is only a heuristic; a non-mapping document has no
    # enrichment sections to patch.
    if not isinstance(data, dict):
        return result

    if needs_yt and patch_youtube_section(data.get('youtube_enrichment')):
        result['youtube'] = True
    if needs_zc and patch_zcbs_section(data.get('zcbs_enrichment')):
        result['zcbs'] = True
    if not (result['youtube'] or result['zcbs']):
        return result

    try:
        # Serialize fully in memory first so that a dump failure cannot
        # leave the original file truncated.
        buffer = io.StringIO()
        yaml.dump(data, buffer)
        filepath.write_text(buffer.getvalue(), encoding='utf-8')
        result['modified'] = True
    except Exception as e:
        result['error'] = f"Write error: {e}"

    return result


def main():
    """Scan ``data/custodian/*.yaml`` next to the scripts directory and patch files in place.

    Exits with status 1 if ruamel.yaml is not installed.  Prints progress,
    per-file errors and a final summary to stdout.
    """
    if YAML is None:
        print("ERROR: ruamel.yaml not installed. Run: pip install ruamel.yaml")
        sys.exit(1)

    yaml = YAML()
    yaml.preserve_quotes = True
    yaml.default_flow_style = False
    yaml.width = 4096

    script_dir = Path(__file__).parent
    base_dir = script_dir.parent
    custodian_dir = base_dir / 'data' / 'custodian'

    # Sorted so that runs process files in a reproducible order.
    yaml_files = sorted(custodian_dir.glob('*.yaml'))
    total_files = len(yaml_files)

    print(f"Scanning {total_files} files for missing wasDerivedFrom...")

    # First pass: identify files that need patching (fast regex scan)
    candidates = []
    for i, filepath in enumerate(yaml_files, start=1):
        if i % 5000 == 0:
            print(f" Scan progress: {i}/{total_files}")
        try:
            content = filepath.read_text(encoding='utf-8')
        except (OSError, UnicodeError) as e:
            # Best effort: an unreadable file is skipped, but not silently.
            print(f" WARNING: skipping unreadable file {filepath.name}: {e}")
            continue
        if needs_youtube_patch(content) or needs_zcbs_patch(content):
            candidates.append(filepath)

    print(f"\nFound {len(candidates)} files that may need patching.")
    print("Processing with full YAML parser...")

    stats = {
        'files_modified': 0,
        'youtube_patched': 0,
        'zcbs_patched': 0,
        'errors': 0,
    }

    for i, filepath in enumerate(candidates, start=1):
        if i % 100 == 0:
            print(f" Patch progress: {i}/{len(candidates)}")

        result = process_file(filepath, yaml)

        if result['error']:
            stats['errors'] += 1
            print(f" ERROR: {filepath.name}: {result['error']}")
        elif result['modified']:
            stats['files_modified'] += 1
            if result['youtube']:
                stats['youtube_patched'] += 1
            if result['zcbs']:
                stats['zcbs_patched'] += 1

    print()
    print("=" * 60)
    print("PATCH SUMMARY")
    print("=" * 60)
    print(f"Files modified: {stats['files_modified']:,}")
    print(f"YouTube patched: {stats['youtube_patched']:,}")
    print(f"ZCBS patched: {stats['zcbs_patched']:,}")
    print(f"Errors: {stats['errors']:,}")
    print()
    print(f"Total wasDerivedFrom fields added: {stats['youtube_patched'] + stats['zcbs_patched']:,}")


if __name__ == '__main__':
    main()