# glam/scripts/patch_derived_from_fast.py

#!/usr/bin/env python3
"""
Fast patch for missing wasDerivedFrom fields in YAML enrichment sections.

Uses regex-based detection for speed, then ruamel.yaml only for files that need changes.

Usage:
    python scripts/patch_derived_from_fast.py
"""

import re
import sys
from datetime import datetime, timezone
from pathlib import Path

try:
    from ruamel.yaml import YAML  # type: ignore
    from ruamel.yaml.comments import CommentedMap  # type: ignore
except ImportError:
    print("ERROR: ruamel.yaml not installed. Run: pip install ruamel.yaml")
    sys.exit(1)


def needs_youtube_patch(content: str) -> bool:
    """Cheaply guess whether the top-level youtube_enrichment section lacks wasDerivedFrom.

    This is a text-only pre-filter; a True result is expected to be confirmed
    by real YAML parsing, so it errs on the side of reporting True.

    Args:
        content: Full text of a YAML file.

    Returns:
        True when a top-level ``youtube_enrichment:`` section contains a key
        a source URL can be built from (``channel_url``, ``source_url`` or
        ``channel_id``), has a ``_provenance:`` block, and that block either
        has no ``prov:`` sub-map or its ``prov:`` sub-map does not mention
        ``wasDerivedFrom:``. False otherwise.
    """
    # Fast substring reject before running any regex.
    if 'youtube_enrichment:' not in content:
        return False

    # Anchor to the start of a line so that an indented or prefixed mention
    # (e.g. a nested "youtube_enrichment:" entry, "old_youtube_enrichment:")
    # cannot be mistaken for the top-level section.
    section_match = re.search(
        r'^youtube_enrichment:.*?(?=\n\w|\Z)', content, re.DOTALL | re.MULTILINE
    )
    if section_match is None:
        return False
    section = section_match.group()

    # Without one of these keys inside the section no URL can be derived.
    if not any(key in section for key in ('channel_url:', 'source_url:', 'channel_id:')):
        return False

    if '_provenance:' not in section:
        return False

    # Capture the "prov:" sub-map: everything up to the next key indented by
    # exactly two or four spaces, or the end of the section.
    prov_match = re.search(
        r'_provenance:.*?prov:.*?(?=\n    \w|\n  \w|\Z)', section, re.DOTALL
    )
    if prov_match is None:
        # _provenance exists but has no prov sub-map at all, so
        # wasDerivedFrom is necessarily missing.
        return True

    return 'wasDerivedFrom:' not in prov_match.group()


def needs_zcbs_patch(content: str) -> bool:
    """Cheaply guess whether the top-level zcbs_enrichment section lacks wasDerivedFrom.

    This is a text-only pre-filter; a True result is expected to be confirmed
    by real YAML parsing, so it errs on the side of reporting True.

    Args:
        content: Full text of a YAML file.

    Returns:
        True when a top-level ``zcbs_enrichment:`` section has a
        ``_provenance:`` block that either has no ``prov:`` sub-map or whose
        ``prov:`` sub-map does not mention ``wasDerivedFrom:``.
        False otherwise.
    """
    # Fast substring reject before running any regex.
    if 'zcbs_enrichment:' not in content:
        return False

    # Anchor to the start of a line so that an indented or prefixed mention
    # of the key cannot be mistaken for the top-level section.
    section_match = re.search(
        r'^zcbs_enrichment:.*?(?=\n\w|\Z)', content, re.DOTALL | re.MULTILINE
    )
    if section_match is None:
        return False
    section = section_match.group()

    if '_provenance:' not in section:
        return False

    # Capture the "prov:" sub-map: everything up to the next key indented by
    # exactly two or four spaces, or the end of the section.
    prov_match = re.search(
        r'_provenance:.*?prov:.*?(?=\n    \w|\n  \w|\Z)', section, re.DOTALL
    )
    if prov_match is None:
        # _provenance exists but has no prov sub-map at all, so
        # wasDerivedFrom is necessarily missing.
        return True

    return 'wasDerivedFrom:' not in prov_match.group()


def patch_youtube_section(section: dict) -> bool:
    """Add wasDerivedFrom to youtube_enrichment section.

    The source URL is taken from ``channel_url``, then ``source_url``, and
    finally built from ``channel_id``. When ``generatedAtTime`` or
    ``wasGeneratedBy`` are missing from the ``prov`` map they are filled in
    as well.

    Args:
        section: The parsed ``youtube_enrichment`` mapping; modified in place.

    Returns:
        True if ``wasDerivedFrom`` was added. False if the section has an
        unexpected shape, has no (or an empty) ``_provenance``, already has
        ``wasDerivedFrom``, or offers no URL. The section is left untouched
        whenever False is returned.
    """
    # Sections or provenance blocks that are not mappings cannot be patched.
    if not isinstance(section, dict):
        return False
    provenance = section.get('_provenance')
    if not provenance or not isinstance(provenance, dict):
        return False

    prov = provenance.get('prov')
    if prov:
        if not isinstance(prov, dict):
            return False
        if prov.get('wasDerivedFrom'):
            return False

    # Resolve the URL before touching the data, so a section without any
    # usable URL does not end up with an empty "prov" map attached.
    url = section.get('channel_url') or section.get('source_url')
    if not url and section.get('channel_id'):
        url = f"https://www.youtube.com/channel/{section['channel_id']}"
    if not url:
        return False

    if not prov:
        prov = CommentedMap()
        provenance['prov'] = prov

    prov['wasDerivedFrom'] = url

    if not prov.get('generatedAtTime'):
        # Prefer the recorded fetch time; fall back to the current UTC time.
        prov['generatedAtTime'] = section.get('fetch_timestamp') or datetime.now(timezone.utc).isoformat()

    if not prov.get('wasGeneratedBy'):
        generated_by = CommentedMap()
        generated_by['@type'] = 'prov:Activity'
        generated_by['name'] = 'youtube_api_fetch'
        generated_by['used'] = 'https://www.googleapis.com/youtube/v3'
        prov['wasGeneratedBy'] = generated_by

    return True


def patch_zcbs_section(section: dict) -> bool:
    """Add wasDerivedFrom to zcbs_enrichment section.

    The source URL is taken from ``source``, then from ``platform_urls``
    (``website``, ``catalog``, otherwise the first non-empty value), and
    finally built from ``zcbs_id``. When ``generatedAtTime`` or
    ``wasGeneratedBy`` are missing from the ``prov`` map they are filled in
    as well.

    Args:
        section: The parsed ``zcbs_enrichment`` mapping; modified in place.

    Returns:
        True if ``wasDerivedFrom`` was added. False if the section has an
        unexpected shape, has no (or an empty) ``_provenance``, already has
        ``wasDerivedFrom``, or offers no URL. The section is left untouched
        whenever False is returned.
    """
    # Sections or provenance blocks that are not mappings cannot be patched.
    if not isinstance(section, dict):
        return False
    provenance = section.get('_provenance')
    if not provenance or not isinstance(provenance, dict):
        return False

    prov = provenance.get('prov')
    if prov:
        if not isinstance(prov, dict):
            return False
        if prov.get('wasDerivedFrom'):
            return False

    # Resolve the URL before touching the data, so a section without any
    # usable URL does not end up with an empty "prov" map attached.
    url = section.get('source')
    if not url:
        platform_urls = section.get('platform_urls')
        # Only a mapping can be queried by name; other shapes are skipped.
        if isinstance(platform_urls, dict):
            url = (
                platform_urls.get('website')
                or platform_urls.get('catalog')
                # Skip empty/None entries instead of stopping at the first value.
                or next((value for value in platform_urls.values() if value), None)
            )
    if not url and section.get('zcbs_id'):
        url = f"https://www.zcbs.nl/organisatie/{section['zcbs_id']}"
    if not url:
        return False

    if not prov:
        prov = CommentedMap()
        provenance['prov'] = prov

    prov['wasDerivedFrom'] = url

    if not prov.get('generatedAtTime'):
        # Prefer the recorded enrichment time; fall back to the current UTC time.
        prov['generatedAtTime'] = section.get('enrichment_timestamp') or datetime.now(timezone.utc).isoformat()

    if not prov.get('wasGeneratedBy'):
        generated_by = CommentedMap()
        generated_by['@type'] = 'prov:Activity'
        generated_by['name'] = 'zcbs_registry_fetch'
        generated_by['used'] = 'https://www.zcbs.nl'
        prov['wasGeneratedBy'] = generated_by

    return True


def process_file(filepath: Path, yaml: YAML) -> dict:
    """Process a single file.

    Reads the file, runs the regex pre-filters, and only when one of them
    fires parses the YAML, patches the matching sections and writes the file
    back.

    Args:
        filepath: YAML file to inspect and possibly rewrite in place.
        yaml: Configured ruamel.yaml instance used for loading and dumping.

    Returns:
        A dict with keys ``modified`` (file was rewritten), ``youtube`` and
        ``zcbs`` (that section was patched) and ``error`` (None, or a message
        describing a read, parse, patch or write failure). Failures are
        reported through ``error`` rather than raised.
    """
    import io

    result = {'modified': False, 'youtube': False, 'zcbs': False, 'error': None}

    try:
        content = filepath.read_text(encoding='utf-8')
    except Exception as e:
        result['error'] = str(e)
        return result

    # Quick check if file needs patching
    needs_yt = needs_youtube_patch(content)
    needs_zc = needs_zcbs_patch(content)

    if not needs_yt and not needs_zc:
        return result

    # Parse the text already in memory instead of reading the file again.
    try:
        data = yaml.load(content)
    except Exception as e:
        result['error'] = str(e)
        return result

    # Only a top-level mapping can hold the enrichment sections.
    if not isinstance(data, dict):
        return result

    # A malformed section must not abort the whole batch run.
    try:
        if needs_yt and 'youtube_enrichment' in data:
            result['youtube'] = bool(patch_youtube_section(data['youtube_enrichment']))
        if needs_zc and 'zcbs_enrichment' in data:
            result['zcbs'] = bool(patch_zcbs_section(data['zcbs_enrichment']))
    except Exception as e:
        result['youtube'] = False
        result['zcbs'] = False
        result['error'] = f"Patch error: {e}"
        return result

    if not (result['youtube'] or result['zcbs']):
        return result

    try:
        # Serialize fully in memory first: opening the target with 'w'
        # truncates it, so a dump failure would otherwise destroy the file.
        buffer = io.StringIO()
        yaml.dump(data, buffer)
        filepath.write_text(buffer.getvalue(), encoding='utf-8')
    except Exception as e:
        result['error'] = f"Write error: {e}"
    else:
        result['modified'] = True

    return result


def main():
    """Scan data/custodian/*.yaml and patch files missing wasDerivedFrom.

    Runs in two passes: a regex scan over every file to collect candidates,
    then a full YAML parse-and-patch of the candidates only. Prints progress
    and a summary; files that cannot be read or processed are reported and
    counted as errors instead of stopping the run.
    """
    yaml = YAML()
    yaml.preserve_quotes = True
    yaml.default_flow_style = False
    yaml.width = 4096

    # The data directory is resolved relative to this script's parent folder.
    script_dir = Path(__file__).parent
    base_dir = script_dir.parent
    custodian_dir = base_dir / 'data' / 'custodian'

    # Sorted so progress output and processing order are reproducible.
    yaml_files = sorted(custodian_dir.glob('*.yaml'))
    total_files = len(yaml_files)

    print(f"Scanning {total_files} files for missing wasDerivedFrom...")

    stats = {
        'files_modified': 0,
        'youtube_patched': 0,
        'zcbs_patched': 0,
        'errors': 0,
    }

    # First pass: identify files that need patching (fast regex scan)
    candidates = []
    for index, filepath in enumerate(yaml_files, start=1):
        if index % 5000 == 0:
            print(f"  Scan progress: {index}/{total_files}")
        try:
            content = filepath.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            # Report unreadable files instead of dropping them silently, and
            # let KeyboardInterrupt/SystemExit propagate.
            stats['errors'] += 1
            print(f"  ERROR: {filepath.name}: {e}")
            continue
        if needs_youtube_patch(content) or needs_zcbs_patch(content):
            candidates.append(filepath)

    print(f"\nFound {len(candidates)} files that may need patching.")
    print("Processing with full YAML parser...")

    # Second pass: parse and patch only the candidates.
    for index, filepath in enumerate(candidates, start=1):
        if index % 100 == 0:
            print(f"  Patch progress: {index}/{len(candidates)}")

        result = process_file(filepath, yaml)

        if result['error']:
            stats['errors'] += 1
            print(f"  ERROR: {filepath.name}: {result['error']}")
        elif result['modified']:
            stats['files_modified'] += 1
            if result['youtube']:
                stats['youtube_patched'] += 1
            if result['zcbs']:
                stats['zcbs_patched'] += 1

    print()
    print("=" * 60)
    print("PATCH SUMMARY")
    print("=" * 60)
    print(f"Files modified:      {stats['files_modified']:,}")
    print(f"YouTube patched:     {stats['youtube_patched']:,}")
    print(f"ZCBS patched:        {stats['zcbs_patched']:,}")
    print(f"Errors:              {stats['errors']:,}")
    print()
    print(f"Total wasDerivedFrom fields added: {stats['youtube_patched'] + stats['zcbs_patched']:,}")

# Run the patcher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()