# glam/scripts/patch_web_enrichment_provenance.py

#!/usr/bin/env python3
"""
Patch web_enrichment provenance to add missing wasDerivedFrom.

Sources for wasDerivedFrom (in priority order):
1. web_archives[0].url (original source)
2. archive_metadata.pages_archived[0].url
3. claims[0].source_url
4. source_url at the root of web_enrichment
"""

import sys
from pathlib import Path

# ruamel.yaml is a hard requirement: the script exits with status 1 at import
# time if it is missing, after printing an installation hint.
try:
    from ruamel.yaml import YAML
    # One shared YAML instance is used for both loading and dumping files.
    yaml = YAML()
    # Ask ruamel to keep the original quoting of scalars when writing back.
    yaml.preserve_quotes = True
    # Very large output width — presumably so long values such as URLs are
    # not wrapped onto several lines when dumped.
    yaml.width = 4096
except ImportError:
    print("Error: ruamel.yaml required. Install with: pip install ruamel.yaml")
    sys.exit(1)

# Directory scanned for "*.yaml" files when no --file is given. The path is
# relative, so it is resolved against the current working directory.
CUSTODIAN_DIR = Path("data/custodian")


def get_web_source_url(web_enrichment: dict) -> str | None:
    """Extract source URL from web_enrichment section."""

    # Priority 1: web_archives[0].url (most authoritative)
    web_archives = web_enrichment.get('web_archives', [])
    if web_archives and isinstance(web_archives, list) and len(web_archives) > 0:
        url = web_archives[0].get('url')
        if url:
            return url

    # Priority 2: archive_metadata.pages_archived[0].url
    archive_meta = web_enrichment.get('archive_metadata', {})
    pages = archive_meta.get('pages_archived', [])
    if pages and isinstance(pages, list) and len(pages) > 0:
        url = pages[0].get('url')
        if url:
            return url

    # Priority 3: claims[0].source_url
    claims = web_enrichment.get('claims', [])
    if claims and isinstance(claims, list) and len(claims) > 0:
        url = claims[0].get('source_url')
        if url:
            return url

    # Priority 4: source_url at root level
    if 'source_url' in web_enrichment:
        return web_enrichment['source_url']

    return None


def patch_file(filepath: Path, dry_run: bool = False, verbose: bool = False) -> tuple[bool, int]:
    """Add a missing ``wasDerivedFrom`` to one file's web_enrichment provenance.

    The file is loaded with the module-level ruamel ``yaml`` instance. It is
    patched only when it has a ``web_enrichment`` mapping containing a
    ``_provenance`` mapping whose ``prov`` section lacks ``wasDerivedFrom``,
    and ``get_web_source_url`` finds a source URL. Sections that are absent
    or null are treated the same way.

    Args:
        filepath: YAML file to inspect and, if needed, rewrite in place.
        dry_run: When true, report the change but do not write the file.
        verbose: When true, print what was added or why the file was skipped.

    Returns:
        ``(modified, count)``: whether a field was (or would be) added, and
        the number of fields added (0 or 1).

    Raises:
        TypeError: If ``_provenance.prov`` exists but is neither a mapping
            nor null, so the field cannot be added without clobbering data.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.load(f)

    # Empty documents and non-mapping roots have nothing to patch.
    if not isinstance(data, dict):
        return False, 0

    # "web_enrichment:" or "_provenance:" with no value loads as None; treat
    # that like a missing key instead of failing on the membership test.
    we = data.get('web_enrichment')
    if not isinstance(we, dict):
        return False, 0

    prov = we.get('_provenance')
    if not isinstance(prov, dict):
        return False, 0

    # Check if wasDerivedFrom is already present
    prov_section = prov.get('prov')
    if isinstance(prov_section, dict) and 'wasDerivedFrom' in prov_section:
        return False, 0

    # Get source URL
    source_url = get_web_source_url(we)
    if not source_url:
        if verbose:
            print("    [web_enrichment] No source URL found - skipping")
        return False, 0

    # Add wasDerivedFrom, creating the prov section when absent or null
    if prov_section is None:
        prov_section = {}
        prov['prov'] = prov_section
    elif not isinstance(prov_section, dict):
        raise TypeError(
            f"web_enrichment._provenance.prov is {type(prov_section).__name__}, expected a mapping"
        )
    prov_section['wasDerivedFrom'] = source_url

    if verbose:
        # str() keeps the log line from failing on a non-string URL value.
        print(f"    [web_enrichment] Added wasDerivedFrom: {str(source_url)[:60]}...")

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f)

    return True, 1


def main():
    """Command-line entry point: patch one file or every custodian YAML file.

    Without ``--file``, all ``*.yaml`` files directly inside ``CUSTODIAN_DIR``
    are processed in sorted order, optionally truncated by ``--limit``.
    A failure on one file is reported on stderr and counted, and processing
    continues with the next file. A summary is printed at the end.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Patch missing wasDerivedFrom in web_enrichment")
    parser.add_argument('--dry-run', action='store_true', help="Show what would be done")
    parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output")
    parser.add_argument('--limit', type=int, help="Limit number of files to process")
    parser.add_argument('--file', type=str, help="Process single file")
    args = parser.parse_args()

    # A negative limit would silently slice from the end of the file list.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer")

    if args.file:
        files = [Path(args.file)]
    else:
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
        # Compare against None so that "--limit 0" really means zero files.
        if args.limit is not None:
            files = files[:args.limit]

    total = len(files)
    modified_count = 0
    added_count = 0
    error_count = 0

    print(f"Patching {total} files for web_enrichment wasDerivedFrom...")
    print(f"  Dry run: {args.dry_run}")

    for i, filepath in enumerate(files, 1):
        # Progress line every 1000 files, or for every file in verbose mode.
        if i % 1000 == 0 or args.verbose:
            print(f"[{i}/{total}] {filepath.name}")

        try:
            was_modified, count = patch_file(filepath, args.dry_run, args.verbose)
        except Exception as e:
            # Batch boundary: one unreadable or malformed file must not stop
            # the run, but it is counted so the summary does not hide it.
            error_count += 1
            print(f"  ERROR: {filepath.name}: {e}", file=sys.stderr)
            continue

        if was_modified:
            modified_count += 1
            added_count += count

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed:  {total}")
    print(f"Files modified:   {modified_count}")
    print(f"Fields added:     {added_count}")
    print(f"Errors:           {error_count}")
    if args.dry_run:
        print("\n[DRY-RUN] No files were actually modified.")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()