glam/scripts/add_yaml_provenance.py
2025-12-30 03:43:31 +01:00

649 lines
20 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Add enhanced provenance to enrichment sections in custodian YAML files.
This script enhances existing enrichment sections with FAIR-compliant provenance elements
following the YAML_PROVENANCE_SCHEMA.md specification:
1. **content_hash** - SHA-256 hash of enrichment section for integrity verification
2. **prov.wasDerivedFrom** - Source URL/entity for PROV-O alignment
3. **prov.generatedAtTime** - Timestamp from existing fetch_timestamp
4. **verification.status** - Verification status tracking
CRITICAL RULES:
- DATA_PRESERVATION_RULES: Never delete existing enriched content - additive only
- Use ruamel.yaml to preserve formatting, comments, and key ordering
- Idempotent processing - skip files already processed
- Hash generation is deterministic (computed from actual content)
Enrichment types processed:
- wikidata_enrichment (17,900 files)
- google_maps_enrichment (3,564 files)
- youtube_enrichment
- web_enrichment (1,708 files)
- zcbs_enrichment (142 files)
- linkup_timespan
Usage:
python scripts/add_yaml_provenance.py [--limit N] [--dry-run] [--verbose]
python scripts/add_yaml_provenance.py --file path/to/file.yaml
python scripts/add_yaml_provenance.py --validate # Validate without modifying
Author: OpenCode/Claude
Created: 2025-12-28
"""
import argparse
import base64
import hashlib
import json
import os
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# Use ruamel.yaml to preserve formatting and comments
try:
from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap
HAS_RUAMEL = True
except ImportError:
HAS_RUAMEL = False
print("Warning: ruamel.yaml not installed. Install with: pip install ruamel.yaml", file=sys.stderr)
# Constants
CUSTODIAN_DIR = Path("data/custodian")
PROVENANCE_SCHEMA_VERSION = "2.0"

# Enrichment sections to process with their source URL patterns.
# Per section: where the source identifier lives ('source_key'), how to turn
# it into a canonical URL ('source_url_template'), which API produced the
# data ('api_endpoint'), and which key holds the fetch timestamp.
ENRICHMENT_SECTIONS = {
    'wikidata_enrichment': {
        'source_key': 'wikidata_entity_id',
        'source_url_template': 'https://www.wikidata.org/wiki/{value}',
        'api_endpoint': 'https://www.wikidata.org/w/rest.php/wikibase/v1',
        'timestamp_key': 'fetch_timestamp',
    },
    'google_maps_enrichment': {
        'source_key': 'place_id',
        'source_url_template': 'https://maps.googleapis.com/maps/api/place/details/json?place_id={value}',
        'api_endpoint': 'https://maps.googleapis.com/maps/api/place',
        'timestamp_key': 'fetch_timestamp',
    },
    'youtube_enrichment': {
        'source_key': 'source_url',
        # FIX: the original listed 'source_url_template' twice (None, then
        # this URL); Python keeps only the last value, so the effective
        # template is preserved here and the dead duplicate key is removed.
        'source_url_template': 'https://www.googleapis.com/youtube/v3/channels?id={value}',
        'api_endpoint': 'https://www.googleapis.com/youtube/v3',
        'timestamp_key': 'fetch_timestamp',
    },
    'web_enrichment': {
        'source_key': None,  # Uses web_archives[0].url
        'source_url_template': None,
        'api_endpoint': None,
        'timestamp_key': 'full_site_archive_timestamp',
    },
    'zcbs_enrichment': {
        'source_key': 'source_url',
        'source_url_template': None,  # Direct URL
        'api_endpoint': None,
        'timestamp_key': 'fetch_timestamp',
    },
    'linkup_timespan': {
        'source_key': 'search_query',
        'source_url_template': None,
        'api_endpoint': 'https://api.linkup.so',
        'timestamp_key': 'fetch_timestamp',
    },
}
def normalize_for_hash(data: Any) -> Any:
    """
    Recursively canonicalize *data* so hashing is deterministic.

    - dicts: keys sorted, '_provenance' dropped (avoids circular dependency)
    - lists: element-wise normalization, order preserved
    - floats: rounded to 6 decimal places
    - strings: Unicode NFD normalization
    Other scalars pass through unchanged.
    """
    if isinstance(data, dict):
        normalized = {}
        for key in sorted(data):
            if key == '_provenance':
                continue
            normalized[key] = normalize_for_hash(data[key])
        return normalized
    if isinstance(data, list):
        return [normalize_for_hash(element) for element in data]
    if isinstance(data, float):
        return round(data, 6)
    if isinstance(data, str):
        # Apply Unicode normalization
        return unicodedata.normalize('NFD', data)
    return data


def generate_content_hash(enrichment_data: dict) -> Dict[str, str]:
    """
    Compute a SHA-256 integrity hash over an enrichment section.

    The '_provenance' key is excluded (via normalize_for_hash) so the hash
    does not depend on the block this result is later stored in. The payload
    is serialized as canonical JSON (sorted keys, compact separators,
    non-ASCII preserved) before hashing.

    Args:
        enrichment_data: The enrichment section dict

    Returns:
        Dict with algorithm, value ('sha256-' + base64 digest), scope,
        and computed_at (UTC ISO 8601 timestamp of this call)
    """
    payload = json.dumps(
        normalize_for_hash(enrichment_data),
        sort_keys=True,
        separators=(',', ':'),
        ensure_ascii=False,
    )
    digest = hashlib.sha256(payload.encode('utf-8')).digest()
    encoded = base64.b64encode(digest).decode('ascii')
    return {
        "algorithm": "sha256",
        "value": f"sha256-{encoded}",
        "scope": "enrichment_section",
        "computed_at": datetime.now(timezone.utc).isoformat()
    }
def get_source_url(section_name: str, section_data: dict) -> Optional[str]:
    """
    Resolve the source URL for an enrichment section.

    web_enrichment takes the URL of its first web archive; zcbs_enrichment
    stores the URL directly under 'source_url'. Every other section looks
    up an identifier via its configured 'source_key' (with a nested
    channel-id fallback for YouTube) and, if a URL template is configured,
    expands it with that identifier.

    Args:
        section_name: Name of the enrichment section
        section_data: The enrichment section data

    Returns:
        Source URL string or None
    """
    # Special handling for web_enrichment: first archived URL wins
    if section_name == 'web_enrichment':
        archives = section_data.get('web_archives', [])
        if isinstance(archives, list) and archives:
            return archives[0].get('url')
        return None

    # Special handling for zcbs_enrichment - use source_url directly
    if section_name == 'zcbs_enrichment':
        return section_data.get('source_url')

    config = ENRICHMENT_SECTIONS.get(section_name, {})
    key = config.get('source_key')
    if not key:
        return None

    identifier = section_data.get(key)
    if not identifier and section_name == 'youtube_enrichment':
        # Fall back to the nested path youtube_enrichment.channel.channel_id
        identifier = section_data.get('channel', {}).get('channel_id')
    if not identifier:
        return None

    template = config.get('source_url_template')
    return template.format(value=identifier) if template else str(identifier)
def get_timestamp(section_name: str, section_data: dict) -> Optional[str]:
    """
    Extract the fetch timestamp from an enrichment section.

    The key to read is configured per section in ENRICHMENT_SECTIONS
    (defaulting to 'fetch_timestamp').

    Args:
        section_name: Name of the enrichment section
        section_data: The enrichment section data

    Returns:
        ISO 8601 timestamp string or None
    """
    config = ENRICHMENT_SECTIONS.get(section_name, {})
    timestamp_key = config.get('timestamp_key', 'fetch_timestamp')
    # NOTE: the original had a dead no-op branch for linkup_timespan
    # ("already at root level in most cases"); the root-level lookup below
    # is the only behavior it ever had, so the branch was removed.
    return section_data.get(timestamp_key)
def create_provenance_block(
    section_name: str,
    section_data: dict,
    content_hash: dict
) -> dict:
    """
    Build the _provenance block for an enrichment section.

    Always includes the content hash and a 'verified' verification stanza.
    PROV-O fields (wasDerivedFrom / generatedAtTime / wasGeneratedBy) are
    added only when the corresponding source information is available.
    web_enrichment additionally records local WARC archive details.

    Args:
        section_name: Name of the enrichment section
        section_data: The enrichment section data
        content_hash: Pre-computed content hash

    Returns:
        Complete _provenance dict
    """
    config = ENRICHMENT_SECTIONS.get(section_name, {})

    provenance = {
        'content_hash': content_hash,
        'prov': {},
        'verification': {
            'status': 'verified',
            'last_verified': datetime.now(timezone.utc).isoformat()
        }
    }

    # PROV-O elements, each optional
    prov_block = provenance['prov']
    derived_from = get_source_url(section_name, section_data)
    if derived_from:
        prov_block['wasDerivedFrom'] = derived_from
    generated_at = get_timestamp(section_name, section_data)
    if generated_at:
        prov_block['generatedAtTime'] = generated_at
    endpoint = config.get('api_endpoint')
    if endpoint:
        prov_block['wasGeneratedBy'] = {
            '@type': 'prov:Activity',
            'name': f'{section_name.replace("_enrichment", "")}_api_fetch',
            'used': endpoint
        }

    # web_enrichment: record where the WARC archive lives on disk
    if section_name == 'web_enrichment':
        archives = section_data.get('web_archives', [])
        if archives:
            first = archives[0]
            provenance['archive'] = {
                'local_path': f"{first.get('directory', '')}/{first.get('warc_file', '')}",
                'format': first.get('warc_format', 'ISO 28500 WARC'),
                'size_bytes': first.get('warc_size_bytes')
            }
    return provenance
def needs_provenance(section_data: dict) -> bool:
    """
    Decide whether an enrichment section still needs a provenance block.

    A section is considered done once it carries a non-empty '_provenance'
    entry that already contains 'content_hash' (idempotency guard).
    Non-dict sections are never processed.

    Args:
        section_data: The enrichment section dict

    Returns:
        True if provenance should be added
    """
    if not isinstance(section_data, dict):
        return False
    existing = section_data.get('_provenance', {})
    # Additive only: never touch a section that was already hashed.
    return not (existing and 'content_hash' in existing)
def add_provenance_to_section(
    section_name: str,
    section_data: dict,
    verbose: bool = False
) -> Tuple[dict, bool]:
    """
    Attach a _provenance block to one enrichment section (mutates in place).

    Sections that already carry a content hash are skipped, making repeated
    runs idempotent. The content hash is computed before '_provenance' is
    inserted, so the hash covers only the enrichment payload.

    Args:
        section_name: Name of the enrichment section
        section_data: The enrichment section dict (will be modified)
        verbose: Print progress

    Returns:
        Tuple of (modified_section, was_modified)
    """
    if not needs_provenance(section_data):
        if verbose:
            print(f" [{section_name}] Already has provenance, skipping")
        return section_data, False

    content_hash = generate_content_hash(section_data)
    section_data['_provenance'] = create_provenance_block(
        section_name, section_data, content_hash
    )

    if verbose:
        source_url = get_source_url(section_name, section_data)
        print(f" [{section_name}] Added provenance (hash: {content_hash['value'][:30]}...)")
        if source_url:
            print(f" wasDerivedFrom: {source_url[:60]}...")
    return section_data, True
def update_root_provenance(data: dict, enrichment_summary: dict) -> dict:
    """
    Refresh the file-level 'provenance' section with an enrichment summary.

    Records, per enrichment section, the content-hash value and when it was
    computed, plus schema-version and standards-compliance metadata and a
    fresh 'generated_at' timestamp. Creates the 'provenance' mapping if the
    file lacks one.

    Args:
        data: The full YAML data dict
        enrichment_summary: Dict mapping section names to their content hashes

    Returns:
        Modified provenance dict
    """
    prov = data.setdefault('provenance', {})

    # Bump the file-level schema version
    prov['schema_version'] = '2.0.0'

    # Per-section hash summary for quick integrity checks at the root
    summary = {}
    for name, hash_info in enrichment_summary.items():
        summary[name] = {
            'content_hash': hash_info['value'],
            'verified_at': hash_info['computed_at']
        }
    prov['enrichment_provenance'] = summary

    prov['provenance_schema_version'] = PROVENANCE_SCHEMA_VERSION
    prov['standards_compliance'] = [
        'W3C PROV-O',
        'W3C SRI (content hashes)'
    ]
    prov['generated_at'] = datetime.now(timezone.utc).isoformat()
    return prov
def process_file(
    filepath: Path,
    yaml_handler: 'YAML',
    dry_run: bool = False,
    verbose: bool = False
) -> Tuple[bool, int, int]:
    """
    Process a single custodian YAML file.

    Loads the file, adds a _provenance block to every known enrichment
    section that lacks one, updates the root 'provenance' summary when
    anything changed, and (unless dry_run) writes the file back through the
    same ruamel.yaml handler so formatting and comments are preserved.

    Args:
        filepath: Path to the YAML file
        yaml_handler: Configured ruamel.yaml handler
        dry_run: If True, don't write changes
        verbose: Print progress

    Returns:
        Tuple of (file_was_modified, sections_updated, sections_total)
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml_handler.load(f)
    except Exception as e:
        print(f"Error reading {filepath}: {e}", file=sys.stderr)
        return False, 0, 0
    # Non-mapping documents (e.g. empty files) are skipped silently.
    if not isinstance(data, dict):
        return False, 0, 0
    sections_total = 0
    sections_updated = 0
    enrichment_summary = {}  # section name -> content_hash dict, for the root summary
    file_modified = False
    if verbose:
        print(f"\n Processing {filepath.name}")
    # Process each enrichment section
    for section_name in ENRICHMENT_SECTIONS.keys():
        if section_name not in data:
            continue
        section_data = data[section_name]
        if not isinstance(section_data, dict):
            continue
        sections_total += 1
        updated_section, was_modified = add_provenance_to_section(
            section_name=section_name,
            section_data=section_data,
            verbose=verbose
        )
        if was_modified:
            data[section_name] = updated_section
            sections_updated += 1
            file_modified = True
            # Track for root provenance summary
            prov = updated_section.get('_provenance', {})
            if 'content_hash' in prov:
                enrichment_summary[section_name] = prov['content_hash']
    # Update root provenance if any sections were modified
    if file_modified and enrichment_summary:
        data['provenance'] = update_root_provenance(data, enrichment_summary)
        if verbose:
            print(f" [provenance] Updated root provenance with {len(enrichment_summary)} sections")
    # Write file if modified
    if file_modified and not dry_run:
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml_handler.dump(data, f)
            if verbose:
                print(f" Saved {filepath.name}")
        except Exception as e:
            print(f"Error writing {filepath}: {e}", file=sys.stderr)
            # Report counts but modified=False: opening in 'w' truncates, so a
            # failed dump may leave a partial file on disk — NOTE(review):
            # consider writing to a temp file and renaming.
            return False, sections_updated, sections_total
    elif file_modified and dry_run:
        if verbose:
            print(f" [DRY-RUN] Would save {filepath.name}")
    return file_modified, sections_updated, sections_total
def validate_file(filepath: Path, yaml_handler: 'YAML', verbose: bool = False) -> List[str]:
    """
    Check provenance completeness for one custodian YAML file.

    For every known enrichment section present in the file, verifies that
    its _provenance block carries the mandatory elements: content_hash,
    verification, and prov.wasDerivedFrom.

    Args:
        filepath: Path to the YAML file
        yaml_handler: Configured ruamel.yaml handler
        verbose: Print progress

    Returns:
        List of validation errors (empty when the file is fully compliant)
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml_handler.load(f)
    except Exception as e:
        return [f"Error reading file: {e}"]
    if not isinstance(data, dict):
        return ["File does not contain a dict"]

    errors = []
    for name in ENRICHMENT_SECTIONS:
        section = data.get(name)
        if not isinstance(section, dict):
            continue
        prov = section.get('_provenance', {})
        # Mandatory provenance elements per the schema
        if 'content_hash' not in prov:
            errors.append(f"{name}: missing content_hash")
        if 'verification' not in prov:
            errors.append(f"{name}: missing verification")
        if 'prov' not in prov or 'wasDerivedFrom' not in prov.get('prov', {}):
            errors.append(f"{name}: missing prov.wasDerivedFrom")
    return errors
def main():
    """Command-line entry point: parse arguments, then validate or enrich files."""
    parser = argparse.ArgumentParser(
        description="Add enhanced provenance to enrichment sections in custodian YAML files"
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Limit number of files to process"
    )
    parser.add_argument(
        "--file", type=str, default=None,
        help="Process a specific file"
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Don't write changes, just report what would be done"
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true",
        help="Print detailed progress"
    )
    parser.add_argument(
        "--validate", action="store_true",
        help="Validate provenance completeness without modifying"
    )
    parser.add_argument(
        "--pattern", type=str, default="*.yaml",
        help="File pattern to match (default: *.yaml)"
    )
    # NOTE(review): --section is parsed and echoed below but never used to
    # filter processing — confirm whether the filter was meant to be applied.
    parser.add_argument(
        "--section", type=str, default=None,
        choices=list(ENRICHMENT_SECTIONS.keys()),
        help="Process only a specific enrichment section type"
    )
    args = parser.parse_args()
    # Check requirements
    if not HAS_RUAMEL:
        print("Error: ruamel.yaml is required. Install with: pip install ruamel.yaml", file=sys.stderr)
        sys.exit(1)
    # Configure YAML handler to preserve formatting
    yaml = YAML()
    yaml.preserve_quotes = True
    yaml.default_flow_style = False
    yaml.indent(mapping=2, sequence=4, offset=2)
    yaml.width = 120
    # Get files to process
    if args.file:
        files = [Path(args.file)]
        if not files[0].exists():
            print(f"Error: File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
    else:
        files = sorted(CUSTODIAN_DIR.glob(args.pattern))
        # Only process files at the root level of data/custodian/
        files = [f for f in files if f.parent == CUSTODIAN_DIR]
    if args.limit:
        files = files[:args.limit]
    print(f"Processing {len(files)} files...")
    print(f" Dry run: {args.dry_run}")
    print(f" Validate only: {args.validate}")
    if args.section:
        print(f" Section filter: {args.section}")
    # Validation mode: report errors, never modify files
    if args.validate:
        total_errors = 0
        files_with_errors = 0
        for i, filepath in enumerate(files):
            # Progress line every 100 files, or per file when verbose
            if args.verbose or (i + 1) % 100 == 0:
                print(f"\n[{i+1}/{len(files)}] Validating {filepath.name}")
            errors = validate_file(filepath, yaml, args.verbose)
            if errors:
                files_with_errors += 1
                total_errors += len(errors)
                if args.verbose:
                    for error in errors:
                        print(f" ERROR: {error}")
        print(f"\n{'='*60}")
        print(f"VALIDATION SUMMARY")
        print(f"{'='*60}")
        print(f"Files validated: {len(files)}")
        print(f"Files with errors: {files_with_errors}")
        print(f"Total errors: {total_errors}")
        # Non-zero exit code signals validation failures to callers/CI
        sys.exit(0 if total_errors == 0 else 1)
    # Processing mode: add provenance (writes unless --dry-run)
    files_modified = 0
    total_sections_updated = 0
    total_sections = 0
    for i, filepath in enumerate(files):
        if args.verbose or (i + 1) % 100 == 0:
            print(f"\n[{i+1}/{len(files)}] {filepath.name}")
        modified, sections_updated, sections_total = process_file(
            filepath=filepath,
            yaml_handler=yaml,
            dry_run=args.dry_run,
            verbose=args.verbose
        )
        if modified:
            files_modified += 1
        total_sections_updated += sections_updated
        total_sections += sections_total
    # Summary
    print(f"\n{'='*60}")
    print(f"SUMMARY")
    print(f"{'='*60}")
    print(f"Files processed: {len(files)}")
    print(f"Files modified: {files_modified}")
    print(f"Sections total: {total_sections}")
    print(f"Sections updated: {total_sections_updated}")
    if args.dry_run:
        print(f"\n[DRY-RUN] No files were actually modified.")
    else:
        print(f"\nDone!")


if __name__ == "__main__":
    main()