#!/usr/bin/env python3
"""
Patch wikidata_enrichment sections to add wasDerivedFrom using available fields.
Handles cases where:
- wikidata_entity_id is None but wikidata_id exists
- wikidata_url exists and can be used directly
- Neither exists (skip)
Usage:
python scripts/patch_wikidata_derived_from.py [--dry-run] [--normalize]
"""
import argparse
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
try:
from ruamel.yaml import YAML # type: ignore
from ruamel.yaml.comments import CommentedMap # type: ignore
except ImportError:
print("ERROR: ruamel.yaml not installed. Run: pip install ruamel.yaml")
sys.exit(1)
def extract_entity_id_from_url(url: str) -> Optional[str]:
    """Return the first Wikidata Q-number (e.g. 'Q42') found in *url*.

    Returns None when *url* is empty/None or contains no Q-number.
    """
    if not url:
        return None
    found = re.search(r'(Q\d+)', url)
    if found is None:
        return None
    return found.group(1)
def get_wikidata_derived_from(section: dict) -> Optional[str]:
    """Build a wasDerivedFrom URL from a wikidata_enrichment section.

    Priority:
    1. wikidata_entity_id (if valid Q-number)
    2. wikidata_id (if valid Q-number)
    3. wikidata_url (use directly)

    Returns None when none of the fields yields a usable value.
    """
    # Prefer explicit Q-number fields, in priority order.
    for field in ('wikidata_entity_id', 'wikidata_id'):
        candidate = section.get(field)
        if candidate and str(candidate).startswith('Q'):
            return f"https://www.wikidata.org/wiki/{candidate}"
    # Fall back to wikidata_url, normalized to wiki URL format when a
    # Q-number can be extracted; otherwise pass the URL through as-is.
    url = section.get('wikidata_url')
    if not url:
        return None
    qid = extract_entity_id_from_url(url)
    return f"https://www.wikidata.org/wiki/{qid}" if qid else url
def patch_section(section: dict, normalize: bool = False) -> tuple[bool, Optional[str]]:
    """Add wasDerivedFrom to section's _provenance if missing.

    Args:
        section: The wikidata_enrichment section dict.
        normalize: If True, also copy wikidata_id to wikidata_entity_id.

    Returns:
        Tuple of (was_patched, derived_from_url). When wasDerivedFrom is
        already present, returns (False, <existing url>).
    """
    provenance = section.get('_provenance')
    if not provenance:
        return False, None
    prov = provenance.get('prov')
    # Nothing to do if wasDerivedFrom already exists.
    if prov and prov.get('wasDerivedFrom'):
        return False, prov.get('wasDerivedFrom')
    # Determine the derivation URL *before* mutating anything.
    # Bug fix: the previous version attached an empty 'prov' mapping to
    # _provenance even when no URL could be derived, dirtying the loaded
    # document on what should be a no-op.
    derived_from = get_wikidata_derived_from(section)
    if not derived_from:
        return False, None
    # Create the prov mapping only now that we know we will patch.
    if not prov:
        prov = CommentedMap()
        provenance['prov'] = prov
    prov['wasDerivedFrom'] = derived_from
    # Add generatedAtTime if missing; prefer timestamps recorded at
    # enrichment time, falling back to "now" in UTC.
    if not prov.get('generatedAtTime'):
        prov['generatedAtTime'] = (
            section.get('enrichment_date')
            or section.get('enrichment_timestamp')
            or datetime.now(timezone.utc).isoformat()
        )
    # Add wasGeneratedBy if missing, describing the fetch activity.
    if not prov.get('wasGeneratedBy'):
        generated_by = CommentedMap()
        generated_by['@type'] = 'prov:Activity'
        generated_by['name'] = 'wikidata_api_fetch'
        generated_by['used'] = 'https://www.wikidata.org/w/rest.php/wikibase/v1'
        prov['wasGeneratedBy'] = generated_by
    # Optionally mirror wikidata_id into wikidata_entity_id.
    if normalize and not section.get('wikidata_entity_id'):
        wikidata_id = section.get('wikidata_id')
        if wikidata_id and str(wikidata_id).startswith('Q'):
            section['wikidata_entity_id'] = wikidata_id
    return True, derived_from
def process_file(filepath: Path, yaml: YAML, dry_run: bool = False, normalize: bool = False) -> dict:
    """Load, patch, and (optionally) rewrite a single YAML file.

    Returns a result record with keys: filepath, modified, patched,
    derived_from, error. Load/write failures are recorded in 'error'
    rather than raised.
    """
    outcome = {
        'filepath': str(filepath),
        'modified': False,
        'patched': False,
        'derived_from': None,
        'error': None,
    }
    # Parse the file; any I/O or YAML failure is captured as a string.
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            data = yaml.load(handle)
    except Exception as exc:
        outcome['error'] = str(exc)
        return outcome
    # Only mapping documents with a mapping wikidata_enrichment qualify.
    if not isinstance(data, dict) or 'wikidata_enrichment' not in data:
        return outcome
    section = data['wikidata_enrichment']
    if not isinstance(section, dict):
        return outcome
    patched, derived_from = patch_section(section, normalize=normalize)
    if not patched:
        return outcome
    outcome['patched'] = True
    outcome['derived_from'] = derived_from
    # Dry runs report what would change without writing anything back.
    if dry_run:
        return outcome
    try:
        with open(filepath, 'w', encoding='utf-8') as handle:
            yaml.dump(data, handle)
    except Exception as exc:
        outcome['error'] = f"Write error: {exc}"
    else:
        outcome['modified'] = True
    return outcome
def main():
    """CLI entry point: patch every custodian YAML file and print a summary."""
    parser = argparse.ArgumentParser(
        description='Patch wikidata_enrichment sections to add wasDerivedFrom'
    )
    parser.add_argument(
        '--dry-run', action='store_true',
        help='Show what would be changed without modifying files'
    )
    parser.add_argument(
        '--normalize', action='store_true',
        help='Also copy wikidata_id to wikidata_entity_id if missing'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Show detailed output'
    )
    args = parser.parse_args()

    # Round-trip loader configured to preserve the files' formatting.
    yaml = YAML()
    yaml.preserve_quotes = True
    yaml.default_flow_style = False
    yaml.width = 4096

    # Locate data/custodian relative to this script's parent directory.
    base_dir = Path(__file__).parent.parent
    yaml_files = list((base_dir / 'data' / 'custodian').glob('*.yaml'))
    total_files = len(yaml_files)

    prefix = '[DRY RUN] ' if args.dry_run else ''
    print(f"{prefix}Processing {total_files} YAML files...")
    print()

    # Running counters for the final summary.
    files_processed = 0
    files_with_wikidata = 0
    files_modified = 0
    sections_patched = 0
    errors = 0

    for index, filepath in enumerate(yaml_files, start=1):
        if index % 2000 == 0:
            print(f" Progress: {index}/{total_files}")
        outcome = process_file(
            filepath, yaml, dry_run=args.dry_run, normalize=args.normalize
        )
        files_processed += 1
        if outcome['error']:
            errors += 1
            if args.verbose:
                print(f" ERROR: {filepath.name}: {outcome['error']}")
        if outcome['patched']:
            files_with_wikidata += 1
            sections_patched += 1
            if outcome['modified']:
                files_modified += 1
            if args.verbose:
                print(f" Patched: {filepath.name} -> {outcome['derived_from']}")

    # Summary
    print()
    print("=" * 60)
    print("PATCH SUMMARY")
    print("=" * 60)
    print(f"Files processed: {files_processed:,}")
    print(f"Files modified: {files_modified:,}")
    print(f"Sections patched: {sections_patched:,}")
    print(f"Errors: {errors:,}")
    print()
    if args.dry_run:
        print("This was a DRY RUN - no files were modified.")
        print("Run without --dry-run to apply changes.")
# Entry-point guard: run the patcher only when executed as a script.
if __name__ == '__main__':
    main()