#!/usr/bin/env python3
"""
Fast batch enrichment script for digital_platforms metadata.
Reads from a pre-generated list file for speed.

Usage:
    # First generate the list:
    find data/nde/enriched/entries -name "*.yaml" -exec grep -L "digital_platforms:" {} \; > /tmp/entries_to_enrich.txt

    # Then run:
    python scripts/enrich_digital_platforms_fast.py --input /tmp/entries_to_enrich.txt --batch 100
"""

import argparse
|
|
import sys
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
|
|
def load_entry(filepath: Path) -> dict:
    """Parse and return the YAML document stored at *filepath*."""
    with filepath.open(encoding='utf-8') as handle:
        return yaml.safe_load(handle)

def save_entry(filepath: Path, data: dict):
    """Serialize *data* back to *filepath* as human-readable YAML."""
    with filepath.open('w', encoding='utf-8') as handle:
        yaml.dump(data, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)

def get_website_url(entry: dict) -> Optional[str]:
    """Return the best-known website URL for *entry*, or None.

    Sources are consulted from most explicit to least explicit; the
    first non-empty value wins.
    """
    lookup_order = [
        # 1. Original entry webadres
        ('original_entry', 'webadres_organisatie'),
        # 2. Contact section website
        ('contact', 'website'),
        # 3. Digital presence website
        ('digital_presence', 'website'),
        # 4. Wikidata official website
        ('wikidata_enrichment', 'wikidata_official_website'),
        # 5. Google Maps website
        ('google_maps_enrichment', 'website'),
    ]
    for section, field in lookup_order:
        url = entry.get(section, {}).get(field)
        if url:
            return url
    return None

def detect_platform_type(entry: dict) -> str:
    """Map the institution's type to a platform-type constant.

    Checks the (Dutch/English) organisation-type string and the short
    type codes ('A' archive, 'M' museum, 'L' library, 'R' research);
    the first matching rule wins, defaulting to "WEBSITE".
    """
    original = entry.get('original_entry', {})
    # Lowercase once up front — the original recomputed .lower() in
    # every comparison of every rule.
    kind = original.get('type_organisatie', '').lower()
    codes = original.get('type', [])

    if 'archive' in kind or 'archief' in kind or 'A' in codes:
        return "DISCOVERY_PORTAL"
    if 'museum' in kind or 'M' in codes:
        return "WEBSITE"
    if 'library' in kind or 'bibliotheek' in kind or 'L' in codes:
        return "ONLINE_CATALOG"
    if 'research' in kind or 'R' in codes:
        return "WEB_PORTAL"

    return "WEBSITE"

def detect_technology_stack(entry: dict) -> str:
    """Report the recorded collection-management system, if any."""
    recorded = entry.get('original_entry', {}).get('systeem', '')
    # Fall back to a generic label when no system name is present.
    return recorded or "Standard web technology"

def detect_data_standards(entry: dict) -> list:
    """Infer data standards from the entry's web-claim extraction methods.

    Returns a sorted list so repeated runs write identical YAML — the
    original returned list(set(...)), whose element order varies between
    processes and caused spurious file diffs. Falls back to ["HTML5"]
    when nothing specific is detected.
    """
    detected_standards = set()

    for claim in entry.get('web_claims', {}).get('claims', []):
        extraction_method = str(claim.get('extraction_method', '')).lower()
        if 'schema' in extraction_method or 'jsonld' in extraction_method:
            detected_standards.add("Schema.org")
        if 'og_' in extraction_method or 'open graph' in extraction_method:
            detected_standards.add("Open Graph")

    if not detected_standards:
        detected_standards.add("HTML5")

    return sorted(detected_standards)

def extract_user_services(entry: dict) -> str:
    """Describe user-facing services implied by the institution type.

    Returns a comma-separated string; defaults to general services
    when no type-specific rule matches.
    """
    original = entry.get('original_entry', {})
    # Lowercase once up front — the original recomputed .lower() in
    # every comparison of every rule.
    kind = original.get('type_organisatie', '').lower()
    codes = original.get('type', [])

    services = []
    if 'museum' in kind or 'M' in codes:
        services.extend(["Exhibition information", "Visit planning"])
    if 'archive' in kind or 'archief' in kind or 'A' in codes:
        services.append("Collection search")
    if 'library' in kind or 'bibliotheek' in kind or 'L' in codes:
        services.append("Catalog search")

    if not services:
        services = ["General information", "Contact"]

    return ", ".join(services)

def extract_sustainability_model(entry: dict) -> str:
    """Infer how the institution is funded or sustained.

    Checks the museum register first, then the Wikidata legal-form
    claim (P1454); falls back to "Institutional funding".
    """
    original = entry.get('original_entry', {})
    if original.get('museum_register') == 'ja':
        return "Registered museum (government supported)"

    wikidata = entry.get('wikidata_enrichment', {})
    claims = wikidata.get('wikidata_claims', {})
    legal_form = claims.get('P1454_legal_form', {})
    if legal_form:
        value = legal_form.get('value', {})
        if isinstance(value, dict):
            # `or ''` guards against an explicit null label (YAML
            # `label_nl: null`), which crashed the original's .lower().
            label = (value.get('label_nl') or value.get('label_en') or '').lower()
            if 'stichting' in label:
                return "Non-profit foundation"
            if 'vereniging' in label:
                return "Membership association"

    return "Institutional funding"

def extract_digital_collections(entry: dict) -> str:
    """Return a short description of the institution's digital collections."""
    wikidata = entry.get('wikidata_enrichment', {})
    # Prefer Wikidata descriptions (English, then Dutch), then the
    # Google Maps editorial summary, then a generic fallback.
    candidates = (
        wikidata.get('wikidata_description_en'),
        wikidata.get('wikidata_description_nl'),
        entry.get('google_maps_enrichment', {}).get('editorial_summary'),
    )
    for description in candidates:
        if description:
            return description
    return "Organizational website with heritage information"

def get_platform_name(entry: dict) -> str:
    """Build a display name for the platform ("<organisation> Website")."""
    original = entry.get('original_entry', {})
    wikidata = entry.get('wikidata_enrichment', {})
    # Prefer the registry's organisation name, then Wikidata labels
    # (Dutch before English).
    for name in (
        original.get('organisatie'),
        wikidata.get('wikidata_label_nl'),
        wikidata.get('wikidata_label_en'),
    ):
        if name:
            return f"{name} Website"
    return "Official Website"

def create_digital_platform(entry: dict) -> Optional[dict]:
    """Assemble the digital_platforms record for *entry*.

    Returns None when no website URL can be found, since a platform
    record without a URL is not useful.
    """
    website_url = get_website_url(entry)
    if not website_url:
        return None

    platform_type = detect_platform_type(entry)

    # Base category, plus a type-specific secondary category when one
    # applies to the detected platform type.
    categories = ["Organizational website"]
    extra_category = {
        'DISCOVERY_PORTAL': "Heritage discovery",
        'ONLINE_CATALOG': "Collection catalog",
        'VIRTUAL_MUSEUM': "Virtual exhibition",
    }.get(platform_type)
    if extra_category:
        categories.append(extra_category)

    return {
        'platform_name': get_platform_name(entry),
        'platform_url': website_url,
        'platform_type': platform_type,
        'platform_category': categories,
        'digital_collections': extract_digital_collections(entry),
        'technology_stack': detect_technology_stack(entry),
        'data_standards': detect_data_standards(entry),
        'user_services': extract_user_services(entry),
        'sustainability_model': extract_sustainability_model(entry),
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'source_method': 'automated_extraction',
    }

def process_entry(filepath: Path) -> tuple[bool, str]:
    """Enrich one entry file in place.

    Returns (True, message) when the file was updated, otherwise
    (False, reason). Exceptions are caught and reported as a reason
    string so a batch run keeps going past broken files.
    """
    try:
        entry = load_entry(filepath)

        # Nothing to do when the section is already populated.
        if 'digital_platforms' in entry and entry['digital_platforms']:
            return False, "already enriched"

        platform = create_digital_platform(entry)
        if platform is None:
            return False, "no website URL"

        entry['digital_platforms'] = [platform]
        save_entry(filepath, entry)
        return True, f"added {platform['platform_type']}"
    except Exception as exc:
        return False, f"error: {exc}"

def main():
    """CLI entry point: enrich one batch of entry files from a list file.

    Reads file paths from --input, windows them with --skip/--batch so
    runs can be resumed, and prints a summary plus the next --skip value.
    """
    parser = argparse.ArgumentParser(description="Fast batch enrich entries with digital_platforms")
    parser.add_argument('--input', '-i', type=str, required=True, help="Input file with list of entries")
    parser.add_argument('--batch', '-b', type=int, default=100, help="Batch size to process")
    parser.add_argument('--skip', '-s', type=int, default=0, help="Number of entries to skip")
    parser.add_argument('--dry-run', action='store_true', help="Don't actually write files")
    args = parser.parse_args()

    # Read list of files to process. Explicit encoding: the original
    # relied on the platform default, which can fail to decode UTF-8
    # paths on non-UTF-8 locales.
    with open(args.input, 'r', encoding='utf-8') as f:
        files = [Path(line.strip()) for line in f if line.strip()]

    print(f"Total entries in list: {len(files)}")

    # Apply skip and batch window
    files = files[args.skip:args.skip + args.batch]
    print(f"Processing {len(files)} entries (skip={args.skip}, batch={args.batch})")

    enriched = 0
    skipped = 0
    errors = 0

    for i, filepath in enumerate(files):
        if args.dry_run:
            print(f"[DRY] {filepath.name}")
            continue

        success, msg = process_entry(filepath)
        if success:
            enriched += 1
            # Periodic progress line so long batches show signs of life.
            if enriched % 10 == 0:
                print(f"Progress: {enriched} enriched, {i+1}/{len(files)}")
        elif "error" in msg:
            errors += 1
            print(f"ERROR {filepath.name}: {msg}")
        else:
            skipped += 1

    print(f"\nDone: {enriched} enriched, {skipped} skipped, {errors} errors")
    print(f"Next batch: --skip {args.skip + args.batch}")

if __name__ == "__main__":
|
|
main()
|