glam/scripts/enrich_digital_platforms.py
2025-12-05 15:30:23 +01:00

414 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Batch enrichment script for digital_platforms metadata.
This script:
1. Finds entries with websites but no digital_platforms section
2. Extracts digital platform metadata from existing web claims
3. Infers platform type and metadata from available data
4. Adds the digital_platforms section to each entry
Usage:
python scripts/enrich_digital_platforms.py [--limit N] [--start-index N] [--dry-run]
"""
import argparse
import os
import re
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
# Directory holding one YAML file per heritage institution entry.
ENTRIES_DIR = Path("data/nde/enriched/entries")
# Platform type detection patterns: keyword substrings (Dutch and English),
# matched case-insensitively against web-claim values in detect_platform_type().
PLATFORM_TYPE_PATTERNS: dict[str, list[str]] = {
    "DIGITAL_ARCHIVE": ["digitaal archief", "digital archive", "online archief", "archiefbank"],
    "DISCOVERY_PORTAL": ["collectie", "collection", "zoeken", "search", "database"],
    "WEBSITE": ["website", "homepage", "info", "contact", "over ons", "about"],
    "WEB_PORTAL": ["portal", "portaal", "platform"],
    "ONLINE_CATALOG": ["catalogus", "catalog", "bibliotheek", "library"],
    "VIRTUAL_MUSEUM": ["virtueel", "virtual", "3d", "rondleiding", "tour"],
    "EDUCATIONAL_PLATFORM": ["educatie", "education", "lesmateriaal", "leren"],
}
# CMS detection patterns: substrings matched against claim html_file /
# claim_value in detect_technology_stack().
CMS_PATTERNS: dict[str, list[str]] = {
    "WordPress": ["wp-content", "wp-includes", "wordpress"],
    "Drupal": ["drupal", "sites/default", "modules/system"],
    "Joomla": ["joomla", "components/com_"],
    # Empty pattern list: any([]) is False, so "Custom CMS" can never be
    # detected by the pattern loop — it only documents a possible label.
    "Custom CMS": [],
}
# Data standard patterns.
# NOTE(review): this table appears unused in this file — detect_data_standards()
# hard-codes its own checks instead of consulting it. Verify before removing.
DATA_STANDARD_PATTERNS: dict[str, list[str]] = {
    "Schema.org": ["schema.org", "itemtype", "itemscope"],
    "Dublin Core": ["dc:", "dcterms:", "dublin core"],
    "Open Graph": ["og:", "og:title", "og:description"],
    "IIIF": ["iiif", "manifest.json", "image-api"],
    "Linked Data": ["application/ld+json", "@context", "rdf"],
}
def load_entry(filepath: Path) -> dict:
    """Read and parse one YAML entry file.

    Args:
        filepath: Path to the .yaml entry file.

    Returns:
        The parsed document (expected to be a dict for entry files).
    """
    with filepath.open(encoding='utf-8') as handle:
        return yaml.safe_load(handle)
def save_entry(filepath: Path, data: dict):
    """Serialize *data* back to *filepath* as human-readable YAML.

    Preserves insertion order (sort_keys=False) and writes Unicode
    characters directly instead of escaping them.
    """
    with filepath.open('w', encoding='utf-8') as handle:
        yaml.dump(data, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)
def has_website(entry: dict) -> bool:
    """Return True if any enrichment source in *entry* supplies a website URL.

    Sources checked: original_entry.webadres_organisatie,
    wikidata_enrichment.wikidata_official_website, google_maps_enrichment.website.
    """
    url_sources = (
        (entry.get('original_entry', {}), 'webadres_organisatie'),
        (entry.get('wikidata_enrichment', {}), 'wikidata_official_website'),
        (entry.get('google_maps_enrichment', {}), 'website'),
    )
    return any(section.get(key) for section, key in url_sources)
def has_digital_platforms(entry: dict) -> bool:
    """Return True if *entry* already has a non-empty digital_platforms section.

    Fix: the previous body returned `entry['digital_platforms']` itself (a
    truthy list) rather than a bool, contradicting the declared `-> bool`;
    callers only use it in boolean context, so bool() coercion is safe.
    """
    return bool(entry.get('digital_platforms'))
def get_website_url(entry: dict) -> Optional[str]:
    """Return the best available website URL for *entry*, or None.

    Sources are consulted in priority order:
    original_entry > wikidata_enrichment > google_maps_enrichment.
    """
    lookup_order = (
        ('original_entry', 'webadres_organisatie'),
        ('wikidata_enrichment', 'wikidata_official_website'),
        ('google_maps_enrichment', 'website'),
    )
    for section_key, url_key in lookup_order:
        url = entry.get(section_key, {}).get(url_key)
        if url:
            return url
    return None
def detect_platform_type(entry: dict) -> str:
    """Classify the kind of digital platform behind the entry's website.

    Primary signal is the institution type (type_organisatie / type codes);
    falls back to keyword matching over harvested web claims, and finally
    defaults to "WEBSITE".

    Fix: hoist `institution_type.lower()` — the original recomputed it in
    every branch (up to 8 calls per invocation).
    """
    original = entry.get('original_entry', {})
    institution_type = original.get('type_organisatie', '').lower()
    types = original.get('type', [])
    # Institution-type heuristics, checked in priority order.
    if 'archive' in institution_type or 'archief' in institution_type or 'A' in types:
        return "DISCOVERY_PORTAL"
    if 'museum' in institution_type or 'M' in types:
        return "WEBSITE"
    if 'library' in institution_type or 'bibliotheek' in institution_type or 'L' in types:
        return "ONLINE_CATALOG"
    if 'research' in institution_type or 'R' in types:
        return "WEB_PORTAL"
    # Fall back to keyword patterns in the harvested web claims.
    for claim in entry.get('web_claims', {}).get('claims', []):
        claim_value = str(claim.get('claim_value', '')).lower()
        for platform_type, patterns in PLATFORM_TYPE_PATTERNS.items():
            if any(p in claim_value for p in patterns):
                return platform_type
    return "WEBSITE"  # Default
def detect_technology_stack(entry: dict) -> str:
    """Describe the website's technology stack from web claims and metadata.

    Returns a comma-separated list of detected CMS names plus any known
    back-office system from the source register, or a generic fallback
    when nothing is detected.
    """
    detected = []
    for claim in entry.get('web_claims', {}).get('claims', []):
        html_file = str(claim.get('html_file', '')).lower()
        claim_value = str(claim.get('claim_value', '')).lower()
        for cms, patterns in CMS_PATTERNS.items():
            matched = any(p in html_file or p in claim_value for p in patterns)
            if matched and cms not in detected:
                detected.append(cms)
    # A known back-office system (e.g. from the source register) counts too.
    system = entry.get('original_entry', {}).get('systeem', '')
    if system and system not in detected:
        detected.append(system)
    return ", ".join(detected) if detected else "Standard web technology"
def detect_data_standards(entry: dict) -> list:
    """Detect metadata standards referenced by the entry's web claims.

    Fixes:
    - `list(set(...))` returned a different ordering on every run (string
      hash randomization), making re-runs rewrite identical YAML files
      with shuffled lists; a sorted list is deterministic.
    - Removed the unused `claim_value` local.

    Returns:
        Sorted list of detected standard names, or ["HTML5"] as a fallback.
    """
    detected = set()
    for claim in entry.get('web_claims', {}).get('claims', []):
        extraction_method = str(claim.get('extraction_method', '')).lower()
        claim_type = str(claim.get('claim_type', '')).lower()
        # Structured-data extraction implies Schema.org / JSON-LD markup.
        if 'schema' in extraction_method or 'jsonld' in extraction_method:
            detected.add("Schema.org")
        # og_* extraction implies Open Graph meta tags.
        if 'og_' in extraction_method or 'open graph' in extraction_method:
            detected.add("Open Graph")
        # Collection/catalog claims suggest Dublin Core style description.
        if 'collection' in claim_type or 'catalog' in claim_type:
            detected.add("Dublin Core")
    return sorted(detected) if detected else ["HTML5"]
def extract_user_services(entry: dict) -> str:
    """Summarize the website's user-facing services as a comma-separated string.

    Services are inferred from web-claim types first (in a fixed priority
    order), then supplemented from the institution type; defaults to generic
    services when nothing matches.

    Fix: the original recomputed `claim_type.lower()` up to 8 times per claim
    and repeated the same if/append boilerplate for every service; a
    data-driven keyword table keeps the exact same detection order.
    """
    # (keywords matched in claim_type) -> service label; checked in order.
    claim_service_table = (
        (('search',), "Search"),
        (('gallery', 'image'), "Image gallery"),
        (('video',), "Video content"),
        (('social',), "Social media integration"),
        (('login', 'signup'), "User accounts"),
        (('email', 'phone'), "Contact information"),
    )
    services = []
    for claim in entry.get('web_claims', {}).get('claims', []):
        claim_type = str(claim.get('claim_type', '')).lower()
        for keywords, service in claim_service_table:
            if any(k in claim_type for k in keywords) and service not in services:
                services.append(service)
    # Add baseline services implied by the institution type.
    original = entry.get('original_entry', {})
    institution_type = original.get('type_organisatie', '').lower()
    types = original.get('type', [])
    if 'museum' in institution_type or 'M' in types:
        for service in ("Exhibition information", "Visit planning"):
            if service not in services:
                services.append(service)
    if 'archive' in institution_type or 'archief' in institution_type or 'A' in types:
        if "Collection search" not in services:
            services.append("Collection search")
    if not services:
        services = ["General information", "Contact"]
    return ", ".join(services)
def extract_sustainability_model(entry: dict) -> str:
    """Infer how the organization is funded from Wikidata and register data."""
    # The Wikidata legal form claim (P1454) is the strongest signal.
    legal_form = (
        entry.get('wikidata_enrichment', {})
        .get('wikidata_claims', {})
        .get('P1454_legal_form', {})
    )
    if legal_form:
        value = legal_form.get('value', {})
        if isinstance(value, dict):
            # Prefer the Dutch label, fall back to English.
            label = value.get('label_nl', value.get('label_en', '')).lower()
            if 'stichting' in label:
                return "Non-profit foundation"
            if 'vereniging' in label:
                return "Membership association"
    # Registered museums receive some form of government support.
    if entry.get('original_entry', {}).get('museum_register', '') == 'ja':
        return "Registered museum (government supported)"
    return "Institutional funding"
def extract_digital_collections(entry: dict) -> str:
    """Build a short description of the entry's digital collections.

    Combines signals from web claims, the English Wikidata description, and
    the Google Maps editorial summary, joined with "; ".

    Fix: the original joined `set(descriptions[:2])`, whose iteration order
    varies between runs (string hash randomization), so identical input
    produced differently ordered output strings. `dict.fromkeys` deduplicates
    while preserving insertion order, making the result deterministic.
    """
    descriptions = []
    for claim in entry.get('web_claims', {}).get('claims', []):
        claim_type = claim.get('claim_type')
        if claim_type == 'collection_page':
            descriptions.append("Online collection access")
        elif claim_type == 'description_short' and not descriptions:
            # Use the first short description (truncated to 200 chars) as a basis.
            descriptions.append(claim.get('claim_value', '')[:200])
    wikidata = entry.get('wikidata_enrichment', {})
    if wikidata.get('wikidata_description_en'):
        descriptions.append(wikidata['wikidata_description_en'])
    google = entry.get('google_maps_enrichment', {})
    if google.get('editorial_summary'):
        descriptions.append(google['editorial_summary'])
    if descriptions:
        # Combine the first two, deduplicated in insertion order.
        return "; ".join(dict.fromkeys(descriptions[:2]))
    return "Organizational website with heritage information"
def get_platform_name(entry: dict) -> str:
    """Derive a human-readable platform name from the organization's name.

    Tries the source register name first, then the Dutch and English
    Wikidata labels, appending " Website"; falls back to a generic name.
    """
    candidates = (
        entry.get('original_entry', {}).get('organisatie'),
        entry.get('wikidata_enrichment', {}).get('wikidata_label_nl'),
        entry.get('wikidata_enrichment', {}).get('wikidata_label_en'),
    )
    for name in candidates:
        if name:
            return f"{name} Website"
    return "Official Website"
def create_digital_platform(entry: dict) -> Optional[dict]:
    """Assemble the digital_platforms record for one entry.

    Returns:
        The platform dict, or None when no website URL could be found.
    """
    website_url = get_website_url(entry)
    if not website_url:
        return None
    platform_type = detect_platform_type(entry)
    # Base category plus an extra label keyed on the detected platform type.
    categories = ["Organizational website"]
    extra = {
        'DISCOVERY_PORTAL': "Heritage discovery",
        'ONLINE_CATALOG': "Collection catalog",
        'VIRTUAL_MUSEUM': "Virtual exhibition",
    }.get(platform_type)
    if extra:
        categories.append(extra)
    return {
        'platform_name': get_platform_name(entry),
        'platform_url': website_url,
        'platform_type': platform_type,
        'platform_category': categories,
        'digital_collections': extract_digital_collections(entry),
        'technology_stack': detect_technology_stack(entry),
        'data_standards': detect_data_standards(entry),
        'user_services': extract_user_services(entry),
        'sustainability_model': extract_sustainability_model(entry),
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'source_method': 'automated_extraction',
    }
def enrich_entry(entry: dict) -> dict:
    """Attach a digital_platforms section to *entry* (mutates and returns it).

    Leaves the entry untouched when no platform could be constructed.
    """
    platform = create_digital_platform(entry)
    if platform is not None:
        entry['digital_platforms'] = [platform]
    return entry
def find_entries_to_enrich() -> list[Path]:
    """Scan ENTRIES_DIR for entries that have a website but no digital_platforms.

    Files whose names start with '_' are skipped; unreadable files are
    reported and ignored rather than aborting the scan.
    """
    candidates = []
    for filepath in sorted(ENTRIES_DIR.glob("*.yaml")):
        if filepath.name.startswith('_'):
            continue
        try:
            entry = load_entry(filepath)
            if has_website(entry) and not has_digital_platforms(entry):
                candidates.append(filepath)
        except Exception as e:
            # Best-effort scan: report and keep going.
            print(f"Error reading {filepath}: {e}")
    return candidates
def main():
    """CLI entry point: find candidate entries, enrich, and optionally save.

    Flags: --limit N caps the batch size, --start-index N skips ahead,
    --dry-run reports without writing, --verbose prints per-entry details.

    Fix: the limit check was `if args.limit:`, which silently treated an
    explicit `--limit 0` as "no limit"; `is not None` honors 0 as
    "process nothing".
    """
    parser = argparse.ArgumentParser(description="Enrich entries with digital_platforms metadata")
    parser.add_argument('--limit', type=int, default=None, help="Maximum number of entries to process")
    parser.add_argument('--start-index', type=int, default=0, help="Start index for batch processing")
    parser.add_argument('--dry-run', action='store_true', help="Print changes without writing")
    parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output")
    args = parser.parse_args()
    print("Finding entries to enrich...")
    entries = find_entries_to_enrich()
    print(f"Found {len(entries)} entries with websites but no digital_platforms")
    # Apply start index and limit to select this batch.
    entries = entries[args.start_index:]
    if args.limit is not None:
        entries = entries[:args.limit]
    print(f"Processing {len(entries)} entries...")
    enriched_count = 0
    for filepath in entries:
        try:
            entry = load_entry(filepath)
            entry = enrich_entry(entry)
            if 'digital_platforms' in entry:
                enriched_count += 1
                if args.verbose:
                    platform = entry['digital_platforms'][0]
                    print(f"\n{filepath.name}:")
                    print(f" Platform: {platform['platform_name']}")
                    print(f" URL: {platform['platform_url']}")
                    print(f" Type: {platform['platform_type']}")
                    print(f" Tech: {platform['technology_stack']}")
                # Only write back entries that were actually enriched.
                # NOTE(review): indentation was stripped in the source; this
                # placement (save inside the enriched branch) matches intent —
                # confirm against version control.
                if not args.dry_run:
                    save_entry(filepath, entry)
        except Exception as e:
            # Keep the batch going even if a single entry fails.
            print(f"Error processing {filepath}: {e}")
    print(f"\n{'Would enrich' if args.dry_run else 'Enriched'} {enriched_count} entries")
if __name__ == "__main__":
    main()