glam/scripts/enrich_kb_libraries_exa.py
kempersc 30162e6526 Add script to validate KB library entries and generate enrichment report
- Implemented a Python script to validate KB library YAML files for required fields and data quality.
- Analyzed enrichment coverage from Wikidata and Google Maps, generating statistics.
- Created a comprehensive markdown report summarizing validation results and enrichment quality.
- Included error handling for file loading and validation processes.
- Generated JSON statistics for further analysis.
2025-11-28 14:48:33 +01:00

618 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Enrich KB Netherlands library entries with website data using Exa MCP.
This script extracts detailed information from library websites to populate
LinkML schema fields including:
- Description and mission
- Collections (types, scope, extent)
- Digital platforms (APIs, IIIF, linked data)
- Services and accessibility
- Organizational structure
- Contact information
- Opening hours (from website if available)
- Metadata standards
- Staff/leadership (if publicly listed)
Usage:
python scripts/enrich_kb_libraries_exa.py [--dry-run] [--limit N]
"""
import os
import sys
import json
import yaml
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
import logging
import argparse
import subprocess
import re
# Set up logging: INFO level with timestamped messages on stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths — hard-coded absolute, machine-specific locations.
# ENTRIES_DIR holds one YAML file per library entry; REPORTS_DIR is not
# referenced in the visible code of this script (presumably used by sibling
# report scripts — TODO confirm).
ENTRIES_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
REPORTS_DIR = Path("/Users/kempersc/apps/glam/reports")

# Rate limiting for Exa API
# NOTE(review): REQUEST_DELAY is defined but not used in the visible code —
# likely intended for the MCP-driven enrichment loop; confirm before removing.
REQUEST_DELAY = 1.0  # seconds between requests
@dataclass
class WebsiteEnrichment:
    """Container for website-extracted data following the LinkML schema.

    Every field defaults to empty/None; the extract_* helpers fill in
    whatever the website text yields, and process_exa_results() assembles
    one instance per library. The provenance section records how and when
    the data was obtained so downstream consumers can judge its quality.
    """
    # Basic info
    description: Optional[str] = None
    mission_statement: Optional[str] = None
    history_summary: Optional[str] = None
    founding_year: Optional[str] = None

    # Collections (CustodianCollection fields)
    # NOTE(review): `collections` is never populated or serialized by the
    # visible code — possibly reserved for future structured extraction.
    collections: List[Dict[str, Any]] = field(default_factory=list)
    collection_types: List[str] = field(default_factory=list)
    collection_scope: Optional[str] = None
    collection_extent: Optional[str] = None
    temporal_coverage: Optional[str] = None
    digitization_status: Optional[str] = None  # 'COMPLETE' / 'PARTIAL' / 'NOT_DIGITIZED'

    # Digital Platform fields
    homepage_url: Optional[str] = None
    catalog_url: Optional[str] = None
    api_endpoints: List[str] = field(default_factory=list)
    sparql_endpoint: Optional[str] = None
    oai_pmh_endpoint: Optional[str] = None
    iiif_support: Optional[bool] = None  # tri-state: None means "unknown"
    linked_data: Optional[bool] = None   # tri-state: None means "unknown"
    metadata_standards: List[str] = field(default_factory=list)

    # Services
    services: List[str] = field(default_factory=list)
    accessibility_info: Optional[str] = None
    membership_info: Optional[str] = None

    # Contact & Location
    contact_email: Optional[str] = None
    contact_phone: Optional[str] = None
    address: Optional[str] = None
    opening_hours_text: Optional[str] = None

    # Organization
    parent_organization: Optional[str] = None
    organizational_units: List[str] = field(default_factory=list)
    staff_count: Optional[int] = None
    leadership: List[Dict[str, str]] = field(default_factory=list)

    # Technical
    repository_software: Optional[str] = None
    cms_system: Optional[str] = None
    programming_languages: List[str] = field(default_factory=list)

    # Provenance
    extraction_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    source_urls: List[str] = field(default_factory=list)
    extraction_method: str = "exa_web_search"
    confidence_score: float = 0.0  # 0.0-1.0, fraction of field groups extracted
def extract_collections_from_text(text: str) -> Dict[str, Any]:
    """Scan free-form website text for collection-related facts.

    Returns a dict with keys collection_types (list of labels),
    collection_scope (always None here), collection_extent (first size
    phrase found, e.g. "2 million books"), temporal_coverage (always
    None here) and digitization_status ('COMPLETE'/'PARTIAL'/
    'NOT_DIGITIZED' or None).
    """
    lowered = text.lower()

    # Collection-type detection: label -> trigger keywords (English + Dutch).
    trigger_map = {
        'books': ['books', 'boeken', 'publications', 'publicaties'],
        'newspapers': ['newspapers', 'kranten', 'nieuws'],
        'magazines': ['magazines', 'tijdschriften', 'periodicals'],
        'manuscripts': ['manuscripts', 'handschriften', 'medieval'],
        'digital_born': ['e-books', 'ebooks', 'digital', 'digitaal', 'websites'],
        'photographs': ['photos', 'photographs', 'foto', 'images'],
        'maps': ['maps', 'kaarten', 'cartography'],
        'music': ['music', 'muziek', 'audio', 'sound'],
        'archives': ['archives', 'archieven', 'records'],
    }
    detected_types = [
        label
        for label, triggers in trigger_map.items()
        if any(trigger in lowered for trigger in triggers)
    ]

    # Extent: first pattern with a hit wins; the whole matched phrase is kept.
    size_patterns = (
        r'(\d[\d,\.]*)\s*(million|miljoen)?\s*(books|items|volumes|objects|pieces)',
        r'(\d[\d,\.]*)\s*(kilometer|km|metres|meter)s?\s*(of|aan)\s*(shelves|shelf|materials)',
        r'over\s+(\d[\d,\.]*)\s*(items|volumes|books)',
        r'meer dan\s+(\d[\d,\.]*)\s*(items|stukken|boeken)',
    )
    extent = None
    for size_rx in size_patterns:
        hit = re.search(size_rx, lowered)
        if hit:
            extent = hit.group(0)
            break

    # Digitization status: checks ordered from strongest claim downwards.
    if any(phrase in lowered for phrase in ['fully digitized', 'volledig gedigitaliseerd', 'complete digital']):
        status = 'COMPLETE'
    elif any(phrase in lowered for phrase in ['digitization', 'digitalisering', 'being digitized', 'digital collection']):
        status = 'PARTIAL'
    elif any(phrase in lowered for phrase in ['no digital', 'niet digitaal', 'physical only']):
        status = 'NOT_DIGITIZED'
    else:
        status = None

    return {
        'collection_types': detected_types,
        'collection_scope': None,
        'collection_extent': extent,
        'temporal_coverage': None,
        'digitization_status': status,
    }
def extract_digital_platform_info(text: str, url: str) -> Dict[str, Any]:
    """Extract digital platform information from website text.

    Args:
        text: Raw website text (any case).
        url: Homepage URL to record, or empty/None if unknown.

    Returns:
        Dict with homepage_url, catalog_url, api_endpoints (max 3),
        sparql_endpoint, oai_pmh_endpoint, iiif_support / linked_data
        (True or None, never False) and metadata_standards (list).

    Fixes over the original: short acronym keywords are matched with word
    boundaries so e.g. 'ead' no longer fires on "read"/"heads"/"already"
    and 'rdf' no longer fires inside ordinary words; the URL loop variable
    no longer shadows the module-level `dataclasses.field` import.
    """
    platform_info = {
        'homepage_url': url if url else None,
        'catalog_url': None,
        'api_endpoints': [],
        'sparql_endpoint': None,
        'oai_pmh_endpoint': None,
        'iiif_support': None,
        'linked_data': None,
        'metadata_standards': [],
    }
    text_lower = text.lower()

    # Detect IIIF support ('iiif' is distinctive enough for a substring test).
    if 'iiif' in text_lower:
        platform_info['iiif_support'] = True

    # Detect linked data; \b guards the short acronyms against false hits.
    linked_data_patterns = [r'linked data', r'\brdf\b', r'\bsparql\b', r'semantic web', r'json-ld']
    if any(re.search(p, text_lower) for p in linked_data_patterns):
        platform_info['linked_data'] = True

    # Detect metadata standards. Word boundaries keep short acronyms such as
    # "ead", "mods" and "lido" from matching inside unrelated words.
    standard_patterns = {
        'Dublin Core': [r'dublin core', r'\bdc:', r'\bdcterms\b'],
        'MARC21': [r'marc21', r'marc 21', r'marc format'],
        'EAD': [r'\bead\b', r'encoded archival description'],
        'LIDO': [r'\blido\b'],
        'MODS': [r'\bmods\b', r'metadata object description'],
        'PREMIS': [r'\bpremis\b', r'preservation metadata'],
        'Schema.org': [r'schema\.org', r'\bschema:'],
    }
    for standard, patterns in standard_patterns.items():
        if any(re.search(p, text_lower) for p in patterns):
            platform_info['metadata_standards'].append(standard)

    # Extract URLs from the original-case text; first matching pattern wins
    # per target field, api_endpoints keeps up to 3 hits.
    url_patterns = {
        'catalog_url': [r'(https?://[^\s<>"]+(?:catalog|catalogue|search|zoeken|collectie)[^\s<>"]*)'],
        'api_endpoints': [r'(https?://[^\s<>"]+(?:api|webservice)[^\s<>"]*)'],
        'sparql_endpoint': [r'(https?://[^\s<>"]+sparql[^\s<>"]*)'],
        'oai_pmh_endpoint': [r'(https?://[^\s<>"]+(?:oai|oai-pmh)[^\s<>"]*)'],
    }
    for target, patterns in url_patterns.items():  # renamed from 'field' (shadowed dataclasses.field)
        for pattern in patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                if target == 'api_endpoints':
                    platform_info[target].extend(matches[:3])  # Max 3
                else:
                    platform_info[target] = matches[0]
                break
    return platform_info
def extract_organization_info(text: str) -> Dict[str, Any]:
    """Pull organizational facts (parent body, units, staff size) from text.

    Returns a dict with parent_organization (str or None),
    organizational_units (up to 5 strings), leadership (always empty here)
    and staff_count (int or None).
    """
    lowered = text.lower()

    # Parent organization: first matching phrase pattern wins.
    parent = None
    parent_rxs = (
        r'(?:part of|onderdeel van|under|onder)\s+(?:the\s+)?([A-Z][^,\.\n]+)',
        r'(?:ministry|ministerie)\s+(?:of|van)\s+([A-Z][^,\.\n]+)',
    )
    for rx in parent_rxs:
        hit = re.search(rx, text, re.IGNORECASE)
        if hit:
            parent = hit.group(1).strip()
            break

    # Departments / units: keep at most five per pattern.
    units = []
    unit_rxs = (r'(?:department|afdeling|team|unit)\s+(?:of|voor)?\s*([A-Za-z\s]+)',)
    for rx in unit_rxs:
        hits = re.findall(rx, text, re.IGNORECASE)
        units.extend(found.strip() for found in hits[:5])

    # Staff count: first numeric mention wins; non-integers are ignored.
    staff = None
    staff_rxs = (
        r'(\d+)\s*(?:staff|employees|medewerkers|fte)',
        r'team of\s*(\d+)',
    )
    for rx in staff_rxs:
        hit = re.search(rx, lowered)
        if hit:
            try:
                staff = int(hit.group(1))
            except ValueError:
                pass
            break

    return {
        'parent_organization': parent,
        'organizational_units': units,
        'leadership': [],
        'staff_count': staff,
    }
def extract_services_info(text: str) -> Dict[str, Any]:
    """Extract services, accessibility and membership hints from text.

    Returns a dict with services (list of labels), accessibility_info and
    membership_info (fixed marker strings or None).

    Fix over the original: keywords are matched with a leading word
    boundary instead of a plain substring test. The substring test
    reported interlibrary loan for any text containing "possible"
    ('ibl' is a substring) and membership for "remember" ('member').
    Suffixes are still allowed, so 'member' matches "members"/"membership".
    """
    services_info = {
        'services': [],
        'accessibility_info': None,
        'membership_info': None,
    }
    text_lower = text.lower()

    def _mentions(keyword: str) -> bool:
        # Keyword must start at a word boundary; anything may follow.
        return re.search(r'\b' + re.escape(keyword), text_lower) is not None

    # Service label -> trigger keywords (English + Dutch).
    service_keywords = {
        'Reading room': ['reading room', 'leeszaal', 'study room'],
        'Interlibrary loan': ['interlibrary loan', 'ibl', 'interbibliothecair leenverkeer'],
        'Digital access': ['digital access', 'online access', 'remote access'],
        'Research support': ['research support', 'onderzoeksondersteuning', 'reference service'],
        'Exhibitions': ['exhibitions', 'tentoonstellingen', 'displays'],
        'Tours': ['tours', 'rondleidingen', 'guided tours'],
        'Events': ['events', 'evenementen', 'lectures', 'workshops'],
        'Scanning services': ['scanning', 'digitization service', 'reproduction'],
        'Wi-Fi': ['wi-fi', 'wifi', 'internet access'],
        'Copying': ['copying', 'kopiëren', 'printing'],
    }
    for service, keywords in service_keywords.items():
        if any(_mentions(kw) for kw in keywords):
            services_info['services'].append(service)

    # Accessibility and membership are reduced to fixed marker strings.
    accessibility_keywords = ['wheelchair', 'rolstoel', 'accessible', 'toegankelijk', 'disability', 'handicap']
    if any(_mentions(kw) for kw in accessibility_keywords):
        services_info['accessibility_info'] = 'Accessibility features available'
    if any(_mentions(kw) for kw in ['membership', 'lidmaatschap', 'member', 'lid worden', 'join']):
        services_info['membership_info'] = 'Membership available'
    return services_info
def extract_contact_info(text: str) -> Dict[str, Any]:
    """Pull an email address, Dutch phone number and postal address from text.

    Returns a dict with contact_email, contact_phone and address, each a
    string or None. The address match keys on a Dutch postal code
    (four digits + two capitals) followed by a place name.
    """
    email_hit = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)

    # Phone: first pattern with a hit wins.
    phone = None
    phone_rxs = (
        r'(?:\+31|0031|0)\s*(?:\d[\s-]*){9,10}',  # bare Dutch number
        r'tel(?:efoon)?[:\s]+([+\d\s\-()]+)',      # number behind a "tel:" label
    )
    for rx in phone_rxs:
        hit = re.search(rx, text, re.IGNORECASE)
        if hit:
            phone = hit.group(0).strip()
            break

    postal_hit = re.search(r'(\d{4}\s*[A-Z]{2})\s*([A-Za-z\s]+)', text)

    return {
        'contact_email': email_hit.group(0) if email_hit else None,
        'contact_phone': phone,
        'address': postal_hit.group(0) if postal_hit else None,
    }
def call_exa_search(query: str, num_results: int = 3) -> Optional[str]:
    """Placeholder for the Exa MCP search call.

    The real search is performed through the MCP client in the enrichment
    pipeline; this stub always returns None so callers can be wired up
    before that integration exists.
    """
    return None
def process_exa_results(results: List[Dict[str, Any]]) -> WebsiteEnrichment:
    """Fold a list of Exa search hits into one WebsiteEnrichment record.

    Each hit is a dict with optional 'url' and 'text' keys. All hit texts
    are concatenated and run through the extract_* helpers; the first URL
    seen becomes the homepage. The confidence score is the fraction (of 7
    field groups) for which anything was extracted.
    """
    enrichment = WebsiteEnrichment()
    combined = ""
    urls = []

    for hit in results:
        hit_url = hit.get('url', '')
        hit_text = hit.get('text', '')
        if hit_url:
            urls.append(hit_url)
            # First URL encountered doubles as the homepage.
            if not enrichment.homepage_url:
                enrichment.homepage_url = hit_url
        if hit_text:
            combined += f"\n{hit_text}"

    enrichment.source_urls = urls

    if combined:
        # Description: whitespace-collapsed text, truncated to 500 chars.
        condensed = re.sub(r'\s+', ' ', combined).strip()
        if len(condensed) > 100:
            enrichment.description = condensed[:500] + "..."

        # Collections.
        coll = extract_collections_from_text(combined)
        enrichment.collection_types = coll['collection_types']
        enrichment.collection_scope = coll['collection_scope']
        enrichment.collection_extent = coll['collection_extent']
        enrichment.temporal_coverage = coll['temporal_coverage']
        enrichment.digitization_status = coll['digitization_status']

        # Digital platform.
        platform = extract_digital_platform_info(combined, enrichment.homepage_url)
        for key in ('catalog_url', 'api_endpoints', 'sparql_endpoint',
                    'oai_pmh_endpoint', 'iiif_support', 'linked_data',
                    'metadata_standards'):
            setattr(enrichment, key, platform[key])

        # Organization.
        org = extract_organization_info(combined)
        for key in ('parent_organization', 'organizational_units',
                    'leadership', 'staff_count'):
            setattr(enrichment, key, org[key])

        # Services.
        svc = extract_services_info(combined)
        enrichment.services = svc['services']
        enrichment.accessibility_info = svc['accessibility_info']
        enrichment.membership_info = svc['membership_info']

        # Contact.
        contact = extract_contact_info(combined)
        enrichment.contact_email = contact['contact_email']
        enrichment.contact_phone = contact['contact_phone']
        enrichment.address = contact['address']

        # Confidence: fraction of the 7 extraction groups that produced data.
        signals = (
            bool(enrichment.description),
            bool(enrichment.collection_types),
            bool(enrichment.collection_extent),
            bool(enrichment.digitization_status),
            bool(enrichment.metadata_standards),
            bool(enrichment.services),
            bool(enrichment.contact_email or enrichment.contact_phone),
        )
        enrichment.confidence_score = min(sum(signals) / 7.0, 1.0)

    return enrichment
def load_kb_library_files() -> List[Dict[str, Any]]:
    """Load every KB library YAML entry from ENTRIES_DIR.

    Returns a list of dicts, each augmented with '_filepath' and
    '_filename' bookkeeping keys (stripped again by save_entry()).

    Fix over the original: yaml.safe_load returns None for empty files,
    which previously raised a TypeError on item assignment and was logged
    as a misleading generic "Error loading"; non-mapping YAML roots are
    now skipped with an explicit warning.
    """
    entries = []
    for filepath in sorted(ENTRIES_DIR.glob("*_kb_isil.yaml")):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as e:
            logger.error(f"Error loading {filepath}: {e}")
            continue
        if not isinstance(data, dict):
            # Empty file -> None; scalar or list roots are equally unusable.
            logger.warning(f"Skipping {filepath}: YAML root is not a mapping")
            continue
        data['_filepath'] = str(filepath)
        data['_filename'] = filepath.name
        entries.append(data)
    return entries
def save_entry(entry: Dict[str, Any], filepath: str):
    """Write an entry back to disk as YAML.

    Internal bookkeeping keys (those starting with '_') are dropped before
    dumping; key order is preserved and unicode is written as-is.
    """
    public_fields = {key: value for key, value in entry.items() if not key.startswith('_')}
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(public_fields, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)
def enrichment_to_dict(enrichment: WebsiteEnrichment) -> Dict[str, Any]:
    """Convert a WebsiteEnrichment dataclass to a dict for YAML storage.

    Provenance fields are always written. Every other field is written
    only when it holds a value; the two tri-state booleans (iiif_support,
    linked_data) are kept when explicitly True or False and omitted only
    while still None. The scraped address is stored as
    'address_from_website' so it cannot clobber an existing address key.

    Improvement over the original: the 30 near-identical `if` statements
    are replaced by one data-driven loop; output key order is unchanged.
    (The `collections` list was never serialized before and still isn't.)
    """
    data = {
        'extraction_timestamp': enrichment.extraction_timestamp,
        'extraction_method': enrichment.extraction_method,
        'confidence_score': enrichment.confidence_score,
        'source_urls': enrichment.source_urls,
    }
    # (attribute name, output key) — order defines the key order in the dump.
    optional_fields = (
        ('description', 'description'),
        ('mission_statement', 'mission_statement'),
        ('history_summary', 'history_summary'),
        ('founding_year', 'founding_year'),
        ('collection_types', 'collection_types'),
        ('collection_scope', 'collection_scope'),
        ('collection_extent', 'collection_extent'),
        ('temporal_coverage', 'temporal_coverage'),
        ('digitization_status', 'digitization_status'),
        ('homepage_url', 'homepage_url'),
        ('catalog_url', 'catalog_url'),
        ('api_endpoints', 'api_endpoints'),
        ('sparql_endpoint', 'sparql_endpoint'),
        ('oai_pmh_endpoint', 'oai_pmh_endpoint'),
        ('iiif_support', 'iiif_support'),
        ('linked_data', 'linked_data'),
        ('metadata_standards', 'metadata_standards'),
        ('services', 'services'),
        ('accessibility_info', 'accessibility_info'),
        ('membership_info', 'membership_info'),
        ('contact_email', 'contact_email'),
        ('contact_phone', 'contact_phone'),
        ('address', 'address_from_website'),
        ('parent_organization', 'parent_organization'),
        ('organizational_units', 'organizational_units'),
        ('staff_count', 'staff_count'),
        ('leadership', 'leadership'),
        ('repository_software', 'repository_software'),
        ('cms_system', 'cms_system'),
        ('programming_languages', 'programming_languages'),
    )
    tri_state = {'iiif_support', 'linked_data'}  # explicit False must survive
    for attr, key in optional_fields:
        value = getattr(enrichment, attr)
        if attr in tri_state:
            if value is not None:
                data[key] = value
        elif value:
            data[key] = value
    return data
def main():
    """CLI entry point: report KB entries that still need website enrichment.

    This does not call Exa itself. It selects entries that have no
    'website_enrichment' block yet but do have a website URL (from the
    Google Maps or Wikidata enrichment), then either lists them (--dry-run)
    or prints example Exa MCP search queries for the first few.
    """
    parser = argparse.ArgumentParser(description='Enrich KB libraries with website data via Exa')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of entries to process')
    args = parser.parse_args()

    banner = "=" * 60
    logger.info(banner)
    logger.info("KB Netherlands Libraries - Exa Website Enrichment")
    logger.info(banner)

    # Load entries, optionally truncated by --limit.
    entries = load_kb_library_files()
    logger.info(f"Loaded {len(entries)} KB library entries")
    if args.limit:
        entries = entries[:args.limit]
        logger.info(f"Limited to {len(entries)} entries")

    # Keep entries that lack enrichment but have a usable website URL.
    needs_enrichment = []
    for entry in entries:
        if 'website_enrichment' in entry:
            continue
        website_url = (
            entry.get('google_maps_enrichment', {}).get('website') or
            entry.get('wikidata_enrichment', {}).get('wikidata_identifiers', {}).get('Website')
        )
        if not website_url:
            continue
        entry['_website_url'] = website_url
        needs_enrichment.append(entry)
    logger.info(f"Entries needing website enrichment: {len(needs_enrichment)}")

    if args.dry_run:
        logger.info("DRY RUN - No changes will be made")
        for entry in needs_enrichment[:10]:
            name = entry.get('original_entry', {}).get('organisatie', 'Unknown')
            website = entry.get('_website_url', 'No URL')
            logger.info(f" Would enrich: {name} - {website}")
        return

    # Print guidance for manual MCP-based enrichment of the first few entries.
    logger.info("\n" + banner)
    logger.info("MANUAL ENRICHMENT REQUIRED")
    logger.info(banner)
    logger.info("\nThis script identifies entries needing enrichment.")
    logger.info("Use the Exa MCP tool to search each library's website.")
    logger.info("\nExample search queries:")
    for entry in needs_enrichment[:5]:
        name = entry.get('original_entry', {}).get('organisatie', 'Unknown')
        website = entry.get('_website_url', '')
        if website:
            domain = website.replace('https://', '').replace('http://', '').split('/')[0]
            logger.info(f"\n Library: {name}")
            logger.info(f" Website: {website}")
            logger.info(f" Query: site:{domain} about collections services contact")
    logger.info("\n" + banner)
    logger.info(f"Total entries to enrich: {len(needs_enrichment)}")
    logger.info(banner)
# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()