370 lines
12 KiB
Python
370 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Website Discovery for Custodians using Linkup MCP.
|
|
|
|
This script identifies custodian files without websites and generates
|
|
search queries for Linkup MCP. Results can then be processed to update
|
|
YAML files.
|
|
|
|
Usage:
|
|
# List files needing website discovery:
|
|
python scripts/discover_websites_linkup.py --list
|
|
|
|
# Generate search queries for first N files:
|
|
python scripts/discover_websites_linkup.py --generate-queries --limit 10
|
|
|
|
# Update a file with discovered website:
|
|
python scripts/discover_websites_linkup.py --update JP-01-ABU-L-K --url https://example.com
|
|
|
|
# Batch update from results JSON:
|
|
python scripts/discover_websites_linkup.py --batch-update results.json
|
|
"""
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse
|
|
|
|
import yaml
|
|
|
|
# Logging
# Module-wide logging setup: timestamped INFO-level messages in the form
# "<time> - <LEVEL> - <message>". basicConfig is a no-op if the root logger
# was already configured by an importing application.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|
|
|
# Configuration
# Directory holding the custodian YAML records, resolved relative to this
# script's location (scripts/ -> repo root -> data/custodian).
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"

# URL patterns to filter out (not useful as institutional websites):
# encyclopedias, map services, social networks, and commerce/review sites.
# Each entry is a regex fragment matched case-insensitively with re.search
# against the full candidate URL (see is_valid_website_url).
EXCLUDED_URL_PATTERNS = [
    r'wikipedia\.org',
    r'wikidata\.org',
    r'tripadvisor\.',
    r'google\.com/maps',
    r'maps\.google\.',
    r'facebook\.com',
    r'twitter\.com',
    r'instagram\.com',
    r'youtube\.com',
    r'linkedin\.com',
    r'amazon\.co',
    r'booking\.com',
    r'yelp\.',
]
|
|
|
|
|
|
def has_website(entry: dict) -> bool:
    """Return True when the custodian entry already records a website.

    Three sources are consulted, in order: a 'Website' identifier inside
    ``original_entry.identifiers``, the Wikidata official-website field,
    and a previously discovered ``website_discovery.website_url``.
    """
    # 1) Explicit Website identifier on the original entry.
    identifiers = entry.get('original_entry', {}).get('identifiers', [])
    if any(
        isinstance(item, dict) and item.get('identifier_scheme') == 'Website'
        for item in identifiers
    ):
        return True

    # 2) Official website from Wikidata enrichment.
    if entry.get('wikidata_enrichment', {}).get('wikidata_official_website'):
        return True

    # 3) Result of a previous discovery run.
    return bool(entry.get('website_discovery', {}).get('website_url'))
|
|
|
|
|
|
def get_custodian_name(entry: dict) -> str | None:
|
|
"""Extract the best name for searching."""
|
|
# Try custodian_name first
|
|
name = entry.get('custodian_name', {}).get('claim_value')
|
|
if name:
|
|
return name
|
|
|
|
# Try original_entry.name
|
|
name = entry.get('original_entry', {}).get('name')
|
|
if name:
|
|
return name
|
|
|
|
# Try wikidata label (Japanese preferred for Japanese institutions)
|
|
wikidata = entry.get('wikidata_enrichment', {})
|
|
ja_label = wikidata.get('wikidata_label_ja')
|
|
if ja_label:
|
|
return ja_label
|
|
en_label = wikidata.get('wikidata_label_en')
|
|
if en_label:
|
|
return en_label
|
|
|
|
return None
|
|
|
|
|
|
def get_location_info(entry: dict) -> dict:
    """Extract location information for search context.

    Prefers the top-level ``location`` mapping; each missing field falls
    back to the first element of ``original_entry.locations``. The
    country defaults to 'JP' when neither source provides one.

    Returns:
        dict with keys 'city', 'region', 'country' (values may be None
        except 'country', which is always set).
    """
    location = entry.get('location', {})
    # Evaluate the nested lookup once instead of twice; `or []` also
    # guards against an explicit None value for 'locations'.
    orig_locations = entry.get('original_entry', {}).get('locations') or []
    orig_loc = orig_locations[0] if orig_locations else {}

    return {
        'city': location.get('city') or orig_loc.get('city'),
        'region': location.get('region') or orig_loc.get('region'),
        'country': location.get('country') or orig_loc.get('country') or 'JP',
    }
|
|
|
|
|
|
def generate_search_query(entry: dict) -> str | None:
    """Generate optimal search query for Linkup.

    The query is assembled from: the institution name, an alternate
    Japanese Wikidata label (when different), city and region, a
    bilingual institution-type hint, and a fixed country/intent suffix.
    Returns None when no usable name can be extracted.
    """
    name = get_custodian_name(entry)
    if not name:
        return None

    terms = [name]

    # Alternate Japanese label, only when it adds new information.
    ja_label = entry.get('wikidata_enrichment', {}).get('wikidata_label_ja')
    if ja_label and ja_label != name:
        terms.append(ja_label)

    # Location context narrows the search.
    location = get_location_info(entry)
    for field in ('city', 'region'):
        value = location.get(field)
        if value:
            terms.append(value)

    # Bilingual hint for the institution type.
    type_hints = {
        'LIBRARY': 'library 図書館',
        'MUSEUM': 'museum 博物館',
        'ARCHIVE': 'archive アーカイブ',
        'GALLERY': 'gallery ギャラリー',
    }
    inst_type = entry.get('original_entry', {}).get('institution_type', 'LIBRARY')
    hint = type_hints.get(inst_type)
    if hint:
        terms.append(hint)

    # Country and intent suffix.
    terms.append('Japan website official')

    return ' '.join(terms)
|
|
|
|
|
|
def is_valid_website_url(url: str) -> bool:
    """Check if URL is a valid institutional website.

    Rejects empty values, URLs matching any EXCLUDED_URL_PATTERNS entry
    (social media, map services, aggregators, ...), and anything that
    does not parse into a scheme plus network location.
    """
    if not url:
        return False

    # Reject known non-institutional hosts (case-insensitive).
    if any(re.search(pattern, url, re.IGNORECASE) for pattern in EXCLUDED_URL_PATTERNS):
        return False

    # Structural check: must have both a scheme and a host.
    try:
        parts = urlparse(url)
    except Exception:
        return False
    return bool(parts.scheme and parts.netloc)
|
|
|
|
|
|
def score_url(url: str, entry: dict) -> float:
    """Score URL relevance (0.0-1.0).

    Heuristics: a .jp host (and government .go.jp/.lg.jp hosts) for
    Japanese institutions, a host containing institution-name tokens,
    and a shallow path (homepage rather than a deep link) each raise
    the score above the 0.5 base.

    Args:
        url: Candidate website URL.
        entry: Custodian entry supplying location and name context.

    Returns:
        Relevance score, clamped to at most 1.0.
    """
    score = 0.5  # Base score

    parsed = urlparse(url)
    # Use .hostname (already lowercased, port and userinfo stripped) so
    # suffix checks see only the host itself.
    host = parsed.hostname or ''

    # Prefer .jp domains for Japanese institutions. Suffix matching, not
    # substring matching: '.jp' in netloc would wrongly match hosts like
    # 'www.jpmorgan.com'.
    if entry.get('location', {}).get('country') == 'JP':
        if host.endswith('.jp'):
            score += 0.2
        if host.endswith(('.go.jp', '.lg.jp')):
            score += 0.1  # Government domains

    # Prefer hosts containing institution name parts (tokens > 3 chars).
    name = get_custodian_name(entry) or ''
    name_parts = [p.lower() for p in re.split(r'\s+', name) if len(p) > 3]
    for part in name_parts:
        if part in host:
            score += 0.1

    # Prefer shorter paths (homepage vs deep link).
    path_depth = len([p for p in parsed.path.split('/') if p])
    if path_depth <= 2:
        score += 0.1

    return min(score, 1.0)
|
|
|
|
|
|
def update_custodian_file(filepath: Path, website_url: str, discovery_method: str = 'linkup_search',
                          confidence: float = 0.9, search_query: str | None = None) -> bool:
    """Update custodian YAML file with discovered website.

    Loads the YAML, attaches a ``website_discovery`` mapping (URL, UTC
    timestamp, method, confidence score, and optionally the search query
    used), then writes the file back in place.

    Returns:
        True on success, False on any read/parse/write failure (logged).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)
        if not entry:
            logger.error(f"Invalid file: {filepath}")
            return False

        # Build the discovery record, then attach it in one assignment.
        discovery = {
            'website_url': website_url,
            'discovery_date': datetime.now(timezone.utc).isoformat(),
            'discovery_method': discovery_method,
            'confidence_score': confidence,
        }
        if search_query:
            discovery['search_query'] = search_query
        entry['website_discovery'] = discovery

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        logger.info(f"Updated: {filepath.name} → {website_url}")
        return True
    except Exception as e:
        logger.error(f"Failed to update {filepath}: {e}")
        return False
|
|
|
|
|
|
def list_files_without_websites(pattern: str = "JP-*.yaml", limit: int | None = None) -> list:
    """List custodian files that don't have websites.

    Scans CUSTODIAN_DIR for files matching *pattern* (sorted), skipping
    entries that already record a website. Stops once *limit* results
    are collected. Unreadable or unparsable files are logged and skipped.

    Returns:
        List of dicts with keys 'filepath', 'filename', 'ghcid', 'name'.
    """
    found = []

    for filepath in sorted(CUSTODIAN_DIR.glob(pattern)):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            if entry and not has_website(entry):
                found.append({
                    'filepath': str(filepath),
                    'filename': filepath.name,
                    'ghcid': filepath.stem,
                    'name': get_custodian_name(entry) or filepath.stem,
                })

            # limit=None (or 0) means "no limit".
            if limit and len(found) >= limit:
                break
        except Exception as e:
            logger.warning(f"Error reading {filepath}: {e}")

    return found
|
|
|
|
|
|
def generate_queries(pattern: str = "JP-*.yaml", limit: int | None = 10) -> list:
    """Generate search queries for files without websites.

    Re-reads each candidate file reported by list_files_without_websites
    and builds a Linkup query via generate_search_query. Files that
    yield no query (no usable name) are skipped.

    Returns:
        List of dicts with keys 'ghcid', 'filename', 'name',
        'search_query'.
    """
    queries = []

    for item in list_files_without_websites(pattern, limit):
        with open(Path(item['filepath']), 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        query = generate_search_query(entry)
        if not query:
            continue
        queries.append({
            'ghcid': item['ghcid'],
            'filename': item['filename'],
            'name': item['name'],
            'search_query': query,
        })

    return queries
|
|
|
|
|
|
def main():
    """CLI entry point.

    Modes (mutually exclusive, checked in order): --list, --generate-queries
    (optionally with --output), --update GHCID --url URL, --batch-update
    FILE. With no mode flag, prints help.
    """
    parser = argparse.ArgumentParser(description='Website Discovery using Linkup MCP')
    parser.add_argument('--list', action='store_true', help='List files without websites')
    parser.add_argument('--generate-queries', action='store_true', help='Generate search queries')
    parser.add_argument('--limit', type=int, default=10, help='Limit number of files to process')
    parser.add_argument('--pattern', type=str, default='JP-*.yaml', help='File pattern to match')
    parser.add_argument('--update', type=str, help='Update specific file (GHCID)')
    parser.add_argument('--url', type=str, help='Website URL to add')
    parser.add_argument('--batch-update', type=str, help='Batch update from JSON file')
    parser.add_argument('--output', type=str, help='Output file for queries JSON')

    args = parser.parse_args()

    if args.list:
        files = list_files_without_websites(args.pattern, args.limit)
        print(f"\n=== Files Without Websites ({len(files)} found) ===\n")
        for item in files:
            print(f"  {item['ghcid']}: {item['name'][:60]}")
        print(f"\nTotal: {len(files)} files need website discovery")

    elif args.generate_queries:
        queries = generate_queries(args.pattern, args.limit)
        print(f"\n=== Search Queries ({len(queries)} generated) ===\n")
        for q in queries:
            print(f"GHCID: {q['ghcid']}")
            print(f"Name: {q['name'][:60]}")
            print(f"Query: {q['search_query']}")
            print()

        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                json.dump(queries, f, indent=2, ensure_ascii=False)
            print(f"Saved to: {args.output}")

    elif args.update or args.url:
        # The two flags only make sense together; previously a lone
        # --update or --url silently fell through to the help text.
        if not (args.update and args.url):
            parser.error('--update and --url must be used together')

        # Find the file; glob may match several — take the first.
        matches = list(CUSTODIAN_DIR.glob(f"{args.update}*.yaml"))
        if not matches:
            print(f"Error: No file found matching {args.update}")
            sys.exit(1)
        filepath = matches[0]

        # Warn (but proceed) when the URL looks like a non-institutional site.
        if not is_valid_website_url(args.url):
            print(f"Warning: URL may not be a valid institutional website: {args.url}")

        if update_custodian_file(filepath, args.url):
            print(f"✅ Updated {filepath.name} with {args.url}")
        else:
            print(f"❌ Failed to update {filepath.name}")
            sys.exit(1)

    elif args.batch_update:
        with open(args.batch_update, 'r', encoding='utf-8') as f:
            updates = json.load(f)

        success = 0
        failed = 0
        for item in updates:
            ghcid = item.get('ghcid')
            url = item.get('website_url')
            if not ghcid or not url:
                # Skip incomplete records without counting them as failures.
                continue

            matches = list(CUSTODIAN_DIR.glob(f"{ghcid}*.yaml"))
            if matches:
                if update_custodian_file(matches[0], url,
                                         search_query=item.get('search_query'),
                                         confidence=item.get('confidence', 0.9)):
                    success += 1
                else:
                    failed += 1
            else:
                logger.warning(f"File not found: {ghcid}")
                failed += 1

        print("\n=== Batch Update Complete ===")
        print(f"Success: {success}")
        print(f"Failed: {failed}")

    else:
        parser.print_help()
|
|
|
|
|
|
# Run the CLI only when executed as a script (not on import).
if __name__ == '__main__':
    main()
|