# glam/scripts/fetch_remaining_custodians.py

#!/usr/bin/env python3
"""
Fetch remaining custodian profile details from KIEN using Playwright.

This script fetches contact information (website, email, phone) for
custodians that haven't been processed yet.

Author: GLAM Project
Date: 2025-12-04
"""

import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path

from playwright.async_api import async_playwright, Page

# Configuration
# Directory that holds the input list (kien_custodians_cleaned.json), the
# merged output (custodian_profiles.json) and the periodic progress dumps.
# NOTE(review): this is an absolute, machine-specific path; the script only
# works where this directory exists — consider deriving it from the repo root.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/intangible_heritage")
# Pause between consecutive profile fetches, in milliseconds (main() divides
# by 1000 before sleeping).
DELAY_MS = 1500  # Delay between requests

async def fetch_custodian_profile(page: Page, name: str, url: str) -> dict:
    """Fetch detailed profile for a custodian from their KIEN page.

    Navigates ``page`` to ``url`` and runs a JavaScript snippet in the page
    to scrape contact details and linked heritage forms.

    Args:
        page: Playwright page used for the navigation; it is reused by the
            caller, so it is left open.
        name: Custodian name, copied verbatim into the result.
        url: KIEN profile URL to load, copied into the result as ``kien_url``.

    Returns:
        A dict with the keys ``name``, ``kien_url``, ``website``, ``email``,
        ``phone``, ``address``, ``description``, ``heritage_forms_linked``
        and ``fetched_at`` (UTC ISO-8601 timestamp). ``address`` is never
        populated by this function and stays ``None``. If anything fails, the
        fields collected so far are returned together with an ``error`` key
        holding the exception message; the exception is not re-raised.
    """
    result = {
        'name': name,
        'kien_url': url,
        'website': None,
        'email': None,
        'phone': None,
        'address': None,
        'description': None,
        'heritage_forms_linked': [],
        'fetched_at': datetime.now(timezone.utc).isoformat()
    }

    try:
        await page.goto(url, wait_until='networkidle', timeout=30000)
        # Short grace period after network idle before scraping the DOM.
        await asyncio.sleep(1)

        # Extract contact information using JavaScript.
        # NOTE: this is a non-raw Python string, so every regex backslash in
        # the JavaScript below is doubled on purpose.
        data = await page.evaluate('''() => {
            const KIEN_BASE = 'https://www.immaterieelerfgoed.nl';
            const result = {
                website: null,
                email: null,
                phone: null,
                description: null,
                heritageForms: []
            };

            const text = document.body.innerText;

            // Get description from meta, falling back to the first paragraph
            // when the meta tag is missing or empty
            const metaDesc = document.querySelector('meta[name="description"]');
            const metaContent = metaDesc ? (metaDesc.getAttribute('content') || '').trim() : '';
            if (metaContent) {
                result.description = metaContent;
            } else {
                const firstP = document.querySelector('main p, article p, .content p');
                if (firstP) {
                    result.description = firstP.textContent.trim().substring(0, 500);
                }
            }

            // Find website link: the first external link that is neither
            // KIEN itself nor social media. All candidates are scanned so a
            // leading social-media link does not hide a real website.
            const socialDomains = ['facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com', 'youtube.com'];
            const externalLinks = document.querySelectorAll('a[href^="http"]:not([href*="immaterieelerfgoed.nl"])');
            for (const link of externalLinks) {
                const href = link.getAttribute('href');
                if (!socialDomains.some(d => href.includes(d))) {
                    result.website = href;
                    break;
                }
            }

            // Find email - look for mailto: links first
            const emailLink = document.querySelector('a[href^="mailto:"]');
            if (emailLink) {
                // Drop any "?subject=..." style parameters after the address
                const address = emailLink.getAttribute('href').replace('mailto:', '').split('?')[0].trim();
                result.email = address || null;
            } else {
                // Try regex in text
                const emailMatch = text.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/);
                if (emailMatch) {
                    result.email = emailMatch[0];
                }
            }

            // Find phone - look for tel: links first
            const phoneLink = document.querySelector('a[href^="tel:"]');
            if (phoneLink) {
                result.phone = phoneLink.getAttribute('href').replace('tel:', '');
            } else {
                // Try regex for Dutch phone numbers
                const phoneMatch = text.match(/(\\+31|0)[0-9\\s\\-]{9,}/);
                if (phoneMatch) {
                    result.phone = phoneMatch[0].replace(/\\s/g, '').replace(/-/g, '');
                }
            }

            // Find linked heritage forms
            const heritageLinks = document.querySelectorAll('a[href*="/nl/"][href*="/page/"]');
            const seenForms = new Set();
            heritageLinks.forEach(link => {
                const href = link.getAttribute('href');
                const name = link.textContent.trim();
                // Skip navigation-like links
                if (name.length > 5 && !seenForms.has(name) &&
                    !href.includes('kennisbank') && !href.includes('contact') &&
                    !href.includes('partner') && !href.includes('publicaties')) {
                    // Resolve against the KIEN origin so that hrefs which are
                    // already absolute are not prefixed a second time
                    let absoluteUrl;
                    try {
                        absoluteUrl = new URL(href, KIEN_BASE).href;
                    } catch (e) {
                        return;  // malformed href: skip this link
                    }
                    seenForms.add(name);
                    result.heritageForms.push({
                        name: name,
                        url: absoluteUrl
                    });
                }
            });

            return result;
        }''') or {}

        result['website'] = data.get('website')
        result['email'] = data.get('email')
        result['phone'] = data.get('phone')
        result['description'] = data.get('description')
        result['heritage_forms_linked'] = data.get('heritageForms', [])

    except Exception as e:
        # Best-effort scraping: record the failure instead of aborting the
        # whole batch. Some exceptions have an empty message, so fall back to
        # the class name to keep the error visible to the caller.
        result['error'] = str(e) or type(e).__name__

    return result


async def main():
    """Main function to fetch remaining custodian profiles.

    Steps:
      1. Load ``custodian_profiles.json`` from ``OUTPUT_DIR`` (if present)
         and collect the ``kien_url`` of every profile already stored.
      2. Load ``kien_custodians_cleaned.json`` and keep the custodians whose
         ``kien_url`` is not among those.
      3. Fetch each remaining profile sequentially with one headless Chromium
         page, sleeping ``DELAY_MS`` between requests and dumping the new
         results to ``profiles_progress_<i>.json`` every 25 items.
      4. Append the new results to the existing profiles, rewrite
         ``custodian_profiles.json`` and print a summary.

    Profiles whose fetch failed are stored too (with an ``error`` key), so
    they count as already fetched on the next run.

    Raises:
        FileNotFoundError: if ``kien_custodians_cleaned.json`` is missing.
        KeyError: if the cleaned list or one of its entries lacks an
            expected key (``custodians``, ``name``, ``kien_url``).
    """
    print("=" * 60)
    print("KIEN Custodian Profile Fetcher")
    print("=" * 60)
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    # Load existing profiles once; the same data is reused for the merge.
    # The files are written as UTF-8 with ensure_ascii=False, so they must be
    # read back as UTF-8 regardless of the platform default encoding.
    profiles_file = OUTPUT_DIR / "custodian_profiles.json"
    if profiles_file.exists():
        with open(profiles_file, encoding='utf-8') as f:
            existing = json.load(f)
    else:
        existing = {'custodians': []}

    existing_profiles = existing.get('custodians', [])
    # Tolerate stored records without a URL instead of crashing on them.
    fetched_urls = {c['kien_url'] for c in existing_profiles if 'kien_url' in c}

    print(f"Already fetched: {len(fetched_urls)} profiles")

    # Load all custodians list
    cleaned_file = OUTPUT_DIR / "kien_custodians_cleaned.json"
    with open(cleaned_file, encoding='utf-8') as f:
        all_custodians = json.load(f)['custodians']

    # Filter to remaining
    remaining = [c for c in all_custodians if c['kien_url'] not in fetched_urls]
    total = len(remaining)
    print(f"Remaining to fetch: {total} profiles")
    print()

    if not remaining:
        print("No remaining profiles to fetch!")
        return

    # Fetch remaining profiles
    results = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="GLAM-Heritage-Research/1.0 (Academic Research)"
            )
            page = await context.new_page()

            for i, custodian in enumerate(remaining, 1):
                name = custodian['name']
                url = custodian['kien_url']

                print(f"[{i}/{total}] {name}")

                profile = await fetch_custodian_profile(page, name, url)
                results.append(profile)

                # Show what we found
                if profile.get('website'):
                    print(f"    → Website: {profile['website']}")
                if profile.get('email'):
                    print(f"    → Email: {profile['email']}")
                if profile.get('phone'):
                    print(f"    → Phone: {profile['phone']}")
                if profile.get('error'):
                    print(f"    ⚠ Error: {profile['error']}")

                # Rate limiting
                await asyncio.sleep(DELAY_MS / 1000)

                # Progress save every 25 items (new results only, not merged)
                if i % 25 == 0:
                    progress_file = OUTPUT_DIR / f"profiles_progress_{i}.json"
                    with open(progress_file, 'w', encoding='utf-8') as f:
                        json.dump(results, f, ensure_ascii=False, indent=2)
                    print(f"  [PROGRESS] Saved {i} profiles")
        finally:
            # Close the browser even if the loop is interrupted by an error.
            await browser.close()

    # Merge with existing profiles
    print("\nMerging with existing profiles...")

    all_profiles = existing_profiles + results

    output = {
        'extracted_at': datetime.now(timezone.utc).isoformat(),
        'source': 'https://www.immaterieelerfgoed.nl',
        'total_custodians': len(all_profiles),
        'custodians': all_profiles
    }

    with open(profiles_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    # Summary
    websites = sum(1 for c in all_profiles if c.get('website'))
    emails = sum(1 for c in all_profiles if c.get('email'))
    phones = sum(1 for c in all_profiles if c.get('phone'))

    print("\n" + "=" * 60)
    print("FETCH COMPLETE")
    print("=" * 60)
    print(f"New profiles fetched: {len(results)}")
    print(f"Total profiles: {len(all_profiles)}")
    print(f"  With website: {websites}")
    print(f"  With email: {emails}")
    print(f"  With phone: {phones}")
    print(f"\nSaved to: {profiles_file}")


# Run the async entry point only when executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())