# glam/scripts/scrape_kien_profiles_for_locations.py
# 2025-12-05 15:30:23 +01:00
#
# 500 lines
# 19 KiB
# Python
#!/usr/bin/env python3
"""
Scrape KIEN profile pages to extract location/address information.
KIEN (Kenniscentrum Immaterieel Erfgoed Nederland) profile pages often contain
contact information including addresses. This script scrapes those pages to
find location data for KIEN entries that don't have locations yet.
The KIEN profile pages are at:
https://www.immaterieelerfgoed.nl/nl/page/{id}/{slug}
This script:
1. Finds all KIEN entries without locations
2. Scrapes their KIEN profile pages
3. Extracts address/location information
4. Updates the entry files with the discovered locations
"""
import argparse
import re
import sqlite3
import time
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List
from urllib.parse import urlparse
# Paths
# NOTE(review): absolute, user-specific paths — consider making these configurable.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
GEONAMES_DB = Path('/Users/kempersc/apps/glam/data/reference/geonames.db')
# Reverse mapping - admin1 code to province code
# Maps GeoNames NL admin1 codes to two-letter province abbreviations
# (presumably e.g. '07' -> 'NH' Noord-Holland — verify against GeoNames admin1 table).
# Codes not listed here fall back to 'XX' in lookup_city_in_geonames.
ADMIN1_TO_PROVINCE = {
    '01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR',
    '05': 'LI', '06': 'NB', '07': 'NH', '09': 'UT',
    '10': 'ZE', '11': 'ZH', '15': 'OV', '16': 'FL',
}
def get_kien_entries_without_locations() -> List[Dict[str, Any]]:
    """Collect KIEN entries lacking both location data and a current GHCID.

    Scans the enriched entry files, keeps only entries that originate from
    KIEN (by provenance source key or by 'kien'/'immaterieelerfgoed' in the
    original 'systeem' field), and returns a summary dict per matching entry.
    """
    results: List[Dict[str, Any]] = []
    # KIEN entries live in the 17xx and 18xx index ranges.
    for glob_pattern in ('17*.yaml', '18*.yaml'):
        for path in ENTRIES_DIR.glob(glob_pattern):
            with open(path, 'r', encoding='utf-8') as handle:
                try:
                    data = yaml.safe_load(handle)
                except yaml.YAMLError:
                    # Unparseable entry file: skip silently.
                    continue
            if not data:
                continue
            # A KIEN entry either lists 'kien' among its provenance sources
            # or mentions kien/immaterieelerfgoed in its source system field.
            sources = data.get('provenance', {}).get('sources', {})
            systeem = str(data.get('original_entry', {}).get('systeem', '')).lower()
            is_kien = ('kien' in sources
                       or 'kien' in systeem
                       or 'immaterieelerfgoed' in systeem)
            if not is_kien:
                continue
            # Skip entries that already have a location or a current GHCID.
            if data.get('locations'):
                continue
            if data.get('ghcid', {}).get('ghcid_current'):
                continue
            site = (data.get('contact', {}).get('website')
                    or data.get('original_entry', {}).get('webadres_organisatie'))
            results.append({
                'path': path,
                'entry_index': data.get('entry_index'),
                'org_name': data.get('original_entry', {}).get('organisatie', ''),
                'kien_url': data.get('kien_enrichment', {}).get('kien_url', ''),
                'website': site,
                'entry': data,
            })
    return results
def lookup_city_in_geonames(city_name: str, country_code: str = 'NL') -> Optional[Dict[str, Any]]:
    """Look up a populated place in the local GeoNames SQLite database.

    Args:
        city_name: City name, matched case-insensitively against both the
            native and the ASCII spelling.
        country_code: ISO country filter (default 'NL').

    Returns:
        Dict with the GeoNames columns plus a derived ``region_code``
        (province abbreviation, 'XX' for unmapped admin1 codes), or None
        when the database file is missing or no populated place matches.
    """
    if not GEONAMES_DB.exists():
        return None
    conn = sqlite3.connect(GEONAMES_DB)
    try:
        cursor = conn.cursor()
        # Most populous match wins — disambiguates duplicate place names.
        cursor.execute("""
        SELECT geonames_id, name, ascii_name, admin1_code, latitude, longitude,
        population, feature_code
        FROM cities
        WHERE country_code = ?
        AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
        AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY population DESC
        LIMIT 1
        """, (country_code, city_name, city_name))
        row = cursor.fetchone()
    finally:
        # Fix: always release the connection, even if the query raises
        # (the original leaked the connection on error).
        conn.close()
    if row is None:
        return None
    keys = ('geonames_id', 'name', 'ascii_name', 'admin1_code', 'latitude',
            'longitude', 'population', 'feature_code')
    result: Dict[str, Any] = dict(zip(keys, row))
    result['region_code'] = ADMIN1_TO_PROVINCE.get(row[3], 'XX')
    return result
# Navigation/UI words that look like all-caps city candidates but never are.
_CAPS_STOPWORDS = frozenset([
    'CONTACT', 'MEDIA', 'IMMATERIEEL', 'ERFGOED', 'WEBSITE', 'NEDERLAND',
    'HOME', 'OVER', 'NIEUWS', 'AGENDA', 'ENGLISH', 'DUTCH', 'MENU',
])
# Title-case words that look like city candidates but never are.
# (Fix: the original rebuilt this list on every loop iteration.)
_TITLE_STOPWORDS = frozenset([
    'Contact', 'Media', 'Website', 'Email', 'Telefoon', 'Facebook',
    'Twitter', 'Instagram', 'YouTube', 'LinkedIn', 'Home', 'Over',
    'Nieuws', 'Agenda', 'English', 'Dutch', 'Menu', 'Zoeken',
    'Stichting', 'Vereniging', 'Nederland', 'Nederlands', 'Meer',
])


def _candidates_from_contact_section(soup: Any) -> List[str]:
    """Candidate city lines from the first 'Contact' <h3> section.

    The first 'Contact' heading belongs to the organization; a second one
    (KIEN's own, in the footer) is ignored. Scanning stops at KIEN's own
    address markers so the footer address is never picked up.
    """
    candidates: List[str] = []
    headings = [h for h in soup.find_all('h3') if h.get_text(strip=True) == 'Contact']
    if not headings:
        return candidates
    parent = headings[0].parent
    if not parent:
        return candidates
    for line in parent.get_text(separator='\n', strip=True).split('\n'):
        line = line.strip()
        if not line:
            continue
        # KIEN's footer address ("Postbus 649, 6800 AP Arnhem") starts here.
        if 'Kenniscentrum' in line or 'Postbus' in line or '6800 AP' in line:
            break
        # Plausible place-name length only.
        if 2 < len(line) < 50:
            if line.isupper():
                # All-caps city, e.g. "KAMPEN"
                candidates.append(line)
            elif re.match(r'^[A-Z][a-z]+(?:\s+[a-z]+)*$|^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*$', line):
                # Title-case city, e.g. "Hattem", "Den Haag"
                candidates.append(line)
            # Postal code + city, e.g. "8261 AB Kampen"
            if re.match(r'\d{4}\s?[A-Z]{2}', line):
                candidates.append(line)
    return candidates


def _candidates_from_page_text(soup: Any) -> List[str]:
    """Candidate city words from all page text above KIEN's footer."""
    # The footer always begins with KIEN's full name; text before it is the org's.
    org_section = soup.get_text().split('Kenniscentrum Immaterieel Erfgoed')[0]
    candidates: List[str] = []
    # All-caps words (3+ chars) that might be city names.
    for word in re.findall(r'\b([A-Z]{3,}(?:\s+[A-Z]+)?)\b', org_section):
        if word not in _CAPS_STOPWORDS:
            candidates.append(word)
    # Title-case words that could be Dutch city names.
    for word in re.findall(r'\b([A-Z][a-z]{2,}(?:[\s\-][A-Za-z]+)?)\b', org_section):
        if word not in _TITLE_STOPWORDS:
            candidates.append(word)
    return candidates


def _resolve_candidate(candidate: str) -> Optional[Dict[str, Any]]:
    """Validate one candidate against GeoNames; return a location dict or None."""
    # Candidate may be "1234 AB City" or just a (possibly all-caps) city name.
    match = re.search(r'(\d{4}\s?[A-Z]{2})\s+([A-Za-z][A-Za-z\-\'\s]+)', candidate)
    if match:
        postal_code: Optional[str] = match.group(1).replace(' ', '')
        city = match.group(2).strip().title()
    else:
        postal_code = None
        city = candidate.strip().title()  # "KAMPEN" -> "Kampen"
    # Normalize whitespace and drop anything after a comma.
    city = re.sub(r'\s+', ' ', city)
    city = city.split(',')[0].strip()
    if len(city) < 3:
        return None
    geonames = lookup_city_in_geonames(city)
    if not geonames:
        return None
    result: Dict[str, Any] = {
        'city': geonames['name'],
        'country': 'NL',
        'latitude': geonames['latitude'],
        'longitude': geonames['longitude'],
        'geonames_id': geonames['geonames_id'],
        'region_code': geonames['region_code'],
        'extraction_method': 'KIEN_PROFILE_CONTACT_SECTION',
    }
    if postal_code:
        result['postal_code'] = postal_code
    return result


def extract_address_from_html(html: str) -> Optional[Dict[str, Any]]:
    """
    Extract address information from KIEN profile page HTML.

    The KIEN profile pages have a specific structure:
    - Organization contact info is in a "Contact" section near the top
    - KIEN's own contact info is in the footer (ignored)
    The organization's location often appears as just a city name in all caps
    (e.g., "KAMPEN") in the contact section.

    Candidates are gathered from the contact section first, then from the
    rest of the page text; the first candidate that validates against
    GeoNames wins. Returns a dict with extracted address components or None.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        print("Warning: BeautifulSoup not installed. Run: pip install beautifulsoup4")
        return None
    soup = BeautifulSoup(html, 'html.parser')
    # Strategy 1 candidates take precedence over strategy 2 candidates.
    candidates = _candidates_from_contact_section(soup)
    candidates += _candidates_from_page_text(soup)
    for candidate in candidates:
        location = _resolve_candidate(candidate)
        if location:
            return location
    return None
def scrape_kien_profile(url: str, page: Any) -> Optional[str]:
    """
    Scrape a KIEN profile page using Playwright.

    Args:
        url: Profile page URL.
        page: Playwright ``Page`` (or compatible) object used for navigation.

    Returns:
        The rendered HTML content, or None on navigation failure.
    """
    try:
        page.goto(url, wait_until='networkidle', timeout=30000)
        time.sleep(1)  # Wait for any dynamic content
        return page.content()
    except Exception as e:
        # Fix: type(e).__name__ is already a str — the original wrapped it in
        # a redundant str() call. Match on the name so Playwright's
        # TimeoutError (any variant) gets the short message.
        if 'Timeout' in type(e).__name__:
            print(f" Timeout for {url}")
        else:
            print(f" Error scraping {url}: {e}")
        return None
def scrape_org_website(url: str, page: Any) -> Optional[str]:
    """
    Scrape an organization's own website for contact/address info.
    Returns HTML content or None on failure.

    Loads the homepage first, then follows the first link whose href or
    text suggests a contact/about page; if that navigation fails, the
    homepage HTML is returned as a fallback.
    """
    # 'null' appears when a missing YAML field was serialized as a literal string.
    if not url or url == 'null':
        return None
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return None
    # Normalize URL: stored website addresses often lack a scheme.
    if not url.startswith('http'):
        url = 'https://' + url
    try:
        page.goto(url, wait_until='networkidle', timeout=30000)
        time.sleep(1)
        # Try to find contact/about page
        html = page.content()
        soup = BeautifulSoup(html, 'html.parser')
        # Look for contact links (terms matched against both href and link text)
        contact_links: List[str] = []
        for link in soup.find_all('a', href=True):
            href_attr = link.get('href')
            if href_attr:
                href = str(href_attr).lower()
                text = link.get_text(strip=True).lower()
                if any(term in href or term in text for term in ['contact', 'over', 'about', 'adres', 'locatie']):
                    contact_links.append(str(href_attr))
        # If we find contact page, navigate there
        if contact_links:
            for contact_href in contact_links[:1]:  # Try first contact link
                if not contact_href.startswith('http'):
                    # Make relative URL absolute
                    parsed = urlparse(url)
                    if contact_href.startswith('/'):
                        contact_href = f"{parsed.scheme}://{parsed.netloc}{contact_href}"
                    else:
                        contact_href = f"{url.rstrip('/')}/{contact_href}"
                try:
                    page.goto(contact_href, wait_until='networkidle', timeout=20000)
                    time.sleep(1)
                    return page.content()
                except Exception:
                    # Contact page failed to load; fall through to homepage HTML.
                    pass
        return html
    except Exception as e:
        print(f" Error scraping {url}: {e}")
        return None
def update_entry_with_location(entry_path: Path, location: Dict[str, Any],
                               extraction_method: str, source_url: str,
                               dry_run: bool = True) -> bool:
    """Write a discovered location plus resolution metadata into an entry file.

    In dry-run mode the entry is loaded but nothing is written back.
    Returns True unconditionally (kept for caller symmetry).
    """
    with open(entry_path, 'r', encoding='utf-8') as handle:
        entry = yaml.safe_load(handle)
    # Minimal location record; postal code only when one was actually found.
    loc_record: Dict[str, Any] = {
        'city': location['city'],
        'country': 'NL',
        'latitude': location.get('latitude'),
        'longitude': location.get('longitude'),
    }
    if 'postal_code' in location:
        loc_record['postal_code'] = location['postal_code']
    # How / where / when the location was resolved — kept for auditing.
    resolution = {
        'method': extraction_method,
        'source_url': source_url,
        'geonames_id': location.get('geonames_id'),
        'geonames_name': location.get('city'),
        'region_code': location.get('region_code'),
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
    }
    if dry_run:
        return True
    entry['locations'] = [loc_record]
    entry['location_resolution'] = resolution
    # Ensure provenance.notes exists, then record where the location came from.
    notes = entry.setdefault('provenance', {}).setdefault('notes', [])
    notes.append(f"Location extracted from {extraction_method} - {source_url}")
    with open(entry_path, 'w', encoding='utf-8') as handle:
        yaml.dump(entry, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)
    return True
def main() -> None:
    """CLI entry point: scrape KIEN profiles (and org websites) for locations.

    For each KIEN entry without a location, tries the KIEN profile page
    first, then the organization's own website, and writes any discovered
    location back to the entry file (unless --dry-run). Prints a summary.
    """
    parser = argparse.ArgumentParser(description='Scrape KIEN profiles for location data')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes')
    parser.add_argument('--limit', type=int, default=10, help='Limit number of entries to process')
    parser.add_argument('--skip-kien', action='store_true', help='Skip KIEN profile scraping')
    parser.add_argument('--skip-website', action='store_true', help='Skip organization website scraping')
    args = parser.parse_args()
    # Playwright is imported lazily so the script can print a helpful hint.
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print("Error: Playwright not installed. Run: pip install playwright && playwright install chromium")
        return
    # Get entries without locations
    entries = get_kien_entries_without_locations()
    print(f"Found {len(entries)} KIEN entries without locations")
    if args.limit:
        entries = entries[:args.limit]
        print(f"Processing first {args.limit} entries")
    extracted: List[Dict[str, Any]] = []  # entries where a location was found
    failed: List[Dict[str, Any]] = []     # entries where both strategies failed
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        # Desktop user agent: some sites serve different markup to headless UAs.
        context = browser.new_context(
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        )
        page = context.new_page()
        for i, entry_info in enumerate(entries):
            print(f"\n[{i+1}/{len(entries)}] {entry_info['org_name']}")
            location: Optional[Dict[str, Any]] = None
            source_url: Optional[str] = None
            method: Optional[str] = None
            # Try KIEN profile first
            if not args.skip_kien and entry_info['kien_url']:
                print(f" Scraping KIEN profile: {entry_info['kien_url']}")
                html = scrape_kien_profile(entry_info['kien_url'], page)
                if html:
                    location = extract_address_from_html(html)
                    if location:
                        source_url = entry_info['kien_url']
                        method = 'KIEN_PROFILE_SCRAPE'
                        print(f" ✓ Found location: {location['city']}")
            # Try organization website (only if KIEN profile yielded nothing)
            if not location and not args.skip_website and entry_info['website']:
                print(f" Scraping website: {entry_info['website']}")
                html = scrape_org_website(entry_info['website'], page)
                if html:
                    location = extract_address_from_html(html)
                    if location:
                        source_url = entry_info['website']
                        method = 'ORG_WEBSITE_SCRAPE'
                        print(f" ✓ Found location: {location['city']}")
            if location and source_url and method:
                extracted.append({
                    'entry_index': entry_info['entry_index'],
                    'org_name': entry_info['org_name'],
                    'location': location,
                    'source': source_url,
                    'method': method,
                })
                # Update the entry file (no-op write when --dry-run)
                update_entry_with_location(
                    entry_info['path'],
                    location,
                    method,
                    source_url,
                    dry_run=args.dry_run
                )
            else:
                failed.append({
                    'entry_index': entry_info['entry_index'],
                    'org_name': entry_info['org_name'],
                    'kien_url': entry_info['kien_url'],
                    'website': entry_info['website'],
                })
                print(" ✗ No location found")
            # Rate limiting: be polite to the scraped sites.
            time.sleep(2)
        browser.close()
    # Summary
    print(f"\n{'[DRY RUN] ' if args.dry_run else ''}Summary:")
    print(f" - Locations extracted: {len(extracted)}")
    print(f" - No location found: {len(failed)}")
    if extracted:
        print("\nExtracted locations:")
        for e in extracted:
            print(f" {e['org_name']}{e['location']['city']} ({e['method']})")
    if failed:
        print("\nFailed to find location:")
        for f in failed:
            print(f" {f['org_name']}")
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
    main()