glam/scripts/scrapers/enrich_bayern_museums.py
2025-11-21 22:12:33 +01:00

348 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Bavaria Museum Metadata Enrichment
Enriches the 1,231 Bayern museum records by scraping detail pages from:
http://www.museen-in-deutschland.de/
Extracts:
- Full street addresses
- Postal codes
- Phone numbers
- Email addresses
- Website URLs
- Opening hours (goal only — not available in the ISIL registry format; see note in enrich_museum)
- Extended descriptions (likewise not available; the registry focuses on contact data)
Author: OpenCode AI Agent
Date: 2025-11-20
Status: PRODUCTION - Enriching existing Bayern museum dataset
"""
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
# Configuration
BASE_URL = "http://www.museen-in-deutschland.de"
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
}
RATE_LIMIT_DELAY = 1.0 # Seconds between requests (be respectful)
def fetch_detail_page(url: str) -> Optional[str]:
"""Fetch a museum detail page HTML."""
try:
response = requests.get(url, headers=HEADERS, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'
return response.text
except requests.RequestException as e:
print(f" ✗ Error fetching {url}: {e}")
return None
def parse_detail_page(html: str) -> Dict[str, any]:
"""
Parse museum detail page to extract metadata.
The page uses icon prefixes:
🏘 = Museum name
Street address (no icon)
Postal code + City (no icon)
✆ = Phone
🖷 = Fax
🕸 = Website
⌖ = Coordinates
📧 = Email (often empty)
Returns dict with fields:
- street_address
- postal_code
- phone
- email
- website
- latitude
- longitude
- description
"""
soup = BeautifulSoup(html, 'html.parser')
metadata = {}
# Get clean text with line breaks preserved
page_text = soup.get_text(separator='\n')
# Extract address block (after museum name, before phone)
# Pattern: "Streetname Number\nPostal City"
address_match = re.search(r'🏘[^\n]+\n([^\n]+)\n(\d{5})\s+([^\n]+)', page_text)
if address_match:
metadata['street_address'] = address_match.group(1).strip()
metadata['postal_code'] = address_match.group(2).strip()
# City already in dataset
else:
# Fallback: Look for postal code pattern
postal_match = re.search(r'(\d{5})\s+([^\n]+)', page_text)
if postal_match:
metadata['postal_code'] = postal_match.group(1).strip()
# Try to find street on previous line
lines = page_text.split('\n')
for i, line in enumerate(lines):
if postal_match.group(0) in line and i > 0:
prev_line = lines[i-1].strip()
if prev_line and not any(x in prev_line for x in ['', '🖷', '🕸', '📧']):
metadata['street_address'] = prev_line
break
# Extract phone (after ✆ icon)
phone_match = re.search(r'\s*([+\d\s()/-]{8,25})', page_text)
if phone_match:
metadata['phone'] = phone_match.group(1).strip()
# Extract email (after 📧 icon or look for email pattern)
email_match = re.search(r'📧\s*([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z]{2,})', page_text)
if email_match:
metadata['email'] = email_match.group(1).strip()
else:
# Fallback: search anywhere
email_match = re.search(r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z]{2,})', page_text)
if email_match:
metadata['email'] = email_match.group(1).strip()
# Extract website (after 🕸 icon)
website_match = re.search(r'🕸\s*(https?://[^\s<>"]+)', page_text)
if website_match:
metadata['website'] = website_match.group(1).strip()
else:
# Fallback: search anywhere
website_match = re.search(r'(https?://[^\s<>"]+)', page_text)
if website_match:
url = website_match.group(1).strip()
# Exclude the isil.museum site itself
if 'isil.museum' not in url and 'museen-in-deutschland.de' not in url:
metadata['website'] = url
# Extract coordinates (after ⌖ icon)
coords_match = re.search(r'\s*([\d.]+),\s*([\d.]+)', page_text)
if coords_match:
metadata['latitude'] = float(coords_match.group(1))
metadata['longitude'] = float(coords_match.group(2))
return metadata
def enrich_museum(museum: Dict) -> Dict:
"""Enrich a single museum record with detail page data."""
# Check if museum has Registry identifier with detail URL
detail_url = None
for identifier in museum.get('identifiers', []):
if identifier.get('identifier_scheme') == 'Registry':
detail_url = identifier.get('identifier_value')
break
if not detail_url:
print(f" ⚠ No detail URL for: {museum['name']}")
return museum
print(f" → Enriching: {museum['name']} ({museum['locations'][0]['city']})")
# Fetch detail page
html = fetch_detail_page(detail_url)
if not html:
return museum
# Parse metadata
metadata = parse_detail_page(html)
# Update museum record
if metadata.get('street_address'):
museum['locations'][0]['street_address'] = metadata['street_address']
if metadata.get('postal_code'):
museum['locations'][0]['postal_code'] = metadata['postal_code']
if metadata.get('latitude') and metadata.get('longitude'):
museum['locations'][0]['latitude'] = metadata['latitude']
museum['locations'][0]['longitude'] = metadata['longitude']
if metadata.get('phone'):
# Add phone as identifier
museum['identifiers'].append({
'identifier_scheme': 'Phone',
'identifier_value': metadata['phone']
})
if metadata.get('email'):
# Add email as identifier
museum['identifiers'].append({
'identifier_scheme': 'Email',
'identifier_value': metadata['email']
})
if metadata.get('website'):
# Add/update website
has_website = any(i.get('identifier_scheme') == 'Website' for i in museum['identifiers'])
if not has_website:
museum['identifiers'].append({
'identifier_scheme': 'Website',
'identifier_value': metadata['website'],
'identifier_url': metadata['website']
})
# Note: Opening hours and extended descriptions not available in ISIL registry format
# Registry focuses on contact data and identifiers
# Update provenance
museum['provenance']['confidence_score'] = 0.95 # Higher confidence after enrichment
museum['provenance']['notes'] = f"Enriched with detail page data from {detail_url}"
# Log enrichment
enriched_fields = [k for k in metadata.keys() if metadata[k]]
print(f" ✓ Added: {', '.join(enriched_fields)}")
return museum
def main():
"""Enrich Bayern museums with detail page data."""
print("=" * 80)
print("Bavaria Museum Metadata Enrichment")
print("=" * 80)
print()
# Load existing dataset
input_file = Path("data/isil/germany/bayern_museums_20251120_213144.json")
if not input_file.exists():
print(f"✗ Input file not found: {input_file}")
print(" Please run harvest_isil_museum_bayern.py first")
return None
print(f"Loading: {input_file.name}")
with open(input_file, 'r', encoding='utf-8') as f:
museums = json.load(f)
print(f"✓ Loaded {len(museums)} museums")
print()
# Estimate time
total = len(museums)
estimated_time = (total * RATE_LIMIT_DELAY) / 60 # minutes
print(f"Estimated time: {estimated_time:.1f} minutes (rate limit: {RATE_LIMIT_DELAY}s per request)")
print()
# Enrich each museum
print("Starting enrichment...")
print()
enriched_museums = []
success_count = 0
fail_count = 0
for i, museum in enumerate(museums, 1):
print(f"[{i}/{total}]", end=" ")
original_fields_list = [
bool(museum['locations'][0].get('street_address')),
bool(museum['locations'][0].get('postal_code')),
any(id.get('identifier_scheme') == 'Phone' for id in museum.get('identifiers', [])),
any(id.get('identifier_scheme') == 'Email' for id in museum.get('identifiers', [])),
any(id.get('identifier_scheme') == 'Website' for id in museum.get('identifiers', []))
]
original_fields = sum(original_fields_list)
enriched = enrich_museum(museum)
enriched_museums.append(enriched)
enriched_fields_list = [
bool(enriched['locations'][0].get('street_address')),
bool(enriched['locations'][0].get('postal_code')),
any(id.get('identifier_scheme') == 'Phone' for id in enriched.get('identifiers', [])),
any(id.get('identifier_scheme') == 'Email' for id in enriched.get('identifiers', [])),
any(id.get('identifier_scheme') == 'Website' for id in enriched.get('identifiers', []))
]
enriched_fields = sum(enriched_fields_list)
if enriched_fields > original_fields:
success_count += 1
else:
fail_count += 1
# Rate limiting
time.sleep(RATE_LIMIT_DELAY)
# Progress update every 50 museums
if i % 50 == 0:
print()
print(f" Progress: {i}/{total} ({i/total*100:.1f}%) - {success_count} enriched, {fail_count} unchanged")
print()
print()
print("=" * 80)
print("Enrichment Complete")
print("=" * 80)
print()
print(f"Total museums: {total}")
print(f"Successfully enriched: {success_count}")
print(f"Unchanged: {fail_count}")
print(f"Success rate: {success_count/total*100:.1f}%")
print()
# Calculate completeness
completeness = {
'name': sum(1 for m in enriched_museums if m.get('name')),
'city': sum(1 for m in enriched_museums if m['locations'][0].get('city')),
'ISIL': sum(1 for m in enriched_museums if any(i.get('identifier_scheme') == 'ISIL' for i in m.get('identifiers', []))),
'street_address': sum(1 for m in enriched_museums if m['locations'][0].get('street_address')),
'postal_code': sum(1 for m in enriched_museums if m['locations'][0].get('postal_code')),
'phone': sum(1 for m in enriched_museums if any(i.get('identifier_scheme') == 'Phone' for i in m.get('identifiers', []))),
'email': sum(1 for m in enriched_museums if any(i.get('identifier_scheme') == 'Email' for i in m.get('identifiers', []))),
'website': sum(1 for m in enriched_museums if any(i.get('identifier_scheme') == 'Website' for i in m.get('identifiers', []))),
}
print("Metadata Completeness After Enrichment:")
print()
for field, count in completeness.items():
percentage = (count / total) * 100
status = "" if percentage > 90 else ""
print(f"{status} {field:20s}: {count}/{total} ({percentage:5.1f}%)")
print()
# Save enriched dataset
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
output_dir = Path("data/isil/germany")
output_file = output_dir / f"bayern_museums_enriched_{timestamp}.json"
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(enriched_museums, f, ensure_ascii=False, indent=2)
print(f"✓ Exported to: {output_file}")
print(f" File size: {output_file.stat().st_size:,} bytes")
print()
print("=" * 80)
print(f"Enrichment complete! {success_count}/{total} museums enhanced.")
print("=" * 80)
print()
print("Next steps:")
print(" 1. Merge enriched museums with Bayern archives/libraries")
print(" 2. Generate Bayern complete dataset with ~80% metadata completeness")
print(" 3. Proceed to Baden-Württemberg extraction")
print()
return output_file
if __name__ == "__main__":
main()