glam/scripts/scrapers/enrich_sachsen_anhalt_museums.py
2025-11-21 22:12:33 +01:00

267 lines
9.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Sachsen-Anhalt Museums with Detail Page Data
Scrapes individual museum pages for complete metadata:
- Physical addresses
- Contact information
- Opening hours
- Cities
Target: 100% metadata completeness
"""
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional
import time
from urllib.parse import urljoin
def scrape_museum_detail_page(url: str) -> Dict[str, Any]:
    """Scrape an individual museum detail page for complete metadata.

    Args:
        url: Absolute URL of the museum's detail page.

    Returns:
        Dict with keys 'city', 'street_address', 'postal_code', 'phone',
        'email', 'opening_hours' and 'full_description' (empty string for
        anything not found). Returns an empty dict when the HTTP request
        fails, so callers can distinguish "page unreachable" from
        "page reachable but sparse".
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        # Site content contains German umlauts; force UTF-8 decoding.
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        details = {
            'city': '',
            'street_address': '',
            'postal_code': '',
            'phone': '',
            'email': '',
            'opening_hours': '',
            'full_description': ''
        }

        # --- Strategy 1: free-form address/contact <div> -------------------
        address_div = soup.find(
            'div',
            class_=lambda c: c and ('address' in c.lower() or 'kontakt' in c.lower())
        )
        if address_div:
            address_text = address_div.get_text(separator='\n', strip=True)
            lines = [line.strip() for line in address_text.split('\n') if line.strip()]
            for i, line in enumerate(lines):
                # Postal code + city (e.g., "06618 Naumburg"). German postal
                # codes are exactly 5 digits; requiring that keeps bare phone
                # numbers ("0345 1234567") from being misread as an address.
                parts = line.split(maxsplit=1)
                if len(parts) == 2 and parts[0].isdigit() and len(parts[0]) == 5:
                    details['postal_code'] = parts[0]
                    details['city'] = parts[1]
                # Street address: heuristically the line directly before a
                # line containing digits (house number or postal code).
                if not details['street_address'] and i < len(lines) - 1:
                    next_line = lines[i + 1]
                    if any(char.isdigit() for char in next_line) and len(next_line) > 3:
                        details['street_address'] = line
                # Phone ("Tel.: ...", "Telefon ...", "Fon ...").
                low = line.lower()
                if 'tel' in low or 'fon' in low:
                    details['phone'] = line.split(':', 1)[-1].strip()
                # Email: keep only the token that actually contains '@' so a
                # label like "E-Mail:" is not carried into the stored value.
                if '@' in line:
                    for token in line.replace(':', ' ').split():
                        if '@' in token:
                            details['email'] = token
                            break

        # --- Strategy 2: structured <dt>/<dd> definition lists -------------
        for dt in soup.find_all('dt'):
            label = dt.get_text(strip=True).lower()
            dd = dt.find_next_sibling('dd')
            if not dd:
                continue
            # separator='\n' preserves line breaks inside the <dd>; plain
            # get_text(strip=True) would collapse a multi-line address into
            # one line and the postal-code split below could never match.
            value = dd.get_text(separator='\n', strip=True)
            if 'adresse' in label or 'anschrift' in label:
                for line in (l.strip() for l in value.split('\n') if l.strip()):
                    parts = line.split(maxsplit=1)
                    if len(parts) == 2 and parts[0].isdigit() and len(parts[0]) == 5:
                        details['postal_code'] = parts[0]
                        details['city'] = parts[1]
            elif 'öffnungszeiten' in label or 'opening' in label:
                # Collapse internal line breaks back to single spaces for the
                # scalar fields.
                details['opening_hours'] = ' '.join(value.split())
            elif 'telefon' in label or 'phone' in label:
                details['phone'] = ' '.join(value.split())
            elif 'mail' in label:
                details['email'] = ' '.join(value.split())

        # --- Full description ----------------------------------------------
        desc_div = soup.find(
            'div',
            class_=lambda c: c and ('description' in c.lower() or 'content' in c.lower())
        )
        if desc_div:
            paragraphs = desc_div.find_all('p')
            full_desc = '\n\n'.join(p.get_text(strip=True) for p in paragraphs)
            if full_desc:
                details['full_description'] = full_desc

        return details
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Failed to scrape {url}: {e}")
        return {}
def _find_detail_url(museum: Dict[str, Any]) -> Optional[str]:
    """Return the museum's 'Website' identifier URL, or None if absent."""
    for ident in museum.get('identifiers') or []:
        if ident.get('identifier_scheme') == 'Website':
            return ident.get('identifier_value')
    return None


def enrich_museums(input_file: Path) -> List[Dict[str, Any]]:
    """Enrich museum records with data scraped from their detail pages.

    Loads the JSON list of museum records from *input_file*, scrapes each
    record's 'Website' identifier URL for address/contact details, and
    merges the results into the records in place.

    Args:
        input_file: Path to a JSON file containing a list of museum dicts.

    Returns:
        The list of (possibly enriched) museum records; records whose detail
        page is missing or unreachable are passed through unchanged.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        museums = json.load(f)
    print(f"Loaded {len(museums)} museums from {input_file.name}")
    print()

    enriched = []
    errors = 0
    for i, museum in enumerate(museums, 1):
        # .get() guards against malformed records without a 'name' key.
        print(f"[{i}/{len(museums)}] Enriching: {museum.get('name', '')[:60]}")

        detail_url = _find_detail_url(museum)
        if not detail_url:
            print(f" ⚠️ No detail URL, skipping")
            enriched.append(museum)
            errors += 1
            continue

        details = scrape_museum_detail_page(detail_url)
        if not details:
            print(f" ⚠️ Failed to scrape details")
            enriched.append(museum)
            errors += 1
            continue

        # Update the primary location; create it if the record has none so
        # we never hit a KeyError/IndexError on sparse records.
        if any(details.get(k) for k in ('city', 'street_address', 'postal_code')):
            locations = museum.setdefault('locations', [])
            if not locations:
                locations.append({})
            location = locations[0]
            if details.get('city'):
                location['city'] = details['city']
                print(f" ✅ City: {details['city']}")
            if details.get('street_address'):
                location['street_address'] = details['street_address']
            if details.get('postal_code'):
                location['postal_code'] = details['postal_code']

        # Prefer the scraped description only when it is longer than what
        # the record already carries.
        if details.get('full_description') and len(details['full_description']) > len(museum.get('description', '')):
            museum['description'] = details['full_description']

        # Add contact identifiers alongside the existing 'Website' one.
        identifiers = museum.setdefault('identifiers', [])
        if details.get('phone'):
            identifiers.append({
                'identifier_scheme': 'Phone',
                'identifier_value': details['phone'],
                'identifier_url': f"tel:{details['phone']}"
            })
        if details.get('email'):
            identifiers.append({
                'identifier_scheme': 'Email',
                'identifier_value': details['email'],
                'identifier_url': f"mailto:{details['email']}"
            })

        # Append opening hours to the description; when there is no prior
        # description, avoid starting the text with a stray blank line.
        if details.get('opening_hours'):
            existing = museum.get('description') or ''
            if existing:
                museum['description'] = existing + f"\n\nÖffnungszeiten: {details['opening_hours']}"
            else:
                museum['description'] = f"Öffnungszeiten: {details['opening_hours']}"

        enriched.append(museum)
        # Rate limiting: be polite to the museum association's server.
        time.sleep(0.5)

    print()
    print(f"Enrichment complete: {len(enriched) - errors}/{len(enriched)} successful")
    print(f"Errors: {errors}")
    return enriched
def main() -> None:
    """Main execution: locate the newest harvest file, enrich it, report
    completeness statistics, and save a timestamped enriched JSON file."""
    print("=" * 80)
    print("Enrich Sachsen-Anhalt Museums with Detail Page Data")
    print("=" * 80)
    print()

    # Find the most recent *harvest* file. The output of this very script
    # ('..._enriched_<ts>.json') also matches the glob and would sort first
    # ('e' > any digit), so exclude it — otherwise a second run would
    # re-enrich already-enriched data and duplicate identifiers/hours.
    data_dir = Path('data/isil/germany')
    museum_files = sorted(
        (p for p in data_dir.glob('sachsen_anhalt_museums_*.json')
         if '_enriched_' not in p.name),
        reverse=True
    )
    if not museum_files:
        print("❌ No museum files found. Run harvest_sachsen_anhalt_museums.py first.")
        return
    input_file = museum_files[0]
    print(f"Input: {input_file.name}")
    print()

    enriched_museums = enrich_museums(input_file)
    total = len(enriched_museums)
    # Guard: an empty input file would otherwise cause ZeroDivisionError in
    # the percentage calculations below.
    if total == 0:
        print("❌ Input file contained no museums; nothing to save.")
        return

    def first_location(m: Dict[str, Any]) -> Dict[str, Any]:
        # Safe accessor: tolerates records without a 'locations' list.
        locations = m.get('locations') or []
        return locations[0] if locations else {}

    has_city = sum(1 for m in enriched_museums if first_location(m).get('city'))
    has_address = sum(1 for m in enriched_museums if first_location(m).get('street_address'))
    has_postal = sum(1 for m in enriched_museums if first_location(m).get('postal_code'))
    print()
    print("Data Completeness:")
    print(f" City: {has_city}/{total} ({has_city/total*100:.1f}%)")
    print(f" Street Address: {has_address}/{total} ({has_address/total*100:.1f}%)")
    print(f" Postal Code: {has_postal}/{total} ({has_postal/total*100:.1f}%)")
    print()

    # Frequency count of cities across all enriched records.
    city_counts: Dict[str, int] = {}
    for museum in enriched_museums:
        city = first_location(museum).get('city', '')
        if city:
            city_counts[city] = city_counts.get(city, 0) + 1
    print("Top 10 Cities:")
    for city, count in sorted(city_counts.items(), key=lambda x: -x[1])[:10]:
        print(f" {city}: {count}")
    print()

    # Save enriched data under a timestamped name next to the input.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = data_dir / f'sachsen_anhalt_museums_enriched_{timestamp}.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(enriched_museums, f, ensure_ascii=False, indent=2)
    file_size_kb = output_path.stat().st_size / 1024
    print(f"✅ Saved to: {output_path}")
    print(f" File size: {file_size_kb:.1f} KB")
    print(f" Total museums: {total}")
    print()
    print("=" * 80)
    print("Enrichment complete!")
    print("=" * 80)


if __name__ == '__main__':
    main()