glam/scripts/scrapers/enrich_sachsen_anhalt_museums_v2.py
2025-11-21 22:12:33 +01:00

322 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Sachsen-Anhalt Museums with Detail Page Data - v2.0
Scrapes individual museum pages for complete metadata:
- Physical addresses (street, postal code, city)
- Contact information (phone, email, website)
- Opening hours
- Full descriptions
Improvements over v1.0:
- Properly parses address block structure
- Better error handling
- Progress tracking
- Rate limiting with delays
"""
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional
import time
import re
def scrape_museum_detail_page(url: str) -> Dict[str, Any]:
    """Fetch one museum detail page and parse its metadata.

    Returns a dict with address, contact, opening-hours and description
    fields (empty strings where nothing was found), or an empty dict when
    the request or the parsing fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        details = {key: '' for key in (
            'city', 'street_address', 'postal_code', 'phone',
            'email', 'website', 'opening_hours', 'full_description',
        )}

        # Address block ("Postanschrift"): successive lines hold the museum
        # name, the street, and a "PLZ city" line.
        address_div = soup.find('div', class_=lambda c: c and 'address' in c.lower())
        if address_div:
            raw = address_div.get_text(separator='\n', strip=True)
            for entry in filter(None, (part.strip() for part in raw.split('\n'))):
                # "06385 Aken" style: five-digit postal code, then city name.
                plz_match = re.match(r'(\d{5})\s+(.+)', entry)
                if plz_match:
                    details['postal_code'] = plz_match.group(1)
                    details['city'] = plz_match.group(2).strip()
                # Street lines look like "<Name>straße 12" (also str./weg/platz/...).
                street_match = re.search(r'[A-ZÄÖÜ][a-zäöüß]+(?:straße|str\.|weg|platz|gasse|allee)\s+\d+', entry, re.IGNORECASE)
                if street_match:
                    details['street_address'] = street_match.group(0)

        # Contact details live in <dt>label</dt><dd>value</dd> pairs.
        for dt in soup.find_all('dt'):
            dd = dt.find_next_sibling('dd')
            if dd is None:
                continue
            label = dt.get_text(strip=True).lower()
            value = dd.get_text(strip=True)
            if 'telefon' in label or 'phone' in label:
                details['phone'] = value
            elif 'e-mail' in label or 'mail' in label:
                # Skip generic museum association email
                if 'mv-sachsen-anhalt' not in value:
                    details['email'] = value
            elif 'internet' in label or 'website' in label:
                details['website'] = value
            elif 'öffnungszeiten' in label or 'opening' in label:
                details['opening_hours'] = value

        # Long description: all non-empty <p> texts in the main content div,
        # kept only when substantial (> 50 chars) to filter out boilerplate.
        content_div = soup.find('div', class_=lambda c: c and ('content' in c.lower() or 'description' in c.lower() or 'text' in c.lower()))
        if content_div:
            texts = [p.get_text(strip=True) for p in content_div.find_all('p')]
            body = '\n\n'.join(t for t in texts if t)
            if body and len(body) > 50:
                details['full_description'] = body

        return details
    except requests.exceptions.RequestException as e:
        print(f" ⚠️ Request failed: {e}")
        return {}
    except Exception as e:
        print(f" ⚠️ Parse error: {e}")
        return {}
def _apply_details(museum: Dict[str, Any], details: Dict[str, Any]) -> List[str]:
    """Merge scraped *details* into *museum* in place.

    Returns a list of human-readable notes describing which fields changed
    (used for progress output); an empty list means nothing new was found.
    """
    fields_updated: List[str] = []

    # Location fields — guard against records whose locations list is
    # missing or empty (the original code would raise KeyError/IndexError).
    location = museum['locations'][0] if museum.get('locations') else None
    if location is not None:
        if details.get('city'):
            location['city'] = details['city']
            fields_updated.append(f"City: {details['city']}")
        if details.get('street_address'):
            location['street_address'] = details['street_address']
            fields_updated.append(f"Address: {details['street_address']}")
        if details.get('postal_code'):
            location['postal_code'] = details['postal_code']

    # Replace the description only when the scraped one is strictly longer.
    if details.get('full_description') and len(details['full_description']) > len(museum.get('description', '')):
        old_len = len(museum.get('description', ''))
        museum['description'] = details['full_description']
        # BUGFIX: the original message ran both lengths together ("120450 chars").
        fields_updated.append(f"Description: {old_len} -> {len(details['full_description'])} chars")

    # Add contact identifiers (avoid duplicates); setdefault guards records
    # that reached this point without an identifiers list.
    existing_schemes = {ident['identifier_scheme'] for ident in museum.get('identifiers', [])}
    if details.get('phone') and 'Phone' not in existing_schemes:
        museum.setdefault('identifiers', []).append({
            'identifier_scheme': 'Phone',
            'identifier_value': details['phone'],
            'identifier_url': f"tel:{details['phone']}"
        })
        fields_updated.append(f"Phone: {details['phone']}")
    if details.get('email') and 'Email' not in existing_schemes:
        museum.setdefault('identifiers', []).append({
            'identifier_scheme': 'Email',
            'identifier_value': details['email'],
            'identifier_url': f"mailto:{details['email']}"
        })
        fields_updated.append(f"Email: {details['email']}")

    # Append opening hours to the description once (marker prevents duplicates).
    if details.get('opening_hours'):
        if not museum.get('description'):
            museum['description'] = ''
        if 'Öffnungszeiten:' not in museum['description']:
            museum['description'] += f"\n\nÖffnungszeiten: {details['opening_hours']}"

    # Stamp provenance only if the record has none yet.
    if not museum.get('provenance'):
        museum['provenance'] = {
            'data_source': 'WEBSITE_SCRAPING',
            'data_tier': 'TIER_2_VERIFIED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Museumsverband Sachsen-Anhalt detail pages v2.0',
            'confidence_score': 0.95
        }

    return fields_updated


def enrich_museums(input_file: Path) -> List[Dict[str, Any]]:
    """Enrich museum records with detail page data.

    Loads the JSON list of museum records from *input_file*, fetches each
    record's 'Website' identifier URL via scrape_museum_detail_page(), and
    merges address, contact, description and opening-hours data into the
    record. Records without a detail URL, or whose scrape fails, are passed
    through unchanged.

    Returns the (same-length) list of museum dicts.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        museums = json.load(f)
    print(f"Loaded {len(museums)} museums from {input_file.name}")
    print()

    enriched = []
    success_count = 0
    error_count = 0

    for i, museum in enumerate(museums, 1):
        name_display = museum['name'][:60] + '...' if len(museum['name']) > 60 else museum['name']
        print(f"[{i}/{len(museums)}] {name_display}")

        # The detail-page URL was stored by the harvester as a 'Website' identifier.
        detail_url = None
        for ident in museum.get('identifiers') or []:
            if ident['identifier_scheme'] == 'Website':
                detail_url = ident['identifier_value']
                break

        if not detail_url:
            print(f" ⚠️ No detail URL, skipping")
            enriched.append(museum)
            error_count += 1
            continue

        details = scrape_museum_detail_page(detail_url)
        if not details:
            print(f" ⚠️ Failed to scrape details")
            enriched.append(museum)
            error_count += 1
            continue

        fields_updated = _apply_details(museum, details)

        enriched.append(museum)
        if fields_updated:
            print(f"{', '.join(fields_updated)}")
            success_count += 1
        else:
            print(f" ⚠️ No new data extracted")
            error_count += 1

        # Rate limiting - 1 second delay between requests (skipped after the last).
        if i < len(museums):
            time.sleep(1.0)

    print()
    print(f"Enrichment complete: {success_count} successful, {error_count} failed/skipped")
    return enriched
def main():
    """Main execution: locate the newest non-enriched museum file, enrich it,
    report completeness statistics, and save a timestamped enriched JSON file."""
    print("=" * 80)
    print("Enrich Sachsen-Anhalt Museums with Detail Page Data v2.0")
    print("=" * 80)
    print()

    # Find most recent museum file (reverse name sort = newest timestamp first).
    data_dir = Path('data/isil/germany')
    museum_files = sorted(data_dir.glob('sachsen_anhalt_museums_*.json'), reverse=True)
    if not museum_files:
        print("❌ No museum files found. Run harvest_sachsen_anhalt_museums.py first.")
        return

    # Skip already enriched files
    input_file = next((f for f in museum_files if 'enriched' not in f.name), None)
    if not input_file:
        print("❌ No non-enriched museum files found.")
        return

    print(f"Input: {input_file.name}")
    print()

    # Enrich museums
    start_time = time.time()
    enriched_museums = enrich_museums(input_file)
    elapsed = time.time() - start_time

    total = len(enriched_museums)
    print()
    print("=" * 80)
    print("Data Completeness:")
    print("=" * 80)
    # BUGFIX: guard against an empty result — the original divided by
    # len(enriched_museums) and crashed with ZeroDivisionError on empty input.
    if total:
        has_city = sum(1 for m in enriched_museums if m.get('locations') and m['locations'][0].get('city'))
        has_address = sum(1 for m in enriched_museums if m.get('locations') and m['locations'][0].get('street_address'))
        has_postal = sum(1 for m in enriched_museums if m.get('locations') and m['locations'][0].get('postal_code'))
        has_phone = sum(1 for m in enriched_museums if m.get('identifiers') and any(i['identifier_scheme'] == 'Phone' for i in m['identifiers']))
        has_email = sum(1 for m in enriched_museums if m.get('identifiers') and any(i['identifier_scheme'] == 'Email' for i in m['identifiers']))
        print(f" City: {has_city:3d}/{total} ({has_city/total*100:5.1f}%)")
        print(f" Street Address: {has_address:3d}/{total} ({has_address/total*100:5.1f}%)")
        print(f" Postal Code: {has_postal:3d}/{total} ({has_postal/total*100:5.1f}%)")
        print(f" Phone: {has_phone:3d}/{total} ({has_phone/total*100:5.1f}%)")
        print(f" Email: {has_email:3d}/{total} ({has_email/total*100:5.1f}%)")
    else:
        print(" (no museums in input file)")
    print()

    # City counts
    from collections import Counter
    city_counts = Counter()
    for museum in enriched_museums:
        if museum.get('locations'):
            city = museum['locations'][0].get('city', '')
            if city:
                city_counts[city] += 1
    print("Top 20 Cities:")
    for city, count in city_counts.most_common(20):
        print(f" {city:30s}: {count:2d}")
    print()
    print(f"Total cities covered: {len(city_counts)}")
    print(f"Processing time: {elapsed:.1f} seconds ({elapsed/60:.1f} minutes)")
    print()

    # Save enriched data under a local-time timestamped name.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = data_dir / f'sachsen_anhalt_museums_enriched_{timestamp}.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(enriched_museums, f, ensure_ascii=False, indent=2)

    file_size_kb = output_path.stat().st_size / 1024
    print(f"✅ Saved to: {output_path}")
    print(f" File size: {file_size_kb:.1f} KB")
    print(f" Total museums: {total}")
    print()
    print("=" * 80)
    print("Enrichment complete!")
    print("=" * 80)
# Run as a script; importing this module triggers no side effects.
if __name__ == '__main__':
    main()