#!/usr/bin/env python3
"""
Swiss ISIL Database Scraper
Scrapes complete heritage institution data from https://www.isil.nb.admin.ch/en/

Author: GLAM Data Extraction Project
Date: November 2025
"""

import json
import logging
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Log to both a file and the console. The log directory is created up front
# so the FileHandler does not crash on a machine where it does not yet exist.
_LOG_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
_LOG_DIR.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(_LOG_DIR / 'scraper.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class SwissISILScraper:
    """Scraper for the Swiss National Library ISIL directory.

    Workflow: scrape the paginated listing (``scrape_all_pages``), then visit
    each institution's detail page (``scrape_all_details``), and finally dump
    JSON results plus a human-readable report (``save_results``).
    """

    BASE_URL = "https://www.isil.nb.admin.ch"
    LIST_URL = f"{BASE_URL}/en/"
    OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
    # Known number of listing pages on the site, used as a fallback stop
    # condition when the "Next" pagination link cannot be detected.
    # NOTE(review): this was hard-coded inline as 96 — verify it still matches
    # the live site before long runs.
    KNOWN_PAGE_COUNT = 96

    def __init__(self):
        # Reuse one HTTP session (connection pooling + shared headers).
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Heritage Data Research Project)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })
        # Accumulated institution dicts (listing fields + detail fields).
        self.institutions = []
        # Run statistics, serialized alongside the scraped data.
        self.stats = {
            'start_time': datetime.now().isoformat(),
            'pages_scraped': 0,
            'institutions_found': 0,
            'detail_pages_scraped': 0,
            'errors': []
        }

    def fetch_page(self, url: str, retry_count: int = 3) -> Optional[str]:
        """Fetch a page, retrying with exponential backoff.

        Returns the response body as text, or None after ``retry_count``
        failed attempts (the failure is recorded in ``self.stats['errors']``).
        """
        for attempt in range(retry_count):
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                return response.text
            except Exception as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < retry_count - 1:
                    time.sleep(2 ** attempt)  # Exponential backoff
                else:
                    self.stats['errors'].append({'url': url, 'error': str(e)})
                    logger.error(f"Failed to fetch {url} after {retry_count} attempts")
        return None

    def parse_list_page(self, html: str) -> List[Dict]:
        """Parse one institution-listing page into a list of dicts.

        Each dict carries the fields visible on the listing card: name,
        optional alternative name, active/inactive status, description,
        region/canton, categories, merged-into link, and the detail URL.
        """
        soup = BeautifulSoup(html, 'html.parser')
        institutions = []

        # Find all institution cards
        institution_cards = soup.select('ul.search-results-list > li')
        logger.info(f"Found {len(institution_cards)} institution cards on page")

        for card in institution_cards:
            try:
                institution = {}

                # Get detail page URL. urljoin handles both relative and
                # absolute hrefs (plain concatenation would corrupt the latter).
                link = card.select_one('a.card__link')
                if link and link.get('href'):
                    institution['detail_url'] = urljoin(self.BASE_URL, link['href'])

                # Get institution name
                name_elem = card.select_one('h3')
                if name_elem:
                    name_text = name_elem.get_text(strip=True)
                    # A "/" separates the primary name from an alternative
                    # name. Split only once so names containing further
                    # slashes keep their full alternative part.
                    if '/' in name_text:
                        primary, alternative = name_text.split('/', 1)
                        institution['name'] = primary.strip()
                        institution['alternative_name'] = alternative.strip()
                    else:
                        institution['name'] = name_text

                    # Inactive institutions are rendered struck-through.
                    if name_elem.select_one('.line-through'):
                        institution['status'] = 'inactive'
                    else:
                        institution['status'] = 'active'

                # Get description
                desc_elem = card.select_one('.leading-snug')
                if desc_elem:
                    institution['description'] = desc_elem.get_text(strip=True)

                # Get location (region and canton)
                location_elem = card.select_one('span.inline-block.text-base.text-gray-500.font-bold.mr-4')
                if location_elem:
                    location_text = location_elem.get_text(strip=True)
                    if ',' in location_text:
                        parts = location_text.split(',')
                        institution['region'] = parts[0].strip()
                        institution['canton'] = parts[1].strip()
                    else:
                        institution['canton'] = location_text

                # Get institution categories/types
                categories = []
                category_elems = card.select('span.inline-flex.items-center.mr-4 span.inline-block.text-gray-500.text-base.font-bold')
                for cat in category_elems:
                    categories.append(cat.get_text(strip=True))
                if categories:
                    institution['categories'] = categories

                # Get merged institution info (shown for absorbed institutions)
                merged_elem = card.select_one('p.text-base a')
                if merged_elem:
                    institution['merged_into'] = {
                        'name': merged_elem.get_text(strip=True),
                        'url': urljoin(self.BASE_URL, merged_elem['href']) if merged_elem.get('href') else None
                    }

                if institution.get('name'):
                    institutions.append(institution)

            except Exception as e:
                logger.error(f"Error parsing institution card: {e}")
                self.stats['errors'].append({'context': 'parse_card', 'error': str(e)})

        return institutions

    def parse_detail_page(self, html: str) -> Dict:
        """Parse an institution detail page into a dict of extra fields.

        Extracts ISIL code, postal address, contact info, institution type,
        opening hours, memberships and Dewey classifications. All selectors
        are best-effort: missing sections simply produce no key.
        """
        soup = BeautifulSoup(html, 'html.parser')
        details = {}

        try:
            # Get ISIL code from page (Swiss codes are prefixed "CH-")
            isil_elem = soup.select_one('dd:-soup-contains("CH-")')
            if not isil_elem:
                # Fallback: scan all <dd> elements for a CH- prefix
                for dd in soup.select('dd'):
                    text = dd.get_text(strip=True)
                    if text.startswith('CH-'):
                        isil_elem = dd
                        break
            if isil_elem:
                details['isil_code'] = isil_elem.get_text(strip=True)

            # Get full address
            address_parts = {}

            # Street address
            street_elem = soup.select_one('dd:-soup-contains("Street")')
            if street_elem:
                address_parts['street'] = street_elem.get_text(strip=True)

            # Postal code and city
            postal_elem = soup.select_one('dd:-soup-contains("Postal code")')
            if postal_elem:
                address_parts['postal_code'] = postal_elem.get_text(strip=True)
            city_elem = soup.select_one('dd:-soup-contains("City")')
            if city_elem:
                address_parts['city'] = city_elem.get_text(strip=True)

            # Try the dedicated address section (English or German label);
            # its lines override the selector-based guesses above.
            address_section = soup.find('dt', string=re.compile(r'Address|Adresse'))
            if address_section:
                dd = address_section.find_next('dd')
                if dd:
                    address_text = dd.get_text(separator='|', strip=True)
                    lines = address_text.split('|')
                    if len(lines) >= 2:
                        address_parts['street'] = lines[0].strip()
                        if len(lines) >= 3:
                            # Swiss postal codes are four digits before the city.
                            postal_city = lines[1].strip()
                            match = re.match(r'(\d{4})\s+(.+)', postal_city)
                            if match:
                                address_parts['postal_code'] = match.group(1)
                                address_parts['city'] = match.group(2)

            if address_parts:
                details['address'] = address_parts

            # Get contact information
            contact = {}

            # Phone
            phone_elem = soup.find('dt', string=re.compile(r'Phone|Telefon'))
            if phone_elem:
                dd = phone_elem.find_next('dd')
                if dd:
                    contact['phone'] = dd.get_text(strip=True)

            # Email (first mailto: link on the page)
            email_elem = soup.find('a', href=re.compile(r'^mailto:'))
            if email_elem:
                contact['email'] = email_elem.get_text(strip=True)

            # Website
            website_elem = soup.find('dt', string=re.compile(r'Website|Homepage'))
            if website_elem:
                dd = website_elem.find_next('dd')
                if dd:
                    link = dd.find('a')
                    if link:
                        contact['website'] = link.get('href')

            if contact:
                details['contact'] = contact

            # Get institution type/category
            type_elem = soup.find('dt', string=re.compile(r'Institution type|Type'))
            if type_elem:
                dd = type_elem.find_next('dd')
                if dd:
                    details['institution_type'] = dd.get_text(strip=True)

            # Get opening hours
            hours_elem = soup.find('dt', string=re.compile(r'Opening hours|Öffnungszeiten'))
            if hours_elem:
                dd = hours_elem.find_next('dd')
                if dd:
                    details['opening_hours'] = dd.get_text(separator=' | ', strip=True)

            # Get membership information
            memberships = []
            member_section = soup.find('dt', string=re.compile(r'Member of|Mitglied'))
            if member_section:
                dd = member_section.find_next('dd')
                if dd:
                    member_tags = dd.select('span.badge, a')
                    for tag in member_tags:
                        memberships.append(tag.get_text(strip=True))
            if memberships:
                details['memberships'] = memberships

            # Get Dewey classification
            dewey = []
            dewey_section = soup.find('dt', string=re.compile(r'Dewey|Subject area'))
            if dewey_section:
                dd = dewey_section.find_next('dd')
                if dd:
                    dewey_tags = dd.select('span.badge, a')
                    for tag in dewey_tags:
                        dewey.append(tag.get_text(strip=True))
            if dewey:
                details['dewey_classifications'] = dewey

        except Exception as e:
            logger.error(f"Error parsing detail page: {e}")
            self.stats['errors'].append({'context': 'parse_detail', 'error': str(e)})

        return details

    def scrape_institution_details(self, institution: Dict) -> Dict:
        """Fetch and merge detail-page fields into one institution dict.

        Mutates and returns ``institution``. No-op if it has no detail URL.
        """
        if not institution.get('detail_url'):
            return institution

        logger.info(f"Fetching details for: {institution.get('name')}")
        html = self.fetch_page(institution['detail_url'])
        if html:
            details = self.parse_detail_page(html)
            institution.update(details)
            self.stats['detail_pages_scraped'] += 1

        time.sleep(0.5)  # Rate limiting
        return institution

    def scrape_all_pages(self, max_pages: Optional[int] = None):
        """Walk the paginated listing, accumulating into self.institutions.

        Stops on fetch failure, an empty page, ``max_pages`` (if given), or
        the KNOWN_PAGE_COUNT fallback when no "Next" link is detected.
        """
        page = 1
        while True:
            if max_pages and page > max_pages:
                break

            url = f"{self.LIST_URL}?page={page}"
            logger.info(f"Scraping page {page}: {url}")

            html = self.fetch_page(url)
            if not html:
                logger.error(f"Failed to fetch page {page}")
                break

            institutions = self.parse_list_page(html)
            if not institutions:
                logger.info(f"No institutions found on page {page}, stopping")
                break

            logger.info(f"Found {len(institutions)} institutions on page {page}")
            self.institutions.extend(institutions)
            self.stats['pages_scraped'] += 1
            self.stats['institutions_found'] += len(institutions)

            # Check if there's a next page; fall back to the known page count
            # because the "Next" link selector may not match the live markup.
            soup = BeautifulSoup(html, 'html.parser')
            next_link = soup.select_one('a[href*="page="]:-soup-contains("Next")')
            if not next_link and page >= self.KNOWN_PAGE_COUNT:
                logger.info("Reached last page")
                break

            page += 1
            time.sleep(1)  # Rate limiting between pages

        logger.info(f"Completed scraping {self.stats['pages_scraped']} pages")
        logger.info(f"Total institutions collected: {self.stats['institutions_found']}")

    def scrape_all_details(self):
        """Fetch detail pages for every collected institution, in place.

        Saves an intermediate snapshot every 50 institutions so a crash
        mid-run does not lose everything.
        """
        logger.info(f"Starting detailed scrape for {len(self.institutions)} institutions")

        for i, institution in enumerate(self.institutions, 1):
            logger.info(f"Processing institution {i}/{len(self.institutions)}")
            self.scrape_institution_details(institution)

            # Save intermediate results every 50 institutions
            if i % 50 == 0:
                self.save_results(suffix=f"_batch_{i}")

        logger.info(f"Completed detailed scrape. Scraped {self.stats['detail_pages_scraped']} detail pages")

    def save_results(self, suffix: str = ""):
        """Write institutions + run statistics as JSON, then the text report.

        ``suffix`` is appended to the institutions filename (used for
        intermediate batch snapshots).
        """
        # Ensure the output directory exists before writing into it.
        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save institutions data
        output_file = self.OUTPUT_DIR / f"swiss_isil_complete{suffix}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.institutions, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved {len(self.institutions)} institutions to {output_file}")

        # Save statistics
        self.stats['end_time'] = datetime.now().isoformat()
        self.stats['total_institutions'] = len(self.institutions)
        stats_file = self.OUTPUT_DIR / f"scraping_stats_{timestamp}.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved statistics to {stats_file}")

        # Generate summary report
        self.generate_report()

    def generate_report(self):
        """Build, print, and save a plain-text summary of the scraped data."""
        report = []
        report.append("=" * 80)
        report.append("SWISS ISIL DATABASE SCRAPING REPORT")
        report.append("=" * 80)
        report.append(f"Scraping started: {self.stats.get('start_time', 'N/A')}")
        report.append(f"Scraping ended: {self.stats.get('end_time', 'N/A')}")
        report.append(f"Pages scraped: {self.stats['pages_scraped']}")
        report.append(f"Institutions found: {self.stats['institutions_found']}")
        report.append(f"Detail pages scraped: {self.stats['detail_pages_scraped']}")
        report.append(f"Errors encountered: {len(self.stats['errors'])}")
        report.append("")

        # Count by status
        active = sum(1 for i in self.institutions if i.get('status') == 'active')
        inactive = sum(1 for i in self.institutions if i.get('status') == 'inactive')
        report.append(f"Active institutions: {active}")
        report.append(f"Inactive institutions: {inactive}")
        report.append("")

        # Count by canton
        canton_counts = {}
        for inst in self.institutions:
            canton = inst.get('canton', 'Unknown')
            canton_counts[canton] = canton_counts.get(canton, 0) + 1
        report.append("Institutions by Canton:")
        for canton, count in sorted(canton_counts.items(), key=lambda x: x[1], reverse=True):
            report.append(f"  {canton}: {count}")
        report.append("")

        # Count by institution type (top 20 only)
        type_counts = {}
        for inst in self.institutions:
            categories = inst.get('categories', [])
            for cat in categories:
                type_counts[cat] = type_counts.get(cat, 0) + 1
        report.append("Institutions by Type:")
        for cat, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True)[:20]:
            report.append(f"  {cat}: {count}")
        report.append("")

        # ISIL codes found
        with_isil = sum(1 for i in self.institutions if i.get('isil_code'))
        report.append(f"Institutions with ISIL codes: {with_isil}")
        report.append("")
        report.append("=" * 80)

        report_text = "\n".join(report)
        print("\n" + report_text)

        # Save report to file
        report_file = self.OUTPUT_DIR / f"scraping_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(report_text)
        logger.info(f"Saved report to {report_file}")


def main():
    """Main execution function: listings first, then detail pages."""
    logger.info("Starting Swiss ISIL database scraper")

    scraper = SwissISILScraper()

    # Step 1: Scrape all listing pages
    logger.info("Step 1: Scraping institution listings")
    scraper.scrape_all_pages()

    # Save intermediate results
    scraper.save_results(suffix="_listings_only")

    # Step 2: Scrape detail pages for each institution
    logger.info("Step 2: Scraping detailed information for each institution")
    scraper.scrape_all_details()

    # Save final results
    scraper.save_results()

    logger.info("Scraping complete!")


if __name__ == "__main__":
    main()