#!/usr/bin/env python3
"""
Scraper for Archivo General de la Nación (AGN) - Argentina
Extracts archival collections and library catalogs from AGN website.

GLAM Data Extraction Project
Country: Argentina (AR)
Source: AGN (Archivo General de la Nación)
"""
|
|
|
|
import json
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
|
|
|
|
class AGNScraper:
    """Scraper for Argentina's Archivo General de la Nación (AGN) website.

    Extracts the main institution record, the archival fondos/collections
    listings and, when reachable, the AGN library's KOHA catalog endpoint.
    Results are plain dicts suitable for JSON serialization.
    """

    def __init__(self) -> None:
        self.base_url = "https://argentina.gob.ar"
        # Identify the scraper politely; some government sites reject
        # generic/default user agents.
        self.headers = {
            'User-Agent': 'GLAM-AGN-Scraper/1.0 (Academic Research; Heritage Institution Mapping)'
        }
        # Accumulator kept for external callers; not used internally.
        self.institutions: List[Dict[str, Any]] = []

    @staticmethod
    def _now_iso() -> str:
        """Return the current UTC time as an ISO-8601 timestamp string."""
        return datetime.now(timezone.utc).isoformat()

    def scrape_main_archive(self) -> Optional[Dict[str, Any]]:
        """Scrape main AGN archive information.

        Returns:
            The institution record dict, or ``None`` when the page cannot
            be fetched or parsed (best-effort: errors are printed).
        """
        print("=" * 80)
        print("SCRAPING: Archivo General de la Nación (Main Institution)")
        print("=" * 80)
        print()

        url = f"{self.base_url}/interior/archivo-general-de-la-nacion"

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Core fields are known a priori; the page is only probed for
            # address/contact details below.
            institution: Dict[str, Any] = {
                "name": "Archivo General de la Nación",
                "name_en": "National Archive of Argentina",
                "type": "ARCHIVE",
                "country": "AR",
                "city": "Buenos Aires",
                "province": "Ciudad Autónoma de Buenos Aires",
                "url": url,
                "data_source": "AGN_WEB",
                "extraction_date": self._now_iso(),
                "description": "Argentina's national archive, responsible for preserving and providing access to government records and historical documents."
            }

            # Try to find address information in the page text.
            content = soup.get_text()
            if "Leandro N. Alem" in content or "Alem" in content:
                institution["street_address"] = "Leandro N. Alem 246"
                institution["postal_code"] = "C1003AAP"

            # Check for contact information: first mailto: link wins.
            email_links = soup.find_all('a', href=lambda x: x and 'mailto:' in x)
            if email_links:
                emails = [link['href'].replace('mailto:', '') for link in email_links]
                if emails:
                    institution["email"] = emails[0]

            print(f"✅ Extracted: {institution['name']}")
            print(f" Type: {institution['type']}")
            print(f" City: {institution['city']}")
            print()

            return institution

        except Exception as e:
            # Broad catch is deliberate: any network/parse failure should
            # degrade to "no record" rather than abort the whole workflow.
            print(f"❌ Error scraping main archive: {e}")
            return None

    def scrape_fondos_colecciones(self) -> List[Dict[str, Any]]:
        """Scrape archival fondos and collections listings.

        Tries two extraction strategies (tables first, then list items) and
        de-duplicates by collection name, so a fondo that appears in both
        page structures is recorded only once.

        Returns:
            A list of collection record dicts (empty on failure).
        """
        print("=" * 80)
        print("SCRAPING: Fondos y Colecciones Documentales")
        print("=" * 80)
        print()

        url = f"{self.base_url}/interior/archivo-general/fondosycolecciones"
        collections: List[Dict[str, Any]] = []
        seen_names: set = set()  # guards against duplicates across strategies

        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # The page structure may vary, so we try multiple strategies.

            # Strategy 1: tables with one fondo per row (name, dates).
            tables = soup.find_all('table')
            print(f"Found {len(tables)} tables")

            for table in tables:
                for row in table.find_all('tr')[1:]:  # skip header row
                    cells = row.find_all(['td', 'th'])
                    if len(cells) < 2:
                        continue
                    name = cells[0].get_text(strip=True)
                    if not name or name in seen_names:
                        continue
                    seen_names.add(name)
                    collection = {
                        "type": "collection",
                        "name": name,
                        "dates": cells[1].get_text(strip=True),
                        "parent_institution": "Archivo General de la Nación",
                        "extraction_date": self._now_iso()
                    }
                    collections.append(collection)
                    print(f" • {collection['name']}")

            # Strategy 2: bulleted/numbered lists of collection names.
            for listing in soup.find_all(['ul', 'ol']):
                for item in listing.find_all('li'):
                    text = item.get_text(strip=True)
                    # Filter for collection-like entries with a plausible
                    # title length (10 < len < 200 chars).
                    if not any(keyword in text.lower()
                               for keyword in ['fondo', 'colección', 'archivo', 'biblioteca']):
                        continue
                    if not 10 < len(text) < 200 or text in seen_names:
                        continue
                    seen_names.add(text)
                    collections.append({
                        "type": "collection",
                        "name": text,
                        "parent_institution": "Archivo General de la Nación",
                        "extraction_date": self._now_iso()
                    })
                    print(f" • {text[:80]}")

            print()
            print(f"✅ Extracted {len(collections)} collections")
            print()

            return collections

        except Exception as e:
            # Best-effort: report and return an empty list.
            print(f"❌ Error scraping fondos: {e}")
            return []

    def scrape_koha_catalog(self) -> Optional[Dict[str, Any]]:
        """Attempt to scrape AGN's KOHA library catalog (if accessible).

        Probes a small list of likely catalog URLs and builds a library
        record for the first one answering HTTP 200.

        Returns:
            The library record dict, or ``None`` when no URL is reachable.
        """
        print("=" * 80)
        print("CHECKING: KOHA Library Catalog")
        print("=" * 80)
        print()

        # KOHA typically runs on a subdomain or separate URL.
        potential_urls = [
            "https://koha.mininterior.gob.ar",
            "https://biblioteca.agn.gov.ar",
            f"{self.base_url}/interior/archivo-general/biblioteca"
        ]

        for url in potential_urls:
            try:
                print(f"Trying: {url}")
                response = requests.get(url, headers=self.headers, timeout=10)

                if response.status_code == 200:
                    print(f" ✅ Found KOHA catalog at: {url}")

                    # Create library record for the first live endpoint.
                    return {
                        "name": "Biblioteca del Archivo General de la Nación",
                        "name_en": "Library of the National Archive",
                        "type": "LIBRARY",
                        "country": "AR",
                        "city": "Buenos Aires",
                        "parent_institution": "Archivo General de la Nación",
                        "catalog_url": url,
                        "catalog_system": "KOHA",
                        "data_source": "AGN_WEB",
                        "extraction_date": self._now_iso()
                    }
                else:
                    print(f" ⚠️ Status {response.status_code}")

            except Exception as e:
                # Unreachable host is expected for some candidates.
                print(f" ❌ Not accessible: {e}")

        print()
        print("⚠️ KOHA catalog not found at expected URLs")
        print()
        return None

    def run_full_scrape(self) -> Dict[str, Any]:
        """Run complete AGN scraping workflow.

        Sequentially scrapes the main archive, the fondos/collections page
        and the KOHA catalog, sleeping between requests for rate limiting.

        Returns:
            A results dict with ``metadata``, ``institutions`` and
            ``collections`` keys.
        """
        print("=" * 80)
        print("ARCHIVO GENERAL DE LA NACIÓN - FULL SCRAPE")
        print("=" * 80)
        print(f"Started: {self._now_iso()}")
        print()

        results: Dict[str, Any] = {
            "metadata": {
                "source": "Archivo General de la Nación (AGN)",
                "country": "AR",
                "extraction_date": self._now_iso(),
                "scraper": "scrape_agn_argentina.py"
            },
            "institutions": [],
            "collections": []
        }

        # 1. Scrape main archive.
        main_archive = self.scrape_main_archive()
        if main_archive:
            results["institutions"].append(main_archive)

        time.sleep(2)  # Rate limiting

        # 2. Scrape fondos and collections.
        results["collections"].extend(self.scrape_fondos_colecciones())

        time.sleep(2)

        # 3. Check for library catalog.
        library = self.scrape_koha_catalog()
        if library:
            results["institutions"].append(library)

        # Summary
        print("=" * 80)
        print("SCRAPING COMPLETE")
        print("=" * 80)
        print(f"Institutions: {len(results['institutions'])}")
        print(f"Collections: {len(results['collections'])}")
        print()

        return results

    def save_results(self, results: Dict[str, Any], output_file: Path) -> None:
        """Save scraping results to a UTF-8 JSON file.

        Creates any missing parent directories before writing.

        Args:
            results: The results dict produced by :meth:`run_full_scrape`.
            output_file: Destination path for the JSON file.
        """
        output_file.parent.mkdir(parents=True, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps Spanish accents readable in the file.
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"📁 Results saved to: {output_file}")
        print()
|
|
|
|
def main() -> None:
    """Run the full AGN scrape and save results under ``data/isil/AR``."""
    # Repository root is three levels above this script's directory.
    base_dir = Path(__file__).parent.parent.parent
    output_dir = base_dir / "data" / "isil" / "AR"
    output_file = output_dir / "agn_argentina_archives.json"

    # Run scraper and persist the combined results.
    scraper = AGNScraper()
    results = scraper.run_full_scrape()
    scraper.save_results(results, output_file)

    print("✅ AGN scraping workflow complete")


if __name__ == "__main__":
    main()
|