#!/usr/bin/env python3
"""
Scraper for Archivo General de la Nación (AGN) - Argentina

Extracts archival collections and library catalogs from AGN website.

GLAM Data Extraction Project
Country: Argentina (AR)
Source: AGN (Archivo General de la Nación)
"""

import json
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup

# Substrings (lower-case) that mark a list item as a probable collection entry.
_COLLECTION_KEYWORDS = ('fondo', 'colección', 'archivo', 'biblioteca')


def _utc_now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string."""
    return datetime.now(timezone.utc).isoformat()


def _print_banner(title: str) -> None:
    """Print *title* framed by two 80-character rules."""
    print("=" * 80)
    print(title)
    print("=" * 80)


def _email_from_mailto(href: str) -> str:
    """Return the address part of a ``mailto:`` href.

    Header fields after ``?`` (for example ``?subject=...``) and surrounding
    whitespace are dropped so that only the address itself is stored.
    """
    return href.replace('mailto:', '').split('?', 1)[0].strip()


class AGNScraper:
    """Collect institution and collection records from the AGN web pages.

    Each ``scrape_*`` method performs one HTTP step, prints its progress and
    returns plain dictionaries; ``run_full_scrape`` chains the steps and
    ``save_results`` writes the combined result as JSON.
    """

    def __init__(self):
        self.base_url = "https://argentina.gob.ar"
        self.headers = {
            'User-Agent': 'GLAM-AGN-Scraper/1.0 (Academic Research; Heritage Institution Mapping)'
        }
        # Not used by the methods in this file; kept as a public attribute.
        self.institutions = []

    def scrape_main_archive(self) -> Optional[Dict[str, Any]]:
        """
        Scrape main AGN archive information.

        Returns:
            The institution record, or ``None`` when the page could not be
            fetched or processed (the error is printed, not raised).
        """
        _print_banner("SCRAPING: Archivo General de la Nación (Main Institution)")
        print()

        url = f"{self.base_url}/interior/archivo-general-de-la-nacion"

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Basic information is fixed; only address and e-mail depend on
            # the fetched page.
            institution: Dict[str, Any] = {
                "name": "Archivo General de la Nación",
                "name_en": "National Archive of Argentina",
                "type": "ARCHIVE",
                "country": "AR",
                "city": "Buenos Aires",
                "province": "Ciudad Autónoma de Buenos Aires",
                "url": url,
                "data_source": "AGN_WEB",
                "extraction_date": _utc_now_iso(),
                "description": "Argentina's national archive, responsible for preserving and providing access to government records and historical documents."
            }

            # The address is not parsed from the page: the hard-coded value is
            # attached only when the page text mentions "Alem" (this already
            # covers the longer "Leandro N. Alem" spelling).
            if "Alem" in soup.get_text():
                institution["street_address"] = "Leandro N. Alem 246"
                institution["postal_code"] = "C1003AAP"

            # Contact information: first non-empty mailto address on the page.
            email_links = soup.find_all('a', href=lambda x: x and 'mailto:' in x)
            addresses = (_email_from_mailto(link['href']) for link in email_links)
            email = next((address for address in addresses if address), None)
            if email:
                institution["email"] = email

            print(f"✅ Extracted: {institution['name']}")
            print(f" Type: {institution['type']}")
            print(f" City: {institution['city']}")
            print()

            return institution

        except Exception as e:
            # Deliberately broad: this step is best-effort, so fetch and
            # parse failures are reported and the workflow continues.
            print(f"❌ Error scraping main archive: {e}")
            return None

    def scrape_fondos_colecciones(self) -> List[Dict[str, Any]]:
        """
        Scrape archival fondos and collections listings.

        Two heuristics are applied to the page: table rows (first cell is the
        name, second cell is stored as ``dates``) and list items whose text
        contains a collection keyword. List items whose text was already
        recorded are skipped, so nested lists and repeated menus do not
        produce duplicate entries.

        Returns:
            The collection records, or an empty list when the page could not
            be fetched or processed (the error is printed, not raised).
        """
        _print_banner("SCRAPING: Fondos y Colecciones Documentales")
        print()

        url = f"{self.base_url}/interior/archivo-general/fondosycolecciones"
        collections: List[Dict[str, Any]] = []

        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # The page structure may vary, so several strategies are tried.

            # Strategy 1: tables with fondos
            tables = soup.find_all('table')
            print(f"Found {len(tables)} tables")

            for table in tables:
                for row in table.find_all('tr')[1:]:  # Skip header
                    cells = row.find_all(['td', 'th'])
                    if len(cells) < 2:
                        continue
                    name = cells[0].get_text(strip=True)
                    if not name:
                        continue
                    # NOTE(review): the meaning of the second column is
                    # assumed to be a date range - confirm against the page.
                    collections.append({
                        "type": "collection",
                        "parent_institution": "Archivo General de la Nación",
                        "extraction_date": _utc_now_iso(),
                        "name": name,
                        "dates": cells[1].get_text(strip=True),
                    })
                    print(f" • {name}")

            # Strategy 2: list items
            seen_names = {collection["name"] for collection in collections}
            for listing in soup.find_all(['ul', 'ol']):
                for item in listing.find_all('li'):
                    text = item.get_text(strip=True)
                    if text in seen_names:
                        # find_all() is recursive, so an item of a nested
                        # list is reached once per enclosing list.
                        continue
                    if not 10 < len(text) < 200:  # Reasonable length
                        continue
                    lowered = text.lower()
                    if not any(keyword in lowered for keyword in _COLLECTION_KEYWORDS):
                        continue
                    seen_names.add(text)
                    collections.append({
                        "type": "collection",
                        "name": text,
                        "parent_institution": "Archivo General de la Nación",
                        "extraction_date": _utc_now_iso(),
                    })
                    print(f" • {text[:80]}")

            print()
            print(f"✅ Extracted {len(collections)} collections")
            print()

            return collections

        except Exception as e:
            # Deliberately broad: this step is best-effort, so fetch and
            # parse failures are reported and the workflow continues.
            print(f"❌ Error scraping fondos: {e}")
            return []

    def scrape_koha_catalog(self) -> Optional[Dict[str, Any]]:
        """
        Attempt to locate AGN's KOHA library catalog (if accessible).

        Candidate URLs are requested in order; the first one answering with
        HTTP 200 is recorded as the catalog URL.

        Returns:
            The library record, or ``None`` when no candidate answered 200.
        """
        _print_banner("CHECKING: KOHA Library Catalog")
        print()

        # KOHA typically runs on a subdomain or separate URL
        potential_urls = [
            "https://koha.mininterior.gob.ar",
            "https://biblioteca.agn.gov.ar",
            f"{self.base_url}/interior/archivo-general/biblioteca"
        ]

        for url in potential_urls:
            print(f"Trying: {url}")
            try:
                response = requests.get(url, headers=self.headers, timeout=10)
            except requests.RequestException as e:
                print(f" ❌ Not accessible: {e}")
                continue

            if response.status_code != 200:
                print(f" ⚠️ Status {response.status_code}")
                continue

            # NOTE(review): a 200 response is taken as proof of a KOHA
            # catalog; the page content is not inspected - confirm manually.
            print(f" ✅ Found KOHA catalog at: {url}")

            return {
                "name": "Biblioteca del Archivo General de la Nación",
                "name_en": "Library of the National Archive",
                "type": "LIBRARY",
                "country": "AR",
                "city": "Buenos Aires",
                "parent_institution": "Archivo General de la Nación",
                "catalog_url": url,
                "catalog_system": "KOHA",
                "data_source": "AGN_WEB",
                "extraction_date": _utc_now_iso()
            }

        print()
        print("⚠️ KOHA catalog not found at expected URLs")
        print()

        return None

    def run_full_scrape(self) -> Dict[str, Any]:
        """
        Run complete AGN scraping workflow.

        Returns:
            A dictionary with ``metadata``, ``institutions`` (main archive
            and, when found, the library) and ``collections``.
        """
        _print_banner("ARCHIVO GENERAL DE LA NACIÓN - FULL SCRAPE")
        print(f"Started: {_utc_now_iso()}")
        print()

        results: Dict[str, Any] = {
            "metadata": {
                "source": "Archivo General de la Nación (AGN)",
                "country": "AR",
                "extraction_date": _utc_now_iso(),
                "scraper": "scrape_agn_argentina.py"
            },
            "institutions": [],
            "collections": []
        }

        # 1. Scrape main archive
        main_archive = self.scrape_main_archive()
        if main_archive:
            results["institutions"].append(main_archive)

        time.sleep(2)  # Rate limiting

        # 2. Scrape fondos and collections
        results["collections"].extend(self.scrape_fondos_colecciones())

        time.sleep(2)

        # 3. Check for library catalog
        library = self.scrape_koha_catalog()
        if library:
            results["institutions"].append(library)

        # Summary
        _print_banner("SCRAPING COMPLETE")
        print(f"Institutions: {len(results['institutions'])}")
        print(f"Collections: {len(results['collections'])}")
        print()

        return results

    def save_results(self, results: Dict[str, Any], output_file: Path) -> None:
        """
        Save scraping results to JSON file.

        Args:
            results: The dictionary to serialise.
            output_file: Destination path; missing parent directories are
                created.
        """
        output_file.parent.mkdir(parents=True, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"📁 Results saved to: {output_file}")
        print()


def main() -> None:
    """Run the full scrape and write the result below the project data dir."""
    # Setup paths
    base_dir = Path(__file__).parent.parent.parent
    output_dir = base_dir / "data" / "isil" / "AR"
    output_file = output_dir / "agn_argentina_archives.json"

    # Run scraper
    scraper = AGNScraper()
    results = scraper.run_full_scrape()
    scraper.save_results(results, output_file)

    print("✅ AGN scraping workflow complete")


if __name__ == "__main__":
    main()