glam/scripts/scrapers/scrape_agn_argentina.py
2025-11-19 23:25:22 +01:00

278 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Scraper for Archivo General de la Nación (AGN) - Argentina
Extracts archival collections and library catalogs from AGN website.
GLAM Data Extraction Project
Country: Argentina (AR)
Source: AGN (Archivo General de la Nación)
"""
import json
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
class AGNScraper:
    """Scrape institution, collection, and catalog records from AGN web pages.

    All network access sends a descriptive User-Agent and uses per-request
    timeouts. Each ``scrape_*`` method is best-effort: on failure it reports
    the error and returns ``None`` / ``[]`` so one broken page does not abort
    the whole workflow driven by :meth:`run_full_scrape`.
    """

    def __init__(self):
        # Base for all AGN pages hosted under the national government portal.
        self.base_url = "https://argentina.gob.ar"
        # Identify the scraper honestly so site operators can assess traffic.
        self.headers = {
            'User-Agent': 'GLAM-AGN-Scraper/1.0 (Academic Research; Heritage Institution Mapping)'
        }
        # Accumulator for institution records; run_full_scrape currently
        # builds its own results dict, so this stays empty unless callers
        # populate it themselves.
        self.institutions = []

    def scrape_main_archive(self) -> Optional[Dict[str, Any]]:
        """Scrape the main AGN institution page.

        Returns:
            A single institution record dict, or ``None`` if the page could
            not be fetched or parsed.
        """
        print("=" * 80)
        print("SCRAPING: Archivo General de la Nación (Main Institution)")
        print("=" * 80)
        print()
        url = f"{self.base_url}/interior/archivo-general-de-la-nacion"
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # Static facts about the institution; only the address/contact
            # fields below depend on the live page content.
            institution = {
                "name": "Archivo General de la Nación",
                "name_en": "National Archive of Argentina",
                "type": "ARCHIVE",
                "country": "AR",
                "city": "Buenos Aires",
                "province": "Ciudad Autónoma de Buenos Aires",
                "url": url,
                "data_source": "AGN_WEB",
                "extraction_date": datetime.now(timezone.utc).isoformat(),
                "description": "Argentina's national archive, responsible for preserving and providing access to government records and historical documents."
            }
            # Heuristic: if the page text mentions the street name, assume the
            # known headquarters address applies.
            content = soup.get_text()
            if "Leandro N. Alem" in content or "Alem" in content:
                institution["street_address"] = "Leandro N. Alem 246"
                institution["postal_code"] = "C1003AAP"
            # Take the first mailto: link on the page as the contact email.
            email_links = soup.find_all('a', href=lambda x: x and 'mailto:' in x)
            if email_links:
                emails = [link['href'].replace('mailto:', '') for link in email_links]
                if emails:
                    institution["email"] = emails[0]
            print(f"✅ Extracted: {institution['name']}")
            print(f" Type: {institution['type']}")
            print(f" City: {institution['city']}")
            print()
            return institution
        except Exception as e:
            # Best-effort scrape: report and let the workflow continue.
            print(f"❌ Error scraping main archive: {e}")
            return None

    def scrape_fondos_colecciones(self) -> List[Dict[str, Any]]:
        """Scrape the "Fondos y Colecciones Documentales" listings.

        Two extraction strategies run in sequence because the page layout
        varies: (1) rows of any HTML table, (2) list items filtered by
        heritage-related keywords. Entries from both strategies are kept.

        Returns:
            A (possibly empty) list of collection record dicts.
        """
        print("=" * 80)
        print("SCRAPING: Fondos y Colecciones Documentales")
        print("=" * 80)
        print()
        url = f"{self.base_url}/interior/archivo-general/fondosycolecciones"
        collections = []
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # One timestamp for the whole scrape so all rows agree.
            extraction_date = datetime.now(timezone.utc).isoformat()
            # Strategy 1: tables listing fondos (first row assumed header).
            tables = soup.find_all('table')
            print(f"Found {len(tables)} tables")
            for table in tables:
                rows = table.find_all('tr')
                for row in rows[1:]:  # Skip header
                    cells = row.find_all(['td', 'th'])
                    if len(cells) >= 2:
                        collection = {
                            "type": "collection",
                            "parent_institution": "Archivo General de la Nación",
                            "extraction_date": extraction_date
                        }
                        # Column meaning depends on the actual HTML; by
                        # observation the first cell is the name, second dates.
                        collection["name"] = cells[0].get_text(strip=True)
                        if len(cells) > 1:
                            collection["dates"] = cells[1].get_text(strip=True)
                        if collection["name"]:
                            collections.append(collection)
                            print(f"{collection['name']}")
            # Strategy 2: list items that look like collection entries.
            lists = soup.find_all(['ul', 'ol'])
            for ul in lists:
                items = ul.find_all('li')
                for item in items:
                    text = item.get_text(strip=True)
                    # Keep only keyword-bearing entries of plausible length
                    # to filter out navigation links and boilerplate.
                    if any(keyword in text.lower() for keyword in ['fondo', 'colección', 'archivo', 'biblioteca']):
                        if 10 < len(text) < 200:  # Reasonable length
                            collection = {
                                "type": "collection",
                                "name": text,
                                "parent_institution": "Archivo General de la Nación",
                                "extraction_date": extraction_date
                            }
                            collections.append(collection)
                            print(f"{text[:80]}")
            print()
            print(f"✅ Extracted {len(collections)} collections")
            print()
            return collections
        except Exception as e:
            print(f"❌ Error scraping fondos: {e}")
            return []

    def scrape_koha_catalog(self) -> Optional[Dict[str, Any]]:
        """Probe known URLs for AGN's KOHA library catalog.

        Returns:
            A library record for the first URL answering HTTP 200, or
            ``None`` when none of the candidate URLs is reachable.
        """
        print("=" * 80)
        print("CHECKING: KOHA Library Catalog")
        print("=" * 80)
        print()
        # KOHA typically runs on a subdomain or separate URL.
        potential_urls = [
            "https://koha.mininterior.gob.ar",
            "https://biblioteca.agn.gov.ar",
            f"{self.base_url}/interior/archivo-general/biblioteca"
        ]
        for url in potential_urls:
            try:
                print(f"Trying: {url}")
                response = requests.get(url, headers=self.headers, timeout=10)
                if response.status_code == 200:
                    print(f" ✅ Found KOHA catalog at: {url}")
                    # First reachable URL wins; build the library record.
                    library = {
                        "name": "Biblioteca del Archivo General de la Nación",
                        "name_en": "Library of the National Archive",
                        "type": "LIBRARY",
                        "country": "AR",
                        "city": "Buenos Aires",
                        "parent_institution": "Archivo General de la Nación",
                        "catalog_url": url,
                        "catalog_system": "KOHA",
                        "data_source": "AGN_WEB",
                        "extraction_date": datetime.now(timezone.utc).isoformat()
                    }
                    return library
                else:
                    print(f" ⚠️ Status {response.status_code}")
            except Exception as e:
                # Unreachable hosts are expected here; try the next candidate.
                print(f" ❌ Not accessible: {e}")
        print()
        print("⚠️ KOHA catalog not found at expected URLs")
        print()
        return None

    def run_full_scrape(self) -> Dict[str, Any]:
        """Run the complete AGN scraping workflow.

        Scrapes the main archive page, the fondos/colecciones listings, and
        probes for the KOHA catalog, pausing between requests as basic rate
        limiting.

        Returns:
            A dict with ``metadata``, ``institutions``, and ``collections``.
        """
        print("=" * 80)
        print("ARCHIVO GENERAL DE LA NACIÓN - FULL SCRAPE")
        print("=" * 80)
        print(f"Started: {datetime.now(timezone.utc).isoformat()}")
        print()
        results = {
            "metadata": {
                "source": "Archivo General de la Nación (AGN)",
                "country": "AR",
                "extraction_date": datetime.now(timezone.utc).isoformat(),
                "scraper": "scrape_agn_argentina.py"
            },
            "institutions": [],
            "collections": []
        }
        # 1. Scrape main archive
        main_archive = self.scrape_main_archive()
        if main_archive:
            results["institutions"].append(main_archive)
        time.sleep(2)  # Rate limiting
        # 2. Scrape fondos and collections
        collections = self.scrape_fondos_colecciones()
        results["collections"].extend(collections)
        time.sleep(2)
        # 3. Check for library catalog
        library = self.scrape_koha_catalog()
        if library:
            results["institutions"].append(library)
        # Summary
        print("=" * 80)
        print("SCRAPING COMPLETE")
        print("=" * 80)
        print(f"Institutions: {len(results['institutions'])}")
        print(f"Collections: {len(results['collections'])}")
        print()
        return results

    def save_results(self, results: Dict[str, Any], output_file: Path) -> None:
        """Save scraping results to a JSON file.

        Creates any missing parent directories; writes UTF-8 JSON without
        ASCII-escaping so Spanish text stays readable.

        Args:
            results: The dict produced by :meth:`run_full_scrape`.
            output_file: Destination path for the JSON document.
        """
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"📁 Results saved to: {output_file}")
        print()
if __name__ == "__main__":
    # Resolve output location relative to the repository root:
    # scripts/scrapers/<this file> -> repo root -> data/isil/AR/.
    base_dir = Path(__file__).parent.parent.parent
    output_dir = base_dir / "data" / "isil" / "AR"
    output_file = output_dir / "agn_argentina_archives.json"
    # Run the full scraping workflow and persist the results.
    scraper = AGNScraper()
    results = scraper.run_full_scrape()
    scraper.save_results(results, output_file)
    print("✅ AGN scraping workflow complete")