#!/usr/bin/env python3
"""
Scraper for Archivo General de la Nación (AGN) - Argentina
Extracts archival collections and library catalogs from AGN website.

GLAM Data Extraction Project
Country: Argentina (AR)
Source: AGN (Archivo General de la Nación)
"""
|
|
|
|
import json
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
|
|
|
|
class AGNScraper:
    """Scraper for Argentina's Archivo General de la Nación (AGN) website.

    Extracts the main institution record, the archival fondos/collections
    listings and, when reachable, the AGN library's KOHA catalog endpoint.
    Results are plain dicts suitable for JSON serialization.
    """

    def __init__(self) -> None:
        self.base_url = "https://argentina.gob.ar"
        # Identify the scraper politely; some government sites reject
        # generic/default user agents.
        self.headers = {
            'User-Agent': 'GLAM-AGN-Scraper/1.0 (Academic Research; Heritage Institution Mapping)'
        }
        # Accumulator kept for external callers; not used internally.
        self.institutions: List[Dict[str, Any]] = []

    @staticmethod
    def _now_iso() -> str:
        """Return the current UTC time as an ISO-8601 timestamp string."""
        return datetime.now(timezone.utc).isoformat()

    def scrape_main_archive(self) -> Optional[Dict[str, Any]]:
        """Scrape main AGN archive information.

        Returns:
            The institution record dict, or ``None`` when the page cannot
            be fetched or parsed (best-effort: errors are printed).
        """
        print("=" * 80)
        print("SCRAPING: Archivo General de la Nación (Main Institution)")
        print("=" * 80)
        print()

        url = f"{self.base_url}/interior/archivo-general-de-la-nacion"

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Core fields are known a priori; the page is only probed for
            # address/contact details below.
            institution: Dict[str, Any] = {
                "name": "Archivo General de la Nación",
                "name_en": "National Archive of Argentina",
                "type": "ARCHIVE",
                "country": "AR",
                "city": "Buenos Aires",
                "province": "Ciudad Autónoma de Buenos Aires",
                "url": url,
                "data_source": "AGN_WEB",
                "extraction_date": self._now_iso(),
                "description": "Argentina's national archive, responsible for preserving and providing access to government records and historical documents."
            }

            # Try to find address information in the page text.
            content = soup.get_text()
            if "Leandro N. Alem" in content or "Alem" in content:
                institution["street_address"] = "Leandro N. Alem 246"
                institution["postal_code"] = "C1003AAP"

            # Check for contact information: first mailto: link wins.
            email_links = soup.find_all('a', href=lambda x: x and 'mailto:' in x)
            if email_links:
                emails = [link['href'].replace('mailto:', '') for link in email_links]
                if emails:
                    institution["email"] = emails[0]

            print(f"✅ Extracted: {institution['name']}")
            print(f" Type: {institution['type']}")
            print(f" City: {institution['city']}")
            print()

            return institution

        except Exception as e:
            # Broad catch is deliberate: any network/parse failure should
            # degrade to "no record" rather than abort the whole workflow.
            print(f"❌ Error scraping main archive: {e}")
            return None

    def scrape_fondos_colecciones(self) -> List[Dict[str, Any]]:
        """Scrape archival fondos and collections listings.

        Tries two extraction strategies (tables first, then list items) and
        de-duplicates by collection name, so a fondo that appears in both
        page structures is recorded only once.

        Returns:
            A list of collection record dicts (empty on failure).
        """
        print("=" * 80)
        print("SCRAPING: Fondos y Colecciones Documentales")
        print("=" * 80)
        print()

        url = f"{self.base_url}/interior/archivo-general/fondosycolecciones"
        collections: List[Dict[str, Any]] = []
        seen_names: set = set()  # guards against duplicates across strategies

        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # The page structure may vary, so we try multiple strategies.

            # Strategy 1: tables with one fondo per row (name, dates).
            tables = soup.find_all('table')
            print(f"Found {len(tables)} tables")

            for table in tables:
                for row in table.find_all('tr')[1:]:  # skip header row
                    cells = row.find_all(['td', 'th'])
                    if len(cells) < 2:
                        continue
                    name = cells[0].get_text(strip=True)
                    if not name or name in seen_names:
                        continue
                    seen_names.add(name)
                    collection = {
                        "type": "collection",
                        "name": name,
                        "dates": cells[1].get_text(strip=True),
                        "parent_institution": "Archivo General de la Nación",
                        "extraction_date": self._now_iso()
                    }
                    collections.append(collection)
                    print(f" • {collection['name']}")

            # Strategy 2: bulleted/numbered lists of collection names.
            for listing in soup.find_all(['ul', 'ol']):
                for item in listing.find_all('li'):
                    text = item.get_text(strip=True)
                    # Filter for collection-like entries with a plausible
                    # title length (10 < len < 200 chars).
                    if not any(keyword in text.lower()
                               for keyword in ['fondo', 'colección', 'archivo', 'biblioteca']):
                        continue
                    if not 10 < len(text) < 200 or text in seen_names:
                        continue
                    seen_names.add(text)
                    collections.append({
                        "type": "collection",
                        "name": text,
                        "parent_institution": "Archivo General de la Nación",
                        "extraction_date": self._now_iso()
                    })
                    print(f" • {text[:80]}")

            print()
            print(f"✅ Extracted {len(collections)} collections")
            print()

            return collections

        except Exception as e:
            # Best-effort: report and return an empty list.
            print(f"❌ Error scraping fondos: {e}")
            return []

    def scrape_koha_catalog(self) -> Optional[Dict[str, Any]]:
        """Attempt to scrape AGN's KOHA library catalog (if accessible).

        Probes a small list of likely catalog URLs and builds a library
        record for the first one answering HTTP 200.

        Returns:
            The library record dict, or ``None`` when no URL is reachable.
        """
        print("=" * 80)
        print("CHECKING: KOHA Library Catalog")
        print("=" * 80)
        print()

        # KOHA typically runs on a subdomain or separate URL.
        potential_urls = [
            "https://koha.mininterior.gob.ar",
            "https://biblioteca.agn.gov.ar",
            f"{self.base_url}/interior/archivo-general/biblioteca"
        ]

        for url in potential_urls:
            try:
                print(f"Trying: {url}")
                response = requests.get(url, headers=self.headers, timeout=10)

                if response.status_code == 200:
                    print(f" ✅ Found KOHA catalog at: {url}")

                    # Create library record for the first live endpoint.
                    return {
                        "name": "Biblioteca del Archivo General de la Nación",
                        "name_en": "Library of the National Archive",
                        "type": "LIBRARY",
                        "country": "AR",
                        "city": "Buenos Aires",
                        "parent_institution": "Archivo General de la Nación",
                        "catalog_url": url,
                        "catalog_system": "KOHA",
                        "data_source": "AGN_WEB",
                        "extraction_date": self._now_iso()
                    }
                else:
                    print(f" ⚠️ Status {response.status_code}")

            except Exception as e:
                # Unreachable host is expected for some candidates.
                print(f" ❌ Not accessible: {e}")

        print()
        print("⚠️ KOHA catalog not found at expected URLs")
        print()
        return None

    def run_full_scrape(self) -> Dict[str, Any]:
        """Run complete AGN scraping workflow.

        Sequentially scrapes the main archive, the fondos/collections page
        and the KOHA catalog, sleeping between requests for rate limiting.

        Returns:
            A results dict with ``metadata``, ``institutions`` and
            ``collections`` keys.
        """
        print("=" * 80)
        print("ARCHIVO GENERAL DE LA NACIÓN - FULL SCRAPE")
        print("=" * 80)
        print(f"Started: {self._now_iso()}")
        print()

        results: Dict[str, Any] = {
            "metadata": {
                "source": "Archivo General de la Nación (AGN)",
                "country": "AR",
                "extraction_date": self._now_iso(),
                "scraper": "scrape_agn_argentina.py"
            },
            "institutions": [],
            "collections": []
        }

        # 1. Scrape main archive.
        main_archive = self.scrape_main_archive()
        if main_archive:
            results["institutions"].append(main_archive)

        time.sleep(2)  # Rate limiting

        # 2. Scrape fondos and collections.
        results["collections"].extend(self.scrape_fondos_colecciones())

        time.sleep(2)

        # 3. Check for library catalog.
        library = self.scrape_koha_catalog()
        if library:
            results["institutions"].append(library)

        # Summary
        print("=" * 80)
        print("SCRAPING COMPLETE")
        print("=" * 80)
        print(f"Institutions: {len(results['institutions'])}")
        print(f"Collections: {len(results['collections'])}")
        print()

        return results

    def save_results(self, results: Dict[str, Any], output_file: Path) -> None:
        """Save scraping results to a UTF-8 JSON file.

        Creates any missing parent directories before writing.

        Args:
            results: The results dict produced by :meth:`run_full_scrape`.
            output_file: Destination path for the JSON file.
        """
        output_file.parent.mkdir(parents=True, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps Spanish accents readable in the file.
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"📁 Results saved to: {output_file}")
        print()
|
|
|
|
def main() -> None:
    """Run the full AGN scrape and save results under ``data/isil/AR``."""
    # Repository root is three levels above this script's directory.
    base_dir = Path(__file__).parent.parent.parent
    output_dir = base_dir / "data" / "isil" / "AR"
    output_file = output_dir / "agn_argentina_archives.json"

    # Run scraper and persist the combined results.
    scraper = AGNScraper()
    results = scraper.run_full_scrape()
    scraper.save_results(results, output_file)

    print("✅ AGN scraping workflow complete")


if __name__ == "__main__":
    main()
|