#!/usr/bin/env python3
"""
Fast batch scraper for Austrian ISIL database
Extracts all institutions from https://www.isil.at in one automated run
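
Usage (the --end default of 194 presumably reflects the result-page count
at the time of writing):

    python scrape_austrian_isil_batch.py                    # scrape everything
    python scrape_austrian_isil_batch.py --test             # first 5 pages only
    python scrape_austrian_isil_batch.py --start 50 --end 100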
"""
import argparse
import json
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

from playwright.sync_api import sync_playwright


def log(msg):
    """Print a timestamped message and flush so progress shows up immediately."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] {msg}")
    sys.stdout.flush()


def extract_institutions(page):
    """Extract institution names and ISIL codes from the current result page."""
    results = page.evaluate("""
        () => {
            const results = [];
            const items = document.querySelectorAll('prm-brief-result-container');
            items.forEach(item => {
                const headingEl = item.querySelector('h3.item-title');
                if (!headingEl) return;
                const fullText = headingEl.textContent.trim();
                // A single anchored pattern covers both plain headings and
                // headings with separators inside the name:
                //   "Stadtarchiv Steyr AT-40201-AR"
                //   "Universität Wien | Bibliothek AT-UBW-097"
                const match = fullText.match(/^(.+?)\\s+(AT-[A-Za-z0-9\\-]+)$/);
                if (match) {
                    results.push({
                        name: match[1].trim(),
                        isil_code: match[2].trim()
                    });
                } else if (fullText.length > 0) {
                    // Keep institutions without an ISIL code as well
                    results.push({
                        name: fullText,
                        isil_code: null
                    });
                }
            });
            return results;
        }
    """)
    return results
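

# Each page_NNN_data.json written by scrape_all_pages holds a list of records
# like
#   {"name": "Stadtarchiv Steyr", "isil_code": "AT-40201-AR"}
# with isil_code set to null when the heading carries no code.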
def scrape_all_pages(start_page=1, end_page=194):
    """
    Scrape all pages from the Austrian ISIL database.

    Args:
        start_page: Starting page number (1-based)
        end_page: Ending page number (inclusive)
    """
    output_dir = Path("data/isil/austria")
    output_dir.mkdir(parents=True, exist_ok=True)

    log(f"Starting batch scrape: pages {start_page} to {end_page}")
    log(f"Output directory: {output_dir}")

    stats = {
        "start_time": datetime.now(timezone.utc).isoformat(),
        "pages_scraped": 0,
        "institutions_extracted": 0,
        "failed_pages": [],
        "start_page": start_page,
        "end_page": end_page
    }

    with sync_playwright() as p:
        log("Launching browser...")
        browser = p.chromium.launch(headless=True)  # Headless for speed
        page = browser.new_page()
        page.set_default_timeout(30000)  # 30 second timeout

        try:
            for page_num in range(start_page, end_page + 1):
                # Primo paginates by result offset, 10 results per page
                offset = (page_num - 1) * 10
                url = (
                    "https://www.isil.at/primo-explore/search"
                    "?query=any,contains,AT-&tab=default_tab"
                    "&search_scope=default_scope&vid=AIS"
                    f"&offset={offset}"
                )
                log(f"\n=== Page {page_num}/{end_page} (offset={offset}) ===")

                try:
                    # Navigate and wait for the page to settle
                    page.goto(url, wait_until="networkidle")
                    time.sleep(5)  # Wait for AngularJS to render

                    # Extract institutions
                    institutions = extract_institutions(page)
                    if not institutions:
                        log(f"⚠️ WARNING: No institutions found on page {page_num}")
                        stats["failed_pages"].append({
                            "page": page_num,
                            "offset": offset,
                            "reason": "No institutions extracted"
                        })
                        continue

                    # Save to JSON
                    output_file = output_dir / f"page_{page_num:03d}_data.json"
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(institutions, f, ensure_ascii=False, indent=2)

                    log(f"✅ Extracted {len(institutions)} institutions")
                    log(f"   Saved to: {output_file.name}")

                    stats["pages_scraped"] += 1
                    stats["institutions_extracted"] += len(institutions)

                    # Rate limiting - be nice to the server
                    time.sleep(2)

                except Exception as e:
                    log(f"❌ ERROR on page {page_num}: {e}")
                    stats["failed_pages"].append({
                        "page": page_num,
                        "offset": offset,
                        "reason": str(e)
                    })
                    continue

                # Progress update every 10 pages (failed pages skip this via continue)
                if page_num % 10 == 0:
                    elapsed = (datetime.now(timezone.utc) - datetime.fromisoformat(stats["start_time"])).total_seconds()
                    rate = stats["pages_scraped"] / elapsed if elapsed > 0 else 0
                    remaining = (end_page - page_num) / rate if rate > 0 else 0
                    log(f"\n📊 Progress: {stats['pages_scraped']}/{end_page - start_page + 1} pages")
                    log(f"   Total institutions: {stats['institutions_extracted']}")
                    log(f"   Rate: {rate:.2f} pages/sec")
                    log(f"   Estimated time remaining: {remaining/60:.1f} minutes\n")

        finally:
            browser.close()

    # Final statistics
    stats["end_time"] = datetime.now(timezone.utc).isoformat()
    stats["duration_seconds"] = (
        datetime.fromisoformat(stats["end_time"]) -
        datetime.fromisoformat(stats["start_time"])
    ).total_seconds()

    # Save statistics
    stats_file = output_dir / f"scraping_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)

    # Print summary
    log("\n" + "=" * 60)
    log("SCRAPING COMPLETE")
    log("=" * 60)
    log(f"Pages scraped: {stats['pages_scraped']}/{end_page - start_page + 1}")
    log(f"Institutions extracted: {stats['institutions_extracted']}")
    log(f"Failed pages: {len(stats['failed_pages'])}")
    log(f"Duration: {stats['duration_seconds']/60:.1f} minutes")
    log(f"Statistics saved to: {stats_file}")

    if stats["failed_pages"]:
        log("\n⚠️ Failed pages (need retry):")
        for fail in stats["failed_pages"]:
            log(f"  - Page {fail['page']}: {fail['reason']}")

    return stats
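

# A minimal retry sketch (an assumption, not part of the original workflow):
# failed pages are recorded in the stats JSON, so each one can be re-scraped
# individually, e.g.
#
#   stats = json.load(open("data/isil/austria/scraping_stats_<timestamp>.json"))
#   for fail in stats["failed_pages"]:
#       scrape_all_pages(start_page=fail["page"], end_page=fail["page"])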


def main():
    parser = argparse.ArgumentParser(description="Scrape Austrian ISIL database")
    parser.add_argument("--start", type=int, default=1, help="Start page (default: 1)")
    parser.add_argument("--end", type=int, default=194, help="End page (default: 194)")
    parser.add_argument("--test", action="store_true", help="Test mode: scrape only 5 pages")
    args = parser.parse_args()

    if args.test:
        log("🧪 TEST MODE: Scraping first 5 pages only")
        scrape_all_pages(start_page=1, end_page=5)
    else:
        scrape_all_pages(start_page=args.start, end_page=args.end)


if __name__ == "__main__":
    main()