203 lines
8 KiB
Python
Executable file
203 lines
8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Fast batch scraper for Austrian ISIL database
|
|
Extracts all institutions from https://www.isil.at in one automated run
|
|
"""
|
|
|
|
from playwright.sync_api import sync_playwright
|
|
import json
|
|
import time
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
import sys
|
|
|
|
def log(msg):
    """Print *msg* prefixed with a HH:MM:SS timestamp and flush stdout immediately."""
    now = datetime.now().strftime('%H:%M:%S')
    # flush=True keeps progress visible in real time when stdout is piped/redirected
    print(f"[{now}] {msg}", flush=True)
|
|
|
|
def extract_institutions(page):
    """Extract institution names and ISIL codes from the current results page.

    Runs JavaScript inside the page to read each Primo brief-result heading
    (``h3.item-title`` inside ``prm-brief-result-container``).  A heading that
    ends in an Austrian ISIL code ("AT-...") is split into name + code;
    non-empty headings without a trailing code are kept with a null code.

    Note: the previous version built three regexes that were all identical
    (the greedy/non-greedy difference cannot change the split, because the
    AT-token contains no whitespace and the pattern is anchored at ``$``);
    one was never used and the second branch was unreachable.  Collapsed to
    a single pattern with identical matching behavior.

    Args:
        page: Playwright page currently showing a search-results view.

    Returns:
        list[dict]: dicts with keys ``name`` (str) and ``isil_code``
        (str or None).
    """
    results = page.evaluate("""
        () => {
            const results = [];
            const items = document.querySelectorAll('prm-brief-result-container');

            // The ISIL code, when present, is always the last
            // whitespace-separated token of the heading.
            // Example: "Stadtarchiv Steyr AT-40201-AR"
            const isilPattern = /^(.+?)\\s+(AT-[A-Za-z0-9\\-]+)$/;

            items.forEach(item => {
                const headingEl = item.querySelector('h3.item-title');
                if (!headingEl) return;

                const fullText = headingEl.textContent.trim();
                const match = fullText.match(isilPattern);

                if (match) {
                    results.push({
                        name: match[1].trim(),
                        isil_code: match[2].trim()
                    });
                } else if (fullText.length > 0) {
                    // Institutions WITHOUT ISIL codes are still recorded.
                    results.push({
                        name: fullText,
                        isil_code: null
                    });
                }
            });

            return results;
        }
    """)
    return results
|
|
|
|
def scrape_all_pages(start_page=1, end_page=194):
    """
    Scrape pages from the Austrian ISIL database and save results as JSON.

    Results are paginated 10 records per page; page N maps to offset
    (N - 1) * 10 in the Primo search URL.  Each successfully scraped page is
    written to data/isil/austria/page_NNN_data.json; empty or failing pages
    are recorded in the returned stats for later retry.

    Fix vs. previous version: the "progress update every 10 pages" check sat
    after `continue` statements, so it was silently skipped whenever the
    10th page failed or returned no institutions.  It now runs at the end of
    every iteration regardless of outcome.

    Args:
        start_page: Starting page number (1-based)
        end_page: Ending page number (inclusive)

    Returns:
        dict: Run statistics (timing, page/institution counts, failed pages).
    """
    output_dir = Path("data/isil/austria")
    output_dir.mkdir(parents=True, exist_ok=True)

    log(f"Starting batch scrape: pages {start_page} to {end_page}")
    log(f"Output directory: {output_dir}")

    stats = {
        "start_time": datetime.now(timezone.utc).isoformat(),
        "pages_scraped": 0,
        "institutions_extracted": 0,
        "failed_pages": [],
        "start_page": start_page,
        "end_page": end_page
    }

    with sync_playwright() as p:
        log("Launching browser...")
        browser = p.chromium.launch(headless=True)  # Headless for speed
        page = browser.new_page()
        page.set_default_timeout(30000)  # 30 second timeout

        try:
            for page_num in range(start_page, end_page + 1):
                # Primo uses 0-based record offsets, 10 results per page.
                offset = (page_num - 1) * 10
                url = f"https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset={offset}"

                log(f"\n=== Page {page_num}/{end_page} (offset={offset}) ===")

                try:
                    # Navigate to page
                    page.goto(url, wait_until="networkidle")
                    time.sleep(5)  # Wait for AngularJS to render

                    # Extract institutions
                    institutions = extract_institutions(page)

                    if not institutions:
                        log(f"⚠️ WARNING: No institutions found on page {page_num}")
                        stats["failed_pages"].append({
                            "page": page_num,
                            "offset": offset,
                            "reason": "No institutions extracted"
                        })
                    else:
                        # Save to JSON
                        output_file = output_dir / f"page_{page_num:03d}_data.json"
                        with open(output_file, 'w', encoding='utf-8') as f:
                            json.dump(institutions, f, ensure_ascii=False, indent=2)

                        log(f"✅ Extracted {len(institutions)} institutions")
                        log(f" Saved to: {output_file.name}")

                        stats["pages_scraped"] += 1
                        stats["institutions_extracted"] += len(institutions)

                        # Rate limiting - be nice to the server
                        time.sleep(2)

                except Exception as e:
                    log(f"❌ ERROR on page {page_num}: {e}")
                    stats["failed_pages"].append({
                        "page": page_num,
                        "offset": offset,
                        "reason": str(e)
                    })

                # Progress update every 10 pages — runs even if this page
                # failed, so long runs always report progress.
                if page_num % 10 == 0:
                    elapsed = (datetime.now(timezone.utc) - datetime.fromisoformat(stats["start_time"])).total_seconds()
                    rate = stats["pages_scraped"] / elapsed if elapsed > 0 else 0
                    remaining = (end_page - page_num) / rate if rate > 0 else 0
                    log(f"\n📊 Progress: {stats['pages_scraped']}/{end_page} pages")
                    log(f" Total institutions: {stats['institutions_extracted']}")
                    log(f" Rate: {rate:.2f} pages/sec")
                    log(f" Estimated time remaining: {remaining/60:.1f} minutes\n")

        finally:
            browser.close()

    # Final statistics
    stats["end_time"] = datetime.now(timezone.utc).isoformat()
    stats["duration_seconds"] = (
        datetime.fromisoformat(stats["end_time"]) -
        datetime.fromisoformat(stats["start_time"])
    ).total_seconds()

    # Save statistics (timestamped filename so repeated runs don't clobber)
    stats_file = output_dir / f"scraping_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)

    # Print summary
    log("\n" + "="*60)
    log("SCRAPING COMPLETE")
    log("="*60)
    log(f"Pages scraped: {stats['pages_scraped']}/{end_page - start_page + 1}")
    log(f"Institutions extracted: {stats['institutions_extracted']}")
    log(f"Failed pages: {len(stats['failed_pages'])}")
    log(f"Duration: {stats['duration_seconds']/60:.1f} minutes")
    log(f"Statistics saved to: {stats_file}")

    if stats["failed_pages"]:
        log("\n⚠️ Failed pages (need retry):")
        for fail in stats["failed_pages"]:
            log(f" - Page {fail['page']}: {fail['reason']}")

    return stats
|
|
|
|
def main():
    """Command-line entry point: parse arguments and launch the scrape."""
    import argparse

    parser = argparse.ArgumentParser(description="Scrape Austrian ISIL database")
    parser.add_argument("--start", type=int, default=1, help="Start page (default: 1)")
    parser.add_argument("--end", type=int, default=194, help="End page (default: 194)")
    parser.add_argument("--test", action="store_true", help="Test mode: scrape only 5 pages")
    args = parser.parse_args()

    if not args.test:
        # Normal run: honor the requested page range.
        scrape_all_pages(start_page=args.start, end_page=args.end)
    else:
        # Quick smoke test: fixed range of the first five pages.
        log("🧪 TEST MODE: Scraping first 5 pages only")
        scrape_all_pages(start_page=1, end_page=5)
|
|
|
|
if __name__ == "__main__":
|
|
main()
|