#!/usr/bin/env python3
"""
Check progress of Austrian ISIL scraping
Shows current status, institutions found, and estimated time remaining
"""
import json
from pathlib import Path
from datetime import datetime


def check_progress(data_dir="data/isil/austria", expected_total=194,
                   seconds_per_page=9):
    """Report scraping progress for the per-page JSON dumps in *data_dir*.

    Scans ``page_<n>_data.json`` files, tallies institutions with/without
    ISIL codes, prints a human-readable status report, and returns the
    computed statistics for programmatic use.

    Args:
        data_dir: Directory containing the ``page_*_data.json`` files.
        expected_total: Total number of result pages the scrape should yield.
        seconds_per_page: Assumed scrape time per page, used for the ETA.

    Returns:
        dict with keys ``pages_scraped``, ``total_institutions``,
        ``with_isil``, ``without_isil``, ``progress_pct``,
        ``estimated_minutes`` and ``missing_pages``.
    """
    data_dir = Path(data_dir)

    # Count scraped pages (sorted for deterministic processing order)
    page_files = sorted(data_dir.glob("page_*_data.json"))
    total_pages = len(page_files)

    # Tally institutions across every scraped page
    institutions_with_isil = 0
    institutions_without_isil = 0
    for page_file in page_files:
        with open(page_file, 'r', encoding='utf-8') as f:
            institutions = json.load(f)
        for inst in institutions:
            # A missing or empty 'isil_code' counts as "without"
            if inst.get('isil_code'):
                institutions_with_isil += 1
            else:
                institutions_without_isil += 1
    total_institutions = institutions_with_isil + institutions_without_isil

    # Calculate progress; guard against a zero expected_total
    progress_pct = (total_pages / expected_total) * 100 if expected_total else 0.0

    # Estimate time remaining; clamp so an over-complete scrape never
    # reports a negative ETA
    remaining_pages = max(expected_total - total_pages, 0)
    estimated_minutes = (remaining_pages * seconds_per_page) / 60

    # Print status
    print("=" * 60)
    print("AUSTRIAN ISIL SCRAPING PROGRESS")
    print("=" * 60)
    print(f"Pages scraped: {total_pages}/{expected_total} ({progress_pct:.1f}%)")
    print(f"Total institutions: {total_institutions}")
    print(f" - With ISIL codes: {institutions_with_isil}")
    print(f" - Without ISIL codes: {institutions_without_isil}")
    print()
    print(f"Estimated time remaining: {estimated_minutes:.1f} minutes")
    print()

    # Show page ranges and detect gaps (failed / skipped pages).
    # Filename stem is "page_<n>_data", so index 1 is the page number.
    page_nums = [int(f.stem.split('_')[1]) for f in page_files]
    gaps = []
    if page_nums:
        print(f"Page range: {min(page_nums)} - {max(page_nums)}")
        expected_pages = set(range(min(page_nums), max(page_nums) + 1))
        actual_pages = set(page_nums)
        gaps = sorted(expected_pages - actual_pages)
        if gaps:
            print(f"\n⚠️ Missing pages: {gaps}")

    print("=" * 60)

    return {
        'pages_scraped': total_pages,
        'total_institutions': total_institutions,
        'with_isil': institutions_with_isil,
        'without_isil': institutions_without_isil,
        'progress_pct': progress_pct,
        'estimated_minutes': estimated_minutes,
        'missing_pages': gaps,
    }


if __name__ == '__main__':
    check_progress()