glam/scripts/check_austrian_scraping_progress.py
2025-11-19 23:25:22 +01:00

69 lines
2.1 KiB
Python

#!/usr/bin/env python3
"""
Check progress of Austrian ISIL scraping
Shows current status, institutions found, and estimated time remaining
"""
import json
from pathlib import Path
from datetime import datetime
def check_progress(data_dir="data/isil/austria", expected_total=194, seconds_per_page=9):
    """Report progress of the Austrian ISIL scraping run.

    Reads every ``page_*_data.json`` file under *data_dir* (each file is a
    JSON list of institution dicts), counts institutions with and without
    an ``isil_code``, estimates time remaining, and prints a human-readable
    summary including any gaps in the scraped page sequence.

    Args:
        data_dir: Directory holding the scraped ``page_*_data.json`` files.
        expected_total: Total number of pages the scrape should produce.
        seconds_per_page: Average scrape time per page, used for the ETA.

    Returns:
        dict with keys ``total_pages``, ``with_isil``, ``without_isil``,
        ``total_institutions``, ``progress_pct``, ``estimated_minutes`` and
        ``missing_pages`` so callers and tests can inspect the numbers.
    """
    data_dir = Path(data_dir)

    # Each scraped page is persisted as page_<n>_data.json.
    page_files = sorted(data_dir.glob("page_*_data.json"))
    total_pages = len(page_files)

    # Tally institutions across all scraped pages.
    institutions_with_isil = 0
    institutions_without_isil = 0
    for page_file in page_files:
        with open(page_file, 'r', encoding='utf-8') as f:
            institutions = json.load(f)
        for inst in institutions:
            if inst.get('isil_code'):
                institutions_with_isil += 1
            else:
                institutions_without_isil += 1
    total_institutions = institutions_with_isil + institutions_without_isil

    # Progress and ETA; guard against expected_total == 0 and clamp the
    # remaining-page count so an over-complete scrape never yields a
    # negative ETA.
    progress_pct = (total_pages / expected_total) * 100 if expected_total else 0.0
    remaining_pages = max(expected_total - total_pages, 0)
    estimated_minutes = (remaining_pages * seconds_per_page) / 60

    # Print status
    print("=" * 60)
    print("AUSTRIAN ISIL SCRAPING PROGRESS")
    print("=" * 60)
    print(f"Pages scraped: {total_pages}/{expected_total} ({progress_pct:.1f}%)")
    print(f"Total institutions: {total_institutions}")
    print(f" - With ISIL codes: {institutions_with_isil}")
    print(f" - Without ISIL codes: {institutions_without_isil}")
    print()
    print(f"Estimated time remaining: {estimated_minutes:.1f} minutes")
    print()

    # Detect holes in the scraped page sequence (e.g. pages whose fetch
    # failed); page number is the middle token of "page_<n>_data".
    page_nums = [int(f.stem.split('_')[1]) for f in page_files]
    gaps = []
    if page_nums:
        print(f"Page range: {min(page_nums)} - {max(page_nums)}")
        expected_pages = set(range(min(page_nums), max(page_nums) + 1))
        gaps = sorted(expected_pages - set(page_nums))
        if gaps:
            print(f"\n⚠️ Missing pages: {gaps}")
    print("=" * 60)

    return {
        'total_pages': total_pages,
        'with_isil': institutions_with_isil,
        'without_isil': institutions_without_isil,
        'total_institutions': total_institutions,
        'progress_pct': progress_pct,
        'estimated_minutes': estimated_minutes,
        'missing_pages': gaps,
    }
# Entry point: print a progress report when run as a script.
if __name__ == "__main__":
    check_progress()