69 lines
2.1 KiB
Python
69 lines
2.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Check progress of the Austrian ISIL scraping.

Shows current status, institutions found, and estimated time remaining.
|
|
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
def check_progress(data_dir="data/isil/austria", expected_total=194, seconds_per_page=9):
    """Print a progress report for the Austrian ISIL scraping run.

    Scans ``data_dir`` for ``page_*_data.json`` files, counts the institution
    records they contain (split by whether ``isil_code`` is set), and prints
    the number of pages scraped, the percentage complete, an estimate of the
    time remaining, the range of page numbers found, and any page numbers
    missing inside that range.

    Args:
        data_dir: Directory holding the per-page JSON files (str or Path).
        expected_total: Number of pages a complete scrape is expected to have.
        seconds_per_page: Assumed scraping time per page for the estimate.

    Returns:
        None. The report is written to standard output.
    """
    data_dir = Path(data_dir)

    # Count scraped pages
    page_files = sorted(data_dir.glob("page_*_data.json"))
    total_pages = len(page_files)

    # Count institutions
    institutions_with_isil = 0
    institutions_without_isil = 0
    unreadable_files = []

    for page_file in page_files:
        try:
            with open(page_file, 'r', encoding='utf-8') as f:
                institutions = json.load(f)
        except (OSError, ValueError):
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            # A page that is truncated or cannot be read should be reported,
            # not abort the whole progress report.
            unreadable_files.append(page_file.name)
            continue

        for inst in institutions:
            if inst.get('isil_code'):
                institutions_with_isil += 1
            else:
                institutions_without_isil += 1

    total_institutions = institutions_with_isil + institutions_without_isil

    # Calculate progress (guard against a non-positive expected total)
    if expected_total > 0:
        progress_pct = (total_pages / expected_total) * 100
    else:
        progress_pct = 0.0

    # Estimate time remaining; never negative when more pages exist than expected
    remaining_pages = max(expected_total - total_pages, 0)
    estimated_minutes = (remaining_pages * seconds_per_page) / 60

    # Print status
    print("=" * 60)
    print("AUSTRIAN ISIL SCRAPING PROGRESS")
    print("=" * 60)
    print(f"Pages scraped: {total_pages}/{expected_total} ({progress_pct:.1f}%)")
    print(f"Total institutions: {total_institutions}")
    print(f" - With ISIL codes: {institutions_with_isil}")
    print(f" - Without ISIL codes: {institutions_without_isil}")
    print()
    print(f"Estimated time remaining: {estimated_minutes:.1f} minutes")
    print()

    if unreadable_files:
        print(f"⚠️ Unreadable page files skipped: {unreadable_files}")

    # Show page ranges. The glob pattern guarantees at least three
    # underscore-separated parts in the stem, but the middle part is not
    # necessarily a number, so files with a non-numeric page part are ignored.
    page_nums = []
    for page_file in page_files:
        try:
            page_nums.append(int(page_file.stem.split('_')[1]))
        except ValueError:
            continue

    if page_nums:
        lowest, highest = min(page_nums), max(page_nums)
        print(f"Page range: {lowest} - {highest}")

        # Find gaps between the lowest and highest page number seen
        expected_pages = set(range(lowest, highest + 1))
        actual_pages = set(page_nums)
        gaps = sorted(expected_pages - actual_pages)

        if gaps:
            print(f"\n⚠️ Missing pages: {gaps}")

    print("=" * 60)
|
|
|
|
# Script entry point: print the progress report when executed directly.
if __name__ == "__main__":
    check_progress()
|