glam/scripts/bosnia_isil_scraper.py
2025-11-19 23:25:22 +01:00

202 lines
7.3 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Bosnia ISIL Code Scraper
Automated script to check all 80 COBISS.BH libraries for ISIL codes.
Strategy:
1. Load bosnia_cobiss_libraries_raw.json
2. For each library:
a. Check COBISS library record page
b. Check institutional website (if available)
c. Search for patterns: BA-, BO-, ISIL, ISO 15511
3. Output results to bosnia_isil_codes_found.json
Estimated time: 80 libraries × 5 min = ~6.5 hours
With automation: ~10-20 minutes
"""
import json
import time
import re
from pathlib import Path
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
# Paths
# NOTE(review): absolute, user-specific path — assumes this exact machine layout.
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/bosnia")
INPUT_FILE = DATA_DIR / "bosnia_cobiss_libraries_raw.json"  # scraped library list (input)
OUTPUT_FILE = DATA_DIR / "bosnia_isil_codes_found.json"  # results written here
LOG_FILE = DATA_DIR / "scraper_log.txt"  # append-only run log
# ISIL code patterns (matched case-insensitively in search_for_isil)
ISIL_PATTERNS = [
    r'BA-[A-Z0-9]+',  # BA- prefix (ISO 3166-1)
    r'BO-[A-Z0-9]+',  # BO- prefix (legacy?)
    r'ISIL[:\s]+[A-Z]{2}-[A-Z0-9]+',  # "ISIL: XX-XXXXX"
    r'ISO\s*15511',  # ISO standard mention
]
def log_message(message):
    """Print *message* to the console and append it, timestamped, to LOG_FILE."""
    print(message)
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
    with open(LOG_FILE, 'a', encoding='utf-8') as handle:
        handle.write(f"{timestamp} - {message}\n")
def search_for_isil(text):
    """Search *text* for ISIL code patterns.

    Args:
        text: Arbitrary text/HTML to scan.

    Returns:
        A de-duplicated, sorted list of matched code strings. Sorting makes
        the output deterministic across runs — a bare ``list(set(...))``
        depends on hash ordering, which made the result JSON unstable and
        noisy to diff between runs.
    """
    found_codes = []
    for pattern in ISIL_PATTERNS:
        # Case-insensitive: pages may render codes in lower case.
        # NOTE: de-duplication remains case-sensitive, as before.
        found_codes.extend(re.findall(pattern, text, re.IGNORECASE))
    return sorted(set(found_codes))
def check_cobiss_library_page(page, acronym, timeout=10000):
    """Check COBISS library information page for ISIL codes.

    Tries the known COBISS URL layouts for the library's acronym and returns
    ``{"source": url, "codes": [...]}`` for the first page whose HTML contains
    an ISIL pattern, or ``None`` when nothing is found.
    """
    try:
        slug = acronym.lower()
        # Try various COBISS URLs
        candidate_urls = (
            f"https://bh.cobiss.net/en/libraries/{slug}",
            f"https://plus.cobiss.net/cobiss/bh/en/library/{slug}",
            f"https://bh.cobiss.net/biblioteke/{slug}",
        )
        for url in candidate_urls:
            try:
                page.goto(url, timeout=timeout, wait_until='domcontentloaded')
                # COBISS serves a soft-404 page; detect it by title and skip.
                if page.title() == "ERROR - 404 - Page not found":
                    continue
                codes = search_for_isil(page.content())
                if codes:
                    return {"source": url, "codes": codes}
            except PlaywrightTimeout:
                continue
            except Exception as e:
                log_message(f"Error checking {url}: {e}")
                continue
        return None
    except Exception as e:
        log_message(f"Error in check_cobiss_library_page for {acronym}: {e}")
        return None
def check_institution_website(page, homepage, timeout=15000):
    """Check an institution's own website for ISIL codes.

    Loads *homepage* (a ``https://`` scheme is prepended if missing), scans
    the landing page, then follows up to three "about"/"contact"-style links
    and scans those too.

    Args:
        page: Playwright page used for navigation.
        homepage: URL (or bare host) from the library record; the literal
            placeholder "domača stranica biblioteke" means "no homepage".
        timeout: Navigation timeout in milliseconds.

    Returns:
        ``{"source": url, "codes": [...]}`` for the first page containing an
        ISIL pattern, or ``None``.
    """
    if not homepage or homepage == "domača stranica biblioteke":
        return None
    try:
        # Normalize URL
        if not homepage.startswith('http'):
            homepage = f"https://{homepage}"
        page.goto(homepage, timeout=timeout, wait_until='domcontentloaded')
        # Check main page
        isil_codes = search_for_isil(page.content())
        if isil_codes:
            return {"source": homepage, "codes": isil_codes}
        # Check "About" or "Contact" pages
        about_links = page.locator('a[href*="about"], a[href*="o-nama"], a[href*="kontakt"]').all()
        for link in about_links[:3]:  # Check first 3 matching links
            try:
                link.click(timeout=5000)
                time.sleep(2)  # crude wait for any client-side rendering
                isil_codes = search_for_isil(page.content())
                if isil_codes:
                    return {"source": page.url, "codes": isil_codes}
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit and made the scraper
                # impossible to interrupt mid-run; narrowed to Exception.
                continue
        return None
    except Exception as e:
        log_message(f"Error checking website {homepage}: {e}")
        return None
def scrape_all_libraries():
    """Main scraping function.

    Loads the library list from INPUT_FILE, checks COBISS pages and each
    institutional website for ISIL codes, and writes results to OUTPUT_FILE
    (with an intermediate save every 10 libraries so a crash loses little
    work). Logs progress and a final summary via log_message.
    """
    log_message("Starting Bosnia ISIL scraper...")
    # Load libraries
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        libraries = json.load(f)
    total = len(libraries)
    log_message(f"Loaded {total} libraries")
    results = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            for i, library in enumerate(libraries, 1):
                # BUG FIX: progress counters were hard-coded to "80"; use the
                # actual list length so logs stay correct if the input changes.
                log_message(f"[{i}/{total}] Checking: {library['name']} ({library['acronym']})")
                result = {
                    "number": library["number"],
                    "name": library["name"],
                    "city": library["city"],
                    "acronym": library["acronym"],
                    "homepage": library["homepage"],
                    "isil_found": False,
                    "isil_codes": [],
                    "sources_checked": [],
                    "notes": []
                }
                # Check COBISS pages
                cobiss_result = check_cobiss_library_page(page, library["acronym"])
                result["sources_checked"].append("COBISS library pages")
                if cobiss_result:
                    result["isil_found"] = True
                    result["isil_codes"].extend(cobiss_result["codes"])
                    result["notes"].append(f"Found in COBISS: {cobiss_result['source']}")
                    log_message(f" ✓ Found ISIL codes in COBISS: {cobiss_result['codes']}")
                # Check institution website ("domača stranica biblioteke" is a
                # placeholder meaning "no homepage recorded")
                if library["homepage"] and library["homepage"] != "domača stranica biblioteke":
                    website_result = check_institution_website(page, library["homepage"])
                    result["sources_checked"].append(f"Website: {library['homepage']}")
                    if website_result:
                        result["isil_found"] = True
                        result["isil_codes"].extend(website_result["codes"])
                        result["notes"].append(f"Found on website: {website_result['source']}")
                        log_message(f" ✓ Found ISIL codes on website: {website_result['codes']}")
                # Remove duplicate codes
                result["isil_codes"] = list(set(result["isil_codes"]))
                results.append(result)
                # Small delay between requests (be polite to the servers)
                time.sleep(2)
                # Save intermediate results every 10 libraries
                if i % 10 == 0:
                    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
                        json.dump(results, f, ensure_ascii=False, indent=2)
                    log_message(f" → Intermediate save: {i}/{total} libraries processed")
        finally:
            # BUG FIX: close the browser even when a library raises, so the
            # headless Chromium process is not leaked on failure.
            browser.close()
    # Final save
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    # Summary statistics
    found_count = sum(1 for r in results if r["isil_found"])
    total_codes = sum(len(r["isil_codes"]) for r in results)
    log_message("\n" + "="*50)
    log_message("SCRAPING COMPLETE")
    log_message("="*50)
    log_message(f"Libraries checked: {len(results)}")
    log_message(f"ISIL codes found: {found_count}/{len(results)} libraries")
    log_message(f"Total unique codes: {total_codes}")
    log_message(f"Results saved to: {OUTPUT_FILE}")
    log_message("="*50)
# Entry point: run the full scrape when executed as a script.
if __name__ == "__main__":
    scrape_all_libraries()