#!/usr/bin/env python3
"""
Bosnia ISIL Code Scraper

Automated script to check all COBISS.BH libraries for ISIL codes.

Strategy:
1. Load bosnia_cobiss_libraries_raw.json
2. For each library:
   a. Check COBISS library record page
   b. Check institutional website (if available)
   c. Search for patterns: BA-, BO-, ISIL, ISO 15511
3. Output results to bosnia_isil_codes_found.json

Estimated time: 80 libraries x 5 min = ~6.5 hours
With automation: ~10-20 minutes
"""

import json
import re
import time
from pathlib import Path

# NOTE(review): playwright is imported lazily inside the functions that use it
# so this module (and its pure helpers such as search_for_isil) can be imported
# and tested on machines without playwright installed. Behavior when running
# the scraper is unchanged.

# Paths
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/bosnia")
INPUT_FILE = DATA_DIR / "bosnia_cobiss_libraries_raw.json"
OUTPUT_FILE = DATA_DIR / "bosnia_isil_codes_found.json"
LOG_FILE = DATA_DIR / "scraper_log.txt"

# ISIL code patterns, compiled once (case-insensitive) instead of on every
# search_for_isil call.
ISIL_PATTERNS = [
    re.compile(r'BA-[A-Z0-9]+', re.IGNORECASE),                   # BA- prefix (ISO 3166-1)
    re.compile(r'BO-[A-Z0-9]+', re.IGNORECASE),                   # BO- prefix (legacy?)
    re.compile(r'ISIL[:\s]+[A-Z]{2}-[A-Z0-9]+', re.IGNORECASE),   # "ISIL: XX-XXXXX"
    re.compile(r'ISO\s*15511', re.IGNORECASE),                    # ISO standard mention
]


def log_message(message):
    """Log *message* to both the console and LOG_FILE (timestamped)."""
    print(message)
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {message}\n")


def search_for_isil(text):
    """Return a sorted list of the unique ISIL-code pattern matches in *text*.

    sorted() replaces the previous list(set(...)) so the result order is
    deterministic between runs.
    """
    found_codes = []
    for pattern in ISIL_PATTERNS:
        found_codes.extend(pattern.findall(text))
    return sorted(set(found_codes))


def check_cobiss_library_page(page, acronym, timeout=10000):
    """Check COBISS library information pages for ISIL codes.

    Tries several known COBISS URL layouts for *acronym* and returns
    {"source": url, "codes": [...]} for the first page that yields ISIL
    codes, or None if none do.
    """
    from playwright.sync_api import TimeoutError as PlaywrightTimeout

    try:
        # Try various COBISS URLs
        urls_to_try = [
            f"https://bh.cobiss.net/en/libraries/{acronym.lower()}",
            f"https://plus.cobiss.net/cobiss/bh/en/library/{acronym.lower()}",
            f"https://bh.cobiss.net/biblioteke/{acronym.lower()}",
        ]
        for url in urls_to_try:
            try:
                page.goto(url, timeout=timeout, wait_until='domcontentloaded')
                # COBISS serves its 404 page with HTTP 200, so detect it by title.
                if page.title() != "ERROR - 404 - Page not found":
                    isil_codes = search_for_isil(page.content())
                    if isil_codes:
                        return {"source": url, "codes": isil_codes}
            except PlaywrightTimeout:
                # Slow/unreachable URL variant: silently try the next one.
                continue
            except Exception as e:
                log_message(f"Error checking {url}: {e}")
                continue
        return None
    except Exception as e:
        log_message(f"Error in check_cobiss_library_page for {acronym}: {e}")
        return None


def check_institution_website(page, homepage, timeout=15000):
    """Check an institutional website for ISIL codes.

    Scans the main page, then up to three "about"/"contact"-style links
    (English and Bosnian slugs). Returns {"source": url, "codes": [...]}
    or None.
    """
    # "domača stranica biblioteke" is a placeholder value from the raw data,
    # not a real URL.
    if not homepage or homepage == "domača stranica biblioteke":
        return None
    try:
        # Normalize URL
        if not homepage.startswith('http'):
            homepage = f"https://{homepage}"
        page.goto(homepage, timeout=timeout, wait_until='domcontentloaded')

        # Check main page
        isil_codes = search_for_isil(page.content())
        if isil_codes:
            return {"source": homepage, "codes": isil_codes}

        # Check "About" or "Contact" pages
        about_links = page.locator(
            'a[href*="about"], a[href*="o-nama"], a[href*="kontakt"]'
        ).all()
        for link in about_links[:3]:  # Check first 3 matching links
            try:
                link.click(timeout=5000)
                time.sleep(2)  # let any client-side rendering settle
                isil_codes = search_for_isil(page.content())
                if isil_codes:
                    return {"source": page.url, "codes": isil_codes}
            except Exception:
                # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
                # are no longer swallowed.
                continue
        return None
    except Exception as e:
        log_message(f"Error checking website {homepage}: {e}")
        return None


def scrape_all_libraries():
    """Main scraping loop.

    Loads the raw library list, checks COBISS pages and institutional
    websites for each entry, saves intermediate results every 10 libraries,
    and writes the final report to OUTPUT_FILE.
    """
    from playwright.sync_api import sync_playwright

    log_message("Starting Bosnia ISIL scraper...")

    # Load libraries
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        libraries = json.load(f)
    total = len(libraries)
    log_message(f"Loaded {total} libraries")

    results = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        for i, library in enumerate(libraries, 1):
            # Progress counter uses the real list length (was hard-coded "80").
            log_message(f"[{i}/{total}] Checking: {library['name']} ({library['acronym']})")

            result = {
                "number": library["number"],
                "name": library["name"],
                "city": library["city"],
                "acronym": library["acronym"],
                "homepage": library["homepage"],
                "isil_found": False,
                "isil_codes": [],
                "sources_checked": [],
                "notes": [],
            }

            # Check COBISS pages
            cobiss_result = check_cobiss_library_page(page, library["acronym"])
            result["sources_checked"].append("COBISS library pages")
            if cobiss_result:
                result["isil_found"] = True
                result["isil_codes"].extend(cobiss_result["codes"])
                result["notes"].append(f"Found in COBISS: {cobiss_result['source']}")
                log_message(f"  ✓ Found ISIL codes in COBISS: {cobiss_result['codes']}")

            # Check institution website (skip the placeholder value)
            if library["homepage"] and library["homepage"] != "domača stranica biblioteke":
                website_result = check_institution_website(page, library["homepage"])
                result["sources_checked"].append(f"Website: {library['homepage']}")
                if website_result:
                    result["isil_found"] = True
                    result["isil_codes"].extend(website_result["codes"])
                    result["notes"].append(f"Found on website: {website_result['source']}")
                    log_message(f"  ✓ Found ISIL codes on website: {website_result['codes']}")

            # Remove duplicate codes (a code may appear on both sources)
            result["isil_codes"] = sorted(set(result["isil_codes"]))
            results.append(result)

            # Small delay between requests to be polite to the servers
            time.sleep(2)

            # Save intermediate results every 10 libraries so a crash loses
            # at most 10 libraries' worth of work
            if i % 10 == 0:
                with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
                log_message(f"  → Intermediate save: {i}/{total} libraries processed")

        browser.close()

    # Final save
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    # Summary statistics. The unique-code count now deduplicates across
    # libraries (the old sum counted a code once per library that had it,
    # despite being labelled "unique").
    found_count = sum(1 for r in results if r["isil_found"])
    unique_codes = {code for r in results for code in r["isil_codes"]}
    log_message("\n" + "=" * 50)
    log_message("SCRAPING COMPLETE")
    log_message("=" * 50)
    log_message(f"Libraries checked: {len(results)}")
    log_message(f"ISIL codes found: {found_count}/{len(results)} libraries")
    log_message(f"Total unique codes: {len(unique_codes)}")
    log_message(f"Results saved to: {OUTPUT_FILE}")
    log_message("=" * 50)


if __name__ == "__main__":
    scrape_all_libraries()