#!/usr/bin/env python3
"""
Bosnia ISIL Code Scraper

Automated script to check all 80 COBISS.BH libraries for ISIL codes.

Strategy:
1. Load bosnia_cobiss_libraries_raw.json
2. For each library:
   a. Check COBISS library record page
   b. Check institutional website (if available)
   c. Search for patterns: BA-, BO-, ISIL, ISO 15511
3. Output results to bosnia_isil_codes_found.json

Estimated time: 80 libraries × 5 min = ~6.5 hours
With automation: ~10-20 minutes
"""

import json
import time
import re
from pathlib import Path
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout

# Paths — absolute, tied to one machine's layout.
# NOTE(review): consider making DATA_DIR configurable (env var / CLI arg).
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/bosnia")
INPUT_FILE = DATA_DIR / "bosnia_cobiss_libraries_raw.json"
OUTPUT_FILE = DATA_DIR / "bosnia_isil_codes_found.json"
LOG_FILE = DATA_DIR / "scraper_log.txt"

# ISIL code patterns (applied with re.IGNORECASE in search_for_isil,
# so lowercase page text also matches)
ISIL_PATTERNS = [
    r'BA-[A-Z0-9]+',                  # BA- prefix (ISO 3166-1)
    r'BO-[A-Z0-9]+',                  # BO- prefix (legacy?)
    r'ISIL[:\s]+[A-Z]{2}-[A-Z0-9]+',  # "ISIL: XX-XXXXX"
    r'ISO\s*15511',                   # ISO standard mention
]
def log_message(message):
    """Echo *message* to stdout and append a timestamped copy to LOG_FILE."""
    print(message)
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    with open(LOG_FILE, 'a', encoding='utf-8') as handle:
        handle.write(f"{stamp} - {message}\n")
def search_for_isil(text, patterns=None):
    """Search *text* for ISIL code patterns.

    Args:
        text: Page content (HTML or plain text) to scan.
        patterns: Optional list of regex strings to match; defaults to the
            module-level ISIL_PATTERNS. (New keyword, backward-compatible.)

    Returns:
        Matched strings with duplicates removed, in first-seen order.
        BUGFIX: the previous ``list(set(...))`` returned the codes in an
        arbitrary order that varied between runs, making the JSON output
        non-reproducible; ``dict.fromkeys`` deduplicates deterministically.
    """
    if patterns is None:
        patterns = ISIL_PATTERNS
    found_codes = []
    for pattern in patterns:
        # Case-insensitive: pages may render codes in lowercase.
        found_codes.extend(re.findall(pattern, text, re.IGNORECASE))
    return list(dict.fromkeys(found_codes))
def check_cobiss_library_page(page, acronym, timeout=10000):
    """Check COBISS library information page for ISIL codes.

    Tries several known COBISS URL layouts for *acronym* and scans each
    non-404 page for ISIL patterns. Returns {"source": url, "codes": [...]}
    on the first hit, else None. Navigation timeouts are skipped silently;
    other per-URL errors are logged and skipped.
    """
    try:
        slug = acronym.lower()
        # Candidate COBISS URL layouts for this library's acronym.
        candidate_urls = (
            f"https://bh.cobiss.net/en/libraries/{slug}",
            f"https://plus.cobiss.net/cobiss/bh/en/library/{slug}",
            f"https://bh.cobiss.net/biblioteke/{slug}",
        )

        for url in candidate_urls:
            try:
                page.goto(url, timeout=timeout, wait_until='domcontentloaded')
                if page.title() == "ERROR - 404 - Page not found":
                    continue
                codes = search_for_isil(page.content())
                if codes:
                    return {"source": url, "codes": codes}
            except PlaywrightTimeout:
                continue
            except Exception as e:
                log_message(f"Error checking {url}: {e}")
                continue

        return None
    except Exception as e:
        log_message(f"Error in check_cobiss_library_page for {acronym}: {e}")
        return None
def check_institution_website(page, homepage, timeout=15000):
    """Check an institution's website for ISIL codes.

    Scans the main page, then follows up to three "about"/"contact"-style
    links (English and Bosnian slugs) and scans those too.

    Args:
        page: Playwright page used for navigation.
        homepage: URL from the COBISS record; may lack a scheme.
        timeout: Navigation timeout in milliseconds.

    Returns:
        {"source": url, "codes": [...]} on the first hit, else None.
    """
    # COBISS records use this placeholder text when no homepage is listed.
    if not homepage or homepage == "domača stranica biblioteke":
        return None

    try:
        # Normalize URL: records sometimes list bare hostnames.
        if not homepage.startswith('http'):
            homepage = f"https://{homepage}"

        page.goto(homepage, timeout=timeout, wait_until='domcontentloaded')

        # Check main page
        content = page.content()
        isil_codes = search_for_isil(content)
        if isil_codes:
            return {"source": homepage, "codes": isil_codes}

        # Check "About" or "Contact" pages
        about_links = page.locator('a[href*="about"], a[href*="o-nama"], a[href*="kontakt"]').all()
        for link in about_links[:3]:  # Check first 3 matching links
            try:
                link.click(timeout=5000)
                time.sleep(2)  # crude wait for the new page to render
                content = page.content()
                isil_codes = search_for_isil(content)
                if isil_codes:
                    return {"source": page.url, "codes": isil_codes}
            # BUGFIX: was a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit and made the scraper impossible
            # to Ctrl-C out of mid-crawl.
            except Exception:
                continue

        return None
    except Exception as e:
        log_message(f"Error checking website {homepage}: {e}")
        return None
def scrape_all_libraries():
    """Main scraping function.

    Loads the library list from INPUT_FILE, checks each library's COBISS
    pages and institutional website for ISIL codes, and writes results to
    OUTPUT_FILE — with an intermediate save every 10 libraries so a crash
    does not lose the whole run.
    """
    log_message("Starting Bosnia ISIL scraper...")

    # Load libraries
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        libraries = json.load(f)

    # BUGFIX: progress counters were hard-coded to "/80"; use the actual
    # list length so the logs stay correct if the input file changes.
    total = len(libraries)
    log_message(f"Loaded {total} libraries")

    results = []

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        for i, library in enumerate(libraries, 1):
            log_message(f"[{i}/{total}] Checking: {library['name']} ({library['acronym']})")

            result = {
                "number": library["number"],
                "name": library["name"],
                "city": library["city"],
                "acronym": library["acronym"],
                "homepage": library["homepage"],
                "isil_found": False,
                "isil_codes": [],
                "sources_checked": [],
                "notes": []
            }

            # Check COBISS pages
            cobiss_result = check_cobiss_library_page(page, library["acronym"])
            result["sources_checked"].append("COBISS library pages")
            if cobiss_result:
                result["isil_found"] = True
                result["isil_codes"].extend(cobiss_result["codes"])
                result["notes"].append(f"Found in COBISS: {cobiss_result['source']}")
                log_message(f"  ✓ Found ISIL codes in COBISS: {cobiss_result['codes']}")

            # Check institution website (skip the COBISS "no homepage" placeholder)
            if library["homepage"] and library["homepage"] != "domača stranica biblioteke":
                website_result = check_institution_website(page, library["homepage"])
                result["sources_checked"].append(f"Website: {library['homepage']}")
                if website_result:
                    result["isil_found"] = True
                    result["isil_codes"].extend(website_result["codes"])
                    result["notes"].append(f"Found on website: {website_result['source']}")
                    log_message(f"  ✓ Found ISIL codes on website: {website_result['codes']}")

            # Remove duplicate codes (COBISS and the website may report the same code)
            result["isil_codes"] = list(set(result["isil_codes"]))

            results.append(result)

            # Small delay between requests — be polite to the servers
            time.sleep(2)

            # Save intermediate results every 10 libraries
            if i % 10 == 0:
                with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
                log_message(f"  → Intermediate save: {i}/{total} libraries processed")

        browser.close()

    # Final save
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    # Summary statistics
    found_count = sum(1 for r in results if r["isil_found"])
    total_codes = sum(len(r["isil_codes"]) for r in results)

    log_message("\n" + "="*50)
    log_message("SCRAPING COMPLETE")
    log_message("="*50)
    log_message(f"Libraries checked: {len(results)}")
    log_message(f"ISIL codes found: {found_count}/{len(results)} libraries")
    log_message(f"Total unique codes: {total_codes}")
    log_message(f"Results saved to: {OUTPUT_FILE}")
    log_message("="*50)
# Entry point: run the full scrape when executed as a script.
if __name__ == "__main__":
    scrape_all_libraries()