glam/scripts/scrapers/harvest_nrw_archives_fast.py
2025-11-19 23:25:22 +01:00

283 lines
9.5 KiB
Python
Executable file
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
NRW Archives Fast Harvester (No Clicking Strategy)
Extracts ALL 523+ archives from archive.nrw.de by parsing the rendered page
This version extracts archive names from the list WITHOUT clicking each one,
which is much faster. ISIL codes will be enriched later via detail page scraping.
Portal: https://www.archive.nrw.de/archivsuche
Strategy: Parse rendered HTML after JavaScript execution
Speed: ~10 seconds (vs 10+ minutes for clicking approach)
Author: OpenCode + AI Agent
Date: 2025-11-19
Version: 3.0 (Fast Harvest - Name Only)
"""
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
import re
# Configuration
BASE_URL = "https://www.archive.nrw.de"
SEARCH_URL = f"{BASE_URL}/archivsuche"
# NOTE(review): hard-coded absolute user path — will only work on this one
# machine; consider an env var or CLI argument. TODO confirm intended.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
# Side effect at import time: the output directory is created immediately.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Map German archive types to GLAM taxonomy
# Insertion order matters: infer_institution_type() returns the FIRST key
# found as a substring, so more specific names must precede generic ones.
ARCHIVE_TYPE_MAPPING = {
    "Landesarchiv": "OFFICIAL_INSTITUTION",
    "Stadtarchiv": "ARCHIVE",
    "Gemeindearchiv": "ARCHIVE",
    "Kreisarchiv": "ARCHIVE",
    "Stiftsarchiv": "ARCHIVE",
    "Kommunalarchiv": "ARCHIVE",
    "Stadt- und": "ARCHIVE",
    "Institut für": "RESEARCH_CENTER",
    "Archiv des": "ARCHIVE",
    "Archiv der": "ARCHIVE",
    "Historisches": "RESEARCH_CENTER",
    "Universitätsarchiv": "EDUCATION_PROVIDER",
    "Hochschularchiv": "EDUCATION_PROVIDER",
    "Bistumsarchiv": "HOLY_SITES",
    "Erzbistumsarchiv": "HOLY_SITES",
    "Diözesanarchiv": "HOLY_SITES",
    "Landeskirchliches": "HOLY_SITES",
    "Kirchenkreises": "HOLY_SITES",
    "Unternehmensarchiv": "CORPORATION",
    "Konzernarchiv": "CORPORATION",
    "Wirtschaftsarchiv": "CORPORATION",
}


def infer_institution_type(name: str) -> str:
    """Map a German archive name onto the GLAM institution taxonomy.

    The first ARCHIVE_TYPE_MAPPING keyword that occurs as a substring of
    *name* decides the type; names matching no keyword fall back to
    "ARCHIVE".
    """
    return next(
        (inst_type
         for keyword, inst_type in ARCHIVE_TYPE_MAPPING.items()
         if keyword in name),
        "ARCHIVE",
    )
# Patterns are tried in order and the first match wins, so keep more
# specific prefixes ahead of generic ones. Compiled once at import time
# instead of being rebuilt/recompiled on every call (loop-invariant hoist).
_CITY_PATTERNS = [re.compile(p) for p in (
    r'Stadtarchiv\s+(.+)',
    r'Gemeindearchiv\s+(.+)',
    r'Kreisarchiv\s+(.+)',
    r'Kommunalarchiv\s+(.+)',
    r'Stadt-\s+und\s+\w+\s+(.+)',
    r'Archiv\s+der\s+(?:Stadt|Kreis-\s+und\s+Hochschulstadt)\s+(.+)',
    r'Institut\s+für\s+Stadtgeschichte[/\s]+(.+)',
    r'Historisches\s+Zentrum\s+(.+)',
    r'Stiftsarchiv\s+(.+)',
)]
# Trailing parenthetical qualifier, e.g. "Münster (Westf.)" -> "Münster".
_TRAILING_PAREN = re.compile(r'\s+\(.*\)$')


def extract_city_from_name(name: str) -> Optional[str]:
    """Extract the city from a German archive name.

    Args:
        name: Full institution name, e.g. "Stadtarchiv Köln".

    Returns:
        The city with any trailing parenthetical qualifier (e.g.
        "(Westf.)", "(Ruhr)") removed, or None if no known naming
        pattern matches.
    """
    for pattern in _CITY_PATTERNS:
        match = pattern.search(name)
        if match:
            city = match.group(1).strip()
            return _TRAILING_PAREN.sub('', city)
    return None
def harvest_archives_fast() -> List[Dict]:
    """Harvest NRW archive metadata by parsing the rendered search page.

    Strategy (no per-archive clicking, hence "fast"):
      1. Load the search page and switch to the "Navigierende Suche" tab,
         which renders the full archive list at once.
      2. Read the text of every button element in a single pass.
      3. Keep only top-level archive institutions (drop sub-collections).
      4. Build one metadata record per archive.

    ISIL codes are NOT collected here; they require visiting each detail
    page and are enriched in a separate second pass.

    Returns:
        List of record dicts (possibly empty if the tab is missing or an
        error occurs); each record carries name/city/country/region/
        institution_type/isil_code/url/source/harvest_date/notes keys.
    """
    archives: List[Dict] = []
    with sync_playwright() as p:
        print("Launching browser...")
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        try:
            print(f"Navigating to {SEARCH_URL}...")
            page.goto(SEARCH_URL, wait_until="networkidle", timeout=30000)
            # Dismiss the cookie banner if present; its absence is fine.
            try:
                page.get_by_role("button", name="Okay, ich bin einverstanden").click(timeout=5000)
                print(" ✓ Accepted cookies")
            except PlaywrightTimeout:
                pass
            # The "Navigierende Suche" tab lists every archive at once.
            try:
                page.get_by_text("Navigierende Suche").click(timeout=5000)
                time.sleep(3)  # Give the tab time to render all ~523 entries.
                print(" ✓ Switched to Navigierende Suche (all archives)\n")
            except PlaywrightTimeout:
                print(" ⚠ Could not find Navigierende Suche tab")
                return archives
            print("🔍 Extracting archive names from page...")
            print("=" * 70)
            # Grab every button element in one pass — much faster than
            # clicking each archive to open its detail view.
            archive_buttons = page.get_by_role("button").all()
            print(f"Found {len(archive_buttons)} buttons on page")
            # Extract text from all buttons.
            archive_names = []
            for button in archive_buttons:
                try:
                    text = button.text_content()
                    if text:
                        archive_names.append(text.strip())
                # Narrowed from a bare `except:` that also swallowed
                # KeyboardInterrupt/SystemExit. A stale/detached element
                # is not fatal; just skip it.
                except Exception:
                    continue
            print(f"Extracted {len(archive_names)} button texts")
            print()
            # Filter to top-level archive institutions:
            # must contain archive keywords AND not be a sub-collection.
            print("Filtering to top-level archive institutions...")
            for name in archive_names:
                # Must look like an archive-related institution.
                if not any(keyword in name.lower() for keyword in ['archiv', 'institut', 'zentrum', 'stiftung', 'bibliothek']):
                    continue
                # Skip sub-collections (marked with '*' or ' / ').
                if name.startswith('*') or ' / ' in name:
                    continue
                # Numeric prefixes also indicate sub-collections.
                if re.match(r'^[0-9]+', name):
                    continue
                # Extract metadata for the surviving top-level archive.
                city = extract_city_from_name(name)
                inst_type = infer_institution_type(name)
                record = {
                    "name": name,
                    "city": city,
                    "country": "DE",
                    "region": "Nordrhein-Westfalen",
                    "institution_type": inst_type,
                    "isil_code": None,  # To be enriched in second pass
                    "url": SEARCH_URL,
                    "source": "archive.nrw.de",
                    "harvest_date": datetime.now(timezone.utc).isoformat(),
                    "notes": "Fast harvest - ISIL codes require detail page scraping"
                }
                archives.append(record)
                city_display = f"({city})" if city else "(no city)"
                print(f"{name} {city_display}")
        except Exception as e:
            # Best-effort harvester: report the failure and return what we have.
            print(f"❌ Error during harvest: {e}")
            import traceback
            traceback.print_exc()
        finally:
            browser.close()
    return archives
def deduplicate_archives(archives: List[Dict]) -> List[Dict]:
    """Drop records whose normalized name was already seen.

    Names are compared case-insensitively with surrounding whitespace
    ignored; order is preserved and the first occurrence of each name wins.
    """
    seen_names = set()
    unique: List[Dict] = []
    for record in archives:
        normalized = record['name'].strip().lower()
        if normalized in seen_names:
            continue
        seen_names.add(normalized)
        unique.append(record)
    return unique
def _print_statistics(archives: List[Dict]) -> None:
    """Print city coverage and institution-type counts for *archives*.

    Requires a non-empty list (the caller guards this); the percentage
    line divides by len(archives).
    """
    cities = set(a['city'] for a in archives if a['city'])
    type_counts: Dict[str, int] = {}
    for archive in archives:
        inst_type = archive['institution_type']
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
    # Computed once instead of building the same filtered list twice.
    with_city = len([a for a in archives if a['city']])
    print("📊 Statistics:")
    print(f" Total archives: {len(archives)}")
    print(f" Cities covered: {len(cities)}")
    print(f" Archives with city data: {with_city} ({with_city/len(archives)*100:.1f}%)")
    print()
    print(" Institution types:")
    for inst_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
        print(f" {inst_type}: {count}")


def _export_json(archives: List[Dict]) -> Path:
    """Write *archives* to a timestamped JSON file and return its path."""
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_DIR / f"nrw_archives_fast_{timestamp}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(archives, f, ensure_ascii=False, indent=2)
    return output_file


def main():
    """Main harvest workflow: harvest, deduplicate, report, export."""
    print("=" * 70)
    print("NRW Archives FAST Harvester")
    print("Extracting ALL 523+ archive names (ISIL codes in second pass)")
    print("=" * 70)
    print()
    start_time = time.time()
    # Harvest archives using the fast (no-clicking) method.
    archives = harvest_archives_fast()
    if not archives:
        print("❌ No archives found. The page structure may have changed.")
        return
    archives = deduplicate_archives(archives)
    print()
    print("=" * 70)
    print(f"✅ Harvested {len(archives)} unique NRW archives")
    print("=" * 70)
    print()
    _print_statistics(archives)
    output_file = _export_json(archives)
    print()
    print(f"📁 Output: {output_file}")
    print(f" File size: {output_file.stat().st_size / 1024:.1f} KB")
    print(f"⏱️ Time: {time.time() - start_time:.1f}s")
    print()
    # Show a handful of sample records for a quick sanity check.
    print("📋 Sample records:")
    for i, archive in enumerate(archives[:5], 1):
        print(f"\n{i}. {archive['name']}")
        print(f" City: {archive['city'] or 'Unknown'}")
        print(f" Type: {archive['institution_type']}")
    print("\n" + "=" * 70)
    print(" NOTE: ISIL codes not included in fast harvest.")
    print(" Run detail page scraper to enrich with ISIL codes:")
    print(" python scripts/scrapers/enrich_nrw_with_isil.py")
    print("=" * 70)
# Script entry point: run the harvest only when executed directly.
if __name__ == "__main__":
    main()