#!/usr/bin/env python3
"""
NRW Archives Fast Harvester (No Clicking Strategy)
Extracts ALL 523+ archives from archive.nrw.de by parsing the rendered page

This version extracts archive names from the list WITHOUT clicking each one,
which is much faster. ISIL codes will be enriched later via detail page scraping.

Portal: https://www.archive.nrw.de/archivsuche
Strategy: Parse rendered HTML after JavaScript execution
Speed: ~10 seconds (vs 10+ minutes for clicking approach)

Author: OpenCode + AI Agent
Date: 2025-11-19
Version: 3.0 (Fast Harvest - Name Only)
"""

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
import re

# Configuration
|
||
BASE_URL = "https://www.archive.nrw.de"
|
||
SEARCH_URL = f"{BASE_URL}/archivsuche"
|
||
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
|
||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||
|
||
# Map German archive types to GLAM taxonomy
|
||
ARCHIVE_TYPE_MAPPING = {
|
||
"Landesarchiv": "OFFICIAL_INSTITUTION",
|
||
"Stadtarchiv": "ARCHIVE",
|
||
"Gemeindearchiv": "ARCHIVE",
|
||
"Kreisarchiv": "ARCHIVE",
|
||
"Stiftsarchiv": "ARCHIVE",
|
||
"Kommunalarchiv": "ARCHIVE",
|
||
"Stadt- und": "ARCHIVE",
|
||
"Institut für": "RESEARCH_CENTER",
|
||
"Archiv des": "ARCHIVE",
|
||
"Archiv der": "ARCHIVE",
|
||
"Historisches": "RESEARCH_CENTER",
|
||
"Universitätsarchiv": "EDUCATION_PROVIDER",
|
||
"Hochschularchiv": "EDUCATION_PROVIDER",
|
||
"Bistumsarchiv": "HOLY_SITES",
|
||
"Erzbistumsarchiv": "HOLY_SITES",
|
||
"Diözesanarchiv": "HOLY_SITES",
|
||
"Landeskirchliches": "HOLY_SITES",
|
||
"Kirchenkreises": "HOLY_SITES",
|
||
"Unternehmensarchiv": "CORPORATION",
|
||
"Konzernarchiv": "CORPORATION",
|
||
"Wirtschaftsarchiv": "CORPORATION",
|
||
}
|
||
|
||
|
||
def infer_institution_type(name: str) -> str:
|
||
"""Infer institution type from German archive name."""
|
||
for keyword, inst_type in ARCHIVE_TYPE_MAPPING.items():
|
||
if keyword in name:
|
||
return inst_type
|
||
return "ARCHIVE"
|
||
|
||
|
||
def extract_city_from_name(name: str) -> Optional[str]:
|
||
"""Extract city name from German archive names."""
|
||
patterns = [
|
||
r'Stadtarchiv\s+(.+)',
|
||
r'Gemeindearchiv\s+(.+)',
|
||
r'Kreisarchiv\s+(.+)',
|
||
r'Kommunalarchiv\s+(.+)',
|
||
r'Stadt-\s+und\s+\w+\s+(.+)',
|
||
r'Archiv\s+der\s+(?:Stadt|Kreis-\s+und\s+Hochschulstadt)\s+(.+)',
|
||
r'Institut\s+für\s+Stadtgeschichte[/\s]+(.+)',
|
||
r'Historisches\s+Zentrum\s+(.+)',
|
||
r'Stiftsarchiv\s+(.+)',
|
||
]
|
||
|
||
for pattern in patterns:
|
||
match = re.search(pattern, name)
|
||
if match:
|
||
city = match.group(1).strip()
|
||
city = re.sub(r'\s+\(.*\)$', '', city) # Remove (Westf.), (Ruhr), etc.
|
||
return city
|
||
|
||
return None
|
||
|
||
|
||
def harvest_archives_fast() -> List[Dict]:
|
||
"""
|
||
Fast harvest strategy: Extract archive names from page without clicking.
|
||
|
||
Strategy:
|
||
1. Load page with all archives visible
|
||
2. Extract ALL archive button texts at once
|
||
3. Filter to top-level archives only
|
||
4. Save metadata
|
||
|
||
ISIL codes can be enriched later in a second pass if needed.
|
||
"""
|
||
archives = []
|
||
|
||
with sync_playwright() as p:
|
||
print("Launching browser...")
|
||
browser = p.chromium.launch(headless=True)
|
||
page = browser.new_page()
|
||
|
||
try:
|
||
print(f"Navigating to {SEARCH_URL}...")
|
||
page.goto(SEARCH_URL, wait_until="networkidle", timeout=30000)
|
||
|
||
# Accept cookies if present
|
||
try:
|
||
page.get_by_role("button", name="Okay, ich bin einverstanden").click(timeout=5000)
|
||
print(" ✓ Accepted cookies")
|
||
except PlaywrightTimeout:
|
||
pass
|
||
|
||
# Click on "Navigierende Suche" tab
|
||
try:
|
||
page.get_by_text("Navigierende Suche").click(timeout=5000)
|
||
time.sleep(3) # Wait for all 523 archives to load
|
||
print(" ✓ Switched to Navigierende Suche (all archives)\n")
|
||
except PlaywrightTimeout:
|
||
print(" ⚠ Could not find Navigierende Suche tab")
|
||
return archives
|
||
|
||
print("🔍 Extracting archive names from page...")
|
||
print("=" * 70)
|
||
|
||
# Get ALL button elements at once (much faster than clicking each)
|
||
archive_buttons = page.get_by_role("button").all()
|
||
print(f"Found {len(archive_buttons)} buttons on page")
|
||
|
||
# Extract text from all buttons
|
||
archive_names = []
|
||
for button in archive_buttons:
|
||
try:
|
||
text = button.text_content()
|
||
if text:
|
||
archive_names.append(text.strip())
|
||
except:
|
||
continue
|
||
|
||
print(f"Extracted {len(archive_names)} button texts")
|
||
print()
|
||
|
||
# Filter to top-level archive institutions
|
||
# Criteria: Contains archive keywords AND NOT a sub-collection
|
||
print("Filtering to top-level archive institutions...")
|
||
for name in archive_names:
|
||
# Must contain archive-related keywords
|
||
if not any(keyword in name.lower() for keyword in ['archiv', 'institut', 'zentrum', 'stiftung', 'bibliothek']):
|
||
continue
|
||
|
||
# Skip sub-collections (start with *, numbers, or contain " / ")
|
||
if name.startswith('*') or ' / ' in name:
|
||
continue
|
||
|
||
# Skip numeric prefixes (sub-collection indicators)
|
||
if re.match(r'^[0-9]+', name):
|
||
continue
|
||
|
||
# Extract metadata
|
||
city = extract_city_from_name(name)
|
||
inst_type = infer_institution_type(name)
|
||
|
||
record = {
|
||
"name": name,
|
||
"city": city,
|
||
"country": "DE",
|
||
"region": "Nordrhein-Westfalen",
|
||
"institution_type": inst_type,
|
||
"isil_code": None, # To be enriched in second pass
|
||
"url": SEARCH_URL,
|
||
"source": "archive.nrw.de",
|
||
"harvest_date": datetime.now(timezone.utc).isoformat(),
|
||
"notes": "Fast harvest - ISIL codes require detail page scraping"
|
||
}
|
||
|
||
archives.append(record)
|
||
|
||
city_display = f"({city})" if city else "(no city)"
|
||
print(f" ✓ {name} {city_display}")
|
||
|
||
except Exception as e:
|
||
print(f"❌ Error during harvest: {e}")
|
||
import traceback
|
||
traceback.print_exc()
|
||
|
||
finally:
|
||
browser.close()
|
||
|
||
return archives
|
||
|
||
|
||
def deduplicate_archives(archives: List[Dict]) -> List[Dict]:
|
||
"""Remove duplicate archive entries based on name."""
|
||
seen = set()
|
||
unique = []
|
||
|
||
for archive in archives:
|
||
key = archive['name'].lower().strip()
|
||
if key not in seen:
|
||
seen.add(key)
|
||
unique.append(archive)
|
||
|
||
return unique
|
||
|
||
|
||
def main():
|
||
"""Main harvest workflow."""
|
||
print("=" * 70)
|
||
print("NRW Archives FAST Harvester")
|
||
print("Extracting ALL 523+ archive names (ISIL codes in second pass)")
|
||
print("=" * 70)
|
||
print()
|
||
|
||
start_time = time.time()
|
||
|
||
# Harvest archives using fast method
|
||
archives = harvest_archives_fast()
|
||
|
||
if not archives:
|
||
print("❌ No archives found. The page structure may have changed.")
|
||
return
|
||
|
||
# Deduplicate
|
||
archives = deduplicate_archives(archives)
|
||
|
||
print()
|
||
print("=" * 70)
|
||
print(f"✅ Harvested {len(archives)} unique NRW archives")
|
||
print("=" * 70)
|
||
print()
|
||
|
||
# Statistics
|
||
cities = set(a['city'] for a in archives if a['city'])
|
||
types = {}
|
||
|
||
for archive in archives:
|
||
inst_type = archive['institution_type']
|
||
types[inst_type] = types.get(inst_type, 0) + 1
|
||
|
||
print("📊 Statistics:")
|
||
print(f" Total archives: {len(archives)}")
|
||
print(f" Cities covered: {len(cities)}")
|
||
print(f" Archives with city data: {len([a for a in archives if a['city']])} ({len([a for a in archives if a['city']])/len(archives)*100:.1f}%)")
|
||
print()
|
||
print(" Institution types:")
|
||
for inst_type, count in sorted(types.items(), key=lambda x: x[1], reverse=True):
|
||
print(f" {inst_type}: {count}")
|
||
|
||
# Export to JSON
|
||
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
|
||
output_file = OUTPUT_DIR / f"nrw_archives_fast_{timestamp}.json"
|
||
|
||
with open(output_file, 'w', encoding='utf-8') as f:
|
||
json.dump(archives, f, ensure_ascii=False, indent=2)
|
||
|
||
print()
|
||
print(f"📁 Output: {output_file}")
|
||
print(f" File size: {output_file.stat().st_size / 1024:.1f} KB")
|
||
print(f"⏱️ Time: {time.time() - start_time:.1f}s")
|
||
print()
|
||
|
||
# Show sample records
|
||
print("📋 Sample records:")
|
||
for i, archive in enumerate(archives[:5], 1):
|
||
print(f"\n{i}. {archive['name']}")
|
||
print(f" City: {archive['city'] or 'Unknown'}")
|
||
print(f" Type: {archive['institution_type']}")
|
||
|
||
print("\n" + "=" * 70)
|
||
print("ℹ️ NOTE: ISIL codes not included in fast harvest.")
|
||
print(" Run detail page scraper to enrich with ISIL codes:")
|
||
print(" python scripts/scrapers/enrich_nrw_with_isil.py")
|
||
print("=" * 70)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|