glam/scripts/scrapers/harvest_archivportal_d.py
2025-11-19 23:25:22 +01:00

477 lines
16 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Archivportal-D Archive Harvester
This script harvests archive listings from Archivportal-D, the national German
archive portal operated by Deutsche Digitale Bibliothek.
Portal: https://www.archivportal-d.de/
Coverage: All archives across Germany (state, municipal, church, business, etc.)
Method: Web scraping (fallback if API unavailable)
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup
# Configuration
ARCHIVPORTAL_BASE_URL = "https://www.archivportal-d.de"
# Listing page for the portal's archive hierarchy ("Struktur").
ARCHIVE_LIST_URL = f"{ARCHIVPORTAL_BASE_URL}/struktur"
# NOTE(review): hard-coded absolute path — only valid on the author's machine;
# consider making this configurable.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
REQUEST_DELAY = 1.5 # Seconds between requests (be respectful)
# Maximum fetch attempts per URL before giving up.
MAX_RETRIES = 3
# Identifies this harvester to the portal (scraping etiquette).
USER_AGENT = "GlamDataHarvester/1.0 (https://github.com/yourusername/glam; contact@email.com)"
# Create output directory (import-time side effect)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def fetch_page(url: str, params: Optional[Dict] = None) -> Optional[BeautifulSoup]:
    """
    Retrieve a URL and parse the response into BeautifulSoup.

    Retries up to MAX_RETRIES times, sleeping a linearly growing
    back-off (REQUEST_DELAY * attempt number) between failures.

    Args:
        url: URL to fetch
        params: Optional query parameters

    Returns:
        Parsed BeautifulSoup document, or None once every attempt failed.
    """
    request_headers = {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'de,en;q=0.9'
    }
    attempt = 0
    while attempt < MAX_RETRIES:
        print(f"Fetching: {url}", end=' ')
        if params:
            print(f"(params: {params})", end=' ')
        try:
            response = requests.get(url, params=params, headers=request_headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
            attempt += 1
            if attempt >= MAX_RETRIES:
                return None
            # Back off a little longer after each failed attempt.
            time.sleep(REQUEST_DELAY * attempt)
        else:
            print("OK")
            return BeautifulSoup(response.content, 'html.parser')
    return None
def extract_archive_from_listing(item_elem) -> Optional[Dict]:
    """
    Pull archive metadata out of a single listing element.

    Args:
        item_elem: BeautifulSoup element for one archive listing item

    Returns:
        Dict with archive fields, or None when no name was found or
        parsing raised.
    """
    try:
        record: Dict = dict.fromkeys(
            ['name', 'location', 'federal_state', 'archive_type',
             'archive_id', 'profile_url', 'description', 'isil'], '')
        # Name: first heading carrying one of the expected classes.
        heading = item_elem.find(['h2', 'h3', 'h4'], class_=['title', 'heading', 'name'])
        if heading:
            record['name'] = heading.get_text(strip=True)
        # Profile link, plus the portal-internal ID embedded in its URL.
        anchor = item_elem.find('a', href=True)
        if anchor:
            record['profile_url'] = urljoin(ARCHIVPORTAL_BASE_URL, anchor['href'])
            id_match = re.search(r'/struktur/([A-Za-z0-9_-]+)', anchor['href'])
            if id_match:
                record['archive_id'] = id_match.group(1)
        # The remaining simple text fields share one find/strip pattern.
        field_classes = {
            'location': ['location', 'place', 'city'],
            'federal_state': ['state', 'federal-state', 'bundesland'],
            'archive_type': ['type', 'sector', 'category'],
            'description': ['description', 'abstract', 'summary'],
        }
        for field, classes in field_classes.items():
            elem = item_elem.find(class_=classes)
            if elem:
                record[field] = elem.get_text(strip=True)
        # An ISIL code (DE-...) may appear anywhere in the listing text.
        isil_match = re.search(r'\b(DE-[A-Za-z0-9]+)\b', item_elem.get_text())
        if isil_match:
            record['isil'] = isil_match.group(1)
        if not record['name']:
            return None
        return record
    except Exception as e:
        print(f"Error parsing archive item: {e}")
        return None
def fetch_archive_profile(profile_url: str) -> Dict:
    """
    Fetch detailed information from an archive's profile page.

    Args:
        profile_url: URL to archive profile

    Returns:
        Dictionary with enriched archive info; empty dict when the
        page could not be fetched.
    """
    soup = fetch_page(profile_url)
    if not soup:
        return {}
    enriched = {
        'contact': {},
        'finding_aids': None,
        'digital_copies': None,
        'collections': [],  # reserved; not populated by this scraper yet
        'coordinates': {}
    }
    try:
        # Contact details: mailto:/tel:/http link schemes distinguish
        # email, phone and website inside the contact section.
        contact_section = soup.find(['section', 'div'], class_=['contact', 'kontakt'])
        if contact_section:
            email_elem = contact_section.find('a', href=re.compile(r'^mailto:'))
            if email_elem:
                enriched['contact']['email'] = email_elem['href'].replace('mailto:', '')
            phone_elem = contact_section.find('a', href=re.compile(r'^tel:'))
            if phone_elem:
                enriched['contact']['phone'] = phone_elem['href'].replace('tel:', '')
            website_elem = contact_section.find('a', href=re.compile(r'^https?://'))
            if website_elem:
                enriched['contact']['website'] = website_elem['href']
        # Finding-aids count: first number in the matching text node.
        # Use `string=` — the `text=` keyword is deprecated in bs4 (4.4+).
        finding_aids_elem = soup.find(string=re.compile(r'Findbücher|Finding aids'))
        if finding_aids_elem:
            match = re.search(r'(\d+)', finding_aids_elem)
            if match:
                enriched['finding_aids'] = int(match.group(1))
        # Digitised-holdings count, same pattern.
        digital_elem = soup.find(string=re.compile(r'digitalisierte|digital'))
        if digital_elem:
            match = re.search(r'(\d+)', digital_elem)
            if match:
                enriched['digital_copies'] = int(match.group(1))
        # Coordinates: scraped out of embedded map markup, if present.
        map_elem = soup.find(['div', 'section'], class_=['map', 'karte'])
        if map_elem:
            # Serialize once instead of once per regex.
            markup = str(map_elem)
            lat_match = re.search(r'latitude["\s:]+([0-9.]+)', markup)
            lon_match = re.search(r'longitude["\s:]+([0-9.]+)', markup)
            if lat_match and lon_match:
                enriched['coordinates'] = {
                    'latitude': lat_match.group(1),
                    'longitude': lon_match.group(1)
                }
    except Exception as e:
        # Best-effort enrichment: return whatever was gathered so far.
        print(f"Error enriching profile: {e}")
    return enriched
def harvest_archive_list(max_pages: Optional[int] = None, enrich_profiles: bool = False) -> List[Dict]:
    """
    Harvest archive listings from Archivportal-D.

    Walks the paginated listing until no next-page link exists (or
    max_pages is reached), optionally enriching each record from its
    profile page.

    Args:
        max_pages: Maximum pages to fetch (None = all)
        enrich_profiles: Whether to fetch detailed profile pages

    Returns:
        List of archive records
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print(f"Harvesting Archivportal-D Archive Listings")
    print(f"Portal: {ARCHIVPORTAL_BASE_URL}")
    print(f"{banner}\n")
    collected: List[Dict] = []
    page = 0
    while True:
        if max_pages and page >= max_pages:
            break
        # Page 0 is the bare listing URL; later pages use ?page=N.
        soup = fetch_page(ARCHIVE_LIST_URL, {'page': page} if page > 0 else None)
        if soup is None:
            print(f"Warning: Failed to fetch page {page}. Stopping.")
            break
        # Primary selectors; may need adjustment if the portal's markup changes.
        listings = soup.find_all(['article', 'div', 'li'], class_=['archive', 'item', 'result'])
        if not listings:
            # Fall back to raw /struktur/<id> links.
            listings = soup.find_all('a', href=re.compile(r'/struktur/[A-Za-z0-9_-]+'))
        if not listings:
            print(f"No archives found on page {page}. Stopping.")
            break
        print(f"\nPage {page}: Found {len(listings)} listings")
        for entry in listings:
            record = extract_archive_from_listing(entry)
            if not record:
                continue
            if enrich_profiles and record['profile_url']:
                print(f" Enriching: {record['name'][:50]}...", end=' ')
                record.update(fetch_archive_profile(record['profile_url']))
                print("OK")
                time.sleep(REQUEST_DELAY)
            collected.append(record)
        print(f"Progress: {len(collected)} archives collected")
        # Stop when the pagination offers no usable next link.
        next_link = soup.find('a', class_=['next', 'pagination-next'])
        if next_link is None or not next_link.get('href'):
            print("No more pages. Stopping.")
            break
        page += 1
        time.sleep(REQUEST_DELAY)
    print(f"\n{banner}")
    print(f"Harvest complete: {len(collected)} archives")
    print(f"{banner}\n")
    return collected
def harvest_by_federal_state() -> Dict[str, List[Dict]]:
    """
    Harvest archives grouped by federal state.

    Returns:
        Dictionary mapping federal state name to its list of archives
        (states whose listing page failed to load are omitted).
    """
    federal_states = (
        "Baden-Württemberg", "Bayern", "Berlin", "Brandenburg",
        "Bremen", "Hamburg", "Hessen", "Mecklenburg-Vorpommern",
        "Niedersachsen", "Nordrhein-Westfalen", "Rheinland-Pfalz",
        "Saarland", "Sachsen", "Sachsen-Anhalt",
        "Schleswig-Holstein", "Thüringen",
    )
    divider = '=' * 70
    print(f"\n{divider}")
    print(f"Harvesting Archives by Federal State")
    print(f"{divider}\n")
    archives_by_state: Dict[str, List[Dict]] = {}
    for state in federal_states:
        print(f"\nFetching archives for: {state}")
        # Filter parameter; may need adjustment for the live URL scheme.
        soup = fetch_page(ARCHIVE_LIST_URL, {'federalState': state})
        if soup is None:
            print(f" Failed to fetch {state}. Skipping.")
            continue
        entries = soup.find_all(['article', 'div'], class_=['archive', 'item'])
        parsed = (extract_archive_from_listing(entry) for entry in entries)
        state_archives = []
        for record in parsed:
            if record:
                # The filter query fixes the state; stamp it onto the record.
                record['federal_state'] = state
                state_archives.append(record)
        archives_by_state[state] = state_archives
        print(f" Found {len(state_archives)} archives")
        time.sleep(REQUEST_DELAY)
    return archives_by_state
def save_archives(archives: List[Dict], filename_suffix: str = ""):
    """
    Save harvested archives to a timestamped JSON file in OUTPUT_DIR.

    Args:
        archives: List of archive records to persist.
        filename_suffix: Optional tag inserted into the file name.

    Returns:
        Path of the written JSON file.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = OUTPUT_DIR / f"archivportal_d_archives{filename_suffix}_{timestamp}.json"
    # datetime.utcnow() is deprecated (Python 3.12+) and naive; use an
    # aware UTC timestamp, keeping the old output's trailing-'Z' format.
    harvest_date = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    output = {
        'metadata': {
            'source': 'Archivportal-D',
            'source_url': ARCHIVPORTAL_BASE_URL,
            'operator': 'Deutsche Digitale Bibliothek',
            'harvest_date': harvest_date,
            'total_archives': len(archives),
            'method': 'Web scraping',
            'license': 'CC0 1.0 Universal (Public Domain)',
            'coverage': 'All German archives (state, municipal, church, business, etc.)'
        },
        'archives': archives
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved to: {output_file}")
    print(f" File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB\n")
    return output_file
def generate_statistics(archives: List[Dict]):
    """
    Compute and print summary statistics for harvested archives.

    Counts archives per federal state and per type, and tallies data
    completeness (ISIL, profile URL, email, phone, coordinates).
    Handles an empty input list without raising ZeroDivisionError
    (the original divided by stats['total'] unconditionally).

    Args:
        archives: List of archive records.

    Returns:
        The statistics dictionary.
    """
    stats = {
        'total': len(archives),
        'by_state': {},
        'by_type': {},
        'with_isil': 0,
        'with_profile_url': 0,
        'with_email': 0,
        'with_phone': 0,
        'with_coordinates': 0
    }
    for archive in archives:
        # Count by state
        state = archive.get('federal_state', 'Unknown')
        stats['by_state'][state] = stats['by_state'].get(state, 0) + 1
        # Count by type
        arch_type = archive.get('archive_type', 'Unknown')
        stats['by_type'][arch_type] = stats['by_type'].get(arch_type, 0) + 1
        # Count completeness (nested dicts may be missing entirely)
        if archive.get('isil'):
            stats['with_isil'] += 1
        if archive.get('profile_url'):
            stats['with_profile_url'] += 1
        if archive.get('contact', {}).get('email'):
            stats['with_email'] += 1
        if archive.get('contact', {}).get('phone'):
            stats['with_phone'] += 1
        if archive.get('coordinates', {}).get('latitude'):
            stats['with_coordinates'] += 1

    def _pct(count: int) -> str:
        # Guard against an empty harvest: report 0.0% instead of crashing.
        if stats['total'] == 0:
            return f"{count} (0.0%)"
        return f"{count} ({count / stats['total'] * 100:.1f}%)"

    print(f"\n{'='*70}")
    print("Statistics:")
    print(f"{'='*70}")
    print(f"Total archives: {stats['total']}")
    print(f"\nData completeness:")
    print(f" - With ISIL code: {_pct(stats['with_isil'])}")
    print(f" - With profile URL: {_pct(stats['with_profile_url'])}")
    print(f" - With email: {_pct(stats['with_email'])}")
    print(f" - With phone: {_pct(stats['with_phone'])}")
    print(f" - With coordinates: {_pct(stats['with_coordinates'])}")
    print(f"\nArchives by federal state:")
    for state, count in sorted(stats['by_state'].items(), key=lambda x: x[1], reverse=True):
        print(f" - {state}: {count}")
    print(f"\nTop 10 archive types:")
    for arch_type, count in sorted(stats['by_type'].items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f" - {arch_type}: {count}")
    print(f"{'='*70}\n")
    return stats
def main():
    """Entry point: run a limited test harvest, then save data and stats."""
    header = '#' * 70
    print(f"\n{header}")
    print(f"# Archivportal-D Archive Harvester")
    print(f"# Deutsche Digitale Bibliothek")
    print(f"{header}\n")
    print("NOTE: This harvester uses web scraping as a fallback method.")
    print("The HTML structure may change over time and require updates to selectors.")
    print("For production use, consider registering for DDB API access.\n")
    # Limited to 10 pages while testing; drop max_pages for a full run.
    archives = harvest_archive_list(max_pages=10, enrich_profiles=False)
    if not archives:
        print("No archives harvested. Exiting.")
        return
    save_archives(archives)
    stats = generate_statistics(archives)
    # Persist the statistics alongside the data dump.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    stats_file = OUTPUT_DIR / f"archivportal_d_stats_{timestamp}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"✓ Statistics saved to: {stats_file}\n")
    print("✓ Harvest complete!\n")
    print("Next steps:")
    print(" 1. Review the harvested data for accuracy")
    print(" 2. Adjust HTML selectors if needed")
    print(" 3. Run full harvest (remove max_pages limit)")
    print(" 4. Cross-reference with ISIL dataset\n")


if __name__ == "__main__":
    main()