#!/usr/bin/env python3
|
|
"""
|
|
Archivportal-D Archive Harvester
|
|
|
|
This script harvests archive listings from Archivportal-D, the national German
|
|
archive portal operated by Deutsche Digitale Bibliothek.
|
|
|
|
Portal: https://www.archivportal-d.de/
|
|
Coverage: All archives across Germany (state, municipal, church, business, etc.)
|
|
Method: Web scraping (fallback if API unavailable)
|
|
|
|
Author: OpenCode + MCP Tools
|
|
Date: 2025-11-19
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
import re
|
|
from pathlib import Path
|
|
from typing import List, Dict, Optional
|
|
from datetime import datetime
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin, quote
|
|
|
|
# Configuration
ARCHIVPORTAL_BASE_URL = "https://www.archivportal-d.de"
# Structure/browse endpoint used to enumerate archives.
ARCHIVE_LIST_URL = f"{ARCHIVPORTAL_BASE_URL}/struktur"
# NOTE(review): machine-specific absolute path — adjust before running on another host.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
REQUEST_DELAY = 1.5  # Seconds between requests (be respectful)
MAX_RETRIES = 3  # Attempts per URL before giving up
# NOTE(review): placeholder repo URL and contact address — replace with real contact info.
USER_AGENT = "GlamDataHarvester/1.0 (https://github.com/yourusername/glam; contact@email.com)"

# Create output directory (import-time side effect: runs as soon as the module loads)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def fetch_page(url: str, params: Optional[Dict] = None) -> Optional[BeautifulSoup]:
    """
    Fetch a page and parse it into a BeautifulSoup tree.

    Retries up to MAX_RETRIES times with a linearly growing back-off.

    Args:
        url: URL to fetch
        params: Optional query parameters

    Returns:
        BeautifulSoup object, or None when every attempt failed
    """
    request_headers = {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'de,en;q=0.9'
    }

    attempt = 0
    while attempt < MAX_RETRIES:
        print(f"Fetching: {url}", end=' ')
        if params:
            print(f"(params: {params})", end=' ')

        try:
            response = requests.get(url, params=params, headers=request_headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
            if attempt >= MAX_RETRIES - 1:
                return None
            # Back off longer after each consecutive failure.
            time.sleep(REQUEST_DELAY * (attempt + 1))
            attempt += 1
        else:
            print("OK")
            return BeautifulSoup(response.content, 'html.parser')

    return None
|
|
|
|
|
|
def extract_archive_from_listing(item_elem) -> Optional[Dict]:
    """
    Extract archive information from a listing item element.

    Args:
        item_elem: BeautifulSoup element for one archive listing item

    Returns:
        Dictionary with archive info, or None when no name could be found
        or parsing failed
    """
    try:
        record: Dict = dict.fromkeys(
            ['name', 'location', 'federal_state', 'archive_type',
             'archive_id', 'profile_url', 'description', 'isil'],
            '')

        # Archive name from the listing heading.
        heading = item_elem.find(['h2', 'h3', 'h4'], class_=['title', 'heading', 'name'])
        if heading:
            record['name'] = heading.get_text(strip=True)

        # Profile link; the trailing path segment doubles as the archive ID.
        link = item_elem.find('a', href=True)
        if link:
            href = link['href']
            record['profile_url'] = urljoin(ARCHIVPORTAL_BASE_URL, href)
            id_match = re.search(r'/struktur/([A-Za-z0-9_-]+)', href)
            if id_match:
                record['archive_id'] = id_match.group(1)

        # Simple text fields, each tried against a few candidate CSS classes.
        field_classes = (
            ('location', ['location', 'place', 'city']),
            ('federal_state', ['state', 'federal-state', 'bundesland']),
            ('archive_type', ['type', 'sector', 'category']),
            ('description', ['description', 'abstract', 'summary']),
        )
        for field, css_classes in field_classes:
            elem = item_elem.find(class_=css_classes)
            if elem:
                record[field] = elem.get_text(strip=True)

        # ISIL codes for German institutions look like "DE-<alnum>".
        isil_match = re.search(r'\b(DE-[A-Za-z0-9]+)\b', item_elem.get_text())
        if isil_match:
            record['isil'] = isil_match.group(1)

        # A listing without a name is useless downstream.
        return record if record['name'] else None

    except Exception as e:
        # Best-effort parsing: report and skip malformed items.
        print(f"Error parsing archive item: {e}")
        return None
|
|
|
|
|
|
def fetch_archive_profile(profile_url: str) -> Dict:
    """
    Fetch detailed information from an archive's profile page.

    Args:
        profile_url: URL to archive profile

    Returns:
        Dictionary with enriched archive info (contact, finding_aids,
        digital_copies, collections, coordinates); empty dict when the
        page could not be fetched
    """
    soup = fetch_page(profile_url)
    if not soup:
        return {}

    enriched = {
        'contact': {},
        'finding_aids': None,
        'digital_copies': None,
        'collections': [],
        'coordinates': {}
    }

    try:
        # Extract contact information from mailto:/tel:/http(s) links.
        contact_section = soup.find(['section', 'div'], class_=['contact', 'kontakt'])
        if contact_section:
            email_elem = contact_section.find('a', href=re.compile(r'^mailto:'))
            if email_elem:
                enriched['contact']['email'] = email_elem['href'].replace('mailto:', '')

            phone_elem = contact_section.find('a', href=re.compile(r'^tel:'))
            if phone_elem:
                enriched['contact']['phone'] = phone_elem['href'].replace('tel:', '')

            website_elem = contact_section.find('a', href=re.compile(r'^https?://'))
            if website_elem:
                enriched['contact']['website'] = website_elem['href']

        # Extract finding aids count.
        # FIX: `text=` is a deprecated BeautifulSoup keyword; `string=` is
        # the supported spelling with identical matching semantics.
        finding_aids_elem = soup.find(string=re.compile(r'Findbücher|Finding aids'))
        if finding_aids_elem:
            match = re.search(r'(\d+)', finding_aids_elem)
            if match:
                enriched['finding_aids'] = int(match.group(1))

        # Extract digital copies count (same deprecated-keyword fix).
        digital_elem = soup.find(string=re.compile(r'digitalisierte|digital'))
        if digital_elem:
            match = re.search(r'(\d+)', digital_elem)
            if match:
                enriched['digital_copies'] = int(match.group(1))

        # Extract coordinates embedded in the map widget's markup, if present.
        map_elem = soup.find(['div', 'section'], class_=['map', 'karte'])
        if map_elem:
            markup = str(map_elem)
            lat_match = re.search(r'latitude["\s:]+([0-9.]+)', markup)
            lon_match = re.search(r'longitude["\s:]+([0-9.]+)', markup)
            if lat_match and lon_match:
                # Kept as strings to preserve the existing output schema.
                enriched['coordinates'] = {
                    'latitude': lat_match.group(1),
                    'longitude': lon_match.group(1)
                }

    except Exception as e:
        # Best-effort enrichment: log and return whatever was collected.
        print(f"Error enriching profile: {e}")

    return enriched
|
|
|
|
|
|
def harvest_archive_list(max_pages: Optional[int] = None, enrich_profiles: bool = False) -> List[Dict]:
    """
    Harvest archive listings from Archivportal-D.

    Args:
        max_pages: Maximum pages to fetch (None = all)
        enrich_profiles: Whether to fetch detailed profile pages

    Returns:
        List of archive records
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print("Harvesting Archivportal-D Archive Listings")
    print(f"Portal: {ARCHIVPORTAL_BASE_URL}")
    print(f"{banner}\n")

    collected: List[Dict] = []
    page_index = 0

    # Page 0 is the bare listing URL; later pages are requested via ?page=N.
    while not (max_pages and page_index >= max_pages):
        query = {'page': page_index} if page_index > 0 else None
        soup = fetch_page(ARCHIVE_LIST_URL, query)

        if not soup:
            print(f"Warning: Failed to fetch page {page_index}. Stopping.")
            break

        # Primary selector — may need adjustment if the portal markup changes.
        listings = soup.find_all(['article', 'div', 'li'], class_=['archive', 'item', 'result'])
        if not listings:
            # Fall back to bare links into the /struktur hierarchy.
            listings = soup.find_all('a', href=re.compile(r'/struktur/[A-Za-z0-9_-]+'))
        if not listings:
            print(f"No archives found on page {page_index}. Stopping.")
            break

        print(f"\nPage {page_index}: Found {len(listings)} listings")

        for entry in listings:
            parsed = extract_archive_from_listing(entry)
            if not parsed:
                continue
            # Optionally pull contact/coordinate details from the profile page.
            if enrich_profiles and parsed['profile_url']:
                print(f" Enriching: {parsed['name'][:50]}...", end=' ')
                parsed.update(fetch_archive_profile(parsed['profile_url']))
                print("OK")
                time.sleep(REQUEST_DELAY)
            collected.append(parsed)

        print(f"Progress: {len(collected)} archives collected")

        # Follow pagination only while a usable "next" link exists.
        pager = soup.find('a', class_=['next', 'pagination-next'])
        if not pager or not pager.get('href'):
            print("No more pages. Stopping.")
            break

        page_index += 1
        time.sleep(REQUEST_DELAY)

    print(f"\n{banner}")
    print(f"Harvest complete: {len(collected)} archives")
    print(f"{banner}\n")

    return collected
|
|
|
|
|
|
def harvest_by_federal_state() -> Dict[str, List[Dict]]:
    """
    Harvest archives grouped by federal state (Bundesland).

    Returns:
        Dictionary mapping each federal state name to its list of archives
    """
    # All 16 German federal states, used as filter values.
    federal_states = [
        "Baden-Württemberg",
        "Bayern",
        "Berlin",
        "Brandenburg",
        "Bremen",
        "Hamburg",
        "Hessen",
        "Mecklenburg-Vorpommern",
        "Niedersachsen",
        "Nordrhein-Westfalen",
        "Rheinland-Pfalz",
        "Saarland",
        "Sachsen",
        "Sachsen-Anhalt",
        "Schleswig-Holstein",
        "Thüringen"
    ]

    divider = '=' * 70
    print(f"\n{divider}")
    print("Harvesting Archives by Federal State")
    print(f"{divider}\n")

    archives_by_state: Dict[str, List[Dict]] = {}

    for state in federal_states:
        print(f"\nFetching archives for: {state}")

        # NOTE: filter parameter name may need adjusting to the real URL scheme.
        soup = fetch_page(ARCHIVE_LIST_URL, {'federalState': state})
        if not soup:
            print(f" Failed to fetch {state}. Skipping.")
            continue

        # Parse every listing item, dropping ones that could not be extracted.
        state_archives = [
            record
            for record in (
                extract_archive_from_listing(item)
                for item in soup.find_all(['article', 'div'], class_=['archive', 'item'])
            )
            if record
        ]
        # The filtered request fixes the state, so stamp it onto each record.
        for record in state_archives:
            record['federal_state'] = state

        archives_by_state[state] = state_archives
        print(f" Found {len(state_archives)} archives")

        time.sleep(REQUEST_DELAY)

    return archives_by_state
|
|
|
|
|
|
def save_archives(archives: List[Dict], filename_suffix: str = ""):
    """Save archives plus harvest metadata to a timestamped JSON file.

    Args:
        archives: Archive records to persist.
        filename_suffix: Optional suffix inserted into the file name.

    Returns:
        Path of the JSON file that was written.
    """
    # Local import: the module header imports only `datetime` from this module.
    from datetime import timezone

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = OUTPUT_DIR / f"archivportal_d_archives{filename_suffix}_{timestamp}.json"

    # FIX: datetime.utcnow() is deprecated (naive timestamp; removed-path in
    # Python 3.12+). Use an aware UTC timestamp and keep the trailing-'Z'
    # format the previous output used.
    harvest_date = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    output = {
        'metadata': {
            'source': 'Archivportal-D',
            'source_url': ARCHIVPORTAL_BASE_URL,
            'operator': 'Deutsche Digitale Bibliothek',
            'harvest_date': harvest_date,
            'total_archives': len(archives),
            'method': 'Web scraping',
            'license': 'CC0 1.0 Universal (Public Domain)',
            'coverage': 'All German archives (state, municipal, church, business, etc.)'
        },
        'archives': archives
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"✓ Saved to: {output_file}")
    print(f" File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB\n")

    return output_file
|
|
|
|
|
|
def generate_statistics(archives: List[Dict]):
    """Compute and print summary statistics for harvested archives.

    Args:
        archives: Archive records as produced by the harvest functions.

    Returns:
        Statistics dict with total, per-state/per-type counts, and
        field-completeness counters.
    """
    stats = {
        'total': len(archives),
        'by_state': {},
        'by_type': {},
        'with_isil': 0,
        'with_profile_url': 0,
        'with_email': 0,
        'with_phone': 0,
        'with_coordinates': 0
    }

    for archive in archives:
        # Count by state
        state = archive.get('federal_state', 'Unknown')
        stats['by_state'][state] = stats['by_state'].get(state, 0) + 1

        # Count by type
        arch_type = archive.get('archive_type', 'Unknown')
        stats['by_type'][arch_type] = stats['by_type'].get(arch_type, 0) + 1

        # Count completeness.
        # FIX: tolerate records where 'contact'/'coordinates' is stored as
        # None (the old .get(key, {}) default only covers a *missing* key).
        if archive.get('isil'):
            stats['with_isil'] += 1
        if archive.get('profile_url'):
            stats['with_profile_url'] += 1
        contact = archive.get('contact') or {}
        if contact.get('email'):
            stats['with_email'] += 1
        if contact.get('phone'):
            stats['with_phone'] += 1
        if (archive.get('coordinates') or {}).get('latitude'):
            stats['with_coordinates'] += 1

    total = stats['total']

    def _pct(count: int) -> str:
        # FIX: guard against ZeroDivisionError when no archives were harvested.
        share = (count / total * 100) if total else 0.0
        return f"{count} ({share:.1f}%)"

    sep = '=' * 70
    print(f"\n{sep}")
    print("Statistics:")
    print(f"{sep}")
    print(f"Total archives: {total}")
    print("\nData completeness:")
    print(f" - With ISIL code: {_pct(stats['with_isil'])}")
    print(f" - With profile URL: {_pct(stats['with_profile_url'])}")
    print(f" - With email: {_pct(stats['with_email'])}")
    print(f" - With phone: {_pct(stats['with_phone'])}")
    print(f" - With coordinates: {_pct(stats['with_coordinates'])}")

    print("\nArchives by federal state:")
    for state, count in sorted(stats['by_state'].items(), key=lambda x: x[1], reverse=True):
        print(f" - {state}: {count}")

    print("\nTop 10 archive types:")
    for arch_type, count in sorted(stats['by_type'].items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f" - {arch_type}: {count}")

    print(f"{sep}\n")

    return stats
|
|
|
|
|
|
def main():
    """Entry point: harvest archive listings, save them, print statistics."""
    header = '#' * 70
    print(f"\n{header}")
    print("# Archivportal-D Archive Harvester")
    print("# Deutsche Digitale Bibliothek")
    print(f"{header}\n")

    print("NOTE: This harvester uses web scraping as a fallback method.")
    print("The HTML structure may change over time and require updates to selectors.")
    print("For production use, consider registering for DDB API access.\n")

    # Limit to 10 pages while testing; profile enrichment is off for speed.
    archives = harvest_archive_list(max_pages=10, enrich_profiles=False)

    if not archives:
        print("No archives harvested. Exiting.")
        return

    # Persist the harvested records.
    save_archives(archives)

    # Compute and display statistics, then persist them alongside the data.
    stats = generate_statistics(archives)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    stats_file = OUTPUT_DIR / f"archivportal_d_stats_{timestamp}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"✓ Statistics saved to: {stats_file}\n")

    print("✓ Harvest complete!\n")
    print("Next steps:")
    print(" 1. Review the harvested data for accuracy")
    print(" 2. Adjust HTML selectors if needed")
    print(" 3. Run full harvest (remove max_pages limit)")
    print(" 4. Cross-reference with ISIL dataset\n")
|
|
|
|
|
|
# Run the harvester only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|