# glam/scripts/scrapers/harvest_thueringen_archives_comprehensive.py
#
# Commit kempersc 38354539a6 — feat: Add comprehensive harvester for Thüringen archives
# - Implemented a new script to extract full metadata from 149 archive detail
#   pages on archive-in-thueringen.de.
# - Extracted data includes addresses, emails, phones, directors, collection
#   sizes, opening hours, histories, and more.
# - Introduced structured data parsing and error handling for robust data extraction.
# - Added rate limiting to respect server load and improve scraping efficiency.
# - Results are saved in a JSON format with detailed metadata about the
#   extraction process.
# 2025-11-20 00:25:45 +01:00
#!/usr/bin/env python3
"""
Thüringen Archives Comprehensive Harvester
Extracts FULL metadata from 149 archive detail pages on archive-in-thueringen.de
Portal: https://www.archive-in-thueringen.de/de/archiv/list
Detail pages: https://www.archive-in-thueringen.de/de/archiv/view/id/{ID}
Strategy:
1. Get list of 149 archive URLs from list page
2. Visit each detail page and extract ALL available metadata
3. Extract: addresses, emails, phones, directors, collection sizes, opening hours, histories
Author: OpenCode + AI Agent
Date: 2025-11-20
Version: 2.0 (Comprehensive)
"""
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
import re
# --- Configuration ---------------------------------------------------------
BASE_URL = "https://www.archive-in-thueringen.de"
ARCHIVE_LIST_URL = f"{BASE_URL}/de/archiv/list"
# NOTE(review): hardcoded per-user absolute path — assumes this exact machine
# layout; consider making it configurable (env var / CLI arg). TODO confirm.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # side effect at import time
# Rate limiting
REQUEST_DELAY = 1.0 # seconds between requests (be respectful)
# Keyword -> GLAM taxonomy category. Matching is substring-based and uses
# insertion order: the first keyword contained in an archive name wins, so
# more specific keywords (e.g. "Universitätsarchiv") precede generic ones.
ARCHIVE_TYPE_MAPPING = {
    "Landesarchiv": "OFFICIAL_INSTITUTION",
    "Staatsarchiv": "OFFICIAL_INSTITUTION",
    "Hauptstaatsarchiv": "OFFICIAL_INSTITUTION",
    "Stadtarchiv": "ARCHIVE",
    "Gemeindearchiv": "ARCHIVE",
    "Kreisarchiv": "ARCHIVE",
    "Stadt- und Kreisarchiv": "ARCHIVE",
    "Bistumsarchiv": "HOLY_SITES",
    "Kirchenkreisarchiv": "HOLY_SITES",
    "Landeskirchenarchiv": "HOLY_SITES",
    "Archiv des Ev.": "HOLY_SITES",
    "Archiv des Bischöflichen": "HOLY_SITES",
    "Pfarrhausarchiv": "HOLY_SITES",
    "Universitätsarchiv": "EDUCATION_PROVIDER",
    "Hochschularchiv": "EDUCATION_PROVIDER",
    "Hochschule": "EDUCATION_PROVIDER",
    "Universität": "EDUCATION_PROVIDER",
    "Fachhochschule": "EDUCATION_PROVIDER",
    "Fachschule": "EDUCATION_PROVIDER",
    "Carl Zeiss": "CORPORATION",
    "SCHOTT": "CORPORATION",
    "Wirtschaftsarchiv": "CORPORATION",
    "Handwerkskammer": "CORPORATION",
    "Handelskammer": "CORPORATION",
    "Industrie- und Handelskammer": "CORPORATION",
    "Lederfabrik": "CORPORATION",
    "Verlagsgesellschaft": "CORPORATION",
    "Bundesarchiv": "OFFICIAL_INSTITUTION",
    "Stasi-Unterlagen": "OFFICIAL_INSTITUTION",
    "Thüringer Landtages": "OFFICIAL_INSTITUTION",
    "Gedenkstätte": "MUSEUM",
    "Museum": "MUSEUM",
    "Goethe- und Schiller": "RESEARCH_CENTER",
    "Akademie": "RESEARCH_CENTER",
    "Thüringer Archiv für Zeitgeschichte": "RESEARCH_CENTER",
    "Thüringer Industriearchiv": "RESEARCH_CENTER",
    "Thüringer Bauteil-Archiv": "RESEARCH_CENTER",
    "Thüringer Talsperren": "RESEARCH_CENTER",
    "Landesamt": "OFFICIAL_INSTITUTION",
    "Archiv des Vogtländischen": "COLLECTING_SOCIETY",
    "Archiv des Arbeitskreises": "NGO",
    "Grenzlandmuseum": "MUSEUM",
    "Archiv der VG": "ARCHIVE",
    "Archiv der Verwaltungsgemeinschaft": "ARCHIVE",
    "Archiv der Landgemeinde": "ARCHIVE",
    "Archiv der Sammlung": "RESEARCH_CENTER",
    "Musikarchiv": "RESEARCH_CENTER",
}


def infer_institution_type(name: str) -> str:
    """Map a German archive name onto a GLAM taxonomy category.

    The first ARCHIVE_TYPE_MAPPING keyword found as a substring of *name*
    (in insertion order) determines the category; unmatched names default
    to the generic "ARCHIVE".
    """
    return next(
        (category for keyword, category in ARCHIVE_TYPE_MAPPING.items() if keyword in name),
        "ARCHIVE",
    )
def parse_address_lines(lines: List[str]) -> Dict[str, str]:
    """
    Parse German address lines into a structured dict.

    Example input:
        ["Landesarchiv Thüringen - Staatsarchiv Altenburg",
         "Schloss 7",
         "04600 Altenburg"]
    Returns:
        {"organization": "Landesarchiv Thüringen - Staatsarchiv Altenburg",
         "street": "Schloss 7",
         "postal_code": "04600",
         "city": "Altenburg"}

    Rules, in priority order per line:
      1. the first non-empty line is the organization name;
      2. "NNNNN City" (5-digit German postal code) -> postal_code / city;
      3. "PF ..." or "Postfach ..." -> po_box;
      4. anything else is accumulated into street (comma-joined).
    """
    result: Dict[str, str] = {}
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        if "organization" not in result:
            # First *non-empty* line is the organization. The previous
            # index-based check (i == 0) silently dropped the organization
            # whenever the list started with a blank line.
            result["organization"] = line
        # German postal code pattern: exactly 5 digits, then the city name.
        elif re.match(r'^\d{5}\s+\S+', line):
            parts = line.split(None, 1)
            result["postal_code"] = parts[0]
            if len(parts) > 1:
                result["city"] = parts[1]
        # PO Box ("Postfach" / abbreviated "PF")
        elif line.startswith(("PF ", "Postfach")):
            result["po_box"] = line
        # Otherwise assume (possibly multi-line) street address.
        elif "street" not in result:
            result["street"] = line
        else:
            result["street"] += f", {line}"
    return result
def extract_detail_page_metadata(page) -> Dict:
    """
    Extract comprehensive metadata from an archive detail page.

    *page* is a Playwright page already navigated to a detail URL.
    All DOM scraping happens in a single ``page.evaluate`` round-trip
    (one injected JS function); the returned fields are then
    post-processed in Python (raw address lines -> structured dicts).

    Returns a dict with all available fields from the detail page.
    On failure a warning is printed and the (possibly partial) dict
    collected so far is returned — callers must treat every key as
    optional.
    """
    metadata = {}
    try:
        # One JS evaluation extracts everything. The page layout keys its
        # sections off German h4/h3/strong headings ('Postanschrift',
        # 'Bestand', 'Archivgeschichte', ...), hence the heading lookups.
        extracted = page.evaluate("""
        () => {
            const data = {};
            // Extract archive name from breadcrumb h1 (not site title)
            const h1Elements = document.querySelectorAll('h1');
            if (h1Elements.length >= 2) {
                data.name = h1Elements[1].textContent.trim();
            } else if (h1Elements.length === 1) {
                // Fallback: check if it's not the site title
                const h1Text = h1Elements[0].textContent.trim();
                if (h1Text !== 'Archivportal Thüringen') {
                    data.name = h1Text;
                }
            }
            // Helper to extract section by heading
            function extractSection(headingText) {
                const headings = Array.from(document.querySelectorAll('h4, h3, strong'));
                const heading = headings.find(h => h.textContent.trim() === headingText);
                if (!heading) return null;
                // Get parent container
                let container = heading.closest('div');
                if (!container) return null;
                // Extract text content after heading
                const content = [];
                let sibling = heading.nextElementSibling;
                while (sibling && !sibling.matches('h3, h4')) {
                    const text = sibling.textContent.trim();
                    if (text) content.push(text);
                    sibling = sibling.nextElementSibling;
                }
                return content.join('\\n').trim();
            }
            // Helper to extract list items from section
            function extractListItems(headingText) {
                const headings = Array.from(document.querySelectorAll('h4, h3, strong'));
                const heading = headings.find(h => h.textContent.trim() === headingText);
                if (!heading) return [];
                let container = heading.closest('div');
                if (!container) return [];
                const items = Array.from(container.querySelectorAll('li'))
                    .map(el => el.textContent.trim())
                    .filter(text => text.length > 0);
                return items;
            }
            // Extract Postanschrift (postal address)
            data.postal_address = extractListItems('Postanschrift');
            // Extract Dienstanschrift (physical address)
            data.physical_address = extractListItems('Dienstanschrift');
            // Extract Besucheranschrift (visitor address) if different
            data.visitor_address = extractListItems('Besucheranschrift');
            // Extract email
            const emailLinks = Array.from(document.querySelectorAll('a[href^="mailto:"]'));
            if (emailLinks.length > 0) {
                data.email = emailLinks[0].href.replace('mailto:', '').trim();
            }
            // Extract phone
            const phoneLinks = Array.from(document.querySelectorAll('a[href^="tel:"]'));
            if (phoneLinks.length > 0) {
                data.phone = phoneLinks[0].textContent.trim();
                // Also get raw phone number from href
                const phoneHref = phoneLinks[0].href.replace('tel:', '').trim();
                data.phone_raw = phoneHref;
            }
            // Extract fax (usually in text, not a link)
            const faxPattern = /Fax[:\\s]+([\\d\\s\\/\\-\\(\\)]+)/i;
            const bodyText = document.body.textContent;
            const faxMatch = bodyText.match(faxPattern);
            if (faxMatch) {
                data.fax = faxMatch[1].trim();
            }
            // Extract website
            const websiteLinks = Array.from(document.querySelectorAll('a[href^="http"]'))
                .filter(a => !a.href.includes('archive-in-thueringen.de'));
            if (websiteLinks.length > 0) {
                data.website = websiteLinks[0].href;
            }
            // Extract Öffnungszeiten (opening hours) - special extraction
            const openingHoursH4 = Array.from(document.querySelectorAll('h4'))
                .find(h => h.textContent.trim() === 'Öffnungszeiten');
            if (openingHoursH4) {
                const parent = openingHoursH4.parentElement;
                if (parent) {
                    const texts = [];
                    let node = openingHoursH4.nextSibling;
                    while (node && node !== parent.querySelector('h4:not(:first-of-type)')) {
                        if (node.nodeType === Node.TEXT_NODE && node.textContent.trim()) {
                            texts.push(node.textContent.trim());
                        } else if (node.nodeType === Node.ELEMENT_NODE && node.tagName !== 'H4') {
                            const text = node.textContent.trim();
                            if (text) texts.push(text);
                        }
                        node = node.nextSibling;
                    }
                    data.opening_hours = texts.join(' ');
                }
            }
            // Extract Archivleiter/in (director) - extract strong tag content
            const directorH4 = Array.from(document.querySelectorAll('h4'))
                .find(h => h.textContent.trim() === 'Archivleiter/in');
            if (directorH4) {
                const parent = directorH4.parentElement;
                if (parent) {
                    const strongElem = parent.querySelector('strong');
                    if (strongElem) {
                        data.director = strongElem.textContent.trim();
                    }
                }
            }
            // Extract Bestand (collection size) - in listitem before h4
            const bestandH4 = Array.from(document.querySelectorAll('h4'))
                .find(h => h.textContent.trim() === 'Bestand');
            if (bestandH4) {
                const listitem = bestandH4.closest('li');
                if (listitem) {
                    const text = listitem.textContent.replace('Bestand', '').trim();
                    data.collection_size = text;
                }
            }
            // Extract Laufzeit (temporal coverage) - in listitem before h4
            const laufzeitH4 = Array.from(document.querySelectorAll('h4'))
                .find(h => h.textContent.trim() === 'Laufzeit');
            if (laufzeitH4) {
                const listitem = laufzeitH4.closest('li');
                if (listitem) {
                    const text = listitem.textContent.replace('Laufzeit', '').trim();
                    data.temporal_coverage = text;
                }
            }
            // Extract Archivgeschichte (archive history) - all paragraphs after h4
            const archivgeschichteH4 = Array.from(document.querySelectorAll('h4'))
                .find(h => h.textContent.trim() === 'Archivgeschichte');
            if (archivgeschichteH4) {
                const parent = archivgeschichteH4.parentElement;
                if (parent) {
                    const paragraphs = Array.from(parent.querySelectorAll('p'))
                        .map(p => p.textContent.trim())
                        .filter(text => text.length > 0);
                    data.archive_history = paragraphs.join('\\n\\n');
                }
            }
            // Extract Bestände (collection descriptions)
            data.collections = extractSection('Bestände');
            // Extract Tektonik (classification system)
            data.classification = extractSection('Tektonik');
            // Extract Recherche (research information)
            data.research_info = extractSection('Recherche');
            // Extract Benutzung (access/usage information)
            data.usage_info = extractSection('Benutzung');
            return data;
        }
        """)
        # Merge the JS-extracted fields into the result dict.
        metadata.update(extracted)
        # Post-process: parse raw address line lists into structured dicts
        # (adds *_parsed keys alongside the raw lists).
        if metadata.get('postal_address'):
            metadata['postal_address_parsed'] = parse_address_lines(metadata['postal_address'])
        if metadata.get('physical_address'):
            metadata['physical_address_parsed'] = parse_address_lines(metadata['physical_address'])
        if metadata.get('visitor_address'):
            metadata['visitor_address_parsed'] = parse_address_lines(metadata['visitor_address'])
    except Exception as e:
        # Best-effort: warn and return whatever was collected so far.
        print(f" ⚠️ Error extracting metadata: {e}")
    return metadata
def harvest_archive_list(page) -> List[Dict]:
    """Collect the unique archive detail-page URLs from the main list page.

    *page* is a Playwright page. Returns a list of ``{"id": str, "url": str}``
    dicts, deduplicated by the numeric id embedded in each
    ``/de/archiv/view/id/{ID}`` link.
    """
    print(f"📄 Loading archive list page...")
    page.goto(ARCHIVE_LIST_URL, wait_until='networkidle', timeout=30000)
    # Dismiss the cookie banner if one is present. This is deliberately
    # best-effort: the banner may be absent or localized differently.
    try:
        cookie_button = page.locator('button:has-text("Akzeptieren"), button:has-text("Accept")')
        if cookie_button.is_visible(timeout=2000):
            cookie_button.click()
            print("✅ Accepted cookies")
            time.sleep(1)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
        # are no longer swallowed; banner failures remain non-fatal.
        pass
    print("📋 Extracting archive URLs...")
    # Extract and deduplicate all detail-page links in a single JS evaluation.
    result = page.evaluate("""
    () => {
        const archiveLinks = document.querySelectorAll('ul li a[href*="/de/archiv/view/id/"]');
        const uniqueArchives = new Map();
        archiveLinks.forEach(link => {
            const url = link.href;
            const idMatch = url.match(/\\/id\\/(\\d+)/);
            if (!idMatch) return;
            const archiveId = idMatch[1];
            if (uniqueArchives.has(archiveId)) return;
            uniqueArchives.set(archiveId, {
                id: archiveId,
                url: url
            });
        });
        return Array.from(uniqueArchives.values());
    }
    """)
    print(f"✅ Found {len(result)} unique archives")
    return result
def harvest_thueringen_archives_comprehensive() -> List[Dict]:
    """
    Harvest COMPLETE metadata from all 149 Thüringen archives.

    This comprehensive version visits each detail page to extract:
    - Addresses (postal, physical, visitor)
    - Contact info (email, phone, fax, website)
    - Opening hours
    - Director names
    - Collection sizes
    - Temporal coverage
    - Archive histories
    - Collection descriptions

    Returns one dict per archive. Failed pages are not dropped: they are
    recorded as stub entries containing an "error" key so downstream
    merging can account for them.
    """
    print(f"🚀 Thüringen Archives Comprehensive Harvester v2.0")
    print(f"📍 Portal: {ARCHIVE_LIST_URL}")
    print(f"⏱️ Starting harvest at {datetime.now(timezone.utc).isoformat()}")
    print(f"⏳ Expected time: ~150 seconds (1 sec/page × 149 pages)")
    print()
    archives = []
    with sync_playwright() as p:
        print("🌐 Launching browser...")
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        )
        page = context.new_page()
        try:
            # Step 1: Get list of all archive detail URLs from the list page.
            archive_list = harvest_archive_list(page)
            total = len(archive_list)
            # Step 2: Visit each detail page and extract full metadata.
            print(f"\n📚 Processing {total} archive detail pages...")
            print(f"⏱️ Rate limit: {REQUEST_DELAY}s between requests")
            print()
            start_time = time.time()
            for idx, archive_info in enumerate(archive_list, 1):
                archive_id = archive_info['id']
                archive_url = archive_info['url']
                print(f"[{idx}/{total}] Processing ID {archive_id}...", end=' ')
                try:
                    # Visit detail page; short sleep lets client-side JS render.
                    page.goto(archive_url, wait_until='domcontentloaded', timeout=15000)
                    time.sleep(0.5)
                    metadata = extract_detail_page_metadata(page)
                    # Determine city: prefer the physical address, fall back
                    # to the postal address.
                    city = None
                    if metadata.get('physical_address_parsed', {}).get('city'):
                        city = metadata['physical_address_parsed']['city']
                    elif metadata.get('postal_address_parsed', {}).get('city'):
                        city = metadata['postal_address_parsed']['city']
                    # Infer institution type from the archive name.
                    inst_type = infer_institution_type(metadata.get('name', ''))
                    # Build structured record.
                    archive_data = {
                        "id": f"thueringen-{archive_id}",
                        "name": metadata.get('name', ''),
                        "institution_type": inst_type,
                        "city": city,
                        "region": "Thüringen",
                        "country": "DE",
                        "url": archive_url,
                        "source_portal": "archive-in-thueringen.de",
                        # Contact information
                        "email": metadata.get('email'),
                        "phone": metadata.get('phone'),
                        "fax": metadata.get('fax'),
                        "website": metadata.get('website'),
                        # Addresses
                        "postal_address": metadata.get('postal_address_parsed'),
                        "physical_address": metadata.get('physical_address_parsed'),
                        "visitor_address": metadata.get('visitor_address_parsed'),
                        # Archive details
                        "opening_hours": metadata.get('opening_hours'),
                        "director": metadata.get('director'),
                        "collection_size": metadata.get('collection_size'),
                        "temporal_coverage": metadata.get('temporal_coverage'),
                        "archive_history": metadata.get('archive_history'),
                        "collections": metadata.get('collections'),
                        "classification": metadata.get('classification'),
                        "research_info": metadata.get('research_info'),
                        "usage_info": metadata.get('usage_info'),
                        # Provenance
                        "provenance": {
                            "data_source": "WEB_SCRAPING",
                            "data_tier": "TIER_2_VERIFIED",
                            "extraction_date": datetime.now(timezone.utc).isoformat(),
                            "extraction_method": "Playwright comprehensive detail page extraction",
                            "source_url": archive_url,
                            "confidence_score": 0.95
                        }
                    }
                    archives.append(archive_data)
                    print(f"{metadata.get('name', 'Unknown')[:40]}")
                except PlaywrightTimeout:
                    # Record the failure as a stub entry rather than dropping it.
                    print(f"⏱️ Timeout")
                    archives.append({
                        "id": f"thueringen-{archive_id}",
                        "url": archive_url,
                        "error": "timeout",
                        "provenance": {
                            "data_source": "WEB_SCRAPING",
                            "extraction_date": datetime.now(timezone.utc).isoformat(),
                            "confidence_score": 0.0
                        }
                    })
                except Exception as e:
                    print(f"❌ Error: {e}")
                    archives.append({
                        "id": f"thueringen-{archive_id}",
                        "url": archive_url,
                        "error": str(e),
                        "provenance": {
                            "data_source": "WEB_SCRAPING",
                            "extraction_date": datetime.now(timezone.utc).isoformat(),
                            "confidence_score": 0.0
                        }
                    })
                # Rate limiting (skipped after the final page).
                if idx < total:
                    time.sleep(REQUEST_DELAY)
                # Progress update every 25 archives.
                if idx % 25 == 0:
                    elapsed = time.time() - start_time
                    rate = idx / elapsed
                    remaining = (total - idx) / rate
                    print(f" 📊 Progress: {idx}/{total} ({idx/total*100:.1f}%) | " +
                          f"Speed: {rate:.1f}/sec | ETA: {remaining/60:.1f} min")
            # Final statistics.
            elapsed = time.time() - start_time
            successful = sum(1 for a in archives if 'error' not in a)
            print(f"\n📊 Harvest Statistics:")
            print(f" Total archives: {len(archives)}")
            print(f" Successful: {successful}")
            print(f" Failed: {len(archives) - successful}")
            print(f" Time elapsed: {elapsed:.1f} seconds ({elapsed/60:.1f} minutes)")
            print(f" Speed: {len(archives)/elapsed:.1f} archives/second")
            # Count by institution type (successful entries only).
            type_counts = {}
            for archive in archives:
                if 'error' not in archive:
                    inst_type = archive.get('institution_type', 'UNKNOWN')
                    type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
            print(f"\n By institution type:")
            for inst_type, count in sorted(type_counts.items(), key=lambda x: -x[1]):
                print(f" - {inst_type}: {count}")
            # Metadata completeness report.
            with_email = sum(1 for a in archives if a.get('email'))
            with_phone = sum(1 for a in archives if a.get('phone'))
            with_address = sum(1 for a in archives if a.get('physical_address'))
            with_director = sum(1 for a in archives if a.get('director'))
            with_collection = sum(1 for a in archives if a.get('collection_size'))
            with_history = sum(1 for a in archives if a.get('archive_history'))
            if successful:
                print(f"\n Metadata completeness:")
                print(f" - Email: {with_email}/{successful} ({with_email/successful*100:.1f}%)")
                print(f" - Phone: {with_phone}/{successful} ({with_phone/successful*100:.1f}%)")
                print(f" - Physical address: {with_address}/{successful} ({with_address/successful*100:.1f}%)")
                print(f" - Director: {with_director}/{successful} ({with_director/successful*100:.1f}%)")
                print(f" - Collection size: {with_collection}/{successful} ({with_collection/successful*100:.1f}%)")
                print(f" - Archive history: {with_history}/{successful} ({with_history/successful*100:.1f}%)")
            else:
                # Guard against ZeroDivisionError when every page failed.
                print(f"\n Metadata completeness: n/a (no successful extractions)")
        except Exception as e:
            print(f"❌ Critical error during harvest: {e}")
            import traceback
            traceback.print_exc()
        finally:
            browser.close()
    return archives
def save_results(archives: List[Dict]) -> Path:
    """Persist the comprehensive harvest as a timestamped JSON file.

    Writes UTF-8 JSON (non-ASCII preserved) into OUTPUT_DIR and returns
    the path of the file that was written.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_DIR / f"thueringen_archives_comprehensive_{stamp}.json"
    ok_count = sum(1 for entry in archives if 'error' not in entry)
    payload = {
        "metadata": {
            "source": "archive-in-thueringen.de",
            "harvest_date": datetime.now(timezone.utc).isoformat(),
            "total_archives": len(archives),
            "successful_extractions": ok_count,
            "region": "Thüringen",
            "country": "DE",
            "harvester_version": "2.0 (comprehensive)",
            "extraction_level": "comprehensive_detail_pages"
        },
        "archives": archives
    }
    with output_file.open('w', encoding='utf-8') as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)
    print(f"\n💾 Results saved to: {output_file}")
    print(f" File size: {output_file.stat().st_size / 1024:.1f} KB")
    return output_file
def main():
    """Entry point: run the harvest, save results, and print next steps.

    Returns a process exit code (0 on success, 1 when nothing was harvested).
    """
    start_time = time.time()
    archives = harvest_thueringen_archives_comprehensive()
    # Guard clause: nothing harvested means nothing to save.
    if not archives:
        print("\n❌ No archives harvested!")
        return 1
    output_file = save_results(archives)
    elapsed = time.time() - start_time
    print(f"\n✅ Comprehensive harvest completed in {elapsed:.1f} seconds ({elapsed/60:.1f} minutes)")
    print(f"\n🎯 Next Steps:")
    print(f" 1. Validate extracted data quality")
    print(f" 2. Merge with German dataset v3: python scripts/scrapers/merge_thueringen_to_german_dataset.py {output_file}")
    print(f" 3. Expected result: Replace 89 basic entries with 149 comprehensive entries")
    return 0


if __name__ == "__main__":
    exit(main())