glam/scripts/scrapers/harvest_thueringen_archives.py
2025-11-19 23:25:22 +01:00

352 lines
13 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Thüringen Archives Harvester
Extracts 149 archives from archive-in-thueringen.de
Portal: https://www.archive-in-thueringen.de/de/archiv/list
Strategy: All archives visible on single page - direct extraction
Speed: ~10 seconds
Author: OpenCode + AI Agent
Date: 2025-11-20
Version: 1.0
"""
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
import re
# Configuration
BASE_URL = "https://www.archive-in-thueringen.de"
# Single list page that renders all archives at once (no pagination).
ARCHIVE_LIST_URL = f"{BASE_URL}/de/archiv/list"
# NOTE(review): machine-specific absolute path — consider making this
# configurable (env var / CLI flag) before running on another machine.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
# Import-time side effect: ensure the output directory exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Map German archive-name keywords to the GLAM taxonomy.
# Insertion order is significant: the first keyword found in a name wins.
ARCHIVE_TYPE_MAPPING = {
    "Landesarchiv": "OFFICIAL_INSTITUTION",
    "Staatsarchiv": "OFFICIAL_INSTITUTION",
    "Hauptstaatsarchiv": "OFFICIAL_INSTITUTION",
    "Stadtarchiv": "ARCHIVE",
    "Gemeindearchiv": "ARCHIVE",
    "Kreisarchiv": "ARCHIVE",
    "Stadt- und Kreisarchiv": "ARCHIVE",
    "Bistumsarchiv": "HOLY_SITES",
    "Kirchenkreisarchiv": "HOLY_SITES",
    "Landeskirchenarchiv": "HOLY_SITES",
    "Archiv des Ev.": "HOLY_SITES",
    "Archiv des Bischöflichen": "HOLY_SITES",
    "Pfarrhausarchiv": "HOLY_SITES",
    "Universitätsarchiv": "EDUCATION_PROVIDER",
    "Hochschularchiv": "EDUCATION_PROVIDER",
    "Hochschule": "EDUCATION_PROVIDER",
    "Universität": "EDUCATION_PROVIDER",
    "Fachhochschule": "EDUCATION_PROVIDER",
    "Fachschule": "EDUCATION_PROVIDER",
    "Carl Zeiss": "CORPORATION",
    "SCHOTT": "CORPORATION",
    "Wirtschaftsarchiv": "CORPORATION",
    "Handwerkskammer": "CORPORATION",
    "Handelskammer": "CORPORATION",
    "Industrie- und Handelskammer": "CORPORATION",
    "Lederfabrik": "CORPORATION",
    "Verlagsgesellschaft": "CORPORATION",
    "Bundesarchiv": "OFFICIAL_INSTITUTION",
    "Stasi-Unterlagen": "OFFICIAL_INSTITUTION",
    "Thüringer Landtages": "OFFICIAL_INSTITUTION",
    "Gedenkstätte": "MUSEUM",
    "Museum": "MUSEUM",
    "Goethe- und Schiller": "RESEARCH_CENTER",
    "Akademie": "RESEARCH_CENTER",
    "Thüringer Archiv für Zeitgeschichte": "RESEARCH_CENTER",
    "Thüringer Industriearchiv": "RESEARCH_CENTER",
    "Thüringer Bauteil-Archiv": "RESEARCH_CENTER",
    "Thüringer Talsperren": "RESEARCH_CENTER",
    "Landesamt": "OFFICIAL_INSTITUTION",
    "Archiv des Vogtländischen": "COLLECTING_SOCIETY",
    "Archiv des Arbeitskreises": "NGO",
    "Grenzlandmuseum": "MUSEUM",
    "Archiv der VG": "ARCHIVE",  # Verwaltungsgemeinschaft = administrative community
    "Archiv der Verwaltungsgemeinschaft": "ARCHIVE",
    "Archiv der Landgemeinde": "ARCHIVE",
    "Archiv der Sammlung": "RESEARCH_CENTER",
    "Musikarchiv": "RESEARCH_CENTER",
}


def infer_institution_type(name: str) -> str:
    """Classify a German archive by substring-matching its name.

    Returns the GLAM type of the first ARCHIVE_TYPE_MAPPING keyword that
    occurs in *name*; falls back to the generic "ARCHIVE" type when no
    keyword matches.
    """
    return next(
        (kind for keyword, kind in ARCHIVE_TYPE_MAPPING.items() if keyword in name),
        "ARCHIVE",
    )
# Archive-name patterns whose capture group is the city, tried in priority
# order.  The bool flag marks the one pattern ("Stadtarchiv") whose captured
# city may carry a parenthetical abbreviation like "(StadtA NDH)" to strip.
_CITY_NAME_PATTERNS = [
    (re.compile(r'Stadtarchiv\s+(.+?)(?:\s*$)'), True),
    (re.compile(r'Hauptstaatsarchiv\s+(.+?)$'), False),
    (re.compile(r'Staatsarchiv\s+(.+?)$'), False),
    (re.compile(r'Gemeindearchiv\s+(.+?)$'), False),
    (re.compile(r'Stadt-\s+und\s+Kreisarchiv\s+(.+?)$'), False),
    (re.compile(r'Universitätsarchiv\s+(.+?)$'), False),
]

# Substrings that identify the first dash-separated part as an organization
# name rather than a city.
_NON_CITY_PREFIXES = (
    "Landesarchiv",
    "Bundesarchiv",
    "EKM",
    "Landkreis",
    "Kreisarchiv Ilm-Kreis",  # special case
)


def extract_city_from_fulltext(fulltext: str) -> Optional[str]:
    """Extract the city from the portal's "City - Archive Name" display string.

    Examples:
        - "Altenburg - Stadtarchiv Altenburg" -> "Altenburg"
        - "Erfurt - Stadtarchiv Erfurt" -> "Erfurt"
        - "Landesarchiv Thüringen - Staatsarchiv Altenburg" -> "Altenburg" (from name)
        - "Arnstadt - Kreisarchiv Ilm-Kreis - Altkreis Ilmenau" -> "Arnstadt"

    Returns:
        The city name, or None when no pattern applies.
    """
    # PRIORITY 1: derive the city from specific archive-name patterns.
    # Must run BEFORE the split-by-dash logic so that strings like
    # "Landesarchiv ... - Staatsarchiv City" resolve to the trailing city.
    for pattern, strip_parens in _CITY_NAME_PATTERNS:
        match = pattern.search(fulltext)
        if match:
            city = match.group(1).strip()
            if strip_parens:
                # Remove trailing "(StadtA NDH)" style abbreviations.
                city = re.sub(r'\s*\(.*\)$', '', city)
            return city

    # PRIORITY 2: split "City - Archive Name" and take the first part,
    # unless that part is recognizably an organization name.
    if " - " in fulltext:
        potential_city = fulltext.split(" - ")[0].strip()
        if not any(prefix in potential_city for prefix in _NON_CITY_PREFIXES):
            return potential_city

    return None
def harvest_thueringen_archives() -> List[Dict]:
    """Harvest all archives listed on the Thüringen archive portal.

    The portal renders every archive on one list page, so a single page
    load plus an in-page JavaScript extraction is sufficient.

    Returns:
        A list of archive records (id, name, inferred institution type,
        city, provenance metadata, ...).  Empty on failure — errors are
        printed, not raised.
    """
    print(f"🚀 Thüringen Archives Harvester v1.0")
    print(f"📍 Portal: {ARCHIVE_LIST_URL}")
    print(f"⏱️ Starting harvest at {datetime.now(timezone.utc).isoformat()}")
    print()
    archives = []
    with sync_playwright() as p:
        print("🌐 Launching browser...")
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        )
        page = context.new_page()
        try:
            print(f"📄 Loading archive list page...")
            page.goto(ARCHIVE_LIST_URL, wait_until='networkidle', timeout=30000)
            # Best-effort cookie-banner dismissal; a missing banner is fine.
            try:
                cookie_button = page.locator('button:has-text("Akzeptieren"), button:has-text("Accept")')
                if cookie_button.is_visible(timeout=2000):
                    cookie_button.click()
                    print("✅ Accepted cookies")
                    time.sleep(1)
            except Exception:
                # FIX: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit; keep the best-effort
                # semantics but only for ordinary exceptions.
                pass
            print("📋 Extracting archives from page...")
            # Extract all archive links in-page, deduplicating by the
            # numeric id embedded in each /de/archiv/view/id/<n> URL.
            result = page.evaluate("""
                () => {
                    const archiveLinks = document.querySelectorAll('ul li a[href*="/de/archiv/view/id/"]');
                    const uniqueArchives = new Map();
                    archiveLinks.forEach(link => {
                        const fullText = link.textContent.trim();
                        const url = link.href;
                        // Extract ID from URL
                        const idMatch = url.match(/\\/id\\/(\\d+)/);
                        if (!idMatch) return;
                        const archiveId = idMatch[1];
                        if (uniqueArchives.has(archiveId)) return;
                        // Parse "City - Archive Name" format
                        let city = '';
                        let archiveName = '';
                        if (fullText.includes(' - ')) {
                            const parts = fullText.split(' - ');
                            city = parts[0].trim();
                            archiveName = parts.slice(1).join(' - ').trim();
                        } else {
                            archiveName = fullText;
                        }
                        uniqueArchives.set(archiveId, {
                            id: archiveId,
                            city: city,
                            name: archiveName,
                            url: url,
                            fullText: fullText
                        });
                    });
                    return Array.from(uniqueArchives.values());
                }
            """)
            print(f"✅ Extracted {len(result)} unique archives")
            # Normalize each raw record into the GLAM schema.
            for raw_archive in result:
                # Re-derive the city in Python: the naive in-page split can
                # mis-handle names like "Landesarchiv ... - Staatsarchiv X".
                city = extract_city_from_fulltext(raw_archive['fullText'])
                inst_type = infer_institution_type(raw_archive['name'])
                archive_data = {
                    "id": f"thueringen-{raw_archive['id']}",
                    "name": raw_archive['name'],
                    "institution_type": inst_type,
                    "city": city,
                    "region": "Thüringen",
                    "country": "DE",
                    "url": raw_archive['url'],
                    "source_portal": "archive-in-thueringen.de",
                    "fulltext_display": raw_archive['fullText'],
                    "provenance": {
                        "data_source": "WEB_SCRAPING",
                        "data_tier": "TIER_2_VERIFIED",
                        "extraction_date": datetime.now(timezone.utc).isoformat(),
                        "extraction_method": "Playwright direct page extraction",
                        "source_url": ARCHIVE_LIST_URL,
                        "confidence_score": 0.95
                    }
                }
                archives.append(archive_data)
            print(f"\n📊 Harvest Statistics:")
            print(f"   Total archives: {len(archives)}")
            # Tally archives per institution type for the summary output.
            type_counts = {}
            for archive in archives:
                inst_type = archive['institution_type']
                type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
            print(f"   By type:")
            for inst_type, count in sorted(type_counts.items(), key=lambda x: -x[1]):
                print(f"      - {inst_type}: {count}")
            with_city = sum(1 for a in archives if a.get('city'))
            # FIX: guard the percentage against ZeroDivisionError when the
            # extraction yielded no archives at all.
            if archives:
                print(f"   With city names: {with_city}/{len(archives)} ({with_city/len(archives)*100:.1f}%)")
        except Exception as e:
            print(f"❌ Error during harvest: {e}")
            import traceback
            traceback.print_exc()
        finally:
            browser.close()
    return archives
def save_results(archives: List[Dict]) -> Path:
    """Persist *archives* as a timestamped JSON file under OUTPUT_DIR.

    The records are wrapped in a small metadata envelope (source portal,
    harvest date, counts) and written UTF-8 with human-readable indentation.

    Returns:
        Path of the file that was written.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_DIR / f"thueringen_archives_{stamp}.json"
    payload = {
        "metadata": {
            "source": "archive-in-thueringen.de",
            "harvest_date": datetime.now(timezone.utc).isoformat(),
            "total_archives": len(archives),
            "region": "Thüringen",
            "country": "DE",
            "harvester_version": "1.0"
        },
        "archives": archives
    }
    with output_file.open('w', encoding='utf-8') as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)
    print(f"\n💾 Results saved to: {output_file}")
    print(f"   File size: {output_file.stat().st_size / 1024:.1f} KB")
    return output_file
def main():
    """Run the harvest, save results, and report timing statistics.

    Returns:
        Process exit code: 0 on success, 1 when nothing was harvested.
    """
    started = time.time()
    archives = harvest_thueringen_archives()
    # Guard clause: nothing harvested means nothing to save.
    if not archives:
        print("\n❌ No archives harvested!")
        return 1
    output_file = save_results(archives)
    duration = time.time() - started
    print(f"\n✅ Harvest completed in {duration:.1f} seconds")
    print(f"📈 Speed: {len(archives)/duration:.1f} archives/second")
    print(f"\n🎯 Next Steps:")
    print(f"   1. Run geocoding: python scripts/enrich_geocoding.py {output_file}")
    print(f"   2. Merge with German dataset: python scripts/scrapers/merge_thueringen_to_german_dataset.py")
    print(f"   3. Expected new additions: ~120 archives (after deduplication)")
    return 0


if __name__ == "__main__":
    exit(main())