#!/usr/bin/env python3
"""
Thüringen Archives Harvester
Extracts 149 archives from archive-in-thueringen.de

Portal: https://www.archive-in-thueringen.de/de/archiv/list
Strategy: All archives visible on single page - direct extraction
Speed: ~10 seconds

Author: OpenCode + AI Agent
Date: 2025-11-20
Version: 1.0
"""

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
import re
# Configuration
BASE_URL = "https://www.archive-in-thueringen.de"
ARCHIVE_LIST_URL = f"{BASE_URL}/de/archiv/list"
# NOTE(review): hard-coded absolute user path — consider making this
# configurable (env var / CLI flag) before running on another machine.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # side effect at import time

# Map German archive types to GLAM taxonomy.
# Keys are SUBSTRINGS matched against the archive name in insertion order by
# infer_institution_type(); the first key contained in the name wins, so keep
# more specific keys before more general ones when their values differ.
ARCHIVE_TYPE_MAPPING = {
    # State / government archives
    "Landesarchiv": "OFFICIAL_INSTITUTION",
    "Staatsarchiv": "OFFICIAL_INSTITUTION",
    "Hauptstaatsarchiv": "OFFICIAL_INSTITUTION",
    # Municipal and district archives
    "Stadtarchiv": "ARCHIVE",
    "Gemeindearchiv": "ARCHIVE",
    "Kreisarchiv": "ARCHIVE",
    "Stadt- und Kreisarchiv": "ARCHIVE",
    # Church archives
    "Bistumsarchiv": "HOLY_SITES",
    "Kirchenkreisarchiv": "HOLY_SITES",
    "Landeskirchenarchiv": "HOLY_SITES",
    "Archiv des Ev.": "HOLY_SITES",
    "Archiv des Bischöflichen": "HOLY_SITES",
    "Pfarrhausarchiv": "HOLY_SITES",
    # Universities and schools
    "Universitätsarchiv": "EDUCATION_PROVIDER",
    "Hochschularchiv": "EDUCATION_PROVIDER",
    "Hochschule": "EDUCATION_PROVIDER",
    "Universität": "EDUCATION_PROVIDER",
    "Fachhochschule": "EDUCATION_PROVIDER",
    "Fachschule": "EDUCATION_PROVIDER",
    # Companies and business bodies
    "Carl Zeiss": "CORPORATION",
    "SCHOTT": "CORPORATION",
    "Wirtschaftsarchiv": "CORPORATION",
    "Handwerkskammer": "CORPORATION",
    "Handelskammer": "CORPORATION",
    "Industrie- und Handelskammer": "CORPORATION",
    "Lederfabrik": "CORPORATION",
    "Verlagsgesellschaft": "CORPORATION",
    # Federal / state institutions
    "Bundesarchiv": "OFFICIAL_INSTITUTION",
    "Stasi-Unterlagen": "OFFICIAL_INSTITUTION",
    "Thüringer Landtages": "OFFICIAL_INSTITUTION",
    # Memorials and museums
    "Gedenkstätte": "MUSEUM",
    "Museum": "MUSEUM",
    # Research institutions
    "Goethe- und Schiller": "RESEARCH_CENTER",
    "Akademie": "RESEARCH_CENTER",
    "Thüringer Archiv für Zeitgeschichte": "RESEARCH_CENTER",
    "Thüringer Industriearchiv": "RESEARCH_CENTER",
    "Thüringer Bauteil-Archiv": "RESEARCH_CENTER",
    "Thüringer Talsperren": "RESEARCH_CENTER",
    "Landesamt": "OFFICIAL_INSTITUTION",
    # Societies / NGOs
    "Archiv des Vogtländischen": "COLLECTING_SOCIETY",
    "Archiv des Arbeitskreises": "NGO",
    "Grenzlandmuseum": "MUSEUM",
    # Administrative communities
    "Archiv der VG": "ARCHIVE",  # Verwaltungsgemeinschaft = administrative community
    "Archiv der Verwaltungsgemeinschaft": "ARCHIVE",
    "Archiv der Landgemeinde": "ARCHIVE",
    # Collections
    "Archiv der Sammlung": "RESEARCH_CENTER",
    "Musikarchiv": "RESEARCH_CENTER",
}
def infer_institution_type(name: str) -> str:
    """Map a German archive name to a GLAM institution type.

    Scans ARCHIVE_TYPE_MAPPING in insertion order and returns the type of
    the first keyword that occurs as a substring of *name*.  Names matching
    no keyword default to the generic "ARCHIVE" type.
    """
    return next(
        (category
         for keyword, category in ARCHIVE_TYPE_MAPPING.items()
         if keyword in name),
        "ARCHIVE",
    )
# Archive-name patterns that pinpoint the city directly.  Checked in order,
# BEFORE the generic "City - Name" split, so entries such as
# "Landesarchiv Thüringen - Staatsarchiv Altenburg" yield the city from the
# archive name rather than from the leading organisation name.
_CITY_FROM_NAME_PATTERNS = (
    re.compile(r'Stadtarchiv\s+(.+?)$'),
    re.compile(r'Hauptstaatsarchiv\s+(.+?)$'),
    re.compile(r'Staatsarchiv\s+(.+?)$'),
    re.compile(r'Gemeindearchiv\s+(.+?)$'),
    re.compile(r'Stadt-\s+und\s+Kreisarchiv\s+(.+?)$'),
    re.compile(r'Universitätsarchiv\s+(.+?)$'),
)

# First parts of "X - Y" strings that are organisation names, not cities.
_NON_CITY_PREFIXES = (
    "Landesarchiv",
    "Bundesarchiv",
    "EKM",
    "Landkreis",
    "Kreisarchiv Ilm-Kreis",  # special case
)


def extract_city_from_fulltext(fulltext: str) -> Optional[str]:
    """
    Extract city from Thüringen archive format: "City - Archive Name".

    Examples:
    - "Altenburg - Stadtarchiv Altenburg" -> "Altenburg"
    - "Erfurt - Stadtarchiv Erfurt" -> "Erfurt"
    - "Landesarchiv Thüringen - Staatsarchiv Altenburg" -> "Altenburg" (from name)
    - "Arnstadt - Kreisarchiv Ilm-Kreis - Altkreis Ilmenau" -> "Arnstadt"

    Returns None when no city can be determined.
    """
    # PRIORITY 1: city embedded in a well-known archive-name pattern.
    for pattern in _CITY_FROM_NAME_PATTERNS:
        match = pattern.search(fulltext)
        if match:
            city = match.group(1).strip()
            # Drop trailing abbreviations like "(StadtA NDH)".
            # FIX: previously this strip was applied only to the Stadtarchiv
            # branch; it is now applied uniformly to every pattern.
            return re.sub(r'\s*\(.*\)$', '', city)

    # PRIORITY 2: "City - Archive Name" -> first part is the city, unless it
    # is a known organisation name rather than a place name.
    if " - " in fulltext:
        candidate = fulltext.split(" - ")[0].strip()
        if not any(prefix in candidate for prefix in _NON_CITY_PREFIXES):
            return candidate

    return None
def harvest_thueringen_archives() -> List[Dict]:
    """
    Harvest all 149 archives from Thüringen archive portal.

    All archives are visible on a single page, so this is very fast.

    Returns the list of archive records built so far — possibly partial or
    empty if an error occurs mid-harvest (errors are printed, not raised).
    """
    print(f"🚀 Thüringen Archives Harvester v1.0")
    print(f"📍 Portal: {ARCHIVE_LIST_URL}")
    print(f"⏱️ Starting harvest at {datetime.now(timezone.utc).isoformat()}")
    print()

    archives = []

    with sync_playwright() as p:
        print("🌐 Launching browser...")
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        )
        page = context.new_page()

        try:
            print(f"📄 Loading archive list page...")
            # 'networkidle' waits until network activity settles so the full
            # archive list is rendered before extraction.
            page.goto(ARCHIVE_LIST_URL, wait_until='networkidle', timeout=30000)

            # Accept cookies if present (best-effort; ignore if no banner).
            try:
                cookie_button = page.locator('button:has-text("Akzeptieren"), button:has-text("Accept")')
                if cookie_button.is_visible(timeout=2000):
                    cookie_button.click()
                    print("✅ Accepted cookies")
                    time.sleep(1)
            except Exception:
                # FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit. Still best-effort.
                pass

            print("📋 Extracting archives from page...")

            # Extract archives using JavaScript in the page context.
            # Returns a de-duplicated list of {id, city, name, url, fullText}.
            result = page.evaluate("""
                () => {
                    const archiveLinks = document.querySelectorAll('ul li a[href*="/de/archiv/view/id/"]');
                    const uniqueArchives = new Map();

                    archiveLinks.forEach(link => {
                        const fullText = link.textContent.trim();
                        const url = link.href;

                        // Extract ID from URL
                        const idMatch = url.match(/\\/id\\/(\\d+)/);
                        if (!idMatch) return;

                        const archiveId = idMatch[1];
                        if (uniqueArchives.has(archiveId)) return;

                        // Parse "City - Archive Name" format
                        let city = '';
                        let archiveName = '';

                        if (fullText.includes(' - ')) {
                            const parts = fullText.split(' - ');
                            city = parts[0].trim();
                            archiveName = parts.slice(1).join(' - ').trim();
                        } else {
                            archiveName = fullText;
                        }

                        uniqueArchives.set(archiveId, {
                            id: archiveId,
                            city: city,
                            name: archiveName,
                            url: url,
                            fullText: fullText
                        });
                    });

                    return Array.from(uniqueArchives.values());
                }
            """)

            print(f"✅ Extracted {len(result)} unique archives")

            # Process each archive
            for raw_archive in result:
                # Extract city from full text (ignore JavaScript split which may be wrong)
                city = extract_city_from_fulltext(raw_archive['fullText'])

                # Infer institution type
                inst_type = infer_institution_type(raw_archive['name'])

                archive_data = {
                    "id": f"thueringen-{raw_archive['id']}",
                    "name": raw_archive['name'],
                    "institution_type": inst_type,
                    "city": city,
                    "region": "Thüringen",
                    "country": "DE",
                    "url": raw_archive['url'],
                    "source_portal": "archive-in-thueringen.de",
                    "fulltext_display": raw_archive['fullText'],
                    "provenance": {
                        "data_source": "WEB_SCRAPING",
                        "data_tier": "TIER_2_VERIFIED",
                        "extraction_date": datetime.now(timezone.utc).isoformat(),
                        "extraction_method": "Playwright direct page extraction",
                        "source_url": ARCHIVE_LIST_URL,
                        "confidence_score": 0.95
                    }
                }

                archives.append(archive_data)

            print(f"\n📊 Harvest Statistics:")
            print(f"   Total archives: {len(archives)}")

            # Count by type
            type_counts = {}
            for archive in archives:
                inst_type = archive['institution_type']
                type_counts[inst_type] = type_counts.get(inst_type, 0) + 1

            print(f"   By type:")
            for inst_type, count in sorted(type_counts.items(), key=lambda x: -x[1]):
                print(f"      - {inst_type}: {count}")

            # Count archives with cities
            with_city = sum(1 for a in archives if a.get('city'))
            # FIX: guard against ZeroDivisionError when no archives were found.
            pct = with_city / len(archives) * 100 if archives else 0.0
            print(f"   With city names: {with_city}/{len(archives)} ({pct:.1f}%)")

        except Exception as e:
            # Log and fall through: partial results are still returned.
            print(f"❌ Error during harvest: {e}")
            import traceback
            traceback.print_exc()

        finally:
            browser.close()

    return archives
def save_results(archives: List[Dict]) -> Path:
    """Persist harvested archives as a timestamped JSON file in OUTPUT_DIR.

    The payload wraps the records in a "metadata"/"archives" envelope and is
    written UTF-8 encoded with non-ASCII characters preserved.  Returns the
    path of the file that was written.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_DIR / f"thueringen_archives_{stamp}.json"

    payload = {
        "metadata": {
            "source": "archive-in-thueringen.de",
            "harvest_date": datetime.now(timezone.utc).isoformat(),
            "total_archives": len(archives),
            "region": "Thüringen",
            "country": "DE",
            "harvester_version": "1.0",
        },
        "archives": archives,
    }

    output_file.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )

    print(f"\n💾 Results saved to: {output_file}")
    print(f"   File size: {output_file.stat().st_size / 1024:.1f} KB")

    return output_file
def main():
    """Run the harvest end to end and report timing.

    Returns a process exit status: 0 on success, 1 when nothing was harvested.
    """
    started = time.time()

    archives = harvest_thueringen_archives()

    # Guard clause: nothing to save means a failed run.
    if not archives:
        print("\n❌ No archives harvested!")
        return 1

    output_file = save_results(archives)

    elapsed = time.time() - started
    print(f"\n✅ Harvest completed in {elapsed:.1f} seconds")
    print(f"📈 Speed: {len(archives)/elapsed:.1f} archives/second")

    print(f"\n🎯 Next Steps:")
    print(f"   1. Run geocoding: python scripts/enrich_geocoding.py {output_file}")
    print(f"   2. Merge with German dataset: python scripts/scrapers/merge_thueringen_to_german_dataset.py")
    print(f"   3. Expected new additions: ~120 archives (after deduplication)")

    return 0
if __name__ == "__main__":
    # Script entry point: propagate main()'s status code to the shell.
    exit(main())