glam/scripts/scrapers/harvest_sachsen_anhalt_archives.py
2025-11-21 22:12:33 +01:00

165 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Harvest Sachsen-Anhalt GLAM Institutions
Extracts heritage institutions from Sachsen-Anhalt (Saxony-Anhalt), Germany.
Sources:
1. Landesarchiv Sachsen-Anhalt (state archive branches)
2. Archivportal-D (regional archives)
3. DDB SPARQL (museums, libraries, archives)
4. ULB Sachsen-Anhalt (digital collections)
Output: LinkML-compliant JSON with comprehensive metadata
Author: OpenCode AI Agent
Date: 2025-11-20
"""
import json
import os
import re
from datetime import datetime, timezone
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup
def _extract_contact_info(soup) -> Dict[str, Optional[str]]:
    """Pull email, phone, and address details from one branch page.

    Scans every ``<p>`` element and keeps the *first* match for each
    field, so a later unrelated paragraph cannot overwrite an earlier
    hit. (The original inline code only guarded the email field this
    way; phone and address were silently last-match-wins.)

    Args:
        soup: Parsed BeautifulSoup document for a branch page.

    Returns:
        Dict with 'email', 'phone', and 'address' keys; each value is
        the extracted string or None when nothing matched.
    """
    contact_info: Dict[str, Optional[str]] = {
        'email': None,
        'phone': None,
        'address': None
    }
    for p in soup.find_all('p'):
        text = p.get_text()
        # Email: first address-shaped token wins.
        if '@' in text and not contact_info['email']:
            email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
            if email_match:
                contact_info['email'] = email_match.group()
        # Phone: German pages label numbers with "Telefon" / "Tel.".
        if ('Telefon' in text or 'Tel.' in text) and not contact_info['phone']:
            phone_match = re.search(r'\+?\d[\d\s\-\(\)]+', text)
            if phone_match:
                contact_info['phone'] = phone_match.group().strip()
        # Address: explicit label or a German street-name suffix.
        if (('Adresse' in text or 'straße' in text.lower() or 'str.' in text.lower())
                and not contact_info['address']):
            contact_info['address'] = p.get_text(strip=True)
    return contact_info


def extract_landesarchiv_branches() -> List[Dict]:
    """
    Extract the 4 branches of Landesarchiv Sachsen-Anhalt.

    Fetches each branch page (Magdeburg, Wernigerode, Merseburg,
    Dessau), scrapes the page title and contact details, and wraps them
    in a record carrying provenance metadata. A network or HTTP error
    on one branch is reported and skipped so the remaining branches are
    still harvested.

    Returns:
        List of archive record dicts, one per successfully fetched branch.
    """
    base_url = 'https://landesarchiv.sachsen-anhalt.de'
    locations = ['magdeburg', 'wernigerode', 'merseburg', 'dessau']
    archives = []
    print("📚 Extracting Landesarchiv Sachsen-Anhalt branches...")
    print("=" * 70)
    for location in locations:
        url = f'{base_url}/landesarchiv/standorte/{location}'
        print(f" Fetching: {location.upper()}")
        try:
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                # The page <h1> is the branch name; fall back to a
                # synthetic title if the page has none.
                title = soup.find('h1')
                title_text = title.get_text(strip=True) if title else f'Landesarchiv Sachsen-Anhalt - Abteilung {location.title()}'
                contact_info = _extract_contact_info(soup)
                archive_record = {
                    'id': f'sachsen-anhalt-la-{location}',
                    'name': title_text,
                    'institution_type': 'ARCHIVE',
                    'city': location.title(),
                    'region': 'Sachsen-Anhalt',
                    'country': 'DE',
                    'url': url,
                    'email': contact_info['email'],
                    'phone': contact_info['phone'],
                    'address_text': contact_info['address'],
                    'source_portal': 'landesarchiv.sachsen-anhalt.de',
                    'provenance': {
                        'data_source': 'WEB_SCRAPING',
                        'data_tier': 'TIER_2_VERIFIED',
                        'extraction_date': datetime.now(timezone.utc).isoformat(),
                        'extraction_method': 'Landesarchiv Sachsen-Anhalt portal harvest',
                        'source_url': url,
                        'confidence_score': 0.95
                    }
                }
                archives.append(archive_record)
                print(f" ✅ Extracted: {title_text}")
            else:
                print(f" ❌ HTTP {response.status_code}")
        except Exception as e:
            # Best-effort harvest: report and continue with the next branch.
            print(f" ❌ ERROR: {e}")
        print()
    print(f"✅ Extracted {len(archives)} Landesarchiv branches")
    print()
    return archives
def save_archives(archives: List[Dict], filename: str):
    """Save harvested archive records to a JSON file.

    Writes a wrapper object with a 'metadata' section (source, harvest
    timestamp, record count, region) plus the 'archives' list itself.
    Parent directories are created as needed, since main() passes a
    nested path under data/isil/germany/.

    Args:
        archives: List of archive record dicts to persist.
        filename: Output path; missing parent directories are created.
    """
    output_data = {
        'metadata': {
            'source': 'Sachsen-Anhalt GLAM institutions',
            'harvest_date': datetime.now(timezone.utc).isoformat(),
            'total_archives': len(archives),
            'region': 'Sachsen-Anhalt',
            'country': 'DE',
            'harvester_version': '1.0'
        },
        'archives': archives
    }
    # Robustness fix: open(..., 'w') cannot create intermediate
    # directories, so ensure the target directory exists first.
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    # Bug fix: the original printed the literal placeholder "(unknown)"
    # instead of interpolating the output path.
    print(f"💾 Saved to: {filename}")
    # NOTE: size is computed from a compact re-serialization, so it can
    # differ slightly from the indent=2 file actually written to disk.
    print(f" File size: {len(json.dumps(output_data)) / 1024:.1f} KB")
def main():
    """Run the Sachsen-Anhalt harvest end-to-end and save the results."""
    banner = "=" * 70
    print(banner)
    print("SACHSEN-ANHALT GLAM INSTITUTIONS HARVEST")
    print(banner)
    print()
    # Step 1: scrape the four Landesarchiv branch pages.
    harvested = extract_landesarchiv_branches()
    # Step 2: write a timestamped JSON snapshot next to earlier harvests.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    save_archives(harvested, f'data/isil/germany/sachsen_anhalt_archives_{stamp}.json')
    print()
    print(banner)
    print("HARVEST COMPLETE")
    print(banner)
    print(f"Total archives harvested: {len(harvested)}")
    print()
    print("Next steps:")
    print(" 1. Harvest Archivportal-D for Sachsen-Anhalt archives")
    print(" 2. Query DDB SPARQL for museums/libraries")
    print(" 3. Merge all sources into unified dataset")
# Allow the module to be imported (e.g. for reuse of the harvest
# functions) without triggering a live network harvest.
if __name__ == '__main__':
    main()