165 lines
5.8 KiB
Python
165 lines
5.8 KiB
Python
#!/usr/bin/env python3
"""
Harvest Sachsen-Anhalt GLAM Institutions

Extracts heritage institutions from Sachsen-Anhalt (Saxony-Anhalt), Germany.

Sources:
1. Landesarchiv Sachsen-Anhalt (state archive branches)
2. Archivportal-D (regional archives)
3. DDB SPARQL (museums, libraries, archives)
4. ULB Sachsen-Anhalt (digital collections)

Output: LinkML-compliant JSON with comprehensive metadata

Author: OpenCode AI Agent
Date: 2025-11-20
"""
|
|
|
|
import json
import os
import re
from datetime import datetime, timezone
from typing import List, Dict, Optional

import requests
from bs4 import BeautifulSoup
|
|
|
|
def extract_landesarchiv_branches() -> List[Dict]:
    """
    Extract the 4 branches of Landesarchiv Sachsen-Anhalt.

    Locations: Magdeburg, Wernigerode, Merseburg, Dessau

    Scrapes each branch page on the state archive portal, taking the
    page <h1> as the institution name and harvesting best-effort
    contact details (email, phone, postal address) from paragraphs.

    Returns:
        One LinkML-style record dict per successfully fetched branch.
        Fetch or parse failures are logged and the branch is skipped;
        this function never raises.
    """
    base_url = 'https://landesarchiv.sachsen-anhalt.de'
    locations = ['magdeburg', 'wernigerode', 'merseburg', 'dessau']

    archives = []

    print("📚 Extracting Landesarchiv Sachsen-Anhalt branches...")
    print("=" * 70)

    for location in locations:
        url = f'{base_url}/landesarchiv/standorte/{location}'
        print(f" Fetching: {location.upper()}")

        try:
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')

                # Page <h1> is the branch name; fall back to a constructed one.
                title = soup.find('h1')
                title_text = title.get_text(strip=True) if title else f'Landesarchiv Sachsen-Anhalt - Abteilung {location.title()}'

                # Best-effort contact details scraped from paragraphs below.
                contact_info: Dict[str, Optional[str]] = {
                    'email': None,
                    'phone': None,
                    'address': None
                }

                # First match wins for EVERY field. The original only
                # guarded email, so later boilerplate paragraphs (footers,
                # imprint) silently overwrote phone and address.
                for p in soup.find_all('p'):
                    text = p.get_text()

                    # Email: first address-looking token in any paragraph.
                    if '@' in text and not contact_info['email']:
                        email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
                        if email_match:
                            contact_info['email'] = email_match.group()

                    # Phone: only paragraphs that label a number in German.
                    if ('Telefon' in text or 'Tel.' in text) and not contact_info['phone']:
                        phone_match = re.search(r'\+?\d[\d\s\-\(\)]+', text)
                        if phone_match:
                            contact_info['phone'] = phone_match.group().strip()

                    # Address: German street markers ("straße", "str.").
                    if (('Adresse' in text or 'straße' in text.lower() or 'str.' in text.lower())
                            and not contact_info['address']):
                        contact_info['address'] = p.get_text(strip=True)

                archive_record = {
                    'id': f'sachsen-anhalt-la-{location}',
                    'name': title_text,
                    'institution_type': 'ARCHIVE',
                    'city': location.title(),
                    'region': 'Sachsen-Anhalt',
                    'country': 'DE',
                    'url': url,
                    'email': contact_info['email'],
                    'phone': contact_info['phone'],
                    'address_text': contact_info['address'],
                    'source_portal': 'landesarchiv.sachsen-anhalt.de',
                    'provenance': {
                        'data_source': 'WEB_SCRAPING',
                        'data_tier': 'TIER_2_VERIFIED',
                        'extraction_date': datetime.now(timezone.utc).isoformat(),
                        'extraction_method': 'Landesarchiv Sachsen-Anhalt portal harvest',
                        'source_url': url,
                        'confidence_score': 0.95
                    }
                }

                archives.append(archive_record)
                print(f" ✅ Extracted: {title_text}")

            else:
                print(f" ❌ HTTP {response.status_code}")

        except Exception as e:
            # Best-effort harvest: log and continue with remaining branches.
            print(f" ❌ ERROR: {e}")

        print()

    print(f"✅ Extracted {len(archives)} Landesarchiv branches")
    print()

    return archives
|
|
|
|
def save_archives(archives: List[Dict], filename: str):
    """
    Save archive records to a JSON file.

    Wraps the records in a metadata envelope (harvest date, counts,
    region) and writes UTF-8 JSON with non-ASCII characters preserved.
    Creates the parent directory if it does not exist.

    Args:
        archives: Archive record dicts to persist.
        filename: Destination path for the JSON output.
    """
    output_data = {
        'metadata': {
            'source': 'Sachsen-Anhalt GLAM institutions',
            'harvest_date': datetime.now(timezone.utc).isoformat(),
            'total_archives': len(archives),
            'region': 'Sachsen-Anhalt',
            'country': 'DE',
            'harvester_version': '1.0'
        },
        'archives': archives
    }

    # Robustness fix: first run fails with FileNotFoundError if the
    # output directory (e.g. data/isil/germany/) does not exist yet.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    # Bug fix: original printed the literal placeholder "(unknown)"
    # instead of the destination path.
    print(f"💾 Saved to: {filename}")
    print(f" File size: {len(json.dumps(output_data)) / 1024:.1f} KB")
|
|
|
|
def main():
    """Run the Sachsen-Anhalt GLAM harvest and write results to disk."""
    banner = "=" * 70
    print(banner)
    print("SACHSEN-ANHALT GLAM INSTITUTIONS HARVEST")
    print(banner)
    print()

    # Harvest the four Landesarchiv branch pages.
    archives = extract_landesarchiv_branches()

    # Persist under a timestamped filename so repeated runs never clobber
    # each other's output.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    destination = f'data/isil/germany/sachsen_anhalt_archives_{stamp}.json'
    save_archives(archives, destination)

    print()
    print(banner)
    print("HARVEST COMPLETE")
    print(banner)
    print(f"Total archives harvested: {len(archives)}")
    print()
    print("Next steps:")
    for step in (
        " 1. Harvest Archivportal-D for Sachsen-Anhalt archives",
        " 2. Query DDB SPARQL for museums/libraries",
        " 3. Merge all sources into unified dataset",
    ):
        print(step)
|
|
|
# Script entry point: run the harvest only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|