glam/scripts/scrapers/harvest_sachsen_anhalt_museums.py
2025-11-21 22:12:33 +01:00

214 lines
6.9 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Sachsen-Anhalt Museums - Museumsverband Website Harvest
Scrapes museum directory from Museumsverband Sachsen-Anhalt e.V.
Target: 100+ museums with comprehensive metadata
"""
import json
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def fetch_museum_directory() -> List[Dict[str, Any]]:
    """Fetch museum list from Museumsverband Sachsen-Anhalt.

    Tries three extraction patterns in order (article tags, divs with a
    'museum' class, list items inside a museum/directory <ul>) and returns
    the first non-empty result. Returns [] on any request failure.
    """
    base_url = "https://www.mv-sachsen-anhalt.de"
    directory_url = f"{base_url}/museen"
    print(f"Fetching museum directory from: {directory_url}")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    try:
        response = requests.get(directory_url, headers=headers, timeout=30)
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        collected: List[Dict[str, Any]] = []

        def _collect(elements) -> None:
            # Append every element that yields a plausible museum record.
            for el in elements:
                record = extract_museum_from_element(el, base_url)
                if record:
                    collected.append(record)

        # Pattern 1: article tags
        articles = soup.find_all('article')
        print(f"Found {len(articles)} article tags")
        _collect(articles)

        # Pattern 2: divs with a 'museum' class (fallback)
        if not collected:
            museum_divs = soup.find_all('div', class_=lambda c: c and 'museum' in c.lower())
            print(f"Found {len(museum_divs)} museum divs")
            _collect(museum_divs)

        # Pattern 3: list items inside a museum/directory list (last resort)
        if not collected:
            museum_list = soup.find('ul', class_=lambda c: c and ('museum' in c.lower() or 'directory' in c.lower()))
            if museum_list:
                list_items = museum_list.find_all('li')
                print(f"Found {len(list_items)} list items")
                _collect(list_items)

        return collected
    except requests.exceptions.RequestException as e:
        print(f"❌ Failed to fetch museum directory: {e}")
        return []
def extract_museum_from_element(element, base_url: str) -> Optional[Dict[str, Any]]:
    """Extract museum data from an HTML element.

    Args:
        element: A BeautifulSoup tag (article/div/li) that may describe a museum.
        base_url: Site root used to resolve relative detail links.

    Returns:
        Dict with keys name, city, description, detail_url, source_url —
        or None when no plausible museum name is found.
        (Fixed: original annotation claimed Dict but the function returns
        None on two paths.)
    """
    # Name: first heading or anchor; reject empty/very short strings
    # (short strings are likely navigation chrome, not museum names).
    name_tag = element.find(['h2', 'h3', 'h4', 'a'])
    if not name_tag:
        return None
    name = name_tag.get_text(strip=True)
    if not name or len(name) < 3:
        return None

    # Detail link, resolved against the site root (may be None).
    link_tag = element.find('a', href=True)
    detail_url = urljoin(base_url, link_tag['href']) if link_tag else None

    # City/location: first child element whose class matches a known pattern.
    # The pattern is bound as a lambda default to avoid late-binding surprises.
    city = ""
    for pattern in ('address', 'location', 'city', 'ort'):
        loc_tag = element.find(class_=lambda c, p=pattern: c and p in c.lower())
        if loc_tag:
            city = loc_tag.get_text(strip=True)
            break

    # Description: first paragraph text, if any.
    description = ""
    desc_tag = element.find('p')
    if desc_tag:
        description = desc_tag.get_text(strip=True)

    return {
        'name': name,
        'city': city,
        'description': description,
        'detail_url': detail_url,
        'source_url': base_url,
    }
def convert_to_linkml_format(raw_museums: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert raw museum data to LinkML heritage custodian format.

    Each raw record becomes one institution dict with a single
    Sachsen-Anhalt location, an optional Website identifier, and a
    scraping-provenance stamp (timestamped per record, UTC).
    """
    converted: List[Dict[str, Any]] = []
    for entry in raw_museums:
        # Website identifier only when a detail page URL was scraped.
        website_ids = []
        detail_url = entry.get('detail_url')
        if detail_url:
            website_ids.append({
                'identifier_scheme': 'Website',
                'identifier_value': detail_url,
                'identifier_url': detail_url,
            })
        converted.append({
            'name': entry['name'],
            'institution_type': 'MUSEUM',
            'description': entry.get('description', ''),
            'locations': [{
                'city': entry.get('city', ''),
                'country': 'DE',
                'region': 'Sachsen-Anhalt',
            }],
            'identifiers': website_ids,
            'provenance': {
                'data_source': 'WEBSITE_SCRAPING',
                'data_tier': 'TIER_2_VERIFIED',
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'extraction_method': 'Web scraping from Museumsverband Sachsen-Anhalt directory',
                'confidence_score': 0.90,
                'source_url': entry.get('source_url', ''),
            },
        })
    return converted
def main():
    """Main execution: fetch, convert, report statistics, save JSON."""
    banner = "=" * 80
    print(banner)
    print("Sachsen-Anhalt Museums - Museumsverband Website Harvest")
    print(banner)
    print()

    # Fetch museum directory; bail out with guidance if scraping found nothing.
    raw_museums = fetch_museum_directory()
    if not raw_museums:
        print("❌ No museums found. Website structure may have changed.")
        print()
        print("Manual inspection required:")
        print(" 1. Visit https://www.mv-sachsen-anhalt.de/museen")
        print(" 2. Identify museum list structure (div class, ul/li, etc.)")
        print(" 3. Update extraction patterns in script")
        return

    print()
    print(f"Extracted {len(raw_museums)} museums from directory")
    print()

    # Convert to LinkML format
    institutions = convert_to_linkml_format(raw_museums)

    # Per-city statistics (top 10, descending count).
    tally: Dict[str, int] = {}
    for record in institutions:
        town = record['locations'][0]['city']
        if town:
            tally[town] = tally.get(town, 0) + 1
    print("Museums by City:")
    for town, count in sorted(tally.items(), key=lambda item: item[1], reverse=True)[:10]:
        print(f" {town}: {count}")
    print()

    # Save to a timestamped JSON file under data/isil/germany.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = Path('data/isil/germany') / f'sachsen_anhalt_museums_{stamp}.json'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(
        json.dumps(institutions, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )
    file_size_kb = output_path.stat().st_size / 1024
    print(f"✅ Saved to: {output_path}")
    print(f" File size: {file_size_kb:.1f} KB")
    print(f" Total museums: {len(institutions)}")
    print()
    print(banner)
    print("Museum harvest complete!")
    print(banner)
# Script entry point.
if __name__ == '__main__':
    main()