214 lines
6.9 KiB
Python
Executable file
214 lines
6.9 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
Sachsen-Anhalt Museums - Museumsverband Website Harvest

Scrapes the museum directory of the Museumsverband Sachsen-Anhalt e.V.
Target: 100+ museums with comprehensive metadata
"""
|
|
|
|
import json
import time
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
|
|
|
|
def _collect_museums(elements, base_url: str) -> List[Dict[str, Any]]:
    """Run the element extractor over ``elements`` and keep the non-empty results."""
    extracted = (extract_museum_from_element(el, base_url) for el in elements)
    return [museum for museum in extracted if museum]


def fetch_museum_directory() -> List[Dict[str, Any]]:
    """Fetch the museum list from the Museumsverband Sachsen-Anhalt website.

    Downloads the ``/museen`` directory page and tries three markup patterns
    in order, moving on to the next one only while no museum has been
    extracted yet:

    1. every ``<article>`` tag,
    2. every ``<div>`` whose class contains ``museum``,
    3. the ``<li>`` items of the first ``<ul>`` whose class contains
       ``museum`` or ``directory``.

    Returns:
        The raw museum dicts produced by ``extract_museum_from_element``.
        The list is empty when the request fails or no pattern matches.
    """
    base_url = "https://www.mv-sachsen-anhalt.de"
    directory_url = f"{base_url}/museen"

    print(f"Fetching museum directory from: {directory_url}")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    # Only the network calls can raise RequestException, so keep the try
    # block limited to them.
    try:
        response = requests.get(directory_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"❌ Failed to fetch museum directory: {e}")
        return []

    # Decode the body as UTF-8 regardless of what the server declares.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Pattern 1: article tags
    articles = soup.find_all('article')
    print(f"Found {len(articles)} article tags")
    museums = _collect_museums(articles, base_url)

    # Pattern 2: divs with a "museum" class
    if not museums:
        museum_divs = soup.find_all('div', class_=lambda c: c and 'museum' in c.lower())
        print(f"Found {len(museum_divs)} museum divs")
        museums = _collect_museums(museum_divs, base_url)

    # Pattern 3: list items of a museum/directory list
    if not museums:
        museum_list = soup.find(
            'ul',
            class_=lambda c: c and ('museum' in c.lower() or 'directory' in c.lower()),
        )
        if museum_list is not None:
            list_items = museum_list.find_all('li')
            print(f"Found {len(list_items)} list items")
            museums = _collect_museums(list_items, base_url)

    return museums
|
|
|
|
def extract_museum_from_element(element, base_url: str) -> Optional[Dict[str, Any]]:
    """Extract museum data from one HTML element of the directory page.

    Args:
        element: A parsed HTML element exposing ``find`` (BeautifulSoup-style);
            the tags it returns must support ``get_text`` and item access.
        base_url: Base URL used to resolve relative links; also stored
            unchanged as ``source_url`` in the result.

    Returns:
        A dict with the keys ``name``, ``city``, ``description``,
        ``detail_url`` and ``source_url``, or ``None`` when the element has
        no heading/link tag or the name is shorter than three characters.
        ``city`` and ``description`` default to ``""`` and ``detail_url`` to
        ``None`` when the corresponding markup is missing.
    """
    # The first heading or link inside the element is taken as the name.
    name_tag = element.find(['h2', 'h3', 'h4', 'a'])
    if name_tag is None:
        return None

    # Join nested text pieces with a space: with the default empty separator
    # "Museum<br>Halle" would collapse into "MuseumHalle".
    name = name_tag.get_text(' ', strip=True)
    if len(name) < 3:
        return None

    # Detail link, resolved against the site base URL.
    link_tag = element.find('a', href=True)
    detail_url = urljoin(base_url, link_tag['href']) if link_tag is not None else None

    # City/location: the first class-name pattern that matches wins.
    # `c and ...` guards against a missing class value.
    city = ""
    for pattern in ('address', 'location', 'city', 'ort'):
        loc_tag = element.find(class_=lambda c: c and pattern in c.lower())
        if loc_tag is not None:
            city = loc_tag.get_text(' ', strip=True)
            break

    # Description: the first paragraph, if there is one.
    desc_tag = element.find('p')
    description = desc_tag.get_text(' ', strip=True) if desc_tag is not None else ""

    return {
        'name': name,
        'city': city,
        'description': description,
        'detail_url': detail_url,
        'source_url': base_url,
    }
|
|
|
|
def convert_to_linkml_format(raw_museums: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert raw museum data to LinkML heritage custodian format.

    Args:
        raw_museums: Dicts with a required ``name`` key and optional
            ``city``, ``description``, ``detail_url`` and ``source_url`` keys.

    Returns:
        One institution record per input museum, in input order. Each record
        carries a single location (country ``DE``, region ``Sachsen-Anhalt``),
        a ``Website`` identifier when ``detail_url`` is set, and a provenance
        block.

    Raises:
        KeyError: If a raw museum dict has no ``name`` key.
    """
    # One timestamp for the whole batch, so every record of a harvest shares
    # the same extraction_date.
    extraction_date = datetime.now(timezone.utc).isoformat()

    institutions = []
    for museum in raw_museums:
        location = {
            'city': museum.get('city', ''),
            'country': 'DE',
            'region': 'Sachsen-Anhalt',
        }

        identifiers = []
        detail_url = museum.get('detail_url')
        if detail_url:
            identifiers.append({
                'identifier_scheme': 'Website',
                'identifier_value': detail_url,
                'identifier_url': detail_url,
            })

        institutions.append({
            'name': museum['name'],
            'institution_type': 'MUSEUM',
            'description': museum.get('description', ''),
            'locations': [location],
            'identifiers': identifiers,
            'provenance': {
                'data_source': 'WEBSITE_SCRAPING',
                'data_tier': 'TIER_2_VERIFIED',
                'extraction_date': extraction_date,
                'extraction_method': 'Web scraping from Museumsverband Sachsen-Anhalt directory',
                'confidence_score': 0.90,
                'source_url': museum.get('source_url', ''),
            },
        })

    return institutions
|
|
|
|
def main():
    """Run the harvest: fetch, convert, print statistics, write JSON.

    Prints manual-inspection instructions and returns early when no museum
    could be extracted. Otherwise writes the converted records to a
    timestamped JSON file under ``data/isil/germany`` (relative to the
    current working directory) and prints a short summary.
    """
    print("=" * 80)
    print("Sachsen-Anhalt Museums - Museumsverband Website Harvest")
    print("=" * 80)
    print()

    # Fetch museum directory
    raw_museums = fetch_museum_directory()

    if not raw_museums:
        print("❌ No museums found. Website structure may have changed.")
        print()
        print("Manual inspection required:")
        print(" 1. Visit https://www.mv-sachsen-anhalt.de/museen")
        print(" 2. Identify museum list structure (div class, ul/li, etc.)")
        print(" 3. Update extraction patterns in script")
        return

    print()
    print(f"Extracted {len(raw_museums)} museums from directory")
    print()

    # Convert to LinkML format
    institutions = convert_to_linkml_format(raw_museums)

    # Statistics: museums per city, ignoring records without a city.
    cities = (inst['locations'][0]['city'] for inst in institutions)
    city_counts = Counter(city for city in cities if city)

    print("Museums by City:")
    # Top ten cities, most frequent first.
    for city, count in city_counts.most_common(10):
        print(f" {city}: {count}")
    print()

    # Save to JSON
    # NOTE: the file name uses naive local time, unlike the UTC
    # extraction_date stored inside the records.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = Path('data/isil/germany') / f'sachsen_anhalt_museums_{timestamp}.json'
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, ensure_ascii=False, indent=2)

    file_size_kb = output_path.stat().st_size / 1024

    print(f"✅ Saved to: {output_path}")
    print(f" File size: {file_size_kb:.1f} KB")
    print(f" Total museums: {len(institutions)}")
    print()
    print("=" * 80)
    print("Museum harvest complete!")
    print("=" * 80)
|
|
|
|
# Run the harvest only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|