glam/scripts/scrapers/harvest_isil_museum_bayern.py
2025-11-21 22:12:33 +01:00

324 lines
10 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Saxony Museum Harvester from isil.museum Registry
Extracts Saxony museums from the official German museum ISIL registry at:
http://www.museen-in-deutschland.de/?t=liste&mode=land&suchbegriff=Bayern
This is the Institut für Museumsforschung (Institute for Museum Research)
official database containing ~6,304 German museums with ISIL codes.
Author: OpenCode AI Agent
Date: 2025-11-20
Status: PRODUCTION - Extracting from authoritative German museum registry
"""
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional
import requests
from bs4 import BeautifulSoup
# Configuration
BASE_URL = "http://www.museen-in-deutschland.de"
BAYERN_URL = f"{BASE_URL}/?t=liste&mode=land&suchbegriff=Bayern"
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
}
RATE_LIMIT_DELAY = 1.0 # Seconds between requests
def fetch_museum_list(url: str) -> str:
"""Fetch the Saxony museum list HTML."""
print(f"Fetching museum list from: {url}")
print()
try:
response = requests.get(url, headers=HEADERS, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'
print(f"✓ HTTP {response.status_code} - {len(response.text):,} bytes")
print()
return response.text
except requests.RequestException as e:
print(f"✗ Error fetching URL: {e}")
raise
def parse_museum_table(html: str) -> List[Dict[str, str]]:
"""
Parse the HTML table to extract museum records.
Expected structure:
<tr>
<td><a href="...">DE-MUS-907015</a></td> <!-- ISIL code -->
<td>Adorf/Vogtl.</td> <!-- City -->
<td><a href="...">Museum Name</a></td> <!-- Name + detail link -->
</tr>
"""
soup = BeautifulSoup(html, 'html.parser')
museums = []
# Find the main table (assuming it's the largest <table> element)
tables = soup.find_all('table')
if not tables:
print("✗ No tables found in HTML")
return []
# Try to find table with museum data (contains ISIL codes like DE-MUS-*)
target_table = None
for table in tables:
# Check if this table contains ISIL codes
sample_text = table.get_text()
if 'DE-MUS-' in sample_text:
target_table = table
break
if not target_table:
print("✗ Could not find museum data table")
return []
# Parse table rows
rows = target_table.find_all('tr')
print(f"Found {len(rows)} table rows")
print()
for row in rows:
cells = row.find_all('td')
# Expected: 3 cells (ISIL, City, Name)
if len(cells) < 3:
continue
# Extract ISIL code from first cell
isil_cell = cells[0]
isil_link = isil_cell.find('a')
if not isil_link:
continue
isil_code = isil_link.get_text(strip=True)
if not isil_code.startswith('DE-MUS-'):
continue
# Extract city from second cell
city = cells[1].get_text(strip=True)
# Extract museum name and detail link from third cell
name_cell = cells[2]
name_link = name_cell.find('a')
if name_link:
museum_name = name_link.get_text(strip=True)
detail_path = name_link.get('href', '')
detail_url = f"{BASE_URL}/{detail_path}" if detail_path else None
else:
museum_name = name_cell.get_text(strip=True)
detail_url = None
# Skip if essential fields are missing
if not museum_name or not city:
continue
museums.append({
'isil_code': isil_code,
'city': city,
'name': museum_name,
'detail_url': detail_url
})
return museums
def convert_to_linkml(museum_data: Dict[str, str]) -> Dict:
"""Convert raw museum data to LinkML-compliant HeritageCustodian format."""
# Generate clean ID from city and museum name
city_slug = re.sub(r'[^a-z0-9]+', '-', museum_data['city'].lower())
name_slug = re.sub(r'[^a-z0-9]+', '-', museum_data['name'][:50].lower())
custodian = {
"id": f"https://w3id.org/heritage/custodian/de/{city_slug}-{name_slug}",
"name": museum_data["name"],
"institution_type": "MUSEUM",
"alternative_names": [],
"description": f"Museum in {museum_data['city']}, Bayern. Part of the official German museum registry (Institut für Museumsforschung).",
"locations": [
{
"city": museum_data["city"],
"region": "Bayern",
"country": "DE"
}
],
"identifiers": [
{
"identifier_scheme": "ISIL",
"identifier_value": museum_data["isil_code"],
"identifier_url": f"https://sigel.staatsbibliothek-berlin.de/suche/?isil={museum_data['isil_code']}"
}
],
"provenance": {
"data_source": "WEB_SCRAPING",
"data_tier": "TIER_2_VERIFIED",
"extraction_date": datetime.now(timezone.utc).isoformat(),
"extraction_method": "Automated extraction from isil.museum registry (Institut für Museumsforschung)",
"confidence_score": 0.90,
"source_url": museum_data.get("detail_url") or BAYERN_URL,
"notes": "Extracted from official German museum ISIL registry. Detail page URL available for future enrichment."
}
}
# Add detail URL as Website identifier if available
if museum_data.get("detail_url"):
custodian["identifiers"].append({
"identifier_scheme": "Registry",
"identifier_value": museum_data["detail_url"],
"identifier_url": museum_data["detail_url"]
})
return custodian
def enrich_museum_from_detail_page(museum: Dict, detail_html: str) -> Dict:
"""
Optional: Enrich museum record from detail page.
This function is NOT called by default (to save time), but can be enabled
for deep enrichment in future iterations.
"""
soup = BeautifulSoup(detail_html, 'html.parser')
# TODO: Parse detail page structure to extract:
# - Full address (street, postal code)
# - Phone number
# - Email
# - Website URL
# - Opening hours
# - Description
# For now, return museum unchanged
return museum
def main():
"""Extract Saxony museums from isil.museum registry."""
print("=" * 80)
print("Saxony Museum Harvester (isil.museum Registry)")
print("=" * 80)
print()
# Fetch museum list
html = fetch_museum_list(BAYERN_URL)
# Parse table
print("Parsing museum table...")
museums = parse_museum_table(html)
if not museums:
print("✗ No museums found in HTML")
return None
print(f"✓ Extracted {len(museums)} museums from table")
print()
# Sample output
print("Sample museums:")
for museum in museums[:5]:
print(f"{museum['name']} ({museum['city']}) - {museum['isil_code']}")
print(f" ... and {len(museums) - 5} more")
print()
# Convert to LinkML format
print("Converting to LinkML format...")
custodians = []
for museum in museums:
custodian = convert_to_linkml(museum)
custodians.append(custodian)
print(f"✓ Converted {len(custodians)} museums to LinkML format")
print()
# Generate output filename
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
output_dir = Path("data/isil/germany")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / f"bayern_museums_{timestamp}.json"
# Export to JSON
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(custodians, f, ensure_ascii=False, indent=2)
print(f"✓ Exported to: {output_file}")
print(f" File size: {output_file.stat().st_size:,} bytes")
print()
# Geographic distribution report
print("=" * 80)
print("Geographic Distribution")
print("=" * 80)
print()
cities = {}
for museum in museums:
city = museum['city']
cities[city] = cities.get(city, 0) + 1
# Sort by count descending
sorted_cities = sorted(cities.items(), key=lambda x: x[1], reverse=True)
print(f"Total cities: {len(sorted_cities)}")
print()
print("Top 10 cities by museum count:")
for city, count in sorted_cities[:10]:
print(f" {count:3d} {city}")
print()
# Metadata completeness report
print("=" * 80)
print("Metadata Completeness Report")
print("=" * 80)
print()
fields = {
"name": len([m for m in museums if m.get('name')]),
"city": len([m for m in museums if m.get('city')]),
"ISIL code": len([m for m in museums if m.get('isil_code')]),
"detail_url": len([m for m in museums if m.get('detail_url')]),
}
total = len(museums)
for field, count in fields.items():
percentage = (count / total) * 100
status = "" if percentage == 100 else ""
print(f"{status} {field:20s}: {count}/{total} ({percentage:5.1f}%)")
print()
print("Note: Address, phone, email, and website data available via detail pages")
print(" Run with --enrich flag to scrape individual museum pages (slower)")
print()
print("=" * 80)
print(f"Harvest complete! {len(custodians)} Saxony museums extracted.")
print("=" * 80)
print()
# Next steps suggestion
print("Next steps:")
print(" 1. Merge with Saxony foundation dataset (12 archives/libraries)")
print(" 2. Total Saxony dataset: ~187 institutions")
print(" 3. Optional: Enrich with detail page scraping for full addresses")
print()
return output_file
if __name__ == "__main__":
main()