glam/scripts/scrapers/enrich_sachsen_anhalt_museums_v2.py
2025-11-21 22:12:33 +01:00

322 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Sachsen-Anhalt Museums with Detail Page Data - v2.0
Scrapes individual museum pages for complete metadata:
- Physical addresses (street, postal code, city)
- Contact information (phone, email, website)
- Opening hours
- Full descriptions
Improvements over v1.0:
- Properly parses address block structure
- Better error handling
- Progress tracking
- Rate limiting with delays
"""
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional
import time
import re
def scrape_museum_detail_page(url: str) -> Dict[str, Any]:
    """Fetch one museum detail page and parse its metadata.

    Returns a dict with address, contact, opening-hours and description
    fields (empty strings where nothing was found), or an empty dict when
    the request or the parsing fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        details = {key: '' for key in (
            'city', 'street_address', 'postal_code', 'phone',
            'email', 'website', 'opening_hours', 'full_description',
        )}

        # Address block ("Postanschrift"): successive lines hold the museum
        # name, the street, and a "PLZ city" line.
        address_div = soup.find('div', class_=lambda c: c and 'address' in c.lower())
        if address_div:
            raw = address_div.get_text(separator='\n', strip=True)
            for entry in filter(None, (part.strip() for part in raw.split('\n'))):
                # "06385 Aken" style: five-digit postal code, then city name.
                plz_match = re.match(r'(\d{5})\s+(.+)', entry)
                if plz_match:
                    details['postal_code'] = plz_match.group(1)
                    details['city'] = plz_match.group(2).strip()
                # Street lines look like "<Name>straße 12" (also str./weg/platz/...).
                street_match = re.search(r'[A-ZÄÖÜ][a-zäöüß]+(?:straße|str\.|weg|platz|gasse|allee)\s+\d+', entry, re.IGNORECASE)
                if street_match:
                    details['street_address'] = street_match.group(0)

        # Contact details live in <dt>label</dt><dd>value</dd> pairs.
        for dt in soup.find_all('dt'):
            dd = dt.find_next_sibling('dd')
            if dd is None:
                continue
            label = dt.get_text(strip=True).lower()
            value = dd.get_text(strip=True)
            if 'telefon' in label or 'phone' in label:
                details['phone'] = value
            elif 'e-mail' in label or 'mail' in label:
                # Skip generic museum association email
                if 'mv-sachsen-anhalt' not in value:
                    details['email'] = value
            elif 'internet' in label or 'website' in label:
                details['website'] = value
            elif 'öffnungszeiten' in label or 'opening' in label:
                details['opening_hours'] = value

        # Long description: all non-empty <p> texts in the main content div,
        # kept only when substantial (> 50 chars) to filter out boilerplate.
        content_div = soup.find('div', class_=lambda c: c and ('content' in c.lower() or 'description' in c.lower() or 'text' in c.lower()))
        if content_div:
            texts = [p.get_text(strip=True) for p in content_div.find_all('p')]
            body = '\n\n'.join(t for t in texts if t)
            if body and len(body) > 50:
                details['full_description'] = body

        return details
    except requests.exceptions.RequestException as e:
        print(f" ⚠️ Request failed: {e}")
        return {}
    except Exception as e:
        print(f" ⚠️ Parse error: {e}")
        return {}
def _apply_details(museum: Dict[str, Any], details: Dict[str, Any]) -> List[str]:
    """Merge scraped *details* into *museum* in place.

    Returns a list of human-readable notes describing which fields changed
    (used for progress output); an empty list means nothing new was found.
    """
    fields_updated: List[str] = []

    # Location fields — guard against records whose locations list is
    # missing or empty (the original code would raise KeyError/IndexError).
    location = museum['locations'][0] if museum.get('locations') else None
    if location is not None:
        if details.get('city'):
            location['city'] = details['city']
            fields_updated.append(f"City: {details['city']}")
        if details.get('street_address'):
            location['street_address'] = details['street_address']
            fields_updated.append(f"Address: {details['street_address']}")
        if details.get('postal_code'):
            location['postal_code'] = details['postal_code']

    # Replace the description only when the scraped one is strictly longer.
    if details.get('full_description') and len(details['full_description']) > len(museum.get('description', '')):
        old_len = len(museum.get('description', ''))
        museum['description'] = details['full_description']
        # BUGFIX: the original message ran both lengths together ("120450 chars").
        fields_updated.append(f"Description: {old_len} -> {len(details['full_description'])} chars")

    # Add contact identifiers (avoid duplicates); setdefault guards records
    # that reached this point without an identifiers list.
    existing_schemes = {ident['identifier_scheme'] for ident in museum.get('identifiers', [])}
    if details.get('phone') and 'Phone' not in existing_schemes:
        museum.setdefault('identifiers', []).append({
            'identifier_scheme': 'Phone',
            'identifier_value': details['phone'],
            'identifier_url': f"tel:{details['phone']}"
        })
        fields_updated.append(f"Phone: {details['phone']}")
    if details.get('email') and 'Email' not in existing_schemes:
        museum.setdefault('identifiers', []).append({
            'identifier_scheme': 'Email',
            'identifier_value': details['email'],
            'identifier_url': f"mailto:{details['email']}"
        })
        fields_updated.append(f"Email: {details['email']}")

    # Append opening hours to the description once (marker prevents duplicates).
    if details.get('opening_hours'):
        if not museum.get('description'):
            museum['description'] = ''
        if 'Öffnungszeiten:' not in museum['description']:
            museum['description'] += f"\n\nÖffnungszeiten: {details['opening_hours']}"

    # Stamp provenance only if the record has none yet.
    if not museum.get('provenance'):
        museum['provenance'] = {
            'data_source': 'WEBSITE_SCRAPING',
            'data_tier': 'TIER_2_VERIFIED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Museumsverband Sachsen-Anhalt detail pages v2.0',
            'confidence_score': 0.95
        }

    return fields_updated


def enrich_museums(input_file: Path) -> List[Dict[str, Any]]:
    """Enrich museum records with detail page data.

    Loads the JSON list of museum records from *input_file*, fetches each
    record's 'Website' identifier URL via scrape_museum_detail_page(), and
    merges address, contact, description and opening-hours data into the
    record. Records without a detail URL, or whose scrape fails, are passed
    through unchanged.

    Returns the (same-length) list of museum dicts.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        museums = json.load(f)
    print(f"Loaded {len(museums)} museums from {input_file.name}")
    print()

    enriched = []
    success_count = 0
    error_count = 0

    for i, museum in enumerate(museums, 1):
        name_display = museum['name'][:60] + '...' if len(museum['name']) > 60 else museum['name']
        print(f"[{i}/{len(museums)}] {name_display}")

        # The detail-page URL was stored by the harvester as a 'Website' identifier.
        detail_url = None
        for ident in museum.get('identifiers') or []:
            if ident['identifier_scheme'] == 'Website':
                detail_url = ident['identifier_value']
                break

        if not detail_url:
            print(f" ⚠️ No detail URL, skipping")
            enriched.append(museum)
            error_count += 1
            continue

        details = scrape_museum_detail_page(detail_url)
        if not details:
            print(f" ⚠️ Failed to scrape details")
            enriched.append(museum)
            error_count += 1
            continue

        fields_updated = _apply_details(museum, details)

        enriched.append(museum)
        if fields_updated:
            print(f"{', '.join(fields_updated)}")
            success_count += 1
        else:
            print(f" ⚠️ No new data extracted")
            error_count += 1

        # Rate limiting - 1 second delay between requests (skipped after the last).
        if i < len(museums):
            time.sleep(1.0)

    print()
    print(f"Enrichment complete: {success_count} successful, {error_count} failed/skipped")
    return enriched
def main():
    """Main execution: locate the newest non-enriched museum file, enrich it,
    report completeness statistics, and save a timestamped enriched JSON file."""
    print("=" * 80)
    print("Enrich Sachsen-Anhalt Museums with Detail Page Data v2.0")
    print("=" * 80)
    print()

    # Find most recent museum file (reverse name sort = newest timestamp first).
    data_dir = Path('data/isil/germany')
    museum_files = sorted(data_dir.glob('sachsen_anhalt_museums_*.json'), reverse=True)
    if not museum_files:
        print("❌ No museum files found. Run harvest_sachsen_anhalt_museums.py first.")
        return

    # Skip already enriched files
    input_file = next((f for f in museum_files if 'enriched' not in f.name), None)
    if not input_file:
        print("❌ No non-enriched museum files found.")
        return

    print(f"Input: {input_file.name}")
    print()

    # Enrich museums
    start_time = time.time()
    enriched_museums = enrich_museums(input_file)
    elapsed = time.time() - start_time

    total = len(enriched_museums)
    print()
    print("=" * 80)
    print("Data Completeness:")
    print("=" * 80)
    # BUGFIX: guard against an empty result — the original divided by
    # len(enriched_museums) and crashed with ZeroDivisionError on empty input.
    if total:
        has_city = sum(1 for m in enriched_museums if m.get('locations') and m['locations'][0].get('city'))
        has_address = sum(1 for m in enriched_museums if m.get('locations') and m['locations'][0].get('street_address'))
        has_postal = sum(1 for m in enriched_museums if m.get('locations') and m['locations'][0].get('postal_code'))
        has_phone = sum(1 for m in enriched_museums if m.get('identifiers') and any(i['identifier_scheme'] == 'Phone' for i in m['identifiers']))
        has_email = sum(1 for m in enriched_museums if m.get('identifiers') and any(i['identifier_scheme'] == 'Email' for i in m['identifiers']))
        print(f" City: {has_city:3d}/{total} ({has_city/total*100:5.1f}%)")
        print(f" Street Address: {has_address:3d}/{total} ({has_address/total*100:5.1f}%)")
        print(f" Postal Code: {has_postal:3d}/{total} ({has_postal/total*100:5.1f}%)")
        print(f" Phone: {has_phone:3d}/{total} ({has_phone/total*100:5.1f}%)")
        print(f" Email: {has_email:3d}/{total} ({has_email/total*100:5.1f}%)")
    else:
        print(" (no museums in input file)")
    print()

    # City counts
    from collections import Counter
    city_counts = Counter()
    for museum in enriched_museums:
        if museum.get('locations'):
            city = museum['locations'][0].get('city', '')
            if city:
                city_counts[city] += 1
    print("Top 20 Cities:")
    for city, count in city_counts.most_common(20):
        print(f" {city:30s}: {count:2d}")
    print()
    print(f"Total cities covered: {len(city_counts)}")
    print(f"Processing time: {elapsed:.1f} seconds ({elapsed/60:.1f} minutes)")
    print()

    # Save enriched data under a local-time timestamped name.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = data_dir / f'sachsen_anhalt_museums_enriched_{timestamp}.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(enriched_museums, f, ensure_ascii=False, indent=2)

    file_size_kb = output_path.stat().st_size / 1024
    print(f"✅ Saved to: {output_path}")
    print(f" File size: {file_size_kb:.1f} KB")
    print(f" Total museums: {total}")
    print()
    print("=" * 80)
    print("Enrichment complete!")
    print("=" * 80)
# Run as a script; importing this module triggers no side effects.
if __name__ == '__main__':
    main()