glam/scripts/scrapers/enrich_sachsen_anhalt_museums.py
2025-11-21 22:12:33 +01:00

267 lines
9.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Sachsen-Anhalt Museums with Detail Page Data
Scrapes individual museum pages for complete metadata:
- Physical addresses
- Contact information
- Opening hours
- Cities
Target: 100% metadata completeness
"""
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional
import time
from urllib.parse import urljoin
def scrape_museum_detail_page(url: str) -> Dict[str, Any]:
    """Scrape an individual museum detail page for complete metadata.

    Args:
        url: Absolute URL of the museum's detail page.

    Returns:
        Dict with keys 'city', 'street_address', 'postal_code', 'phone',
        'email', 'opening_hours' and 'full_description' (empty string for
        anything not found). Returns an empty dict when the HTTP request
        fails, so callers can distinguish "page unreachable" from
        "page reachable but sparse".
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        # Site content contains German umlauts; force UTF-8 decoding.
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        details = {
            'city': '',
            'street_address': '',
            'postal_code': '',
            'phone': '',
            'email': '',
            'opening_hours': '',
            'full_description': ''
        }

        # --- Strategy 1: free-form address/contact <div> -------------------
        address_div = soup.find(
            'div',
            class_=lambda c: c and ('address' in c.lower() or 'kontakt' in c.lower())
        )
        if address_div:
            address_text = address_div.get_text(separator='\n', strip=True)
            lines = [line.strip() for line in address_text.split('\n') if line.strip()]
            for i, line in enumerate(lines):
                # Postal code + city (e.g., "06618 Naumburg"). German postal
                # codes are exactly 5 digits; requiring that keeps bare phone
                # numbers ("0345 1234567") from being misread as an address.
                parts = line.split(maxsplit=1)
                if len(parts) == 2 and parts[0].isdigit() and len(parts[0]) == 5:
                    details['postal_code'] = parts[0]
                    details['city'] = parts[1]
                # Street address: heuristically the line directly before a
                # line containing digits (house number or postal code).
                if not details['street_address'] and i < len(lines) - 1:
                    next_line = lines[i + 1]
                    if any(char.isdigit() for char in next_line) and len(next_line) > 3:
                        details['street_address'] = line
                # Phone ("Tel.: ...", "Telefon ...", "Fon ...").
                low = line.lower()
                if 'tel' in low or 'fon' in low:
                    details['phone'] = line.split(':', 1)[-1].strip()
                # Email: keep only the token that actually contains '@' so a
                # label like "E-Mail:" is not carried into the stored value.
                if '@' in line:
                    for token in line.replace(':', ' ').split():
                        if '@' in token:
                            details['email'] = token
                            break

        # --- Strategy 2: structured <dt>/<dd> definition lists -------------
        for dt in soup.find_all('dt'):
            label = dt.get_text(strip=True).lower()
            dd = dt.find_next_sibling('dd')
            if not dd:
                continue
            # separator='\n' preserves line breaks inside the <dd>; plain
            # get_text(strip=True) would collapse a multi-line address into
            # one line and the postal-code split below could never match.
            value = dd.get_text(separator='\n', strip=True)
            if 'adresse' in label or 'anschrift' in label:
                for line in (l.strip() for l in value.split('\n') if l.strip()):
                    parts = line.split(maxsplit=1)
                    if len(parts) == 2 and parts[0].isdigit() and len(parts[0]) == 5:
                        details['postal_code'] = parts[0]
                        details['city'] = parts[1]
            elif 'öffnungszeiten' in label or 'opening' in label:
                # Collapse internal line breaks back to single spaces for the
                # scalar fields.
                details['opening_hours'] = ' '.join(value.split())
            elif 'telefon' in label or 'phone' in label:
                details['phone'] = ' '.join(value.split())
            elif 'mail' in label:
                details['email'] = ' '.join(value.split())

        # --- Full description ----------------------------------------------
        desc_div = soup.find(
            'div',
            class_=lambda c: c and ('description' in c.lower() or 'content' in c.lower())
        )
        if desc_div:
            paragraphs = desc_div.find_all('p')
            full_desc = '\n\n'.join(p.get_text(strip=True) for p in paragraphs)
            if full_desc:
                details['full_description'] = full_desc

        return details
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Failed to scrape {url}: {e}")
        return {}
def _find_detail_url(museum: Dict[str, Any]) -> Optional[str]:
    """Return the museum's 'Website' identifier URL, or None if absent."""
    for ident in museum.get('identifiers') or []:
        if ident.get('identifier_scheme') == 'Website':
            return ident.get('identifier_value')
    return None


def enrich_museums(input_file: Path) -> List[Dict[str, Any]]:
    """Enrich museum records with data scraped from their detail pages.

    Loads the JSON list of museum records from *input_file*, scrapes each
    record's 'Website' identifier URL for address/contact details, and
    merges the results into the records in place.

    Args:
        input_file: Path to a JSON file containing a list of museum dicts.

    Returns:
        The list of (possibly enriched) museum records; records whose detail
        page is missing or unreachable are passed through unchanged.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        museums = json.load(f)
    print(f"Loaded {len(museums)} museums from {input_file.name}")
    print()

    enriched = []
    errors = 0
    for i, museum in enumerate(museums, 1):
        # .get() guards against malformed records without a 'name' key.
        print(f"[{i}/{len(museums)}] Enriching: {museum.get('name', '')[:60]}")

        detail_url = _find_detail_url(museum)
        if not detail_url:
            print(f" ⚠️ No detail URL, skipping")
            enriched.append(museum)
            errors += 1
            continue

        details = scrape_museum_detail_page(detail_url)
        if not details:
            print(f" ⚠️ Failed to scrape details")
            enriched.append(museum)
            errors += 1
            continue

        # Update the primary location; create it if the record has none so
        # we never hit a KeyError/IndexError on sparse records.
        if any(details.get(k) for k in ('city', 'street_address', 'postal_code')):
            locations = museum.setdefault('locations', [])
            if not locations:
                locations.append({})
            location = locations[0]
            if details.get('city'):
                location['city'] = details['city']
                print(f" ✅ City: {details['city']}")
            if details.get('street_address'):
                location['street_address'] = details['street_address']
            if details.get('postal_code'):
                location['postal_code'] = details['postal_code']

        # Prefer the scraped description only when it is longer than what
        # the record already carries.
        if details.get('full_description') and len(details['full_description']) > len(museum.get('description', '')):
            museum['description'] = details['full_description']

        # Add contact identifiers alongside the existing 'Website' one.
        identifiers = museum.setdefault('identifiers', [])
        if details.get('phone'):
            identifiers.append({
                'identifier_scheme': 'Phone',
                'identifier_value': details['phone'],
                'identifier_url': f"tel:{details['phone']}"
            })
        if details.get('email'):
            identifiers.append({
                'identifier_scheme': 'Email',
                'identifier_value': details['email'],
                'identifier_url': f"mailto:{details['email']}"
            })

        # Append opening hours to the description; when there is no prior
        # description, avoid starting the text with a stray blank line.
        if details.get('opening_hours'):
            existing = museum.get('description') or ''
            if existing:
                museum['description'] = existing + f"\n\nÖffnungszeiten: {details['opening_hours']}"
            else:
                museum['description'] = f"Öffnungszeiten: {details['opening_hours']}"

        enriched.append(museum)
        # Rate limiting: be polite to the museum association's server.
        time.sleep(0.5)

    print()
    print(f"Enrichment complete: {len(enriched) - errors}/{len(enriched)} successful")
    print(f"Errors: {errors}")
    return enriched
def main() -> None:
    """Main execution: locate the newest harvest file, enrich it, report
    completeness statistics, and save a timestamped enriched JSON file."""
    print("=" * 80)
    print("Enrich Sachsen-Anhalt Museums with Detail Page Data")
    print("=" * 80)
    print()

    # Find the most recent *harvest* file. The output of this very script
    # ('..._enriched_<ts>.json') also matches the glob and would sort first
    # ('e' > any digit), so exclude it — otherwise a second run would
    # re-enrich already-enriched data and duplicate identifiers/hours.
    data_dir = Path('data/isil/germany')
    museum_files = sorted(
        (p for p in data_dir.glob('sachsen_anhalt_museums_*.json')
         if '_enriched_' not in p.name),
        reverse=True
    )
    if not museum_files:
        print("❌ No museum files found. Run harvest_sachsen_anhalt_museums.py first.")
        return
    input_file = museum_files[0]
    print(f"Input: {input_file.name}")
    print()

    enriched_museums = enrich_museums(input_file)
    total = len(enriched_museums)
    # Guard: an empty input file would otherwise cause ZeroDivisionError in
    # the percentage calculations below.
    if total == 0:
        print("❌ Input file contained no museums; nothing to save.")
        return

    def first_location(m: Dict[str, Any]) -> Dict[str, Any]:
        # Safe accessor: tolerates records without a 'locations' list.
        locations = m.get('locations') or []
        return locations[0] if locations else {}

    has_city = sum(1 for m in enriched_museums if first_location(m).get('city'))
    has_address = sum(1 for m in enriched_museums if first_location(m).get('street_address'))
    has_postal = sum(1 for m in enriched_museums if first_location(m).get('postal_code'))
    print()
    print("Data Completeness:")
    print(f" City: {has_city}/{total} ({has_city/total*100:.1f}%)")
    print(f" Street Address: {has_address}/{total} ({has_address/total*100:.1f}%)")
    print(f" Postal Code: {has_postal}/{total} ({has_postal/total*100:.1f}%)")
    print()

    # Frequency count of cities across all enriched records.
    city_counts: Dict[str, int] = {}
    for museum in enriched_museums:
        city = first_location(museum).get('city', '')
        if city:
            city_counts[city] = city_counts.get(city, 0) + 1
    print("Top 10 Cities:")
    for city, count in sorted(city_counts.items(), key=lambda x: -x[1])[:10]:
        print(f" {city}: {count}")
    print()

    # Save enriched data under a timestamped name next to the input.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = data_dir / f'sachsen_anhalt_museums_enriched_{timestamp}.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(enriched_museums, f, ensure_ascii=False, indent=2)
    file_size_kb = output_path.stat().st_size / 1024
    print(f"✅ Saved to: {output_path}")
    print(f" File size: {file_size_kb:.1f} KB")
    print(f" Total museums: {total}")
    print()
    print("=" * 80)
    print("Enrichment complete!")
    print("=" * 80)


if __name__ == '__main__':
    main()