#!/usr/bin/env python3
"""
Enrich Sachsen-Anhalt Museums with Detail Page Data - v2.0

Scrapes individual museum pages for complete metadata:
- Physical addresses (street, postal code, city)
- Contact information (phone, email, website)
- Opening hours
- Full descriptions

Improvements over v1.0:
- Properly parses address block structure
- Better error handling
- Progress tracking
- Rate limiting with delays
"""
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional
import time
import re

def scrape_museum_detail_page(url: str) -> Dict[str, Any]:
|
|
"""Scrape individual museum detail page for complete metadata."""
|
|
|
|
headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
|
}
|
|
|
|
try:
|
|
response = requests.get(url, headers=headers, timeout=15)
|
|
response.raise_for_status()
|
|
response.encoding = 'utf-8'
|
|
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
details = {
|
|
'city': '',
|
|
'street_address': '',
|
|
'postal_code': '',
|
|
'phone': '',
|
|
'email': '',
|
|
'website': '',
|
|
'opening_hours': '',
|
|
'full_description': ''
|
|
}
|
|
|
|
# Extract address block (Postanschrift)
|
|
address_div = soup.find('div', class_=lambda c: c and 'address' in c.lower())
|
|
if address_div:
|
|
address_text = address_div.get_text(separator='\n', strip=True)
|
|
lines = [line.strip() for line in address_text.split('\n') if line.strip()]
|
|
|
|
# Parse address structure:
|
|
# Line 0: "Postanschrift"
|
|
# Line 1: Museum name
|
|
# Line 2: Street address
|
|
# Line 3: Postal code + city
|
|
|
|
for i, line in enumerate(lines):
|
|
# Postal code + city (e.g., "06385 Aken")
|
|
postal_match = re.match(r'(\d{5})\s+(.+)', line)
|
|
if postal_match:
|
|
details['postal_code'] = postal_match.group(1)
|
|
details['city'] = postal_match.group(2).strip()
|
|
|
|
# Street address (line before postal code)
|
|
# Look for pattern: Word + "straße/str./weg/platz" + number
|
|
street_match = re.search(r'[A-ZÄÖÜ][a-zäöüß]+(?:straße|str\.|weg|platz|gasse|allee)\s+\d+', line, re.IGNORECASE)
|
|
if street_match:
|
|
details['street_address'] = street_match.group(0)
|
|
|
|
# Extract contact information from dt/dd pairs
|
|
dt_tags = soup.find_all('dt')
|
|
for dt in dt_tags:
|
|
label = dt.get_text(strip=True).lower()
|
|
dd = dt.find_next_sibling('dd')
|
|
if not dd:
|
|
continue
|
|
|
|
value = dd.get_text(strip=True)
|
|
|
|
if 'telefon' in label or 'phone' in label:
|
|
details['phone'] = value
|
|
|
|
elif 'e-mail' in label or 'mail' in label:
|
|
# Skip generic museum association email
|
|
if 'mv-sachsen-anhalt' not in value:
|
|
details['email'] = value
|
|
|
|
elif 'internet' in label or 'website' in label:
|
|
details['website'] = value
|
|
|
|
elif 'öffnungszeiten' in label or 'opening' in label:
|
|
details['opening_hours'] = value
|
|
|
|
# Extract full description from main content area
|
|
content_div = soup.find('div', class_=lambda c: c and ('content' in c.lower() or 'description' in c.lower() or 'text' in c.lower()))
|
|
if content_div:
|
|
paragraphs = content_div.find_all('p')
|
|
if paragraphs:
|
|
full_desc = '\n\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
|
|
if full_desc and len(full_desc) > 50:
|
|
details['full_description'] = full_desc
|
|
|
|
return details
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
print(f" ⚠️ Request failed: {e}")
|
|
return {}
|
|
except Exception as e:
|
|
print(f" ⚠️ Parse error: {e}")
|
|
return {}
|
|
|
|
def enrich_museums(input_file: Path) -> List[Dict[str, Any]]:
|
|
"""Enrich museum records with detail page data."""
|
|
|
|
# Load existing museum data
|
|
with open(input_file, 'r', encoding='utf-8') as f:
|
|
museums = json.load(f)
|
|
|
|
print(f"Loaded {len(museums)} museums from {input_file.name}")
|
|
print()
|
|
|
|
enriched = []
|
|
success_count = 0
|
|
error_count = 0
|
|
|
|
for i, museum in enumerate(museums, 1):
|
|
name_display = museum['name'][:60] + '...' if len(museum['name']) > 60 else museum['name']
|
|
print(f"[{i}/{len(museums)}] {name_display}")
|
|
|
|
# Get detail URL
|
|
detail_url = None
|
|
if museum.get('identifiers'):
|
|
for ident in museum['identifiers']:
|
|
if ident['identifier_scheme'] == 'Website':
|
|
detail_url = ident['identifier_value']
|
|
break
|
|
|
|
if not detail_url:
|
|
print(f" ⚠️ No detail URL, skipping")
|
|
enriched.append(museum)
|
|
error_count += 1
|
|
continue
|
|
|
|
# Scrape detail page
|
|
details = scrape_museum_detail_page(detail_url)
|
|
|
|
if not details:
|
|
print(f" ⚠️ Failed to scrape details")
|
|
enriched.append(museum)
|
|
error_count += 1
|
|
continue
|
|
|
|
# Update location with city data
|
|
fields_updated = []
|
|
|
|
if details.get('city'):
|
|
museum['locations'][0]['city'] = details['city']
|
|
fields_updated.append(f"City: {details['city']}")
|
|
|
|
if details.get('street_address'):
|
|
museum['locations'][0]['street_address'] = details['street_address']
|
|
fields_updated.append(f"Address: {details['street_address']}")
|
|
|
|
if details.get('postal_code'):
|
|
museum['locations'][0]['postal_code'] = details['postal_code']
|
|
|
|
# Update description if longer
|
|
if details.get('full_description') and len(details['full_description']) > len(museum.get('description', '')):
|
|
old_len = len(museum.get('description', ''))
|
|
museum['description'] = details['full_description']
|
|
fields_updated.append(f"Description: {old_len}→{len(details['full_description'])} chars")
|
|
|
|
# Add contact identifiers (avoid duplicates)
|
|
existing_schemes = {i['identifier_scheme'] for i in museum.get('identifiers', [])}
|
|
|
|
if details.get('phone') and 'Phone' not in existing_schemes:
|
|
museum['identifiers'].append({
|
|
'identifier_scheme': 'Phone',
|
|
'identifier_value': details['phone'],
|
|
'identifier_url': f"tel:{details['phone']}"
|
|
})
|
|
fields_updated.append(f"Phone: {details['phone']}")
|
|
|
|
if details.get('email') and 'Email' not in existing_schemes:
|
|
museum['identifiers'].append({
|
|
'identifier_scheme': 'Email',
|
|
'identifier_value': details['email'],
|
|
'identifier_url': f"mailto:{details['email']}"
|
|
})
|
|
fields_updated.append(f"Email: {details['email']}")
|
|
|
|
# Add opening hours to description
|
|
if details.get('opening_hours'):
|
|
if not museum.get('description'):
|
|
museum['description'] = ''
|
|
if 'Öffnungszeiten:' not in museum['description']:
|
|
museum['description'] += f"\n\nÖffnungszeiten: {details['opening_hours']}"
|
|
|
|
# Update provenance
|
|
if not museum.get('provenance'):
|
|
museum['provenance'] = {
|
|
'data_source': 'WEBSITE_SCRAPING',
|
|
'data_tier': 'TIER_2_VERIFIED',
|
|
'extraction_date': datetime.now(timezone.utc).isoformat(),
|
|
'extraction_method': 'Museumsverband Sachsen-Anhalt detail pages v2.0',
|
|
'confidence_score': 0.95
|
|
}
|
|
|
|
enriched.append(museum)
|
|
|
|
if fields_updated:
|
|
print(f" ✅ {', '.join(fields_updated)}")
|
|
success_count += 1
|
|
else:
|
|
print(f" ⚠️ No new data extracted")
|
|
error_count += 1
|
|
|
|
# Rate limiting - 1 second delay between requests
|
|
if i < len(museums):
|
|
time.sleep(1.0)
|
|
|
|
print()
|
|
print(f"Enrichment complete: {success_count} successful, {error_count} failed/skipped")
|
|
|
|
return enriched
|
|
|
|
def main():
|
|
"""Main execution."""
|
|
|
|
print("=" * 80)
|
|
print("Enrich Sachsen-Anhalt Museums with Detail Page Data v2.0")
|
|
print("=" * 80)
|
|
print()
|
|
|
|
# Find most recent museum file
|
|
data_dir = Path('data/isil/germany')
|
|
museum_files = sorted(data_dir.glob('sachsen_anhalt_museums_*.json'), reverse=True)
|
|
|
|
if not museum_files:
|
|
print("❌ No museum files found. Run harvest_sachsen_anhalt_museums.py first.")
|
|
return
|
|
|
|
# Skip already enriched files
|
|
input_file = None
|
|
for f in museum_files:
|
|
if 'enriched' not in f.name:
|
|
input_file = f
|
|
break
|
|
|
|
if not input_file:
|
|
print("❌ No non-enriched museum files found.")
|
|
return
|
|
|
|
print(f"Input: {input_file.name}")
|
|
print()
|
|
|
|
# Enrich museums
|
|
start_time = time.time()
|
|
enriched_museums = enrich_museums(input_file)
|
|
elapsed = time.time() - start_time
|
|
|
|
# Statistics
|
|
has_city = sum(1 for m in enriched_museums if m.get('locations') and m['locations'][0].get('city'))
|
|
has_address = sum(1 for m in enriched_museums if m.get('locations') and m['locations'][0].get('street_address'))
|
|
has_postal = sum(1 for m in enriched_museums if m.get('locations') and m['locations'][0].get('postal_code'))
|
|
has_phone = sum(1 for m in enriched_museums if m.get('identifiers') and any(i['identifier_scheme'] == 'Phone' for i in m['identifiers']))
|
|
has_email = sum(1 for m in enriched_museums if m.get('identifiers') and any(i['identifier_scheme'] == 'Email' for i in m['identifiers']))
|
|
|
|
print()
|
|
print("=" * 80)
|
|
print("Data Completeness:")
|
|
print("=" * 80)
|
|
print(f" City: {has_city:3d}/{len(enriched_museums)} ({has_city/len(enriched_museums)*100:5.1f}%)")
|
|
print(f" Street Address: {has_address:3d}/{len(enriched_museums)} ({has_address/len(enriched_museums)*100:5.1f}%)")
|
|
print(f" Postal Code: {has_postal:3d}/{len(enriched_museums)} ({has_postal/len(enriched_museums)*100:5.1f}%)")
|
|
print(f" Phone: {has_phone:3d}/{len(enriched_museums)} ({has_phone/len(enriched_museums)*100:5.1f}%)")
|
|
print(f" Email: {has_email:3d}/{len(enriched_museums)} ({has_email/len(enriched_museums)*100:5.1f}%)")
|
|
print()
|
|
|
|
# City counts
|
|
from collections import Counter
|
|
city_counts = Counter()
|
|
for museum in enriched_museums:
|
|
if museum.get('locations'):
|
|
city = museum['locations'][0].get('city', '')
|
|
if city:
|
|
city_counts[city] += 1
|
|
|
|
print("Top 20 Cities:")
|
|
for city, count in city_counts.most_common(20):
|
|
print(f" {city:30s}: {count:2d}")
|
|
print()
|
|
|
|
print(f"Total cities covered: {len(city_counts)}")
|
|
print(f"Processing time: {elapsed:.1f} seconds ({elapsed/60:.1f} minutes)")
|
|
print()
|
|
|
|
# Save enriched data
|
|
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
|
output_path = data_dir / f'sachsen_anhalt_museums_enriched_{timestamp}.json'
|
|
|
|
with open(output_path, 'w', encoding='utf-8') as f:
|
|
json.dump(enriched_museums, f, ensure_ascii=False, indent=2)
|
|
|
|
file_size_kb = output_path.stat().st_size / 1024
|
|
|
|
print(f"✅ Saved to: {output_path}")
|
|
print(f" File size: {file_size_kb:.1f} KB")
|
|
print(f" Total museums: {len(enriched_museums)}")
|
|
print()
|
|
print("=" * 80)
|
|
print("Enrichment complete!")
|
|
print("=" * 80)
|
|
|
|
if __name__ == '__main__':
|
|
main()
|