#!/usr/bin/env python3
"""
Enrich Sachsen-Anhalt Museums with Detail Page Data

Scrapes individual museum pages for complete metadata:
- Physical addresses
- Contact information
- Opening hours
- Cities

Target: 100% metadata completeness
"""

import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional
import time
from urllib.parse import urljoin

def scrape_museum_detail_page(url: str) -> Dict[str, Any]:
|
|
"""Scrape individual museum detail page for complete metadata."""
|
|
|
|
headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
|
|
}
|
|
|
|
try:
|
|
response = requests.get(url, headers=headers, timeout=30)
|
|
response.raise_for_status()
|
|
response.encoding = 'utf-8'
|
|
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
details = {
|
|
'city': '',
|
|
'street_address': '',
|
|
'postal_code': '',
|
|
'phone': '',
|
|
'email': '',
|
|
'opening_hours': '',
|
|
'full_description': ''
|
|
}
|
|
|
|
# Extract address block
|
|
address_div = soup.find('div', class_=lambda c: c and ('address' in c.lower() or 'kontakt' in c.lower()))
|
|
if address_div:
|
|
address_text = address_div.get_text(separator='\n', strip=True)
|
|
lines = [line.strip() for line in address_text.split('\n') if line.strip()]
|
|
|
|
for i, line in enumerate(lines):
|
|
# Postal code + city (e.g., "06618 Naumburg")
|
|
if any(char.isdigit() for char in line) and len(line) > 3:
|
|
parts = line.split(maxsplit=1)
|
|
if len(parts) == 2 and parts[0].isdigit():
|
|
details['postal_code'] = parts[0]
|
|
details['city'] = parts[1]
|
|
|
|
# Street address (before postal code)
|
|
if not details['street_address'] and i < len(lines) - 1:
|
|
next_line = lines[i + 1]
|
|
if any(char.isdigit() for char in next_line) and len(next_line) > 3:
|
|
details['street_address'] = line
|
|
|
|
# Phone
|
|
if 'tel' in line.lower() or 'fon' in line.lower():
|
|
details['phone'] = line.split(':', 1)[-1].strip()
|
|
|
|
# Email
|
|
if '@' in line:
|
|
details['email'] = line.strip()
|
|
|
|
# Alternative: structured data with dt/dd tags
|
|
dt_tags = soup.find_all('dt')
|
|
for dt in dt_tags:
|
|
label = dt.get_text(strip=True).lower()
|
|
dd = dt.find_next_sibling('dd')
|
|
if not dd:
|
|
continue
|
|
|
|
value = dd.get_text(strip=True)
|
|
|
|
if 'adresse' in label or 'anschrift' in label:
|
|
# Parse address
|
|
lines = [l.strip() for l in value.split('\n') if l.strip()]
|
|
for line in lines:
|
|
if any(char.isdigit() for char in line):
|
|
parts = line.split(maxsplit=1)
|
|
if len(parts) == 2 and parts[0].isdigit():
|
|
details['postal_code'] = parts[0]
|
|
details['city'] = parts[1]
|
|
|
|
elif 'öffnungszeiten' in label or 'opening' in label:
|
|
details['opening_hours'] = value
|
|
|
|
elif 'telefon' in label or 'phone' in label:
|
|
details['phone'] = value
|
|
|
|
elif 'mail' in label:
|
|
details['email'] = value
|
|
|
|
# Extract full description
|
|
desc_div = soup.find('div', class_=lambda c: c and ('description' in c.lower() or 'content' in c.lower()))
|
|
if desc_div:
|
|
paragraphs = desc_div.find_all('p')
|
|
full_desc = '\n\n'.join(p.get_text(strip=True) for p in paragraphs)
|
|
if full_desc:
|
|
details['full_description'] = full_desc
|
|
|
|
return details
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
print(f"⚠️ Failed to scrape {url}: {e}")
|
|
return {}
|
|
|
|
def enrich_museums(input_file: Path) -> List[Dict[str, Any]]:
    """Enrich museum records with data scraped from their detail pages.

    Loads the harvested museum JSON, resolves each record's 'Website'
    identifier, scrapes that page, and merges address/contact/description
    data into the record in place.

    Args:
        input_file: Path to the harvested museums JSON file (a list of dicts).

    Returns:
        List of museum dicts; records that could not be enriched are passed
        through unchanged and counted as errors.
    """
    # Load existing museum data
    with open(input_file, 'r', encoding='utf-8') as f:
        museums = json.load(f)

    print(f"Loaded {len(museums)} museums from {input_file.name}")
    print()

    enriched = []
    errors = 0

    for i, museum in enumerate(museums, 1):
        print(f"[{i}/{len(museums)}] Enriching: {museum['name'][:60]}")

        # The detail URL lives in the record's 'Website' identifier.
        detail_url = None
        if museum.get('identifiers'):
            for ident in museum['identifiers']:
                if ident['identifier_scheme'] == 'Website':
                    detail_url = ident['identifier_value']
                    break

        if not detail_url:
            print(f" ⚠️ No detail URL, skipping")
            enriched.append(museum)
            errors += 1
            continue

        # Scrape detail page (returns {} on network failure)
        details = scrape_museum_detail_page(detail_url)

        if not details:
            print(f" ⚠️ Failed to scrape details")
            enriched.append(museum)
            errors += 1
            continue

        # Merge location fields. ROBUSTNESS: guard against records with a
        # missing or empty 'locations' list so one malformed record cannot
        # abort the whole run with an IndexError.
        locations = museum.get('locations')
        if locations:
            if details.get('city'):
                locations[0]['city'] = details['city']
                print(f" ✅ City: {details['city']}")
            if details.get('street_address'):
                locations[0]['street_address'] = details['street_address']
            if details.get('postal_code'):
                locations[0]['postal_code'] = details['postal_code']

        # Keep the longer of the two descriptions.
        if details.get('full_description') and len(details['full_description']) > len(museum.get('description', '')):
            museum['description'] = details['full_description']

        # Add contact identifiers
        if details.get('phone'):
            museum['identifiers'].append({
                'identifier_scheme': 'Phone',
                'identifier_value': details['phone'],
                'identifier_url': f"tel:{details['phone']}"
            })

        if details.get('email'):
            museum['identifiers'].append({
                'identifier_scheme': 'Email',
                'identifier_value': details['email'],
                'identifier_url': f"mailto:{details['email']}"
            })

        # Append opening hours to the description.
        if details.get('opening_hours'):
            hours_note = f"Öffnungszeiten: {details['opening_hours']}"
            if museum.get('description'):
                museum['description'] += f"\n\n{hours_note}"
            else:
                # BUGFIX: previously an empty description received a
                # leading "\n\n" before the hours text.
                museum['description'] = hours_note

        enriched.append(museum)

        # Rate limiting: be polite to the server between requests.
        time.sleep(0.5)

    print()
    print(f"Enrichment complete: {len(enriched) - errors}/{len(enriched)} successful")
    print(f"Errors: {errors}")

    return enriched
def main():
    """Main execution: locate newest harvest file, enrich, report, save."""
    print("=" * 80)
    print("Enrich Sachsen-Anhalt Museums with Detail Page Data")
    print("=" * 80)
    print()

    # Find most recent museum file (timestamped filenames sort newest-first
    # when reverse-sorted lexicographically).
    data_dir = Path('data/isil/germany')
    museum_files = sorted(data_dir.glob('sachsen_anhalt_museums_*.json'), reverse=True)

    if not museum_files:
        print("❌ No museum files found. Run harvest_sachsen_anhalt_museums.py first.")
        return

    input_file = museum_files[0]
    print(f"Input: {input_file.name}")
    print()

    # Enrich museums
    enriched_museums = enrich_museums(input_file)

    # Completeness statistics. ROBUSTNESS: guard against an empty result so
    # the percentage math cannot raise ZeroDivisionError.
    total = len(enriched_museums)
    if total:
        has_city = sum(1 for m in enriched_museums if m['locations'][0].get('city'))
        has_address = sum(1 for m in enriched_museums if m['locations'][0].get('street_address'))
        has_postal = sum(1 for m in enriched_museums if m['locations'][0].get('postal_code'))

        print()
        print("Data Completeness:")
        print(f" City: {has_city}/{total} ({has_city/total*100:.1f}%)")
        print(f" Street Address: {has_address}/{total} ({has_address/total*100:.1f}%)")
        print(f" Postal Code: {has_postal}/{total} ({has_postal/total*100:.1f}%)")
        print()

    # Tally museums per city
    city_counts = {}
    for museum in enriched_museums:
        city = museum['locations'][0].get('city', '')
        if city:
            city_counts[city] = city_counts.get(city, 0) + 1

    print("Top 10 Cities:")
    for city, count in sorted(city_counts.items(), key=lambda x: -x[1])[:10]:
        print(f" {city}: {count}")
    print()

    # Save enriched data with a timestamped filename
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = data_dir / f'sachsen_anhalt_museums_enriched_{timestamp}.json'

    # ensure_ascii=False keeps German umlauts readable in the output JSON.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(enriched_museums, f, ensure_ascii=False, indent=2)

    file_size_kb = output_path.stat().st_size / 1024

    print(f"✅ Saved to: {output_path}")
    print(f" File size: {file_size_kb:.1f} KB")
    print(f" Total museums: {total}")
    print()
    print("=" * 80)
    print("Enrichment complete!")
    print("=" * 80)
# Run the enrichment pipeline only when executed as a script (not on import).
if __name__ == '__main__':
    main()