#!/usr/bin/env python3
"""
Bavaria Museum Metadata Enrichment

Enriches the 1,231 Bayern museum records by scraping detail pages from:
http://www.museen-in-deutschland.de/

Extracts:
- Full street addresses
- Postal codes
- Phone numbers
- Email addresses
- Website URLs
- Opening hours
- Extended descriptions

Author: OpenCode AI Agent
Date: 2025-11-20
Status: PRODUCTION - Enriching existing Bayern museum dataset
"""

import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup

# Configuration
BASE_URL = "http://www.museen-in-deutschland.de"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
}
RATE_LIMIT_DELAY = 1.0  # Seconds between requests (be respectful)


def fetch_detail_page(url: str) -> Optional[str]:
    """Fetch a museum detail page HTML.

    Args:
        url: Absolute URL of the museum detail page.

    Returns:
        The decoded HTML body, or None when the request fails for any
        network/HTTP reason (the error is logged, not raised).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        # Force UTF-8: the site serves UTF-8 content but does not always
        # declare it, so requests may otherwise guess the wrong codec.
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException as e:
        print(f" βœ— Error fetching {url}: {e}")
        return None


def parse_detail_page(html: str) -> Dict[str, Any]:
    """
    Parse museum detail page to extract metadata.

    The page uses icon prefixes:
    🏘 = Museum name
        Street address (no icon)
        Postal code + City (no icon)
    βœ† = Phone
    πŸ–· = Fax
    πŸ•Έ = Website
    βŒ– = Coordinates
    πŸ“§ = Email (often empty)

    Returns dict with fields (each present only if found):
    - street_address
    - postal_code
    - phone
    - email
    - website
    - latitude
    - longitude
    - description
    """
    soup = BeautifulSoup(html, 'html.parser')
    metadata: Dict[str, Any] = {}

    # Get clean text with line breaks preserved
    page_text = soup.get_text(separator='\n')

    # Extract address block (after museum name, before phone)
    # Pattern: "Streetname Number\nPostal City"
    address_match = re.search(r'🏘[^\n]+\n([^\n]+)\n(\d{5})\s+([^\n]+)', page_text)
    if address_match:
        metadata['street_address'] = address_match.group(1).strip()
        metadata['postal_code'] = address_match.group(2).strip()
        # City already in dataset
    else:
        # Fallback: Look for postal code pattern (German PLZ = 5 digits)
        postal_match = re.search(r'(\d{5})\s+([^\n]+)', page_text)
        if postal_match:
            metadata['postal_code'] = postal_match.group(1).strip()
            # Try to find street on previous line
            lines = page_text.split('\n')
            for i, line in enumerate(lines):
                if postal_match.group(0) in line and i > 0:
                    prev_line = lines[i - 1].strip()
                    # Skip lines that are actually contact rows (icon-prefixed)
                    if prev_line and not any(x in prev_line for x in ['βœ†', 'πŸ–·', 'πŸ•Έ', 'πŸ“§']):
                        metadata['street_address'] = prev_line
                        break

    # Extract phone (after βœ† icon)
    phone_match = re.search(r'βœ†\s*([+\d\s()/-]{8,25})', page_text)
    if phone_match:
        metadata['phone'] = phone_match.group(1).strip()

    # Extract email (after πŸ“§ icon or look for email pattern)
    email_match = re.search(r'πŸ“§\s*([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z]{2,})', page_text)
    if email_match:
        metadata['email'] = email_match.group(1).strip()
    else:
        # Fallback: search anywhere
        email_match = re.search(r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z]{2,})', page_text)
        if email_match:
            metadata['email'] = email_match.group(1).strip()

    # Extract website (after πŸ•Έ icon)
    website_match = re.search(r'πŸ•Έ\s*(https?://[^\s<>"]+)', page_text)
    if website_match:
        metadata['website'] = website_match.group(1).strip()
    else:
        # Fallback: search anywhere
        website_match = re.search(r'(https?://[^\s<>"]+)', page_text)
        if website_match:
            url = website_match.group(1).strip()
            # Exclude the isil.museum site itself
            if 'isil.museum' not in url and 'museen-in-deutschland.de' not in url:
                metadata['website'] = url

    # Extract coordinates (after βŒ– icon).
    # Allow an optional minus sign so southern/western coordinates also parse
    # (Bavarian coordinates are positive, but this keeps the parser general).
    coords_match = re.search(r'βŒ–\s*(-?[\d.]+),\s*(-?[\d.]+)', page_text)
    if coords_match:
        metadata['latitude'] = float(coords_match.group(1))
        metadata['longitude'] = float(coords_match.group(2))

    return metadata


def enrich_museum(museum: Dict) -> Dict:
    """Enrich a single museum record with detail page data.

    Mutates and returns *museum*: fills in location fields
    (street_address, postal_code, latitude, longitude) and appends
    Phone/Email/Website identifiers when found on the detail page.
    The record is returned unchanged if it has no Registry detail URL
    or the page fetch fails.
    """
    # Check if museum has Registry identifier with detail URL
    detail_url = None
    for identifier in museum.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'Registry':
            detail_url = identifier.get('identifier_value')
            break

    if not detail_url:
        print(f" ⚠ No detail URL for: {museum['name']}")
        return museum

    print(f" β†’ Enriching: {museum['name']} ({museum['locations'][0]['city']})")

    # Fetch detail page
    html = fetch_detail_page(detail_url)
    if not html:
        return museum

    # Parse metadata
    metadata = parse_detail_page(html)

    # Update museum record
    if metadata.get('street_address'):
        museum['locations'][0]['street_address'] = metadata['street_address']
    if metadata.get('postal_code'):
        museum['locations'][0]['postal_code'] = metadata['postal_code']
    if metadata.get('latitude') and metadata.get('longitude'):
        museum['locations'][0]['latitude'] = metadata['latitude']
        museum['locations'][0]['longitude'] = metadata['longitude']
    if metadata.get('phone'):
        # Add phone as identifier
        museum['identifiers'].append({
            'identifier_scheme': 'Phone',
            'identifier_value': metadata['phone']
        })
    if metadata.get('email'):
        # Add email as identifier
        museum['identifiers'].append({
            'identifier_scheme': 'Email',
            'identifier_value': metadata['email']
        })
    if metadata.get('website'):
        # Add/update website (only if no Website identifier exists yet)
        has_website = any(
            i.get('identifier_scheme') == 'Website'
            for i in museum['identifiers']
        )
        if not has_website:
            museum['identifiers'].append({
                'identifier_scheme': 'Website',
                'identifier_value': metadata['website'],
                'identifier_url': metadata['website']
            })

    # Note: Opening hours and extended descriptions not available in ISIL registry format
    # Registry focuses on contact data and identifiers

    # Update provenance
    museum['provenance']['confidence_score'] = 0.95  # Higher confidence after enrichment
    museum['provenance']['notes'] = f"Enriched with detail page data from {detail_url}"

    # Log enrichment
    enriched_fields = [k for k, v in metadata.items() if v]
    print(f" βœ“ Added: {', '.join(enriched_fields)}")

    return museum


def _count_populated_fields(record: Dict) -> int:
    """Count how many of the five enrichable fields a record already has.

    Counted fields: street_address, postal_code, and the presence of
    Phone / Email / Website identifiers.
    """
    identifiers = record.get('identifiers', [])
    schemes = {ident.get('identifier_scheme') for ident in identifiers}
    location = record['locations'][0]
    return sum([
        bool(location.get('street_address')),
        bool(location.get('postal_code')),
        'Phone' in schemes,
        'Email' in schemes,
        'Website' in schemes,
    ])


def main():
    """Enrich Bayern museums with detail page data."""
    print("=" * 80)
    print("Bavaria Museum Metadata Enrichment")
    print("=" * 80)
    print()

    # Load existing dataset
    input_file = Path("data/isil/germany/bayern_museums_20251120_213144.json")
    if not input_file.exists():
        print(f"βœ— Input file not found: {input_file}")
        print(" Please run harvest_isil_museum_bayern.py first")
        return None

    print(f"Loading: {input_file.name}")
    with open(input_file, 'r', encoding='utf-8') as f:
        museums = json.load(f)
    print(f"βœ“ Loaded {len(museums)} museums")
    print()

    # Estimate time
    total = len(museums)
    estimated_time = (total * RATE_LIMIT_DELAY) / 60  # minutes
    print(f"Estimated time: {estimated_time:.1f} minutes (rate limit: {RATE_LIMIT_DELAY}s per request)")
    print()

    # Enrich each museum
    print("Starting enrichment...")
    print()

    enriched_museums = []
    success_count = 0
    fail_count = 0

    for i, museum in enumerate(museums, 1):
        print(f"[{i}/{total}]", end=" ")

        # Snapshot field coverage before/after to detect real enrichment
        original_fields = _count_populated_fields(museum)

        enriched = enrich_museum(museum)
        enriched_museums.append(enriched)

        enriched_fields = _count_populated_fields(enriched)

        if enriched_fields > original_fields:
            success_count += 1
        else:
            fail_count += 1

        # Rate limiting
        time.sleep(RATE_LIMIT_DELAY)

        # Progress update every 50 museums
        if i % 50 == 0:
            print()
            print(f" Progress: {i}/{total} ({i/total*100:.1f}%) - {success_count} enriched, {fail_count} unchanged")
            print()

    print()
    print("=" * 80)
    print("Enrichment Complete")
    print("=" * 80)
    print()
    print(f"Total museums: {total}")
    print(f"Successfully enriched: {success_count}")
    print(f"Unchanged: {fail_count}")
    # Guard against an empty dataset (ZeroDivisionError)
    success_rate = (success_count / total * 100) if total else 0.0
    print(f"Success rate: {success_rate:.1f}%")
    print()

    # Calculate completeness
    completeness = {
        'name': sum(1 for m in enriched_museums if m.get('name')),
        'city': sum(1 for m in enriched_museums if m['locations'][0].get('city')),
        'ISIL': sum(1 for m in enriched_museums
                    if any(i.get('identifier_scheme') == 'ISIL' for i in m.get('identifiers', []))),
        'street_address': sum(1 for m in enriched_museums if m['locations'][0].get('street_address')),
        'postal_code': sum(1 for m in enriched_museums if m['locations'][0].get('postal_code')),
        'phone': sum(1 for m in enriched_museums
                     if any(i.get('identifier_scheme') == 'Phone' for i in m.get('identifiers', []))),
        'email': sum(1 for m in enriched_museums
                     if any(i.get('identifier_scheme') == 'Email' for i in m.get('identifiers', []))),
        'website': sum(1 for m in enriched_museums
                       if any(i.get('identifier_scheme') == 'Website' for i in m.get('identifiers', []))),
    }

    print("Metadata Completeness After Enrichment:")
    print()
    for field, count in completeness.items():
        percentage = (count / total * 100) if total else 0.0
        status = "βœ“" if percentage > 90 else "β—‹"
        print(f"{status} {field:20s}: {count}/{total} ({percentage:5.1f}%)")
    print()

    # Save enriched dataset
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_dir = Path("data/isil/germany")
    # Ensure the target directory exists before writing
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"bayern_museums_enriched_{timestamp}.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(enriched_museums, f, ensure_ascii=False, indent=2)

    print(f"βœ“ Exported to: {output_file}")
    print(f" File size: {output_file.stat().st_size:,} bytes")
    print()
    print("=" * 80)
    print(f"Enrichment complete! {success_count}/{total} museums enhanced.")
    print("=" * 80)
    print()
    print("Next steps:")
    print(" 1. Merge enriched museums with Bayern archives/libraries")
    print(" 2. Generate Bayern complete dataset with ~80% metadata completeness")
    print(" 3. Proceed to Baden-WΓΌrttemberg extraction")
    print()

    return output_file


if __name__ == "__main__":
    main()