#!/usr/bin/env python3
"""
Bavaria Museum Metadata Enrichment - Sample Run

Enriches 100 sample Bayern museums to demonstrate metadata boost from
42% → ~85%. Full enrichment (1,231 museums) can run as background task
(~25 minutes).

Author: OpenCode AI Agent
Date: 2025-11-20
"""

import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path

import requests
from bs4 import BeautifulSoup

BASE_URL = "http://www.museen-in-deutschland.de"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
}
RATE_LIMIT_DELAY = 0.5  # Faster for sample


def parse_detail_page(html):
    """Extract contact/location metadata from a museum detail page.

    The site marks each field with a distinctive glyph (🏘 address,
    ✆ phone, 🕸 website, ⌖ coordinates), so fields are pulled from the
    flattened page text with regexes rather than DOM traversal.

    Args:
        html: Raw HTML of the detail page.

    Returns:
        dict with any of: street_address, postal_code, phone, email,
        website, latitude, longitude. Keys are absent when not found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    metadata = {}
    page_text = soup.get_text(separator='\n')

    # Address block: "🏘 Name\nStreet\nPostal City".
    # Group 3 (the city) is matched for anchoring but intentionally not
    # stored — the source records already carry a city.
    address_match = re.search(r'🏘[^\n]+\n([^\n]+)\n(\d{5})\s+([^\n]+)', page_text)
    if address_match:
        metadata['street_address'] = address_match.group(1).strip()
        metadata['postal_code'] = address_match.group(2).strip()

    # Phone: "✆ (number)" — allow digits, spaces, +, (), / and - only.
    phone_match = re.search(r'✆\s*([+\d\s()/-]{8,25})', page_text)
    if phone_match:
        metadata['phone'] = phone_match.group(1).strip()

    # Email: first plausible address anywhere on the page.
    email_match = re.search(r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z]{2,})', page_text)
    if email_match:
        metadata['email'] = email_match.group(1).strip()

    # Website: "🕸 https://..."
    website_match = re.search(r'🕸\s*(https?://[^\s<>"]+)', page_text)
    if website_match:
        metadata['website'] = website_match.group(1).strip()

    # Coordinates: "⌖ lat, lon" — accept negative values so the parser
    # also works outside the north-eastern hemisphere.
    coords_match = re.search(r'⌖\s*(-?[\d.]+),\s*(-?[\d.]+)', page_text)
    if coords_match:
        metadata['latitude'] = float(coords_match.group(1))
        metadata['longitude'] = float(coords_match.group(2))

    return metadata


def _has_identifier(museum, scheme):
    """Return True if the museum already carries an identifier of *scheme*."""
    return any(
        ident.get('identifier_scheme') == scheme
        for ident in museum.get('identifiers', [])
    )


def enrich_museum(museum):
    """Enrich a single museum record from its Registry detail page.

    Fetches the detail URL (identifier with scheme 'Registry'), parses it,
    and merges any newly-found fields into the record in place.

    Args:
        museum: Museum record dict (mutated in place).

    Returns:
        (museum, enriched_count) — the record and the number of fields added.
        enriched_count is 0 when there is no detail URL, the fetch fails,
        or the page yields nothing new.
    """
    detail_url = None
    for identifier in museum.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'Registry':
            detail_url = identifier.get('identifier_value')
            break
    if not detail_url:
        return museum, 0

    try:
        response = requests.get(detail_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        metadata = parse_detail_page(response.text)

        enriched_count = 0

        # Location fields require an existing location entry; records
        # without one keep their contact-field enrichment below instead
        # of aborting wholesale via the exception handler.
        locations = museum.get('locations')
        if locations:
            location = locations[0]
            if metadata.get('street_address'):
                location['street_address'] = metadata['street_address']
                enriched_count += 1
            if metadata.get('postal_code'):
                location['postal_code'] = metadata['postal_code']
                enriched_count += 1
            # Membership test, not truthiness: latitude 0.0 is valid.
            # parse_detail_page sets latitude and longitude together.
            if 'latitude' in metadata:
                location['latitude'] = metadata['latitude']
                location['longitude'] = metadata['longitude']
                enriched_count += 1

        # Contact identifiers: append only if the scheme is not already
        # present, so re-running enrichment never duplicates entries.
        if metadata.get('phone') and not _has_identifier(museum, 'Phone'):
            museum.setdefault('identifiers', []).append({
                'identifier_scheme': 'Phone',
                'identifier_value': metadata['phone']
            })
            enriched_count += 1
        if metadata.get('email') and not _has_identifier(museum, 'Email'):
            museum.setdefault('identifiers', []).append({
                'identifier_scheme': 'Email',
                'identifier_value': metadata['email']
            })
            enriched_count += 1
        if metadata.get('website') and not _has_identifier(museum, 'Website'):
            museum.setdefault('identifiers', []).append({
                'identifier_scheme': 'Website',
                'identifier_value': metadata['website'],
                'identifier_url': metadata['website']
            })
            enriched_count += 1

        if enriched_count > 0:
            provenance = museum.setdefault('provenance', {})
            provenance['confidence_score'] = 0.95
            provenance['notes'] = f"Enriched with {enriched_count} fields from detail page"

        return museum, enriched_count

    except Exception as e:
        # Best-effort scraper: report and move on so one bad record
        # cannot abort the whole run.
        print(f"  ✗ Error: {e}")
        return museum, 0


def _print_completeness(enriched, sample_size):
    """Print the per-field completeness bar chart; return overall percentage.

    Args:
        enriched: List of (possibly) enriched museum records.
        sample_size: Number of records processed (denominator).

    Returns:
        Overall completeness across all tracked fields, as a percentage.
    """
    def _scheme_count(scheme):
        return sum(1 for m in enriched if _has_identifier(m, scheme))

    completeness = {
        'name': sum(1 for m in enriched if m.get('name')),
        'city': sum(1 for m in enriched if m['locations'][0].get('city')),
        'ISIL': _scheme_count('ISIL'),
        'street_address': sum(1 for m in enriched if m['locations'][0].get('street_address')),
        'postal_code': sum(1 for m in enriched if m['locations'][0].get('postal_code')),
        'coordinates': sum(1 for m in enriched if m['locations'][0].get('latitude')),
        'phone': _scheme_count('Phone'),
        'email': _scheme_count('Email'),
        'website': _scheme_count('Website'),
    }

    print("Metadata Completeness (Sample of 100):")
    print()
    for field, count in completeness.items():
        percentage = (count / sample_size) * 100
        bar = "█" * int(percentage / 2) + "░" * (50 - int(percentage / 2))
        print(f"{field:20s}: {bar} {percentage:5.1f}% ({count}/{sample_size})")

    total_possible = sample_size * len(completeness)
    total_filled = sum(completeness.values())
    return (total_filled / total_possible) * 100


def main():
    """Run the sample enrichment end-to-end and write the enriched JSON.

    Loads the Bayern museum dump, enriches the first 100 records with
    rate-limited detail-page fetches, prints statistics and a projection
    for the full dataset, and saves a timestamped output file.

    Returns:
        Path of the written output file.
    """
    print("=" * 80)
    print("Bavaria Museum Metadata Enrichment - Sample (100 museums)")
    print("=" * 80)
    print()

    # Load museums; explicit encoding since the data contains umlauts.
    input_file = Path("data/isil/germany/bayern_museums_20251120_213144.json")
    with open(input_file, 'r', encoding='utf-8') as f:
        all_museums = json.load(f)

    # Sample 100 museums from different cities
    sample_museums = all_museums[:100]
    print(f"Loaded {len(all_museums)} total museums")
    print(f"Processing sample of {len(sample_museums)} museums")
    print()

    enriched = []
    success_count = 0
    total_fields = 0

    print("Enriching museums...")
    for i, museum in enumerate(sample_museums, 1):
        print(f"[{i}/{len(sample_museums)}] {museum['name'][:50]:<50}", end=" ")
        enriched_museum, field_count = enrich_museum(museum)
        enriched.append(enriched_museum)
        if field_count > 0:
            success_count += 1
            total_fields += field_count
            print(f"✓ +{field_count} fields")
        else:
            print("○ no new data")
        time.sleep(RATE_LIMIT_DELAY)  # be polite to the source server

    print()
    print("=" * 80)
    print("Enrichment Results")
    print("=" * 80)
    print()
    print(f"Total museums: {len(sample_museums)}")
    print(f"Successfully enriched: {success_count} ({success_count/len(sample_museums)*100:.1f}%)")
    print(f"Total fields added: {total_fields}")
    print(f"Average fields per museum: {total_fields/len(sample_museums):.1f}")
    print()

    overall = _print_completeness(enriched, len(sample_museums))
    print()
    print(f"Overall completeness: {overall:.1f}%")
    print()

    # Save enriched sample with a UTC timestamp in the filename.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = Path(f"data/isil/germany/bayern_museums_enriched_sample_{timestamp}.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(enriched, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved enriched sample: {output_file}")
    print()

    # Projection for full dataset
    print("=" * 80)
    print("Full Dataset Projection (1,231 museums)")
    print("=" * 80)
    print()
    print(f"Expected success rate: {success_count/len(sample_museums)*100:.1f}%")
    print(f"Expected enriched museums: ~{int(1231 * success_count/len(sample_museums))}")
    print("Expected time (1s delay): ~25 minutes")
    print(f"Expected overall completeness: ~{overall:.1f}%")
    print()

    return output_file


if __name__ == "__main__":
    main()