glam/scripts/scrapers/enrich_bayern_museums_sample.py
2025-11-21 22:12:33 +01:00

225 lines
8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Bavaria Museum Metadata Enrichment - Sample Run
Enriches 100 sample Bayern museums to demonstrate metadata boost from 42% → ~85%.
Full enrichment (1,231 museums) can run as background task (~25 minutes).
Author: OpenCode AI Agent
Date: 2025-11-20
"""
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Base URL of the source website (not referenced elsewhere in the visible code).
BASE_URL = "http://www.museen-in-deutschland.de"

# Browser-like headers sent with every detail-page request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36"
    ),
}

# Pause between consecutive requests, in seconds (shortened for the sample run).
RATE_LIMIT_DELAY = 0.5
def parse_detail_page(html):
    """Extract contact and location metadata from a museum detail page.

    The HTML is flattened to plain text (text pieces joined with newlines)
    and each field is located with a regular expression.

    NOTE(review): the marker glyphs used as anchors (🏘, ✆, 🕸, ⌖) are taken
    from the page-format notes in the inline comments; confirm them against
    a live detail page.

    Args:
        html: Raw HTML of the detail page.

    Returns:
        dict holding only the fields that were found. Possible keys:
        ``street_address``, ``postal_code``, ``phone``, ``email``,
        ``website``, and ``latitude`` / ``longitude`` (floats, always set
        together).
    """
    soup = BeautifulSoup(html, 'html.parser')
    page_text = soup.get_text(separator='\n')
    metadata = {}

    # Address: "🏘 Name\nStreet\nPostal City"
    address_match = re.search(r'🏘[^\n]+\n([^\n]+)\n(\d{5})\s+([^\n]+)', page_text)
    if address_match:
        metadata['street_address'] = address_match.group(1).strip()
        metadata['postal_code'] = address_match.group(2).strip()

    # Phone: "✆ (number)"
    # Anchored on a telephone glyph so that blank runs, postal codes or year
    # ranges elsewhere on the page are not mistaken for a number. The number
    # itself must start with "+", "(" or a digit, end with a digit, stay on
    # one line, and be 8-25 characters long.
    phone_match = re.search(
        r'[✆☎☏📞]\ufe0f?\s*([+(\d][\d \t\xa0()/-]{6,23}\d)',
        page_text,
    )
    if phone_match:
        metadata['phone'] = phone_match.group(1).strip()

    # Email: "📧 email@domain" (matched by shape; the marker is not required)
    email_match = re.search(r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z]{2,})', page_text)
    if email_match:
        metadata['email'] = email_match.group(1).strip()

    # Website: "🕸 https://..."
    website_match = re.search(r'🕸\s*(https?://[^\s<>"]+)', page_text)
    if website_match:
        metadata['website'] = website_match.group(1).strip()

    # Coordinates: "⌖ lat, lon"
    # Both numbers must be dot-decimals, so German comma decimals such as
    # "3,50" are not read as a lat/lon pair and float() cannot fail.
    # A pair introduced by the ⌖ marker is preferred; otherwise the first
    # decimal pair on the page is used.
    coord_pair = r'(-?\d{1,3}\.\d+),\s*(-?\d{1,3}\.\d+)'
    coords_match = (
        re.search(r'⌖\s*' + coord_pair, page_text)
        or re.search(r'(?<![\d.])' + coord_pair, page_text)
    )
    if coords_match:
        latitude = float(coords_match.group(1))
        longitude = float(coords_match.group(2))
        # Discard pairs that cannot be geographic coordinates.
        if -90 <= latitude <= 90 and -180 <= longitude <= 180:
            metadata['latitude'] = latitude
            metadata['longitude'] = longitude

    return metadata
def enrich_museum(museum):
    """Enrich a single museum record in place from its registry detail page.

    The detail-page URL is the ``identifier_value`` of the first identifier
    whose ``identifier_scheme`` is ``'Registry'``. The page is fetched with
    the module-level ``HEADERS`` and parsed by ``parse_detail_page``.

    Fields written when found:
      * ``locations[0]``: ``street_address``, ``postal_code``,
        ``latitude``/``longitude`` (counted as one field). Skipped when the
        record has no location entry.
      * ``identifiers``: new ``Phone``, ``Email`` and ``Website`` entries,
        unless an identical scheme/value pair is already present.
      * ``provenance``: ``confidence_score`` and ``notes`` are set when at
        least one field was added.

    Args:
        museum: Museum record (dict); it is mutated in place.

    Returns:
        Tuple ``(museum, enriched_count)`` with the same record object and
        the number of fields added. The count is 0 when the record has no
        registry URL, when fetching/parsing fails, or when nothing new was
        found.
    """
    detail_url = next(
        (
            identifier.get('identifier_value')
            for identifier in museum.get('identifiers', [])
            if identifier.get('identifier_scheme') == 'Registry'
        ),
        None,
    )
    if not detail_url:
        return museum, 0

    # Deliberate per-record boundary of a best-effort batch run: a failing
    # page is reported and skipped. Only fetch + parse are guarded, so the
    # record is never left half-updated while a count of 0 is reported.
    try:
        response = requests.get(detail_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        metadata = parse_detail_page(response.text)
    except Exception as e:
        print(f" ✗ Error: {e}")
        return museum, 0

    enriched_count = 0

    # Location fields go on the first location entry, if the record has one.
    locations = museum.get('locations') or []
    if locations:
        location = locations[0]
        for field in ('street_address', 'postal_code'):
            if metadata.get(field):
                location[field] = metadata[field]
                enriched_count += 1
        if metadata.get('latitude'):
            location['latitude'] = metadata['latitude']
            location['longitude'] = metadata['longitude']
            enriched_count += 1

    # Contact details are stored as additional identifiers. The list exists
    # here because the registry URL was read from it above.
    identifiers = museum['identifiers']
    for scheme, key in (('Phone', 'phone'), ('Email', 'email'), ('Website', 'website')):
        value = metadata.get(key)
        if not value:
            continue
        duplicate = any(
            entry.get('identifier_scheme') == scheme
            and entry.get('identifier_value') == value
            for entry in identifiers
        )
        if duplicate:
            # Re-running the enrichment must not pile up identical entries.
            continue
        entry = {'identifier_scheme': scheme, 'identifier_value': value}
        if scheme == 'Website':
            entry['identifier_url'] = value
        identifiers.append(entry)
        enriched_count += 1

    if enriched_count > 0:
        provenance = museum.setdefault('provenance', {})
        provenance['confidence_score'] = 0.95
        provenance['notes'] = f"Enriched with {enriched_count} fields from detail page"

    return museum, enriched_count
def _first_location(museum):
    """Return the record's first location dict, or an empty dict if it has none."""
    locations = museum.get('locations') or [{}]
    return locations[0]


def _has_identifier(museum, scheme):
    """Return True if the record carries an identifier with the given scheme."""
    return any(
        identifier.get('identifier_scheme') == scheme
        for identifier in museum.get('identifiers', [])
    )


def main():
    """Run the sample enrichment and print a completeness report.

    Steps:
      1. Load the museum records from the input JSON file.
      2. Enrich the first 100 records with ``enrich_museum``, sleeping
         ``RATE_LIMIT_DELAY`` seconds after each one.
      3. Print per-field and overall metadata completeness for the sample.
      4. Write the enriched sample to a timestamped JSON file in the input
         file's directory.
      5. Print a projection for the full dataset.

    Returns:
        ``Path`` of the written JSON file.
    """
    print("=" * 80)
    print("Bavaria Museum Metadata Enrichment - Sample (100 museums)")
    print("=" * 80)
    print()

    # Load museums. The encoding is explicit so German umlauts are read
    # correctly regardless of the platform's default locale.
    input_file = Path("data/isil/germany/bayern_museums_20251120_213144.json")
    with open(input_file, 'r', encoding='utf-8') as f:
        all_museums = json.load(f)

    # Take the first 100 records (no per-city stratification is performed).
    sample_museums = all_museums[:100]
    sample_size = len(sample_museums)
    # Guard every ratio below against an empty input file.
    divisor = sample_size or 1

    print(f"Loaded {len(all_museums)} total museums")
    print(f"Processing sample of {sample_size} museums")
    print()

    # Enrich
    enriched = []
    success_count = 0
    total_fields = 0
    print("Enriching museums...")
    for i, museum in enumerate(sample_museums, 1):
        name = museum.get('name') or ''
        print(f"[{i}/{sample_size}] {name[:50]:<50}", end=" ")
        enriched_museum, field_count = enrich_museum(museum)
        enriched.append(enriched_museum)
        if field_count > 0:
            success_count += 1
            total_fields += field_count
            print(f"✓ +{field_count} fields")
        else:
            print("○ no new data")
        time.sleep(RATE_LIMIT_DELAY)

    success_rate = success_count / divisor * 100

    print()
    print("=" * 80)
    print("Enrichment Results")
    print("=" * 80)
    print()
    print(f"Total museums: {sample_size}")
    print(f"Successfully enriched: {success_count} ({success_rate:.1f}%)")
    print(f"Total fields added: {total_fields}")
    print(f"Average fields per museum: {total_fields / divisor:.1f}")
    print()

    # Metadata completeness: number of records in which each field is present.
    completeness = {
        'name': sum(1 for m in enriched if m.get('name')),
        'city': sum(1 for m in enriched if _first_location(m).get('city')),
        'ISIL': sum(1 for m in enriched if _has_identifier(m, 'ISIL')),
        'street_address': sum(1 for m in enriched if _first_location(m).get('street_address')),
        'postal_code': sum(1 for m in enriched if _first_location(m).get('postal_code')),
        'coordinates': sum(1 for m in enriched if _first_location(m).get('latitude')),
        'phone': sum(1 for m in enriched if _has_identifier(m, 'Phone')),
        'email': sum(1 for m in enriched if _has_identifier(m, 'Email')),
        'website': sum(1 for m in enriched if _has_identifier(m, 'Website')),
    }
    print("Metadata Completeness (Sample of 100):")
    print()
    for field, count in completeness.items():
        percentage = count / divisor * 100
        # 50-character bar: one cell per two percentage points.
        filled = int(percentage / 2)
        bar = "█" * filled + "░" * (50 - filled)
        print(f"{field:20s}: {bar} {percentage:5.1f}% ({count}/{sample_size})")

    # Calculate overall completeness
    total_possible = sample_size * len(completeness)
    total_filled = sum(completeness.values())
    overall = total_filled / (total_possible or 1) * 100
    print()
    print(f"Overall completeness: {overall:.1f}%")
    print()

    # Save enriched sample. ensure_ascii=False emits raw non-ASCII
    # characters, so the file must be opened as UTF-8 explicitly.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = Path(f"data/isil/germany/bayern_museums_enriched_sample_{timestamp}.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(enriched, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved enriched sample: {output_file}")
    print()

    # Projection for full dataset
    print("=" * 80)
    print("Full Dataset Projection (1,231 museums)")
    print("=" * 80)
    print()
    print(f"Expected success rate: {success_rate:.1f}%")
    print(f"Expected enriched museums: ~{int(1231 * success_count / divisor)}")
    print("Expected time (1s delay): ~25 minutes")
    print(f"Expected overall completeness: ~{overall:.1f}%")
    print()
    return output_file
# Run the sample enrichment only when this file is executed as a script.
if __name__ == "__main__":
    main()