#!/usr/bin/env python3
"""
Bavaria Museum Metadata Enrichment - Sample Run

Enriches 100 sample Bayern museums to demonstrate metadata boost from 42% → ~85%.
Full enrichment (1,231 museums) can run as background task (~25 minutes).

Author: OpenCode AI Agent
Date: 2025-11-20
"""
|
|
|
|
import json
|
|
import re
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Registry site hosting the museum detail pages scraped below.
BASE_URL = "http://www.museen-in-deutschland.de"
# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
}
# Seconds to sleep between detail-page fetches.
RATE_LIMIT_DELAY = 0.5  # Faster for sample
|
|
|
|
|
|
def parse_detail_page(html):
    """Extract contact/location metadata from a museum detail page.

    The registry's detail pages mark fields with emoji prefixes in the
    rendered text; each field is matched independently and simply omitted
    from the result when absent.

    Args:
        html: Raw HTML of the museum detail page.

    Returns:
        dict with any of the keys: 'street_address', 'postal_code',
        'phone', 'email', 'website', 'latitude', 'longitude'.
    """
    soup = BeautifulSoup(html, 'html.parser')
    metadata = {}
    page_text = soup.get_text(separator='\n')

    # Address block: "🏘 Name\nStreet\nPostal City" (German 5-digit postal code).
    address_match = re.search(r'🏘[^\n]+\n([^\n]+)\n(\d{5})\s+([^\n]+)', page_text)
    if address_match:
        metadata['street_address'] = address_match.group(1).strip()
        metadata['postal_code'] = address_match.group(2).strip()

    # Phone: "✆ (number)" — digits plus the usual separators, 8-25 chars.
    phone_match = re.search(r'✆\s*([+\d\s()/-]{8,25})', page_text)
    if phone_match:
        metadata['phone'] = phone_match.group(1).strip()

    # Email: first email-looking token anywhere in the page text.
    # NOTE(review): not anchored to the 📧 marker, so an unrelated address
    # elsewhere on the page could be picked up — confirm acceptable.
    email_match = re.search(r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z]{2,})', page_text)
    if email_match:
        metadata['email'] = email_match.group(1).strip()

    # Website: "🕸 https://..."
    website_match = re.search(r'🕸\s*(https?://[^\s<>"]+)', page_text)
    if website_match:
        metadata['website'] = website_match.group(1).strip()

    # Coordinates: "⌖ lat, lon".  Accept an optional leading minus sign so
    # the parser generalizes beyond Bavaria (southern/western hemispheres);
    # the previous pattern [\d.]+ silently failed on negative values.
    coords_match = re.search(r'⌖\s*(-?\d+(?:\.\d+)?),\s*(-?\d+(?:\.\d+)?)', page_text)
    if coords_match:
        metadata['latitude'] = float(coords_match.group(1))
        metadata['longitude'] = float(coords_match.group(2))

    return metadata
|
|
|
|
|
|
def enrich_museum(museum):
    """Enrich a single museum record with data scraped from its detail page.

    Finds the record's 'Registry' identifier for the detail-page URL,
    fetches the page, and merges the parsed fields into the record
    (mutating it in place).

    Args:
        museum: Museum record dict with 'identifiers' and 'locations' keys.

    Returns:
        Tuple ``(museum, field_count)``: the (possibly mutated) record and
        the number of fields added/updated — 0 when there is no registry
        URL or the fetch/parse fails.
    """
    # Locate the detail-page URL among the record's identifiers.
    detail_url = None
    for identifier in museum.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'Registry':
            detail_url = identifier.get('identifier_value')
            break

    if not detail_url:
        return museum, 0

    try:
        response = requests.get(detail_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        metadata = parse_detail_page(response.text)

        # Merge parsed fields into the record, counting what we added.
        enriched_count = 0
        location = museum['locations'][0]

        if metadata.get('street_address'):
            location['street_address'] = metadata['street_address']
            enriched_count += 1
        if metadata.get('postal_code'):
            location['postal_code'] = metadata['postal_code']
            enriched_count += 1
        # Membership test (not truthiness): a latitude of exactly 0.0 is a
        # valid coordinate and must not be discarded.  parse_detail_page
        # always sets latitude and longitude together.
        if 'latitude' in metadata:
            location['latitude'] = metadata['latitude']
            location['longitude'] = metadata['longitude']
            enriched_count += 1

        # Contact details are stored as additional identifier entries.
        for scheme, key in (('Phone', 'phone'), ('Email', 'email')):
            if metadata.get(key):
                museum['identifiers'].append({
                    'identifier_scheme': scheme,
                    'identifier_value': metadata[key]
                })
                enriched_count += 1
        if metadata.get('website'):
            museum['identifiers'].append({
                'identifier_scheme': 'Website',
                'identifier_value': metadata['website'],
                'identifier_url': metadata['website']
            })
            enriched_count += 1

        if enriched_count > 0:
            museum['provenance']['confidence_score'] = 0.95
            museum['provenance']['notes'] = f"Enriched with {enriched_count} fields from detail page"

        return museum, enriched_count

    except Exception as e:
        # Best-effort enrichment: log and return the record unchanged so a
        # single bad page does not abort the whole batch.
        print(f" ✗ Error: {e}")
        return museum, 0
|
|
|
|
|
|
def _completeness_report(enriched):
    """Return per-field completeness counts for enriched museum records."""
    def has_scheme(m, scheme):
        # True when the record carries an identifier with the given scheme.
        return any(i.get('identifier_scheme') == scheme for i in m.get('identifiers', []))

    return {
        'name': sum(1 for m in enriched if m.get('name')),
        'city': sum(1 for m in enriched if m['locations'][0].get('city')),
        'ISIL': sum(1 for m in enriched if has_scheme(m, 'ISIL')),
        'street_address': sum(1 for m in enriched if m['locations'][0].get('street_address')),
        'postal_code': sum(1 for m in enriched if m['locations'][0].get('postal_code')),
        'coordinates': sum(1 for m in enriched if m['locations'][0].get('latitude')),
        'phone': sum(1 for m in enriched if has_scheme(m, 'Phone')),
        'email': sum(1 for m in enriched if has_scheme(m, 'Email')),
        'website': sum(1 for m in enriched if has_scheme(m, 'Website')),
    }


def main():
    """Run the sample enrichment end to end.

    Loads the Bayern museum dump, enriches the first 100 records from
    their registry detail pages, prints completeness statistics, saves
    the enriched sample, and projects results for the full dataset.

    Returns:
        Path to the enriched-sample JSON file, or None when the input
        file contains no records.
    """
    print("=" * 80)
    print("Bavaria Museum Metadata Enrichment - Sample (100 museums)")
    print("=" * 80)
    print()

    # Load museums.  Explicit UTF-8: the records contain German umlauts
    # and must not depend on the platform's default encoding.
    input_file = Path("data/isil/germany/bayern_museums_20251120_213144.json")
    with open(input_file, 'r', encoding='utf-8') as f:
        all_museums = json.load(f)

    # Sample the first 100 museums.
    sample_museums = all_museums[:100]
    sample_size = len(sample_museums)

    # Guard: an empty dump would otherwise divide by zero below.
    if not sample_museums:
        print("No museums found in input file; nothing to do.")
        return None

    print(f"Loaded {len(all_museums)} total museums")
    print(f"Processing sample of {sample_size} museums")
    print()

    enriched = []
    success_count = 0
    total_fields = 0

    print("Enriching museums...")
    for i, museum in enumerate(sample_museums, 1):
        print(f"[{i}/{sample_size}] {museum['name'][:50]:<50}", end=" ")

        enriched_museum, field_count = enrich_museum(museum)
        enriched.append(enriched_museum)

        if field_count > 0:
            success_count += 1
            total_fields += field_count
            print(f"✓ +{field_count} fields")
        else:
            print("○ no new data")

        time.sleep(RATE_LIMIT_DELAY)  # Be polite to the registry server.

    print()
    print("=" * 80)
    print("Enrichment Results")
    print("=" * 80)
    print()
    print(f"Total museums: {sample_size}")
    print(f"Successfully enriched: {success_count} ({success_count/sample_size*100:.1f}%)")
    print(f"Total fields added: {total_fields}")
    print(f"Average fields per museum: {total_fields/sample_size:.1f}")
    print()

    # Metadata completeness per field, as a bar chart.
    completeness = _completeness_report(enriched)

    print(f"Metadata Completeness (Sample of {sample_size}):")
    print()
    for field, count in completeness.items():
        percentage = (count / sample_size) * 100
        filled = int(percentage / 2)  # 50-char bar, 2% per cell
        bar = "█" * filled + "░" * (50 - filled)
        print(f"{field:20s}: {bar} {percentage:5.1f}% ({count}/{sample_size})")

    # Overall completeness across all tracked fields.
    total_possible = sample_size * len(completeness)
    total_filled = sum(completeness.values())
    overall = (total_filled / total_possible) * 100
    print()
    print(f"Overall completeness: {overall:.1f}%")
    print()

    # Save enriched sample with a UTC timestamp in the filename.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = Path(f"data/isil/germany/bayern_museums_enriched_sample_{timestamp}.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(enriched, f, ensure_ascii=False, indent=2)

    print(f"✓ Saved enriched sample: {output_file}")
    print()

    # Projection for the full dataset, derived from the actual input size
    # rather than a hard-coded museum count.
    total = len(all_museums)
    success_rate = success_count / sample_size
    print("=" * 80)
    print(f"Full Dataset Projection ({total:,} museums)")
    print("=" * 80)
    print()
    print(f"Expected success rate: {success_rate*100:.1f}%")
    print(f"Expected enriched museums: ~{int(total * success_rate)}")
    print(f"Expected time (1s delay): ~{total / 60:.0f} minutes")
    print(f"Expected overall completeness: ~{overall:.1f}%")
    print()

    return output_file
|
|
|
|
|
|
# Run the sample enrichment only when executed as a script, so the
# module can also be imported without side effects.
if __name__ == "__main__":
    main()
|