# --- Repository-browser metadata (residue from a web export; commented out
# --- so the file remains valid Python) ---------------------------------------
# glam/scripts/enrich_from_osm.py
# kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
# - Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
# - Added tests for extracted entities and result handling to validate the extraction process.
# - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
# - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
# - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
# 2025-11-19 23:20:47 +01:00
#
# 565 lines, 22 KiB, Python, Executable file
#!/usr/bin/env python3
"""
OpenStreetMap Enrichment Script for Latin American Institutions
Purpose: Fetch OpenStreetMap data for institutions with OSM identifiers and extract:
- Precise building-level coordinates (upgrade from city-level)
- Contact information (phone, email, website if tagged)
- Opening hours (if tagged)
- Street addresses (if more detailed than current data)
- Additional names/tags
Strategy:
1. Load documented Latin American institutions dataset
2. Find all institutions with OpenStreetMap identifiers (currently 186)
3. Fetch OSM data via Overpass API for each OSM ID
4. Parse JSON to extract location and contact metadata
5. Update institution records with enriched data
6. Generate enrichment report
Author: Global GLAM Dataset Project
Date: 2025-11-06
"""
import yaml
import requests
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
from collections import defaultdict
import time
import re
# Overpass API Configuration
# NOTE(review): OVERPASS_URL appears unused — OSMEnricher reads
# OVERPASS_MIRRORS[0] instead; confirm before removing.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
OVERPASS_TIMEOUT = 30  # seconds (used both as HTTP timeout and Overpass [timeout:])
RATE_LIMIT_DELAY = 2.0  # seconds between requests (increased to avoid 429)
MAX_RETRIES = 3  # Retry failed requests up to 3 times
RETRY_DELAY = 5.0  # seconds to wait before retry (doubled on HTTP 429)
# Alternative Overpass instances (if main is down).
# Switching mirrors is manual: OSMEnricher.__init__ picks the first entry.
OVERPASS_MIRRORS = [
    "https://overpass-api.de/api/interpreter",
    "https://overpass.kumi.systems/api/interpreter",
    "https://overpass.openstreetmap.ru/cgi/interpreter"
]
class OSMEnricher:
    """Enriches heritage institution records using OpenStreetMap data"""

    def __init__(self, input_file: Path, output_file: Path):
        """Set up file paths, the Overpass endpoint, and run counters.

        Args:
            input_file: YAML file containing a list of institution dicts.
            output_file: Destination path for the enriched YAML dataset.
        """
        self.input_file = input_file
        self.output_file = output_file
        # Populated by load_institutions(); list of institution dicts.
        self.institutions: List[Dict[str, Any]] = []
        # Primary Overpass mirror; change manually if it is unavailable.
        self.overpass_url = OVERPASS_MIRRORS[0]
        # Counters printed by generate_report() and written into the
        # commented header of the output file.
        self.enrichment_stats = {
            'total_institutions': 0,
            'osm_ids_found': 0,
            'osm_records_fetched': 0,
            'osm_fetch_errors': 0,
            'coordinates_upgraded': 0,  # City-level → Building-level
            'addresses_improved': 0,
            'contact_info_added': 0,
            'opening_hours_added': 0,
            'alternative_names_added': 0,
            'websites_added': 0,
            'institutions_enriched': 0
        }
        # Per-institution log entries:
        # {'institution_name': ..., 'osm_id': ..., 'improvements': [...]}
        self.enrichment_details: List[Dict[str, Any]] = []
def load_institutions(self):
"""Load institutions from YAML file"""
print(f"Loading institutions from {self.input_file}")
with open(self.input_file, 'r', encoding='utf-8') as f:
self.institutions = yaml.safe_load(f)
self.enrichment_stats['total_institutions'] = len(self.institutions)
print(f"Loaded {len(self.institutions)} institutions")
    def fetch_osm_data(self, osm_id: str) -> Optional[Dict[str, Any]]:
        """
        Fetch OSM data via Overpass API with retry logic

        Args:
            osm_id: OpenStreetMap identifier (format: "way/123456" or "node/123456" or "relation/123456");
                a bare number is treated as a node ID

        Returns:
            OSM element data as dictionary or None if fetch failed
        """
        # Parse OSM ID format: split "type/number"; default type is node.
        if '/' in osm_id:
            osm_type, osm_number = osm_id.split('/')
        else:
            # Assume it's just a number, try as node first
            osm_type = 'node'
            osm_number = osm_id
        # Build Overpass QL query.
        # "out center tags;" returns the element's tags plus a computed
        # centroid for ways/relations (consumed by extract_coordinates).
        overpass_query = f"""
[out:json][timeout:{OVERPASS_TIMEOUT}];
{osm_type}({osm_number});
out center tags;
"""
        # Retry logic for transient failures (network errors, 429, 5xx).
        for attempt in range(MAX_RETRIES):
            try:
                if attempt > 0:
                    # Back off before every retry attempt.
                    print(f" Retry {attempt}/{MAX_RETRIES-1}...")
                    time.sleep(RETRY_DELAY)
                response = requests.post(
                    self.overpass_url,
                    data={'data': overpass_query},
                    timeout=OVERPASS_TIMEOUT
                )
                if response.status_code == 200:
                    data = response.json()
                    elements = data.get('elements', [])
                    if elements:
                        return elements[0]  # Return first element
                    else:
                        # Valid response but the ID matched nothing; no retry.
                        print(f" ⚠️ OSM element not found: {osm_type}/{osm_number}")
                        return None
                elif response.status_code == 429:
                    # Rate limit - wait longer (double delay) and retry
                    if attempt < MAX_RETRIES - 1:
                        print(f" ⚠️ Rate limited (429), waiting {RETRY_DELAY*2}s...")
                        time.sleep(RETRY_DELAY * 2)
                        continue
                    else:
                        print(f" ⚠️ OSM fetch failed: HTTP 429 (rate limit)")
                        return None
                elif response.status_code in [504, 503, 502]:
                    # Transient server error - retry (delay happens at loop top)
                    if attempt < MAX_RETRIES - 1:
                        print(f" ⚠️ Server error ({response.status_code}), retrying...")
                        continue
                    else:
                        print(f" ⚠️ OSM fetch failed: HTTP {response.status_code}")
                        return None
                else:
                    # Any other status (4xx etc.) is treated as permanent; no retry.
                    print(f" ⚠️ OSM fetch failed: HTTP {response.status_code}")
                    return None
            except requests.RequestException as e:
                # Connection/timeout problems are retried like server errors.
                if attempt < MAX_RETRIES - 1:
                    print(f" ⚠️ Request error: {e}, retrying...")
                    continue
                else:
                    print(f" ❌ OSM fetch error: {e}")
                    return None
            except json.JSONDecodeError as e:
                # Malformed body on an HTTP 200; retrying is unlikely to help.
                print(f" ❌ JSON parse error: {e}")
                return None
        # Defensive: loop exhausted without returning (should not happen).
        return None
def extract_coordinates(self, osm_element: Dict[str, Any]) -> Optional[Dict[str, float]]:
"""
Extract precise coordinates from OSM element
Returns:
{'latitude': float, 'longitude': float} or None
"""
# For nodes: lat/lon directly
if 'lat' in osm_element and 'lon' in osm_element:
return {
'latitude': osm_element['lat'],
'longitude': osm_element['lon']
}
# For ways/relations: use center coordinates
if 'center' in osm_element:
center = osm_element['center']
if 'lat' in center and 'lon' in center:
return {
'latitude': center['lat'],
'longitude': center['lon']
}
return None
def extract_address(self, tags: Dict[str, str]) -> Dict[str, str]:
"""
Extract address components from OSM tags
Returns:
Dictionary with address fields (street_address, city, postal_code, etc.)
"""
address = {}
# OSM address tags: addr:street, addr:housenumber, addr:city, addr:postcode
street = tags.get('addr:street', '')
housenumber = tags.get('addr:housenumber', '')
if street and housenumber:
address['street_address'] = f"{street} {housenumber}".strip()
elif street:
address['street_address'] = street
if 'addr:city' in tags:
address['city'] = tags['addr:city']
if 'addr:postcode' in tags:
address['postal_code'] = tags['addr:postcode']
if 'addr:state' in tags:
address['region'] = tags['addr:state']
if 'addr:country' in tags:
address['country'] = tags['addr:country']
return address
def extract_contact_info(self, tags: Dict[str, str]) -> Dict[str, str]:
"""
Extract contact information from OSM tags
Returns:
Dictionary with contact:phone, contact:email, contact:website, etc.
"""
contact = {}
# Phone numbers
for key in ['phone', 'contact:phone', 'telephone']:
if key in tags:
contact['phone'] = tags[key]
break
# Email
for key in ['email', 'contact:email']:
if key in tags:
contact['email'] = tags[key]
break
# Website (distinct from identifier URLs)
for key in ['website', 'contact:website', 'url']:
if key in tags:
contact['website'] = tags[key]
break
# Opening hours
if 'opening_hours' in tags:
contact['opening_hours'] = tags['opening_hours']
return contact
def extract_alternative_names(self, tags: Dict[str, str]) -> List[str]:
"""Extract alternative name variants from OSM tags"""
names = []
# Common name tags in OSM
name_keys = [
'alt_name', 'official_name', 'short_name', 'old_name',
'name:en', 'name:es', 'name:pt' # Common languages for Latin America
]
for key in name_keys:
if key in tags and tags[key]:
name = tags[key].strip()
if name and name not in names:
names.append(name)
return names
def enrich_institution(self, institution: Dict[str, Any]) -> bool:
"""
Enrich a single institution with OSM data
Returns:
True if enrichment occurred, False otherwise
"""
# Find OSM identifier
osm_id = None
identifiers = institution.get('identifiers', [])
for identifier in identifiers:
if identifier.get('identifier_scheme') == 'OpenStreetMap':
osm_id = identifier.get('identifier_value')
break
if not osm_id:
return False
self.enrichment_stats['osm_ids_found'] += 1
print(f"\n🗺️ Enriching: {institution.get('name')} (OSM {osm_id})")
# Fetch OSM data
osm_element = self.fetch_osm_data(osm_id)
if osm_element is None:
self.enrichment_stats['osm_fetch_errors'] += 1
return False
self.enrichment_stats['osm_records_fetched'] += 1
enriched = False
enrichment_log = {
'institution_name': institution.get('name'),
'osm_id': osm_id,
'improvements': []
}
tags = osm_element.get('tags', {})
# Extract and update coordinates
coords = self.extract_coordinates(osm_element)
if coords:
locations = institution.get('locations', [])
if locations:
# Check if we're upgrading from city-level to building-level
current_location = locations[0]
current_lat = current_location.get('latitude')
current_lon = current_location.get('longitude')
# If coordinates differ significantly (>0.001 degrees ≈ 100m), it's an upgrade
if current_lat and current_lon:
lat_diff = abs(coords['latitude'] - current_lat)
lon_diff = abs(coords['longitude'] - current_lon)
if lat_diff > 0.001 or lon_diff > 0.001:
print(f" ✅ Upgraded coordinates: precision improved")
current_location['latitude'] = coords['latitude']
current_location['longitude'] = coords['longitude']
self.enrichment_stats['coordinates_upgraded'] += 1
enrichment_log['improvements'].append('Coordinates upgraded to building-level precision')
enriched = True
else:
# No coordinates yet, add them
print(f" ✅ Added coordinates: {coords['latitude']}, {coords['longitude']}")
current_location['latitude'] = coords['latitude']
current_location['longitude'] = coords['longitude']
self.enrichment_stats['coordinates_upgraded'] += 1
enrichment_log['improvements'].append('Added building coordinates')
enriched = True
# Extract and update address
address = self.extract_address(tags)
if address:
locations = institution.get('locations', [])
if locations:
current_location = locations[0]
# Add street address if better than current
if 'street_address' in address and not current_location.get('street_address'):
print(f" ✅ Added street address: {address['street_address']}")
current_location['street_address'] = address['street_address']
self.enrichment_stats['addresses_improved'] += 1
enrichment_log['improvements'].append(f"Street address: {address['street_address']}")
enriched = True
# Add postal code if missing
if 'postal_code' in address and not current_location.get('postal_code'):
print(f" ✅ Added postal code: {address['postal_code']}")
current_location['postal_code'] = address['postal_code']
enrichment_log['improvements'].append(f"Postal code: {address['postal_code']}")
enriched = True
# Extract contact information
contact = self.extract_contact_info(tags)
# Add phone number as identifier
if 'phone' in contact:
has_phone = any(
id.get('identifier_scheme') == 'Phone'
for id in identifiers
)
if not has_phone:
print(f" ✅ Added phone: {contact['phone']}")
identifiers.append({
'identifier_scheme': 'Phone',
'identifier_value': contact['phone'],
'identifier_url': None
})
self.enrichment_stats['contact_info_added'] += 1
enrichment_log['improvements'].append(f"Phone: {contact['phone']}")
enriched = True
# Add email as identifier
if 'email' in contact:
has_email = any(
id.get('identifier_scheme') == 'Email'
for id in identifiers
)
if not has_email:
print(f" ✅ Added email: {contact['email']}")
identifiers.append({
'identifier_scheme': 'Email',
'identifier_value': contact['email'],
'identifier_url': f"mailto:{contact['email']}"
})
self.enrichment_stats['contact_info_added'] += 1
enrichment_log['improvements'].append(f"Email: {contact['email']}")
enriched = True
# Add website if different from existing
if 'website' in contact:
existing_websites = [
id.get('identifier_value')
for id in identifiers
if id.get('identifier_scheme') == 'Website'
]
if contact['website'] not in existing_websites:
print(f" ✅ Added website: {contact['website']}")
identifiers.append({
'identifier_scheme': 'Website',
'identifier_value': contact['website'],
'identifier_url': contact['website']
})
self.enrichment_stats['websites_added'] += 1
enrichment_log['improvements'].append(f"Website: {contact['website']}")
enriched = True
# Add opening hours to description or as note
if 'opening_hours' in contact:
# Add to description
description = institution.get('description', '')
hours_text = f"Opening hours: {contact['opening_hours']}"
if hours_text not in description:
print(f" ✅ Added opening hours: {contact['opening_hours']}")
if description:
institution['description'] = f"{description} {hours_text}"
else:
institution['description'] = hours_text
self.enrichment_stats['opening_hours_added'] += 1
enrichment_log['improvements'].append(f"Opening hours: {contact['opening_hours']}")
enriched = True
# Extract alternative names
alt_names = self.extract_alternative_names(tags)
if alt_names:
existing_alt_names = institution.get('alternative_names', [])
new_names = [name for name in alt_names if name not in existing_alt_names]
if new_names:
print(f" ✅ Found {len(new_names)} alternative names")
institution['alternative_names'] = existing_alt_names + new_names
self.enrichment_stats['alternative_names_added'] += len(new_names)
enrichment_log['improvements'].append(f"Alternative names: {', '.join(new_names[:3])}")
enriched = True
if enriched:
self.enrichment_stats['institutions_enriched'] += 1
self.enrichment_details.append(enrichment_log)
# Update provenance
if 'provenance' in institution:
existing_notes = institution['provenance'].get('notes', '')
osm_note = f"\nOpenStreetMap enrichment (2025-11-06): Fetched OSM element {osm_id}. "
osm_note += f"Improvements: {', '.join(enrichment_log['improvements'][:3])}."
institution['provenance']['notes'] = (existing_notes + osm_note).strip()
return enriched
def process_all_institutions(self):
"""Process all institutions and enrich from OpenStreetMap"""
print(f"\n{'='*70}")
print("OpenStreetMap Enrichment Process")
print(f"{'='*70}\n")
for idx, institution in enumerate(self.institutions, 1):
enriched = self.enrich_institution(institution)
if enriched:
print(f" ✅ Enrichment successful")
# Rate limiting
if idx < len(self.institutions):
time.sleep(RATE_LIMIT_DELAY)
print(f"\n{'='*70}")
print("OpenStreetMap Enrichment Complete")
print(f"{'='*70}\n")
def save_enriched_dataset(self):
"""Save enriched institutions to output file"""
print(f"Saving enriched dataset to {self.output_file}")
with open(self.output_file, 'w', encoding='utf-8') as f:
f.write("---\n")
f.write("# Latin American GLAM Institutions - OpenStreetMap Enriched\n")
f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
f.write("#\n")
f.write("# OpenStreetMap Enrichment Summary:\n")
for key, value in self.enrichment_stats.items():
f.write(f"# - {key}: {value}\n")
f.write("\n")
yaml.dump(self.institutions, f, allow_unicode=True, sort_keys=False)
print(f"✅ Saved {len(self.institutions)} institutions")
def generate_report(self):
"""Generate enrichment report"""
print("\n" + "="*70)
print("OPENSTREETMAP ENRICHMENT REPORT")
print("="*70 + "\n")
print(f"Total institutions processed: {self.enrichment_stats['total_institutions']}")
print(f"Institutions with OSM IDs: {self.enrichment_stats['osm_ids_found']}")
print(f"OSM records successfully fetched: {self.enrichment_stats['osm_records_fetched']}")
print(f"OSM fetch errors: {self.enrichment_stats['osm_fetch_errors']}")
print(f"\nEnrichment Results:")
print(f" Coordinates upgraded: {self.enrichment_stats['coordinates_upgraded']}")
print(f" Addresses improved: {self.enrichment_stats['addresses_improved']}")
print(f" Contact info added: {self.enrichment_stats['contact_info_added']}")
print(f" Opening hours added: {self.enrichment_stats['opening_hours_added']}")
print(f" Alternative names added: {self.enrichment_stats['alternative_names_added']}")
print(f" Websites added: {self.enrichment_stats['websites_added']}")
print(f" Institutions enriched: {self.enrichment_stats['institutions_enriched']}")
if self.enrichment_details:
print(f"\nDetailed Enrichment Log (showing first 10):")
for detail in self.enrichment_details[:10]:
print(f"\n {detail['institution_name']} (OSM {detail['osm_id']})")
for improvement in detail['improvements'][:3]:
print(f" + {improvement}")
print("\n" + "="*70 + "\n")
def main():
    """Entry point: load, enrich, save, and report.

    Returns:
        Process exit code: 0 on success, 1 when the input dataset is missing.
    """
    # Resolve dataset paths relative to the repository layout.
    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "data" / "instances"
    input_file = data_dir / "latin_american_institutions_documented.yaml"
    output_file = data_dir / "latin_american_institutions_osm_enriched.yaml"
    # Bail out early when the documented dataset is absent.
    if not input_file.exists():
        print(f"❌ Error: Input file not found: {input_file}")
        print(" Please ensure the documented dataset exists.")
        return 1
    enricher = OSMEnricher(input_file, output_file)
    enricher.load_institutions()
    enricher.process_all_institutions()
    enricher.save_enriched_dataset()
    enricher.generate_report()
    print(f"✅ OpenStreetMap enrichment complete!")
    print(f" Input: {input_file}")
    print(f" Output: {output_file}")
    return 0
if __name__ == '__main__':
    # Raise SystemExit directly: the site-injected exit() helper is not
    # guaranteed to exist in every runtime; behavior is otherwise identical.
    raise SystemExit(main())