glam/scripts/enrich_from_osm_batched.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

418 lines
17 KiB
Python
Executable file

#!/usr/bin/env python3
"""
OpenStreetMap Enrichment Script - BATCHED VERSION
Saves progress every 20 institutions to avoid data loss on timeout.
"""
import yaml
import requests
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
from collections import defaultdict
import time
import re
# Configuration
# NOTE(review): OVERPASS_URL appears unused — get_overpass_url() reads
# OVERPASS_MIRRORS instead; confirm before removing.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
# Per-query timeout (seconds) embedded in the Overpass query; the HTTP
# request uses this value + 5 as its own timeout.
OVERPASS_TIMEOUT = 30
# Pause between consecutive institutions, to stay polite to the public API.
RATE_LIMIT_DELAY = 2.0
# Retry policy for rate limits (429), server errors (5xx) and timeouts.
MAX_RETRIES = 3
RETRY_DELAY = 5.0
BATCH_SIZE = 20 # Save progress every 20 institutions
# Fallback Overpass endpoints; rotated on 502/503/504 responses.
OVERPASS_MIRRORS = [
"https://overpass-api.de/api/interpreter",
"https://overpass.kumi.systems/api/interpreter",
"https://overpass.openstreetmap.ru/cgi/interpreter"
]
class OSMEnricher:
    """Enriches heritage institution records using OpenStreetMap data.

    Workflow: load a YAML list of institution dicts, fetch each record's
    OpenStreetMap element via the Overpass API, merge selected OSM tags
    (coordinates, address, contact info, website, opening hours, alternative
    names) into the record in place, and save progress in batches so a
    timeout does not lose completed work.
    """

    def __init__(self, input_file: Path, output_file: Path):
        """
        Args:
            input_file: YAML file containing a list of institution dicts.
            output_file: Destination YAML file for the enriched records.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.institutions: List[Dict] = []
        # Per-metric counters; missing keys read as 0.
        self.enrichment_stats = defaultdict(int)
        # One entry per enriched institution: name, osm_id, improvements.
        self.enrichment_details: List[Dict] = []
        # Index into OVERPASS_MIRRORS, advanced on 502/503/504 for failover.
        self.current_mirror = 0

    def load_institutions(self):
        """Load institutions from the input YAML file.

        Raises:
            ValueError: If the YAML document is not a list.
        """
        print(f"Loading institutions from {self.input_file}")
        with open(self.input_file, 'r', encoding='utf-8') as f:
            self.institutions = yaml.safe_load(f)
        if not isinstance(self.institutions, list):
            raise ValueError("Expected list of institutions in YAML file")
        self.enrichment_stats['total_institutions'] = len(self.institutions)
        print(f"✅ Loaded {len(self.institutions)} institutions\n")

    def get_overpass_url(self):
        """Return the current Overpass API endpoint (round-robin failover)."""
        return OVERPASS_MIRRORS[self.current_mirror % len(OVERPASS_MIRRORS)]

    def fetch_osm_element(self, osm_id: str, retry_count: int = 0) -> Optional[Dict]:
        """Fetch a single OSM element via the Overpass API.

        Args:
            osm_id: Identifier in "node/123", "way/456" or "relation/789" form.
            retry_count: Current retry depth (internal; capped at MAX_RETRIES).

        Returns:
            The element dict (tags, plus lat/lon for nodes or a 'center'
            coordinate for ways/relations), or None on any failure.
        """
        match = re.match(r'(node|way|relation)/(\d+)', osm_id)
        if not match:
            print(f" ⚠️ Invalid OSM ID format: {osm_id}")
            return None
        element_type, element_id = match.groups()
        # BUG FIX: the previous query used "out body; >; out skel qt;", which
        # recursed into member nodes (never read by this script) and provided
        # no coordinates for way/relation elements, since only nodes carry a
        # top-level lat/lon. "out body center" asks Overpass to compute a
        # center point for ways/relations instead.
        query = f"""
        [out:json][timeout:{OVERPASS_TIMEOUT}];
        {element_type}({element_id});
        out body center;
        """
        try:
            url = self.get_overpass_url()
            response = requests.post(
                url,
                data={'data': query},
                timeout=OVERPASS_TIMEOUT + 5
            )
            if response.status_code == 200:
                data = response.json()
                if data.get('elements'):
                    # The query selects a single element, so the first (and
                    # only) entry is the requested one.
                    return data['elements'][0]
                print(f" ⚠️ No data returned for OSM {osm_id}")
                return None
            elif response.status_code == 429:
                # Rate limited: back off, then retry against the same mirror.
                if retry_count < MAX_RETRIES:
                    wait_time = RETRY_DELAY * 2
                    print(f" ⚠️ Rate limited (429). Waiting {wait_time}s before retry {retry_count+1}/{MAX_RETRIES}")
                    time.sleep(wait_time)
                    return self.fetch_osm_element(osm_id, retry_count + 1)
                print(f" ❌ Rate limit exceeded after {MAX_RETRIES} retries")
                return None
            elif response.status_code in (502, 503, 504):
                # Server-side error: fail over to the next mirror and retry.
                if retry_count < MAX_RETRIES:
                    self.current_mirror += 1
                    new_url = self.get_overpass_url()
                    print(f" ⚠️ Server error ({response.status_code}). Switching to {new_url}")
                    time.sleep(RETRY_DELAY)
                    return self.fetch_osm_element(osm_id, retry_count + 1)
                print(f" ❌ Server error after {MAX_RETRIES} retries")
                return None
            else:
                print(f" ❌ HTTP {response.status_code}: {response.text[:100]}")
                return None
        except requests.Timeout:
            if retry_count < MAX_RETRIES:
                print(f" ⚠️ Timeout. Retry {retry_count+1}/{MAX_RETRIES}")
                time.sleep(RETRY_DELAY)
                return self.fetch_osm_element(osm_id, retry_count + 1)
            print(f" ❌ Timeout after {MAX_RETRIES} retries")
            return None
        except Exception as e:
            # Best-effort: any other failure (connection error, bad JSON)
            # is logged and treated as "no data" for this institution.
            print(f" ❌ Error fetching OSM data: {e}")
            return None

    def calculate_distance(self, lat1: float, lon1: float, lat2: float, lon2: float) -> float:
        """Return the great-circle distance in meters between two points.

        Uses the Haversine formula with a mean Earth radius of 6,371 km.
        """
        from math import radians, sin, cos, sqrt, atan2
        R = 6371000  # Earth radius in meters
        lat1_rad = radians(lat1)
        lat2_rad = radians(lat2)
        dlat = radians(lat2 - lat1)
        dlon = radians(lon2 - lon1)
        a = sin(dlat/2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlon/2)**2
        c = 2 * atan2(sqrt(a), sqrt(1-a))
        return R * c

    def enrich_institution(self, institution: Dict) -> bool:
        """Enrich a single institution record in place from its OSM element.

        Args:
            institution: Mutable institution dict (schema: name, identifiers,
                locations, description, alternative_names, provenance).

        Returns:
            True if at least one field was improved; False when the record
            has no OpenStreetMap identifier, the fetch failed, or nothing
            new could be added.
        """
        identifiers = institution.get('identifiers', [])
        osm_ids = [id for id in identifiers if id.get('identifier_scheme') == 'OpenStreetMap']
        if not osm_ids:
            return False
        self.enrichment_stats['osm_ids_found'] += 1
        osm_id = osm_ids[0]['identifier_value']
        inst_name = institution.get('name', 'Unknown')
        print(f"\n[{self.enrichment_stats['osm_ids_found']}] {inst_name}")
        print(f" OSM ID: {osm_id}")
        # Fetch OSM data
        osm_data = self.fetch_osm_element(osm_id)
        if not osm_data:
            self.enrichment_stats['osm_fetch_errors'] += 1
            return False
        self.enrichment_stats['osm_records_fetched'] += 1
        tags = osm_data.get('tags', {})
        enriched = False
        enrichment_log = {
            'institution_name': inst_name,
            'osm_id': osm_id,
            'improvements': []
        }
        # 1. Coordinates upgrade. Nodes carry lat/lon directly; ways and
        # relations expose them via the 'center' member requested in the
        # Overpass query (previously those element types were silently
        # skipped here).
        center = osm_data.get('center', {})
        raw_lat = osm_data.get('lat', center.get('lat'))
        raw_lon = osm_data.get('lon', center.get('lon'))
        if raw_lat is not None and raw_lon is not None:
            osm_lat = float(raw_lat)
            osm_lon = float(raw_lon)
            locations = institution.get('locations', [])
            if locations and locations[0].get('latitude') and locations[0].get('longitude'):
                current_lat = locations[0]['latitude']
                current_lon = locations[0]['longitude']
                distance = self.calculate_distance(current_lat, current_lon, osm_lat, osm_lon)
                if distance > 100:  # more than 100m difference = upgrade
                    locations[0]['latitude'] = osm_lat
                    locations[0]['longitude'] = osm_lon
                    enriched = True
                    self.enrichment_stats['coordinates_upgraded'] += 1
                    enrichment_log['improvements'].append(
                        f"Upgraded coordinates (precision improved by {int(distance)}m)"
                    )
            elif locations:
                # Record has a location entry but no coordinates yet.
                locations[0]['latitude'] = osm_lat
                locations[0]['longitude'] = osm_lon
                enriched = True
                self.enrichment_stats['coordinates_upgraded'] += 1
                enrichment_log['improvements'].append("Added precise coordinates from OSM")
        # 2. Street address
        addr_street = tags.get('addr:street')
        addr_housenumber = tags.get('addr:housenumber')
        addr_postcode = tags.get('addr:postcode')
        if addr_street or addr_housenumber or addr_postcode:
            locations = institution.get('locations', [])
            if locations:
                location = locations[0]
                if addr_street and addr_housenumber:
                    full_address = f"{addr_street} {addr_housenumber}"
                    # Prefer the longer (presumably more complete) address.
                    if not location.get('street_address') or len(full_address) > len(location.get('street_address', '')):
                        location['street_address'] = full_address
                        enriched = True
                        self.enrichment_stats['addresses_improved'] += 1
                        enrichment_log['improvements'].append(f"Added street address: {full_address}")
                if addr_postcode and not location.get('postal_code'):
                    location['postal_code'] = addr_postcode
                    enriched = True
                    enrichment_log['improvements'].append(f"Added postal code: {addr_postcode}")
        # 3. Contact information. OSM uses both bare and "contact:"-prefixed
        # tag variants; either is accepted.
        phone = tags.get('phone') or tags.get('contact:phone')
        email = tags.get('email') or tags.get('contact:email')
        website = tags.get('website') or tags.get('url') or tags.get('contact:website')
        if phone or email:
            # Store in description for now (no dedicated contact field in schema)
            contact_info = []
            if phone:
                contact_info.append(f"Phone: {phone}")
            if email:
                contact_info.append(f"Email: {email}")
            if contact_info:
                contact_text = " | ".join(contact_info)
                current_desc = institution.get('description', '')
                # Guard against appending the same contact line twice on re-runs.
                if contact_text not in current_desc:
                    institution['description'] = (current_desc + f"\n\nContact: {contact_text}").strip()
                    enriched = True
                    self.enrichment_stats['contact_info_added'] += 1
                    enrichment_log['improvements'].append(f"Added contact info: {contact_text}")
        # 4. Website
        if website:
            identifiers = institution.get('identifiers', [])
            website_ids = [id for id in identifiers if id.get('identifier_scheme') == 'Website']
            if not website_ids:
                identifiers.append({
                    'identifier_scheme': 'Website',
                    'identifier_value': website,
                    'identifier_url': website
                })
                institution['identifiers'] = identifiers
                enriched = True
                self.enrichment_stats['websites_added'] += 1
                enrichment_log['improvements'].append(f"Added website: {website}")
        # 5. Opening hours
        opening_hours = tags.get('opening_hours')
        if opening_hours:
            current_desc = institution.get('description', '')
            hours_text = f"Opening hours: {opening_hours}"
            if hours_text not in current_desc:
                institution['description'] = (current_desc + f"\n\n{hours_text}").strip()
                enriched = True
                self.enrichment_stats['opening_hours_added'] += 1
                enrichment_log['improvements'].append(f"Added opening hours: {opening_hours}")
        # 6. Alternative names
        alt_names = []
        for key in ['alt_name', 'official_name', 'name:en', 'name:es', 'name:pt']:
            if key in tags and tags[key] != institution.get('name'):
                alt_names.append(tags[key])
        if alt_names:
            existing_alt_names = institution.get('alternative_names', [])
            new_alt_names = [n for n in alt_names if n not in existing_alt_names]
            if new_alt_names:
                institution['alternative_names'] = existing_alt_names + new_alt_names
                enriched = True
                self.enrichment_stats['alternative_names_added'] += len(new_alt_names)
                enrichment_log['improvements'].append(
                    f"Added {len(new_alt_names)} alternative names"
                )
        # Update provenance
        if enriched:
            self.enrichment_stats['institutions_enriched'] += 1
            self.enrichment_details.append(enrichment_log)
            if 'provenance' in institution:
                existing_notes = institution['provenance'].get('notes', '')
                # BUG FIX: the enrichment date was hard-coded ("2025-11-06");
                # record the actual run date instead.
                run_date = datetime.now(timezone.utc).date().isoformat()
                osm_note = f"\nOpenStreetMap enrichment ({run_date}): Fetched OSM element {osm_id}. "
                osm_note += f"Improvements: {', '.join(enrichment_log['improvements'][:3])}."
                institution['provenance']['notes'] = (existing_notes + osm_note).strip()
        return enriched

    def save_progress(self, batch_num: int):
        """Write the full (partially enriched) dataset to the output file.

        Rewrites the whole file each time, prefixed with a comment header
        carrying the current enrichment statistics.

        Args:
            batch_num: 1-based batch counter recorded in the header.
        """
        print(f"\n💾 Saving progress (batch {batch_num})...")
        with open(self.output_file, 'w', encoding='utf-8') as f:
            f.write("---\n")
            f.write("# Latin American GLAM Institutions - OpenStreetMap Enriched\n")
            f.write(f"# Last updated: {datetime.now(timezone.utc).isoformat()}\n")
            f.write(f"# Batch: {batch_num}\n")
            f.write("#\n")
            f.write("# OpenStreetMap Enrichment Summary (partial):\n")
            for key, value in self.enrichment_stats.items():
                f.write(f"# - {key}: {value}\n")
            f.write("\n")
            yaml.dump(self.institutions, f, allow_unicode=True, sort_keys=False)
        print(f"✅ Saved progress: {self.enrichment_stats['institutions_enriched']} institutions enriched so far\n")

    def process_all_institutions(self):
        """Enrich every loaded institution, saving progress every BATCH_SIZE."""
        print(f"\n{'='*70}")
        print("OpenStreetMap Enrichment Process (BATCHED)")
        print(f"{'='*70}\n")
        batch_num = 0
        total = len(self.institutions)
        for idx, institution in enumerate(self.institutions, 1):
            fetches_before = self.enrichment_stats['osm_ids_found']
            enriched = self.enrich_institution(institution)
            if enriched:
                print(f" ✅ Enrichment successful")
            # Save progress every BATCH_SIZE institutions
            if idx % BATCH_SIZE == 0:
                batch_num += 1
                self.save_progress(batch_num)
            # PERF FIX: rate-limit only when an Overpass request was actually
            # made; previously the script slept for every record, including
            # those without an OSM identifier.
            if idx < total and self.enrichment_stats['osm_ids_found'] > fetches_before:
                time.sleep(RATE_LIMIT_DELAY)
        # Flush the final partial batch (full batches were saved in-loop).
        if total % BATCH_SIZE != 0:
            batch_num += 1
            self.save_progress(batch_num)
        print(f"\n{'='*70}")
        print("OpenStreetMap Enrichment Complete")
        print(f"{'='*70}\n")

    def generate_report(self):
        """Print a summary of the enrichment statistics and a detail log."""
        print("\n" + "="*70)
        print("OPENSTREETMAP ENRICHMENT REPORT")
        print("="*70 + "\n")
        print(f"Total institutions processed: {self.enrichment_stats['total_institutions']}")
        print(f"Institutions with OSM IDs: {self.enrichment_stats['osm_ids_found']}")
        print(f"OSM records successfully fetched: {self.enrichment_stats['osm_records_fetched']}")
        print(f"OSM fetch errors: {self.enrichment_stats['osm_fetch_errors']}")
        print(f"\nEnrichment Results:")
        print(f" Coordinates upgraded: {self.enrichment_stats['coordinates_upgraded']}")
        print(f" Addresses improved: {self.enrichment_stats['addresses_improved']}")
        print(f" Contact info added: {self.enrichment_stats['contact_info_added']}")
        print(f" Opening hours added: {self.enrichment_stats['opening_hours_added']}")
        print(f" Alternative names added: {self.enrichment_stats['alternative_names_added']}")
        print(f" Websites added: {self.enrichment_stats['websites_added']}")
        print(f" Institutions enriched: {self.enrichment_stats['institutions_enriched']}")
        if self.enrichment_details:
            print(f"\nDetailed Enrichment Log (showing first 20):")
            for detail in self.enrichment_details[:20]:
                print(f"\n {detail['institution_name']} (OSM {detail['osm_id']})")
                for improvement in detail['improvements'][:5]:
                    print(f" + {improvement}")
        print("\n" + "="*70 + "\n")
def main():
    """Main execution: load, enrich, and report on the institutions file.

    Returns:
        0 on success, 1 when the input file is missing.
    """
    instances_dir = Path(__file__).parent.parent / "data" / "instances"
    source_path = instances_dir / "latin_american_institutions_documented.yaml"
    target_path = instances_dir / "latin_american_institutions_osm_enriched.yaml"
    if not source_path.exists():
        print(f"❌ Error: Input file not found: {source_path}")
        return 1
    enricher = OSMEnricher(source_path, target_path)
    # Run the pipeline stages in order.
    for stage in (enricher.load_institutions,
                  enricher.process_all_institutions,
                  enricher.generate_report):
        stage()
    print(f"\n✅ Enrichment complete! Output saved to:")
    print(f" {target_path}")
    return 0
if __name__ == "__main__":
    # Use SystemExit rather than the builtin exit(): exit() is injected by
    # the site module and is not guaranteed to exist (e.g. under python -S
    # or in frozen builds).
    raise SystemExit(main())