glam/scripts/resume_osm_enrichment.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

313 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Resume OSM Enrichment - Process institutions 101-304
"""
import yaml
import requests
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from collections import defaultdict
import time
import re
# Configuration
# Overpass endpoint plus pacing/checkpoint knobs for this resumed run.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
OVERPASS_TIMEOUT = 30
RATE_LIMIT_DELAY = 3.0  # Increased to 3 seconds
MAX_RETRIES = 3
RETRY_DELAY = 10.0  # Increased to 10 seconds
BATCH_SIZE = 20
START_INDEX = 100  # Resume from institution 101 (0-indexed = 100)
print(f"Resuming OSM enrichment from institution {START_INDEX + 1}")
print(f"Rate limit: {RATE_LIMIT_DELAY}s between requests")
print(f"Batch size: {BATCH_SIZE} institutions\n")
# Load partially enriched dataset
base_dir = Path(__file__).parent.parent
enriched_file = base_dir / "data" / "instances" / "latin_american_institutions_osm_enriched.yaml"
print(f"Loading {enriched_file}")
content = enriched_file.read_text(encoding='utf-8')
# Skip header: the file starts with a "---"/comment banner separated from the
# YAML body by a blank line; fall back to the whole file if none is present.
_banner, _sep, _body = content.partition('\n\n')
yaml_content = _body if _sep else content
institutions = yaml.safe_load(yaml_content)
print(f"Loaded {len(institutions)} institutions")
print(f"Will process institutions {START_INDEX + 1} to {len(institutions)}\n")
# Stats
stats = defaultdict(int)  # every counter starts at 0 on first touch
stats['total_institutions'] = len(institutions)
details = []  # per-institution improvement records collected during the run
def fetch_osm_element(osm_id: str, retry_count: int = 0) -> Optional[Dict]:
    """Fetch a single OSM element via the Overpass API.

    Args:
        osm_id: OSM identifier in "type/id" form, e.g. "node/123" or
            "way/4567". Anything that does not match returns None immediately
            (no network call).
        retry_count: Internal retry counter; callers should leave it at 0.

    Returns:
        The first element dict of the Overpass response (the requested
        element itself, since ``out body`` precedes the recursed skeleton
        output), or None on bad input, missing element, exhausted retries,
        or any request error.
    """
    match = re.match(r'(node|way|relation)/(\d+)', osm_id)
    if not match:
        return None
    element_type, element_id = match.groups()
    # "out body" emits the element itself; ">" recurses down to member
    # nodes, emitted as a skeleton so way/relation geometry is resolvable.
    query = f"""
[out:json][timeout:{OVERPASS_TIMEOUT}];
{element_type}({element_id});
out body;
>;
out skel qt;
"""
    try:
        response = requests.post(
            OVERPASS_URL,
            data={'data': query},
            timeout=OVERPASS_TIMEOUT + 5
        )
        if response.status_code == 200:
            data = response.json()
            # FIX: the previous `data.get('elements', [None])[0]` raised
            # IndexError when Overpass returned an empty element list (e.g.
            # a deleted element); it was only rescued by the broad except
            # below, printing a misleading error. Handle it explicitly.
            elements = data.get('elements')
            return elements[0] if elements else None
        elif response.status_code == 429 and retry_count < MAX_RETRIES:
            # Overpass rate limiting: back off, then retry recursively.
            print(f" ⚠️ Rate limited. Waiting {RETRY_DELAY}s...")
            time.sleep(RETRY_DELAY)
            return fetch_osm_element(osm_id, retry_count + 1)
        elif response.status_code in [502, 503, 504] and retry_count < MAX_RETRIES:
            # Transient gateway/overload errors: retry after a delay.
            print(f" ⚠️ Server error {response.status_code}. Retry {retry_count+1}/{MAX_RETRIES}")
            time.sleep(RETRY_DELAY)
            return fetch_osm_element(osm_id, retry_count + 1)
        else:
            print(f" ❌ HTTP {response.status_code}")
            return None
    except requests.Timeout:
        if retry_count < MAX_RETRIES:
            print(f" ⚠️ Timeout. Retry {retry_count+1}/{MAX_RETRIES}")
            time.sleep(RETRY_DELAY)
            return fetch_osm_element(osm_id, retry_count + 1)
        return None
    except Exception as e:
        # Best-effort fetch: log and return None rather than aborting the run.
        print(f" ❌ Error: {e}")
        return None
def calculate_distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in metres between two points.

    Args:
        lat1, lon1: First point in decimal degrees.
        lat2, lon2: Second point in decimal degrees.

    Returns:
        Distance in metres as a float.
    """
    from math import radians, sin, cos, sqrt, atan2
    R = 6371000  # mean Earth radius in metres
    phi1, phi2 = radians(lat1), radians(lat2)
    # BUG FIX: the old code rebound lat1/lat2 to radians first and then
    # applied radians() to their difference a second time, shrinking the
    # latitude delta by a factor of pi/180 (~57x too small). Compute both
    # deltas from the original degree inputs instead.
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlon / 2) ** 2
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))
# Process institutions from START_INDEX onwards.
# Walks the tail of the dataset, enriching each institution dict in place from
# its OpenStreetMap record, and checkpoints the whole YAML file every
# BATCH_SIZE processed entries.
batch_num = 5 # Continue from batch 5 (batches 1-4 were written by the earlier run)
processed_count = 0  # counts institutions that had an OSM ID; drives batch saves
for idx in range(START_INDEX, len(institutions)):
    institution = institutions[idx]
    # Check for OSM ID; institutions without one are skipped entirely and do
    # NOT advance processed_count (so they never trigger a batch save).
    identifiers = institution.get('identifiers', [])
    osm_ids = [id for id in identifiers if id.get('identifier_scheme') == 'OpenStreetMap']
    if not osm_ids:
        continue
    stats['osm_ids_found'] += 1
    osm_id = osm_ids[0]['identifier_value']  # e.g. "node/123" or "way/456"
    inst_name = institution.get('name', 'Unknown')
    print(f"\n[{idx + 1}/{len(institutions)}] {inst_name}")
    print(f" OSM ID: {osm_id}")
    # Fetch OSM data; on failure still count the item and keep rate limiting.
    osm_data = fetch_osm_element(osm_id)
    if not osm_data:
        stats['osm_fetch_errors'] += 1
        processed_count += 1
        time.sleep(RATE_LIMIT_DELAY)
        continue
    stats['osm_records_fetched'] += 1
    tags = osm_data.get('tags', {})
    enriched = False   # flipped True by any successful enrichment below
    improvements = []  # human-readable notes for provenance and logging
    # 1. Coordinates
    # Only node elements carry top-level lat/lon; ways/relations skip this.
    if 'lat' in osm_data and 'lon' in osm_data:
        osm_lat, osm_lon = float(osm_data['lat']), float(osm_data['lon'])
        locations = institution.get('locations', [])
        if locations:
            if locations[0].get('latitude') and locations[0].get('longitude'):
                # Replace existing coordinates only when they disagree with
                # OSM by more than 100 m.
                distance = calculate_distance(
                    locations[0]['latitude'], locations[0]['longitude'],
                    osm_lat, osm_lon
                )
                if distance > 100:
                    locations[0]['latitude'] = osm_lat
                    locations[0]['longitude'] = osm_lon
                    enriched = True
                    stats['coordinates_upgraded'] += 1
                    improvements.append(f"Coordinates upgraded ({int(distance)}m)")
            else:
                # No coordinates recorded yet: adopt OSM's.
                locations[0]['latitude'] = osm_lat
                locations[0]['longitude'] = osm_lon
                enriched = True
                stats['coordinates_upgraded'] += 1
                improvements.append("Added coordinates")
    # 2. Address
    addr_street = tags.get('addr:street')
    addr_num = tags.get('addr:housenumber')
    addr_postcode = tags.get('addr:postcode')
    if (addr_street or addr_postcode) and institution.get('locations'):
        location = institution['locations'][0]
        if addr_street and addr_num:
            full_address = f"{addr_street} {addr_num}"
            # Prefer the longer (presumably more complete) street address.
            if not location.get('street_address') or len(full_address) > len(location.get('street_address', '')):
                location['street_address'] = full_address
                enriched = True
                stats['addresses_improved'] += 1
                improvements.append(f"Address: {full_address}")
        if addr_postcode and not location.get('postal_code'):
            location['postal_code'] = addr_postcode
            enriched = True
            improvements.append(f"Postcode: {addr_postcode}")
    # 3. Contact info — appended to the free-text description, once only
    # (the substring check keeps re-runs idempotent).
    phone = tags.get('phone') or tags.get('contact:phone')
    email = tags.get('email') or tags.get('contact:email')
    if phone or email:
        contact_info = []
        if phone:
            contact_info.append(f"Phone: {phone}")
        if email:
            contact_info.append(f"Email: {email}")
        contact_text = " | ".join(contact_info)
        current_desc = institution.get('description', '')
        if contact_text not in current_desc:
            institution['description'] = (current_desc + f"\n\nContact: {contact_text}").strip()
            enriched = True
            stats['contact_info_added'] += 1
            improvements.append(f"Contact: {contact_text}")
    # 4. Website — added as a new identifier only if none exists yet.
    website = tags.get('website') or tags.get('url') or tags.get('contact:website')
    if website:
        website_ids = [id for id in identifiers if id.get('identifier_scheme') == 'Website']
        if not website_ids:
            identifiers.append({
                'identifier_scheme': 'Website',
                'identifier_value': website,
                'identifier_url': website
            })
            institution['identifiers'] = identifiers
            enriched = True
            stats['websites_added'] += 1
            improvements.append(f"Website: {website}")
    # 5. Opening hours — also appended to the description, idempotently.
    opening_hours = tags.get('opening_hours')
    if opening_hours:
        current_desc = institution.get('description', '')
        hours_text = f"Opening hours: {opening_hours}"
        if hours_text not in current_desc:
            institution['description'] = (current_desc + f"\n\n{hours_text}").strip()
            enriched = True
            stats['opening_hours_added'] += 1
            improvements.append(f"Hours: {opening_hours}")
    # 6. Alternative names — collect name variants that differ from the
    # primary name and are not already recorded.
    alt_names = []
    for key in ['alt_name', 'official_name', 'name:en', 'name:es', 'name:pt']:
        if key in tags and tags[key] != institution.get('name'):
            alt_names.append(tags[key])
    if alt_names:
        existing = institution.get('alternative_names', [])
        new_names = [n for n in alt_names if n not in existing]
        if new_names:
            institution['alternative_names'] = existing + new_names
            enriched = True
            stats['alternative_names_added'] += len(new_names)
            improvements.append(f"{len(new_names)} alt names")
    # Update provenance with a short note (first three improvements only).
    if enriched:
        stats['institutions_enriched'] += 1
        details.append({'name': inst_name, 'osm_id': osm_id, 'improvements': improvements})
        if 'provenance' in institution:
            notes = institution['provenance'].get('notes', '')
            osm_note = f"\nOpenStreetMap enrichment (2025-11-06): {osm_id}. {', '.join(improvements[:3])}."
            institution['provenance']['notes'] = (notes + osm_note).strip()
        print(f" ✅ Enriched: {', '.join(improvements[:3])}")
    processed_count += 1
    # Save progress every BATCH_SIZE processed institutions: rewrite the
    # whole YAML file with a fresh comment header and running stats.
    if processed_count % BATCH_SIZE == 0:
        batch_num += 1
        print(f"\n💾 Saving batch {batch_num}...")
        with open(enriched_file, 'w', encoding='utf-8') as f:
            f.write("---\n")
            f.write("# Latin American GLAM Institutions - OpenStreetMap Enriched\n")
            f.write(f"# Last updated: {datetime.now(timezone.utc).isoformat()}\n")
            f.write(f"# Batch: {batch_num}\n#\n")
            f.write("# OpenStreetMap Enrichment Summary:\n")
            for key, value in stats.items():
                f.write(f"# - {key}: {value}\n")
            f.write("\n")
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
        print(f"✅ Saved. Enriched: {stats['institutions_enriched']}\n")
    # Rate limiting between Overpass requests.
    time.sleep(RATE_LIMIT_DELAY)
# Save final
# Persist the remainder when the run did not end exactly on a batch boundary
# (an exact multiple was already written by the in-loop checkpoint).
if processed_count % BATCH_SIZE != 0:
    batch_num += 1
    print(f"\n💾 Saving final batch {batch_num}...")
    header_lines = [
        "---\n",
        "# Latin American GLAM Institutions - OpenStreetMap Enriched\n",
        f"# Last updated: {datetime.now(timezone.utc).isoformat()}\n",
        f"# Batch: {batch_num}\n#\n",
        "# OpenStreetMap Enrichment Summary:\n",
    ]
    header_lines.extend(f"# - {key}: {value}\n" for key, value in stats.items())
    header_lines.append("\n")
    with open(enriched_file, 'w', encoding='utf-8') as out:
        out.writelines(header_lines)
        yaml.dump(institutions, out, allow_unicode=True, sort_keys=False)
    print(f"✅ Saved!\n")
# Report
# Final console summary for this resumed run, driven by the stats counters.
rule = "=" * 70
print("\n" + rule)
print("OPENSTREETMAP ENRICHMENT REPORT (RESUMED)")
print(rule)
print(f"\nProcessed: institutions {START_INDEX + 1} to {len(institutions)}")
for label, stat_key in (
    ("OSM IDs found", 'osm_ids_found'),
    ("Records fetched", 'osm_records_fetched'),
    ("Fetch errors", 'osm_fetch_errors'),
):
    print(f"{label}: {stats[stat_key]}")
print("\nEnrichments:")
for label, stat_key in (
    ("Coordinates", 'coordinates_upgraded'),
    ("Addresses", 'addresses_improved'),
    ("Contact", 'contact_info_added'),
    ("Hours", 'opening_hours_added'),
    ("Websites", 'websites_added'),
    ("Alt names", 'alternative_names_added'),
    ("Total enriched", 'institutions_enriched'),
):
    print(f" {label}: {stats[stat_key]}")
print("\n" + rule + "\n")