428 lines
17 KiB
Python
428 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Bulgarian ISIL Registry Enrichment Script
|
|
Enriches Bulgarian institutions with OSM, Wikidata, and VIAF data.
|
|
"""
|
|
|
|
import json
import os
import time
from datetime import datetime, timezone
from typing import List, Dict, Optional

import requests
import yaml
from rapidfuzz import fuzz
|
|
|
|
def fetch_osm_libraries() -> List[Dict]:
    """Fetch Bulgarian libraries, archives, and museums from OpenStreetMap.

    Queries the Overpass API for ``amenity=library``, ``amenity=archive``,
    and ``tourism=museum`` elements inside Bulgaria and flattens them into
    plain dicts.

    Returns:
        A list of dicts with name, coordinates, address, website, wikidata
        tag, and OSM id.  Returns an empty list if the request fails
        (best-effort: the caller can proceed without OSM data).
    """
    print("Fetching OSM data for Bulgaria...")

    overpass_url = "https://overpass-api.de/api/interpreter"
    # NOTE: ``out center;`` makes Overpass attach a computed ``center``
    # point to ways and relations.  The previous ``out body; >; out skel qt;``
    # form never produced a ``center`` member, so the parsing loop below
    # silently dropped every way/relation (only nodes were kept).
    query = """
    [out:json][timeout:90];
    area["ISO3166-1"="BG"][admin_level=2]->.searchArea;
    (
      node["amenity"="library"](area.searchArea);
      way["amenity"="library"](area.searchArea);
      relation["amenity"="library"](area.searchArea);
      node["amenity"="archive"](area.searchArea);
      way["amenity"="archive"](area.searchArea);
      node["tourism"="museum"](area.searchArea);
      way["tourism"="museum"](area.searchArea);
    );
    out center;
    """

    try:
        response = requests.post(overpass_url, data={'data': query}, timeout=120)
        response.raise_for_status()
        data = response.json()

        libraries = []
        for element in data.get('elements', []):
            # Nodes carry lat/lon directly; ways and relations carry a
            # synthesized 'center' (relations were queried but previously
            # excluded by the type check — include them too).
            is_node = element.get('type') == 'node'
            has_center = element.get('type') in ('way', 'relation') and 'center' in element
            if not (is_node or has_center):
                continue

            tags = element.get('tags', {})
            name = tags.get('name') or tags.get('name:bg') or tags.get('name:en')
            if not name:
                continue

            lat = element.get('lat') or element.get('center', {}).get('lat')
            lon = element.get('lon') or element.get('center', {}).get('lon')

            libraries.append({
                'name': name,
                'name_bg': tags.get('name:bg'),
                'name_en': tags.get('name:en'),
                'latitude': lat,
                'longitude': lon,
                'city': tags.get('addr:city'),
                'street': tags.get('addr:street'),
                'website': tags.get('website') or tags.get('contact:website'),
                'wikidata': tags.get('wikidata'),
                'osm_id': element.get('id')
            })

        print(f"✓ Found {len(libraries)} locations in OSM")
        return libraries

    except Exception as e:
        # Best-effort: log and continue with no OSM data rather than abort.
        print(f"✗ OSM fetch failed: {e}")
        return []
|
|
|
|
def fetch_wikidata_institutions() -> List[Dict]:
    """Query Wikidata for Bulgarian heritage institutions.

    Runs a SPARQL query for instances/subclasses of library, archive,
    museum, gallery, etc. (the ``VALUES ?type`` list) located in Bulgaria
    (``wd:Q219``), with optional ISIL, VIAF, website, coordinates, and
    address.

    Returns:
        A list of dicts keyed by qid/label/alt_labels/isil/viaf/website/
        latitude/longitude/address.  Returns an empty list if the query
        fails (best-effort: the caller can proceed without Wikidata data).
    """
    print("Querying Wikidata for Bulgarian institutions...")

    endpoint = "https://query.wikidata.org/sparql"
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?isil ?viaf ?website ?coords ?address WHERE {
      VALUES ?type {
        wd:Q7075 wd:Q166118 wd:Q1030034 wd:Q33506 wd:Q636400
        wd:Q212805 wd:Q213441 wd:Q1362225 wd:Q24398318
      }
      ?item wdt:P31/wdt:P279* ?type .
      ?item wdt:P17 wd:Q219 .

      OPTIONAL { ?item wdt:P791 ?isil }
      OPTIONAL { ?item wdt:P214 ?viaf }
      OPTIONAL { ?item wdt:P856 ?website }
      OPTIONAL { ?item wdt:P625 ?coords }
      OPTIONAL { ?item wdt:P6375 ?address }

      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "bg,en,ru".
        ?item rdfs:label ?itemLabel .
        ?item skos:altLabel ?itemAltLabel .
      }
    }
    """

    try:
        response = requests.get(
            endpoint,
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': 'GLAM-Extractor/1.0'},
            timeout=60
        )
        response.raise_for_status()
        data = response.json()

        institutions = []
        for result in data['results']['bindings']:
            # Entity URI -> bare Q-id (last path segment).
            item_id = result['item']['value'].split('/')[-1]

            # Coordinates come as WKT: "Point(lon lat)" — note lon first.
            coords_str = result.get('coords', {}).get('value', '')
            lat, lon = None, None
            if coords_str and coords_str.startswith('Point('):
                parts = coords_str.replace('Point(', '').replace(')', '').split()
                if len(parts) == 2:
                    lon, lat = float(parts[0]), float(parts[1])

            # The label service joins alt labels with ", ", so a bare
            # split(',') left a leading space on every label after the
            # first, which degraded downstream fuzzy matching.  Strip and
            # drop empties.
            alt_raw = result.get('itemAltLabel', {}).get('value', '')
            alt_labels = [s.strip() for s in alt_raw.split(',') if s.strip()]

            institutions.append({
                'qid': item_id,
                'label': result.get('itemLabel', {}).get('value'),
                'alt_labels': alt_labels,
                'isil': result.get('isil', {}).get('value'),
                'viaf': result.get('viaf', {}).get('value'),
                'website': result.get('website', {}).get('value'),
                'latitude': lat,
                'longitude': lon,
                'address': result.get('address', {}).get('value')
            })

        print(f"✓ Found {len(institutions)} institutions in Wikidata")
        return institutions

    except Exception as e:
        # Best-effort: log and continue with no Wikidata data rather than abort.
        print(f"✗ Wikidata query failed: {e}")
        return []
|
|
|
|
def fuzzy_match_institution(inst_name: str, candidates: List[Dict], name_fields: List[str]) -> Optional[tuple]:
    """Fuzzy-match an institution name against a list of candidate records.

    For each candidate, every name found in ``name_fields`` (list-valued
    fields are flattened) is compared to ``inst_name`` using a
    token-sort ratio on lowercased, stripped text.

    Returns:
        ``(best_candidate, best_score)`` when the best score reaches the
        acceptance threshold of 75, otherwise ``None``.
    """
    target = inst_name.lower().strip()
    top_candidate = None
    top_score = 0

    for cand in candidates:
        # Gather every name variant this candidate offers.
        variants = []
        for field in name_fields:
            value = cand.get(field)
            if not value:
                continue
            variants.extend(value if isinstance(value, list) else [value])

        for variant in variants:
            if not variant:
                continue
            score = fuzz.token_sort_ratio(target, variant.lower().strip())
            if score > top_score:
                top_candidate, top_score = cand, score

    return (top_candidate, top_score) if top_score >= 75 else None
|
|
|
|
def enrich_institutions(base_file: str, osm_data: List[Dict], wikidata_data: List[Dict]) -> tuple:
    """Enrich Bulgarian institutions with external data.

    Matching strategy per institution:
      1. Exact ISIL match against Wikidata (score 100, high confidence).
      2. Fuzzy name match against Wikidata (threshold 75; >=85 counts as
         high confidence, 75-84 as medium).
      3. Fuzzy name match against OSM (fills gaps Wikidata left).

    Args:
        base_file: Path to the base YAML registry (a list of institution dicts).
        osm_data: Records from fetch_osm_libraries().
        wikidata_data: Records from fetch_wikidata_institutions().

    Returns:
        ``(enrichments, stats)`` — one enrichment record per matched
        institution, plus a dict of counters.
    """
    print(f"\nLoading base institutions from {base_file}...")

    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"✓ Loaded {len(institutions)} institutions")
    print("\nEnriching institutions...")

    enrichments = []
    stats = {
        'total': len(institutions),
        'wikidata_matched': 0,
        'viaf_added': 0,
        'coords_added': 0,
        'website_added': 0,
        'osm_matched': 0,
        'high_confidence': 0,
        'medium_confidence': 0
    }

    for idx, inst in enumerate(institutions, 1):
        inst_name = inst.get('name', '')

        enrichment = {
            'id': inst.get('id'),
            'name': inst_name,
            'isil': next((i['identifier_value'] for i in inst.get('identifiers', []) if i.get('identifier_scheme') == 'ISIL'), None),
            'matches': []
        }

        # First existing location, guarding against a present-but-empty
        # list ("inst.get('locations', [{}])[0]" raised IndexError when
        # 'locations' existed but was []).
        first_location = (inst.get('locations') or [{}])[0]

        # Try Wikidata match by ISIL first (exact)
        inst_isil = enrichment['isil']
        wikidata_match = None
        match_score = 0

        if inst_isil:
            for wd in wikidata_data:
                if wd.get('isil') == inst_isil:
                    wikidata_match = wd
                    match_score = 100
                    enrichment['matches'].append({
                        'source': 'wikidata',
                        'match_type': 'isil_exact',
                        'score': 100,
                        'qid': wd['qid']
                    })
                    stats['wikidata_matched'] += 1
                    stats['high_confidence'] += 1
                    break

        # Fuzzy match by name if no ISIL match
        if not wikidata_match:
            result = fuzzy_match_institution(
                inst_name,
                wikidata_data,
                ['label', 'alt_labels']
            )
            if result:
                wikidata_match, match_score = result
                enrichment['matches'].append({
                    'source': 'wikidata',
                    'match_type': 'name_fuzzy',
                    'score': match_score,
                    'qid': wikidata_match['qid']
                })
                stats['wikidata_matched'] += 1
                if match_score >= 85:
                    stats['high_confidence'] += 1
                else:
                    stats['medium_confidence'] += 1

        # Add Wikidata enrichments (only fill fields the base record lacks)
        if wikidata_match:
            if wikidata_match.get('viaf'):
                enrichment['viaf'] = wikidata_match['viaf']
                stats['viaf_added'] += 1

            if wikidata_match.get('website') and not inst.get('homepage'):
                enrichment['website'] = wikidata_match['website']
                stats['website_added'] += 1

            if wikidata_match.get('latitude') and wikidata_match.get('longitude'):
                if not first_location.get('latitude'):
                    enrichment['latitude'] = wikidata_match['latitude']
                    enrichment['longitude'] = wikidata_match['longitude']
                    stats['coords_added'] += 1

        # Try OSM match
        osm_result = fuzzy_match_institution(
            inst_name,
            osm_data,
            ['name', 'name_bg', 'name_en']
        )

        if osm_result:
            osm_match, osm_score = osm_result
            enrichment['matches'].append({
                'source': 'osm',
                'match_type': 'name_fuzzy',
                'score': osm_score,
                'osm_id': osm_match['osm_id']
            })
            stats['osm_matched'] += 1

            # Add OSM data only where neither the base record nor the
            # Wikidata match already supplied it.
            if osm_match.get('latitude') and osm_match.get('longitude'):
                if not first_location.get('latitude') and 'latitude' not in enrichment:
                    enrichment['latitude'] = osm_match['latitude']
                    enrichment['longitude'] = osm_match['longitude']
                    stats['coords_added'] += 1

            if osm_match.get('website') and not inst.get('homepage') and 'website' not in enrichment:
                enrichment['website'] = osm_match['website']
                stats['website_added'] += 1

        if enrichment['matches']:
            enrichments.append(enrichment)

        if idx % 10 == 0:
            print(f"  Processed {idx}/{stats['total']} institutions...")

    # Guard the percentage prints against an empty registry
    # (stats['total'] == 0 previously raised ZeroDivisionError).
    denom = stats['total'] or 1

    print(f"\n✓ Enrichment complete")
    print(f"\nStatistics:")
    print(f"  Total institutions: {stats['total']}")
    print(f"  Wikidata matches: {stats['wikidata_matched']} ({stats['wikidata_matched']/denom*100:.1f}%)")
    print(f"    - High confidence (≥85%): {stats['high_confidence']}")
    print(f"    - Medium confidence (75-84%): {stats['medium_confidence']}")
    print(f"  OSM matches: {stats['osm_matched']} ({stats['osm_matched']/denom*100:.1f}%)")
    print(f"  VIAF IDs added: {stats['viaf_added']}")
    print(f"  Coordinates added: {stats['coords_added']}")
    print(f"  Websites added: {stats['website_added']}")
    print(f"  Total enriched: {len(enrichments)} ({len(enrichments)/denom*100:.1f}%)")

    return enrichments, stats
|
|
|
|
def apply_enrichments(base_file: str, enrichments: List[Dict], output_file: str):
    """Apply enrichments to the base YAML registry and save the result.

    For every enriched institution, adds Wikidata/VIAF/Website identifiers
    (skipping schemes already present), fills in missing coordinates, and
    sets a homepage when the record has none.

    Args:
        base_file: Path to the base YAML registry.
        enrichments: Records produced by enrich_institutions().
        output_file: Destination path for the enriched YAML.
    """
    print(f"\nApplying enrichments to {base_file}...")

    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    enrichment_map = {e['id']: e for e in enrichments}

    enriched_count = 0
    for inst in institutions:
        inst_id = inst.get('id')
        if inst_id not in enrichment_map:
            continue
        enrich_data = enrichment_map[inst_id]

        # Add Wikidata ID (from the enrichment itself or its first match
        # carrying a qid; defaulted next() avoids StopIteration).
        qid = enrich_data.get('qid') or next(
            (m['qid'] for m in enrich_data.get('matches', []) if m.get('qid')), None)
        if qid:
            identifiers = inst.setdefault('identifiers', [])
            if not any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers):
                identifiers.append({
                    'identifier_scheme': 'Wikidata',
                    'identifier_value': qid,
                    'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
                })

        # Add VIAF ID
        if 'viaf' in enrich_data:
            identifiers = inst.setdefault('identifiers', [])
            if not any(i.get('identifier_scheme') == 'VIAF' for i in identifiers):
                identifiers.append({
                    'identifier_scheme': 'VIAF',
                    'identifier_value': enrich_data['viaf'],
                    'identifier_url': f'https://viaf.org/viaf/{enrich_data["viaf"]}'
                })

        # Add coordinates
        if 'latitude' in enrich_data and 'longitude' in enrich_data:
            locations = inst.setdefault('locations', [{}])
            if not locations:
                # setdefault returns an existing empty list unchanged;
                # indexing it raised IndexError before this guard.
                locations.append({})
            if not locations[0].get('latitude'):
                locations[0]['latitude'] = enrich_data['latitude']
                locations[0]['longitude'] = enrich_data['longitude']

        # Add website
        if 'website' in enrich_data and not inst.get('homepage'):
            inst['homepage'] = enrich_data['website']
            identifiers = inst.setdefault('identifiers', [])
            if not any(i.get('identifier_scheme') == 'Website' for i in identifiers):
                identifiers.append({
                    'identifier_scheme': 'Website',
                    'identifier_value': enrich_data['website'],
                    'identifier_url': enrich_data['website']
                })

        enriched_count += 1

    # Save enriched YAML with a provenance header.  (The previous version
    # also built an unused "output_data" metadata dict; the header comments
    # below carry the same information, so the dead dict was removed.)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write(f"# Bulgarian ISIL Registry - Enriched\n")
        f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Enriched: {enriched_count}/{len(institutions)} institutions\n")
        f.write(f"# Source: {base_file}\n\n")
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f"✓ Saved enriched data to {output_file}")
    print(f"  Enriched {enriched_count}/{len(institutions)} institutions")
|
|
|
|
def main():
    """Run the full enrichment pipeline: fetch OSM, fetch Wikidata, match, apply."""
    print("=" * 70)
    print("Bulgarian ISIL Registry Enrichment")
    print("=" * 70)

    base_file = "data/instances/bulgaria_isil_libraries.yaml"
    output_file = "data/instances/bulgaria_complete.yaml"
    work_dir = "data/isil/bulgaria"

    # Ensure the intermediate-output directory exists; previously a fresh
    # checkout crashed with FileNotFoundError on the first open() below.
    os.makedirs(work_dir, exist_ok=True)

    # Step 1: Fetch OSM data
    osm_data = fetch_osm_libraries()
    osm_path = f"{work_dir}/bulgaria_osm_libraries.json"
    with open(osm_path, 'w', encoding='utf-8') as f:
        json.dump(osm_data, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved OSM data to {osm_path}")

    # Step 2: Fetch Wikidata (brief pause between the two public APIs)
    time.sleep(2)  # Rate limiting
    wikidata_data = fetch_wikidata_institutions()
    wikidata_path = f"{work_dir}/bulgaria_wikidata_institutions.json"
    with open(wikidata_path, 'w', encoding='utf-8') as f:
        json.dump(wikidata_data, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved Wikidata data to {wikidata_path}")

    # Step 3: Enrich
    enrichments, stats = enrich_institutions(base_file, osm_data, wikidata_data)
    enrichments_path = f"{work_dir}/bulgaria_enrichments.json"
    with open(enrichments_path, 'w', encoding='utf-8') as f:
        json.dump({
            'enrichments': enrichments,
            'stats': stats,
            'generated': datetime.now(timezone.utc).isoformat()
        }, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved enrichment data to {enrichments_path}")

    # Step 4: Apply enrichments
    apply_enrichments(base_file, enrichments, output_file)

    print("\n" + "=" * 70)
    print("✓ Bulgarian ISIL enrichment complete!")
    print("=" * 70)


if __name__ == '__main__':
    main()
|