# glam/scripts/scrapers/consolidate_austrian_data.py
# 2025-11-19 23:25:22 +01:00
# 416 lines · 15 KiB · Python
#!/usr/bin/env python3
"""
Austrian Heritage Institution Data Consolidator
Merges ISIL registry, Wikidata, and OpenStreetMap data for Austria
This script consolidates multiple Austrian data sources:
1. ISIL page files (194 files, ~1,920 institutions)
2. Wikidata SPARQL results (~4,863 institutions)
3. OpenStreetMap libraries (~748 libraries)
Outputs:
- Consolidated JSON with deduplication
- Statistics report
- Ready for LinkML conversion
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import json
import glob
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Set
from collections import defaultdict
from rapidfuzz import fuzz
# Configuration
# NOTE(review): hard-coded absolute path to one developer's machine — consider
# reading this from a CLI argument or environment variable before sharing.
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/austria")
# Timestamped output paths; the UTC timestamp is frozen once, at import time,
# so both files from a single run share the same suffix.
OUTPUT_FILE = DATA_DIR / f"austrian_institutions_consolidated_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.json"
STATS_FILE = DATA_DIR / f"consolidation_stats_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.json"
# Minimum fuzz.ratio score (0-100 scale) for two institution names to be
# treated as the same institution during deduplication.
FUZZY_THRESHOLD = 85
def parse_isil_pages() -> List[Dict]:
    """Parse all page_XXX_data.json files under DATA_DIR.

    Returns a list of dicts with keys: ``name``, ``isil_code`` (may be
    None), ``data_source`` ('ISIL_REGISTRY') and ``source_file``.
    Unreadable files or files with an unknown structure are skipped with
    a warning rather than aborting the whole run.
    """
    institutions = []
    # Path.glob keeps this consistent with the pathlib usage elsewhere
    # in the script (the original round-tripped through glob.glob/str).
    page_files = sorted(DATA_DIR.glob("page_*_data.json"))
    print(f"📄 Parsing {len(page_files)} ISIL page files...")
    for filepath in page_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Handle two formats:
            # 1. Direct array: [{name, isil_code}, ...]
            # 2. Wrapped object: {institutions: [{name, isil}, ...]}
            if isinstance(data, dict) and 'institutions' in data:
                items = data['institutions']
            elif isinstance(data, list):
                items = data
            else:
                print(f"⚠️ Unknown format in {filepath}")
                continue
            for inst in items:
                # Only dict entries carry usable metadata; bare strings
                # and anything else are skipped.
                if not isinstance(inst, dict):
                    continue
                name = inst.get('name')
                # Different page formats use 'isil_code' or 'isil'.
                isil = inst.get('isil_code') or inst.get('isil')
                if not name:
                    continue  # entries without a name are unusable
                institutions.append({
                    'name': name.strip(),
                    'isil_code': isil.strip() if isil else None,
                    'data_source': 'ISIL_REGISTRY',
                    'source_file': filepath.name,
                })
        except Exception as e:
            # Best-effort: report and continue with the remaining pages.
            print(f"⚠️ Error reading {filepath}: {e}")
    print(f"✅ Parsed {len(institutions)} institutions from ISIL pages")
    return institutions
def parse_wikidata() -> List[Dict]:
    """Parse Wikidata SPARQL results (SPARQL 1.1 JSON bindings format).

    Returns one dict per binding with name, wikidata_id, description,
    institution_type, website, viaf, isil_code, city, and latitude /
    longitude parsed from a WKT ``Point(lon lat)`` literal when present.
    Bindings whose label is just the bare Q-number are dropped.
    """
    institutions = []
    wikidata_file = DATA_DIR / "austria_wikidata_institutions.json"
    print(f"📄 Parsing Wikidata SPARQL results...")
    try:
        with open(wikidata_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        bindings = data.get('results', {}).get('bindings', [])
        for binding in bindings:
            # Entity URI looks like http://www.wikidata.org/entity/Q123.
            item = binding.get('item', {}).get('value', '')
            q_number = item.split('/')[-1] if item else None
            name = binding.get('itemLabel', {}).get('value', '')
            # Skip if name is just Q-number (no proper label)
            if name.startswith('Q') and name[1:].isdigit():
                continue
            description = binding.get('itemDescription', {}).get('value', '')
            inst_type = binding.get('typeLabel', {}).get('value', '')
            website = binding.get('website', {}).get('value', '')
            viaf = binding.get('viaf', {}).get('value', '')
            isil = binding.get('isil', {}).get('value', '')
            coords = binding.get('coord', {}).get('value', '')
            city = binding.get('cityLabel', {}).get('value', '')
            # WKT point literal: "Point(lon lat)" — longitude comes first.
            lat, lon = None, None
            if coords and coords.startswith('Point('):
                try:
                    coords_clean = coords.replace('Point(', '').replace(')', '')
                    lon, lat = map(float, coords_clean.split())
                except ValueError:
                    # Malformed literal (non-numeric or wrong arity):
                    # leave lat/lon as None. The original bare `except:`
                    # also swallowed KeyboardInterrupt/SystemExit.
                    pass
            institutions.append({
                'name': name.strip(),
                'wikidata_id': q_number,
                'description': description,
                'institution_type': inst_type,
                'website': website,
                'viaf': viaf,
                'isil_code': isil,
                'city': city,
                'latitude': lat,
                'longitude': lon,
                'data_source': 'WIKIDATA',
                'source_file': 'austria_wikidata_institutions.json'
            })
        print(f"✅ Parsed {len(institutions)} institutions from Wikidata")
    except Exception as e:
        print(f"⚠️ Error reading Wikidata file: {e}")
    return institutions
def parse_osm() -> List[Dict]:
    """Parse OpenStreetMap library data (Overpass-style JSON export).

    Every element with a ``name`` (or fallback ``operator``) tag becomes
    one record tagged with source 'OPENSTREETMAP'; nameless elements are
    dropped. Read errors are reported and yield an empty list.
    """
    osm_file = DATA_DIR / "austria_osm_libraries.json"
    print(f"📄 Parsing OpenStreetMap data...")
    institutions = []
    try:
        with open(osm_file, 'r', encoding='utf-8') as f:
            payload = json.load(f)
        for element in payload.get('elements', []):
            tags = element.get('tags', {})
            # Fall back to the operator tag when the element has no name.
            label = tags.get('name', tags.get('operator', ''))
            if not label:
                continue
            record = {
                'name': label.strip(),
                'institution_type': 'library',
                'latitude': element.get('lat'),
                'longitude': element.get('lon'),
                'street': tags.get('addr:street'),
                'housenumber': tags.get('addr:housenumber'),
                'postcode': tags.get('addr:postcode'),
                'city': tags.get('addr:city'),
                # Both plain and contact:-prefixed tag variants occur.
                'website': tags.get('website') or tags.get('contact:website'),
                'phone': tags.get('phone') or tags.get('contact:phone'),
                'email': tags.get('email') or tags.get('contact:email'),
                'osm_id': element.get('id'),
                'osm_type': element.get('type'),
                'data_source': 'OPENSTREETMAP',
                'source_file': 'austria_osm_libraries.json',
            }
            institutions.append(record)
        print(f"✅ Parsed {len(institutions)} libraries from OSM")
    except Exception as e:
        print(f"⚠️ Error reading OSM file: {e}")
    return institutions
def fuzzy_match_name(name1: str, name2: str) -> float:
    """Return the case-insensitive fuzzy similarity of two names.

    Uses rapidfuzz's Levenshtein-based ``fuzz.ratio``, which returns a
    float score in [0, 100] (the previous ``-> int`` annotation was
    incorrect). Empty or None names score 0 so they never match.
    """
    if not name1 or not name2:
        return 0
    return fuzz.ratio(name1.lower(), name2.lower())
def deduplicate_institutions(institutions: List[Dict]) -> List[Dict]:
    """
    Deduplicate institutions using ISIL codes and fuzzy name matching.

    Priority:
    1. ISIL_REGISTRY (most authoritative for ISIL codes)
    2. WIKIDATA (rich metadata)
    3. OPENSTREETMAP (geocoding)

    Records sharing an ISIL code are merged outright; the remainder are
    merged by fuzzy name similarity (>= FUZZY_THRESHOLD). Merged records
    carry a ``data_sources`` list recording every contributing source.
    """
    print(f"\n🔍 Deduplicating {len(institutions)} institutions...")
    # Index by ISIL code
    by_isil: Dict[str, List[Dict]] = defaultdict(list)
    no_isil: List[Dict] = []
    for inst in institutions:
        isil = inst.get('isil_code')
        # Handle None or whitespace-only strings
        if isil:
            isil = isil.strip()
        if isil:
            by_isil[isil].append(inst)
        else:
            no_isil.append(inst)
    print(f" - {len(by_isil)} unique ISIL codes")
    print(f" - {len(no_isil)} institutions without ISIL")
    # Merge institutions with same ISIL code
    merged = []
    for isil, group in by_isil.items():
        if len(group) == 1:
            merged.append(group[0])
        else:
            # Merge metadata from all sources, first non-empty value wins.
            base = {'isil_code': isil, 'data_sources': []}
            for inst in group:
                base['data_sources'].append(inst.get('data_source'))
                for key, value in inst.items():
                    if key == 'data_source':
                        continue
                    if key not in base or not base[key]:
                        base[key] = value
            merged.append(base)
    # Fuzzy match institutions without ISIL
    print(f" - Fuzzy matching {len(no_isil)} institutions...")
    matched_indices: Set[int] = set()
    for i, inst1 in enumerate(no_isil):
        if i in matched_indices:
            continue
        # Try to match with existing merged institutions
        best_match = None
        best_score = 0
        for candidate in merged:
            score = fuzzy_match_name(inst1.get('name', ''), candidate.get('name', ''))
            if score > best_score and score >= FUZZY_THRESHOLD:
                best_score = score
                best_match = candidate
        if best_match:
            # Merge into existing institution
            if 'data_sources' not in best_match:
                best_match['data_sources'] = [best_match.get('data_source')]
            best_match['data_sources'].append(inst1.get('data_source'))
            for key, value in inst1.items():
                if key == 'data_source':
                    continue
                if key not in best_match or not best_match[key]:
                    best_match[key] = value
            matched_indices.add(i)
        else:
            # Try to match with other no_isil institutions. Append to the
            # provenance list instead of reassigning it (the original
            # overwrote data_sources on every additional match, keeping
            # only the last duplicate's source).
            for j in range(i + 1, len(no_isil)):
                if j in matched_indices:
                    continue
                score = fuzzy_match_name(inst1.get('name', ''), no_isil[j].get('name', ''))
                if score >= FUZZY_THRESHOLD:
                    inst1.setdefault('data_sources', [inst1.get('data_source')])
                    inst1['data_sources'].append(no_isil[j].get('data_source'))
                    for key, value in no_isil[j].items():
                        if key == 'data_source':
                            continue
                        if key not in inst1 or not inst1[key]:
                            inst1[key] = value
                    matched_indices.add(j)
    # Add unmatched no_isil institutions. setdefault preserves any
    # data_sources list built in the intra-no_isil merge above (the
    # original unconditionally overwrote it, losing merge provenance).
    for i, inst in enumerate(no_isil):
        if i not in matched_indices:
            inst.setdefault('data_sources', [inst.get('data_source')])
            merged.append(inst)
    print(f"✅ Deduplicated to {len(merged)} unique institutions")
    return merged
def generate_statistics(institutions: List[Dict]) -> Dict:
    """Build a summary dict for the consolidated institution list.

    Tallies records per source, per type and per city, plus coverage of
    key identifiers (ISIL, Wikidata ID, coordinates, website) and the
    number of multi-source merges. Only the 20 most frequent cities are
    retained, sorted by descending count.
    """
    stats = {
        'total_institutions': len(institutions),
        'by_source': defaultdict(int),
        'by_type': defaultdict(int),
        'with_isil': 0,
        'with_wikidata': 0,
        'with_geocoding': 0,
        'with_website': 0,
        'multi_source': 0,
        'cities': defaultdict(int),
        'generation_date': datetime.now(timezone.utc).isoformat()
    }
    for record in institutions:
        # Merged records carry 'data_sources'; raw ones only 'data_source'.
        source_list = record.get('data_sources', [record.get('data_source')])
        for src in source_list:
            stats['by_source'][src] += 1
        if len(source_list) > 1:
            stats['multi_source'] += 1
        # Identifier / metadata coverage.
        if record.get('isil_code'):
            stats['with_isil'] += 1
        if record.get('wikidata_id'):
            stats['with_wikidata'] += 1
        if record.get('latitude') and record.get('longitude'):
            stats['with_geocoding'] += 1
        if record.get('website'):
            stats['with_website'] += 1
        stats['by_type'][record.get('institution_type', 'unknown')] += 1
        city = record.get('city', 'unknown')
        if city:
            stats['cities'][city] += 1
    # Plain dicts serialize cleanly to JSON; keep only the top 20 cities.
    stats['by_source'] = dict(stats['by_source'])
    stats['by_type'] = dict(stats['by_type'])
    top_cities = sorted(stats['cities'].items(), key=lambda kv: kv[1], reverse=True)[:20]
    stats['cities'] = dict(top_cities)
    return stats
def main():
    """Main consolidation workflow: parse, deduplicate, export, report."""
    print("🇦🇹 Austrian Heritage Institution Data Consolidation")
    print("=" * 60)
    # Parse all data sources
    isil_institutions = parse_isil_pages()
    wikidata_institutions = parse_wikidata()
    osm_institutions = parse_osm()
    # Combine all sources
    all_institutions = isil_institutions + wikidata_institutions + osm_institutions
    print(f"\n📊 Total raw institutions: {len(all_institutions)}")
    # Deduplicate
    consolidated = deduplicate_institutions(all_institutions)
    # Generate statistics
    print(f"\n📈 Generating statistics...")
    stats = generate_statistics(consolidated)
    # Export consolidated data
    print(f"\n💾 Exporting consolidated data...")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(consolidated, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved to: {OUTPUT_FILE}")
    print(f" Size: {OUTPUT_FILE.stat().st_size / 1024:.1f} KB")
    # Export statistics
    with open(STATS_FILE, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"✅ Statistics saved to: {STATS_FILE}")
    # Print summary
    total = stats['total_institutions']
    # Guard against an empty result set — the original divided by
    # total_institutions unconditionally and crashed with
    # ZeroDivisionError when no source files were found.
    denom = total or 1
    print(f"\n" + "=" * 60)
    print(f"📊 CONSOLIDATION SUMMARY")
    print(f"=" * 60)
    print(f"Total unique institutions: {total}")
    print(f"")
    print(f"By source:")
    for source, count in stats['by_source'].items():
        print(f" - {source}: {count}")
    print(f"")
    print(f"Multi-source records: {stats['multi_source']}")
    print(f"")
    print(f"Coverage:")
    print(f" - With ISIL codes: {stats['with_isil']} ({stats['with_isil']/denom*100:.1f}%)")
    print(f" - With Wikidata IDs: {stats['with_wikidata']} ({stats['with_wikidata']/denom*100:.1f}%)")
    print(f" - With geocoding: {stats['with_geocoding']} ({stats['with_geocoding']/denom*100:.1f}%)")
    print(f" - With websites: {stats['with_website']} ({stats['with_website']/denom*100:.1f}%)")
    print(f"")
    print(f"Top 5 cities:")
    for i, (city, count) in enumerate(list(stats['cities'].items())[:5], 1):
        print(f" {i}. {city}: {count}")
    print(f"=" * 60)
if __name__ == "__main__":
    main()