glam/scripts/build_unified_database.py
2025-11-21 22:12:33 +01:00

274 lines
9.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Build Unified GLAM Heritage Custodian Database
Merges all country-specific LinkML datasets into a unified database with:
- Deduplication by GHCID
- Data quality tracking
- Version control
- Multiple export formats (JSON, SQLite, Parquet)
"""
import json
import sqlite3
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any
from collections import defaultdict
import hashlib
# Country dataset paths (absolute paths on the curation workstation).
COUNTRY_DATASETS = {
    'finland': '/Users/kempersc/apps/glam/data/finland_isil/finland_isil_linkml_final_20251120.json',
    'denmark': '/Users/kempersc/apps/glam/data/instances/denmark_complete_enriched.json',
    'netherlands': '/Users/kempersc/apps/glam/data/instances/netherlands_complete.yaml',
    'belgium': '/Users/kempersc/apps/glam/data/instances/belgium_isil.yaml',
    'belarus': '/Users/kempersc/apps/glam/data/instances/belarus_complete.yaml',
    'canada': '/Users/kempersc/apps/glam/data/instances/canada/canadian_heritage_custodians_geocoded.json',
    'chile': '/Users/kempersc/apps/glam/data/instances/chile/chilean_institutions_batch20_enriched.yaml',
    'egypt': '/Users/kempersc/apps/glam/data/instances/egypt_institutions_ghcid.yaml',
    # Japan dataset is 18MB - handle separately
}

# Where all unified exports (JSON, SQLite) are written.
OUTPUT_DIR = Path('/Users/kempersc/apps/glam/data/unified')
# BUG FIX: parents=True — with exist_ok alone, a missing parent directory
# (fresh checkout, different machine) raised FileNotFoundError at import time.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def load_json_dataset(path: str) -> List[Dict[str, Any]]:
    """Read a JSON dataset file and normalise it to a list of records.

    Accepts three top-level shapes: a bare list of records, a wrapper
    object with an 'institutions' key, or a single record object.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    # A top-level list is already the record collection.
    if isinstance(payload, list):
        return payload
    # A wrapper object may nest the records under 'institutions'.
    if isinstance(payload, dict) and 'institutions' in payload:
        return payload['institutions']
    # Anything else is treated as one standalone record.
    return [payload]
def load_yaml_dataset(path: str) -> List[Dict[str, Any]]:
    """Read a YAML dataset file and normalise it to a list of records.

    Mirrors load_json_dataset: accepts a bare list, a wrapper object with
    an 'institutions' key, or a single record object.
    """
    import yaml  # local import: PyYAML only needed when YAML sources are used
    with open(path, 'r', encoding='utf-8') as handle:
        payload = yaml.safe_load(handle)
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and 'institutions' in payload:
        return payload['institutions']
    return [payload]
def extract_key_metadata(record: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten a full institution record into the unified-database row shape.

    Missing fields come back as None (or False for the has_* flags); the
    complete original record is preserved verbatim as a JSON string under
    'raw_record'.
    """
    # Hoist the nested containers once instead of re-fetching per field.
    locations = record.get('locations')
    primary_location = locations[0] if locations else {}
    provenance = record.get('provenance', {})
    identifiers = record.get('identifiers', [])
    return {
        'id': record.get('id'),
        'ghcid': record.get('ghcid'),
        'ghcid_uuid': record.get('ghcid_uuid'),
        'ghcid_numeric': record.get('ghcid_numeric'),
        'name': record.get('name'),
        'institution_type': record.get('institution_type'),
        'country': primary_location.get('country'),
        'city': primary_location.get('city'),
        'data_source': provenance.get('data_source'),
        'data_tier': provenance.get('data_tier'),
        'extraction_date': provenance.get('extraction_date'),
        'has_wikidata': any(
            ident.get('identifier_scheme') == 'Wikidata'
            for ident in identifiers
        ),
        'has_website': any(
            ident.get('identifier_scheme') == 'Website'
            for ident in identifiers
        ),
        'raw_record': json.dumps(record, ensure_ascii=False),
    }
def _update_stats(stats: Dict[str, Any], metadata: Dict[str, Any]) -> None:
    """Fold one institution's flattened metadata into its country's stats."""
    stats['total'] += 1
    if metadata.get('ghcid'):
        stats['with_ghcid'] += 1
    if metadata.get('has_wikidata'):
        stats['with_wikidata'] += 1
    if metadata.get('has_website'):
        stats['with_website'] += 1
    # BUG FIX: extract_key_metadata always sets 'institution_type' (possibly
    # to None), so dict.get's default never fired and None leaked into the
    # by_type keys. `or` maps both a missing key and None to 'UNKNOWN'.
    inst_type = metadata.get('institution_type') or 'UNKNOWN'
    stats['by_type'][inst_type] += 1


def _load_all_datasets(country_stats: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Load every configured country dataset, printing progress.

    Returns the flattened metadata rows for all institutions; per-country
    statistics are accumulated into `country_stats` as a side effect.
    A dataset that is missing or fails to parse is reported and skipped.
    """
    all_institutions: List[Dict[str, Any]] = []
    for country, path in COUNTRY_DATASETS.items():
        if not Path(path).exists():
            print(f"⚠️ {country.upper()}: Dataset not found at {path}")
            continue
        print(f"\n📂 Loading {country.upper()}...")
        try:
            if path.endswith('.json'):
                records = load_json_dataset(path)
            elif path.endswith('.yaml'):
                records = load_yaml_dataset(path)
            else:
                print(f" ⚠️ Unknown format: {path}")
                continue
            print(f" ✅ Loaded {len(records)} institutions")
            for record in records:
                metadata = extract_key_metadata(record)
                metadata['source_country'] = country
                all_institutions.append(metadata)
                # country_stats is a defaultdict, so the entry is created on
                # first access; countries with zero records never appear.
                _update_stats(country_stats[country], metadata)
        except Exception as e:
            # Best-effort: one bad dataset must not abort the whole build.
            print(f" ❌ Error loading {country}: {e}")
            continue
    return all_institutions


def _deduplicate_by_ghcid(
    institutions: List[Dict[str, Any]]
) -> (Dict[str, Dict[str, Any]], List[tuple]):
    """Map each GHCID to the first record seen; collect later collisions.

    Returns (ghcid_map, duplicates) where each duplicates entry is
    (ghcid, duplicate record's name, kept record's name). Records without
    a GHCID are skipped entirely.
    """
    ghcid_map: Dict[str, Dict[str, Any]] = {}
    duplicates: List[tuple] = []
    for inst in institutions:
        ghcid = inst.get('ghcid')
        if not ghcid:
            continue
        if ghcid in ghcid_map:
            duplicates.append((ghcid, inst['name'], ghcid_map[ghcid]['name']))
        else:
            ghcid_map[ghcid] = inst
    return ghcid_map, duplicates


def _export_json(
    all_institutions: List[Dict[str, Any]],
    country_stats: Dict[str, Dict[str, Any]],
    ghcid_map: Dict[str, Dict[str, Any]],
    duplicates: List[tuple],
) -> None:
    """Write the unified dataset plus summary metadata as a single JSON file."""
    json_output = OUTPUT_DIR / 'glam_unified_database.json'
    with open(json_output, 'w', encoding='utf-8') as f:
        json.dump({
            'metadata': {
                'export_date': datetime.now(timezone.utc).isoformat(),
                'total_institutions': len(all_institutions),
                'unique_ghcids': len(ghcid_map),
                'duplicates': len(duplicates),
                'countries': list(COUNTRY_DATASETS.keys())
            },
            'country_stats': dict(country_stats),
            'institutions': all_institutions
        }, f, indent=2, ensure_ascii=False)
    print(f"✅ JSON export: {json_output}")


def _export_sqlite(all_institutions: List[Dict[str, Any]]) -> None:
    """Write all institutions into a SQLite database, one row per record.

    Uses INSERT OR REPLACE keyed on the record id, so re-running the build
    overwrites rows in place.
    """
    sqlite_output = OUTPUT_DIR / 'glam_unified_database.db'
    conn = sqlite3.connect(sqlite_output)
    try:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS institutions (
                id TEXT PRIMARY KEY,
                ghcid TEXT,
                ghcid_uuid TEXT,
                ghcid_numeric INTEGER,
                name TEXT NOT NULL,
                institution_type TEXT,
                country TEXT,
                city TEXT,
                source_country TEXT,
                data_source TEXT,
                data_tier TEXT,
                extraction_date TEXT,
                has_wikidata BOOLEAN,
                has_website BOOLEAN,
                raw_record TEXT
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS metadata (
                key TEXT PRIMARY KEY,
                value TEXT
            )
        ''')
        for inst in all_institutions:
            cursor.execute('''
                INSERT OR REPLACE INTO institutions VALUES (
                    ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
                )
            ''', (
                inst.get('id'),
                inst.get('ghcid'),
                inst.get('ghcid_uuid'),
                inst.get('ghcid_numeric'),
                inst.get('name'),
                inst.get('institution_type'),
                inst.get('country'),
                inst.get('city'),
                inst.get('source_country'),
                inst.get('data_source'),
                inst.get('data_tier'),
                inst.get('extraction_date'),
                inst.get('has_wikidata'),
                inst.get('has_website'),
                inst.get('raw_record')
            ))
        cursor.execute('INSERT OR REPLACE INTO metadata VALUES (?, ?)',
                       ('export_date', datetime.now(timezone.utc).isoformat()))
        cursor.execute('INSERT OR REPLACE INTO metadata VALUES (?, ?)',
                       ('total_institutions', str(len(all_institutions))))
        conn.commit()
    finally:
        # BUG FIX: the connection previously leaked when an insert raised
        # (e.g. the NOT NULL constraint on name); always close it.
        conn.close()
    print(f"✅ SQLite export: {sqlite_output}")


def _print_country_stats(country_stats: Dict[str, Dict[str, Any]]) -> None:
    """Print per-country coverage percentages and type breakdown."""
    print("\n" + "=" * 70)
    print("📈 Country Statistics\n")
    for country, stats in sorted(country_stats.items()):
        # stats entries are only created per loaded record, so total >= 1
        # and the percentage divisions below are safe.
        print(f"{country.upper()}:")
        print(f" Total: {stats['total']}")
        print(f" GHCID: {stats['with_ghcid']} ({stats['with_ghcid']/stats['total']*100:.1f}%)")
        print(f" Wikidata: {stats['with_wikidata']} ({stats['with_wikidata']/stats['total']*100:.1f}%)")
        print(f" Website: {stats['with_website']} ({stats['with_website']/stats['total']*100:.1f}%)")
        print(f" Types: {dict(stats['by_type'])}")
        print()


def _print_duplicates(duplicates: List[tuple]) -> None:
    """Print up to ten duplicate-GHCID collisions, plus an overflow count."""
    if not duplicates:
        return
    print("\n⚠️ Duplicate GHCIDs Detected:")
    for ghcid, name1, name2 in duplicates[:10]:
        print(f" {ghcid}: '{name1}' vs '{name2}'")
    if len(duplicates) > 10:
        print(f" ... and {len(duplicates) - 10} more")


def build_unified_database():
    """Build the unified database from all country datasets.

    Pipeline: load + flatten every configured dataset, deduplicate by
    GHCID, export to JSON and SQLite in OUTPUT_DIR, then print summary
    statistics. Interface unchanged; the work is delegated to the
    single-purpose helpers above.
    """
    print("🌍 Building Unified GLAM Heritage Custodian Database")
    print("=" * 70)
    country_stats: Dict[str, Dict[str, Any]] = defaultdict(lambda: {
        'total': 0,
        'with_ghcid': 0,
        'with_wikidata': 0,
        'with_website': 0,
        'by_type': defaultdict(int)
    })
    all_institutions = _load_all_datasets(country_stats)
    print("\n" + "=" * 70)
    print(f"📊 Total institutions loaded: {len(all_institutions)}")
    ghcid_map, duplicates = _deduplicate_by_ghcid(all_institutions)
    print(f"🔍 Unique GHCIDs: {len(ghcid_map)}")
    print(f"⚠️ Duplicates detected: {len(duplicates)}")
    _export_json(all_institutions, country_stats, ghcid_map, duplicates)
    _export_sqlite(all_institutions)
    _print_country_stats(country_stats)
    _print_duplicates(duplicates)
    print("\n✅ Unified database build complete!")
    print(f"📂 Output directory: {OUTPUT_DIR}")


if __name__ == '__main__':
    build_unified_database()