glam/scripts/unify_all_datasets.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

475 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Unify All GLAM Datasets - Comprehensive Global Integration
This script unifies all heritage institution datasets from individual countries
into a single comprehensive global dataset at data/instances/all/
Features:
- Merges all country-specific YAML files
- Deduplicates by ID and coordinates
- Tracks data provenance by country
- Generates comprehensive statistics
- Identifies records needing enrichment (missing Q-numbers, coordinates, etc.)
Country Sources:
- Brazil: brazilian_institutions_batch6_enriched.yaml (115 institutions)
- Chile: chilean_institutions_batch19_enriched.yaml (90 institutions, 78.9% Wikidata)
- Mexico: mexican_institutions_geocoded.yaml (117 institutions)
- Japan: jp_institutions_resolved.yaml (12,065 institutions)
- Libya: libyan_institutions.yaml (54 institutions)
- Tunisia: tunisian_institutions.yaml (42 institutions)
- Algeria: algerian_institutions.yaml (20 institutions)
- Vietnam: vietnamese_glam_institutions.yaml (21 institutions)
- Georgia: georgia_glam_institutions.yaml (14 institutions)
- Global: global_heritage_institutions_merged.yaml (13,396 institutions)
Output:
- data/instances/all/globalglam-20251111.yaml
- data/instances/all/UNIFICATION_REPORT.md
- data/instances/all/ENRICHMENT_CANDIDATES.yaml (records needing enrichment)
"""
import yaml
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any
from collections import defaultdict
def load_yaml_safe(filepath: Path) -> List[Dict]:
    """Load a YAML dataset file and return its top-level list of institutions.

    Prints progress/diagnostic messages as it goes. Returns an empty list
    when the file is unreadable, malformed YAML, empty, or does not contain
    a top-level list — callers can therefore always iterate the result.

    Args:
        filepath: Path to the YAML file to load.

    Returns:
        The parsed list of institution dicts, or [] on any failure.
    """
    print(f"Loading: {filepath.name}")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    # Catch only the expected failure modes (I/O errors and YAML syntax
    # errors) instead of a blanket Exception that would hide real bugs.
    except (OSError, yaml.YAMLError) as e:
        print(f" ❌ Error loading {filepath.name}: {e}")
        return []
    if not data:
        print(f" ⚠️ Empty file: {filepath.name}")
        return []
    if isinstance(data, list):
        print(f" ✅ Loaded {len(data)} institutions")
        return data
    print(f" ⚠️ Unexpected format (not a list): {type(data)}")
    return []
def get_country_code(inst: Dict) -> str:
    """Best-effort lookup of an institution's country code.

    Checks each location record for an explicit ``country`` value first,
    then falls back to parsing a two-letter segment out of the institution
    ID path (e.g. ``.../br/museum-x`` -> ``'BR'``). Returns ``'UNKNOWN'``
    when neither source yields a code.
    """
    for loc in inst.get('locations') or []:
        if 'country' in loc and loc['country']:
            return loc['country']
    # Fall back to the ID path: the second-to-last segment is the
    # country slug when it is exactly two characters long.
    if 'id' in inst:
        segments = inst['id'].split('/')
        if len(segments) >= 2:
            candidate = segments[-2]
            if len(candidate) == 2:
                return candidate.upper()
    return 'UNKNOWN'
def has_wikidata(inst: Dict) -> bool:
    """Return True if the institution carries a Wikidata identifier.

    Looks for any entry in ``inst['identifiers']`` whose
    ``identifier_scheme`` is the literal string ``'Wikidata'``.
    Missing or empty identifier lists yield False.
    """
    identifiers = inst.get('identifiers')
    if not identifiers:
        return False
    # 'ident' rather than 'id' so we don't shadow the builtin id().
    return any(
        ident.get('identifier_scheme') == 'Wikidata'
        for ident in identifiers
    )
def has_coordinates(inst: Dict) -> bool:
    """Report whether any of the institution's locations is geocoded.

    A location counts as geocoded when both ``latitude`` and
    ``longitude`` are present and not None (zero is a valid coordinate).
    """
    locations = inst.get('locations')
    if not locations:
        return False
    for loc in locations:
        if loc.get('latitude') is not None and loc.get('longitude') is not None:
            return True
    return False
def needs_enrichment(inst: Dict) -> Dict[str, bool]:
    """Identify what enrichment an institution needs.

    Returns a dict with one boolean flag per enrichment type:
    - 'wikidata': no Wikidata identifier present
    - 'coordinates': no geocoded location present
    - 'website': no identifier with scheme 'Website'
    - 'description': description missing, empty, or shorter than 50 chars
    """
    identifiers = inst.get('identifiers') or []
    description = inst.get('description') or ''
    return {
        'wikidata': not has_wikidata(inst),
        'coordinates': not has_coordinates(inst),
        # any() over an empty list is already False, so the previous
        # "... if inst.get('identifiers') else True" ternary was redundant.
        # 'ident' rather than 'id' avoids shadowing the builtin.
        'website': not any(
            ident.get('identifier_scheme') == 'Website'
            for ident in identifiers
        ),
        'description': len(description) < 50,
    }
def main():
    """Main unification workflow.

    Loads every country-specific YAML dataset, tags each record's
    provenance with its unification source, deduplicates by institution
    ID (preferring Wikidata-enriched records), computes coverage and
    enrichment statistics, and writes four outputs to data/instances/all/:
    the unified dataset, an enrichment-candidates file, a Markdown
    report, and a statistics YAML file.
    """
    # NOTE(review): hard-coded user-specific absolute path — consider
    # deriving from the repo root or an environment variable. TODO confirm.
    base_dir = Path('/Users/kempersc/apps/glam/data/instances')
    output_dir = base_dir / 'all'
    output_dir.mkdir(exist_ok=True)
    print("\n" + "="*80)
    print("GLAM Dataset Unification - Global Integration")
    print("="*80 + "\n")
    # Define data sources (most recent files for each country)
    sources = {
        'chile': base_dir / 'chile' / 'chilean_institutions_batch19_enriched.yaml',
        'brazil': base_dir / 'brazil' / 'brazilian_institutions_batch6_enriched.yaml',
        'mexico': base_dir / 'mexico' / 'mexican_institutions_geocoded.yaml',
        'japan': base_dir / 'japan' / 'jp_institutions_resolved.yaml',
        'libya': base_dir / 'libya' / 'libyan_institutions.yaml',
        'tunisia': base_dir / 'tunisia' / 'tunisian_institutions.yaml',
        'algeria': base_dir / 'algeria' / 'algerian_institutions.yaml',
        # NOTE(review): vietnam/georgia/historical live directly under
        # base_dir rather than a country subfolder — presumably intentional.
        'vietnam': base_dir / 'vietnamese_glam_institutions.yaml',
        'georgia': base_dir / 'georgia_glam_institutions.yaml',
        'historical': base_dir / 'historical_institutions_validation.yaml',
        'global': base_dir / 'global' / 'global_heritage_institutions_merged.yaml',
    }
    # Load all datasets; missing files are skipped rather than fatal.
    all_institutions = []
    source_stats = {}
    for source_name, filepath in sources.items():
        if not filepath.exists():
            print(f"⚠️ Skipping {source_name}: file not found")
            continue
        institutions = load_yaml_safe(filepath)
        # Add source tracking to provenance (mutates loaded records in place)
        for inst in institutions:
            if 'provenance' not in inst:
                inst['provenance'] = {}
            inst['provenance']['unification_source'] = source_name
            inst['provenance']['unification_date'] = datetime.now(timezone.utc).isoformat()
        all_institutions.extend(institutions)
        # Calculate per-source statistics
        source_stats[source_name] = {
            'total': len(institutions),
            'with_wikidata': sum(1 for i in institutions if has_wikidata(i)),
            'with_coordinates': sum(1 for i in institutions if has_coordinates(i)),
        }
    print(f"\n📊 Total institutions loaded: {len(all_institutions)}")
    # Deduplicate by ID
    print("\n🔍 Deduplicating by ID...")
    seen_ids = {}  # Maps ID -> (institution, source_name)
    duplicates = []
    unique_institutions = []
    for inst in all_institutions:
        inst_id = inst.get('id')
        if not inst_id:
            # Records without an ID cannot be deduplicated; keep them all.
            unique_institutions.append(inst)
            continue
        source = inst['provenance'].get('unification_source', 'unknown')
        if inst_id in seen_ids:
            existing_inst, existing_source = seen_ids[inst_id]
            duplicates.append({
                'id': inst_id,
                'sources': [existing_source, source]
            })
            # Keep the one with more data (prioritize those with Wikidata)
            if has_wikidata(inst) and not has_wikidata(existing_inst):
                # Replace with more enriched version.
                # NOTE(review): this list rebuild is O(n) per replacement —
                # fine at current scale, quadratic in the worst case.
                unique_institutions = [i for i in unique_institutions if i.get('id') != inst_id]
                unique_institutions.append(inst)
                seen_ids[inst_id] = (inst, source)
        else:
            seen_ids[inst_id] = (inst, source)
            unique_institutions.append(inst)
    print(f" ✅ Unique institutions: {len(unique_institutions)}")
    print(f" ⚠️ Duplicates removed: {len(duplicates)}")
    # Calculate enrichment statistics over the deduplicated set
    print("\n📈 Calculating enrichment statistics...")
    enrichment_stats = {
        'total': len(unique_institutions),
        'with_wikidata': sum(1 for i in unique_institutions if has_wikidata(i)),
        'with_coordinates': sum(1 for i in unique_institutions if has_coordinates(i)),
        'needs_wikidata': sum(1 for i in unique_institutions if needs_enrichment(i)['wikidata']),
        'needs_coordinates': sum(1 for i in unique_institutions if needs_enrichment(i)['coordinates']),
        'needs_website': sum(1 for i in unique_institutions if needs_enrichment(i)['website']),
        'needs_description': sum(1 for i in unique_institutions if needs_enrichment(i)['description']),
    }
    # Group by country
    by_country = defaultdict(list)
    for inst in unique_institutions:
        country = get_country_code(inst)
        by_country[country].append(inst)
    print(f"\n🌍 Countries covered: {len(by_country)}")
    for country, insts in sorted(by_country.items(), key=lambda x: len(x[1]), reverse=True):
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        wikidata_pct = (wikidata_count / len(insts) * 100) if insts else 0
        print(f" {country}: {len(insts)} institutions ({wikidata_count}/{len(insts)} = {wikidata_pct:.1f}% Wikidata)")
    # Identify enrichment candidates (any record missing at least one field)
    print("\n🎯 Identifying enrichment candidates...")
    enrichment_candidates = []
    for inst in unique_institutions:
        needs = needs_enrichment(inst)
        if any(needs.values()):
            enrichment_candidates.append({
                'id': inst.get('id'),
                'name': inst.get('name'),
                'country': get_country_code(inst),
                'institution_type': inst.get('institution_type'),
                'needs': needs,
                'priority_score': sum(needs.values())  # Higher = more needs
            })
    # Sort by priority (most missing fields first)
    enrichment_candidates.sort(key=lambda x: x['priority_score'], reverse=True)
    print(f" 🔍 Found {len(enrichment_candidates)} institutions needing enrichment")
    print(f" - Need Wikidata: {enrichment_stats['needs_wikidata']}")
    print(f" - Need coordinates: {enrichment_stats['needs_coordinates']}")
    print(f" - Need website: {enrichment_stats['needs_website']}")
    print(f" - Need description: {enrichment_stats['needs_description']}")
    # Save unified dataset
    output_file = output_dir / 'globalglam-20251111.yaml'
    print(f"\n💾 Saving unified dataset to: {output_file.name}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(unique_institutions, f, allow_unicode=True, sort_keys=False, width=120)
    print(f" ✅ Saved {len(unique_institutions)} institutions")
    # Save enrichment candidates
    candidates_file = output_dir / 'ENRICHMENT_CANDIDATES.yaml'
    print(f"\n💾 Saving enrichment candidates to: {candidates_file.name}")
    with open(candidates_file, 'w', encoding='utf-8') as f:
        yaml.dump(enrichment_candidates, f, allow_unicode=True, sort_keys=False)
    print(f" ✅ Saved {len(enrichment_candidates)} candidates")
    # Generate unification report (Markdown, built incrementally)
    report_file = output_dir / 'UNIFICATION_REPORT.md'
    print(f"\n📄 Generating unification report: {report_file.name}")
    report = f"""# GLAM Dataset Unification Report
**Generated**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}
## Executive Summary
- **Total Institutions**: {len(unique_institutions):,}
- **Countries Covered**: {len(by_country)}
- **Wikidata Coverage**: {enrichment_stats['with_wikidata']:,}/{enrichment_stats['total']:,} ({enrichment_stats['with_wikidata']/enrichment_stats['total']*100:.1f}%)
- **Geocoding Coverage**: {enrichment_stats['with_coordinates']:,}/{enrichment_stats['total']:,} ({enrichment_stats['with_coordinates']/enrichment_stats['total']*100:.1f}%)
- **Duplicates Removed**: {len(duplicates)}
## Data Sources
"""
    # One subsection per source, alphabetical
    for source_name, stats in sorted(source_stats.items()):
        wikidata_pct = (stats['with_wikidata'] / stats['total'] * 100) if stats['total'] > 0 else 0
        geocode_pct = (stats['with_coordinates'] / stats['total'] * 100) if stats['total'] > 0 else 0
        report += f"""### {source_name.title()}
- Total: {stats['total']:,} institutions
- Wikidata: {stats['with_wikidata']:,} ({wikidata_pct:.1f}%)
- Geocoded: {stats['with_coordinates']:,} ({geocode_pct:.1f}%)
"""
    report += f"""## Coverage by Country
| Country | Total | Wikidata | Wikidata % | Geocoded | Geocoded % |
|---------|-------|----------|------------|----------|------------|
"""
    # Country table, largest first
    for country, insts in sorted(by_country.items(), key=lambda x: len(x[1]), reverse=True):
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        wikidata_pct = (wikidata_count / len(insts) * 100) if insts else 0
        geocode_count = sum(1 for i in insts if has_coordinates(i))
        geocode_pct = (geocode_count / len(insts) * 100) if insts else 0
        report += f"| {country} | {len(insts):,} | {wikidata_count:,} | {wikidata_pct:.1f}% | {geocode_count:,} | {geocode_pct:.1f}% |\n"
    report += f"""
## Enrichment Needs
Total institutions requiring enrichment: **{len(enrichment_candidates):,}** ({len(enrichment_candidates)/len(unique_institutions)*100:.1f}% of dataset)
### By Enrichment Type
- **Need Wikidata**: {enrichment_stats['needs_wikidata']:,} ({enrichment_stats['needs_wikidata']/enrichment_stats['total']*100:.1f}%)
- **Need Coordinates**: {enrichment_stats['needs_coordinates']:,} ({enrichment_stats['needs_coordinates']/enrichment_stats['total']*100:.1f}%)
- **Need Website**: {enrichment_stats['needs_website']:,} ({enrichment_stats['needs_website']/enrichment_stats['total']*100:.1f}%)
- **Need Description**: {enrichment_stats['needs_description']:,} ({enrichment_stats['needs_description']/enrichment_stats['total']*100:.1f}%)
### Priority Distribution (by number of missing fields)
"""
    # Histogram of how many records miss 1, 2, 3, 4 fields
    priority_dist = defaultdict(int)
    for candidate in enrichment_candidates:
        priority_dist[candidate['priority_score']] += 1
    for priority in sorted(priority_dist.keys(), reverse=True):
        count = priority_dist[priority]
        report += f"- **Priority {priority}** ({priority} missing fields): {count:,} institutions\n"
    report += f"""
## Top 50 Enrichment Candidates (Highest Priority)
| Name | Country | Type | Missing Fields |
|------|---------|------|----------------|
"""
    for candidate in enrichment_candidates[:50]:
        missing = ', '.join([k for k, v in candidate['needs'].items() if v])
        # Truncate long names so the table stays readable
        name_short = candidate['name'][:60] + '...' if len(candidate['name']) > 60 else candidate['name']
        report += f"| {name_short} | {candidate['country']} | {candidate['institution_type']} | {missing} |\n"
    report += f"""
## Deduplication Details
### Duplicates Found
Total duplicate IDs: {len(duplicates)}
"""
    if duplicates:
        report += "| ID | Sources |\n|----|---------|\n"
        for dup in duplicates[:20]:  # Show first 20
            sources_str = ', '.join(dup['sources'])
            # Keep only the trailing 50 chars of long IDs
            id_short = dup['id'][-50:] if len(dup['id']) > 50 else dup['id']
            report += f"| ...{id_short} | {sources_str} |\n"
        if len(duplicates) > 20:
            report += f"\n*...and {len(duplicates) - 20} more duplicates*\n"
    report += f"""
## Next Steps
### Immediate Actions
1. **Review Enrichment Candidates**: Check `ENRICHMENT_CANDIDATES.yaml` for institutions needing data
2. **Prioritize Countries**: Focus on countries with low Wikidata coverage:
"""
    # Find countries with lowest Wikidata coverage
    country_coverage = []
    for country, insts in by_country.items():
        if country == 'UNKNOWN':
            continue
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        wikidata_pct = (wikidata_count / len(insts) * 100) if insts else 0
        country_coverage.append((country, wikidata_pct, len(insts)))
    country_coverage.sort(key=lambda x: x[1])  # Sort by coverage ascending
    for country, pct, count in country_coverage[:10]:
        report += f" - {country}: {pct:.1f}% coverage ({count} institutions)\n"
    report += f"""
3. **Batch Enrichment Workflow**:
- Run Wikidata enrichment for high-priority candidates
- Run geocoding for missing coordinates
- Crawl institutional websites for missing data
### Tools Available
- **Wikidata Enrichment**: `scripts/enrich_global_batch.py`
- **Geocoding**: `scripts/geocode_institutions.py`
- **Website Crawling**: `scripts/crawl_institution_websites.py` (to be created)
## Files Generated
1. **globalglam-20251111.yaml** - Complete unified dataset ({len(unique_institutions):,} institutions)
2. **ENRICHMENT_CANDIDATES.yaml** - Institutions needing enrichment ({len(enrichment_candidates):,} candidates)
3. **UNIFICATION_REPORT.md** - This report
---
**Generated by**: `scripts/unify_all_datasets.py`
**Dataset Version**: 1.0
**Schema Version**: LinkML v0.2.1
"""
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f" ✅ Report saved")
    # Update DATASET_STATISTICS.yaml (machine-readable summary of the run)
    stats_file = output_dir / 'DATASET_STATISTICS.yaml'
    print(f"\n📊 Updating statistics file: {stats_file.name}")
    stats_data = {
        'generated': datetime.now(timezone.utc).isoformat(),
        'project': 'GLAM Data Extraction',
        'schema_version': 'v0.2.1',
        'unified_dataset': {
            'total_institutions': len(unique_institutions),
            'countries_covered': len(by_country),
            'wikidata_coverage': {
                'count': enrichment_stats['with_wikidata'],
                'percentage': round(enrichment_stats['with_wikidata']/enrichment_stats['total']*100, 2)
            },
            'geocoding_coverage': {
                'count': enrichment_stats['with_coordinates'],
                'percentage': round(enrichment_stats['with_coordinates']/enrichment_stats['total']*100, 2)
            },
            'enrichment_needs': {
                'total_candidates': len(enrichment_candidates),
                'needs_wikidata': enrichment_stats['needs_wikidata'],
                'needs_coordinates': enrichment_stats['needs_coordinates'],
                'needs_website': enrichment_stats['needs_website'],
                'needs_description': enrichment_stats['needs_description'],
            }
        },
        'by_country': {}
    }
    # Per-country coverage, alphabetical for stable YAML diffs
    for country, insts in sorted(by_country.items()):
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        geocode_count = sum(1 for i in insts if has_coordinates(i))
        stats_data['by_country'][country] = {
            'total': len(insts),
            'wikidata_coverage': {
                'count': wikidata_count,
                'percentage': round(wikidata_count/len(insts)*100, 2) if insts else 0
            },
            'geocoding_coverage': {
                'count': geocode_count,
                'percentage': round(geocode_count/len(insts)*100, 2) if insts else 0
            }
        }
    with open(stats_file, 'w', encoding='utf-8') as f:
        yaml.dump(stats_data, f, allow_unicode=True, sort_keys=False)
    print(f" ✅ Statistics updated")
    # Final console summary
    print("\n" + "="*80)
    print("✅ UNIFICATION COMPLETE!")
    print("="*80)
    print(f"\n📁 Output files in: {output_dir}/")
    print(f" - globalglam-20251111.yaml ({len(unique_institutions):,} institutions)")
    print(f" - ENRICHMENT_CANDIDATES.yaml ({len(enrichment_candidates):,} candidates)")
    print(f" - UNIFICATION_REPORT.md")
    print(f" - DATASET_STATISTICS.yaml")
    print(f"\n🎯 Ready for global enrichment workflow!")
    print(f" Next: Run enrichment on {enrichment_stats['needs_wikidata']:,} institutions without Wikidata")
# Script entry point: run the full unification workflow.
if __name__ == '__main__':
    main()