glam/scripts/final_enrichment_validation_report.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

167 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Final validation report for enrichment_history backfill project.
Checks ALL institution files for completeness.
"""
import yaml
from pathlib import Path
from collections import defaultdict
# Root directory holding all per-region institution instance files.
BASE_DIR = Path('/Users/kempersc/apps/glam/data/instances')

# (relative path under BASE_DIR, human-readable report label) pairs,
# one per dataset included in the final validation report.
FILES_TO_CHECK = [
    ('latin_american_institutions_AUTHORITATIVE.yaml', 'Latin America (AUTHORITATIVE)'),
    ('georgia_glam_institutions_enriched.yaml', 'Georgia'),
    ('tunisia/tunisian_institutions.yaml', 'Tunisia'),
    ('algeria/algerian_institutions.yaml', 'Algeria'),
    ('libya/libyan_institutions.yaml', 'Libya'),
    ('brazil/brazilian_institutions_batch6_enriched.yaml', 'Brazil (Batch 6)'),
    ('belgium/be_institutions_enriched_manual.yaml', 'Belgium'),
    ('great_britain/gb_institutions_enriched_manual.yaml', 'Great Britain'),
    ('united_states/us_institutions_enriched_manual.yaml', 'United States'),
]
def check_file(filepath: Path, label: str):
    """Check enrichment_history completeness for one institutions YAML file.

    Args:
        filepath: Path to the YAML file to inspect.
        label: Human-readable dataset name, echoed back in the result.

    Returns:
        A dict with keys 'label', 'total', 'with_wikidata', 'with_enrichment',
        'gap' (Wikidata count minus enrichment count) and 'by_country'
        (per-country totals), or None when the file does not exist.
    """
    if not filepath.exists():
        return None
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # An empty YAML file parses to None — treat it as zero institutions
    # instead of crashing on the membership test below.
    if data is None:
        data = []
    # Files either wrap records under an 'institutions' key or are a bare list.
    if isinstance(data, dict) and 'institutions' in data:
        institutions = data['institutions']
    else:
        institutions = data
    total = len(institutions)
    with_wikidata = 0
    with_enrichment = 0
    by_country = defaultdict(lambda: {'total': 0, 'wikidata': 0, 'enrichment': 0})
    for inst in institutions:
        # Country of the first listed location determines the bucket.
        locations = inst.get('locations', [])
        country = locations[0].get('country', 'UNKNOWN') if locations else 'UNKNOWN'
        by_country[country]['total'] += 1
        # Wikidata presence: any identifier entry using the 'Wikidata' scheme.
        # (Loop variable renamed from `id` to avoid shadowing the builtin.)
        identifiers = inst.get('identifiers', [])
        has_wikidata = any(
            ident.get('identifier_scheme') == 'Wikidata' for ident in identifiers
        )
        if has_wikidata:
            with_wikidata += 1
            by_country[country]['wikidata'] += 1
        # enrichment_history counts only when present AND non-empty.
        provenance = inst.get('provenance', {})
        if provenance.get('enrichment_history'):
            with_enrichment += 1
            by_country[country]['enrichment'] += 1
    return {
        'label': label,
        'total': total,
        'with_wikidata': with_wikidata,
        'with_enrichment': with_enrichment,
        'gap': with_wikidata - with_enrichment,
        'by_country': dict(by_country),
    }
def main():
    """Generate the final validation report across all configured files.

    Prints per-file stats, a project-wide summary, and a per-country
    breakdown aggregated over every dataset in FILES_TO_CHECK.
    """
    print("=" * 80)
    print("ENRICHMENT HISTORY BACKFILL PROJECT - FINAL VALIDATION REPORT")
    print("=" * 80)
    print()
    results = []
    total_institutions = 0
    total_wikidata = 0
    total_enrichment = 0
    for filename, label in FILES_TO_CHECK:
        filepath = BASE_DIR / filename
        result = check_file(filepath, label)
        if result is None:
            print(f"⚠️ {label}: File not found")
            print()
            continue
        results.append(result)
        total_institutions += result['total']
        total_wikidata += result['with_wikidata']
        total_enrichment += result['with_enrichment']
        # NOTE(review): both branches are empty strings — the status icons
        # (presumably "✅"/"❌") look lost in transit; confirm against the
        # original script before restoring them.
        status = "" if result['gap'] == 0 else ""
        print(f"{status} {result['label']}")
        print(f" Total institutions: {result['total']}")
        print(f" With Wikidata IDs: {result['with_wikidata']}")
        print(f" With enrichment_history: {result['with_enrichment']}")
        print(f" Gap: {result['gap']}")
        if result['by_country']:
            print(" By country:")
            # Only countries with at least one Wikidata ID are reported.
            for country, stats in sorted(result['by_country'].items()):
                if stats['wikidata'] > 0:
                    gap = stats['wikidata'] - stats['enrichment']
                    status_icon = "" if gap == 0 else ""
                    print(f" {status_icon} {country}: {stats['wikidata']} Wikidata, "
                          f"{stats['enrichment']} enrichment, gap: {gap}")
        print()

    # ---- Project-wide summary ----
    total_gap = total_wikidata - total_enrichment
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total institutions processed: {total_institutions}")
    print(f"Total with Wikidata IDs: {total_wikidata}")
    print(f"Total with enrichment_history: {total_enrichment}")
    print(f"Total gap: {total_gap}")
    print()
    if total_gap == 0:
        print("🎉 " + "=" * 76)
        print(" 100% COMPLETENESS ACHIEVED - PROJECT SUCCESS!")
        print(" " + "=" * 76)
        print()
        print(" ✅ All institutions with Wikidata IDs have enrichment_history")
        print(" ✅ Provenance tracking complete across all datasets")
        # Bug fix: this line was missing its f-prefix, so the literal text
        # "{total_wikidata}" was printed instead of the count.
        print(f" ✅ Data quality metadata documented for {total_wikidata} institutions")
        print()
    else:
        print(f"{total_gap} institutions still missing enrichment_history")
        print()
        for result in results:
            if result['gap'] > 0:
                print(f" - {result['label']}: {result['gap']} institutions")

    # ---- Breakdown aggregated by country across all files ----
    print("=" * 80)
    print("BREAKDOWN BY COUNTRY")
    print("=" * 80)
    country_totals = defaultdict(lambda: {'wikidata': 0, 'enrichment': 0})
    for result in results:
        for country, stats in result['by_country'].items():
            country_totals[country]['wikidata'] += stats['wikidata']
            country_totals[country]['enrichment'] += stats['enrichment']
    for country in sorted(country_totals.keys()):
        stats = country_totals[country]
        gap = stats['wikidata'] - stats['enrichment']
        status = "" if gap == 0 else ""
        if stats['wikidata'] > 0:
            print(f"{status} {country}: {stats['wikidata']} Wikidata, "
                  f"{stats['enrichment']} enrichment, gap: {gap}")


if __name__ == '__main__':
    main()