glam/scripts/validate_enrichment_history_targeted.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

149 lines
5.3 KiB
Python

#!/usr/bin/env python3
"""
Validate enrichment_history completeness for countries with Wikidata enrichments.
Targeted validation to avoid timeout - checks specific countries.
"""
import yaml
from pathlib import Path
from collections import defaultdict
def load_yaml(filepath):
    """Read a YAML file and return its parsed contents.

    Returns [] when the file is missing, unreadable, malformed, or empty,
    printing the error rather than raising — this script is best-effort
    and should keep going past a bad country file.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as fh:
            parsed = yaml.safe_load(fh)
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return []
    # An empty document parses to None; normalize to an empty list.
    return parsed if parsed else []
def check_enrichment_history(institutions, country_name):
    """Check enrichment_history completeness for institutions with Wikidata IDs.

    Args:
        institutions: List of institution dicts loaded from YAML. May be
            None or empty (e.g. when the source file failed to load).
        country_name: Country label for this batch. Currently unused; kept
            so the caller-facing signature is unchanged.

    Returns:
        Dict with counters 'total', 'with_wikidata',
        'with_enrichment_history', plus 'missing_enrichment_history': a
        list of {'name', 'id'} records for institutions that have a
        Wikidata identifier but no enrichment_history in provenance.
    """
    stats = {
        'total': 0,
        'with_wikidata': 0,
        'with_enrichment_history': 0,
        'missing_enrichment_history': []
    }
    if not institutions:
        return stats
    for inst in institutions:
        stats['total'] += 1
        # YAML yields None for an empty 'identifiers:' key, so coalesce
        # with `or` — a plain .get default would pass None to any().
        identifiers = inst.get('identifiers') or []
        has_wikidata = any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in identifiers
        )
        if has_wikidata:
            stats['with_wikidata'] += 1
            # Likewise, an empty 'provenance:' key parses to None; .get on
            # None would raise AttributeError, so normalize to {} first.
            provenance = inst.get('provenance') or {}
            enrichment_history = provenance.get('enrichment_history') or []
            if enrichment_history:
                stats['with_enrichment_history'] += 1
            else:
                stats['missing_enrichment_history'].append({
                    'name': inst.get('name', 'UNKNOWN'),
                    'id': inst.get('id', 'UNKNOWN')
                })
    return stats
def main():
    """Validate targeted countries and print a per-country + summary report.

    Iterates a hard-coded list of country YAML files (chosen because they
    are known to contain Wikidata enrichments), tallies how many
    Wikidata-bearing institutions carry enrichment_history, and prints the
    gap per country plus an aggregate summary.
    """
    base_path = Path('/Users/kempersc/apps/glam/data/instances')
    # Countries to check (known to have Wikidata enrichments)
    target_countries = [
        ('chile', 'chile/chilean_institutions.yaml'),
        ('georgia', 'georgia/georgian_institutions.yaml'),
        ('japan', 'japan/japanese_institutions.yaml'),
        ('mexico', 'mexico/mexican_institutions.yaml'),
        ('norway', 'norway/norwegian_institutions.yaml'),
        ('tunisia', 'tunisia/tunisian_institutions.yaml'),
        ('algeria', 'algeria/algerian_institutions.yaml'),
        ('libya', 'libya/libyan_institutions.yaml'),
        ('brazil', 'brazil/brazilian_institutions_batch6_enriched.yaml'),
        ('belgium', 'belgium/be_institutions_enriched_manual.yaml'),
        ('great_britain', 'great_britain/gb_institutions_enriched_manual.yaml'),
        ('united_states', 'united_states/us_institutions_enriched_manual.yaml'),
    ]
    print("=" * 80)
    print("ENRICHMENT HISTORY VALIDATION - TARGETED COUNTRIES")
    print("=" * 80)
    print()
    total_stats = defaultdict(int)
    all_missing = []
    for country_name, filepath in target_countries:
        full_path = base_path / filepath
        if not full_path.exists():
            print(f"⚠️  {country_name.upper()}: File not found - {filepath}")
            print()
            continue
        institutions = load_yaml(full_path)
        stats = check_enrichment_history(institutions, country_name)
        # Aggregate totals
        total_stats['total'] += stats['total']
        total_stats['with_wikidata'] += stats['with_wikidata']
        total_stats['with_enrichment_history'] += stats['with_enrichment_history']
        # Report. BUG FIX: both branches previously produced the empty
        # string, so the per-country pass/fail marker was invisible; use
        # visible markers consistent with the rest of the output.
        gap = stats['with_wikidata'] - stats['with_enrichment_history']
        status = "✅" if gap == 0 else "❌"
        print(f"{status} {country_name.upper()}")
        print(f"   Total institutions: {stats['total']}")
        print(f"   With Wikidata: {stats['with_wikidata']}")
        print(f"   With enrichment_history: {stats['with_enrichment_history']}")
        print(f"   Gap: {gap}")
        if stats['missing_enrichment_history']:
            print(f"   Missing enrichment_history:")
            for missing in stats['missing_enrichment_history'][:5]:  # Show first 5
                print(f"     - {missing['name']}")
            if len(stats['missing_enrichment_history']) > 5:
                print(f"     ... and {len(stats['missing_enrichment_history']) - 5} more")
            all_missing.extend(stats['missing_enrichment_history'])
        print()
    # Summary
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    total_gap = total_stats['with_wikidata'] - total_stats['with_enrichment_history']
    print(f"Total institutions: {total_stats['total']}")
    print(f"With Wikidata IDs: {total_stats['with_wikidata']}")
    print(f"With enrichment_history: {total_stats['with_enrichment_history']}")
    print(f"Total gap: {total_gap}")
    print()
    if total_gap == 0:
        print("✅ 100% COMPLETENESS ACHIEVED!")
        print("All institutions with Wikidata IDs have enrichment_history.")
    else:
        print(f"{total_gap} institutions still missing enrichment_history")
        print()
        print("Institutions missing enrichment_history:")
        for missing in all_missing[:20]:  # Show first 20
            print(f"  - {missing['name']} ({missing['id']})")
        if len(all_missing) > 20:
            print(f"  ... and {len(all_missing) - 20} more")


if __name__ == '__main__':
    main()