glam/scripts/verify_phase1_enrichment.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

120 lines
3.8 KiB
Python

#!/usr/bin/env python3
"""
Verify Phase 1 Wikidata enrichment coverage.
Phase 1 countries:
- Georgia (GE)
- Great Britain (GB)
- Belgium (BE)
- United States (US)
- Luxembourg (LU)
"""
import yaml
from pathlib import Path
from collections import defaultdict
def verify_phase1_enrichment(unified_path='data/instances/all/globalglam-20251111.yaml'):
    """Verify Wikidata identifier coverage for Phase 1 countries.

    Loads the unified institution dataset (a YAML list of institution
    records), tallies how many institutions per Phase 1 country carry a
    Wikidata identifier, prints a per-country and overall report, and
    returns the collected statistics.

    Args:
        unified_path: Path to the unified YAML dataset. Defaults to the
            Phase 1 snapshot this script was written against, so existing
            callers are unaffected.

    Returns:
        defaultdict mapping country code to
        {'total': int, 'with_wikidata': int, 'institutions': [dict, ...]},
        where each institution dict has 'name', 'id', and 'wikidata'
        (the Wikidata identifier value, or None if absent).
    """
    # Load unified dataset
    unified_path = Path(unified_path)
    with open(unified_path, 'r', encoding='utf-8') as f:
        unified = yaml.safe_load(f)
    print(f"Loaded {len(unified)} institutions from unified dataset\n")

    # Phase 1 countries
    phase1_countries = ['GE', 'GB', 'BE', 'US', 'LU']

    # Statistics by country (module-level factory so the defaultdict is
    # picklable, matching the original's intent of avoiding a lambda).
    def make_stats_dict():
        return {'total': 0, 'with_wikidata': 0, 'institutions': []}

    stats = defaultdict(make_stats_dict)

    for inst in unified:
        locations = inst.get('locations', [])
        if not locations:
            continue
        country = locations[0].get('country')
        if country not in phase1_countries:
            continue
        stats[country]['total'] += 1

        # Single pass over identifiers for the first Wikidata entry
        # (the original scanned twice with any() + next(), shadowing the
        # builtin `id` in both comprehensions).
        wikidata_ident = next(
            (ident for ident in inst.get('identifiers', [])
             if ident.get('identifier_scheme') == 'Wikidata'),
            None,
        )
        if wikidata_ident is not None:
            # Counted as enriched even if 'identifier_value' is missing,
            # matching the original branch logic.
            stats[country]['with_wikidata'] += 1
        stats[country]['institutions'].append({
            'name': inst.get('name'),
            'id': inst.get('id'),
            'wikidata': wikidata_ident.get('identifier_value') if wikidata_ident else None,
        })

    # Print detailed results
    print("=" * 70)
    print("PHASE 1 WIKIDATA ENRICHMENT VERIFICATION")
    print("=" * 70)
    print()

    total_institutions = 0
    total_with_wikidata = 0
    for country in sorted(phase1_countries):
        country_stats = stats[country]
        total = country_stats['total']
        with_wikidata = country_stats['with_wikidata']
        # Exact 100% check is safe: with_wikidata == total gives exactly 100.0
        percentage = (with_wikidata / total * 100) if total > 0 else 0
        total_institutions += total
        total_with_wikidata += with_wikidata
        # NOTE(review): both status markers are empty strings in the source —
        # they look like stripped ✓/✗ glyphs; confirm against the original file.
        status = "" if percentage == 100 else ""
        print(f"{status} {country}: {with_wikidata}/{total} with Wikidata ({percentage:.1f}%)")

        # Show institutions without Wikidata
        missing = [entry for entry in country_stats['institutions'] if entry['wikidata'] is None]
        if missing:
            print(f"  Missing Wikidata:")
            for entry in missing:
                print(f"    - {entry['name']}")
                print(f"      {entry['id']}")
        print()

    # Overall summary
    overall_percentage = (total_with_wikidata / total_institutions * 100) if total_institutions > 0 else 0
    # NOTE(review): same stripped-glyph suspicion as the per-country status.
    overall_status = "" if overall_percentage == 100 else ""
    print("=" * 70)
    print(f"{overall_status} PHASE 1 TOTAL: {total_with_wikidata}/{total_institutions} with Wikidata ({overall_percentage:.1f}%)")
    print("=" * 70)
    if overall_percentage == 100:
        print("\n🎉 Phase 1 enrichment complete! All institutions have Wikidata identifiers.")
    else:
        print(f"\n⚠ Phase 1 incomplete: {total_institutions - total_with_wikidata} institutions need Wikidata enrichment.")

    return stats
# Script entry point: run the verification report when executed directly.
if __name__ == '__main__':
    verify_phase1_enrichment()