#!/usr/bin/env python3
"""
Verify Phase 1 Wikidata enrichment coverage.

Phase 1 countries:
- Georgia (GE)
- Great Britain (GB)
- Belgium (BE)
- United States (US)
- Luxembourg (LU)
"""

from collections import defaultdict
from pathlib import Path

# Dataset read when the caller supplies neither a path nor preloaded records.
DEFAULT_UNIFIED_PATH = Path('data/instances/all/globalglam-20251111.yaml')

# Country codes covered by Phase 1 (see module docstring).
PHASE1_COUNTRIES = ('GE', 'GB', 'BE', 'US', 'LU')

# Value of `identifier_scheme` that marks a Wikidata identifier entry.
WIKIDATA_SCHEME = 'Wikidata'

# Status glyphs, written as escapes so the source stays ASCII-only and the
# characters cannot be garbled by a wrong-encoding round trip.
STATUS_OK = "\u2705"        # white heavy check mark
STATUS_FAIL = "\u274c"      # cross mark
STATUS_DONE = "\U0001f389"  # party popper
STATUS_WARN = "\u26a0"      # warning sign


def _make_stats_dict():
    """Return a fresh, empty per-country statistics record."""
    return {'total': 0, 'with_wikidata': 0, 'institutions': []}


def _load_institutions(unified_path):
    """Load the institution records from the YAML file at *unified_path*."""
    # PyYAML is only needed when reading from disk, so it is imported here;
    # callers that pass preloaded records do not need it installed.
    import yaml

    with Path(unified_path).open('r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; treat that as no records.
        return yaml.safe_load(f) or []


def _primary_country(inst):
    """Return the `country` of the record's first location, or None."""
    locations = inst.get('locations') or []
    if not locations:
        return None
    first = locations[0]
    return first.get('country') if isinstance(first, dict) else None


def _wikidata_id(inst):
    """Return the first non-empty Wikidata identifier value, or None."""
    # `or []` covers an `identifiers` key that is present but set to None.
    for ident in inst.get('identifiers') or []:
        if ident.get('identifier_scheme') != WIKIDATA_SCHEME:
            continue
        value = ident.get('identifier_value')
        if value:
            return value
    return None


def _collect_stats(institutions, countries):
    """Tally Wikidata coverage per country for records located in *countries*."""
    stats = defaultdict(_make_stats_dict)

    for inst in institutions:
        country = _primary_country(inst)
        if country not in countries:
            continue

        wikidata_id = _wikidata_id(inst)
        country_stats = stats[country]
        country_stats['total'] += 1
        # An identifier entry with an empty value does not count as coverage;
        # this keeps `with_wikidata` consistent with the "missing" listing.
        if wikidata_id is not None:
            country_stats['with_wikidata'] += 1
        country_stats['institutions'].append({
            'name': inst.get('name'),
            'id': inst.get('id'),
            'wikidata': wikidata_id,
        })

    return stats


def _print_report(stats, countries):
    """Print the per-country breakdown followed by the overall summary."""
    print("=" * 70)
    print("PHASE 1 WIKIDATA ENRICHMENT VERIFICATION")
    print("=" * 70)
    print()

    total_institutions = 0
    total_with_wikidata = 0

    for country in sorted(countries):
        # Indexing the defaultdict also creates an empty record for countries
        # that had no institutions, so every country appears in the result.
        country_stats = stats[country]
        total = country_stats['total']
        with_wikidata = country_stats['with_wikidata']
        percentage = (with_wikidata / total * 100) if total > 0 else 0

        total_institutions += total
        total_with_wikidata += with_wikidata

        status = STATUS_OK if percentage == 100 else STATUS_FAIL
        print(f"{status} {country}: {with_wikidata}/{total} with Wikidata ({percentage:.1f}%)")

        # Show institutions without Wikidata
        missing = [inst for inst in country_stats['institutions'] if inst['wikidata'] is None]
        if missing:
            print(" Missing Wikidata:")
            for inst in missing:
                print(f" - {inst['name']}")
                print(f" {inst['id']}")
        print()

    # Overall summary
    overall_percentage = (
        (total_with_wikidata / total_institutions * 100) if total_institutions > 0 else 0
    )
    overall_status = STATUS_OK if overall_percentage == 100 else STATUS_FAIL

    print("=" * 70)
    print(f"{overall_status} PHASE 1 TOTAL: {total_with_wikidata}/{total_institutions} with Wikidata ({overall_percentage:.1f}%)")
    print("=" * 70)

    if overall_percentage == 100:
        print(f"\n{STATUS_DONE} Phase 1 enrichment complete! All institutions have Wikidata identifiers.")
    else:
        print(f"\n{STATUS_WARN} Phase 1 incomplete: {total_institutions - total_with_wikidata} institutions need Wikidata enrichment.")


def verify_phase1_enrichment(unified_path=DEFAULT_UNIFIED_PATH, institutions=None):
    """Verify Wikidata coverage for Phase 1 countries.

    Prints a per-country coverage report and an overall summary to stdout.

    Args:
        unified_path: YAML file holding the unified dataset. Only read when
            *institutions* is None.
        institutions: Optional preloaded list of institution records
            (mappings with `locations`, `identifiers`, `name` and `id` keys).
            When given, no file is read.

    Returns:
        A defaultdict keyed by country code. Each value is a dict with
        `total`, `with_wikidata` and `institutions`, the last being a list of
        `{'name', 'id', 'wikidata'}` entries where `wikidata` is None for
        records without a non-empty Wikidata identifier. Every Phase 1
        country has an entry, even when it has no institutions.
    """
    if institutions is None:
        institutions = _load_institutions(unified_path)

    print(f"Loaded {len(institutions)} institutions from unified dataset\n")

    stats = _collect_stats(institutions, PHASE1_COUNTRIES)
    _print_report(stats, PHASE1_COUNTRIES)
    return stats


if __name__ == '__main__':
    verify_phase1_enrichment()