#!/usr/bin/env python3
"""
Final validation report for enrichment_history backfill project.

Checks ALL institution files for completeness.
"""

from collections import defaultdict
from pathlib import Path
from typing import Optional

import yaml

BASE_DIR = Path('/Users/kempersc/apps/glam/data/instances')

# Files to check: (path relative to BASE_DIR, label shown in the report)
FILES_TO_CHECK = [
    ('latin_american_institutions_AUTHORITATIVE.yaml', 'Latin America (AUTHORITATIVE)'),
    ('georgia_glam_institutions_enriched.yaml', 'Georgia'),
    ('tunisia/tunisian_institutions.yaml', 'Tunisia'),
    ('algeria/algerian_institutions.yaml', 'Algeria'),
    ('libya/libyan_institutions.yaml', 'Libya'),
    ('brazil/brazilian_institutions_batch6_enriched.yaml', 'Brazil (Batch 6)'),
    ('belgium/be_institutions_enriched_manual.yaml', 'Belgium'),
    ('great_britain/gb_institutions_enriched_manual.yaml', 'Great Britain'),
    ('united_states/us_institutions_enriched_manual.yaml', 'United States'),
]


def check_file(filepath: Path, label: str) -> Optional[dict]:
    """Check enrichment_history completeness for one institutions file.

    Args:
        filepath: YAML file holding either a list of institution records or
            a mapping with an ``institutions`` list.
        label: Human-readable name copied into the result.

    Returns:
        ``None`` when *filepath* does not exist. Otherwise a dict with keys
        ``label``, ``total``, ``with_wikidata``, ``with_enrichment``,
        ``gap`` and ``by_country``. ``gap`` is the number of institutions
        that have a Wikidata identifier but no ``enrichment_history``;
        ``by_country`` maps each country to its ``total`` / ``wikidata`` /
        ``enrichment`` / ``gap`` counters.
    """
    if not filepath.exists():
        return None

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Handle structure: either a bare list or a mapping wrapping the list.
    if isinstance(data, dict) and 'institutions' in data:
        institutions = data['institutions']
    else:
        institutions = data
    # safe_load() yields None for an empty document; treat it as no records.
    institutions = institutions or []

    total = len(institutions)
    with_wikidata = 0
    with_enrichment = 0
    gap = 0
    by_country = defaultdict(
        lambda: {'total': 0, 'wikidata': 0, 'enrichment': 0, 'gap': 0}
    )

    for inst in institutions:
        # Get country. `or` guards against explicit YAML nulls, which would
        # bypass a .get() default.
        locations = inst.get('locations') or []
        country = locations[0].get('country') if locations else None
        # Normalise to str so the report can always sort the country keys
        # (a null or a YAML-coerced non-string would break sorted()).
        country = 'UNKNOWN' if country is None else str(country)
        stats = by_country[country]
        stats['total'] += 1

        # Check Wikidata
        identifiers = inst.get('identifiers') or []
        has_wikidata = any(
            ident.get('identifier_scheme') == 'Wikidata' for ident in identifiers
        )

        # Check enrichment_history
        provenance = inst.get('provenance') or {}
        has_enrichment = bool(provenance.get('enrichment_history'))

        if has_wikidata:
            with_wikidata += 1
            stats['wikidata'] += 1
        if has_enrichment:
            with_enrichment += 1
            stats['enrichment'] += 1
        # Count the gap per institution. Subtracting the two totals would let
        # a record with enrichment_history but no Wikidata ID mask a record
        # that really is missing its history.
        if has_wikidata and not has_enrichment:
            gap += 1
            stats['gap'] += 1

    return {
        'label': label,
        'total': total,
        'with_wikidata': with_wikidata,
        'with_enrichment': with_enrichment,
        'gap': gap,
        'by_country': dict(by_country),
    }


def main(base_dir: Path = BASE_DIR):
    """Generate final validation report.

    Runs check_file() on every entry of FILES_TO_CHECK under *base_dir* and
    prints per-file results, an overall summary and a per-country breakdown
    to stdout. Missing files are reported and skipped.

    Args:
        base_dir: Directory the FILES_TO_CHECK paths are relative to.
    """
    print("=" * 80)
    print("ENRICHMENT HISTORY BACKFILL PROJECT - FINAL VALIDATION REPORT")
    print("=" * 80)
    print()

    results = []
    total_institutions = 0
    total_wikidata = 0
    total_enrichment = 0
    total_gap = 0

    for filename, label in FILES_TO_CHECK:
        filepath = base_dir / filename
        result = check_file(filepath, label)

        if result is None:
            print(f"⚠️ {label}: File not found")
            print()
            continue

        results.append(result)
        total_institutions += result['total']
        total_wikidata += result['with_wikidata']
        total_enrichment += result['with_enrichment']
        total_gap += result['gap']

        status = "✅" if result['gap'] == 0 else "❌"
        print(f"{status} {result['label']}")
        print(f" Total institutions: {result['total']}")
        print(f" With Wikidata IDs: {result['with_wikidata']}")
        print(f" With enrichment_history: {result['with_enrichment']}")
        print(f" Gap: {result['gap']}")

        if result['by_country']:
            print(" By country:")
            for country, stats in sorted(result['by_country'].items()):
                # Countries without any Wikidata IDs have nothing to verify.
                if stats['wikidata'] > 0:
                    gap = stats['gap']
                    status_icon = "✅" if gap == 0 else "❌"
                    print(f" {status_icon} {country}: {stats['wikidata']} Wikidata, "
                          f"{stats['enrichment']} enrichment, gap: {gap}")
        print()

    # Summary
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total institutions processed: {total_institutions}")
    print(f"Total with Wikidata IDs: {total_wikidata}")
    print(f"Total with enrichment_history: {total_enrichment}")
    print(f"Total gap: {total_gap}")
    print()

    if total_gap == 0:
        print("🎉 " + "=" * 76)
        print(" 100% COMPLETENESS ACHIEVED - PROJECT SUCCESS!")
        print(" " + "=" * 76)
        print()
        print(" ✅ All institutions with Wikidata IDs have enrichment_history")
        print(" ✅ Provenance tracking complete across all datasets")
        # This line needs the f prefix, otherwise the placeholder is printed
        # literally instead of the count.
        print(f" ✅ Data quality metadata documented for {total_wikidata} institutions")
        print()
    else:
        print(f"❌ {total_gap} institutions still missing enrichment_history")
        print()
        for result in results:
            if result['gap'] > 0:
                print(f" - {result['label']}: {result['gap']} institutions")

    # Country breakdown
    print("=" * 80)
    print("BREAKDOWN BY COUNTRY")
    print("=" * 80)

    country_totals = defaultdict(lambda: {'wikidata': 0, 'enrichment': 0, 'gap': 0})
    for result in results:
        for country, stats in result['by_country'].items():
            totals = country_totals[country]
            totals['wikidata'] += stats['wikidata']
            totals['enrichment'] += stats['enrichment']
            totals['gap'] += stats['gap']

    for country in sorted(country_totals):
        stats = country_totals[country]
        gap = stats['gap']
        status = "✅" if gap == 0 else "❌"
        if stats['wikidata'] > 0:
            print(f"{status} {country}: {stats['wikidata']} Wikidata, "
                  f"{stats['enrichment']} enrichment, gap: {gap}")


if __name__ == '__main__':
    main()