#!/usr/bin/env python3
"""
Validate enrichment_history completeness for countries with Wikidata enrichments.

Targeted validation to avoid timeout - checks specific countries.
"""

import yaml
from pathlib import Path
from collections import defaultdict

# Default location of the per-country instance files; main() accepts an
# override so the script is not tied to a single machine.
DEFAULT_BASE_PATH = Path('/Users/kempersc/apps/glam/data/instances')

# Countries to check (known to have Wikidata enrichments), as
# (display name, path relative to the base directory) pairs.
TARGET_COUNTRIES = [
    ('chile', 'chile/chilean_institutions.yaml'),
    ('georgia', 'georgia/georgian_institutions.yaml'),
    ('japan', 'japan/japanese_institutions.yaml'),
    ('mexico', 'mexico/mexican_institutions.yaml'),
    ('norway', 'norway/norwegian_institutions.yaml'),
    ('tunisia', 'tunisia/tunisian_institutions.yaml'),
    ('algeria', 'algeria/algerian_institutions.yaml'),
    ('libya', 'libya/libyan_institutions.yaml'),
    ('brazil', 'brazil/brazilian_institutions_batch6_enriched.yaml'),
    ('belgium', 'belgium/be_institutions_enriched_manual.yaml'),
    ('great_britain', 'great_britain/gb_institutions_enriched_manual.yaml'),
    ('united_states', 'united_states/us_institutions_enriched_manual.yaml'),
]


def load_yaml(filepath):
    """Load a YAML file, returning an empty list on any failure.

    Args:
        filepath: Path of the YAML file to read (opened as UTF-8).

    Returns:
        The parsed document, or ``[]`` when the document is empty/falsy or
        when reading or parsing fails (the error is printed, not raised).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Deliberately broad: this is a best-effort report and one broken
        # file must not abort the validation of the remaining countries.
        print(f"Error loading {filepath}: {e}")
        return []
    return data if data else []


def check_enrichment_history(institutions, country_name):
    """Check enrichment_history completeness for institutions with Wikidata IDs.

    Args:
        institutions: Iterable of institution records (dicts). May be empty
            or ``None``.
        country_name: Unused by the computation; kept so existing callers
            keep working.

    Returns:
        A dict with the keys ``total`` (entries seen), ``with_wikidata``
        (entries carrying a Wikidata identifier), ``with_enrichment_history``
        (Wikidata entries with a non-empty ``provenance.enrichment_history``)
        and ``missing_enrichment_history`` (list of ``{'name', 'id'}`` dicts
        for Wikidata entries lacking it).
    """
    stats = {
        'total': 0,
        'with_wikidata': 0,
        'with_enrichment_history': 0,
        'missing_enrichment_history': [],
    }

    if not institutions:
        return stats

    for inst in institutions:
        stats['total'] += 1

        # A malformed (non-mapping) entry cannot carry identifiers; count it
        # in the total and move on instead of crashing on .get().
        if not isinstance(inst, dict):
            continue

        # dict.get only falls back to its default when the key is absent; a
        # YAML key with a null value yields None, so normalise explicitly.
        identifiers = inst.get('identifiers')
        if not isinstance(identifiers, (list, tuple)):
            identifiers = []

        has_wikidata = any(
            isinstance(id_obj, dict)
            and id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in identifiers
        )
        if not has_wikidata:
            continue

        stats['with_wikidata'] += 1

        # Same null-value concern as for identifiers above.
        provenance = inst.get('provenance')
        if not isinstance(provenance, dict):
            provenance = {}

        if provenance.get('enrichment_history'):
            stats['with_enrichment_history'] += 1
        else:
            stats['missing_enrichment_history'].append({
                'name': inst.get('name', 'UNKNOWN'),
                'id': inst.get('id', 'UNKNOWN'),
            })

    return stats


def main(base_path=DEFAULT_BASE_PATH):
    """Validate targeted countries and print a per-country and overall report.

    Args:
        base_path: Directory containing the per-country instance files.
            Defaults to ``DEFAULT_BASE_PATH``.

    Countries whose file is missing, unreadable, empty, or not a top-level
    list are reported as unchecked and prevent the "100% completeness"
    verdict, since nothing was actually verified for them.
    """
    base_path = Path(base_path)

    print("=" * 80)
    print("ENRICHMENT HISTORY VALIDATION - TARGETED COUNTRIES")
    print("=" * 80)
    print()

    total_stats = defaultdict(int)
    all_missing = []
    unchecked = []  # countries for which no institutions could be validated

    for country_name, filepath in TARGET_COUNTRIES:
        full_path = base_path / filepath

        if not full_path.exists():
            print(f"⚠️ {country_name.upper()}: File not found - {filepath}")
            print()
            unchecked.append(country_name)
            continue

        institutions = load_yaml(full_path)

        # load_yaml returns [] on failure; a non-list document (e.g. a
        # top-level mapping) is not a list of institutions either. Neither
        # case may be reported as a passing country.
        if not isinstance(institutions, list) or not institutions:
            print(f"⚠️ {country_name.upper()}: No institutions loaded - {filepath}")
            print()
            unchecked.append(country_name)
            continue

        stats = check_enrichment_history(institutions, country_name)

        # Aggregate totals
        total_stats['total'] += stats['total']
        total_stats['with_wikidata'] += stats['with_wikidata']
        total_stats['with_enrichment_history'] += stats['with_enrichment_history']

        # Report
        gap = stats['with_wikidata'] - stats['with_enrichment_history']
        status = "✅" if gap == 0 else "❌"

        print(f"{status} {country_name.upper()}")
        print(f" Total institutions: {stats['total']}")
        print(f" With Wikidata: {stats['with_wikidata']}")
        print(f" With enrichment_history: {stats['with_enrichment_history']}")
        print(f" Gap: {gap}")

        missing_here = stats['missing_enrichment_history']
        if missing_here:
            print(" Missing enrichment_history:")
            for missing in missing_here[:5]:  # Show first 5
                print(f" - {missing['name']}")
            if len(missing_here) > 5:
                print(f" ... and {len(missing_here) - 5} more")
            all_missing.extend(missing_here)

        print()

    # Summary
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)

    total_gap = total_stats['with_wikidata'] - total_stats['with_enrichment_history']

    print(f"Total institutions: {total_stats['total']}")
    print(f"With Wikidata IDs: {total_stats['with_wikidata']}")
    print(f"With enrichment_history: {total_stats['with_enrichment_history']}")
    print(f"Total gap: {total_gap}")
    print()

    if unchecked:
        print(f"⚠️ {len(unchecked)} countries could not be checked: {', '.join(unchecked)}")
        print()

    if total_gap == 0 and not unchecked:
        print("✅ 100% COMPLETENESS ACHIEVED!")
        print("All institutions with Wikidata IDs have enrichment_history.")
    elif total_gap == 0:
        # No gaps among the files that loaded, but some were never examined,
        # so completeness cannot be claimed.
        print("⚠️ No gaps found in the countries that were checked.")
    else:
        print(f"❌ {total_gap} institutions still missing enrichment_history")
        print()
        print("Institutions missing enrichment_history:")
        for missing in all_missing[:20]:  # Show first 20
            print(f" - {missing['name']} ({missing['id']})")
        if len(all_missing) > 20:
            print(f" ... and {len(all_missing) - 20} more")


if __name__ == '__main__':
    main()