#!/usr/bin/env python3
"""Validate the curated Brazilian institutions YAML file.

Loads the curated file, then prints a summary (institutions per type,
feature coverage, aggregate counts, confidence-score statistics) followed
by a per-institution listing.
"""

from collections import Counter
from typing import Any, Dict, Iterator, List, Optional

Institution = Dict[str, Any]

DEFAULT_PATH = 'data/instances/brazilian_institutions_curated.yaml'

# Record keys whose value is a list that is both counted for coverage
# ("how many institutions have one") and summed ("how many entries overall").
_LIST_FEATURES = ('digital_platforms', 'collections', 'change_history')


def load_institutions(path: str = DEFAULT_PATH) -> List[Institution]:
    """Parse the YAML file at *path* and return its list of institutions.

    An empty document (``yaml.safe_load`` returns ``None``) is reported as
    an empty list instead of crashing later on ``len(None)``.

    Raises:
        ValueError: if the top-level YAML node is not a list.
    """
    # Imported here so the analysis helpers below stay importable (and
    # testable) on machines without PyYAML installed.
    import yaml

    # Explicit UTF-8: institution names contain non-ASCII characters and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if data is None:
        return []
    if not isinstance(data, list):
        raise ValueError(
            f"expected a top-level list of institutions in {path}, "
            f"got {type(data).__name__}"
        )
    return data


def _wikidata_identifier(inst: Institution) -> Optional[Dict[str, Any]]:
    """Return the first identifier whose scheme is 'Wikidata', else None."""
    for id_obj in inst.get('identifiers') or ():
        if id_obj['identifier_scheme'] == 'Wikidata':
            return id_obj
    return None


def _confidence_score(inst: Institution) -> Any:
    """Return ``provenance.confidence_score`` or None when it is absent.

    Tolerates a ``provenance`` key that is present but null.
    """
    provenance = inst.get('provenance') or {}
    return provenance.get('confidence_score')


def collect_stats(institutions: List[Institution]) -> Dict[str, Any]:
    """Compute feature coverage, aggregate totals and confidence scores.

    Returns a dict with three entries:
        ``features``: per-feature count of institutions that have it,
        ``totals``: summed list lengths for each key in ``_LIST_FEATURES``,
        ``confidence_scores``: every non-missing confidence score, in order.
    """
    features = {
        'alternative_names': 0,
        'wikidata_ids': 0,
        'digital_platforms': 0,
        'collections': 0,
        'change_history': 0,
    }
    totals = dict.fromkeys(_LIST_FEATURES, 0)
    confidence_scores = []

    for inst in institutions:
        if inst.get('alternative_names'):
            features['alternative_names'] += 1
        if _wikidata_identifier(inst) is not None:
            features['wikidata_ids'] += 1
        for key in _LIST_FEATURES:
            entries = inst.get(key)
            if entries:
                features[key] += 1
                totals[key] += len(entries)
        score = _confidence_score(inst)
        # `is not None` rather than truthiness: a score of 0.0 is a real
        # value and must be included in the statistics.
        if score is not None:
            confidence_scores.append(score)

    return {
        'features': features,
        'totals': totals,
        'confidence_scores': confidence_scores,
    }


def summary_lines(institutions: List[Institution]) -> Iterator[str]:
    """Yield the summary section, one string per ``print`` call."""
    total = len(institutions)
    yield f"✓ Valid YAML with {total} institutions\n"

    types = Counter(inst['institution_type'] for inst in institutions)
    yield "Institutions by Type:"
    for inst_type, count in sorted(types.items()):
        yield f" {inst_type}: {count}"
    yield f"\nTotal: {sum(types.values())}"

    stats = collect_stats(institutions)

    yield "\nFeature Coverage:"
    for feature, count in stats['features'].items():
        # Guard against an empty file: report 0% instead of dividing by zero.
        pct = (count / total) * 100 if total else 0.0
        yield f" {feature}: {count}/{total} ({pct:.0f}%)"

    totals = stats['totals']
    yield "\nAggregate Counts:"
    yield f" Total digital platforms: {totals['digital_platforms']}"
    yield f" Total collections: {totals['collections']}"
    yield f" Total change events: {totals['change_history']}"

    scores = stats['confidence_scores']
    if scores:
        yield "\nConfidence Scores:"
        yield f" Min: {min(scores):.2f}"
        yield f" Max: {max(scores):.2f}"
        yield f" Average: {sum(scores)/len(scores):.2f}"


def listing_lines(institutions: List[Institution]) -> Iterator[str]:
    """Yield the per-institution listing, one string per ``print`` call."""
    yield "\n" + "=" * 60
    yield "All Curated Institutions:"
    yield "=" * 60

    for i, inst in enumerate(institutions, 1):
        yield f"\n{i}. {inst['name']}"
        yield f" Type: {inst['institution_type']}"

        wikidata = _wikidata_identifier(inst)
        if wikidata is not None:
            yield f" Wikidata: {wikidata['identifier_value']}"

        if inst.get('digital_platforms'):
            yield f" Platforms: {len(inst['digital_platforms'])}"
        if inst.get('collections'):
            yield f" Collections: {len(inst['collections'])}"

        confidence = _confidence_score(inst)
        if confidence is not None:
            yield f" Confidence: {confidence:.2f}"


def main(path: str = DEFAULT_PATH) -> None:
    """Load the curated file at *path* and print the full report."""
    institutions = load_institutions(path)
    for line in summary_lines(institutions):
        print(line)
    for line in listing_lines(institutions):
        print(line)


if __name__ == "__main__":
    main()