glam/validate_curated.py
2025-11-19 23:25:22 +01:00

94 lines
3 KiB
Python

#!/usr/bin/env python3
"""Validate the curated Brazilian institutions YAML file."""
import yaml
from collections import Counter
# Read the curated file.
# The encoding is pinned to UTF-8 so that non-ASCII text (e.g. accented names)
# decodes the same way on every platform instead of following the locale.
with open('data/instances/brazilian_institutions_curated.yaml', 'r',
          encoding='utf-8') as f:
    institutions = yaml.safe_load(f)

# An empty document parses to None and a top-level mapping parses to a dict.
# The rest of the script iterates over a list of institution records, so stop
# here with a clear message rather than failing later with an unrelated error.
if not isinstance(institutions, list):
    raise SystemExit(
        "Expected a top-level YAML list of institutions, "
        f"got {type(institutions).__name__}"
    )

print(f"✓ Valid YAML with {len(institutions)} institutions\n")
# Tally how many institutions fall under each institution type, then print the
# tallies in sorted type order followed by the grand total.
type_counts = Counter()
for record in institutions:
    type_counts[record['institution_type']] += 1

print("Institutions by Type:")
for type_name in sorted(type_counts):
    print(f" {type_name}: {type_counts[type_name]}")
print(f"\nTotal: {sum(type_counts.values())}")
# Count features.
# `features` records, per feature, how many institutions have at least one
# entry; the total_* counters accumulate the number of entries across all
# institutions; `confidence_scores` collects every provenance score present.
features = {
    'alternative_names': 0,
    'wikidata_ids': 0,
    'digital_platforms': 0,
    'collections': 0,
    'change_history': 0,
}
total_platforms = 0
total_collections = 0
total_events = 0
confidence_scores = []

for inst in institutions:
    if inst.get('alternative_names'):
        features['alternative_names'] += 1

    # An institution is counted once no matter how many Wikidata identifiers
    # it lists. `.get` keeps an identifier entry that lacks a scheme from
    # aborting the whole run with a KeyError.
    identifiers = inst.get('identifiers') or []
    if any(id_obj.get('identifier_scheme') == 'Wikidata'
           for id_obj in identifiers):
        features['wikidata_ids'] += 1

    platforms = inst.get('digital_platforms') or []
    if platforms:
        features['digital_platforms'] += 1
        total_platforms += len(platforms)

    collections = inst.get('collections') or []
    if collections:
        features['collections'] += 1
        total_collections += len(collections)

    events = inst.get('change_history') or []
    if events:
        features['change_history'] += 1
        total_events += len(events)

    # `provenance` may be missing or an explicit null, hence `or {}`.
    # Compare against None rather than relying on truthiness so that a
    # legitimate score of 0 still enters the min/max/average statistics.
    confidence = (inst.get('provenance') or {}).get('confidence_score')
    if confidence is not None:
        confidence_scores.append(confidence)
# Report, for each feature, how many institutions carry it and what share of
# the dataset that represents, followed by the aggregate entry counts.
institution_count = len(institutions)

print("\nFeature Coverage:")
for feature, count in features.items():
    # An empty dataset would otherwise raise ZeroDivisionError; report 0%.
    pct = (count / institution_count) * 100 if institution_count else 0.0
    print(f" {feature}: {count}/{institution_count} ({pct:.0f}%)")

print("\nAggregate Counts:")
print(f" Total digital platforms: {total_platforms}")
print(f" Total collections: {total_collections}")
print(f" Total change events: {total_events}")
# Summarise the collected confidence scores; skipped entirely when no
# institution supplied one (min/max of an empty list would raise).
if confidence_scores:
    print("\nConfidence Scores:")
    lowest = min(confidence_scores)
    print(f" Min: {lowest:.2f}")
    highest = max(confidence_scores)
    print(f" Max: {highest:.2f}")
    mean_score = sum(confidence_scores) / len(confidence_scores)
    print(f" Average: {mean_score:.2f}")
# List all institutions: one numbered entry per record showing its name and
# type, plus the Wikidata ID, platform count, collection count and confidence
# score whenever the record provides them.
print("\n" + "="*60)
print("All Curated Institutions:")
print("="*60)

for i, inst in enumerate(institutions, 1):
    print(f"\n{i}. {inst['name']}")
    print(f" Type: {inst['institution_type']}")

    # Show the first Wikidata identifier, if any. `.get` tolerates identifier
    # entries that carry no scheme instead of raising KeyError.
    identifiers = inst.get('identifiers') or []
    wikidata = next(
        (id_obj for id_obj in identifiers
         if id_obj.get('identifier_scheme') == 'Wikidata'),
        None,
    )
    if wikidata:
        print(f" Wikidata: {wikidata['identifier_value']}")

    platforms = inst.get('digital_platforms')
    if platforms:
        print(f" Platforms: {len(platforms)}")

    collections = inst.get('collections')
    if collections:
        print(f" Collections: {len(collections)}")

    # `provenance` may be missing or an explicit null, hence `or {}`.
    # Test against None so a confidence score of 0 is still displayed.
    confidence = (inst.get('provenance') or {}).get('confidence_score')
    if confidence is not None:
        print(f" Confidence: {confidence:.2f}")