94 lines
3 KiB
Python
94 lines
3 KiB
Python
#!/usr/bin/env python3
|
|
"""Validate the curated Brazilian institutions YAML file."""
|
|
|
|
import yaml
|
|
from collections import Counter
|
|
|
|
# Location of the curated file, relative to the current working directory.
CURATED_PATH = 'data/instances/brazilian_institutions_curated.yaml'


def load_institutions(path=CURATED_PATH):
    """Load the curated institutions list from the YAML file at *path*.

    Returns:
        The parsed list of institution mappings; an empty list when the
        file contains no YAML document.

    Raises:
        ValueError: if the top-level YAML node is not a list.
    """
    # Explicit UTF-8 so non-ASCII names decode the same on every platform.
    with open(path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty document; treat that as "no entries"
    # instead of failing later on len(None).
    if data is None:
        return []
    if not isinstance(data, list):
        raise ValueError(
            f"expected a top-level list of institutions, got {type(data).__name__}"
        )
    return data


def _wikidata_entry(inst):
    """Return the first identifier of *inst* whose scheme is 'Wikidata', or None."""
    # 'identifier_scheme' is indexed strictly: an identifier entry without it
    # surfaces as a KeyError rather than being silently skipped.
    return next(
        (id_obj for id_obj in inst.get('identifiers') or []
         if id_obj['identifier_scheme'] == 'Wikidata'),
        None,
    )


def _confidence_score(inst):
    """Return provenance.confidence_score of *inst*, or None when absent."""
    # `or {}` also covers an explicit `provenance: null` entry.
    provenance = inst.get('provenance') or {}
    return provenance.get('confidence_score')


def summarize(institutions):
    """Compute the statistics shown in the report.

    Args:
        institutions: list of institution mappings; each must have an
            'institution_type' key.

    Returns:
        A dict with:
            'types'             Counter of 'institution_type' values,
            'features'          feature name -> number of institutions having it,
            'totals'            list-valued key -> summed length over all institutions,
            'confidence_scores' every confidence score that is present (0.0 included).
    """
    features = {
        'alternative_names': 0,
        'wikidata_ids': 0,
        'digital_platforms': 0,
        'collections': 0,
        'change_history': 0,
    }
    # Keys whose entries are also summed for the "Aggregate Counts" section.
    totals = {'digital_platforms': 0, 'collections': 0, 'change_history': 0}
    confidence_scores = []

    types = Counter(inst['institution_type'] for inst in institutions)

    for inst in institutions:
        if inst.get('alternative_names'):
            features['alternative_names'] += 1

        # At most one Wikidata hit is counted per institution.
        if _wikidata_entry(inst) is not None:
            features['wikidata_ids'] += 1

        for key in totals:
            items = inst.get(key)
            if items:
                features[key] += 1
                totals[key] += len(items)

        score = _confidence_score(inst)
        # Compare against None, not truthiness: a score of 0.0 is valid data.
        if score is not None:
            confidence_scores.append(score)

    return {
        'types': types,
        'features': features,
        'totals': totals,
        'confidence_scores': confidence_scores,
    }


def report_lines(institutions):
    """Yield the report for *institutions*, one ``print()`` argument at a time.

    Lines are produced lazily and in output order, so the header is emitted
    before any per-institution key is accessed.
    """
    total = len(institutions)
    yield f"✓ Valid YAML with {total} institutions\n"

    stats = summarize(institutions)

    # Analyze by type
    yield "Institutions by Type:"
    for inst_type, count in sorted(stats['types'].items()):
        yield f" {inst_type}: {count}"
    yield f"\nTotal: {sum(stats['types'].values())}"

    # Feature coverage
    yield "\nFeature Coverage:"
    for feature, count in stats['features'].items():
        # Guard the percentage so an empty file reports 0% instead of
        # raising ZeroDivisionError.
        pct = (count / total) * 100 if total else 0.0
        yield f" {feature}: {count}/{total} ({pct:.0f}%)"

    totals = stats['totals']
    yield "\nAggregate Counts:"
    yield f" Total digital platforms: {totals['digital_platforms']}"
    yield f" Total collections: {totals['collections']}"
    yield f" Total change events: {totals['change_history']}"

    scores = stats['confidence_scores']
    if scores:
        yield "\nConfidence Scores:"
        yield f" Min: {min(scores):.2f}"
        yield f" Max: {max(scores):.2f}"
        yield f" Average: {sum(scores)/len(scores):.2f}"

    # List all institutions
    yield "\n" + "="*60
    yield "All Curated Institutions:"
    yield "="*60
    for i, inst in enumerate(institutions, 1):
        yield f"\n{i}. {inst['name']}"
        yield f" Type: {inst['institution_type']}"
        wikidata = _wikidata_entry(inst)
        if wikidata is not None:
            yield f" Wikidata: {wikidata['identifier_value']}"
        if inst.get('digital_platforms'):
            yield f" Platforms: {len(inst['digital_platforms'])}"
        if inst.get('collections'):
            yield f" Collections: {len(inst['collections'])}"
        confidence = _confidence_score(inst)
        if confidence is not None:
            yield f" Confidence: {confidence:.2f}"


def main(path=CURATED_PATH):
    """Load the curated file at *path* and print the validation report."""
    for line in report_lines(load_institutions(path)):
        print(line)


if __name__ == "__main__":
    main()