123 lines
4.6 KiB
Python
123 lines
4.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate comparison report between Brazilian and Mexican GLAM datasets.
|
|
"""
|
|
|
|
import json
|
|
import yaml
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
|
|
def count_yaml_institutions(yaml_path: Path) -> int:
    """Count institutions in a YAML file by counting top-level '- id:' lines.

    The file is not parsed as YAML; a record is any line that begins at
    column 0 with ``- id:``. Indented (nested) ``- id:`` items are ignored.

    Args:
        yaml_path: Path of the YAML file, read as UTF-8.

    Returns:
        Number of lines starting with ``- id:``.

    Raises:
        OSError: If the file cannot be opened.
    """
    marker = '- id:'
    with open(yaml_path, encoding='utf-8') as f:
        # Test each line's prefix rather than searching for '\n- id:', so a
        # record on the very first line (no preceding newline) is counted too.
        return sum(1 for line in f if line.startswith(marker))
|
|
|
|
def analyze_json_stats(json_path: Path) -> dict:
    """Return the ``statistics`` section of a JSON file.

    The file is read as UTF-8 and parsed as JSON. If the top-level object
    has no ``statistics`` key, an empty dict is returned.
    """
    with open(json_path, encoding='utf-8') as handle:
        report = json.loads(handle.read())
    return report.get('statistics', {})
|
|
|
|
def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is not positive."""
    return part / whole * 100 if whole > 0 else 0.0


def main():
    """Print a Brazil-vs-Mexico GLAM institution comparison report to stdout.

    Mexican figures come from the ``statistics`` section of the JSON file
    loaded below; Brazilian figures are the hard-coded values in ``br_stats``.
    Record counts are additionally taken from the two YAML instance files.

    Keys missing from the Mexican statistics are reported as 0 (or as an
    empty table) instead of raising ``KeyError``, and every percentage is
    guarded against a zero total.

    Raises:
        OSError: If the JSON file or either YAML file cannot be opened.
    """
    banner = "=" * 80
    rule = "-" * 80

    print(banner)
    print("MEXICAN vs BRAZILIAN GLAM INSTITUTIONS - COMPARISON REPORT")
    print(banner)
    print()

    # Load Mexican stats
    mx_json_path = Path('/tmp/mexican_institutions_final.json')
    mx_stats = analyze_json_stats(mx_json_path)
    mx_yaml_path = Path('/Users/kempersc/apps/glam/data/instances/mexican_institutions.yaml')
    mx_count = count_yaml_institutions(mx_yaml_path)

    # Load Brazilian record count from the YAML instance file
    br_yaml_path = Path('/Users/kempersc/apps/glam/data/instances/brazilian_institutions_final.yaml')
    br_count = count_yaml_institutions(br_yaml_path)

    # Brazilian stats from previous session
    br_stats = {
        'total': 115,
        'with_urls': 19,
        'by_type': {
            'MUSEUM': 52,
            'ARCHIVE': 34,
            'LIBRARY': 18,
            'OFFICIAL_INSTITUTION': 7,
            'EDUCATION_PROVIDER': 3,
            'MIXED': 1,
        },
    }

    # Read every field once, with defaults, so an empty or partial
    # statistics dict produces zeros rather than a KeyError.
    br_total = br_stats['total']
    br_with_urls = br_stats['with_urls']
    br_by_type = br_stats['by_type']
    mx_total = mx_stats.get('total', 0)
    mx_with_urls = mx_stats.get('with_urls', 0)
    mx_by_type = mx_stats.get('by_type', {})
    mx_by_state = mx_stats.get('by_state', {})

    br_url_pct = _percentage(br_with_urls, br_total)
    mx_url_pct = _percentage(mx_with_urls, mx_total)

    print(f"{'METRIC':<40} {'BRAZIL':>15} {'MEXICO':>15}")
    print(rule)

    # Overall counts
    print(f"{'Total Institutions':<40} {br_total:>15} {mx_total:>15}")
    # Counts taken from the YAML files, shown next to the summary totals so
    # that a mismatch between the two sources is visible in the report.
    print(f"{'Institutions in YAML file':<40} {br_count:>15} {mx_count:>15}")
    print(f"{'Institutions with URLs':<40} {br_with_urls:>15} {mx_with_urls:>15}")
    print(f"{'URL Coverage %':<40} {br_url_pct:>14.1f}% {mx_url_pct:>14.1f}%")
    print(f"{'Institutions with Emails':<40} {'N/A':>15} {mx_stats.get('with_emails', 0):>15}")
    print()

    # By type
    print("DISTRIBUTION BY INSTITUTION TYPE")
    print(rule)

    all_types = set(br_by_type) | set(mx_by_type)

    for itype in sorted(all_types):
        br_count_type = br_by_type.get(itype, 0)
        mx_count_type = mx_by_type.get(itype, 0)
        br_pct = _percentage(br_count_type, br_total)
        mx_pct = _percentage(mx_count_type, mx_total)

        print(f"{itype:<40} {br_count_type:>10} ({br_pct:>4.1f}%) {mx_count_type:>10} ({mx_pct:>4.1f}%)")

    print()
    print("TOP MEXICAN STATES BY INSTITUTION COUNT")
    print(rule)

    # Largest counts first; only the top 15 states are listed.
    for state, count in sorted(mx_by_state.items(), key=lambda x: -x[1])[:15]:
        print(f"  {state:<40} {count:>5}")

    print()
    print("DATA QUALITY METRICS")
    print(rule)

    # Identifier coverage is the same ratio as URL coverage above.
    print(f"{'Identifier Coverage (URLs)':<40} {br_url_pct:>14.1f}% {mx_url_pct:>14.1f}%")
    # Build the label first so it can be right-aligned like every other column.
    mx_states_label = f"{len(mx_by_state)} states"
    print(f"{'Geographic Coverage (States)':<40} {'27 states':>15} {mx_states_label:>15}")
    print(f"{'Data Tier':<40} {'TIER_4':>15} {'TIER_4':>15}")
    print(f"{'Extraction Method':<40} {'Agent NER':>15} {'Multi-file':>15}")
    print()

    print("COMBINED STATISTICS")
    print(rule)
    total_combined = br_total + mx_total
    print(f"Total institutions (BR + MX): {total_combined}")
    print("Countries covered: 2 (Brazil, Mexico)")
    print("Conversations processed: 4 (2 Brazilian + 2 Mexican)")
    print()

    # Next steps
    print("NEXT STEPS FOR GLOBAL GLAM PROJECT")
    print(rule)
    print(f"1. ✅ COMPLETE: Brazilian institutions ({br_total} records)")
    print(f"2. ✅ COMPLETE: Mexican institutions ({mx_total} records)")
    print("3. ⏳ NEXT: Canadian GLAM (locate conversation files)")
    print("4. ⏳ Chilean GLAM institutions")
    print("5. ⏳ Vietnamese GLAM institutions")
    print("6. ⏳ Japanese GLAM institutions")
    print("7. ⏳ Schema validation for all records")
    print("8. ⏳ Export to RDF/JSON-LD formats")
    print("9. ⏳ Build visualization dashboard")
    print("10. ⏳ Target: 500+ institutions from 10+ countries")
    print()
|
|
|
|
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|