glam/generate_comparison_report.py
2025-11-19 23:25:22 +01:00

123 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""
Generate comparison report between Brazilian and Mexican GLAM datasets.
"""
import json
import yaml
from pathlib import Path
from collections import defaultdict
def count_yaml_institutions(yaml_path: Path) -> int:
    """Count institution records in a YAML file without parsing it.

    A record is recognised as any line that begins, at column 0, with the
    literal text ``- id:``. Indented (nested) ``- id:`` entries are not
    counted.

    Args:
        yaml_path: Location of the YAML file to scan.

    Returns:
        The number of lines that start with ``- id:``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        # Test each line's prefix instead of searching for a newline followed
        # by the marker: a record on the very first line of the file has no
        # preceding newline and would otherwise be missed. Iterating the file
        # also avoids holding the whole document in memory.
        return sum(1 for line in f if line.startswith('- id:'))
def analyze_json_stats(json_path: Path) -> dict:
    """Read a JSON document and return its ``statistics`` entry.

    Args:
        json_path: Location of the UTF-8 encoded JSON file.

    Returns:
        The value stored under the top-level ``statistics`` key, or an empty
        dict when the key is absent.
    """
    with open(json_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    document = json.loads(raw_text)
    statistics = document.get('statistics', {})
    return statistics
def _pct(part, whole):
    """Return *part* as a percentage of *whole*, or 0 when *whole* is not positive."""
    return part / whole * 100 if whole > 0 else 0


def main():
    """Print a side-by-side comparison of the Brazilian and Mexican datasets.

    Mexican figures are read from the ``statistics`` section of a JSON file;
    Brazilian figures are hard-coded from an earlier session. Record counts
    taken directly from both YAML instance files are shown as a cross-check.
    All input paths are hard-coded below.

    Raises:
        OSError: If any of the hard-coded input files cannot be opened.
        json.JSONDecodeError: If the Mexican JSON file is malformed.
    """
    rule = "-" * 80

    print("=" * 80)
    print("MEXICAN vs BRAZILIAN GLAM INSTITUTIONS - COMPARISON REPORT")
    print("=" * 80)
    print()

    # Load Mexican stats
    mx_json_path = Path('/tmp/mexican_institutions_final.json')
    mx_stats = analyze_json_stats(mx_json_path)
    mx_yaml_path = Path('/Users/kempersc/apps/glam/data/instances/mexican_institutions.yaml')
    mx_count = count_yaml_institutions(mx_yaml_path)

    # Load Brazilian record count from the YAML instance file
    br_yaml_path = Path('/Users/kempersc/apps/glam/data/instances/brazilian_institutions_final.yaml')
    br_count = count_yaml_institutions(br_yaml_path)

    # Brazilian stats from previous session
    br_stats = {
        'total': 115,
        'with_urls': 19,
        'by_type': {
            'MUSEUM': 52,
            'ARCHIVE': 34,
            'LIBRARY': 18,
            'OFFICIAL_INSTITUTION': 7,
            'EDUCATION_PROVIDER': 3,
            'MIXED': 1,
        },
    }

    # The statistics section may be empty or incomplete, so every Mexican
    # figure is read with a default instead of being indexed directly.
    br_total = br_stats['total']
    mx_total = mx_stats.get('total', 0)
    br_with_urls = br_stats['with_urls']
    mx_with_urls = mx_stats.get('with_urls', 0)
    mx_by_type = mx_stats.get('by_type', {})
    mx_by_state = mx_stats.get('by_state', {})

    # URL coverage is reported twice (overall and as a quality metric), so
    # compute it once; _pct guards against a zero total.
    br_url_pct = _pct(br_with_urls, br_total)
    mx_url_pct = _pct(mx_with_urls, mx_total)

    print(f"{'METRIC':<40} {'BRAZIL':>15} {'MEXICO':>15}")
    print(rule)

    # Overall counts
    print(f"{'Total Institutions':<40} {br_total:>15} {mx_total:>15}")
    print(f"{'Records in YAML Files':<40} {br_count:>15} {mx_count:>15}")
    print(f"{'Institutions with URLs':<40} {br_with_urls:>15} {mx_with_urls:>15}")
    print(f"{'URL Coverage %':<40} {br_url_pct:>14.1f}% {mx_url_pct:>14.1f}%")
    print(f"{'Institutions with Emails':<40} {'N/A':>15} {mx_stats.get('with_emails', 0):>15}")
    print()

    # By type
    print("DISTRIBUTION BY INSTITUTION TYPE")
    print(rule)
    all_types = set(br_stats['by_type']) | set(mx_by_type)
    for itype in sorted(all_types):
        br_count_type = br_stats['by_type'].get(itype, 0)
        mx_count_type = mx_by_type.get(itype, 0)
        br_pct = _pct(br_count_type, br_total)
        mx_pct = _pct(mx_count_type, mx_total)
        print(f"{itype:<40} {br_count_type:>10} ({br_pct:>4.1f}%) {mx_count_type:>10} ({mx_pct:>4.1f}%)")
    print()

    print("TOP MEXICAN STATES BY INSTITUTION COUNT")
    print(rule)
    # Largest counts first; only the top 15 states are listed.
    for state, count in sorted(mx_by_state.items(), key=lambda x: -x[1])[:15]:
        print(f"  {state:<40} {count:>5}")
    print()

    print("DATA QUALITY METRICS")
    print(rule)
    mx_states_label = f"{len(mx_by_state)} states"
    print(f"{'Identifier Coverage (URLs)':<40} {br_url_pct:>14.1f}% {mx_url_pct:>14.1f}%")
    # Right-align the Mexican value so it lines up with the other rows.
    print(f"{'Geographic Coverage (States)':<40} {'27 states':>15} {mx_states_label:>15}")
    print(f"{'Data Tier':<40} {'TIER_4':>15} {'TIER_4':>15}")
    print(f"{'Extraction Method':<40} {'Agent NER':>15} {'Multi-file':>15}")
    print()

    print("COMBINED STATISTICS")
    print(rule)
    total_combined = br_total + mx_total
    print(f"Total institutions (BR + MX): {total_combined}")
    print("Countries covered: 2 (Brazil, Mexico)")
    print("Conversations processed: 4 (2 Brazilian + 2 Mexican)")
    print()

    # Next steps
    print("NEXT STEPS FOR GLOBAL GLAM PROJECT")
    print(rule)
    print(f"1. ✅ COMPLETE: Brazilian institutions ({br_total} records)")
    print(f"2. ✅ COMPLETE: Mexican institutions ({mx_total} records)")
    print("3. ⏳ NEXT: Canadian GLAM (locate conversation files)")
    print("4. ⏳ Chilean GLAM institutions")
    print("5. ⏳ Vietnamese GLAM institutions")
    print("6. ⏳ Japanese GLAM institutions")
    print("7. ⏳ Schema validation for all records")
    print("8. ⏳ Export to RDF/JSON-LD formats")
    print("9. ⏳ Build visualization dashboard")
    print("10. ⏳ Target: 500+ institutions from 10+ countries")
    print()
# Generate the report only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()