# glam/generate_comparison_report.py

#!/usr/bin/env python3
"""
Generate comparison report between Brazilian and Mexican GLAM datasets.
"""

import json
import yaml
from pathlib import Path
from collections import defaultdict

def count_yaml_institutions(yaml_path: Path) -> int:
    """Count institutions in YAML file by counting '- id:' lines.

    The file is scanned as plain text, not parsed as YAML: every line that
    begins at column 0 with ``- id:`` is counted as one record. Indented
    ``- id:`` entries (nested lists) are not counted.

    Args:
        yaml_path: Location of the YAML file to scan.

    Returns:
        The number of lines that start with ``- id:``.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        # Checking each line's prefix (instead of searching for '\n- id:')
        # also counts a record that sits on the very first line of the file,
        # where there is no preceding newline.
        return sum(1 for line in f if line.startswith('- id:'))

def analyze_json_stats(json_path: Path) -> dict:
    """Return the 'statistics' entry of a JSON file.

    Args:
        json_path: Location of the UTF-8 encoded JSON file.

    Returns:
        The value stored under the top-level ``'statistics'`` key, or an
        empty dict when that key is absent.
    """
    with open(json_path, encoding='utf-8') as handle:
        payload = json.loads(handle.read())
    stats = payload.get('statistics', {})
    return stats

def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is not positive."""
    return part / whole * 100 if whole > 0 else 0.0


def main():
    """Print the Brazil vs. Mexico GLAM comparison report to stdout.

    Mexican statistics are read from a JSON file via ``analyze_json_stats``;
    Brazilian statistics are a hard-coded snapshot. Record counts of both
    YAML instance files are obtained with ``count_yaml_institutions`` and
    shown as their own row. All input paths are hard-coded below.

    Errors raised while opening or parsing the input files are propagated
    to the caller unchanged.
    """
    rule = "-" * 80

    print("="*80)
    print("MEXICAN vs BRAZILIAN GLAM INSTITUTIONS - COMPARISON REPORT")
    print("="*80)
    print()

    # Load Mexican stats
    mx_json_path = Path('/tmp/mexican_institutions_final.json')
    mx_stats = analyze_json_stats(mx_json_path)
    mx_yaml_path = Path('/Users/kempersc/apps/glam/data/instances/mexican_institutions.yaml')
    mx_count = count_yaml_institutions(mx_yaml_path)

    # Load Brazilian stats (from previous session summary)
    br_yaml_path = Path('/Users/kempersc/apps/glam/data/instances/brazilian_institutions_final.yaml')
    br_count = count_yaml_institutions(br_yaml_path)

    # Brazilian stats from previous session
    br_stats = {
        'total': 115,
        'with_urls': 19,
        'by_type': {
            'MUSEUM': 52,
            'ARCHIVE': 34,
            'LIBRARY': 18,
            'OFFICIAL_INSTITUTION': 7,
            'EDUCATION_PROVIDER': 3,
            'MIXED': 1,
        },
    }

    # Read each Mexican field once, with a default, so a JSON file without
    # statistics produces a report of zeros instead of a KeyError.
    br_total = br_stats['total']
    br_with_urls = br_stats['with_urls']
    br_by_type = br_stats['by_type']
    mx_total = mx_stats.get('total', 0)
    mx_with_urls = mx_stats.get('with_urls', 0)
    mx_by_type = mx_stats.get('by_type', {})
    mx_by_state = mx_stats.get('by_state', {})

    # Every percentage goes through _percent so a zero total can never
    # raise ZeroDivisionError.
    br_url_pct = _percent(br_with_urls, br_total)
    mx_url_pct = _percent(mx_with_urls, mx_total)

    print(f"{'METRIC':<40} {'BRAZIL':>15} {'MEXICO':>15}")
    print(rule)

    # Overall counts
    print(f"{'Total Institutions':<40} {br_total:>15} {mx_total:>15}")
    # Counts taken from the YAML instance files, shown next to the summary
    # totals so the two sources can be compared.
    print(f"{'Records in YAML file':<40} {br_count:>15} {mx_count:>15}")
    print(f"{'Institutions with URLs':<40} {br_with_urls:>15} {mx_with_urls:>15}")
    print(f"{'URL Coverage %':<40} {br_url_pct:>14.1f}% {mx_url_pct:>14.1f}%")
    print(f"{'Institutions with Emails':<40} {'N/A':>15} {mx_stats.get('with_emails', 0):>15}")
    print()

    # By type
    print("DISTRIBUTION BY INSTITUTION TYPE")
    print(rule)

    all_types = set(br_by_type) | set(mx_by_type)

    for itype in sorted(all_types):
        br_count_type = br_by_type.get(itype, 0)
        mx_count_type = mx_by_type.get(itype, 0)
        br_pct = _percent(br_count_type, br_total)
        mx_pct = _percent(mx_count_type, mx_total)

        print(f"{itype:<40} {br_count_type:>10} ({br_pct:>4.1f}%) {mx_count_type:>10} ({mx_pct:>4.1f}%)")

    print()
    print("TOP MEXICAN STATES BY INSTITUTION COUNT")
    print(rule)

    # Largest counts first; only the top 15 states are listed.
    for state, count in sorted(mx_by_state.items(), key=lambda x: -x[1])[:15]:
        print(f"  {state:<40} {count:>5}")

    print()
    print("DATA QUALITY METRICS")
    print(rule)

    # Identifier coverage is the same URL coverage ratio shown above.
    mx_states_label = f"{len(mx_by_state)} states"

    print(f"{'Identifier Coverage (URLs)':<40} {br_url_pct:>14.1f}% {mx_url_pct:>14.1f}%")
    # Right-align the Mexican cell like every other row of the table.
    print(f"{'Geographic Coverage (States)':<40} {'27 states':>15} {mx_states_label:>15}")
    print(f"{'Data Tier':<40} {'TIER_4':>15} {'TIER_4':>15}")
    print(f"{'Extraction Method':<40} {'Agent NER':>15} {'Multi-file':>15}")
    print()

    print("COMBINED STATISTICS")
    print(rule)
    total_combined = br_total + mx_total
    print(f"Total institutions (BR + MX): {total_combined}")
    print("Countries covered: 2 (Brazil, Mexico)")
    print("Conversations processed: 4 (2 Brazilian + 2 Mexican)")
    print()

    # Next steps
    print("NEXT STEPS FOR GLOBAL GLAM PROJECT")
    print(rule)
    # Use the same total as the table above rather than a second literal.
    print(f"1. ✅ COMPLETE: Brazilian institutions ({br_total} records)")
    print(f"2. ✅ COMPLETE: Mexican institutions ({mx_total} records)")
    print("3. ⏳ NEXT: Canadian GLAM (locate conversation files)")
    print("4. ⏳ Chilean GLAM institutions")
    print("5. ⏳ Vietnamese GLAM institutions")
    print("6. ⏳ Japanese GLAM institutions")
    print("7. ⏳ Schema validation for all records")
    print("8. ⏳ Export to RDF/JSON-LD formats")
    print("9. ⏳ Build visualization dashboard")
    print("10. ⏳ Target: 500+ institutions from 10+ countries")
    print()

if __name__ == '__main__':
    main()