#!/usr/bin/env python3
"""
Generate comparison report between Brazilian and Mexican GLAM datasets.
"""

import json
from collections import defaultdict  # noqa: F401  (unused here; retained from the original file)
from pathlib import Path

try:
    # PyYAML is not used by anything in this script; the import is retained
    # from the original file but must not be a hard requirement for the report.
    import yaml  # noqa: F401
except ImportError:
    yaml = None

# Width of the horizontal rules that frame each report section.
REPORT_WIDTH = 80

# Maximum number of Mexican states listed in the "top states" section.
TOP_STATES_LIMIT = 15

# Default location of the Mexican statistics JSON read by main().
MX_JSON_PATH = Path('/tmp/mexican_institutions_final.json')

# Hard-coded Brazilian summary figures (per the original comment, carried
# over from a previous session rather than computed from a file).
BR_STATS = {
    'total': 115,
    'with_urls': 19,
    'by_type': {
        'MUSEUM': 52,
        'ARCHIVE': 34,
        'LIBRARY': 18,
        'OFFICIAL_INSTITUTION': 7,
        'EDUCATION_PROVIDER': 3,
        'MIXED': 1,
    },
}


def count_yaml_institutions(yaml_path: Path) -> int:
    """Count institutions in a YAML file by counting lines starting with '- id:'.

    The file is scanned as plain text and is not parsed as YAML. Only lines
    that begin at column 0 with ``- id:`` are counted, including a record on
    the very first line of the file.

    Args:
        yaml_path: Path of the UTF-8 text file to scan.

    Returns:
        Number of lines that start with ``- id:``.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        # Streaming line by line avoids loading the whole file and, unlike
        # counting '\n- id:', does not miss an entry on line 1.
        return sum(1 for line in f if line.startswith('- id:'))


def analyze_json_stats(json_path: Path) -> dict:
    """Load a JSON file and return its top-level 'statistics' mapping.

    Args:
        json_path: Path of the UTF-8 encoded JSON file.

    Returns:
        The value stored under ``'statistics'``, or an empty dict when the
        key is absent.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data.get('statistics', {})


def _percent(part, whole) -> float:
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is not positive."""
    return part / whole * 100 if whole > 0 else 0.0


def _print_heading(title: str) -> None:
    """Print a section title followed by a dashed rule."""
    print(title)
    print("-" * REPORT_WIDTH)


def main(mx_json_path: Path = MX_JSON_PATH) -> None:
    """Print the Brazil vs Mexico comparison report to stdout.

    Mexican figures are read from the 'statistics' section of
    ``mx_json_path``; Brazilian figures come from ``BR_STATS``. Missing
    Mexican keys are treated as zero / empty so the report still renders.

    Args:
        mx_json_path: JSON file holding the Mexican statistics. Defaults to
            ``MX_JSON_PATH``.
    """
    # Load before printing anything so a missing or invalid file does not
    # leave a half-printed report behind.
    mx_stats = analyze_json_stats(mx_json_path)

    br_total = BR_STATS['total']
    br_with_urls = BR_STATS['with_urls']
    br_by_type = BR_STATS['by_type']

    mx_total = mx_stats.get('total', 0)
    mx_with_urls = mx_stats.get('with_urls', 0)
    mx_by_type = mx_stats.get('by_type', {})
    mx_by_state = mx_stats.get('by_state', {})

    # The same ratio feeds both the "URL Coverage %" row and the
    # "Identifier Coverage (URLs)" row.
    br_url_pct = _percent(br_with_urls, br_total)
    mx_url_pct = _percent(mx_with_urls, mx_total)

    print("=" * REPORT_WIDTH)
    print("MEXICAN vs BRAZILIAN GLAM INSTITUTIONS - COMPARISON REPORT")
    print("=" * REPORT_WIDTH)
    print()

    # Overall counts
    print(f"{'METRIC':<40} {'BRAZIL':>15} {'MEXICO':>15}")
    print("-" * REPORT_WIDTH)
    print(f"{'Total Institutions':<40} {br_total:>15} {mx_total:>15}")
    print(f"{'Institutions with URLs':<40} {br_with_urls:>15} {mx_with_urls:>15}")
    print(f"{'URL Coverage %':<40} {br_url_pct:>14.1f}% {mx_url_pct:>14.1f}%")
    print(f"{'Institutions with Emails':<40} {'N/A':>15} {mx_stats.get('with_emails', 0):>15}")
    print()

    # By type: union of both countries' type names, alphabetical.
    _print_heading("DISTRIBUTION BY INSTITUTION TYPE")
    for itype in sorted(set(br_by_type) | set(mx_by_type)):
        br_n = br_by_type.get(itype, 0)
        mx_n = mx_by_type.get(itype, 0)
        br_pct = _percent(br_n, br_total)
        mx_pct = _percent(mx_n, mx_total)
        print(f"{itype:<40} {br_n:>10} ({br_pct:>4.1f}%) {mx_n:>10} ({mx_pct:>4.1f}%)")
    print()

    # Largest states first; ties keep the order they have in the JSON.
    _print_heading("TOP MEXICAN STATES BY INSTITUTION COUNT")
    ranked_states = sorted(mx_by_state.items(), key=lambda item: -item[1])
    for state, count in ranked_states[:TOP_STATES_LIMIT]:
        print(f" {state:<40} {count:>5}")
    print()

    _print_heading("DATA QUALITY METRICS")
    mx_states_cell = f"{len(mx_by_state)} states"
    print(f"{'Identifier Coverage (URLs)':<40} {br_url_pct:>14.1f}% {mx_url_pct:>14.1f}%")
    # Right-align the Mexican cell like every other row of the table.
    print(f"{'Geographic Coverage (States)':<40} {'27 states':>15} {mx_states_cell:>15}")
    print(f"{'Data Tier':<40} {'TIER_4':>15} {'TIER_4':>15}")
    print(f"{'Extraction Method':<40} {'Agent NER':>15} {'Multi-file':>15}")
    print()

    _print_heading("COMBINED STATISTICS")
    print(f"Total institutions (BR + MX): {br_total + mx_total}")
    print("Countries covered: 2 (Brazil, Mexico)")
    print("Conversations processed: 4 (2 Brazilian + 2 Mexican)")
    print()

    # Next steps
    _print_heading("NEXT STEPS FOR GLOBAL GLAM PROJECT")
    print(f"1. ✅ COMPLETE: Brazilian institutions ({br_total} records)")
    print(f"2. ✅ COMPLETE: Mexican institutions ({mx_total} records)")
    print("3. ⏳ NEXT: Canadian GLAM (locate conversation files)")
    print("4. ⏳ Chilean GLAM institutions")
    print("5. ⏳ Vietnamese GLAM institutions")
    print("6. ⏳ Japanese GLAM institutions")
    print("7. ⏳ Schema validation for all records")
    print("8. ⏳ Export to RDF/JSON-LD formats")
    print("9. ⏳ Build visualization dashboard")
    print("10. ⏳ Target: 500+ institutions from 10+ countries")
    print()


if __name__ == '__main__':
    main()