glam/scripts/analyze_aron_metadata_sample.py
2025-11-21 22:12:33 +01:00

157 lines
5.4 KiB
Python

#!/usr/bin/env python3
"""
Quick analysis: What metadata do ARON institutions actually have?
Tests 20 sample ARON institutions to see field availability before
full enrichment run.
"""
import yaml
import requests
from collections import Counter
from typing import Dict, List, Set
def _extract_aron_uuid(inst):
    """Return the institution's ARON UUID string, or None if absent.

    Scans inst['identifiers'] for an entry whose identifier_scheme is
    'ARON_UUID' and returns its identifier_value.
    """
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'ARON_UUID':
            return identifier['identifier_value']
    return None


def _fetch_detail(aron_uuid):
    """Fetch the APU detail record for *aron_uuid* from the ARON portal API.

    Raises requests.RequestException on HTTP/network failure and ValueError
    on a non-JSON body; callers are expected to catch broadly.
    """
    url = f"https://portal.nacr.cz/aron/api/aron/apu/{aron_uuid}"
    response = requests.get(url, timeout=5)
    response.raise_for_status()
    return response.json()


def _scan_contact_fields(detail, field_type_counter):
    """Tally field types of *detail* into *field_type_counter*.

    Walks detail['parts'][*]['items'] and returns (field_types, flags):
    field_types is the set of distinct item 'type' strings seen; flags maps
    'ADDRESS'/'URL'/'PHONE'/'EMAIL' to True when any type string contains
    that substring (substring match mirrors the ARON type naming).
    """
    field_types = set()
    flags = {'ADDRESS': False, 'URL': False, 'PHONE': False, 'EMAIL': False}
    for part in detail.get('parts', []):
        for item in part.get('items', []):
            field_type = item.get('type', '')
            field_types.add(field_type)
            field_type_counter[field_type] += 1
            for key in flags:
                if key in field_type:
                    flags[key] = True
    return field_types, flags


def analyze_aron_sample():
    """Analyze sample ARON institutions to understand metadata availability.

    Loads data/instances/czech_unified.yaml, takes the first 20 ARON-only
    institutions (not merged with ADR), fetches each one's detail record
    from the ARON portal API, and reports which contact field types
    (address/URL/phone/email) are available, ending with a go/no-go
    recommendation for a full enrichment run.

    Returns:
        list[dict]: one result dict per successfully analyzed institution
        with keys name, uuid, field_types, has_address, has_url,
        has_phone, has_email.
    """
    # Load unified dataset
    print("Loading czech_unified.yaml...")
    with open('data/instances/czech_unified.yaml', 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Filter ARON-only institutions (not merged with ADR)
    aron_institutions = [
        inst for inst in data
        if 'aron' in inst['provenance']['source_url']
        and 'adr.cz' not in inst['provenance']['source_url']
    ]
    print(f"Found {len(aron_institutions)} ARON-only institutions")
    print("Sampling first 20 for metadata analysis...\n")

    # Sample 20 institutions
    sample = aron_institutions[:20]

    # field type -> total occurrences across all fetched details
    field_type_counter = Counter()
    # contact kind ('ADDRESS'/'URL'/'PHONE'/'EMAIL') -> number of institutions
    contact_totals = Counter()
    # Display labels for contact kinds, in fixed output order.
    labels = (('ADDRESS', 'ADDR'), ('URL', 'URL'),
              ('PHONE', 'PHONE'), ('EMAIL', 'EMAIL'))
    results = []

    for idx, inst in enumerate(sample, 1):
        aron_uuid = _extract_aron_uuid(inst)
        if not aron_uuid:
            print(f" {idx}. {inst['name'][:40]:40} | No ARON UUID found")
            continue

        # Fetch detail from API; skip (but report) any failure.
        try:
            detail = _fetch_detail(aron_uuid)
        except Exception as e:
            print(f" {idx}. {inst['name'][:40]:40} | API error: {e}")
            continue

        field_types, flags = _scan_contact_fields(detail, field_type_counter)
        contact_totals.update(key for key, present in flags.items() if present)

        contact_flags = [label for key, label in labels if flags[key]]
        contact_str = ", ".join(contact_flags) if contact_flags else "NO CONTACT DATA"
        print(f" {idx}. {inst['name'][:40]:40} | {contact_str}")

        results.append({
            'name': inst['name'],
            'uuid': aron_uuid,
            'field_types': list(field_types),
            'has_address': flags['ADDRESS'],
            'has_url': flags['URL'],
            'has_phone': flags['PHONE'],
            'has_email': flags['EMAIL'],
        })

    # Summary statistics
    total = len(results)
    print("\n" + "=" * 80)
    print("SUMMARY STATISTICS")
    print("=" * 80)
    print(f"Sample size: {total} institutions")

    # Guard: with zero successful fetches the percentage and recommendation
    # math would raise ZeroDivisionError — bail out with a clear message.
    if total == 0:
        print("No institutions could be analyzed (missing UUIDs or API errors).")
        return results

    print(f"Institutions with addresses: {contact_totals['ADDRESS']} ({contact_totals['ADDRESS']/total*100:.1f}%)")
    print(f"Institutions with URLs: {contact_totals['URL']} ({contact_totals['URL']/total*100:.1f}%)")
    print(f"Institutions with phone: {contact_totals['PHONE']} ({contact_totals['PHONE']/total*100:.1f}%)")
    print(f"Institutions with email: {contact_totals['EMAIL']} ({contact_totals['EMAIL']/total*100:.1f}%)")
    print("\nMost common field types:")
    for field_type, count in field_type_counter.most_common(15):
        print(f" {field_type:40} | {count:3} occurrences")

    # Recommendation, keyed on address coverage (addresses drive geocoding).
    address_ratio = contact_totals['ADDRESS'] / total
    print("\n" + "=" * 80)
    print("RECOMMENDATION")
    print("=" * 80)
    if address_ratio > 0.5:
        print("✅ PROCEED with enrichment - Good metadata coverage (>50%)")
        print(" Estimated enrichment time: ~10 minutes (geocoding rate limit)")
    elif address_ratio > 0.2:
        print("⚠️ PARTIAL enrichment recommended - Some metadata available (20-50%)")
        print(" Consider skipping geocoding for now, just extract contact info")
    else:
        print("❌ SKIP enrichment - Insufficient metadata (<20% coverage)")
        print(" Alternative: Web scraping individual institution pages")
        print(" Or: Contact NK ČR for bulk metadata export")

    return results
# Script entry point: run the sample analysis when executed directly.
if __name__ == '__main__':
    analyze_aron_sample()