#!/usr/bin/env python3
"""
Quick analysis: What metadata do ARON institutions actually have?

Tests 20 sample ARON institutions to see field availability before full
enrichment run.
"""

from collections import Counter
from typing import Dict, List, Optional, Set

import requests
import yaml


def _find_aron_uuid(inst: dict) -> Optional[str]:
    """Return the value of the first ``ARON_UUID`` identifier of *inst*, if any."""
    for identifier in inst.get('identifiers') or []:
        if identifier.get('identifier_scheme') == 'ARON_UUID':
            return identifier.get('identifier_value')
    return None


def _source_url(inst: dict) -> str:
    """Return ``provenance.source_url`` of *inst*, or ``''`` when it is absent."""
    return (inst.get('provenance') or {}).get('source_url') or ''


def analyze_aron_sample(sample_size: int = 20) -> List[Dict[str, object]]:
    """Analyze sample ARON institutions to understand metadata availability.

    Loads ``data/instances/czech_unified.yaml``, keeps the institutions whose
    provenance URL mentions ``aron`` but not ``adr.cz``, fetches the detail
    record of the first *sample_size* of them from the ARON API and prints a
    per-institution line, summary statistics and a recommendation.

    Args:
        sample_size: Number of ARON-only institutions to query (default 20).

    Returns:
        One dict per institution whose detail record was fetched, with the
        keys ``name``, ``uuid``, ``field_types``, ``has_address``,
        ``has_url``, ``has_phone`` and ``has_email``. Institutions without an
        ARON UUID or whose API request failed are not included.
    """
    # Load unified dataset
    print("Loading czech_unified.yaml...")
    with open('data/instances/czech_unified.yaml', 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or []  # an empty YAML file loads as None

    # Filter ARON-only institutions (not merged with ADR)
    aron_institutions = [
        inst for inst in data
        if 'aron' in _source_url(inst) and 'adr.cz' not in _source_url(inst)
    ]

    print(f"Found {len(aron_institutions)} ARON-only institutions")
    print(f"Sampling first {sample_size} for metadata analysis...\n")

    sample = aron_institutions[:sample_size]

    # Track field type frequency across the whole sample
    field_type_counter: Counter = Counter()
    institutions_with_address = 0
    institutions_with_url = 0
    institutions_with_phone = 0
    institutions_with_email = 0

    results: List[Dict[str, object]] = []

    for idx, inst in enumerate(sample, 1):
        label = f" {idx}. {inst['name'][:40]:40}"

        aron_uuid = _find_aron_uuid(inst)
        if not aron_uuid:
            print(f"{label} | No ARON UUID found")
            continue

        # Fetch detail from API; a failure skips this institution only.
        try:
            url = f"https://portal.nacr.cz/aron/api/aron/apu/{aron_uuid}"
            response = requests.get(url, timeout=5)
            response.raise_for_status()
            detail = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"{label} | API error: {e}")
            continue

        # Extract field types
        field_types: Set[str] = set()
        for part in detail.get('parts') or []:
            for item in part.get('items') or []:
                field_type = item.get('type') or ''
                field_types.add(field_type)
                field_type_counter[field_type] += 1

        # Contact fields are detected by substring match on the type name.
        has_address = any('ADDRESS' in ft for ft in field_types)
        has_url = any('URL' in ft for ft in field_types)
        has_phone = any('PHONE' in ft for ft in field_types)
        has_email = any('EMAIL' in ft for ft in field_types)

        # Update counters
        institutions_with_address += has_address
        institutions_with_url += has_url
        institutions_with_phone += has_phone
        institutions_with_email += has_email

        # Display result
        contact_flags = []
        if has_address:
            contact_flags.append("ADDR")
        if has_url:
            contact_flags.append("URL")
        if has_phone:
            contact_flags.append("PHONE")
        if has_email:
            contact_flags.append("EMAIL")

        contact_str = ", ".join(contact_flags) if contact_flags else "NO CONTACT DATA"
        print(f"{label} | {contact_str}")

        results.append({
            'name': inst['name'],
            'uuid': aron_uuid,
            'field_types': sorted(field_types),  # sorted for stable output
            'has_address': has_address,
            'has_url': has_url,
            'has_phone': has_phone,
            'has_email': has_email,
        })

    # Summary statistics
    total = len(results)
    print("\n" + "=" * 80)
    print("SUMMARY STATISTICS")
    print("=" * 80)
    print(f"Sample size: {total} institutions")

    if total == 0:
        # Every percentage below divides by the number of fetched records,
        # so there is nothing meaningful to report without any.
        print("No institution details could be retrieved; skipping statistics.")
        return results

    print(f"Institutions with addresses: {institutions_with_address} "
          f"({institutions_with_address / total * 100:.1f}%)")
    print(f"Institutions with URLs: {institutions_with_url} "
          f"({institutions_with_url / total * 100:.1f}%)")
    print(f"Institutions with phone: {institutions_with_phone} "
          f"({institutions_with_phone / total * 100:.1f}%)")
    print(f"Institutions with email: {institutions_with_email} "
          f"({institutions_with_email / total * 100:.1f}%)")

    print("\nMost common field types:")
    for field_type, count in field_type_counter.most_common(15):
        print(f" {field_type:40} | {count:3} occurrences")

    # Recommendation, driven by address coverage only
    address_ratio = institutions_with_address / total
    print("\n" + "=" * 80)
    print("RECOMMENDATION")
    print("=" * 80)

    if address_ratio > 0.5:
        print("✅ PROCEED with enrichment - Good metadata coverage (>50%)")
        print(" Estimated enrichment time: ~10 minutes (geocoding rate limit)")
    elif address_ratio > 0.2:
        print("⚠️ PARTIAL enrichment recommended - Some metadata available (20-50%)")
        print(" Consider skipping geocoding for now, just extract contact info")
    else:
        print("❌ SKIP enrichment - Insufficient metadata (<20% coverage)")
        print(" Alternative: Web scraping individual institution pages")
        print(" Or: Contact NK ČR for bulk metadata export")

    return results


if __name__ == '__main__':
    analyze_aron_sample()