glam/scripts/analyze_aron_metadata_sample.py
2025-11-21 22:12:33 +01:00

157 lines
5.4 KiB
Python

#!/usr/bin/env python3
"""
Quick analysis: What metadata do ARON institutions actually have?
Tests 20 sample ARON institutions to see field availability before
full enrichment run.
"""
import yaml
import requests
from collections import Counter
from typing import Dict, List, Set
def _extract_aron_uuid(inst):
    """Return the institution's ARON UUID string, or None if absent.

    Scans inst['identifiers'] for an entry whose identifier_scheme is
    'ARON_UUID' and returns its identifier_value.
    """
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'ARON_UUID':
            return identifier['identifier_value']
    return None


def _fetch_detail(aron_uuid):
    """Fetch the APU detail record for *aron_uuid* from the ARON portal API.

    Raises requests.RequestException on HTTP/network failure and ValueError
    on a non-JSON body; callers are expected to catch broadly.
    """
    url = f"https://portal.nacr.cz/aron/api/aron/apu/{aron_uuid}"
    response = requests.get(url, timeout=5)
    response.raise_for_status()
    return response.json()


def _scan_contact_fields(detail, field_type_counter):
    """Tally field types of *detail* into *field_type_counter*.

    Walks detail['parts'][*]['items'] and returns (field_types, flags):
    field_types is the set of distinct item 'type' strings seen; flags maps
    'ADDRESS'/'URL'/'PHONE'/'EMAIL' to True when any type string contains
    that substring (substring match mirrors the ARON type naming).
    """
    field_types = set()
    flags = {'ADDRESS': False, 'URL': False, 'PHONE': False, 'EMAIL': False}
    for part in detail.get('parts', []):
        for item in part.get('items', []):
            field_type = item.get('type', '')
            field_types.add(field_type)
            field_type_counter[field_type] += 1
            for key in flags:
                if key in field_type:
                    flags[key] = True
    return field_types, flags


def analyze_aron_sample():
    """Analyze sample ARON institutions to understand metadata availability.

    Loads data/instances/czech_unified.yaml, takes the first 20 ARON-only
    institutions (not merged with ADR), fetches each one's detail record
    from the ARON portal API, and reports which contact field types
    (address/URL/phone/email) are available, ending with a go/no-go
    recommendation for a full enrichment run.

    Returns:
        list[dict]: one result dict per successfully analyzed institution
        with keys name, uuid, field_types, has_address, has_url,
        has_phone, has_email.
    """
    # Load unified dataset
    print("Loading czech_unified.yaml...")
    with open('data/instances/czech_unified.yaml', 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Filter ARON-only institutions (not merged with ADR)
    aron_institutions = [
        inst for inst in data
        if 'aron' in inst['provenance']['source_url']
        and 'adr.cz' not in inst['provenance']['source_url']
    ]
    print(f"Found {len(aron_institutions)} ARON-only institutions")
    print("Sampling first 20 for metadata analysis...\n")

    # Sample 20 institutions
    sample = aron_institutions[:20]

    # field type -> total occurrences across all fetched details
    field_type_counter = Counter()
    # contact kind ('ADDRESS'/'URL'/'PHONE'/'EMAIL') -> number of institutions
    contact_totals = Counter()
    # Display labels for contact kinds, in fixed output order.
    labels = (('ADDRESS', 'ADDR'), ('URL', 'URL'),
              ('PHONE', 'PHONE'), ('EMAIL', 'EMAIL'))
    results = []

    for idx, inst in enumerate(sample, 1):
        aron_uuid = _extract_aron_uuid(inst)
        if not aron_uuid:
            print(f" {idx}. {inst['name'][:40]:40} | No ARON UUID found")
            continue

        # Fetch detail from API; skip (but report) any failure.
        try:
            detail = _fetch_detail(aron_uuid)
        except Exception as e:
            print(f" {idx}. {inst['name'][:40]:40} | API error: {e}")
            continue

        field_types, flags = _scan_contact_fields(detail, field_type_counter)
        contact_totals.update(key for key, present in flags.items() if present)

        contact_flags = [label for key, label in labels if flags[key]]
        contact_str = ", ".join(contact_flags) if contact_flags else "NO CONTACT DATA"
        print(f" {idx}. {inst['name'][:40]:40} | {contact_str}")

        results.append({
            'name': inst['name'],
            'uuid': aron_uuid,
            'field_types': list(field_types),
            'has_address': flags['ADDRESS'],
            'has_url': flags['URL'],
            'has_phone': flags['PHONE'],
            'has_email': flags['EMAIL'],
        })

    # Summary statistics
    total = len(results)
    print("\n" + "=" * 80)
    print("SUMMARY STATISTICS")
    print("=" * 80)
    print(f"Sample size: {total} institutions")

    # Guard: with zero successful fetches the percentage and recommendation
    # math would raise ZeroDivisionError — bail out with a clear message.
    if total == 0:
        print("No institutions could be analyzed (missing UUIDs or API errors).")
        return results

    print(f"Institutions with addresses: {contact_totals['ADDRESS']} ({contact_totals['ADDRESS']/total*100:.1f}%)")
    print(f"Institutions with URLs: {contact_totals['URL']} ({contact_totals['URL']/total*100:.1f}%)")
    print(f"Institutions with phone: {contact_totals['PHONE']} ({contact_totals['PHONE']/total*100:.1f}%)")
    print(f"Institutions with email: {contact_totals['EMAIL']} ({contact_totals['EMAIL']/total*100:.1f}%)")
    print("\nMost common field types:")
    for field_type, count in field_type_counter.most_common(15):
        print(f" {field_type:40} | {count:3} occurrences")

    # Recommendation, keyed on address coverage (addresses drive geocoding).
    address_ratio = contact_totals['ADDRESS'] / total
    print("\n" + "=" * 80)
    print("RECOMMENDATION")
    print("=" * 80)
    if address_ratio > 0.5:
        print("✅ PROCEED with enrichment - Good metadata coverage (>50%)")
        print(" Estimated enrichment time: ~10 minutes (geocoding rate limit)")
    elif address_ratio > 0.2:
        print("⚠️ PARTIAL enrichment recommended - Some metadata available (20-50%)")
        print(" Consider skipping geocoding for now, just extract contact info")
    else:
        print("❌ SKIP enrichment - Insufficient metadata (<20% coverage)")
        print(" Alternative: Web scraping individual institution pages")
        print(" Or: Contact NK ČR for bulk metadata export")

    return results
# Script entry point: run the sample analysis when executed directly.
if __name__ == '__main__':
    analyze_aron_sample()