#!/usr/bin/env python3
"""
Quick analysis: What metadata do ARON institutions actually have?

Tests 20 sample ARON institutions to see field availability before
full enrichment run.
"""
import yaml
import requests
from collections import Counter
from typing import Dict, List, Set
def analyze_aron_sample():
|
|
"""Analyze sample ARON institutions to understand metadata availability."""
|
|
|
|
# Load unified dataset
|
|
print("Loading czech_unified.yaml...")
|
|
with open('data/instances/czech_unified.yaml', 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
# Filter ARON-only institutions (not merged with ADR)
|
|
aron_institutions = [
|
|
inst for inst in data
|
|
if 'aron' in inst['provenance']['source_url']
|
|
and 'adr.cz' not in inst['provenance']['source_url']
|
|
]
|
|
|
|
print(f"Found {len(aron_institutions)} ARON-only institutions")
|
|
print(f"Sampling first 20 for metadata analysis...\n")
|
|
|
|
# Sample 20 institutions
|
|
sample = aron_institutions[:20]
|
|
|
|
# Track field type frequency
|
|
field_type_counter = Counter()
|
|
institutions_with_address = 0
|
|
institutions_with_url = 0
|
|
institutions_with_phone = 0
|
|
institutions_with_email = 0
|
|
|
|
results = []
|
|
|
|
for idx, inst in enumerate(sample, 1):
|
|
# Get ARON UUID
|
|
aron_uuid = None
|
|
for identifier in inst.get('identifiers', []):
|
|
if identifier.get('identifier_scheme') == 'ARON_UUID':
|
|
aron_uuid = identifier['identifier_value']
|
|
break
|
|
|
|
if not aron_uuid:
|
|
print(f" {idx}. {inst['name'][:40]:40} | No ARON UUID found")
|
|
continue
|
|
|
|
# Fetch detail from API
|
|
try:
|
|
url = f"https://portal.nacr.cz/aron/api/aron/apu/{aron_uuid}"
|
|
response = requests.get(url, timeout=5)
|
|
response.raise_for_status()
|
|
detail = response.json()
|
|
except Exception as e:
|
|
print(f" {idx}. {inst['name'][:40]:40} | API error: {e}")
|
|
continue
|
|
|
|
# Extract field types
|
|
field_types = set()
|
|
has_address = False
|
|
has_url = False
|
|
has_phone = False
|
|
has_email = False
|
|
|
|
for part in detail.get('parts', []):
|
|
for item in part.get('items', []):
|
|
field_type = item.get('type', '')
|
|
field_types.add(field_type)
|
|
field_type_counter[field_type] += 1
|
|
|
|
# Check for contact fields
|
|
if 'ADDRESS' in field_type:
|
|
has_address = True
|
|
if 'URL' in field_type:
|
|
has_url = True
|
|
if 'PHONE' in field_type:
|
|
has_phone = True
|
|
if 'EMAIL' in field_type:
|
|
has_email = True
|
|
|
|
# Update counters
|
|
if has_address:
|
|
institutions_with_address += 1
|
|
if has_url:
|
|
institutions_with_url += 1
|
|
if has_phone:
|
|
institutions_with_phone += 1
|
|
if has_email:
|
|
institutions_with_email += 1
|
|
|
|
# Display result
|
|
contact_flags = []
|
|
if has_address:
|
|
contact_flags.append("ADDR")
|
|
if has_url:
|
|
contact_flags.append("URL")
|
|
if has_phone:
|
|
contact_flags.append("PHONE")
|
|
if has_email:
|
|
contact_flags.append("EMAIL")
|
|
|
|
contact_str = ", ".join(contact_flags) if contact_flags else "NO CONTACT DATA"
|
|
|
|
print(f" {idx}. {inst['name'][:40]:40} | {contact_str}")
|
|
|
|
results.append({
|
|
'name': inst['name'],
|
|
'uuid': aron_uuid,
|
|
'field_types': list(field_types),
|
|
'has_address': has_address,
|
|
'has_url': has_url,
|
|
'has_phone': has_phone,
|
|
'has_email': has_email
|
|
})
|
|
|
|
# Summary statistics
|
|
print("\n" + "="*80)
|
|
print("SUMMARY STATISTICS")
|
|
print("="*80)
|
|
print(f"Sample size: {len(results)} institutions")
|
|
print(f"Institutions with addresses: {institutions_with_address} ({institutions_with_address/len(results)*100:.1f}%)")
|
|
print(f"Institutions with URLs: {institutions_with_url} ({institutions_with_url/len(results)*100:.1f}%)")
|
|
print(f"Institutions with phone: {institutions_with_phone} ({institutions_with_phone/len(results)*100:.1f}%)")
|
|
print(f"Institutions with email: {institutions_with_email} ({institutions_with_email/len(results)*100:.1f}%)")
|
|
|
|
print(f"\nMost common field types:")
|
|
for field_type, count in field_type_counter.most_common(15):
|
|
print(f" {field_type:40} | {count:3} occurrences")
|
|
|
|
# Recommendation
|
|
print("\n" + "="*80)
|
|
print("RECOMMENDATION")
|
|
print("="*80)
|
|
|
|
if institutions_with_address / len(results) > 0.5:
|
|
print("✅ PROCEED with enrichment - Good metadata coverage (>50%)")
|
|
print(" Estimated enrichment time: ~10 minutes (geocoding rate limit)")
|
|
elif institutions_with_address / len(results) > 0.2:
|
|
print("⚠️ PARTIAL enrichment recommended - Some metadata available (20-50%)")
|
|
print(" Consider skipping geocoding for now, just extract contact info")
|
|
else:
|
|
print("❌ SKIP enrichment - Insufficient metadata (<20% coverage)")
|
|
print(" Alternative: Web scraping individual institution pages")
|
|
print(" Or: Contact NK ČR for bulk metadata export")
|
|
|
|
return results
if __name__ == '__main__':
    # Script entry point: run the sample analysis directly.
    analyze_aron_sample()
|