#!/usr/bin/env python3
"""Find Brazilian institution IDs from main dataset.

Loads the institution dataset (YAML), keeps the records that have at least
one location whose ``country`` is ``'BR'``, and prints the ID, name and a
shortened description of every record whose name or description matches one
of the target patterns.
"""

import re
from pathlib import Path

import yaml

# Main dataset that is searched.
DATASET_PATH = Path("data/instances/all/globalglam-20251111.yaml")

# Target names to search for.  Every entry is applied as a case-insensitive
# regular expression: plain names behave like a substring search, while
# entries such as "Secretaria.*Cultura.*Tocantins" use regex wildcards.
TARGETS = [
    "Universidade Federal de Rondônia",
    "UNIR",
    "Secretaria de Estado da Cultura do Tocantins",
    "Secretaria.*Cultura.*Tocantins",
    "Museu do Estado de Pernambuco",
    "Museu Histórico Nacional",
    "Fundação Cultural Palmares",
    "Museu Imperial",
]

SEPARATOR = "=" * 80
MAX_DESCRIPTION_LENGTH = 100


def _as_text(value):
    """Return *value* as a string, mapping ``None`` to the empty string.

    YAML keys that are present but null load as ``None`` and unquoted
    scalars may load as numbers; both would break string matching.
    """
    return "" if value is None else str(value)


def load_institutions(path):
    """Load all YAML documents from *path* and return them as one list.

    If the file holds a single document that is itself a list, that inner
    list is returned so callers always iterate over individual records.
    """
    with open(path, 'r', encoding='utf-8') as f:
        institutions = list(yaml.safe_load_all(f))

    # Flatten if nested
    if len(institutions) == 1 and isinstance(institutions[0], list):
        institutions = institutions[0]
    return institutions


def is_brazilian(inst):
    """Return True if any dict entry in ``inst['locations']`` has country 'BR'."""
    # `or []` covers a `locations:` key that is present but null.
    locations = inst.get('locations') or []
    return any(
        loc.get('country') == 'BR'
        for loc in locations
        if isinstance(loc, dict)
    )


def first_matching_target(name, description, patterns):
    """Return the first target pattern string found in *name* or *description*.

    *patterns* is an iterable of compiled regular expressions.  Returns
    ``None`` when nothing matches.
    """
    for pattern in patterns:
        if pattern.search(name) or (description and pattern.search(description)):
            return pattern.pattern
    return None


def main():
    """Load the dataset and print every Brazilian institution matching a target."""
    # Load main dataset
    print("Loading main dataset...")
    institutions = load_institutions(DATASET_PATH)
    print(f"Loaded {len(institutions)} institutions")

    # Compile once, outside the per-institution loop.
    patterns = [re.compile(target, re.IGNORECASE) for target in TARGETS]

    # Search for Brazilian institutions matching targets
    print("\n" + SEPARATOR)
    print("SEARCHING FOR TARGET INSTITUTIONS")
    print(SEPARATOR)

    for inst in institutions:
        # Skip empty documents and any non-mapping entries.
        if not isinstance(inst, dict):
            continue
        if not is_brazilian(inst):
            continue

        name = _as_text(inst.get('name'))
        description = _as_text(inst.get('description'))
        inst_id = inst.get('id', 'NO_ID')

        # Only the first matching target is reported per institution.
        target = first_matching_target(name, description, patterns)
        if target is None:
            continue

        print(f"\n✓ MATCH: {target}")
        print(f" ID: {inst_id}")
        print(f" Name: {name}")
        if description:
            if len(description) > MAX_DESCRIPTION_LENGTH:
                desc_short = description[:MAX_DESCRIPTION_LENGTH] + "..."
            else:
                desc_short = description
            print(f" Description: {desc_short}")

    print("\n" + SEPARATOR)


if __name__ == "__main__":
    main()