# glam/find_brazil_institutions.py

#!/usr/bin/env python3
"""Find Brazilian institution IDs from main dataset."""

import re
from pathlib import Path

import yaml

# Read every YAML document from the main dataset file into memory.
print("Loading main dataset...")
with Path("data/instances/all/globalglam-20251111.yaml").open('r', encoding='utf-8') as dataset_file:
    documents = list(yaml.safe_load_all(dataset_file))

# When the stream holds exactly one document and that document is itself a
# list, that list is used as the institutions; otherwise each YAML document
# is kept as one entry.
single_list_document = len(documents) == 1 and isinstance(documents[0], list)
institutions = documents[0] if single_list_document else documents

print(f"Loaded {len(institutions)} institutions")

# Search terms for the institutions we want to locate. The search loop below
# checks them in list order against each institution's name and description.
targets = [
    # University, by full name and by short form
    # (presumably "UNIR" is its acronym -- TODO confirm).
    "Universidade Federal de Rondônia",
    "UNIR",

    # State culture secretariat of Tocantins.
    # NOTE(review): the second entry contains regex metacharacters (".*");
    # confirm the search loop interprets targets as patterns, otherwise this
    # entry can only match text containing a literal ".*".
    "Secretaria de Estado da Cultura do Tocantins",
    "Secretaria.*Cultura.*Tocantins",

    # Museums and foundations
    "Museu do Estado de Pernambuco",
    "Museu Histórico Nacional",
    "Fundação Cultural Palmares",
    "Museu Imperial",
]

# Search for Brazilian institutions matching targets.
#
# Every target is compiled once, up front, as a case-insensitive regular
# expression. Pattern-style entries such as "Secretaria.*Cultura.*Tocantins"
# can then actually match (a plain substring test never would), while
# plain-text targets keep behaving as case-insensitive substring searches.
# re.compile raises re.error if a target is not a valid pattern.
#
# NOTE(review): short targets such as "UNIR" are not anchored to word
# boundaries, so they also match inside longer words (e.g. "reunir").
target_patterns = [(target, re.compile(target, re.IGNORECASE)) for target in targets]

print("\n" + "="*80)
print("SEARCHING FOR TARGET INSTITUTIONS")
print("="*80)

for inst in institutions:
    # Skip anything that is not a mapping (e.g. null or scalar documents).
    if not isinstance(inst, dict):
        continue

    # Keep only institutions with at least one location whose country is 'BR'.
    # `or []` covers records where the key is present but its value is null.
    locations = inst.get('locations') or []
    is_brazil = any(loc.get('country') == 'BR' for loc in locations if isinstance(loc, dict))
    if not is_brazil:
        continue

    # Null values become empty strings and non-string scalars are stringified,
    # so the pattern search below never receives a non-str argument.
    name = str(inst.get('name') or '')
    description = str(inst.get('description') or '')
    inst_id = inst.get('id', 'NO_ID')

    # Report only the first target (in list order) found in the name or the
    # description, so each institution is printed at most once.
    for target, pattern in target_patterns:
        if pattern.search(name) or (description and pattern.search(description)):
            print(f"\n✓ MATCH: {target}")
            print(f"  ID: {inst_id}")
            print(f"  Name: {name}")
            if description:
                # Truncate long descriptions to 100 characters for display.
                if len(description) > 100:
                    desc_short = description[:100] + "..."
                else:
                    desc_short = description
                print(f"  Description: {desc_short}")
            break

print("\n" + "="*80)