glam/find_brazil_institutions.py
2025-11-19 23:25:22 +01:00

60 lines
1.8 KiB
Python

#!/usr/bin/env python3
"""Find Brazilian institution IDs from main dataset."""
import re
from pathlib import Path

import yaml
# Location of the main dataset, relative to the current working directory.
DATASET_PATH = "data/instances/all/globalglam-20251111.yaml"

# Target names to search for.  Every entry is applied as a case-insensitive
# regular expression, which is what lets the ".*" entry below match anything;
# the plain names contain no regex metacharacters and so match literally.
TARGETS = [
    "Universidade Federal de Rondônia",
    "UNIR",
    "Secretaria de Estado da Cultura do Tocantins",
    "Secretaria.*Cultura.*Tocantins",
    "Museu do Estado de Pernambuco",
    "Museu Histórico Nacional",
    "Fundação Cultural Palmares",
    "Museu Imperial",
]


def _as_text(value) -> str:
    """Return *value* as a string, mapping a missing/null field to ''."""
    if isinstance(value, str):
        return value
    return "" if value is None else str(value)


def load_institutions(path: str = DATASET_PATH) -> list:
    """Load every YAML document in *path* and return them as a list.

    If the file holds exactly one document and that document is itself a
    list, the inner list is returned instead of a one-element wrapper.

    Raises:
        OSError: if *path* cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
    """
    with open(path, 'r', encoding='utf-8') as f:
        documents = list(yaml.safe_load_all(f))
    # Flatten if nested
    if len(documents) == 1 and isinstance(documents[0], list):
        documents = documents[0]
    return documents


def is_brazilian(inst: dict) -> bool:
    """Return True if any location entry of *inst* has country code 'BR'."""
    # A key that is present but null yields None from .get(), so normalise it.
    locations = inst.get('locations') or []
    if not isinstance(locations, (list, tuple)):
        return False
    return any(
        isinstance(loc, dict) and loc.get('country') == 'BR'
        for loc in locations
    )


def find_matches(institutions, targets=TARGETS) -> list:
    """Return ``(target, institution)`` pairs for Brazilian institutions.

    Each record is reported at most once, paired with the first entry of
    *targets* whose pattern is found in its name or in its (non-empty)
    description.  Entries of *institutions* that are not dicts are skipped.

    Raises:
        re.error: if an entry of *targets* is not a valid regular expression.
    """
    # Compile once up front rather than once per record.
    compiled = [(target, re.compile(target, re.IGNORECASE)) for target in targets]
    matches = []
    for inst in institutions:
        if not isinstance(inst, dict) or not is_brazilian(inst):
            continue
        name = _as_text(inst.get('name'))
        description = _as_text(inst.get('description'))
        for target, pattern in compiled:
            if pattern.search(name) or (description and pattern.search(description)):
                matches.append((target, inst))
                break  # first matching target wins
    return matches


def main() -> None:
    """Load the dataset and print every Brazilian institution matching a target."""
    print("Loading main dataset...")
    institutions = load_institutions()
    print(f"Loaded {len(institutions)} institutions")

    print("\n" + "="*80)
    print("SEARCHING FOR TARGET INSTITUTIONS")
    print("="*80)

    for target, inst in find_matches(institutions, TARGETS):
        name = _as_text(inst.get('name'))
        description = _as_text(inst.get('description'))
        inst_id = inst.get('id', 'NO_ID')
        print(f"\n✓ MATCH: {target}")
        print(f" ID: {inst_id}")
        print(f" Name: {name}")
        if description:
            # Keep the console output compact: show at most 100 characters.
            desc_short = description[:100] + "..." if len(description) > 100 else description
            print(f" Description: {desc_short}")

    print("\n" + "="*80)


if __name__ == "__main__":
    main()