#!/usr/bin/env python3
"""Find Brazilian bonus institutions from main dataset.

The script loads the main institutions YAML file and prints two reports:

1. Brazilian institutions (a location with ``country == 'BR'``) whose name or
   description contains every keyword of one of the bonus targets.
2. Institutions (of any country) that already carry one of the target
   Wikidata Q-numbers among their identifiers.
"""

import re
import unicodedata

# Path of the dataset read by ``main``.
DATASET_PATH = "data/instances/all/globalglam-20251111.yaml"

# Bonus targets: display name -> keywords that must ALL occur in the
# institution's name or description. Matching is case- and accent-insensitive,
# so accented and unaccented spellings of a keyword are equivalent.
bonus_targets = {
    "Museu do Estado de Pernambuco": ["pernambuco", "estado"],
    "Museu Histórico Nacional": ["histórico nacional", "historico nacional"],
    "Fundação Cultural Palmares": ["palmares", "fundação cultural"],
    "Museu Imperial": ["imperial", "petrópolis"],
}

# Q-numbers looked up in the second report.
target_qids = ("Q6940628", "Q510993", "Q10286282", "Q1887049")

_SEPARATOR = "=" * 80
_DESCRIPTION_LIMIT = 100


def _fold(text):
    """Return *text* case-folded with diacritics removed.

    Used on both the searched text and the keywords so that, for example,
    ``"Histórico"`` and ``"historico"`` compare equal.
    """
    decomposed = unicodedata.normalize("NFKD", str(text).casefold())
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def _as_list(value):
    """Return *value* if it is a list, otherwise an empty list.

    Guards against YAML fields that are missing, null, or not a sequence.
    """
    return value if isinstance(value, list) else []


def load_institutions(path=DATASET_PATH):
    """Load the institution records from the YAML file at *path*.

    The file may hold either several YAML documents (one record each) or a
    single document that is itself a list of records; both shapes yield a
    flat list.
    """
    # Imported lazily so the pure matching helpers stay importable (and
    # testable) on systems where PyYAML is not installed.
    import yaml

    with open(path, "r", encoding="utf-8") as f:
        documents = list(yaml.safe_load_all(f))
    if len(documents) == 1 and isinstance(documents[0], list):
        return documents[0]
    return documents


def is_brazilian(inst):
    """Return True if any location dict of *inst* has ``country == 'BR'``."""
    return any(
        isinstance(loc, dict) and loc.get("country") == "BR"
        for loc in _as_list(inst.get("locations"))
    )


def find_bonus_matches(institutions, targets=None):
    """Return ``(target_name, institution)`` pairs for Brazilian matches.

    An institution matches a target when every keyword of that target occurs
    in its name or in its description (case- and accent-insensitive).
    Non-dict records and non-Brazilian institutions are skipped.

    Args:
        institutions: iterable of institution records (dicts).
        targets: mapping of target name to keyword list; defaults to the
            module-level ``bonus_targets``.
    """
    if targets is None:
        targets = bonus_targets
    # Fold the keywords once rather than once per institution.
    folded_targets = {
        target_name: [_fold(kw) for kw in keywords]
        for target_name, keywords in targets.items()
    }

    matches = []
    for inst in institutions:
        if not isinstance(inst, dict) or not is_brazilian(inst):
            continue
        # ``or ''`` turns a YAML null into empty text instead of crashing on
        # ``None.lower()`` or matching against the literal string "none".
        name = _fold(inst.get("name") or "")
        description = _fold(inst.get("description") or "")
        for target_name, keywords in folded_targets.items():
            if all(kw in name or kw in description for kw in keywords):
                matches.append((target_name, inst))
    return matches


def find_existing_qids(institutions, qids=target_qids):
    """Return ``(qid, institution)`` pairs for identifiers found in *qids*.

    Every identifier dict of every institution dict is inspected; its
    ``identifier_value`` is compared against *qids*.
    """
    found = []
    for inst in institutions:
        if not isinstance(inst, dict):
            continue
        for ident in _as_list(inst.get("identifiers")):
            if not isinstance(ident, dict):
                continue
            value = ident.get("identifier_value")
            if value in qids:
                found.append((value, inst))
    return found


def _print_banner(title=None):
    """Print a separator line, optionally followed by *title* and another line."""
    print("\n" + _SEPARATOR)
    if title is not None:
        print(title)
        print(_SEPARATOR)


def main():
    """Load the dataset and print both reports."""
    print("Loading main dataset...")
    institutions = load_institutions()
    print(f"Loaded {len(institutions)} institutions")

    _print_banner("SEARCHING FOR BONUS INSTITUTIONS")
    for target_name, inst in find_bonus_matches(institutions):
        print(f"\n✓ MATCH: {target_name}")
        print(f" ID: {inst.get('id', 'NO_ID')}")
        print(f" Name: {inst.get('name', 'N/A')}")
        # Show the description as stored, not the folded copy used for matching.
        description = str(inst.get("description") or "")
        if description:
            if len(description) > _DESCRIPTION_LIMIT:
                desc_short = description[:_DESCRIPTION_LIMIT] + "..."
            else:
                desc_short = description
            print(f" Description: {desc_short}")

    # Also check if already enriched with these Q-numbers
    _print_banner("CHECKING IF Q-NUMBERS ALREADY EXIST")
    for qid, inst in find_existing_qids(institutions):
        print(f"\n✓ Q-NUMBER EXISTS: {qid}")
        print(f" ID: {inst.get('id', 'NO_ID')}")
        print(f" Name: {inst.get('name', 'N/A')}")

    _print_banner()


if __name__ == "__main__":
    main()