glam/find_brazil_bonus.py
2025-11-19 23:25:22 +01:00

72 lines
2.4 KiB
Python

#!/usr/bin/env python3
"""Find Brazilian bonus institutions from main dataset."""
import re
import unicodedata

import yaml
# Load main dataset
print("Loading main dataset...")
with open("data/instances/all/globalglam-20251111.yaml", 'r', encoding='utf-8') as f:
institutions = list(yaml.safe_load_all(f))
if len(institutions) == 1 and isinstance(institutions[0], list):
institutions = institutions[0]
print(f"Loaded {len(institutions)} institutions")
# Bonus targets
bonus_targets = {
"Museu do Estado de Pernambuco": ["pernambuco", "estado"],
"Museu Histórico Nacional": ["histórico nacional", "historico nacional"],
"Fundação Cultural Palmares": ["palmares", "fundação cultural"],
"Museu Imperial": ["imperial", "petrópolis"]
}
print("\n" + "="*80)
print("SEARCHING FOR BONUS INSTITUTIONS")
print("="*80)
for inst in institutions:
if not isinstance(inst, dict):
continue
locations = inst.get('locations', [])
is_brazil = any(loc.get('country') == 'BR' for loc in locations if isinstance(loc, dict))
if not is_brazil:
continue
name = inst.get('name', '').lower()
description = str(inst.get('description', '')).lower()
inst_id = inst.get('id', 'NO_ID')
for target_name, keywords in bonus_targets.items():
# Check if ALL keywords appear in name or description
if all(kw.lower() in name or kw.lower() in description for kw in keywords):
print(f"\n✓ MATCH: {target_name}")
print(f" ID: {inst_id}")
print(f" Name: {inst.get('name', 'N/A')}")
if description:
desc_short = description[:100] + "..." if len(description) > 100 else description
print(f" Description: {desc_short}")
# Also check if already enriched with these Q-numbers
print("\n" + "="*80)
print("CHECKING IF Q-NUMBERS ALREADY EXIST")
print("="*80)
target_qids = ["Q6940628", "Q510993", "Q10286282", "Q1887049"]
for inst in institutions:
if not isinstance(inst, dict):
continue
identifiers = inst.get('identifiers', [])
for ident in identifiers:
if not isinstance(ident, dict):
continue
if ident.get('identifier_value') in target_qids:
print(f"\n✓ Q-NUMBER EXISTS: {ident.get('identifier_value')}")
print(f" ID: {inst.get('id', 'NO_ID')}")
print(f" Name: {inst.get('name', 'N/A')}")
print("\n" + "="*80)