glam/enrich_brazil_batch13.py
2025-11-19 23:25:22 +01:00

177 lines
6.2 KiB
Python

#!/usr/bin/env python3
"""
Brazil Batch 13 Wikidata Enrichment Script
Target: 10-12 institutions to reach ~60-65% coverage (73-77 out of 121)
Focus: National institutions, state museums, major universities
This script uses manual Wikidata searches and documents Q-numbers found.
Run the actual enrichment by calling wikidata-authenticated_search_entity for each institution.
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
# Startup banner: an 80-column rule above and below the title and the batch
# goal, followed by one blank line. Emitted with a single print call; the
# empty final argument plus print's own newline produce the blank line.
print(
    "=" * 80,
    "BRAZIL BATCH 13 WIKIDATA ENRICHMENT",
    "=" * 80,
    "Target: 10-12 institutions (aiming for 60-65% coverage)",
    "=" * 80,
    "",
    sep="\n",
)
# =============================================================================
# BATCH 13 TARGET INSTITUTIONS
# =============================================================================
# Institutions to look up on Wikidata in this batch, in processing order.
#
# Every entry carries:
#   name          - display name of the institution
#   id            - identifier of the record in the dataset (some entries use
#                   a w3id.org URL, others a bare numeric string)
#   search_query  - text to pass to the Wikidata entity search
#   context       - short human-readable hint used when verifying a match
#   priority      - 1, 2 or 3 (entries are listed in ascending priority number)
# and optionally:
#   expected_qid  - a Q-number believed to be the right match before searching
BATCH13_TARGETS = [
    # --- Priority 1 ---------------------------------------------------------
    {
        "name": "Instituto Moreira Salles",
        "id": "https://w3id.org/heritage/custodian/br/instituto-moreira-salles",
        "search_query": "Instituto Moreira Salles Brazil",
        "context": "Major cultural institute with photographic collections",
        "expected_qid": "Q10302915",  # known from earlier research
        "priority": 1,
    },
    {
        "name": "Sistema Brasileiro de Museus",
        "id": "https://w3id.org/heritage/custodian/br/sistema-brasileiro-de-museus-sbm",
        "search_query": "Sistema Brasileiro de Museus",
        "context": "National museum system created 2004, coordinated by IBRAM",
        "priority": 1,
    },
    {
        "name": "Brasiliana Fotográfica",
        "id": "https://w3id.org/heritage/custodian/br/brasiliana-fotografica",
        "search_query": "Brasiliana Fotográfica",
        "context": "Inter-institutional photography collaboration",
        "priority": 1,
    },
    {
        "name": "Universidade Federal de Rondônia",
        "id": "3008281717687280329",
        "search_query": "Universidade Federal de Rondônia",
        "context": "Federal university in Rondônia (UNIR)",
        # NOTE(review): the original note only says "federal universities
        # follow pattern" - treat this Q-number as unverified until searched.
        "expected_qid": "Q10365614",
        "priority": 1,
    },
    # --- Priority 2 ---------------------------------------------------------
    {
        "name": "Fundação de Cultura Elias Mansour",
        "id": "https://w3id.org/heritage/custodian/br/ac-funda-o-de-cultura-elias-mansour-fem",
        "search_query": "Fundação de Cultura Elias Mansour Acre",
        "context": "State cultural foundation in Acre",
        "priority": 2,
    },
    {
        "name": "Museu dos Povos Acreanos",
        "id": "https://w3id.org/heritage/custodian/br/ac-museu-dos-povos-acreanos",
        "search_query": "Museu dos Povos Acreanos Rio Branco",
        "context": "Museum in Rio Branco (opened 2023)",
        "priority": 2,
    },
    {
        "name": "Museu Histórico de Alcântara",
        # NOTE(review): this id starts with "mt-" while the context places the
        # museum in Maranhão - confirm the id against the main dataset.
        "id": "https://w3id.org/heritage/custodian/br/mt-museu-hist-rico",
        "search_query": "Museu Histórico de Alcântara Maranhão",
        "context": "Historical museum in Alcântara, Maranhão",
        "priority": 2,
    },
    # --- Priority 3 ---------------------------------------------------------
    {
        "name": "Secretaria de Estado da Cultura do Amapá",
        "id": "1423599463777727402",
        "search_query": "Secretaria Cultura Amapá",
        "context": "State culture secretariat of Amapá",
        "priority": 3,
    },
    {
        "name": "Secretaria de Estado da Cultura do Tocantins",
        "id": "709508309148680086",
        "search_query": "Secretaria Cultura Tocantins",
        "context": "State culture secretariat of Tocantins",
        "priority": 3,
    },
    {
        "name": "Instituto Histórico e Geográfico de Alagoas",
        "id": "2519599505258789521",
        "search_query": "Instituto Histórico Geográfico Alagoas",
        "context": "Historical and geographic institute in Alagoas",
        "priority": 3,
    },
    {
        "name": "Sistema de Museus de Ouro Preto",
        "id": "https://w3id.org/heritage/custodian/br/mg-ouro-preto-system",
        "search_query": "Sistema Museus Ouro Preto",
        "context": "Brazil's first municipal museum network (2006)",
        "priority": 3,
    },
    {
        "name": "Museu Histórico de Goiás",
        "id": "https://w3id.org/heritage/custodian/br/go-museu-hist-rico-mham",
        "search_query": "Museu Histórico de Goiás",
        "context": "Historical museum in Goiás state",
        "priority": 3,
    },
]
# Human-readable listing of every target: a section header with the total
# count, then one numbered entry per institution followed by a blank line.
print(
    "BATCH 13 TARGET INSTITUTIONS",
    "=" * 80,
    f"Total targets: {len(BATCH13_TARGETS)}",
    "",
    sep="\n",
)
for idx, target in enumerate(BATCH13_TARGETS, start=1):
    # The four fields present on every entry go out in one call.
    print(
        f"{idx}. {target['name']}",
        f" Priority: {target['priority']}",
        f" Search query: {target['search_query']}",
        f" Context: {target['context']}",
        sep="\n",
    )
    # Only some entries carry a pre-researched Q-number.
    if 'expected_qid' in target:
        print(f" Expected Q-ID: {target['expected_qid']}")
    print()  # blank separator between entries
# Operator instructions for the manual enrichment workflow.
# Section header framed by 80-column rules.
print(
    "=" * 80,
    "INSTRUCTIONS FOR ENRICHMENT:",
    "=" * 80,
    sep="\n",
)
# The numbered steps, one argument per output line. The empty first and last
# arguments give the blank line before the steps and, together with print's
# own newline, the two blank lines' worth of spacing after them.
print(
    "",
    "1. For each institution above, call the wikidata-authenticated_search_entity tool",
    "with the search query provided",
    "2. Verify the Q-number returned matches the institution by checking:",
    "- Label matches the institution name",
    "- Description mentions correct location (Brazil, state)",
    "- Instance of (P31) is correct type (museum, university, etc.)",
    "3. Record results in BATCH13_MATCHES dictionary below",
    "4. For institutions not found via search, use wikidata-authenticated_execute_sparql",
    "with geographic and type filters (P17=Q155 for Brazil, P131 for state)",
    "5. After collecting all Q-numbers, create batch13_enriched.yaml with verified matches",
    "6. Run merge script to integrate into main dataset",
    "",
    sep="\n",
)
# Closing prompt and final rule.
print(
    "",
    "Ready to begin searches!",
    "Use the wikidata-authenticated_search_entity tool for each target above.",
    "=" * 80,
    sep="\n",
)
# Results template: starts empty and is filled in by hand as each search is
# completed. Keys are institution names; each value describes the match.
#
# Example entry:
#     "Instituto Moreira Salles": {
#         "qid": "Q10302915",
#         "label": "Instituto Moreira Salles",
#         "confidence": 0.95,
#         "notes": "Verified via search - cultural institute"
#     }
BATCH13_MATCHES = {}
# Coverage status report.
#
# The two counts and the expected yield of this batch are the only hand-entered
# inputs; every figure printed below is derived from them, so updating a count
# can no longer leave a stale total, target range or percentage in the output.
current_with_wikidata = 67   # institutions that already have a Q-number
total_brazilian = 121        # Brazilian institutions in the dataset
batch13_min_new = 6          # fewest new Q-numbers this batch should add
batch13_max_new = 10         # most new Q-numbers this batch should add

current_coverage = (current_with_wikidata / total_brazilian) * 100

# Projected totals and coverage once this batch's matches are merged.
target_low = current_with_wikidata + batch13_min_new
target_high = current_with_wikidata + batch13_max_new
target_low_pct = (target_low / total_brazilian) * 100
target_high_pct = (target_high / total_brazilian) * 100

print("\nCURRENT STATUS:")
print(
    f" Institutions with Q-numbers: "
    f"{current_with_wikidata}/{total_brazilian} ({current_coverage:.1f}%)"
)
print(
    f" Target after Batch 13: {target_low}-{target_high}/{total_brazilian} "
    f"({target_low_pct:.0f}-{target_high_pct:.0f}%)"
)
print(f" Need to find: {batch13_min_new}-{batch13_max_new} Q-numbers")
print()