#!/usr/bin/env python3
"""
Enrich Brazilian institutions - Batch 17 (Final push to 70%)
Target: 4 high-potential candidates to reach 70% coverage goal
Current: 85/126 (67.5%) → Goal: 88/126 (70%)
"""

import yaml
import requests
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional


def query_wikidata(sparql_query: str) -> dict:
    """Execute a SPARQL query against the Wikidata Query Service.

    Best-effort: any network/HTTP/parse failure is reported to stdout and
    an empty result structure is returned so callers can iterate safely.

    Args:
        sparql_query: Complete SPARQL query text.

    Returns:
        Parsed SPARQL-JSON response, or ``{'results': {'bindings': []}}``
        on any error.
    """
    endpoint = "https://query.wikidata.org/sparql"
    # WDQS requires a descriptive User-Agent; default requests UA is throttled.
    headers = {'User-Agent': 'GLAMExtractor/1.0 (Heritage Institution Research)'}
    try:
        response = requests.get(
            endpoint,
            params={'query': sparql_query, 'format': 'json'},
            headers=headers,
            timeout=30
        )
        response.raise_for_status()
        return response.json()
    except Exception as e:
        # Deliberate broad catch: enrichment is best-effort, a single failed
        # query must not abort the batch run.
        print(f"   ⚠️  Query error: {e}")
        return {'results': {'bindings': []}}


def _sparql_escape(text: str) -> str:
    """Escape a value for safe embedding inside a double-quoted SPARQL literal."""
    return text.replace('\\', '\\\\').replace('"', '\\"')


def search_brazilian_museum(name: str, city: str, region: str) -> Optional[dict]:
    """Search for a Brazilian museum in Wikidata.

    Tries exact Portuguese-label matches for the name plus known aliases,
    then falls back to a substring search over Brazilian museums.

    Args:
        name: Institution name as recorded in the dataset.
        city: City used for disambiguation of generic names.
        region: Brazilian state/region (informational, printed only).

    Returns:
        Dict with keys ``q_id``, ``label``, ``description``, ``viaf``,
        ``isil`` for the first exact match, or ``None`` if nothing matched.
    """
    # Build search patterns; aliases for known hard cases.
    search_patterns = [name]
    if "Museu dos Povos Acreanos" in name:
        search_patterns.extend([
            "Museu dos Povos Acreanos",
            "Memorial dos Autonomistas"
        ])
    elif "Natural History Museum" in name and city == "Campina Grande":
        search_patterns.extend([
            "Museu de História Natural Campina Grande",
            "Museu de História Natural da Paraíba"
        ])
    elif "Museu Memória" in name:
        search_patterns.extend([
            "Museu da Memória Rondoniense",
            "Museu Memória Rondoniense"
        ])
    elif "MuseusBr" in name:
        search_patterns.extend([
            "Cadastro Nacional de Museus",
            "MuseusBR",
            "Sistema Brasileiro de Museus"  # Related platform
        ])

    # De-duplicate while preserving order (the Acreanos branch would
    # otherwise re-query the original name a second time).
    search_patterns = list(dict.fromkeys(search_patterns))

    print(f"  🔍 Searching Wikidata for: {name}")
    print(f"     City: {city}, Region: {region}")

    for pattern in search_patterns:
        print(f"     Trying pattern: {pattern}")
        # Exact Portuguese-label match restricted to museums (Q33506 and
        # subclasses). Pattern is escaped so quotes cannot break the query.
        query = f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil WHERE {{
          ?item rdfs:label "{_sparql_escape(pattern)}"@pt .
          ?item wdt:P31/wdt:P279* wd:Q33506 .  # instance of museum
          OPTIONAL {{ ?item wdt:P214 ?viaf }}
          OPTIONAL {{ ?item wdt:P791 ?isil }}
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "pt,en". }}
        }}
        LIMIT 5
        """
        result = query_wikidata(query)
        bindings = result.get('results', {}).get('bindings', [])
        if bindings:
            item = bindings[0]
            # Item URI is http://www.wikidata.org/entity/Q…; keep the Q-id.
            q_id = item['item']['value'].split('/')[-1]
            label = item.get('itemLabel', {}).get('value', '')
            desc = item.get('itemDescription', {}).get('value', '')
            print(f"     ✅ Found: {q_id} - {label}")
            print(f"        Description: {desc}")
            return {
                'q_id': q_id,
                'label': label,
                'description': desc,
                'viaf': item.get('viaf', {}).get('value'),
                'isil': item.get('isil', {}).get('value')
            }
        time.sleep(1)  # Rate limit between WDQS queries

    # Fallback: substring search on the first word of the name, restricted
    # to museums located in Brazil (P17 = Q155).
    print(f"     🔎 Trying text search...")
    name_tokens = name.lower().split()
    if not name_tokens:
        # Empty/whitespace name: nothing to search for.
        return None
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription WHERE {{
      ?item wdt:P31/wdt:P279* wd:Q33506 .
      ?item wdt:P17 wd:Q155 .
      ?item rdfs:label ?label .
      FILTER(CONTAINS(LCASE(?label), "{_sparql_escape(name_tokens[0])}"))
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "pt,en". }}
    }}
    LIMIT 10
    """
    result = query_wikidata(query)
    bindings = result.get('results', {}).get('bindings', [])
    if bindings:
        # Candidates only — printed for manual review, never auto-accepted.
        print(f"     📋 Found {len(bindings)} potential matches:")
        for idx, item in enumerate(bindings[:5], 1):
            q_id = item['item']['value'].split('/')[-1]
            label = item.get('itemLabel', {}).get('value', '')
            desc = item.get('itemDescription', {}).get('value', '')
            print(f"        {idx}. {q_id} - {label}")
            print(f"           {desc}")
    return None


def enrich_institutions():
    """Enrich Batch 17 target institutions.

    Loads the dataset, searches Wikidata for the four Batch 17 targets
    (skipping any that already carry a Wikidata identifier), and prints
    a summary. Read-only: the dataset file is not modified.
    """
    print("=" * 80)
    print("BRAZILIAN ENRICHMENT - BATCH 17")
    print("Goal: Reach 70% coverage (88/126 institutions)")
    print("=" * 80)
    print()

    # Load dataset
    yaml_path = Path('data/instances/all/globalglam-20251111-batch16-fixed.yaml')
    print(f"📂 Loading: {yaml_path.name}")
    with open(yaml_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty file; normalize to a list.
        institutions = yaml.safe_load(f) or []

    # Target institutions for this batch, keyed by exact dataset name.
    targets = {
        "Museu dos Povos Acreanos": {"city": "Rio Branco", "region": "ACRE"},
        "Natural History Museum": {"city": "Campina Grande", "region": "PARAÍBA"},
        "Museu Memória": {"city": "Porto Velho", "region": "RONDÔNIA"},
        "MuseusBr": {"city": "Brasília", "region": "DISTRITO FEDERAL"}  # National platform
    }

    enriched_count = 0
    for inst in institutions:
        name = inst.get('name', '')
        if name in targets:
            target_info = targets[name]
            print(f"\n{'=' * 80}")
            print(f"🎯 Target: {name}")
            print(f"{'=' * 80}")

            # Skip institutions that already carry a Wikidata identifier.
            identifiers = inst.get('identifiers', [])
            has_wikidata = any(
                id_obj.get('identifier_scheme') == 'Wikidata'
                for id_obj in identifiers
            )
            if has_wikidata:
                print("   ⏭️  Already enriched, skipping")
                continue

            # Search Wikidata
            result = search_brazilian_museum(
                name,
                target_info['city'],
                target_info['region']
            )
            if result:
                print(f"\n   ✅ MATCH FOUND: {result['q_id']}")
                enriched_count += 1
            else:
                print(f"\n   ❌ NO MATCH: Manual research required")

    print()
    print("=" * 80)
    print("BATCH 17 ENRICHMENT SUMMARY")
    print("=" * 80)
    print(f"✅ Institutions enriched: {enriched_count}/4")
    print(f"❌ Requiring manual research: {4 - enriched_count}/4")
    print()
    print("📝 Next steps:")
    print("   1. Review match quality for automated matches")
    print("   2. Conduct manual Wikidata searches for unmatched institutions")
    print("   3. Consider Portuguese Wikipedia as alternative source")
    print("   4. Update dataset with verified identifiers")
    print()


if __name__ == '__main__':
    enrich_institutions()