#!/usr/bin/env python3
"""
Interactive script to search Wikidata for Brazilian institutions.

Helps build enrichment mappings by querying Wikidata SPARQL endpoint.
"""

import time
from typing import List, Dict, Optional

import requests

WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Pause inserted between consecutive SPARQL requests issued by main().
REQUEST_DELAY_SECONDS = 2

# Characters that must be backslash-escaped inside a double-quoted SPARQL
# string literal. Built once so escaping is a single translate() pass and a
# backslash is never escaped twice.
_SPARQL_ESCAPES = str.maketrans({
    '\\': '\\\\',
    '"': '\\"',
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
})


def _escape_sparql_literal(text: str) -> str:
    """Return *text* escaped for use inside a double-quoted SPARQL literal."""
    return text.translate(_SPARQL_ESCAPES)


def _parse_bindings(payload: Dict) -> List[Dict[str, str]]:
    """Convert a SPARQL JSON results payload into a list of match dicts.

    Each match has the keys ``qid``, ``label``, ``description`` and
    ``website``. Bindings that carry no ``item`` value are skipped.
    """
    matches = []
    for binding in payload.get('results', {}).get('bindings', []):
        item_uri = binding.get('item', {}).get('value')
        if not item_uri:
            continue
        matches.append({
            # The entity URI ends in the QID, e.g. ".../entity/Q155".
            'qid': item_uri.split('/')[-1],
            'label': binding.get('itemLabel', {}).get('value', 'No label'),
            'description': binding.get('itemDescription', {}).get('value', ''),
            'website': binding.get('website', {}).get('value', ''),
        })
    return matches


def search_wikidata_sparql(search_terms: str, limit: int = 10) -> List[Dict]:
    """Search Wikidata using SPARQL for heritage institutions.

    The query selects items whose country (P17) is Brazil (Q155) and whose
    label or alias contains *search_terms* as a case-insensitive substring.

    Args:
        search_terms: Text to look for inside item labels/aliases. It is
            escaped before being embedded in the query.
        limit: Maximum number of result rows requested from the endpoint.

    Returns:
        A list of dicts with the keys ``qid``, ``label``, ``description`` and
        ``website``. An empty list is returned (and the error printed) when
        the request fails or the response cannot be interpreted.
    """
    literal = _escape_sparql_literal(search_terms)

    # NOTE: the query must stay multi-line; SPARQL '#' comments run to the
    # end of the line.
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?coordinate ?website WHERE {{
      # Search in labels and aliases
      ?item rdfs:label|skos:altLabel ?label .
      FILTER(CONTAINS(LCASE(?label), LCASE("{literal}")))

      # Brazilian institutions
      ?item wdt:P17 wd:Q155 .  # country = Brazil

      # Optional data
      OPTIONAL {{ ?item wdt:P625 ?coordinate }}
      OPTIONAL {{ ?item wdt:P856 ?website }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "pt,en" }}
    }}
    LIMIT {int(limit)}
    """

    headers = {
        'User-Agent': 'GLAM-Data-Extractor/0.1 (heritage institutions research)',
        'Accept': 'application/sparql-results+json',
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL_ENDPOINT,
            params={'query': query, 'format': 'json'},
            headers=headers,
            timeout=30,
        )
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network/HTTP failures and undecodable JSON bodies.
        print(f"❌ Error querying Wikidata: {e}")
        return []

    try:
        return _parse_bindings(payload)
    except (AttributeError, TypeError) as e:
        # The payload decoded as JSON but does not have the expected shape.
        print(f"❌ Error querying Wikidata: {e}")
        return []


def main():
    """Search for Brazilian institutions in Wikidata.

    Runs one query per priority target, prints every match, remembers the
    first match of each target, and finally prints a summary plus a
    copy-pasteable ``enrichment_mappings`` dict.
    """
    # Priority targets from our dataset: (institution name, search terms)
    targets = [
        ("Museu Sacaca", "museum indigenous macapá"),
        ("Dom Bosco", "museum campo grande"),
        ("Homem Sergipano", "museum aracaju anthropology"),
        ("Geopark Araripe", "geopark crato ceará"),
        ("Goiás", "unesco heritage city"),
        ("São Luís", "unesco heritage historic center"),
        ("Arquivo Público", "brasília archive federal district"),
        ("Memorial Rio Grande", "pelotas memorial museum"),
        ("Museu Povos Acreanos", "acre rio branco museum"),
        ("MARCO", "campo grande contemporary art"),
    ]

    print("🔍 Searching Wikidata for Brazilian Institutions")
    print("=" * 80)

    all_matches = {}

    for index, (inst_name, search_terms) in enumerate(targets):
        # Rate limiting: pause between requests, not after the final one.
        if index:
            time.sleep(REQUEST_DELAY_SECONDS)

        print(f"\n📍 Searching: {inst_name}")
        print(f" Terms: {search_terms}")
        print("-" * 80)

        # NOTE(review): search_terms is matched as one contiguous substring
        # of a label/alias (see the FILTER in search_wikidata_sparql), so
        # multi-keyword phrases may never match — confirm whether inst_name
        # should be searched instead.
        matches = search_wikidata_sparql(search_terms)

        if not matches:
            print(" ❌ No matches found")
            continue

        for i, match in enumerate(matches, 1):
            print(f" {i}. {match['qid']} - {match['label']}")
            if match['description']:
                print(f" {match['description']}")
            if match['website']:
                print(f" 🌐 {match['website']}")

        # Store best match (the first row returned by the endpoint)
        all_matches[inst_name] = matches[0]

    # Print summary
    print("\n" + "=" * 80)
    print("📊 ENRICHMENT MAPPING SUMMARY")
    print("=" * 80)

    if not all_matches:
        print("\n❌ No matches found")
        return

    print("\nFound Wikidata matches:")
    for inst_name, match in all_matches.items():
        print(f" ✅ {inst_name} → {match['qid']} ({match['label']})")

    # Generate enrichment mapping code
    print("\n" + "=" * 80)
    print("📝 ENRICHMENT MAPPING CODE (for batch script):")
    print("=" * 80)
    print("\nenrichment_mappings = {")
    for inst_name, match in all_matches.items():
        # repr() keeps the generated code valid even if a name has quotes.
        print(f" {inst_name!r}: {match['qid']!r}, # {match['label']}")
    print("}")


if __name__ == "__main__":
    main()