#!/usr/bin/env python3
"""
Query Wikidata for Mexican heritage institutions using SPARQL.
Saves results to JSON for fuzzy matching in Batch 2 enrichment.
"""

import json
import re
from pathlib import Path

# A label that is nothing but a Wikidata identifier (e.g. "Q12345") means the
# entity has no usable Spanish/English label. Real names that merely start
# with the letter "Q" (e.g. "Quinta Gameros") must NOT match.
_BARE_QID = re.compile(r'Q\d+')

# (record field, SPARQL binding) pairs that hold a single value per
# institution; the first non-empty value seen across the rows wins.
_SINGLE_VALUE_FIELDS = (
    ('viaf', 'viaf'),
    ('isil', 'isil'),
    ('coordinates', 'coords'),
    ('type', 'typeLabel'),
    ('location', 'locationLabel'),
)


def query_wikidata_mexican_glam():
    """Query Wikidata for Mexican GLAM institutions.

    Sends one SPARQL query to the public Wikidata endpoint using HTTP POST.

    Returns:
        The list of raw SPARQL JSON bindings (one dict per result row), or an
        empty list if the request or the response conversion fails; the error
        is printed in that case.

    Raises:
        ImportError: if the ``SPARQLWrapper`` package is not installed.

    NOTE(review): the type list includes wd:Q22698 (labelled "park") although
    the query comment only mentions museums, libraries, archives and
    galleries -- confirm this is intended.
    """
    # Imported lazily so the pure parsing helper in this module stays
    # importable (and testable) without the third-party dependency.
    from SPARQLWrapper import JSON, POST, SPARQLWrapper

    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    # setMethod() selects the HTTP verb. setRequestMethod() only accepts
    # 'urlencoded' / 'postdirectly', so passing 'POST' to it is rejected with
    # a warning and the query would still be sent as GET.
    endpoint.setMethod(POST)

    # Query for museums, libraries, archives, galleries in Mexico
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?viaf ?isil ?coords ?typeLabel ?location ?locationLabel
    WHERE {
      # Instance of museum, library, archive, or gallery
      VALUES ?type {
        wd:Q33506 # museum
        wd:Q7075 # library
        wd:Q166118 # archive
        wd:Q1007870 # art museum
        wd:Q22698 # park
        wd:Q207694 # art gallery
      }
      ?item wdt:P31/wdt:P279* ?type .

      # Located in Mexico
      ?item wdt:P17 wd:Q96 .

      # Optional: external identifiers
      OPTIONAL { ?item wdt:P214 ?viaf } # VIAF ID
      OPTIONAL { ?item wdt:P791 ?isil } # ISIL code
      OPTIONAL { ?item wdt:P625 ?coords } # Coordinates

      # Optional: location (city/state)
      OPTIONAL {
        ?item wdt:P131 ?location .
        ?location rdfs:label ?locationLabel .
        FILTER(LANG(?locationLabel) = "es" || LANG(?locationLabel) = "en")
      }

      # Get labels in Spanish and English
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "es,en" .
        ?item rdfs:label ?itemLabel .
        ?type rdfs:label ?typeLabel .
      }

      # Optional: alternative names
      OPTIONAL {
        ?item skos:altLabel ?itemAltLabel .
        FILTER(LANG(?itemAltLabel) = "es" || LANG(?itemAltLabel) = "en")
      }
    }
    LIMIT 500
    """

    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)

    print("Querying Wikidata for Mexican heritage institutions...")
    print("This may take 30-60 seconds...")

    # Deliberately broad: this is the script's network boundary and any
    # failure is reported and turned into "no results" for main().
    try:
        results = endpoint.query().convert()
        return results['results']['bindings']
    except Exception as e:
        print(f"Error querying Wikidata: {e}")
        return []


def parse_wikidata_results(results):
    """Parse SPARQL results into structured format.

    The query yields several rows per institution (one per combination of
    alternative label, location label, ...). Rows are merged by Q-number.

    Args:
        results: iterable of SPARQL JSON bindings; each is a dict mapping a
            variable name to ``{'value': ...}`` and must contain ``'item'``.

    Returns:
        A list of dicts, one per institution in first-seen order, with keys
        ``q_number``, ``name``, ``alternative_names``, ``viaf``, ``isil``,
        ``coordinates``, ``type`` and ``location``. Rows without a label, or
        whose label is just a bare Q-identifier, are skipped.
    """
    institutions = {}

    for result in results:
        # Extract Q-number from the entity URI (last path segment)
        q_number = result['item']['value'].rsplit('/', 1)[-1]

        name = result.get('itemLabel', {}).get('value', '')

        # Skip missing labels and placeholder labels such as "Q12345", but
        # keep genuine names that happen to begin with "Q".
        if not name or _BARE_QID.fullmatch(name):
            continue

        record = institutions.get(q_number)
        if record is None:
            record = {
                'q_number': q_number,
                'name': name,
                'alternative_names': [],
                'viaf': None,
                'isil': None,
                'coordinates': None,
                'type': '',
                'location': '',
            }
            institutions[q_number] = record

        # Add alternative name if present, without duplicates
        if 'itemAltLabel' in result:
            alt_name = result['itemAltLabel']['value']
            if alt_name not in record['alternative_names']:
                record['alternative_names'].append(alt_name)

        # Keep the first non-empty value for each single-valued field, even
        # when it only shows up on a later row for the same institution.
        for field, binding in _SINGLE_VALUE_FIELDS:
            if binding in result and not record[field]:
                record[field] = result[binding]['value']

    return list(institutions.values())


def main():
    """Fetch institutions from Wikidata, save them as JSON, print a sample."""
    # Query Wikidata
    results = query_wikidata_mexican_glam()

    if not results:
        print("No results found or query failed.")
        return

    print(f"Received {len(results)} results from Wikidata")

    # Parse results
    institutions = parse_wikidata_results(results)

    print(f"Parsed {len(institutions)} unique Mexican institutions")

    # Save to JSON under <repo>/data/wikidata/, relative to this script
    output_path = Path(__file__).parent.parent / 'data' / 'wikidata' / 'mexican_institutions_wikidata.json'
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, ensure_ascii=False, indent=2)

    print(f"\nSaved {len(institutions)} institutions to {output_path}")

    # Print sample results
    print("\nSample results:")
    for inst in institutions[:10]:
        print(f" - {inst['name']} ({inst['q_number']})")
        if inst['viaf']:
            print(f" VIAF: {inst['viaf']}")
        if inst['isil']:
            print(f" ISIL: {inst['isil']}")


if __name__ == '__main__':
    main()