#!/usr/bin/env python3
"""
Extract Argentine Heritage Institutions from Wikidata

This script uses the Wikidata SPARQL endpoint to extract Argentine heritage
institutions (libraries, archives, museums) that are documented in Wikidata
but may not have ISIL codes.

Output: data/isil/AR/argentina_wikidata_institutions.json
"""

import json
from datetime import datetime, timezone
from typing import List, Dict, Optional

# Wikidata SPARQL query results (from manual query - 200 institutions)
# This data was extracted using the wikidata-authenticated_execute_sparql MCP tool
# Query: Argentine heritage institutions (archives, libraries, museums)
# Date: 2025-11-18
WIKIDATA_RESULTS = [
    # Results from SPARQL query would be pasted here
    # For now, we'll structure the extraction from the query results above
]


def extract_institution_from_wikidata(result: Dict) -> Dict:
    """
    Convert Wikidata SPARQL result to heritage custodian record.

    Args:
        result: SPARQL query result dictionary (SPARQL 1.1 JSON results
            binding: each variable maps to a dict with a 'value' key, and
            labels may carry an 'xml:lang' key).

    Returns:
        Heritage custodian record dict with name, type, country, locations,
        identifiers, and provenance metadata.
    """
    # Extract Q-number from Wikidata URI
    # (e.g. "http://www.wikidata.org/entity/Q123" -> "Q123")
    q_number = result['item']['value'].split('/')[-1]

    # Parse institution name; Wikidata labels default to Spanish for AR entities
    name = result.get('itemLabel', {}).get('value', 'Unknown')
    lang = result.get('itemLabel', {}).get('xml:lang', 'es')

    # Parse institution type and map it onto the project taxonomy
    type_label = result.get('typeLabel', {}).get('value', '').lower()
    institution_type = map_type_to_taxonomy(type_label)

    # Parse location
    city = result.get('cityLabel', {}).get('value')

    # Parse coordinates (WKT "Point(lon lat)" literal, if present)
    coords_wkt = result.get('coords', {}).get('value')
    lat, lon = parse_coordinates(coords_wkt) if coords_wkt else (None, None)

    # Parse identifiers
    identifiers = []

    # Wikidata Q-number (always present: derived from the item URI above)
    identifiers.append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    })

    # VIAF (optional binding)
    if 'viaf' in result:
        identifiers.append({
            'identifier_scheme': 'VIAF',
            'identifier_value': result['viaf']['value'],
            'identifier_url': f"https://viaf.org/viaf/{result['viaf']['value']}"
        })

    # Website (optional binding)
    if 'website' in result:
        identifiers.append({
            'identifier_scheme': 'Website',
            'identifier_value': result['website']['value'],
            'identifier_url': result['website']['value']
        })

    # Build record
    record = {
        'name': name,
        'name_language': lang,
        'institution_type': institution_type,
        'country': 'AR',
        'locations': [],
        'identifiers': identifiers,
        'provenance': {
            'data_source': 'WIKIDATA',
            'data_tier': 'TIER_3_CROWD_SOURCED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Wikidata SPARQL query for Argentine heritage institutions',
            'confidence_score': 0.85,
            'notes': f'Extracted from Wikidata entity {q_number}'
        }
    }

    # Add location if available.
    # NOTE: explicit `is not None` checks — a latitude/longitude of 0.0 is
    # falsy but is still a valid coordinate (equator / prime meridian).
    if city or lat is not None:
        location = {
            'city': city,
            'country': 'AR'
        }
        if lat is not None and lon is not None:
            location['latitude'] = lat
            location['longitude'] = lon
        record['locations'].append(location)

    return record


def map_type_to_taxonomy(type_label: str) -> str:
    """Map Wikidata type labels to GLAMORCUBESFIXPHDNT taxonomy.

    Matches by substring, so e.g. "biblioteca popular" -> 'LIBRARY'.
    Returns 'UNKNOWN' when no known Spanish type keyword is found.
    """
    type_mapping = {
        'archivo': 'ARCHIVE',
        'biblioteca': 'LIBRARY',
        'museo': 'MUSEUM',
        'exposición': 'MUSEUM',  # Exhibitions are museum-related
        'galería': 'GALLERY'
    }

    for key, value in type_mapping.items():
        if key in type_label:
            return value

    return 'UNKNOWN'


def parse_coordinates(wkt: str) -> tuple:
    """
    Parse WKT Point format to lat/lon.

    Example: "Point(-58.3719 -34.6043)" -> (-34.6043, -58.3719)

    Note that WKT stores longitude first; the returned tuple is swapped
    to the conventional (latitude, longitude) order.

    Returns:
        (latitude, longitude) tuple, or (None, None) if parsing fails.
    """
    try:
        # Remove "Point(" prefix and ")" suffix
        coords = wkt.replace('Point(', '').replace(')', '')
        lon, lat = map(float, coords.split())
        return (lat, lon)
    except (ValueError, AttributeError):
        # ValueError: non-numeric tokens or wrong token count;
        # AttributeError: wkt is not a string. Anything else should surface.
        return (None, None)


def main():
    """Main execution: print the status/findings summary for this template script."""
    print("=" * 80)
    print("WIKIDATA ARGENTINE INSTITUTIONS EXTRACTION")
    print("=" * 80)
    print()
    print("Status: This script is a template for future Wikidata extraction.")
    print()
    print("The Wikidata SPARQL query found ~200 Argentine heritage institutions,")
    print("but NONE of them have ISIL codes recorded in Wikidata.")
    print()
    print("Findings:")
    print("  - 200+ institutions in Wikidata (archives, libraries, museums)")
    print("  - Many have websites, VIAF IDs, coordinates")
    print("  - ISIL codes NOT systematically recorded")
    print()
    print("Recommendation: Focus on IRAM email for ISIL registry instead of")
    print("                extracting from Wikidata.")
    print()
    print("However, Wikidata can be used for:")
    print("  1. Enriching institutions with Wikidata Q-numbers")
    print("  2. Finding coordinates for geocoding")
    print("  3. Cross-referencing with CONABIP/AGN data")
    print()
    print("=" * 80)


if __name__ == "__main__":
    main()