#!/usr/bin/env python3
"""
Enhanced Wikidata MCP Server with Hybrid API Authentication

Architecture:
- Action API (legacy): Used for search (no auth, 500 requests/hour) and for
  write operations (OAuth2 Bearer token + CSRF token)
- Wikibase REST API: Used for data retrieval - 5,000 requests/hour (with OAuth2 token)

Rate limits:
- Without token: 500 requests/hour (anonymous) on all endpoints
- With token: 5,000 requests/hour on REST API, 500 req/hr on search

Based on: https://github.com/zzaebok/mcp-wikidata
Enhanced with authentication support for GLAM data extraction project
"""

import json
import os
import re
import sys
from typing import Any, Dict, List, Optional

import httpx
from mcp.server.fastmcp import FastMCP

server = FastMCP("Wikidata MCP Server (Authenticated)")

# API Endpoints
WIKIDATA_ACTION_API = "https://www.wikidata.org/w/api.php"
WIKIDATA_REST_API = "https://www.wikidata.org/w/rest.php/wikibase/v1"
SPARQL_URL = "https://query.wikidata.org/sparql"

# Load authentication from environment
WIKIDATA_API_TOKEN = os.getenv("WIKIDATA_API_TOKEN", "")
WIKIMEDIA_CONTACT_EMAIL = os.getenv("WIKIMEDIA_CONTACT_EMAIL", "user@example.com")

# User-Agent is REQUIRED by Wikimedia policy
# Format: AppName/Version (contact@email.com)
USER_AGENT = f"GLAMDataExtractor/1.0 ({WIKIMEDIA_CONTACT_EMAIL})"

# Headers for Action API (search only - no OAuth2 support)
ACTION_API_HEADERS = {
    "Accept": "application/json",
    "User-Agent": USER_AGENT,
}

# Headers for REST API (with OAuth2 token for 5,000 req/hour)
REST_API_HEADERS = {
    "Accept": "application/json",
    "User-Agent": USER_AGENT,
}

# REST API error key returned with HTTP 403 when the OAuth token is rejected
# for this wiki (e.g. unified login not activated); reads are then retried
# anonymously.
_OAUTH_INVALID_USER_ERROR = "mwoauth-invalid-authorization-invalid-user"

# Key heritage-related property IDs mapped to the names reported by
# get_identifiers().
_IDENTIFIER_PROPS = {
    "P791": "ISIL",  # ISIL code
    "P214": "VIAF",  # VIAF ID
    "P227": "GND",  # GND ID
    "P244": "LCNAF",  # Library of Congress
    "P1566": "GeoNames",  # GeoNames ID
    "P856": "official_website",  # Official website
    "P2581": "BabelNet",  # BabelNet ID
}

# Accepts "Q123", "q123" or a bare "123" as an item reference.
_ITEM_ID_RE = re.compile(r"[Qq]?(\d+)")


def _log(message: str) -> None:
    """Write a startup/diagnostic message to stderr.

    The server runs over the MCP stdio transport by default, where stdout is
    reserved for protocol messages; diagnostics therefore must not go there.
    """
    print(message, file=sys.stderr)


if WIKIDATA_API_TOKEN:
    REST_API_HEADERS["Authorization"] = f"Bearer {WIKIDATA_API_TOKEN}"
    _log("✓ Wikidata MCP Server authenticated with OAuth2 token")
    _log(" Read operations (REST API): 5,000 requests/hour")
    _log(" Write operations (Action API): OAuth2 authenticated")
    _log(" Search (Action API): 500 requests/hour")
else:
    _log("⚠ Wikidata MCP Server running WITHOUT OAuth2 token")
    _log(" All endpoints limited to 500 requests/hour")
    _log(" Write operations DISABLED (requires authentication)")
    _log(" Set WIKIDATA_API_TOKEN environment variable to increase rate limits")

_log(f" User-Agent: {USER_AGENT}")
_log(" Strategy: Hybrid (Action API for search/writes, REST API for data retrieval)")
_log(" Available tools: search, read, create, edit, add_claim")


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


async def _fetch_item(entity_id: str) -> Dict[str, Any]:
    """Fetch an item document from the Wikibase REST API.

    Sends REST_API_HEADERS (which carry the Bearer token when one is
    configured). If the server answers 403 with the OAuth "invalid user"
    error key, the request is repeated once without the Authorization header.

    Args:
        entity_id: Item ID interpolated into the ``/entities/items/`` path.

    Returns:
        The decoded JSON body of the response.

    Raises:
        httpx.HTTPStatusError: If the final response has an error status.
    """
    url = f"{WIKIDATA_REST_API}/entities/items/{entity_id}"
    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=REST_API_HEADERS)

        if response.status_code == 403:
            # A 403 body is not guaranteed to be JSON; if it is not, fall
            # through so raise_for_status() reports the real HTTP error.
            try:
                error_body = response.json()
            except ValueError:
                error_body = None
            if (
                isinstance(error_body, dict)
                and error_body.get("errorKey") == _OAUTH_INVALID_USER_ERROR
            ):
                anonymous_headers = {
                    key: value
                    for key, value in REST_API_HEADERS.items()
                    if key != "Authorization"
                }
                response = await client.get(url, headers=anonymous_headers)

        response.raise_for_status()
        return response.json()


def _language_terms(values: Dict[str, str]) -> Dict[str, Dict[str, str]]:
    """Convert ``{lang: text}`` into the wbeditentity label/description shape."""
    return {lang: {"language": lang, "value": text} for lang, text in values.items()}


def _language_aliases(
    aliases: Dict[str, List[str]],
) -> Dict[str, List[Dict[str, str]]]:
    """Convert ``{lang: [alias, ...]}`` into the wbeditentity alias shape."""
    return {
        lang: [{"language": lang, "value": alias} for alias in alias_list]
        for lang, alias_list in aliases.items()
    }


def _build_claim_value(value: str, value_type: str) -> str:
    """Serialize a claim value as the JSON string expected by wbcreateclaim.

    Args:
        value: Raw value as supplied by the caller.
        value_type: One of "item", "string", "url" or "time".

    Returns:
        JSON-encoded value.

    Raises:
        ValueError: If value_type is unsupported, or value_type is "item" and
            value is not an item ID.
    """
    if value_type == "item":
        match = _ITEM_ID_RE.fullmatch(value.strip())
        if match is None:
            raise ValueError(
                f"Invalid item ID: {value!r}. Expected an ID such as 'Q33506'."
            )
        return json.dumps({"entity-type": "item", "numeric-id": int(match.group(1))})

    if value_type in ("string", "url"):
        return json.dumps(value)

    if value_type == "time":
        # Time values require special structure
        return json.dumps(
            {
                "time": value,
                "timezone": 0,
                "before": 0,
                "after": 0,
                "precision": 11,  # Day precision
                "calendarmodel": "http://www.wikidata.org/entity/Q1985727",
            }
        )

    raise ValueError(
        f"Unsupported value_type: {value_type}. Use 'item', 'string', 'url', or 'time'."
    )


async def _get_csrf_token() -> str:
    """
    Get a CSRF token for authenticated write operations.

    Uses Action API with OAuth2 Bearer token.

    Returns:
        str: CSRF token for use in Action API write calls

    Raises:
        ValueError: If no authentication token is available, the API reports
            an error, or an anonymous token is returned
        httpx.HTTPError: If token retrieval fails
    """
    if not WIKIDATA_API_TOKEN:
        raise ValueError(
            "CSRF token requires authentication. Set WIKIDATA_API_TOKEN environment variable."
        )

    # Action API requires OAuth token in Authorization header
    auth_headers = {
        "Authorization": f"Bearer {WIKIDATA_API_TOKEN}",
        "User-Agent": USER_AGENT,
    }
    params = {
        "action": "query",
        "meta": "tokens",
        "type": "csrf",
        "format": "json",
    }

    async with httpx.AsyncClient() as client:
        response = await client.get(
            WIKIDATA_ACTION_API, headers=auth_headers, params=params
        )
        response.raise_for_status()
        data = response.json()

    # Check for error responses
    if "error" in data:
        error_info = data["error"]
        raise ValueError(
            f"CSRF token error: {error_info.get('code')} - {error_info.get('info')}"
        )

    token = data["query"]["tokens"]["csrftoken"]
    # "+\" is the token handed to anonymous sessions, i.e. the Bearer token
    # was not accepted.
    if token == "+\\":
        raise ValueError("Received anonymous CSRF token. Check OAuth2 authentication.")
    return token


async def _post_action(payload: Dict[str, str], operation: str) -> Dict[str, Any]:
    """POST a write request to the Action API and return the decoded result.

    Args:
        payload: Form fields for the request (including the CSRF token).
        operation: Human-readable operation name used as the prefix of the
            error message (e.g. "Create entity").

    Returns:
        The decoded JSON body of the response.

    Raises:
        httpx.HTTPStatusError: If the response has an error status.
        ValueError: If the JSON body contains an "error" member.
    """
    auth_headers = {
        "Authorization": f"Bearer {WIKIDATA_API_TOKEN}",
        "User-Agent": USER_AGENT,
        "Content-Type": "application/x-www-form-urlencoded",
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.post(
            WIKIDATA_ACTION_API, headers=auth_headers, data=payload
        )
        response.raise_for_status()
        result = response.json()

    # The Action API reports failures inside a 200 response body.
    if "error" in result:
        error_info = result["error"]
        raise ValueError(
            f"{operation} error: {error_info.get('code')} - {error_info.get('info')}"
        )
    return result


# ---------------------------------------------------------------------------
# Search
# ---------------------------------------------------------------------------


async def search_wikidata(query: str, is_entity: bool = True) -> str:
    """
    Search for a Wikidata item or property ID by its query.

    Uses Action API (no authentication - 500 req/hour limit).

    Args:
        query: Search query string
        is_entity: True for entities (items), False for properties

    Returns:
        Wikidata ID (e.g., Q12345 or P123), or a "no results" message when
        the search returns nothing
    """
    params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "srnamespace": 0 if is_entity else 120,
        "srlimit": 1,
        "srqiprofile": "classic_noboostlinks" if is_entity else "classic",
        "srwhat": "text",
        "format": "json",
    }
    async with httpx.AsyncClient() as client:
        response = await client.get(
            WIKIDATA_ACTION_API, headers=ACTION_API_HEADERS, params=params
        )
        response.raise_for_status()

    try:
        title = response.json()["query"]["search"][0]["title"]
    except (KeyError, IndexError):
        return "No results found. Consider changing the search term."

    # For properties, title format is "Property:P123", for entities it's just "Q123"
    if ":" in title:
        title = title.split(":")[-1]
    return title


@server.tool()
async def search_entity(query: str) -> str:
    """
    Search for a Wikidata entity ID by its query.

    Args:
        query (str): The query to search for. The query should be unambiguous
            enough to uniquely identify the entity.

    Returns:
        str: The Wikidata entity ID corresponding to the given query (e.g., Q12345).

    Example:
        search_entity("Rijksmuseum") -> "Q190804"
    """
    return await search_wikidata(query, is_entity=True)


@server.tool()
async def search_property(query: str) -> str:
    """
    Search for a Wikidata property ID by its query.

    Args:
        query (str): The query to search for. The query should be unambiguous
            enough to uniquely identify the property.

    Returns:
        str: The Wikidata property ID corresponding to the given query (e.g., P123).

    Example:
        search_property("ISIL code") -> "P791"
    """
    return await search_wikidata(query, is_entity=False)


# ---------------------------------------------------------------------------
# Read operations
# ---------------------------------------------------------------------------


@server.tool()
async def get_properties(entity_id: str) -> List[str]:
    """
    Get the properties associated with a given Wikidata entity ID.

    Uses Wikibase REST API (works with or without authentication).

    Args:
        entity_id (str): The entity ID to retrieve properties for. This should
            be a valid Wikidata entity ID.

    Returns:
        list: A list of property IDs associated with the given entity ID.
            If no properties are found, an empty list is returned.

    Example:
        get_properties("Q190804") -> ["P31", "P17", "P131", ...]
    """
    data = await _fetch_item(entity_id)
    # Statements are keyed by property ID.
    return list(data.get("statements", {}))


@server.tool()
async def execute_sparql(sparql_query: str) -> str:
    """
    Execute a SPARQL query on Wikidata.

    You may assume the following prefixes:
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX p: <http://www.wikidata.org/prop/>
        PREFIX ps: <http://www.wikidata.org/prop/statement/>

    Args:
        sparql_query (str): The SPARQL query to execute.

    Returns:
        str: The JSON-formatted result of the SPARQL query execution. For
            SELECT queries this is the list of result bindings (an empty JSON
            array if there are no results); for ASK queries it is an object
            of the form {"boolean": true|false}.

    Example:
        execute_sparql('SELECT ?item WHERE { ?item wdt:P791 "NL-AmRMA" }') -> JSON results

    Note:
        SPARQL endpoint does NOT use API token authentication.
        Rate limits: https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/User_Manual#Query_limits
    """
    # Note: SPARQL endpoint uses custom User-Agent for rate limiting
    sparql_headers = {"User-Agent": USER_AGENT}

    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.get(
            SPARQL_URL,
            headers=sparql_headers,
            params={"query": sparql_query, "format": "json"},
        )
        response.raise_for_status()
        data = response.json()

    # ASK queries carry a top-level "boolean" member and no "results".
    if "boolean" in data:
        result: Any = {"boolean": data["boolean"]}
    else:
        result = data.get("results", {}).get("bindings", [])
    return json.dumps(result, indent=2)


@server.tool()
async def get_metadata(entity_id: str, language: str = "en") -> Dict[str, str]:
    """
    Retrieve the label and description for a given Wikidata entity ID.

    Uses Wikibase REST API (works with or without authentication).

    Args:
        entity_id (str): The entity ID to retrieve metadata for.
        language (str): The language code for the label and description
            (default is "en"). Use ISO 639-1 codes.

    Returns:
        dict: A dictionary with keys "Label" and "Description". A placeholder
            string is used for whichever is missing in the requested language.

    Example:
        get_metadata("Q190804", "en") -> {"Label": "Rijksmuseum", "Description": "museum in Amsterdam, Netherlands"}
        get_metadata("Q190804", "nl") -> {"Label": "Rijksmuseum", "Description": "Nederlands nationaal museum in Amsterdam"}
    """
    data = await _fetch_item(entity_id)
    label = data.get("labels", {}).get(language, "No label found")
    description = data.get("descriptions", {}).get(language, "No description found")
    return {"Label": label, "Description": description}


@server.tool()
async def get_identifiers(entity_id: str) -> Dict[str, Optional[str]]:
    """
    Get external identifiers for a Wikidata entity (ISIL, VIAF, etc.).

    Uses Wikibase REST API (works with or without authentication).

    Args:
        entity_id (str): The Wikidata entity ID (e.g., Q190804)

    Returns:
        dict: Dictionary of identifier types and their values. Every known
            identifier type is present; the value is None when the entity has
            no statement (or no concrete value) for it. Only the first
            statement of each property is reported.

    Example:
        get_identifiers("Q190804") -> {
            "ISIL": "NL-AmRMA",
            "VIAF": "148691498",
            "official_website": "https://www.rijksmuseum.nl"
        }
    """
    data = await _fetch_item(entity_id)
    statements = data.get("statements", {})

    identifiers: Dict[str, Optional[str]] = {}
    for prop_id, name in _IDENTIFIER_PROPS.items():
        prop_statements = statements.get(prop_id) or []
        if prop_statements:
            # Get the first value from the statement
            identifiers[name] = prop_statements[0].get("value", {}).get("content")
        else:
            identifiers[name] = None
    return identifiers


# ---------------------------------------------------------------------------
# Write operations
# ---------------------------------------------------------------------------


@server.tool()
async def create_entity(
    labels: Dict[str, str],
    descriptions: Optional[Dict[str, str]] = None,
    aliases: Optional[Dict[str, List[str]]] = None,
) -> str:
    """
    Create a new Wikidata item with labels, descriptions, and aliases.

    Requires OAuth2 authentication.

    Args:
        labels (dict): Labels in different languages,
            e.g., {"en": "Example Item", "nl": "Voorbeeld item"}
        descriptions (dict): Optional descriptions,
            e.g., {"en": "An example item", "nl": "Een voorbeeld item"}
        aliases (dict): Optional aliases, e.g., {"en": ["Alias 1", "Alias 2"]}

    Returns:
        str: Success message containing the entity ID of the newly created item

    Example:
        create_entity(
            labels={"en": "Amsterdam Museum", "nl": "Amsterdam Museum"},
            descriptions={"en": "municipal museum in Amsterdam", "nl": "gemeentelijk museum in Amsterdam"}
        ) -> "Successfully created entity Q98765432"

    Note:
        - Requires WIKIDATA_API_TOKEN environment variable
        - Uses Action API wbeditentity endpoint
        - Rate limited by OAuth2 quota (5,000 req/hr for authenticated users)
    """
    # Get CSRF token
    csrf_token = await _get_csrf_token()

    # Build entity data structure
    entity_data: Dict[str, Any] = {"labels": _language_terms(labels)}
    if descriptions:
        entity_data["descriptions"] = _language_terms(descriptions)
    if aliases:
        entity_data["aliases"] = _language_aliases(aliases)

    payload = {
        "action": "wbeditentity",
        "new": "item",
        "data": json.dumps(entity_data),
        "token": csrf_token,
        "format": "json",
    }
    result = await _post_action(payload, "Create entity")

    entity_id = result["entity"]["id"]
    return f"Successfully created entity {entity_id}"


@server.tool()
async def edit_entity(
    entity_id: str,
    labels: Optional[Dict[str, str]] = None,
    descriptions: Optional[Dict[str, str]] = None,
    aliases: Optional[Dict[str, List[str]]] = None,
) -> str:
    """
    Edit an existing Wikidata entity's labels, descriptions, or aliases.

    Requires OAuth2 authentication.

    Args:
        entity_id (str): The entity ID to edit (e.g., "Q12345")
        labels (dict): Optional labels to add/update, e.g., {"en": "New Label"}
        descriptions (dict): Optional descriptions to add/update,
            e.g., {"en": "New description"}
        aliases (dict): Optional aliases to add, e.g., {"en": ["Alias 1", "Alias 2"]}

    Returns:
        str: Success message with entity ID, or a notice when no changes
            were requested

    Example:
        edit_entity(
            entity_id="Q98765",
            labels={"en": "Updated Label"},
            descriptions={"en": "Updated description"}
        ) -> "Successfully edited entity Q98765"

    Note:
        - Requires WIKIDATA_API_TOKEN environment variable
        - Uses Action API wbeditentity endpoint
        - Existing values are updated; new languages are added
    """
    # Build entity data structure (only include provided fields)
    entity_data: Dict[str, Any] = {}
    if labels:
        entity_data["labels"] = _language_terms(labels)
    if descriptions:
        entity_data["descriptions"] = _language_terms(descriptions)
    if aliases:
        entity_data["aliases"] = _language_aliases(aliases)

    # Checked before requesting a CSRF token so that a no-op call costs no
    # network round trip.
    if not entity_data:
        return "No changes requested. Provide at least one of: labels, descriptions, or aliases."

    # Get CSRF token
    csrf_token = await _get_csrf_token()

    payload = {
        "action": "wbeditentity",
        "id": entity_id,
        "data": json.dumps(entity_data),
        "token": csrf_token,
        "format": "json",
    }
    await _post_action(payload, "Edit entity")

    return f"Successfully edited entity {entity_id}"


@server.tool()
async def add_claim(
    entity_id: str,
    property_id: str,
    value: str,
    value_type: str = "string",
) -> str:
    """
    Add a claim (statement) to a Wikidata entity.

    Requires OAuth2 authentication.

    Args:
        entity_id (str): The entity ID to add the claim to (e.g., "Q12345")
        property_id (str): The property ID for the claim (e.g., "P31" for "instance of")
        value (str): The value for the claim:
            - For "item": another entity ID (e.g., "Q33506")
            - For "string": a text string
            - For "url": a URL string
            - For "time": ISO 8601 format (e.g., "+2023-01-15T00:00:00Z");
              always submitted with day precision
        value_type (str): Type of value - "item", "string", "url", or "time"
            (default: "string")

    Returns:
        str: Success message with claim ID

    Raises:
        ValueError: If value_type is unsupported, an "item" value is not an
            item ID, authentication is missing, or the API reports an error

    Example:
        # Add "instance of: museum" (P31: Q33506)
        add_claim(entity_id="Q98765", property_id="P31", value="Q33506", value_type="item")

        # Add ISIL code
        add_claim(entity_id="Q98765", property_id="P791", value="NL-ABC", value_type="string")

        # Add official website
        add_claim(entity_id="Q98765", property_id="P856", value="https://example.org", value_type="url")

    Note:
        - Requires WIKIDATA_API_TOKEN environment variable
        - Uses Action API wbcreateclaim endpoint
        - Does not check for duplicate claims
    """
    # Build claim value based on type. Done first so invalid input is
    # rejected before any network request is made.
    claim_value = _build_claim_value(value, value_type)

    # Get CSRF token
    csrf_token = await _get_csrf_token()

    payload = {
        "action": "wbcreateclaim",
        "entity": entity_id,
        "property": property_id,
        "snaktype": "value",
        "value": claim_value,
        "token": csrf_token,
        "format": "json",
    }
    result = await _post_action(payload, "Add claim")

    claim_id = result["claim"]["id"]
    return f"Successfully added claim {claim_id} to entity {entity_id}"


if __name__ == "__main__":
    server.run()