From 798e8763d0674d978e4eec41771e2bbf0da204d8 Mon Sep 17 00:00:00 2001
From: Danny Avila
Date: Sun, 24 Mar 2024 23:43:00 -0400
Subject: [PATCH] =?UTF-8?q?=F0=9F=91=93=20feat:=20Vision=20Support=20for?=
 =?UTF-8?q?=20Assistants=20(#2195)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* refactor(assistants/chat): use promises to speed up initialization, initialize shared variables, include `attachedFileIds` to streamRunManager

* chore: additional typedefs

* fix(OpenAIClient): handle edge case where attachments promise is resolved

* feat: createVisionPrompt

* feat: Vision Support for Assistants
---
 api/app/clients/OpenAIClient.js               |   6 +-
 api/app/clients/prompts/createVisionPrompt.js |  34 +++
 api/app/clients/prompts/index.js              |   2 +
 api/server/routes/assistants/chat.js          | 243 ++++++++++++------
 api/server/services/Runs/StreamRunManager.js  |   4 +
 api/server/services/Threads/manage.js         |  13 +-
 api/server/services/ToolService.js            |  48 +++-
 api/server/utils/handleText.js                |   1 +
 api/typedefs.js                               |  12 +
 client/src/common/assistants-types.ts         |   6 +-
 .../components/Chat/Messages/Content/Part.tsx |  26 +-
 .../SidePanel/Builder/AssistantPanel.tsx      |  43 +++-
 .../SidePanel/Builder/AssistantSelect.tsx     |  15 +-
 client/src/localization/languages/Eng.tsx     |   1 +
 packages/data-provider/src/config.ts          |   2 +
 packages/data-provider/src/schemas.ts         |  20 +-
 16 files changed, 376 insertions(+), 100 deletions(-)
 create mode 100644 api/app/clients/prompts/createVisionPrompt.js

diff --git a/api/app/clients/OpenAIClient.js b/api/app/clients/OpenAIClient.js
index 183f7999c..ef6868254 100644
--- a/api/app/clients/OpenAIClient.js
+++ b/api/app/clients/OpenAIClient.js
@@ -92,7 +92,11 @@ class OpenAIClient extends BaseClient {
     }

     this.defaultVisionModel = this.options.visionModel ?? 'gpt-4-vision-preview';
-    this.options.attachments?.then((attachments) => this.checkVisionRequest(attachments));
+    if (typeof this.options.attachments?.then === 'function') {
+      this.options.attachments.then((attachments) => this.checkVisionRequest(attachments));
+    } else {
+      this.checkVisionRequest(this.options.attachments);
+    }

     const { OPENROUTER_API_KEY, OPENAI_FORCE_PROMPT } = process.env ?? {};
     if (OPENROUTER_API_KEY && !this.azure) {
diff --git a/api/app/clients/prompts/createVisionPrompt.js b/api/app/clients/prompts/createVisionPrompt.js
new file mode 100644
index 000000000..5d8a7bbf5
--- /dev/null
+++ b/api/app/clients/prompts/createVisionPrompt.js
@@ -0,0 +1,34 @@
+/**
+ * Generates a prompt instructing the user to describe an image in detail, tailored to different types of visual content.
+ * @param {boolean} pluralized - Whether to pluralize the prompt for multiple images.
+ * @returns {string} - The generated vision prompt.
+ */
+const createVisionPrompt = (pluralized = false) => {
+  return `Please describe the image${
+    pluralized ? 's' : ''
+  } in detail, covering relevant aspects such as:
+
+  For photographs, illustrations, or artwork:
+  - The main subject(s) and their appearance, positioning, and actions
+  - The setting, background, and any notable objects or elements
+  - Colors, lighting, and overall mood or atmosphere
+  - Any interesting details, textures, or patterns
+  - The style, technique, or medium used (if discernible)
+
+  For screenshots or images containing text:
+  - The content and purpose of the text
+  - The layout, formatting, and organization of the information
+  - Any notable visual elements, such as logos, icons, or graphics
+  - The overall context or message conveyed by the screenshot
+
+  For graphs, charts, or data visualizations:
+  - The type of graph or chart (e.g., bar graph, line chart, pie chart)
+  - The variables being compared or analyzed
+  - Any trends, patterns, or outliers in the data
+  - The axis labels, scales, and units of measurement
+  - The title, legend, and any additional context provided
+
+  Be as specific and descriptive as possible while maintaining clarity and concision.`;
+};
+
+module.exports = createVisionPrompt;
diff --git a/api/app/clients/prompts/index.js b/api/app/clients/prompts/index.js
index 9edb9954f..36bb6f7e2 100644
--- a/api/app/clients/prompts/index.js
+++ b/api/app/clients/prompts/index.js
@@ -4,6 +4,7 @@ const handleInputs = require('./handleInputs');
 const instructions = require('./instructions');
 const titlePrompts = require('./titlePrompts');
 const truncateText = require('./truncateText');
+const createVisionPrompt = require('./createVisionPrompt');
 const createContextHandlers = require('./createContextHandlers');

 module.exports = {
@@ -13,5 +14,6 @@
   ...instructions,
   ...titlePrompts,
   truncateText,
+  createVisionPrompt,
   createContextHandlers,
 };
diff --git a/api/server/routes/assistants/chat.js b/api/server/routes/assistants/chat.js
index a0cf57000..9602ffdfa 100644
--- a/api/server/routes/assistants/chat.js
+++ b/api/server/routes/assistants/chat.js
@@ -4,9 +4,11 @@ const {
   Constants,
   RunStatus,
   CacheKeys,
+  FileSources,
   ContentTypes,
   EModelEndpoint,
   ViolationTypes,
+  ImageVisionTool,
   AssistantStreamEvents,
 } = require('librechat-data-provider');
 const {
@@ -17,9 +19,10 @@ const {
   addThreadMetadata,
   saveAssistantMessage,
 } = require('~/server/services/Threads');
+const { sendResponse, sendMessage, sleep, isEnabled, countTokens } = require('~/server/utils');
 const { runAssistant, createOnTextProgress } = require('~/server/services/AssistantService');
 const { addTitle, initializeClient } = require('~/server/services/Endpoints/assistants');
-const { sendResponse, sendMessage, sleep, isEnabled, countTokens } = require('~/server/utils');
+const { formatMessage, createVisionPrompt } = require('~/app/clients/prompts');
 const { createRun, StreamRunManager } = require('~/server/services/Runs');
 const { getTransactions } = require('~/models/Transaction');
 const checkBalance = require('~/models/checkBalance');
@@ -100,6 +103,16 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
   let parentMessageId = _parentId;
   /** @type {TMessage[]} */
   let previousMessages = [];
+  /** @type {import('librechat-data-provider').TConversation | null} */
+  let conversation = null;
+  /** @type {string[]} */
+  let file_ids = [];
+  /** @type {Set} */
+  let attachedFileIds = new Set();
+  /** @type {TMessage | null} */
+  let requestMessage = null;
+  /** @type {undefined | Promise} */
+  let visionPromise;

   const userMessageId = v4();
   const responseMessageId = v4();
@@ -258,7 +271,10 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
       throw new Error('Missing assistant_id');
     }

-    if (isEnabled(process.env.CHECK_BALANCE)) {
+    const checkBalanceBeforeRun = async () => {
+      if (!isEnabled(process.env.CHECK_BALANCE)) {
+        return;
+      }
       const transactions =
         (await getTransactions({
           user: req.user.id,
@@ -288,7 +304,7 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
           amount: promptTokens,
         },
       });
-    }
+    };

     /** @type {{ openai: OpenAIClient }} */
     const { openai: _openai, client } = await initializeClient({
@@ -300,15 +316,11 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res

     openai = _openai;

-    // if (thread_id) {
-    //   previousMessages = await checkMessageGaps({ openai, thread_id, conversationId });
-    // }
-
     if (previousMessages.length) {
       parentMessageId = previousMessages[previousMessages.length - 1].messageId;
     }

-    const userMessage = {
+    let userMessage = {
       role: 'user',
       content: text,
       metadata: {
@@ -316,75 +328,7 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
       },
     };

-    let thread_file_ids = [];
-    if (convoId) {
-      const convo = await getConvo(req.user.id, convoId);
-      if (convo && convo.file_ids) {
-        thread_file_ids = convo.file_ids;
-      }
-    }
-
-    const file_ids = files.map(({ file_id }) => file_id);
-    if (file_ids.length || thread_file_ids.length) {
-      userMessage.file_ids = file_ids;
-      openai.attachedFileIds = new Set([...file_ids, ...thread_file_ids]);
-    }
-
-    // TODO: may allow multiple messages to be created beforehand in a future update
-    const initThreadBody = {
-      messages: [userMessage],
-      metadata: {
-        user: req.user.id,
-        conversationId,
-      },
-    };
-
-    const result = await initThread({ openai, body: initThreadBody, thread_id });
-    thread_id = result.thread_id;
-
-    createOnTextProgress({
-      openai,
-      conversationId,
-      userMessageId,
-      messageId: responseMessageId,
-      thread_id,
-    });
-
-    const requestMessage = {
-      user: req.user.id,
-      text,
-      messageId: userMessageId,
-      parentMessageId,
-      // TODO: make sure client sends correct format for `files`, use zod
-      files,
-      file_ids,
-      conversationId,
-      isCreatedByUser: true,
-      assistant_id,
-      thread_id,
-      model: assistant_id,
-    };
-
-    previousMessages.push(requestMessage);
-
-    await saveUserMessage({ ...requestMessage, model });
-
-    const conversation = {
-      conversationId,
-      // TODO: title feature
-      title: 'New Chat',
-      endpoint: EModelEndpoint.assistants,
-      promptPrefix: promptPrefix,
-      instructions: instructions,
-      assistant_id,
-      // model,
-    };
-
-    if (file_ids.length) {
-      conversation.file_ids = file_ids;
-    }
-
-    /** @type {CreateRunBody} */
+    /** @type {CreateRunBody | undefined} */
     const body = {
       assistant_id,
       model,
@@ -398,6 +342,143 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
       body.instructions = instructions;
     }

+    const getRequestFileIds = async () => {
+      let thread_file_ids = [];
+      if (convoId) {
+        const convo = await getConvo(req.user.id, convoId);
+        if (convo && convo.file_ids) {
+          thread_file_ids = convo.file_ids;
+        }
+      }
+
+      file_ids = files.map(({ file_id }) => file_id);
+      if (file_ids.length || thread_file_ids.length) {
+        userMessage.file_ids = file_ids;
+        attachedFileIds = new Set([...file_ids, ...thread_file_ids]);
+      }
+    };
+
+    const addVisionPrompt = async () => {
+      if (!req.body.endpointOption.attachments) {
+        return;
+      }
+
+      const assistant = await openai.beta.assistants.retrieve(assistant_id);
+      const visionToolIndex = assistant.tools.findIndex(
+        (tool) => tool.function.name === ImageVisionTool.function.name,
+      );
+
+      if (visionToolIndex === -1) {
+        return;
+      }
+
+      const attachments = await req.body.endpointOption.attachments;
+      let visionMessage = {
+        role: 'user',
+        content: '',
+      };
+      const files = await client.addImageURLs(visionMessage, attachments);
+      if (!visionMessage.image_urls?.length) {
+        return;
+      }
+
+      const imageCount = visionMessage.image_urls.length;
+      const plural = imageCount > 1;
+      visionMessage.content = createVisionPrompt(plural);
+      visionMessage = formatMessage({ message: visionMessage, endpoint: EModelEndpoint.openAI });
+
+      visionPromise = openai.chat.completions.create({
+        model: 'gpt-4-vision-preview',
+        messages: [visionMessage],
+        max_tokens: 4000,
+      });
+
+      const pluralized = plural ? 's' : '';
+      body.additional_instructions = `${
+        body.additional_instructions ? `${body.additional_instructions}\n` : ''
+      }The user has uploaded ${imageCount} image${pluralized}.
+      Use the \`${ImageVisionTool.function.name}\` tool to retrieve ${
+  plural ? '' : 'a '
+}detailed text description${pluralized} for ${plural ? 'each' : 'the'} image${pluralized}.`;
+
+      return files;
+    };
+
+    const initializeThread = async () => {
+      /** @type {[ undefined | MongoFile[]]}*/
+      const [processedFiles] = await Promise.all([addVisionPrompt(), getRequestFileIds()]);
+      // TODO: may allow multiple messages to be created beforehand in a future update
+      const initThreadBody = {
+        messages: [userMessage],
+        metadata: {
+          user: req.user.id,
+          conversationId,
+        },
+      };
+
+      if (processedFiles) {
+        for (const file of processedFiles) {
+          if (file.source !== FileSources.openai) {
+            attachedFileIds.delete(file.file_id);
+            const index = file_ids.indexOf(file.file_id);
+            if (index > -1) {
+              file_ids.splice(index, 1);
+            }
+          }
+        }
+
+        userMessage.file_ids = file_ids;
+      }
+
+      const result = await initThread({ openai, body: initThreadBody, thread_id });
+      thread_id = result.thread_id;
+
+      createOnTextProgress({
+        openai,
+        conversationId,
+        userMessageId,
+        messageId: responseMessageId,
+        thread_id,
+      });
+
+      requestMessage = {
+        user: req.user.id,
+        text,
+        messageId: userMessageId,
+        parentMessageId,
+        // TODO: make sure client sends correct format for `files`, use zod
+        files,
+        file_ids,
+        conversationId,
+        isCreatedByUser: true,
+        assistant_id,
+        thread_id,
+        model: assistant_id,
+      };
+
+      previousMessages.push(requestMessage);
+
+      /* asynchronous */
+      saveUserMessage({ ...requestMessage, model });
+
+      conversation = {
+        conversationId,
+        title: 'New Chat',
+        endpoint: EModelEndpoint.assistants,
+        promptPrefix: promptPrefix,
+        instructions: instructions,
+        assistant_id,
+        // model,
+      };
+
+      if (file_ids.length) {
+        conversation.file_ids = file_ids;
+      }
+    };
+
+    const promises = [initializeThread(), checkBalanceBeforeRun()];
+    await Promise.all(promises);
+
     const sendInitialResponse = () => {
       sendMessage(res, {
         sync: true,
@@ -421,6 +502,8 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res

     const processRun = async (retry = false) => {
       if (req.app.locals[EModelEndpoint.azureOpenAI]?.assistants) {
+        openai.attachedFileIds = attachedFileIds;
+        openai.visionPromise = visionPromise;
         if (retry) {
           response = await runAssistant({
             openai,
@@ -463,9 +546,11 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
         req,
         res,
         openai,
-        thread_id,
-        responseMessage: openai.responseMessage,
         handlers,
+        thread_id,
+        visionPromise,
+        attachedFileIds,
+        responseMessage: openai.responseMessage,
         // streamOptions: {

         // },
diff --git a/api/server/services/Runs/StreamRunManager.js b/api/server/services/Runs/StreamRunManager.js
index fe9e8da73..2059de6a5 100644
--- a/api/server/services/Runs/StreamRunManager.js
+++ b/api/server/services/Runs/StreamRunManager.js
@@ -59,6 +59,10 @@ class StreamRunManager {
     this.messages = [];
     /** @type {string} */
     this.text = '';
+    /** @type {Set} */
+    this.attachedFileIds = fields.attachedFileIds;
+    /** @type {undefined | Promise} */
+    this.visionPromise = fields.visionPromise;

     /**
      * @type {Object. Promise>}
diff --git a/api/server/services/Threads/manage.js b/api/server/services/Threads/manage.js
index 12386b60a..18dbea7fe 100644
--- a/api/server/services/Threads/manage.js
+++ b/api/server/services/Threads/manage.js
@@ -468,21 +468,28 @@ async function checkMessageGaps({ openai, latestMessageId, thread_id, run_id, co

 /**
  * Records token usage for a given completion request.
- *
  * @param {Object} params - The parameters for initializing a thread.
  * @param {number} params.prompt_tokens - The number of prompt tokens used.
  * @param {number} params.completion_tokens - The number of completion tokens used.
  * @param {string} params.model - The model used by the assistant run.
  * @param {string} params.user - The user's ID.
  * @param {string} params.conversationId - LibreChat conversation ID.
+ * @param {string} [params.context='message'] - The context of the usage. Defaults to 'message'.
  * @return {Promise} A promise that resolves to the updated messages
  */
-const recordUsage = async ({ prompt_tokens, completion_tokens, model, user, conversationId }) => {
+const recordUsage = async ({
+  prompt_tokens,
+  completion_tokens,
+  model,
+  user,
+  conversationId,
+  context = 'message',
+}) => {
   await spendTokens(
     {
       user,
       model,
-      context: 'message',
+      context,
       conversationId,
     },
     { promptTokens: prompt_tokens, completionTokens: completion_tokens },
diff --git a/api/server/services/ToolService.js b/api/server/services/ToolService.js
index 369ae37f7..923e9a7ab 100644
--- a/api/server/services/ToolService.js
+++ b/api/server/services/ToolService.js
@@ -4,14 +4,17 @@ const { StructuredTool } = require('langchain/tools');
 const { zodToJsonSchema } = require('zod-to-json-schema');
 const { Calculator } = require('langchain/tools/calculator');
 const {
+  Tools,
   ContentTypes,
   imageGenTools,
+  actionDelimiter,
+  ImageVisionTool,
   openapiToFunction,
   validateAndParseOpenAPISpec,
-  actionDelimiter,
 } = require('librechat-data-provider');
 const { loadActionSets, createActionTool, domainParser } = require('./ActionService');
 const { processFileURL } = require('~/server/services/Files/process');
+const { recordUsage } = require('~/server/services/Threads');
 const { loadTools } = require('~/app/clients/tools/util');
 const { redactMessage } = require('~/config/parsers');
 const { sleep } = require('~/server/utils');
@@ -83,6 +86,8 @@ function loadAndFormatTools({ directory, filter = new Set() }) {
     tools.push(formattedTool);
   }

+  tools.push(ImageVisionTool);
+
   return tools.reduce((map, tool) => {
     map[tool.function.name] = tool;
     return map;
@@ -100,8 +105,8 @@ function loadAndFormatTools({ directory, filter = new Set() }) {
  */
 function formatToOpenAIAssistantTool(tool) {
   return {
-    type: 'function',
-    function: {
+    type: Tools.function,
+    [Tools.function]: {
       name: tool.name,
       description: tool.description,
       parameters: zodToJsonSchema(tool.schema),
     },
   };
 }

+/**
+ * Processes the required actions by calling the appropriate tools and returning the outputs.
+ * @param {OpenAIClient} client - OpenAI or StreamRunManager Client.
+ * @param {RequiredAction} requiredActions - The current required action.
+ * @returns {Promise} The outputs of the tools.
+ */
+const processVisionRequest = async (client, currentAction) => {
+  if (!client.visionPromise) {
+    return {
+      tool_call_id: currentAction.toolCallId,
+      output: 'No image details found.',
+    };
+  }
+
+  /** @type {ChatCompletion | undefined} */
+  const completion = await client.visionPromise;
+  if (completion.usage) {
+    recordUsage({
+      user: client.req.user.id,
+      model: client.req.body.model,
+      conversationId: (client.responseMessage ?? client.finalMessage).conversationId,
+      ...completion.usage,
+    });
+  }
+
+  const output = completion?.choices?.[0]?.message?.content ?? 'No image details found.';
+  return {
+    tool_call_id: currentAction.toolCallId,
+    output,
+  };
+};
+
 /**
  * Processes return required actions from run.
- *
  * @param {OpenAIClient} client - OpenAI or StreamRunManager Client.
  * @param {RequiredAction[]} requiredActions - The required actions to submit outputs for.
  * @returns {Promise} The outputs of the tools.
- *
  */
 async function processRequiredActions(client, requiredActions) {
   logger.debug(
@@ -152,6 +186,10 @@ async function processRequiredActions(client, requiredActions) {

   for (let i = 0; i < requiredActions.length; i++) {
     const currentAction = requiredActions[i];
+    if (currentAction.tool === ImageVisionTool.function.name) {
+      promises.push(processVisionRequest(client, currentAction));
+      continue;
+    }
     let tool = ToolMap[currentAction.tool] ?? ActionToolMap[currentAction.tool];

     const handleToolOutput = async (output) => {
diff --git a/api/server/utils/handleText.js b/api/server/utils/handleText.js
index 973218e4f..bfa37e279 100644
--- a/api/server/utils/handleText.js
+++ b/api/server/utils/handleText.js
@@ -172,6 +172,7 @@ function generateConfig(key, baseURL, assistants = false) {
     config.retrievalModels = defaultRetrievalModels;
     config.capabilities = [
       Capabilities.code_interpreter,
+      Capabilities.image_vision,
       Capabilities.retrieval,
       Capabilities.actions,
       Capabilities.tools,
diff --git a/api/typedefs.js b/api/typedefs.js
index 36b341d92..5b358c787 100644
--- a/api/typedefs.js
+++ b/api/typedefs.js
@@ -32,6 +32,18 @@
  * @memberof typedefs
  */

+/**
+ * @exports ChatCompletionContentPartImage
+ * @typedef {import('openai').OpenAI.ChatCompletionContentPartImage} ChatCompletionContentPartImage
+ * @memberof typedefs
+ */
+
+/**
+ * @exports ChatCompletion
+ * @typedef {import('openai').OpenAI.ChatCompletion} ChatCompletion
+ * @memberof typedefs
+ */
+
 /**
  * @exports OpenAIRequestOptions
  * @typedef {import('openai').OpenAI.RequestOptions} OpenAIRequestOptions
diff --git a/client/src/common/assistants-types.ts b/client/src/common/assistants-types.ts
index c748de0de..3b9ad27da 100644
--- a/client/src/common/assistants-types.ts
+++ b/client/src/common/assistants-types.ts
@@ -1,3 +1,4 @@
+import { Capabilities } from 'librechat-data-provider';
 import type { Assistant } from 'librechat-data-provider';
 import type { Option, ExtendedFile } from './types';

@@ -6,8 +7,9 @@ export type TAssistantOption =
   | (Option & Assistant & { files?: Array<[string, ExtendedFile]> });

 export type Actions = {
-  code_interpreter: boolean;
-  retrieval: boolean;
+  [Capabilities.code_interpreter]: boolean;
+  [Capabilities.image_vision]: boolean;
+  [Capabilities.retrieval]: boolean;
 };

 export type AssistantForm = {
diff --git a/client/src/components/Chat/Messages/Content/Part.tsx b/client/src/components/Chat/Messages/Content/Part.tsx
index b96d0bcf8..e52ccfd08 100644
--- a/client/src/components/Chat/Messages/Content/Part.tsx
+++ b/client/src/components/Chat/Messages/Content/Part.tsx
@@ -1,4 +1,9 @@
-import { ToolCallTypes, ContentTypes, imageGenTools } from 'librechat-data-provider';
+import {
+  ToolCallTypes,
+  ContentTypes,
+  imageGenTools,
+  isImageVisionTool,
+} from 'librechat-data-provider';
 import type { TMessageContentParts, TMessage } from 'librechat-data-provider';
 import type { TDisplayProps } from '~/common';
 import { ErrorMessage } from './MessageContent';
@@ -96,6 +101,25 @@ export default function Part({
     part[ContentTypes.TOOL_CALL].type === ToolCallTypes.FUNCTION
   ) {
     const toolCall = part[ContentTypes.TOOL_CALL];
+    if (isImageVisionTool(toolCall)) {
+      if (isSubmitting && showCursor) {
+        return (
+          
+            
+          
+        );
+      }
+
+      return null;
+    }
+
     return (
      assistants?.capabilities?.includes(Capabilities.code_interpreter),
     [assistants],
   );
+  const imageVisionEnabled = useMemo(
+    () => assistants?.capabilities?.includes(Capabilities.image_vision),
+    [assistants],
+  );

   useEffect(() => {
     if (model && !retrievalModels.has(model)) {
@@ -157,6 +162,9 @@ export default function AssistantPanel({
     if (data.retrieval) {
       tools.push({ type: Tools.retrieval });
     }
+    if (data.image_vision) {
+      tools.push(ImageVisionTool);
+    }

     const {
       name,
@@ -374,6 +382,37 @@ export default function AssistantPanel({
         )}
+        {imageVisionEnabled && (
+          
+            (
+              
+            )}
+          />
+          
+        
+        )}
         {retrievalEnabled && (
-        {functions.map((func) => (
+        {functions.map((func, i) => (
            tool.type !== 'function')
-      ?.map((tool) => tool.type)
+      ?.filter((tool) => tool.type !== 'function' || isImageVisionTool(tool))
+      ?.map((tool) => tool?.function?.name || tool.type)
       .forEach((tool) => {
         actions[tool] = true;
       });

     const functions = assistant?.tools
-        ?.filter((tool) => tool.type === 'function')
+        ?.filter((tool) => tool.type === 'function' && !isImageVisionTool(tool))
         ?.map((tool) => tool.function?.name ?? '') ?? [];

     const formValues: Partial = {
diff --git a/client/src/localization/languages/Eng.tsx b/client/src/localization/languages/Eng.tsx
index 07b0e92fa..743d115ad 100644
--- a/client/src/localization/languages/Eng.tsx
+++ b/client/src/localization/languages/Eng.tsx
@@ -16,6 +16,7 @@ export default {
     'If you upload files under Knowledge, conversations with your Assistant may include file contents.',
   com_assistants_knowledge_disabled:
     'Assistant must be created, and Code Interpreter or Retrieval must be enabled and saved before uploading files as Knowledge.',
+  com_assistants_image_vision: 'Image Vision',
   com_assistants_code_interpreter: 'Code Interpreter',
   com_assistants_code_interpreter_files:
     'The following files are only available for Code Interpreter:',
diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts
index f6ce32f1f..393f2bb0e 100644
--- a/packages/data-provider/src/config.ts
+++ b/packages/data-provider/src/config.ts
@@ -82,6 +82,7 @@ export type TValidatedAzureConfig = {

 export enum Capabilities {
   code_interpreter = 'code_interpreter',
+  image_vision = 'image_vision',
   retrieval = 'retrieval',
   actions = 'actions',
   tools = 'tools',
@@ -100,6 +101,7 @@ export const assistantEndpointSchema = z.object({
     .optional()
     .default([
       Capabilities.code_interpreter,
+      Capabilities.image_vision,
       Capabilities.retrieval,
       Capabilities.actions,
       Capabilities.tools,
diff --git a/packages/data-provider/src/schemas.ts b/packages/data-provider/src/schemas.ts
index 266aa9e98..f5f0c1853 100644
--- a/packages/data-provider/src/schemas.ts
+++ b/packages/data-provider/src/schemas.ts
@@ -1,5 +1,6 @@
 import { z } from 'zod';
-import type { TMessageContentParts } from './types/assistants';
+import { Tools } from './types/assistants';
+import type { TMessageContentParts, FunctionTool, FunctionToolCall } from './types/assistants';
 import type { TFile } from './types/files';

 export const isUUID = z.string().uuid();
@@ -25,9 +26,26 @@ export const defaultAssistantFormValues = {
   model: '',
   functions: [],
   code_interpreter: false,
+  image_vision: false,
   retrieval: false,
 };

+export const ImageVisionTool: FunctionTool = {
+  type: Tools.function,
+  [Tools.function]: {
+    name: 'image_vision',
+    description: 'Get detailed text descriptions for all current image attachments.',
+    parameters: {
+      type: 'object',
+      properties: {},
+      required: [],
+    },
+  },
+};
+
+export const isImageVisionTool = (tool: FunctionTool | FunctionToolCall) =>
+  tool.type === 'function' && tool.function?.name === ImageVisionTool?.function?.name;
+
 export const endpointSettings = {
   [EModelEndpoint.google]: {
     model: {