feat: add STT support for Upload as Text

2025-08-04 22:34:14 -07:00
parent 23945b3434
commit da611a634a
3 changed files with 105 additions and 3 deletions
--- a/api/server/services/Files/Audio/STTService.js
+++ b/api/server/services/Files/Audio/STTService.js
@@ -325,4 +325,4 @@ async function speechToText(req, res) {
  await sttService.processTextToSpeech(req, res);
 }

-module.exports = { speechToText };
+module.exports = { speechToText, STTService };
--- a/api/server/services/Files/process.js
+++ b/api/server/services/Files/process.js
@@ -34,6 +34,7 @@ const { checkCapability } = require('~/server/services/Config');
 const { LB_QueueAsyncCall } = require('~/server/utils/queue');
 const { getStrategyFunctions } = require('./strategies');
 const { determineFileType } = require('~/server/utils');
+const { STTService } = require('./Audio/STTService');
 const { logger } = require('~/config');
 const FormData = require('form-data');
 const axios = require('axios');
@@ -501,6 +502,35 @@ const processFileUpload = async ({ req, res, metadata }) => {
  }

  const { file } = req;
+
+  const fileConfig = mergeFileConfig(req.app.locals.fileConfig);
+  const shouldUseSTT = fileConfig.checkType(
+    file.mimetype,
+    fileConfig.stt?.supportedMimeTypes || [],
+  );
+
+  if (shouldUseSTT) {
+    const { text, bytes } = await processAudioFile({ file });
+
+    const result = await createFile(
+      {
+        user: req.user.id,
+        file_id,
+        temp_file_id,
+        bytes,
+        filepath: file.path,
+        filename: file.originalname,
+        context: FileContext.message_attachment,
+        type: 'text/plain',
+        source: FileSources.text,
+        text,
+      },
+      true,
+    );
+    return res
+      .status(200)
+      .json({ message: 'Audio file processed and converted to text successfully', ...result });
+  }
  const {
    id,
    bytes,
@@ -605,6 +635,10 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
    file.mimetype,
    fileConfig.ocr?.supportedMimeTypes || [],
  );
+  const shouldUseSTT = fileConfig.checkType(
+    file.mimetype,
+    fileConfig.stt?.supportedMimeTypes || [],
+  );

  let fileInfoMetadata;
  const entity_id = messageAttachment === true ? undefined : agent_id;
@@ -664,6 +698,35 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
        context: messageAttachment ? FileContext.message_attachment : FileContext.agents,
      });

+      if (!messageAttachment && tool_resource) {
+        await addAgentResourceFile({
+          req,
+          file_id,
+          agent_id,
+          tool_resource,
+        });
+      }
+      const result = await createFile(fileInfo, true);
+      return res
+        .status(200)
+        .json({ message: 'Agent file uploaded and processed successfully', ...result });
+    } else if (shouldUseSTT) {
+      const { text, bytes } = await processAudioFile({ file });
+
+      const fileInfo = removeNullishValues({
+        text,
+        bytes,
+        file_id,
+        temp_file_id,
+        user: req.user.id,
+        type: 'text/plain',
+        filepath: file.path,
+        source: FileSources.text,
+        filename: file.originalname,
+        model: messageAttachment ? undefined : req.body.model,
+        context: messageAttachment ? FileContext.message_attachment : FileContext.agents,
+      });
+
      if (!messageAttachment && tool_resource) {
        await addAgentResourceFile({
          req,
@@ -685,7 +748,7 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
        file_id,
        temp_file_id,
        user: req.user.id,
-        type: file.mimetype,
+        type: file.mimetype.startsWith('audio/') ? 'text/plain' : file.mimetype,
        filepath: file.path,
        source: FileSources.text,
        filename: file.originalname,
@@ -706,7 +769,7 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
        .status(200)
        .json({ message: 'Agent file uploaded and processed successfully', ...result });
    } else {
-      throw new Error(`File type ${file.mimetype} is not supported for OCR or text parsing`);
+      throw new Error(`File type ${file.mimetype} is not supported for OCR, STT, or text parsing`);
    }
  }

@@ -1096,6 +1159,35 @@ function filterFile({ req, image, isAvatar }) {
  }
 }

+/**
+ * Processes audio files using Speech-to-Text (STT) service.
+ * @param {Object} params - The parameters object.
+ * @param {Object} params.file - The audio file object.
+ * @returns {Promise<Object>} A promise that resolves to an object containing text and bytes.
+ */
+async function processAudioFile({ file }) {
+  try {
+    const sttService = await STTService.getInstance();
+    const audioBuffer = await fs.promises.readFile(file.path);
+    const audioFile = {
+      originalname: file.originalname,
+      mimetype: file.mimetype,
+      size: file.size,
+    };
+
+    const [provider, sttSchema] = await sttService.getProviderSchema();
+    const text = await sttService.sttRequest(provider, sttSchema, { audioBuffer, audioFile });
+
+    return {
+      text,
+      bytes: Buffer.byteLength(text, 'utf8'),
+    };
+  } catch (error) {
+    logger.error('Error processing audio file with STT:', error);
+    throw new Error(`Failed to process audio file: ${error.message}`);
+  }
+}
+
 module.exports = {
  filterFile,
  processFiles,
@@ -1107,4 +1199,5 @@ module.exports = {
  processDeleteRequest,
  processAgentFileUpload,
  retrieveAndProcessFile,
+  processAudioFile,
 };
--- a/packages/data-provider/src/file-config.ts
+++ b/packages/data-provider/src/file-config.ts
@@ -122,6 +122,9 @@ export const applicationMimeTypes =

 export const imageMimeTypes = /^image\/(jpeg|gif|png|webp|heic|heif)$/;

+export const audioMimeTypes =
+  /^audio\/(mp3|mpeg|mpeg3|wav|wave|x-wav|ogg|vorbis|mp4|x-m4a|flac|x-flac|webm)$/;
+
 export const defaultOCRMimeTypes = [
  imageMimeTypes,
  /^application\/pdf$/,
@@ -132,11 +135,14 @@ export const defaultOCRMimeTypes = [

 export const defaultTextParsingMimeTypes = [textMimeTypes];

+export const defaultSTTMimeTypes = [audioMimeTypes];
+
 export const supportedMimeTypes = [
  textMimeTypes,
  excelMimeTypes,
  applicationMimeTypes,
  imageMimeTypes,
+  audioMimeTypes,
  /** Supported by LC Code Interpreter PAI */
  /^image\/(svg|svg\+xml)$/,
 ];
@@ -214,6 +220,9 @@ export const fileConfig = {
  textParsing: {
    supportedMimeTypes: defaultTextParsingMimeTypes,
  },
+  stt: {
+    supportedMimeTypes: defaultSTTMimeTypes,
+  },
  checkType: function (fileType: string, supportedTypes: RegExp[] = supportedMimeTypes) {
    return supportedTypes.some((regex) => regex.test(fileType));
  },