feat: add STT support for Upload as Text

This commit is contained in:
Dustin Healy
2025-08-04 22:34:14 -07:00
parent 23945b3434
commit da611a634a
3 changed files with 105 additions and 3 deletions

View File

@@ -325,4 +325,4 @@ async function speechToText(req, res) {
await sttService.processTextToSpeech(req, res);
}
module.exports = { speechToText };
module.exports = { speechToText, STTService };

View File

@@ -34,6 +34,7 @@ const { checkCapability } = require('~/server/services/Config');
const { LB_QueueAsyncCall } = require('~/server/utils/queue');
const { getStrategyFunctions } = require('./strategies');
const { determineFileType } = require('~/server/utils');
const { STTService } = require('./Audio/STTService');
const { logger } = require('~/config');
const FormData = require('form-data');
const axios = require('axios');
@@ -501,6 +502,35 @@ const processFileUpload = async ({ req, res, metadata }) => {
}
const { file } = req;
const fileConfig = mergeFileConfig(req.app.locals.fileConfig);
const shouldUseSTT = fileConfig.checkType(
file.mimetype,
fileConfig.stt?.supportedMimeTypes || [],
);
if (shouldUseSTT) {
const { text, bytes } = await processAudioFile({ file });
const result = await createFile(
{
user: req.user.id,
file_id,
temp_file_id,
bytes,
filepath: file.path,
filename: file.originalname,
context: FileContext.message_attachment,
type: 'text/plain',
source: FileSources.text,
text,
},
true,
);
return res
.status(200)
.json({ message: 'Audio file processed and converted to text successfully', ...result });
}
const {
id,
bytes,
@@ -605,6 +635,10 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
file.mimetype,
fileConfig.ocr?.supportedMimeTypes || [],
);
const shouldUseSTT = fileConfig.checkType(
file.mimetype,
fileConfig.stt?.supportedMimeTypes || [],
);
let fileInfoMetadata;
const entity_id = messageAttachment === true ? undefined : agent_id;
@@ -664,6 +698,35 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
context: messageAttachment ? FileContext.message_attachment : FileContext.agents,
});
if (!messageAttachment && tool_resource) {
await addAgentResourceFile({
req,
file_id,
agent_id,
tool_resource,
});
}
const result = await createFile(fileInfo, true);
return res
.status(200)
.json({ message: 'Agent file uploaded and processed successfully', ...result });
} else if (shouldUseSTT) {
const { text, bytes } = await processAudioFile({ file });
const fileInfo = removeNullishValues({
text,
bytes,
file_id,
temp_file_id,
user: req.user.id,
type: 'text/plain',
filepath: file.path,
source: FileSources.text,
filename: file.originalname,
model: messageAttachment ? undefined : req.body.model,
context: messageAttachment ? FileContext.message_attachment : FileContext.agents,
});
if (!messageAttachment && tool_resource) {
await addAgentResourceFile({
req,
@@ -685,7 +748,7 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
file_id,
temp_file_id,
user: req.user.id,
type: file.mimetype,
type: file.mimetype.startsWith('audio/') ? 'text/plain' : file.mimetype,
filepath: file.path,
source: FileSources.text,
filename: file.originalname,
@@ -706,7 +769,7 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
.status(200)
.json({ message: 'Agent file uploaded and processed successfully', ...result });
} else {
throw new Error(`File type ${file.mimetype} is not supported for OCR or text parsing`);
throw new Error(`File type ${file.mimetype} is not supported for OCR, STT, or text parsing`);
}
}
@@ -1096,6 +1159,35 @@ function filterFile({ req, image, isAvatar }) {
}
}
/**
* Processes audio files using Speech-to-Text (STT) service.
* @param {Object} params - The parameters object.
* @param {Object} params.file - The audio file object.
* @returns {Promise<Object>} A promise that resolves to an object containing text and bytes.
*/
async function processAudioFile({ file }) {
try {
const sttService = await STTService.getInstance();
const audioBuffer = await fs.promises.readFile(file.path);
const audioFile = {
originalname: file.originalname,
mimetype: file.mimetype,
size: file.size,
};
const [provider, sttSchema] = await sttService.getProviderSchema();
const text = await sttService.sttRequest(provider, sttSchema, { audioBuffer, audioFile });
return {
text,
bytes: Buffer.byteLength(text, 'utf8'),
};
} catch (error) {
logger.error('Error processing audio file with STT:', error);
throw new Error(`Failed to process audio file: ${error.message}`);
}
}
module.exports = {
filterFile,
processFiles,
@@ -1107,4 +1199,5 @@ module.exports = {
processDeleteRequest,
processAgentFileUpload,
retrieveAndProcessFile,
processAudioFile,
};

View File

@@ -122,6 +122,9 @@ export const applicationMimeTypes =
/** Matches the image MIME types `image/jpeg`, `gif`, `png`, `webp`, `heic` and `heif` (whole string, case-sensitive). */
export const imageMimeTypes = /^image\/(jpeg|gif|png|webp|heic|heif)$/;
/**
 * Matches the audio MIME types accepted for upload: `audio/` followed by one of
 * mp3, mpeg, mpeg3, wav, wave, x-wav, ogg, vorbis, mp4, x-m4a, flac, x-flac or webm.
 * The pattern is anchored, so parameters such as `; codecs=...` do not match.
 */
export const audioMimeTypes =
/^audio\/(mp3|mpeg|mpeg3|wav|wave|x-wav|ogg|vorbis|mp4|x-m4a|flac|x-flac|webm)$/;
export const defaultOCRMimeTypes = [
imageMimeTypes,
/^application\/pdf$/,
@@ -132,11 +135,14 @@ export const defaultOCRMimeTypes = [
/** Default pattern list for text parsing; used as `fileConfig.textParsing.supportedMimeTypes`. */
export const defaultTextParsingMimeTypes = [textMimeTypes];
/** Default pattern list for Speech-to-Text (STT); used as `fileConfig.stt.supportedMimeTypes`. */
export const defaultSTTMimeTypes = [audioMimeTypes];
/**
 * MIME type patterns accepted by default; `fileConfig.checkType` falls back to this
 * list when no explicit list of patterns is passed.
 */
export const supportedMimeTypes = [
textMimeTypes,
excelMimeTypes,
applicationMimeTypes,
imageMimeTypes,
audioMimeTypes,
/** Supported by LC Code Interpreter API */
/^image\/(svg|svg\+xml)$/,
];
@@ -214,6 +220,9 @@ export const fileConfig = {
textParsing: {
supportedMimeTypes: defaultTextParsingMimeTypes,
},
stt: {
supportedMimeTypes: defaultSTTMimeTypes,
},
checkType: function (fileType: string, supportedTypes: RegExp[] = supportedMimeTypes) {
return supportedTypes.some((regex) => regex.test(fileType));
},