feat: add STT support for Upload as Text
This commit is contained in:
@@ -325,4 +325,4 @@ async function speechToText(req, res) {
|
||||
await sttService.processTextToSpeech(req, res);
|
||||
}
|
||||
|
||||
module.exports = { speechToText };
|
||||
module.exports = { speechToText, STTService };
|
||||
|
||||
@@ -34,6 +34,7 @@ const { checkCapability } = require('~/server/services/Config');
|
||||
const { LB_QueueAsyncCall } = require('~/server/utils/queue');
|
||||
const { getStrategyFunctions } = require('./strategies');
|
||||
const { determineFileType } = require('~/server/utils');
|
||||
const { STTService } = require('./Audio/STTService');
|
||||
const { logger } = require('~/config');
|
||||
const FormData = require('form-data');
|
||||
const axios = require('axios');
|
||||
@@ -501,6 +502,35 @@ const processFileUpload = async ({ req, res, metadata }) => {
|
||||
}
|
||||
|
||||
const { file } = req;
|
||||
|
||||
const fileConfig = mergeFileConfig(req.app.locals.fileConfig);
|
||||
const shouldUseSTT = fileConfig.checkType(
|
||||
file.mimetype,
|
||||
fileConfig.stt?.supportedMimeTypes || [],
|
||||
);
|
||||
|
||||
if (shouldUseSTT) {
|
||||
const { text, bytes } = await processAudioFile({ file });
|
||||
|
||||
const result = await createFile(
|
||||
{
|
||||
user: req.user.id,
|
||||
file_id,
|
||||
temp_file_id,
|
||||
bytes,
|
||||
filepath: file.path,
|
||||
filename: file.originalname,
|
||||
context: FileContext.message_attachment,
|
||||
type: 'text/plain',
|
||||
source: FileSources.text,
|
||||
text,
|
||||
},
|
||||
true,
|
||||
);
|
||||
return res
|
||||
.status(200)
|
||||
.json({ message: 'Audio file processed and converted to text successfully', ...result });
|
||||
}
|
||||
const {
|
||||
id,
|
||||
bytes,
|
||||
@@ -605,6 +635,10 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
|
||||
file.mimetype,
|
||||
fileConfig.ocr?.supportedMimeTypes || [],
|
||||
);
|
||||
const shouldUseSTT = fileConfig.checkType(
|
||||
file.mimetype,
|
||||
fileConfig.stt?.supportedMimeTypes || [],
|
||||
);
|
||||
|
||||
let fileInfoMetadata;
|
||||
const entity_id = messageAttachment === true ? undefined : agent_id;
|
||||
@@ -664,6 +698,35 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
|
||||
context: messageAttachment ? FileContext.message_attachment : FileContext.agents,
|
||||
});
|
||||
|
||||
if (!messageAttachment && tool_resource) {
|
||||
await addAgentResourceFile({
|
||||
req,
|
||||
file_id,
|
||||
agent_id,
|
||||
tool_resource,
|
||||
});
|
||||
}
|
||||
const result = await createFile(fileInfo, true);
|
||||
return res
|
||||
.status(200)
|
||||
.json({ message: 'Agent file uploaded and processed successfully', ...result });
|
||||
} else if (shouldUseSTT) {
|
||||
const { text, bytes } = await processAudioFile({ file });
|
||||
|
||||
const fileInfo = removeNullishValues({
|
||||
text,
|
||||
bytes,
|
||||
file_id,
|
||||
temp_file_id,
|
||||
user: req.user.id,
|
||||
type: 'text/plain',
|
||||
filepath: file.path,
|
||||
source: FileSources.text,
|
||||
filename: file.originalname,
|
||||
model: messageAttachment ? undefined : req.body.model,
|
||||
context: messageAttachment ? FileContext.message_attachment : FileContext.agents,
|
||||
});
|
||||
|
||||
if (!messageAttachment && tool_resource) {
|
||||
await addAgentResourceFile({
|
||||
req,
|
||||
@@ -685,7 +748,7 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
|
||||
file_id,
|
||||
temp_file_id,
|
||||
user: req.user.id,
|
||||
type: file.mimetype,
|
||||
type: file.mimetype.startsWith('audio/') ? 'text/plain' : file.mimetype,
|
||||
filepath: file.path,
|
||||
source: FileSources.text,
|
||||
filename: file.originalname,
|
||||
@@ -706,7 +769,7 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
|
||||
.status(200)
|
||||
.json({ message: 'Agent file uploaded and processed successfully', ...result });
|
||||
} else {
|
||||
throw new Error(`File type ${file.mimetype} is not supported for OCR or text parsing`);
|
||||
throw new Error(`File type ${file.mimetype} is not supported for OCR, STT, or text parsing`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1096,6 +1159,35 @@ function filterFile({ req, image, isAvatar }) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes audio files using Speech-to-Text (STT) service.
|
||||
* @param {Object} params - The parameters object.
|
||||
* @param {Object} params.file - The audio file object.
|
||||
* @returns {Promise<Object>} A promise that resolves to an object containing text and bytes.
|
||||
*/
|
||||
async function processAudioFile({ file }) {
|
||||
try {
|
||||
const sttService = await STTService.getInstance();
|
||||
const audioBuffer = await fs.promises.readFile(file.path);
|
||||
const audioFile = {
|
||||
originalname: file.originalname,
|
||||
mimetype: file.mimetype,
|
||||
size: file.size,
|
||||
};
|
||||
|
||||
const [provider, sttSchema] = await sttService.getProviderSchema();
|
||||
const text = await sttService.sttRequest(provider, sttSchema, { audioBuffer, audioFile });
|
||||
|
||||
return {
|
||||
text,
|
||||
bytes: Buffer.byteLength(text, 'utf8'),
|
||||
};
|
||||
} catch (error) {
|
||||
logger.error('Error processing audio file with STT:', error);
|
||||
throw new Error(`Failed to process audio file: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
filterFile,
|
||||
processFiles,
|
||||
@@ -1107,4 +1199,5 @@ module.exports = {
|
||||
processDeleteRequest,
|
||||
processAgentFileUpload,
|
||||
retrieveAndProcessFile,
|
||||
processAudioFile,
|
||||
};
|
||||
|
||||
@@ -122,6 +122,9 @@ export const applicationMimeTypes =
|
||||
|
||||
export const imageMimeTypes = /^image\/(jpeg|gif|png|webp|heic|heif)$/;
|
||||
|
||||
export const audioMimeTypes =
|
||||
/^audio\/(mp3|mpeg|mpeg3|wav|wave|x-wav|ogg|vorbis|mp4|x-m4a|flac|x-flac|webm)$/;
|
||||
|
||||
export const defaultOCRMimeTypes = [
|
||||
imageMimeTypes,
|
||||
/^application\/pdf$/,
|
||||
@@ -132,11 +135,14 @@ export const defaultOCRMimeTypes = [
|
||||
|
||||
export const defaultTextParsingMimeTypes = [textMimeTypes];
|
||||
|
||||
export const defaultSTTMimeTypes = [audioMimeTypes];
|
||||
|
||||
export const supportedMimeTypes = [
|
||||
textMimeTypes,
|
||||
excelMimeTypes,
|
||||
applicationMimeTypes,
|
||||
imageMimeTypes,
|
||||
audioMimeTypes,
|
||||
/** Supported by LC Code Interpreter PAI */
|
||||
/^image\/(svg|svg\+xml)$/,
|
||||
];
|
||||
@@ -214,6 +220,9 @@ export const fileConfig = {
|
||||
textParsing: {
|
||||
supportedMimeTypes: defaultTextParsingMimeTypes,
|
||||
},
|
||||
stt: {
|
||||
supportedMimeTypes: defaultSTTMimeTypes,
|
||||
},
|
||||
checkType: function (fileType: string, supportedTypes: RegExp[] = supportedMimeTypes) {
|
||||
return supportedTypes.some((regex) => regex.test(fileType));
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user