Compare commits

...

3 Commits

Author SHA1 Message Date
Dustin Healy
da611a634a feat: add STT support for Upload as Text 2025-08-04 22:34:14 -07:00
Dustin Healy
23945b3434 📤 feat: Add RAG API Endpoint Support for Text Parsing (#8849)
* feat: implement RAG API integration for text parsing with fallback to native parsing

* chore: remove TODO now that placeholder and fallback are implemented
2025-08-04 18:44:02 -07:00
Dustin Healy
cde9f058af 🪶 feat: Add Support for Uploading Plaintext Files
feat: delineate between OCR and text handling in fileConfig field of config file

- also adds support for passing in mimetypes as just plain file extensions

feat: add showLabel bool to support future synthetic component DynamicDropdownInput

feat: add new combination dropdown-input component in params panel to support file type token limits

refactor: move hovercard to side to align with other hovercards

chore: clean up autogenerated comments

feat: add delineation to file upload path between text and ocr configured filetypes

feat: add token limit checks during file upload

refactor: move textParsing out of ocrEnabled logic

refactor: clean up types for filetype config

refactor: finish decoupling DynamicDropdownInput from fileTokenLimits

fix: move image token cost function into file to fix circular dependency causing unittest to fail and remove unused var for linter

chore: remove out of scope code following review

refactor: make fileTokenLimit conform to existing styles

chore: remove unused localization string

chore: undo changes to DynamicInput and other strays

feat: add fileTokenLimit to all provider config panels

fix: move textParsing back into ocr tool_resource block for now so that it doesn't interfere with other upload types
2025-08-04 16:39:03 -07:00
13 changed files with 428 additions and 41 deletions

View File

@@ -400,7 +400,8 @@ router.post('/', async (req, res) => {
if (
error.message?.includes('Invalid file format') ||
error.message?.includes('No OCR result')
error.message?.includes('No OCR result') ||
error.message?.includes('exceeds token limit')
) {
message = error.message;
}

View File

@@ -28,6 +28,18 @@ router.post('/', async (req, res) => {
} catch (error) {
// TODO: delete remote file if it exists
logger.error('[/files/images] Error processing file:', error);
let message = 'Error processing file';
// Handle specific error types
if (
error.message?.includes('Invalid file format') ||
error.message?.includes('No OCR result') ||
error.message?.includes('exceeds token limit')
) {
message = error.message;
}
try {
const filepath = path.join(
req.app.locals.paths.imageOutput,
@@ -38,7 +50,7 @@ router.post('/', async (req, res) => {
} catch (error) {
logger.error('[/files/images] Error deleting file:', error);
}
res.status(500).json({ message: 'Error processing file' });
res.status(500).json({ message });
} finally {
try {
await fs.unlink(req.file.path);

View File

@@ -325,4 +325,4 @@ async function speechToText(req, res) {
await sttService.processTextToSpeech(req, res);
}
module.exports = { speechToText };
module.exports = { speechToText, STTService };

View File

@@ -28,12 +28,112 @@ const { addResourceFileId, deleteResourceFileId } = require('~/server/controller
const { addAgentResourceFile, removeAgentResourceFiles } = require('~/models/Agent');
const { getOpenAIClient } = require('~/server/controllers/assistants/helpers');
const { createFile, updateFileUsage, deleteFiles } = require('~/models/File');
const { generateShortLivedToken } = require('~/server/services/AuthService');
const { loadAuthValues } = require('~/server/services/Tools/credentials');
const { checkCapability } = require('~/server/services/Config');
const { LB_QueueAsyncCall } = require('~/server/utils/queue');
const { getStrategyFunctions } = require('./strategies');
const { determineFileType } = require('~/server/utils');
const { STTService } = require('./Audio/STTService');
const { logger } = require('~/config');
const FormData = require('form-data');
const axios = require('axios');
/**
 * Extracts the text content of an uploaded file, preferring the RAG API and
 * falling back to native parsing whenever the RAG API cannot be used.
 *
 * Native parsing (`parseTextNative`) is used when:
 * - `process.env.RAG_API_URL` is not set,
 * - the `GET {RAG_API_URL}/health` probe (5s timeout) fails or throws, or
 * - the `POST {RAG_API_URL}/parse-text` request (30s timeout) throws or its
 *   response carries no usable `text`.
 *
 * @param {Object} params - The parameters object
 * @param {Express.Request} params.req - The Express request object; `req.user.id` is used to mint the bearer token
 * @param {Express.Multer.File} params.file - The uploaded file; its contents are streamed from `file.path`
 * @param {string} params.file_id - The file ID forwarded to the RAG API
 * @returns {Promise<{text: string, bytes: number, source: string}>} The parsed text, its UTF-8 byte
 *   length, and `'rag_api'` as the source when the RAG API produced it (otherwise whatever
 *   `parseTextNative` returns)
 * @throws {Error} Only when the native fallback itself fails
 */
async function parseText({ req, file, file_id }) {
  const ragApiUrl = process.env.RAG_API_URL;
  if (!ragApiUrl) {
    logger.debug('[parseText] RAG_API_URL not defined, falling back to native text parsing');
    return parseTextNative(file);
  }

  try {
    const healthResponse = await axios.get(`${ragApiUrl}/health`, {
      timeout: 5000,
    });
    // Treated as healthy when either the status text or the numeric status looks OK.
    if (healthResponse?.statusText !== 'OK' && healthResponse?.status !== 200) {
      logger.debug('[parseText] RAG API health check failed, falling back to native parsing');
      return parseTextNative(file);
    }
  } catch (healthError) {
    logger.debug(
      `[parseText] RAG API health check failed: ${healthError.message}, falling back to native parsing`,
    );
    return parseTextNative(file);
  }

  // Held outside the try block so the `finally` clause can always release it.
  let fileStream;
  try {
    const jwtToken = generateShortLivedToken(req.user.id);
    const formData = new FormData();
    formData.append('file_id', file_id);
    fileStream = fs.createReadStream(file.path);
    formData.append('file', fileStream);
    const formHeaders = formData.getHeaders();

    // TODO: Actually implement referenced RAG API endpoint /parse-text
    const response = await axios.post(`${ragApiUrl}/parse-text`, formData, {
      headers: {
        Authorization: `Bearer ${jwtToken}`,
        accept: 'application/json',
        ...formHeaders,
      },
      timeout: 30000,
    });

    const responseData = response.data;
    logger.debug('[parseText] Response from RAG API', responseData);

    const parsedText = responseData?.text;
    // A missing, empty, or non-string `text` is treated as a failed parse and triggers the fallback.
    if (typeof parsedText !== 'string' || !parsedText) {
      throw new Error('RAG API did not return parsed text');
    }

    return {
      text: parsedText,
      bytes: Buffer.byteLength(parsedText, 'utf8'),
      source: 'rag_api',
    };
  } catch (error) {
    logger.warn(
      `[parseText] RAG API text parsing failed: ${error.message}, falling back to native parsing`,
    );
    return parseTextNative(file);
  } finally {
    // If the request failed or timed out before the stream was fully consumed, the
    // underlying file descriptor would otherwise stay open. Destroying a stream
    // that has already ended is a no-op.
    fileStream?.destroy();
  }
}
/**
* Native JavaScript text parsing fallback
* Simple text file reading - complex formats handled by RAG API
* @param {Express.Multer.File} file - The uploaded file
* @returns {{text: string, bytes: number, source: string}}
*/
function parseTextNative(file) {
try {
let text = '';
try {
text = fs.readFileSync(file.path, 'utf8');
} catch (readError) {
throw new Error(`Cannot read file as text: ${readError.message}`);
}
const bytes = Buffer.byteLength(text, 'utf8');
return {
text,
bytes,
source: 'native_js',
};
} catch (error) {
logger.error(`[parseTextNative] Error parsing file: ${error.message}`);
throw new Error(`Failed to parse file: ${error.message}`);
}
}
/**
*
@@ -402,6 +502,35 @@ const processFileUpload = async ({ req, res, metadata }) => {
}
const { file } = req;
const fileConfig = mergeFileConfig(req.app.locals.fileConfig);
const shouldUseSTT = fileConfig.checkType(
file.mimetype,
fileConfig.stt?.supportedMimeTypes || [],
);
if (shouldUseSTT) {
const { text, bytes } = await processAudioFile({ file });
const result = await createFile(
{
user: req.user.id,
file_id,
temp_file_id,
bytes,
filepath: file.path,
filename: file.originalname,
context: FileContext.message_attachment,
type: 'text/plain',
source: FileSources.text,
text,
},
true,
);
return res
.status(200)
.json({ message: 'Audio file processed and converted to text successfully', ...result });
}
const {
id,
bytes,
@@ -496,6 +625,21 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
throw new Error('No tool resource provided for non-image agent file upload');
}
const fileConfig = mergeFileConfig(req.app.locals.fileConfig);
const shouldUseTextParsing = fileConfig.checkType(
file.mimetype,
fileConfig.textParsing?.supportedMimeTypes || [],
);
const shouldUseOCR = fileConfig.checkType(
file.mimetype,
fileConfig.ocr?.supportedMimeTypes || [],
);
const shouldUseSTT = fileConfig.checkType(
file.mimetype,
fileConfig.stt?.supportedMimeTypes || [],
);
let fileInfoMetadata;
const entity_id = messageAttachment === true ? undefined : agent_id;
const basePath = mime.getType(file.originalname)?.startsWith('image') ? 'images' : 'uploads';
@@ -526,46 +670,107 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
throw new Error('OCR capability is not enabled for Agents');
}
const { handleFileUpload: uploadOCR } = getStrategyFunctions(
req.app.locals?.ocr?.strategy ?? FileSources.mistral_ocr,
);
const { file_id, temp_file_id } = metadata;
const {
text,
bytes,
// TODO: OCR images support?
images,
filename,
filepath: ocrFileURL,
} = await uploadOCR({ req, file, loadAuthValues });
if (shouldUseOCR) {
const { handleFileUpload: uploadOCR } = getStrategyFunctions(
req.app.locals?.ocr?.strategy ?? FileSources.mistral_ocr,
);
const fileInfo = removeNullishValues({
text,
bytes,
file_id,
temp_file_id,
user: req.user.id,
type: 'text/plain',
filepath: ocrFileURL,
source: FileSources.text,
filename: filename ?? file.originalname,
model: messageAttachment ? undefined : req.body.model,
context: messageAttachment ? FileContext.message_attachment : FileContext.agents,
});
const {
text,
bytes,
filename,
filepath: ocrFileURL,
} = await uploadOCR({ req, file, loadAuthValues });
if (!messageAttachment && tool_resource) {
await addAgentResourceFile({
req,
const fileInfo = removeNullishValues({
text,
bytes,
file_id,
agent_id,
tool_resource,
temp_file_id,
user: req.user.id,
type: 'text/plain',
filepath: ocrFileURL,
source: FileSources.text,
filename: filename ?? file.originalname,
model: messageAttachment ? undefined : req.body.model,
context: messageAttachment ? FileContext.message_attachment : FileContext.agents,
});
if (!messageAttachment && tool_resource) {
await addAgentResourceFile({
req,
file_id,
agent_id,
tool_resource,
});
}
const result = await createFile(fileInfo, true);
return res
.status(200)
.json({ message: 'Agent file uploaded and processed successfully', ...result });
} else if (shouldUseSTT) {
const { text, bytes } = await processAudioFile({ file });
const fileInfo = removeNullishValues({
text,
bytes,
file_id,
temp_file_id,
user: req.user.id,
type: 'text/plain',
filepath: file.path,
source: FileSources.text,
filename: file.originalname,
model: messageAttachment ? undefined : req.body.model,
context: messageAttachment ? FileContext.message_attachment : FileContext.agents,
});
if (!messageAttachment && tool_resource) {
await addAgentResourceFile({
req,
file_id,
agent_id,
tool_resource,
});
}
const result = await createFile(fileInfo, true);
return res
.status(200)
.json({ message: 'Agent file uploaded and processed successfully', ...result });
} else if (shouldUseTextParsing) {
const { text, bytes } = await parseText({ req, file, file_id });
const fileInfo = removeNullishValues({
text,
bytes,
file_id,
temp_file_id,
user: req.user.id,
type: file.mimetype.startsWith('audio/') ? 'text/plain' : file.mimetype,
filepath: file.path,
source: FileSources.text,
filename: file.originalname,
model: messageAttachment ? undefined : req.body.model,
context: messageAttachment ? FileContext.message_attachment : FileContext.agents,
});
if (!messageAttachment && tool_resource) {
await addAgentResourceFile({
req,
file_id,
agent_id,
tool_resource,
});
}
const result = await createFile(fileInfo, true);
return res
.status(200)
.json({ message: 'Agent file uploaded and processed successfully', ...result });
} else {
throw new Error(`File type ${file.mimetype} is not supported for OCR, STT, or text parsing`);
}
const result = await createFile(fileInfo, true);
return res
.status(200)
.json({ message: 'Agent file uploaded and processed successfully', ...result });
}
const source =
@@ -954,6 +1159,35 @@ function filterFile({ req, image, isAvatar }) {
}
}
/**
 * Transcribes an uploaded audio file through the Speech-to-Text (STT) service.
 *
 * The file is read from `file.path` into memory and handed to the STT service together
 * with the provider and schema the service reports via `getProviderSchema()`.
 *
 * @param {Object} params - The parameters object.
 * @param {Object} params.file - The audio file object; `path`, `originalname`, `mimetype` and `size` are read.
 * @returns {Promise<Object>} Resolves to `{ text, bytes }`, where `bytes` is the UTF-8 byte length of `text`.
 * @throws {Error} `Failed to process audio file: ...` wrapping any failure along the way.
 */
async function processAudioFile({ file }) {
  try {
    const stt = await STTService.getInstance();
    const audioBuffer = await fs.promises.readFile(file.path);
    const { originalname, mimetype, size } = file;

    const [provider, sttSchema] = await stt.getProviderSchema();
    const transcript = await stt.sttRequest(provider, sttSchema, {
      audioBuffer,
      audioFile: { originalname, mimetype, size },
    });

    return { text: transcript, bytes: Buffer.byteLength(transcript, 'utf8') };
  } catch (sttError) {
    logger.error('Error processing audio file with STT:', sttError);
    throw new Error(`Failed to process audio file: ${sttError.message}`);
  }
}
module.exports = {
filterFile,
processFiles,
@@ -965,4 +1199,5 @@ module.exports = {
processDeleteRequest,
processAgentFileUpload,
retrieveAndProcessFile,
processAudioFile,
};

View File

@@ -135,10 +135,14 @@ const useFileHandling = (params?: UseFileHandling) => {
const file_id = body.get('file_id');
clearUploadTimer(file_id as string);
deleteFileById(file_id as string);
const errorMessage =
error?.code === 'ERR_CANCELED'
? 'com_error_files_upload_canceled'
: (error?.response?.data?.message ?? 'com_error_files_upload');
let errorMessage = 'com_error_files_upload';
if (error?.code === 'ERR_CANCELED') {
errorMessage = 'com_error_files_upload_canceled';
} else if (error?.response?.data?.message) {
errorMessage = error.response.data.message;
}
setError(errorMessage);
},
},

View File

@@ -774,6 +774,9 @@
"com_ui_field_required": "This field is required",
"com_ui_file_size": "File Size",
"com_ui_files": "Files",
"com_ui_file_token_limit": "File Token Limit",
"com_ui_file_token_limit_desc": "Set maximum token limit for file processing to control costs and resource usage",
"com_ui_filter_prompts": "Filter Prompts",
"com_ui_filter_prompts_name": "Filter prompts by name",
"com_ui_final_touch": "Final touch",

View File

@@ -122,11 +122,27 @@ export const applicationMimeTypes =
export const imageMimeTypes = /^image\/(jpeg|gif|png|webp|heic|heif)$/;
export const audioMimeTypes =
/^audio\/(mp3|mpeg|mpeg3|wav|wave|x-wav|ogg|vorbis|mp4|x-m4a|flac|x-flac|webm)$/;
/**
 * Default MIME type patterns for the `ocr` entry of `fileConfig`: images (via `imageMimeTypes`),
 * PDF, OpenXML office documents (wordprocessingml, presentationml, spreadsheetml),
 * `vnd.ms-word` / `vnd.ms-powerpoint` / `vnd.ms-excel`, and EPUB.
 */
export const defaultOCRMimeTypes = [
imageMimeTypes,
/^application\/pdf$/,
/^application\/vnd\.openxmlformats-officedocument\.(wordprocessingml\.document|presentationml\.presentation|spreadsheetml\.sheet)$/,
/^application\/vnd\.ms-(word|powerpoint|excel)$/,
/^application\/epub\+zip$/,
];
/** Default MIME type patterns for the `textParsing` entry of `fileConfig`. */
export const defaultTextParsingMimeTypes = [textMimeTypes];
/** Default MIME type patterns for the `stt` entry of `fileConfig` (the audio patterns). */
export const defaultSTTMimeTypes = [audioMimeTypes];
/** Patterns that `fileConfig.checkType` tests against when no `supportedTypes` argument is passed. */
export const supportedMimeTypes = [
textMimeTypes,
excelMimeTypes,
applicationMimeTypes,
imageMimeTypes,
audioMimeTypes,
/** Supported by LC Code Interpreter API */
/^image\/(svg|svg\+xml)$/,
];
@@ -198,6 +214,15 @@ export const fileConfig = {
maxHeight: 1900,
quality: 0.92,
},
ocr: {
supportedMimeTypes: defaultOCRMimeTypes,
},
textParsing: {
supportedMimeTypes: defaultTextParsingMimeTypes,
},
stt: {
supportedMimeTypes: defaultSTTMimeTypes,
},
checkType: function (fileType: string, supportedTypes: RegExp[] = supportedMimeTypes) {
return supportedTypes.some((regex) => regex.test(fileType));
},
@@ -246,6 +271,16 @@ export const fileConfigSchema = z.object({
quality: z.number().min(0).max(1).optional(),
})
.optional(),
ocr: z
.object({
supportedMimeTypes: supportedMimeTypesSchema.optional(),
})
.optional(),
textParsing: z
.object({
supportedMimeTypes: supportedMimeTypesSchema.optional(),
})
.optional(),
});
/** Helper function to safely convert string patterns to RegExp objects */
@@ -261,7 +296,17 @@ export const convertStringsToRegex = (patterns: string[]): RegExp[] =>
}, []);
export function mergeFileConfig(dynamic: z.infer<typeof fileConfigSchema> | undefined): FileConfig {
const mergedConfig = fileConfig as FileConfig;
const mergedConfig: FileConfig = {
...fileConfig,
ocr: {
...fileConfig.ocr,
supportedMimeTypes: fileConfig.ocr?.supportedMimeTypes || [],
},
textParsing: {
...fileConfig.textParsing,
supportedMimeTypes: fileConfig.textParsing?.supportedMimeTypes || [],
},
};
if (!dynamic) {
return mergedConfig;
}
@@ -282,6 +327,28 @@ export function mergeFileConfig(dynamic: z.infer<typeof fileConfigSchema> | unde
};
}
if (dynamic.ocr !== undefined) {
mergedConfig.ocr = {
...mergedConfig.ocr,
...dynamic.ocr,
};
if (dynamic.ocr.supportedMimeTypes) {
mergedConfig.ocr.supportedMimeTypes = convertStringsToRegex(dynamic.ocr.supportedMimeTypes);
}
}
if (dynamic.textParsing !== undefined) {
mergedConfig.textParsing = {
...mergedConfig.textParsing,
...dynamic.textParsing,
};
if (dynamic.textParsing.supportedMimeTypes) {
mergedConfig.textParsing.supportedMimeTypes = convertStringsToRegex(
dynamic.textParsing.supportedMimeTypes,
);
}
}
if (!dynamic.endpoints) {
return mergedConfig;
}

View File

@@ -138,6 +138,18 @@ export const librechat = {
placeholderCode: true,
optionType: 'model',
} as const,
fileTokenLimit: {
key: 'fileTokenLimit',
label: 'com_ui_file_token_limit',
labelCode: true,
description: 'com_ui_file_token_limit_desc',
descriptionCode: true,
placeholder: 'com_nav_theme_system',
placeholderCode: true,
type: 'number',
component: 'input',
columnSpan: 2,
} as const,
};
const openAIParams: Record<string, SettingDefinition> = {
@@ -603,6 +615,7 @@ const googleConfig: SettingsConfiguration = [
google.thinking,
google.thinkingBudget,
google.web_search,
librechat.fileTokenLimit,
];
const googleCol1: SettingsConfiguration = [
@@ -621,6 +634,7 @@ const googleCol2: SettingsConfiguration = [
google.thinking,
google.thinkingBudget,
google.web_search,
librechat.fileTokenLimit,
];
const openAI: SettingsConfiguration = [
@@ -640,6 +654,7 @@ const openAI: SettingsConfiguration = [
openAIParams.useResponsesApi,
openAIParams.reasoning_summary,
openAIParams.disableStreaming,
librechat.fileTokenLimit,
];
const openAICol1: SettingsConfiguration = [
@@ -663,6 +678,7 @@ const openAICol2: SettingsConfiguration = [
openAIParams.useResponsesApi,
openAIParams.web_search,
openAIParams.disableStreaming,
librechat.fileTokenLimit,
];
const anthropicConfig: SettingsConfiguration = [
@@ -678,6 +694,7 @@ const anthropicConfig: SettingsConfiguration = [
anthropic.thinking,
anthropic.thinkingBudget,
anthropic.web_search,
librechat.fileTokenLimit,
];
const anthropicCol1: SettingsConfiguration = [
@@ -697,6 +714,7 @@ const anthropicCol2: SettingsConfiguration = [
anthropic.thinking,
anthropic.thinkingBudget,
anthropic.web_search,
librechat.fileTokenLimit,
];
const bedrockAnthropic: SettingsConfiguration = [
@@ -712,6 +730,7 @@ const bedrockAnthropic: SettingsConfiguration = [
bedrock.region,
anthropic.thinking,
anthropic.thinkingBudget,
librechat.fileTokenLimit,
];
const bedrockMistral: SettingsConfiguration = [
@@ -723,6 +742,7 @@ const bedrockMistral: SettingsConfiguration = [
mistral.topP,
librechat.resendFiles,
bedrock.region,
librechat.fileTokenLimit,
];
const bedrockCohere: SettingsConfiguration = [
@@ -734,6 +754,7 @@ const bedrockCohere: SettingsConfiguration = [
cohere.topP,
librechat.resendFiles,
bedrock.region,
librechat.fileTokenLimit,
];
const bedrockGeneral: SettingsConfiguration = [
@@ -744,6 +765,7 @@ const bedrockGeneral: SettingsConfiguration = [
meta.topP,
librechat.resendFiles,
bedrock.region,
librechat.fileTokenLimit,
];
const bedrockAnthropicCol1: SettingsConfiguration = [
@@ -763,6 +785,7 @@ const bedrockAnthropicCol2: SettingsConfiguration = [
bedrock.region,
anthropic.thinking,
anthropic.thinkingBudget,
librechat.fileTokenLimit,
];
const bedrockMistralCol1: SettingsConfiguration = [
@@ -778,6 +801,7 @@ const bedrockMistralCol2: SettingsConfiguration = [
mistral.topP,
librechat.resendFiles,
bedrock.region,
librechat.fileTokenLimit,
];
const bedrockCohereCol1: SettingsConfiguration = [
@@ -793,6 +817,7 @@ const bedrockCohereCol2: SettingsConfiguration = [
cohere.topP,
librechat.resendFiles,
bedrock.region,
librechat.fileTokenLimit,
];
const bedrockGeneralCol1: SettingsConfiguration = [
@@ -807,6 +832,7 @@ const bedrockGeneralCol2: SettingsConfiguration = [
meta.topP,
librechat.resendFiles,
bedrock.region,
librechat.fileTokenLimit,
];
export const paramSettings: Record<string, SettingsConfiguration | undefined> = {

View File

@@ -662,6 +662,8 @@ export const tConversationSchema = z.object({
iconURL: z.string().nullable().optional(),
/* temporary chat */
expiredAt: z.string().nullable().optional(),
/* file token limits */
fileTokenLimit: coerceNumber.optional(),
/** @deprecated */
resendImages: z.boolean().optional(),
/** @deprecated */
@@ -774,6 +776,8 @@ export const tQueryParamsSchema = tConversationSchema
* https://platform.openai.com/docs/api-reference/runs/createRun#runs-createrun-instructions
* */
instructions: true,
/** @endpoints openAI, google, anthropic */
fileTokenLimit: true,
})
.merge(
z.object({
@@ -830,6 +834,7 @@ export const googleBaseSchema = tConversationSchema.pick({
thinking: true,
thinkingBudget: true,
web_search: true,
fileTokenLimit: true,
iconURL: true,
greeting: true,
spec: true,
@@ -1080,6 +1085,7 @@ export const openAIBaseSchema = tConversationSchema.pick({
useResponsesApi: true,
web_search: true,
disableStreaming: true,
fileTokenLimit: true,
});
export const openAISchema = openAIBaseSchema
@@ -1124,6 +1130,7 @@ export const anthropicBaseSchema = tConversationSchema.pick({
spec: true,
maxContextTokens: true,
web_search: true,
fileTokenLimit: true,
});
export const anthropicSchema = anthropicBaseSchema

View File

@@ -55,6 +55,33 @@ export type FileConfig = {
maxHeight?: number;
quality?: number;
};
ocr?: {
supportedMimeTypes?: RegExp[];
};
textParsing?: {
supportedMimeTypes?: RegExp[];
};
checkType?: (fileType: string, supportedTypes: RegExp[]) => boolean;
};
/**
 * File configuration shape in which the `ocr` and `textParsing` MIME type lists are
 * plain strings (`string[]`) rather than the `RegExp[]` used by `FileConfig`.
 *
 * NOTE(review): there is no `stt` entry here, although the `fileConfig` object defines
 * `stt.supportedMimeTypes` — confirm whether that omission is intended.
 */
export type FileConfigInput = {
/** Per-endpoint file settings, keyed by string. */
endpoints?: {
[key: string]: EndpointFileConfig;
};
serverFileSizeLimit?: number;
avatarSizeLimit?: number;
clientImageResize?: {
enabled?: boolean;
maxWidth?: number;
maxHeight?: number;
quality?: number;
};
/** MIME type patterns, as strings, for the OCR path. */
ocr?: {
supportedMimeTypes?: string[];
};
/** MIME type patterns, as strings, for the text-parsing path. */
textParsing?: {
supportedMimeTypes?: string[];
};
/** Tests `fileType` against the given list of patterns. */
checkType?: (fileType: string, supportedTypes: RegExp[]) => boolean;
};

View File

@@ -141,6 +141,9 @@ export const conversationPreset = {
disableStreaming: {
type: Boolean,
},
fileTokenLimit: {
type: Number,
},
/** Reasoning models only */
reasoning_effort: {
type: String,

View File

@@ -50,6 +50,7 @@ export interface IPreset extends Document {
useResponsesApi?: boolean;
web_search?: boolean;
disableStreaming?: boolean;
fileTokenLimit?: number;
// end of additional fields
agentOptions?: unknown;
}

View File

@@ -49,6 +49,7 @@ export interface IConversation extends Document {
useResponsesApi?: boolean;
web_search?: boolean;
disableStreaming?: boolean;
fileTokenLimit?: number;
// Additional fields
files?: string[];
expiredAt?: Date;