Remove local file ingestion tools
@@ -5,7 +5,7 @@ An MCP server exposing the LightRAG Server API as tools, resources, and prompts
 ## Features
 
 - Retrieval tools: `query_data`, `query`, `query_stream`, `query_stream_chunks`
-- Ingestion tools: `ingest_text`, `ingest_texts`, `ingest_file`, `ingest_files`, `upload_document`
+- Ingestion tools: `ingest_text`, `ingest_texts`, `upload_document`
 - Freshness tools: `scan_documents`, `scan_and_wait`, `pipeline_status`, `wait_for_idle`, `track_status`
 - Memory tool: `ingest_memory` for lessons, preferences, decisions, structures, functions, relationships
 - Graph tools: entity/relation CRUD, entity existence check, label search, graph export
@@ -52,4 +52,3 @@ lightrag-mcp-smoke --query "What is this project?" --format pretty
 - `query_stream` collects the streaming response and returns it as a single string.
 - `query_stream_chunks` returns chunked output and reports progress to clients that support progress events.
 - `refresh_and_query` is a convenience macro for evidence-first workflows.
-- `ingest_file(s)` chunk local files and store them with `file_source` references.
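The `ingest_texts` tool remains for pre-chunked content. Assuming the backing `/documents/texts` endpoint still accepts `file_sources` alongside `texts` (the removed tools posted exactly that shape, per the code diff below), callers can keep per-chunk provenance themselves. The field names here come from the removed `ingest_file`; `notes.md` is a made-up example file:

# Hypothetical request body for POST /documents/texts; the
# "path#chunk:i/n" source convention is taken from the removed ingest_file.
payload = {
    "texts": ["first chunk of notes.md ...", "second chunk ..."],
    "file_sources": ["notes.md#chunk:1/2", "notes.md#chunk:2/2"],
}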
@@ -3,7 +3,6 @@ from __future__ import annotations
 import asyncio
 import json
 import time
-from pathlib import Path
 from typing import Any, Iterable
 
 from mcp.server.fastmcp import Context, FastMCP
@@ -89,32 +88,6 @@ async def _wait_for_idle(timeout_s: float, interval_s: float) -> dict[str, Any]:
         await asyncio.sleep(interval_s)
 
 
-def _chunk_text(text: str, max_chars: int, overlap: int) -> list[str]:
-    if max_chars <= 0:
-        raise ValueError("max_chars must be > 0")
-    if overlap < 0 or overlap >= max_chars:
-        raise ValueError("overlap must be >= 0 and < max_chars")
-
-    chunks: list[str] = []
-    start = 0
-    length = len(text)
-    while start < length:
-        end = min(start + max_chars, length)
-        cut = text.rfind("\n", start, end)
-        if cut == -1 or cut <= start:
-            cut = end
-        chunk = text[start:cut].strip()
-        if chunk:
-            chunks.append(chunk)
-        if cut >= length:
-            break
-        next_start = max(cut - overlap, 0)
-        if next_start <= start:
-            next_start = cut
-        start = next_start
-    return chunks
-
-
 def _format_list(items: Iterable[str]) -> str:
     return ", ".join(item.strip() for item in items if item.strip())
 
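For anyone lifting the deleted `_chunk_text` helper out of this hunk: it cuts at the last newline inside the `max_chars` window when possible, strips each chunk, and steps back `overlap` characters before the next window. A small illustration of that behavior (assumes the removed function has been pasted into scope; the input text is invented):

# text: 40 newline-terminated "alpha" tokens, 240 characters total
text = "alpha\n" * 40
chunks = _chunk_text(text, max_chars=100, overlap=10)
assert all(len(c) <= 100 for c in chunks)  # a chunk never exceeds the window
assert chunks[0].endswith("alpha")         # cut landed on a newline, then stripped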
@@ -359,45 +332,6 @@ async def ingest_texts(
     return await client.request_json("POST", "/documents/texts", json=payload)
 
 
-@mcp.tool()
-async def ingest_file(
-    path: str,
-    max_chars: int = 4000,
-    overlap: int = 200,
-    encoding: str = "utf-8",
-) -> dict[str, Any]:
-    """Read a local file, chunk it, and ingest as texts with file_sources set per chunk."""
-    file_path = Path(path)
-    if not file_path.exists():
-        raise FileNotFoundError(f"File not found: {file_path}")
-    text = file_path.read_text(encoding=encoding, errors="replace")
-    chunks = _chunk_text(text, max_chars=max_chars, overlap=overlap)
-    if not chunks:
-        raise ValueError(f"No content to ingest from {file_path}")
-    file_sources = [f"{file_path}#chunk:{idx + 1}/{len(chunks)}" for idx in range(len(chunks))]
-    payload: dict[str, Any] = {"texts": chunks, "file_sources": file_sources}
-    return await client.request_json("POST", "/documents/texts", json=payload)
-
-
-@mcp.tool()
-async def ingest_files(
-    paths: list[str],
-    max_chars: int = 4000,
-    overlap: int = 200,
-    encoding: str = "utf-8",
-) -> dict[str, Any]:
-    """Ingest multiple local files by chunking each file into texts."""
-    results: dict[str, Any] = {}
-    for path in paths:
-        results[path] = await ingest_file(
-            path=path,
-            max_chars=max_chars,
-            overlap=overlap,
-            encoding=encoding,
-        )
-    return {"results": results}
-
-
 @mcp.tool()
 async def upload_document(path: str) -> dict[str, Any]:
     """Upload a local file to the LightRAG input directory."""
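With `ingest_file` and `ingest_files` removed from the server, a client that still wants chunked ingestion with per-chunk sources can do the work locally and POST to the surviving `/documents/texts` endpoint. A minimal sketch under those assumptions, using `httpx` as a stand-in for the server's own `client` wrapper and a naive splitter in place of the removed `_chunk_text`:

from pathlib import Path

import httpx  # stand-in HTTP client, not part of this codebase


def split_fixed(text: str, max_chars: int) -> list[str]:
    # Naive fixed-width splitter; copy the removed _chunk_text instead
    # if you want newline-aware cuts and overlap.
    return [text[i:i + max_chars] for i in range(0, len(text), max_chars)]


async def ingest_file_clientside(base_url: str, path: str, max_chars: int = 4000) -> dict:
    file_path = Path(path)
    text = file_path.read_text(encoding="utf-8", errors="replace")
    chunks = split_fixed(text, max_chars)
    if not chunks:
        raise ValueError(f"No content to ingest from {file_path}")
    # Same "path#chunk:i/n" convention the removed tool used.
    sources = [f"{file_path}#chunk:{i + 1}/{len(chunks)}" for i in range(len(chunks))]
    async with httpx.AsyncClient(base_url=base_url) as http:
        resp = await http.post("/documents/texts",
                               json={"texts": chunks, "file_sources": sources})
        resp.raise_for_status()
        return resp.json()

# e.g. asyncio.run(ingest_file_clientside("http://localhost:9621", "notes.md"))
# (hypothetical URL and path)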