feat: better icon

This commit is contained in:
Thomas Marchand
2025-12-17 14:44:08 +00:00
parent 932f463f0d
commit 7ce644b77a
9 changed files with 1428 additions and 4 deletions

View File

@@ -37,7 +37,7 @@ src/
├── memory/ # Supabase + pgvector persistence
├── mcp/ # MCP server registry + config
├── llm/ # OpenRouter client
├── tools/ # File ops, terminal, git, web, search
├── tools/ # File ops, terminal, git, web, search, desktop
├── task/ # Task types + verification
└── api/ # HTTP routes (axum)
```
@@ -81,6 +81,23 @@ Analyzes failure signals to decide action:
| External error | API/network issues | **Retry** same config |
| Infeasible | consistent failures | **Stop** |
## Desktop Automation
When `DESKTOP_ENABLED=true`, the agent has access to desktop automation tools:
| Tool | Purpose |
|------|---------|
| `desktop_start_session` | Start Xvfb + i3 virtual desktop |
| `desktop_stop_session` | Clean up desktop session |
| `desktop_screenshot` | Capture screen (returns PNG path) |
| `desktop_type` | Type text or send key combos |
| `desktop_click` | Mouse click at coordinates |
| `desktop_get_text` | Extract text via AT-SPI or OCR |
| `desktop_mouse_move` | Move mouse without clicking |
| `desktop_scroll` | Scroll wheel at position |
Setup: see `docs/DESKTOP_SETUP.md` and run `scripts/install_desktop.sh` on the server.
## After Significant Changes
When you make architectural changes to this codebase, **update the Cursor rules**:

View File

@@ -69,6 +69,14 @@ Template file for local credentials. Copy to `secrets.json` and fill in values.
| `PORT` | `3000` | Server port |
| `MAX_ITERATIONS` | `50` | Max agent loop iterations |
### Desktop Automation (Optional)
| Variable | Default | Description |
|----------|---------|-------------|
| `DESKTOP_ENABLED` | `false` | Enable desktop_* tools (Xvfb, xdotool, etc.) |
| `DESKTOP_RESOLUTION` | `1920x1080` | Virtual display resolution |
| `DESKTOP_DISPLAY_START` | `99` | Starting X display number |
### Dashboard
| Variable | Description |

View File

@@ -41,6 +41,9 @@ regex = "1"
# For memory/storage
chrono = { version = "0.4", features = ["serde"] }
# For desktop tools (process management on Unix)
libc = "0.2"
# Auth (JWT)
jsonwebtoken = "9"

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 20 KiB

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 20 KiB

255
docs/DESKTOP_SETUP.md Normal file
View File

@@ -0,0 +1,255 @@
# Desktop Environment Setup
This guide covers setting up a headless desktop environment for the Open Agent to control browsers and graphical applications.
## Overview
The desktop automation stack consists of:
- **Xvfb**: Virtual framebuffer for headless X11
- **i3**: Minimal, deterministic window manager
- **xdotool**: Keyboard and mouse automation
- **scrot**: Screenshot capture
- **Chromium**: Web browser
- **AT-SPI2**: Accessibility tree extraction
- **Tesseract**: OCR fallback for text extraction
## Installation (Ubuntu/Debian)
```bash
# Update package list
apt update
# Install core X11 and window manager
apt install -y xvfb i3 x11-utils
# Install automation tools
apt install -y xdotool scrot imagemagick
# Install Chromium browser
apt install -y chromium chromium-sandbox
# Install accessibility tools (AT-SPI2)
apt install -y at-spi2-core libatspi2.0-0 python3-gi python3-gi-cairo gir1.2-atspi-2.0
# Install OCR
apt install -y tesseract-ocr
# Install fonts for proper rendering
apt install -y fonts-liberation fonts-dejavu-core
```
## i3 Configuration
Create a minimal, deterministic i3 config at `/root/.config/i3/config`:
```bash
mkdir -p /root/.config/i3
cat > /root/.config/i3/config << 'EOF'
# Open Agent i3 Config - Minimal and Deterministic
# No decorations, no animations, simple layout
# Use Super (Mod4) as modifier
set $mod Mod4
# Font for window titles (not shown due to no decorations)
font pango:DejaVu Sans Mono 10
# Remove window decorations
default_border none
default_floating_border none
# No gaps (the `gaps` directives below need i3 4.22 or newer; delete them on older releases)
gaps inner 0
gaps outer 0
# Disable focus-follows-mouse so focus only changes on explicit actions (predictable behavior)
focus_follows_mouse no
# Disable window titlebars completely
for_window [class=".*"] border pixel 0
# Make all windows float by default for easier positioning
# (comment out if you prefer tiling)
# for_window [class=".*"] floating enable
# Chromium-specific: make sure browser windows are drawn without a border
for_window [class="Chromium"] border pixel 0
for_window [class="chromium"] border pixel 0
# Keybindings (minimal set)
bindsym $mod+Return exec chromium --no-sandbox --disable-gpu
bindsym $mod+Shift+q kill
bindsym $mod+d exec dmenu_run
# Focus movement
bindsym $mod+h focus left
bindsym $mod+j focus down
bindsym $mod+k focus up
bindsym $mod+l focus right
# Exit i3
bindsym $mod+Shift+e exit
# Reload config
bindsym $mod+Shift+r reload
# Workspace setup (just workspace 1)
workspace 1 output primary
EOF
```
## Environment Variables
Add these to `/etc/open_agent/open_agent.env`:
```bash
# Enable desktop automation tools
DESKTOP_ENABLED=true
# Xvfb resolution (width x height)
DESKTOP_RESOLUTION=1920x1080
# Starting display number (incremented for every new session)
DESKTOP_DISPLAY_START=99
```
## Manual Testing
Test the setup manually before enabling for the agent:
```bash
# Start Xvfb on display :99
Xvfb :99 -screen 0 1920x1080x24 &
export DISPLAY=:99
# Start i3 window manager
i3 &
# Launch Chromium
chromium --no-sandbox --disable-gpu &
# Take a screenshot
sleep 2
scrot /tmp/test_screenshot.png
# Verify screenshot exists
ls -la /tmp/test_screenshot.png
# Test xdotool
xdotool getactivewindow
# Clean up
pkill -f "Xvfb :99"
```
## AT-SPI Accessibility Tree
Test accessibility tree extraction:
```bash
export DISPLAY=:99
export DBUS_SESSION_BUS_ADDRESS=unix:path=/tmp/dbus-session-$$
# Start dbus session (required for AT-SPI)
dbus-daemon --session --fork --address=$DBUS_SESSION_BUS_ADDRESS
# Python script to dump accessibility tree
python3 << 'EOF'
import gi
gi.require_version('Atspi', '2.0')
from gi.repository import Atspi
def print_tree(obj, indent=0):
try:
name = obj.get_name() or ""
role = obj.get_role_name()
if name or role != "unknown":
print(" " * indent + f"[{role}] {name}")
for i in range(obj.get_child_count()):
child = obj.get_child_at_index(i)
if child:
print_tree(child, indent + 1)
except Exception as e:
pass
desktop = Atspi.get_desktop(0)
for i in range(desktop.get_child_count()):
app = desktop.get_child_at_index(i)
if app:
print_tree(app)
EOF
```
## OCR with Tesseract
Test OCR on a screenshot:
```bash
# Take screenshot and run OCR
DISPLAY=:99 scrot /tmp/screen.png
tesseract /tmp/screen.png stdout
# With language hint
tesseract /tmp/screen.png stdout -l eng
```
## Troubleshooting
### Xvfb won't start
```bash
# Check if display is already in use
ls -la /tmp/.X*-lock
# Remove stale lock files
rm -f /tmp/.X99-lock /tmp/.X11-unix/X99
```
### Chromium sandbox issues
Always use the `--no-sandbox` flag when running Chromium as root:
```bash
chromium --no-sandbox --disable-gpu
```
### xdotool can't find windows
```bash
# List all windows
xdotool search --name ""
# Ensure DISPLAY is set
echo $DISPLAY
```
### AT-SPI not working
```bash
# Ensure dbus is running
export $(dbus-launch)
# Enable AT-SPI for Chromium
chromium --force-renderer-accessibility --no-sandbox
```
### No fonts rendering
```bash
# Install additional fonts
apt install -y fonts-noto fonts-freefont-ttf
# Rebuild font cache
fc-cache -fv
```
## Security Considerations
- The agent runs with full system access
- Xvfb sessions are isolated per-task
- Sessions are cleaned up when tasks complete
- Chromium runs with `--no-sandbox` (required for root, but limits isolation)
- Consider running in a container for additional isolation
## Session Lifecycle
1. **Task starts**: Agent calls `desktop_start_session`
2. **Xvfb starts**: Virtual display created at `:99` (or next available)
3. **i3 starts**: Window manager provides predictable layout
4. **Browser launches**: Chromium opens (if requested)
5. **Agent works**: Screenshots, clicks, typing via desktop_* tools
6. **Task ends**: `desktop_stop_session` kills Xvfb and children
7. **Cleanup**: Any orphaned sessions are killed on task failure

138
scripts/install_desktop.sh Executable file
View File

@@ -0,0 +1,138 @@
#!/bin/bash
# Install desktop automation dependencies for Open Agent
# Run this on the production server: bash scripts/install_desktop.sh
set -e

# Never block on interactive package-configuration prompts during an unattended run.
export DEBIAN_FRONTEND=noninteractive

echo "=== Installing desktop automation packages ==="

# Update package list.
# apt-get is used instead of apt because apt's command-line interface is meant for
# interactive use and is not guaranteed to stay stable for scripts.
apt-get update

# Install core X11 and window manager
echo "Installing Xvfb and i3..."
apt-get install -y xvfb i3 x11-utils

# Install automation tools
echo "Installing xdotool and screenshot tools..."
apt-get install -y xdotool scrot imagemagick

# Install Chromium browser (the package is called chromium-browser on some releases)
echo "Installing Chromium..."
apt-get install -y chromium chromium-sandbox || apt-get install -y chromium-browser

# Install accessibility tools (AT-SPI2)
echo "Installing AT-SPI2 for accessibility tree..."
apt-get install -y at-spi2-core libatspi2.0-0 python3-gi python3-gi-cairo gir1.2-atspi-2.0

# Install OCR
echo "Installing Tesseract OCR..."
apt-get install -y tesseract-ocr

# Install fonts for proper rendering
echo "Installing fonts..."
apt-get install -y fonts-liberation fonts-dejavu-core fonts-noto
# Create i3 config directory
echo "Creating i3 configuration..."
mkdir -p /root/.config/i3

# Write i3 config.
# The heredoc delimiter is quoted so that $mod is written literally instead of being
# expanded by the shell.
# No `gaps` directives are written: they only exist in i3 4.22 and newer, and older
# releases report a configuration error for them. Gaps are off by default anyway.
cat > /root/.config/i3/config << 'EOF'
# Open Agent i3 Config - Minimal and Deterministic
# No decorations, no animations, simple layout

# Use Super (Mod4) as modifier
set $mod Mod4

# Font for window titles (not shown due to no decorations)
font pango:DejaVu Sans Mono 10

# Remove window decorations
default_border none
default_floating_border none

# Disable focus-follows-mouse so focus only changes on explicit actions
focus_follows_mouse no

# Disable window titlebars completely
for_window [class=".*"] border pixel 0

# Chromium-specific: make sure browser windows are drawn without a border
for_window [class="Chromium"] border pixel 0
for_window [class="chromium"] border pixel 0

# Keybindings (minimal set)
bindsym $mod+Return exec chromium --no-sandbox --disable-gpu
bindsym $mod+Shift+q kill
bindsym $mod+d exec dmenu_run

# Focus movement
bindsym $mod+h focus left
bindsym $mod+j focus down
bindsym $mod+k focus up
bindsym $mod+l focus right

# Exit i3
bindsym $mod+Shift+e exit

# Reload config
bindsym $mod+Shift+r reload

# Workspace setup (just workspace 1)
workspace 1 output primary
EOF
echo "i3 configuration written to /root/.config/i3/config"
# Add DESKTOP_ENABLED to environment file
echo "Enabling desktop in environment..."
ENV_FILE=/etc/open_agent/open_agent.env

# The configuration directory may not exist yet on a fresh host. Without it the
# appends below fail and `set -e` aborts the script before the remaining steps run.
mkdir -p "$(dirname "$ENV_FILE")"

# Only append the settings once, so re-running the installer does not duplicate them.
if ! grep -q "DESKTOP_ENABLED" "$ENV_FILE" 2>/dev/null; then
    {
        echo ""
        echo "# Desktop automation"
        echo "DESKTOP_ENABLED=true"
        echo "DESKTOP_RESOLUTION=1920x1080"
        echo "DESKTOP_DISPLAY_START=99"
    } >> "$ENV_FILE"
fi
# Make sure the agent's working directories exist (including the screenshots folder).
echo "Creating working directories..."
for required_dir in /root/work/screenshots /root/tools; do
    mkdir -p "$required_dir"
done
# Verify the installation: print one status line per required tool.

# report_tool LABEL CANDIDATE...
# Prints "LABEL: ", then the path of the first candidate found by `which` followed by
# "OK", or "MISSING" when none of the candidates is on PATH.
report_tool() {
    local label="$1"
    shift
    echo -n "${label}: "
    local candidate
    for candidate in "$@"; do
        if which "$candidate"; then
            echo "OK"
            return 0
        fi
    done
    echo "MISSING"
}

echo ""
echo "=== Testing installation ==="
report_tool "Xvfb" Xvfb
report_tool "i3" i3
report_tool "xdotool" xdotool
report_tool "scrot" scrot
# Either binary name counts, matching the two package names tried during installation.
report_tool "chromium" chromium chromium-browser
report_tool "tesseract" tesseract

# The Python GObject bindings are a module, not a binary, so they are probed by importing them.
echo -n "python3 with gi: "
python3 -c "import gi; print('OK')" 2>/dev/null || echo "MISSING"

# Closing instructions (quoted heredoc: printed verbatim).
cat << 'EOF'

=== Installation complete ===
Run: systemctl restart open_agent
To test manually:
 Xvfb :99 -screen 0 1920x1080x24 &
 DISPLAY=:99 i3 &
 DISPLAY=:99 chromium --no-sandbox &
 DISPLAY=:99 scrot /tmp/test.png
EOF

965
src/tools/desktop.rs Normal file
View File

@@ -0,0 +1,965 @@
//! Desktop automation tools for controlling graphical applications.
//!
//! This module provides tools for:
//! - Managing Xvfb virtual display sessions
//! - Taking screenshots
//! - Keyboard input (typing)
//! - Mouse operations (clicking)
//! - Extracting visible text (AT-SPI + OCR)
//!
//! Requires: Xvfb, i3, xdotool, scrot, tesseract, AT-SPI2
//! Only available when DESKTOP_ENABLED=true
use std::path::Path;
use std::process::Stdio;
use std::sync::atomic::{AtomicU32, Ordering};
use async_trait::async_trait;
use serde_json::{json, Value};
use tokio::process::Command;
use super::Tool;
/// Global counter for display numbers to avoid conflicts.
///
/// Starts at 99. `StartSession::execute` takes the current value with
/// `fetch_add(1, Ordering::SeqCst)`, so each started session in this process
/// receives the next higher display number.
///
/// NOTE(review): the documented `DESKTOP_DISPLAY_START` setting is not read by any
/// code visible here; confirm whether this initial value is meant to come from it.
static DISPLAY_COUNTER: AtomicU32 = AtomicU32::new(99);
/// Report whether the desktop automation tools have been switched on.
///
/// Reads the `DESKTOP_ENABLED` environment variable. The tools count as enabled
/// only when the value lowercases to `true` or is exactly `1`. An unset or
/// unreadable variable, or any other value, means disabled.
fn desktop_enabled() -> bool {
    match std::env::var("DESKTOP_ENABLED") {
        Ok(flag) => flag == "1" || flag.to_lowercase() == "true",
        Err(_) => false,
    }
}
/// Return the resolution string used for the virtual display.
///
/// Uses the value of `DESKTOP_RESOLUTION` when the variable can be read and
/// falls back to `1920x1080` otherwise. The value is returned as-is, without
/// validation.
fn get_resolution() -> String {
    match std::env::var("DESKTOP_RESOLUTION") {
        Ok(configured) => configured,
        Err(_) => String::from("1920x1080"),
    }
}
/// Run `program` with `args` against the X display `display` and collect its output.
///
/// The child process gets `DISPLAY` set to `display`; its stdout and stderr are
/// captured.
///
/// # Returns
/// `(stdout, stderr, exit_code)`. Both streams are decoded lossily as UTF-8.
/// `exit_code` is `-1` when the process finished without an exit code.
///
/// # Errors
/// Fails when the program cannot be executed, or when it does not finish within
/// `timeout_secs` seconds. In the timeout case the child is killed.
async fn run_with_display(
    display: &str,
    program: &str,
    args: &[&str],
    timeout_secs: u64,
) -> anyhow::Result<(String, String, i32)> {
    let mut command = Command::new(program);
    command
        .args(args)
        .env("DISPLAY", display)
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        // Without this a timed-out child would keep running in the background after
        // the `output()` future is dropped by the timeout below.
        .kill_on_drop(true);

    let output = match tokio::time::timeout(
        std::time::Duration::from_secs(timeout_secs),
        command.output(),
    )
    .await
    {
        Ok(Ok(output)) => output,
        Ok(Err(e)) => return Err(anyhow::anyhow!("Failed to execute {}: {}", program, e)),
        Err(_) => return Err(anyhow::anyhow!("Command {} timed out", program)),
    };

    let stdout = String::from_utf8_lossy(&output.stdout).to_string();
    let stderr = String::from_utf8_lossy(&output.stderr).to_string();
    let exit_code = output.status.code().unwrap_or(-1);
    Ok((stdout, stderr, exit_code))
}
/// Start a new desktop session with Xvfb and i3.
///
/// Creates a virtual X11 display and starts the i3 window manager.
/// Returns the display identifier (e.g., ":99") for use with other desktop tools.
pub struct StartSession;

#[async_trait]
impl Tool for StartSession {
    fn name(&self) -> &str {
        "desktop_start_session"
    }

    fn description(&self) -> &str {
        "Start a virtual desktop session (Xvfb + i3 window manager). Returns the DISPLAY identifier (e.g., ':99') needed for other desktop_* tools. Call this before using any other desktop tools. Optionally launches Chromium browser."
    }

    fn parameters_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "launch_browser": {
                    "type": "boolean",
                    "description": "If true, automatically launch Chromium browser after starting the session (default: false)"
                },
                "url": {
                    "type": "string",
                    "description": "Optional URL to open in Chromium (only used if launch_browser is true)"
                }
            },
            "required": []
        })
    }

    /// Start Xvfb and i3 on the next display number and optionally launch Chromium.
    ///
    /// Writes `.desktop_session_<n>` into `working_dir` with the PIDs of the started
    /// processes (including the browser, when one is launched) and creates
    /// `working_dir/screenshots`.
    ///
    /// # Returns
    /// A JSON object (serialized to a string) with `success`, `display`, `resolution`,
    /// `xvfb_pid`, `i3_pid`, `screenshots_dir` and, when a browser was launched,
    /// `browser`, `browser_pid` and `url`. A PID that could not be determined is
    /// reported as `null`.
    ///
    /// # Errors
    /// Fails when desktop tools are disabled, when Xvfb, i3 or Chromium cannot be
    /// started, when Xvfb exits right after starting, or when the session files
    /// cannot be written.
    async fn execute(&self, args: Value, working_dir: &Path) -> anyhow::Result<String> {
        if !desktop_enabled() {
            return Err(anyhow::anyhow!(
                "Desktop tools are disabled. Set DESKTOP_ENABLED=true to enable."
            ));
        }

        // Get next display number
        let display_num = DISPLAY_COUNTER.fetch_add(1, Ordering::SeqCst);
        let display_id = format!(":{}", display_num);
        let resolution = get_resolution();
        tracing::info!(display = %display_id, resolution = %resolution, "Starting desktop session");

        // Clean up any stale lock files
        let lock_file = format!("/tmp/.X{}-lock", display_num);
        let socket_file = format!("/tmp/.X11-unix/X{}", display_num);
        let _ = std::fs::remove_file(&lock_file);
        let _ = std::fs::remove_file(&socket_file);

        // Start Xvfb
        let xvfb_args = format!("{} -screen 0 {}x24", display_id, resolution);
        let mut xvfb = Command::new("Xvfb")
            .args(xvfb_args.split_whitespace())
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .spawn()
            .map_err(|e| anyhow::anyhow!("Failed to start Xvfb: {}. Is Xvfb installed?", e))?;
        // `None` (serialized as null) rather than 0: PID 0 addresses the caller's own
        // process group when passed to kill(2), so it must never be recorded.
        let xvfb_pid = xvfb.id();

        // Wait for Xvfb to be ready
        tokio::time::sleep(std::time::Duration::from_millis(500)).await;

        // Verify Xvfb is running
        if let Ok(Some(status)) = xvfb.try_wait() {
            return Err(anyhow::anyhow!(
                "Xvfb exited immediately with status: {:?}",
                status
            ));
        }

        // Start i3 window manager
        let i3 = match Command::new("i3")
            .env("DISPLAY", &display_id)
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .spawn()
        {
            Ok(child) => child,
            Err(e) => {
                // No session file exists yet, so nothing else would ever stop this
                // X server: shut it down before reporting the failure.
                let _ = xvfb.kill().await;
                return Err(anyhow::anyhow!(
                    "Failed to start i3: {}. Is i3 installed?",
                    e
                ));
            }
        };
        let i3_pid = i3.id();

        // Wait for i3 to initialize
        tokio::time::sleep(std::time::Duration::from_millis(500)).await;

        // Create screenshots directory in working dir
        let screenshots_dir = working_dir.join("screenshots");
        std::fs::create_dir_all(&screenshots_dir)?;

        // Save session info to a file for cleanup. It is written before the browser is
        // launched so that the session can be stopped even if that launch fails.
        let session_file = working_dir.join(format!(".desktop_session_{}", display_num));
        let mut session_info = json!({
            "display": display_id,
            "display_num": display_num,
            "xvfb_pid": xvfb_pid,
            "i3_pid": i3_pid,
            "resolution": resolution,
            "screenshots_dir": screenshots_dir.to_string_lossy()
        });
        std::fs::write(&session_file, serde_json::to_string_pretty(&session_info)?)?;

        // Build the reply as a JSON value so that every string is escaped correctly.
        let mut result = json!({
            "success": true,
            "display": display_id,
            "resolution": resolution,
            "xvfb_pid": xvfb_pid,
            "i3_pid": i3_pid,
            "screenshots_dir": screenshots_dir.display().to_string()
        });

        // Optionally launch browser
        let launch_browser = args["launch_browser"].as_bool().unwrap_or(false);
        if launch_browser {
            let url = args["url"].as_str().unwrap_or("about:blank");
            let chromium = Command::new("chromium")
                .args([
                    "--no-sandbox",
                    "--disable-gpu",
                    "--disable-software-rasterizer",
                    "--disable-dev-shm-usage",
                    "--force-renderer-accessibility",
                    url,
                ])
                .env("DISPLAY", &display_id)
                .stdout(Stdio::null())
                .stderr(Stdio::null())
                .spawn()
                .map_err(|e| anyhow::anyhow!("Failed to start Chromium: {}", e))?;
            let chromium_pid = chromium.id();

            // Record the browser PID too: `desktop_stop_session` looks for `browser_pid`
            // in the session file when terminating the session's processes.
            session_info["browser_pid"] = json!(chromium_pid);
            std::fs::write(&session_file, serde_json::to_string_pretty(&session_info)?)?;

            // Wait for browser to load
            tokio::time::sleep(std::time::Duration::from_secs(2)).await;

            result["browser"] = json!("chromium");
            result["browser_pid"] = json!(chromium_pid);
            result["url"] = json!(url);
        }

        Ok(result.to_string())
    }
}
/// Stop a desktop session and clean up resources.
pub struct StopSession;

#[async_trait]
impl Tool for StopSession {
    fn name(&self) -> &str {
        "desktop_stop_session"
    }

    fn description(&self) -> &str {
        "Stop a virtual desktop session. Kills Xvfb and all associated processes. Call this when done with desktop automation."
    }

    fn parameters_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "display": {
                    "type": "string",
                    "description": "The display identifier (e.g., ':99') returned by desktop_start_session"
                }
            },
            "required": ["display"]
        })
    }

    /// Terminate the processes of the session on `display` and remove its files.
    ///
    /// Sends `SIGTERM` to the PIDs recorded in `working_dir/.desktop_session_<n>`
    /// (if that file exists and parses), deletes the file, runs `pkill` for the
    /// matching Xvfb command line as a fallback, and removes the X lock and socket
    /// files of the display. Apart from argument validation every step is
    /// best-effort.
    ///
    /// # Returns
    /// A JSON object string with `success`, `display` and `killed_pids` (the PIDs
    /// that were actually signalled).
    ///
    /// # Errors
    /// Fails when `display` is missing or is not a display number optionally
    /// prefixed with `:`.
    async fn execute(&self, args: Value, working_dir: &Path) -> anyhow::Result<String> {
        let display_id = args["display"]
            .as_str()
            .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?;

        // Extract display number
        let display_num: u32 = display_id
            .trim_start_matches(':')
            .parse()
            .map_err(|_| anyhow::anyhow!("Invalid display format: {}", display_id))?;

        tracing::info!(display = %display_id, "Stopping desktop session");

        // Read session file if it exists
        let session_file = working_dir.join(format!(".desktop_session_{}", display_num));
        let mut killed_pids = Vec::new();
        if session_file.exists() {
            if let Ok(content) = std::fs::read_to_string(&session_file) {
                if let Ok(session_info) = serde_json::from_str::<Value>(&content) {
                    // Kill processes by PID
                    for pid_key in ["xvfb_pid", "i3_pid", "browser_pid"] {
                        // Only real child PIDs are acceptable. 0 would signal this
                        // process's own process group, 1 is init, and values that do
                        // not fit into an i32 would wrap to negative numbers, which
                        // kill(2) interprets as process groups.
                        let pid = session_info[pid_key]
                            .as_u64()
                            .and_then(|raw| i32::try_from(raw).ok())
                            .filter(|pid| *pid > 1);
                        if let Some(pid) = pid {
                            // SAFETY: `kill` only takes plain integers and has no
                            // memory-safety preconditions; `pid` is checked above to
                            // name a single process rather than a process group.
                            let rc = unsafe { libc::kill(pid, libc::SIGTERM) };
                            if rc == 0 {
                                killed_pids.push(pid);
                            }
                        }
                    }
                }
            }
            let _ = std::fs::remove_file(&session_file);
        }

        // Also kill by display pattern (fallback). The display number must be followed
        // by a space or the end of the command line, so that stopping ":9" cannot match
        // the Xvfb of ":99".
        let _ = Command::new("pkill")
            .args(["-f", &format!("Xvfb :{}( |$)", display_num)])
            .output()
            .await;

        // Clean up lock files
        let lock_file = format!("/tmp/.X{}-lock", display_num);
        let socket_file = format!("/tmp/.X11-unix/X{}", display_num);
        let _ = std::fs::remove_file(&lock_file);
        let _ = std::fs::remove_file(&socket_file);

        Ok(format!(
            "{{\"success\": true, \"display\": \"{}\", \"killed_pids\": {:?}}}",
            display_id, killed_pids
        ))
    }
}
/// Take a screenshot of the desktop.
pub struct Screenshot;

#[async_trait]
impl Tool for Screenshot {
    fn name(&self) -> &str {
        "desktop_screenshot"
    }

    fn description(&self) -> &str {
        "Take a screenshot of the virtual desktop. Returns the file path to the saved PNG image. You can then use read_file to view the image (supports vision)."
    }

    fn parameters_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "display": {
                    "type": "string",
                    "description": "The display identifier (e.g., ':99') from desktop_start_session"
                },
                "filename": {
                    "type": "string",
                    "description": "Optional filename for the screenshot (default: auto-generated with timestamp)"
                },
                "region": {
                    "type": "object",
                    "description": "Optional region to capture (x, y, width, height)",
                    "properties": {
                        "x": { "type": "integer" },
                        "y": { "type": "integer" },
                        "width": { "type": "integer" },
                        "height": { "type": "integer" }
                    }
                }
            },
            "required": ["display"]
        })
    }

    /// Capture the display into `working_dir/screenshots/<filename>`.
    ///
    /// `scrot` is tried first (honouring `region`, with missing fields defaulting to
    /// x=0, y=0, width=100, height=100). If it exits with a non-zero code, ImageMagick's
    /// `import` captures the whole root window instead; the region is not applied in
    /// that fallback. An existing file with the same name is overwritten.
    ///
    /// # Returns
    /// A JSON object string with `success`, `path` and `size_bytes`.
    ///
    /// # Errors
    /// Fails when `display` is missing, when the screenshots directory cannot be
    /// created, when both capture tools fail, or when no file was produced.
    async fn execute(&self, args: Value, working_dir: &Path) -> anyhow::Result<String> {
        let display_id = args["display"]
            .as_str()
            .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?;

        // Generate filename. The default includes milliseconds: with one-second
        // resolution, two screenshots taken within the same second would silently
        // overwrite each other (scrot runs with -o).
        let filename = args["filename"].as_str().map(|s| s.to_string()).unwrap_or_else(|| {
            let timestamp = chrono::Utc::now().format("%Y%m%d_%H%M%S_%3f");
            format!("screenshot_{}.png", timestamp)
        });

        // Ensure screenshots directory exists
        let screenshots_dir = working_dir.join("screenshots");
        std::fs::create_dir_all(&screenshots_dir)?;
        let filepath = screenshots_dir.join(&filename);
        let filepath_arg = filepath.to_string_lossy().to_string();

        tracing::info!(display = %display_id, path = %filepath.display(), "Taking screenshot");

        // Build scrot command
        let mut scrot_args = vec!["-o".to_string(), filepath_arg.clone()];

        // Add region if specified
        if let Some(region) = args.get("region") {
            if region.is_object() {
                let x = region["x"].as_i64().unwrap_or(0);
                let y = region["y"].as_i64().unwrap_or(0);
                let w = region["width"].as_i64().unwrap_or(100);
                let h = region["height"].as_i64().unwrap_or(100);
                scrot_args.push("-a".to_string());
                scrot_args.push(format!("{},{},{},{}", x, y, w, h));
            }
        }

        let scrot_argv: Vec<&str> = scrot_args.iter().map(|s| s.as_str()).collect();
        let (_stdout, stderr, exit_code) =
            run_with_display(display_id, "scrot", &scrot_argv, 30).await?;

        if exit_code != 0 {
            // Try import as fallback. Its exit code has to be checked as well: otherwise
            // a leftover file with the same name from an earlier capture would make a
            // failed screenshot look successful in the existence check below.
            let import_result = run_with_display(
                display_id,
                "import",
                &["-window", "root", filepath_arg.as_str()],
                30,
            )
            .await;
            match import_result {
                Ok((_, _, 0)) => {}
                Ok((_, import_stderr, import_code)) => {
                    return Err(anyhow::anyhow!(
                        "Screenshot failed. scrot error: {}. import exited with code {}: {}",
                        stderr,
                        import_code,
                        import_stderr
                    ));
                }
                Err(e) => {
                    return Err(anyhow::anyhow!(
                        "Screenshot failed. scrot error: {}. import error: {}",
                        stderr,
                        e
                    ));
                }
            }
        }

        // Verify file exists
        if !filepath.exists() {
            return Err(anyhow::anyhow!("Screenshot file was not created"));
        }

        let metadata = std::fs::metadata(&filepath)?;

        // Serialize through serde_json so that quotes or backslashes in the path cannot
        // produce invalid JSON.
        Ok(json!({
            "success": true,
            "path": filepath.display().to_string(),
            "size_bytes": metadata.len()
        })
        .to_string())
    }
}
/// Send keyboard input to the desktop.
pub struct TypeText;

#[async_trait]
impl Tool for TypeText {
    fn name(&self) -> &str {
        "desktop_type"
    }

    fn description(&self) -> &str {
        "Send keyboard input to the virtual desktop. Can type text or send special keys (Return, Tab, Escape, ctrl+a, alt+F4, etc.). Text is typed into the currently focused window."
    }

    fn parameters_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "display": {
                    "type": "string",
                    "description": "The display identifier (e.g., ':99')"
                },
                "text": {
                    "type": "string",
                    "description": "Text to type. For special keys, use key names: 'Return', 'Tab', 'Escape', 'BackSpace', 'Delete', 'Up', 'Down', 'Left', 'Right', 'Home', 'End', 'Page_Up', 'Page_Down', 'F1'-'F12'"
                },
                "key": {
                    "type": "string",
                    "description": "Send a key combination instead of typing text. Examples: 'Return', 'ctrl+a', 'alt+F4', 'ctrl+shift+t', 'super+Return'"
                },
                "delay_ms": {
                    "type": "integer",
                    "description": "Delay between keystrokes in milliseconds (default: 12, increase for slow applications)"
                }
            },
            "required": ["display"],
            "oneOf": [
                { "required": ["text"] },
                { "required": ["key"] }
            ]
        })
    }

    /// Type `text` or send the `key` combination on the display via `xdotool`.
    ///
    /// When both `text` and `key` are given, `text` wins. `delay_ms` defaults to 12.
    ///
    /// # Returns
    /// A JSON object string with `success`, `command` (`type` or `key`) and `input`.
    ///
    /// # Errors
    /// Fails when `display` is missing, when neither `text` nor `key` is provided,
    /// or when `xdotool` cannot be run or exits with a non-zero code.
    async fn execute(&self, args: Value, _working_dir: &Path) -> anyhow::Result<String> {
        let display_id = args["display"]
            .as_str()
            .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?;
        let delay_ms = args["delay_ms"].as_u64().unwrap_or(12);

        let (command, input) = if let Some(text) = args["text"].as_str() {
            // Type text character by character
            ("type", text.to_string())
        } else if let Some(key) = args["key"].as_str() {
            // Send key combination
            ("key", key.to_string())
        } else {
            return Err(anyhow::anyhow!("Either 'text' or 'key' must be provided"));
        };

        tracing::info!(display = %display_id, command = %command, "Sending keyboard input");

        let delay_arg = delay_ms.to_string();
        let (_stdout, stderr, exit_code) = run_with_display(
            display_id,
            "xdotool",
            // "--" ends option parsing, so input that starts with '-' (for example the
            // text "-v") is typed instead of being rejected as an unknown option.
            &[command, "--delay", &delay_arg, "--", &input],
            30,
        )
        .await?;

        if exit_code != 0 {
            return Err(anyhow::anyhow!("xdotool failed: {}", stderr));
        }

        // Serialize through serde_json: the input may contain quotes, backslashes or
        // control characters that must be escaped to keep the reply valid JSON.
        Ok(json!({
            "success": true,
            "command": command,
            "input": input
        })
        .to_string())
    }
}
/// Click at a position on the desktop.
pub struct Click;

#[async_trait]
impl Tool for Click {
    fn name(&self) -> &str {
        "desktop_click"
    }

    fn description(&self) -> &str {
        "Click at a specific position on the virtual desktop. Supports left, middle, right click and double-click. Coordinates are in pixels from top-left (0,0)."
    }

    fn parameters_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "display": {
                    "type": "string",
                    "description": "The display identifier (e.g., ':99')"
                },
                "x": {
                    "type": "integer",
                    "description": "X coordinate in pixels from left edge"
                },
                "y": {
                    "type": "integer",
                    "description": "Y coordinate in pixels from top edge"
                },
                "button": {
                    "type": "string",
                    "enum": ["left", "middle", "right"],
                    "description": "Mouse button to click (default: 'left')"
                },
                "double": {
                    "type": "boolean",
                    "description": "If true, perform a double-click (default: false)"
                },
                "hold_ms": {
                    "type": "integer",
                    "description": "Hold the click for this many milliseconds (for drag operations, use with move)"
                }
            },
            "required": ["display", "x", "y"]
        })
    }

    /// Move the pointer to (`x`, `y`) and click there via `xdotool`.
    ///
    /// Without `hold_ms` a single or double click is performed. With `hold_ms` the
    /// button is pressed, held for that many milliseconds and released; `double` is
    /// ignored in that case because a held press is a single press.
    ///
    /// # Returns
    /// A JSON object string with `success`, `x`, `y`, `button`, `double` and, when a
    /// hold was requested, `hold_ms`.
    ///
    /// # Errors
    /// Fails when `display`, `x` or `y` is missing, when a coordinate is negative,
    /// when `button` is not one of `left`, `middle`, `right`, or when an `xdotool`
    /// invocation cannot be run or exits with a non-zero code.
    async fn execute(&self, args: Value, _working_dir: &Path) -> anyhow::Result<String> {
        let display_id = args["display"]
            .as_str()
            .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?;
        let x = args["x"]
            .as_i64()
            .ok_or_else(|| anyhow::anyhow!("Missing 'x' argument"))?;
        let y = args["y"]
            .as_i64()
            .ok_or_else(|| anyhow::anyhow!("Missing 'y' argument"))?;

        // Coordinates are measured from the top-left corner. A negative value would
        // also reach xdotool as an argument starting with '-', i.e. as an option.
        if x < 0 || y < 0 {
            return Err(anyhow::anyhow!(
                "Coordinates must be non-negative, got ({}, {})",
                x,
                y
            ));
        }

        let button_name = args["button"].as_str().unwrap_or("left");
        let button = match button_name {
            "left" => "1",
            "middle" => "2",
            "right" => "3",
            other => return Err(anyhow::anyhow!("Invalid button: {}", other)),
        };

        let hold_ms = args["hold_ms"].as_u64();
        // `double` only applies to plain clicks.
        let double = hold_ms.is_none() && args["double"].as_bool().unwrap_or(false);
        let repeat = if double { "2" } else { "1" };

        tracing::info!(display = %display_id, x = x, y = y, button = button, "Clicking");

        // Move to position first
        let (_, stderr, exit_code) = run_with_display(
            display_id,
            "xdotool",
            &["mousemove", &x.to_string(), &y.to_string()],
            10,
        )
        .await?;
        if exit_code != 0 {
            return Err(anyhow::anyhow!("xdotool mousemove failed: {}", stderr));
        }

        // Small delay to ensure move completes
        tokio::time::sleep(std::time::Duration::from_millis(50)).await;

        if let Some(hold) = hold_ms {
            // Press, keep the button down for the requested time, then release.
            let (_, stderr, exit_code) =
                run_with_display(display_id, "xdotool", &["mousedown", button], 10).await?;
            if exit_code != 0 {
                return Err(anyhow::anyhow!("xdotool mousedown failed: {}", stderr));
            }

            tokio::time::sleep(std::time::Duration::from_millis(hold)).await;

            let (_, stderr, exit_code) =
                run_with_display(display_id, "xdotool", &["mouseup", button], 10).await?;
            if exit_code != 0 {
                return Err(anyhow::anyhow!("xdotool mouseup failed: {}", stderr));
            }
        } else {
            // Click
            let (_, stderr, exit_code) = run_with_display(
                display_id,
                "xdotool",
                &["click", "--repeat", repeat, button],
                10,
            )
            .await?;
            if exit_code != 0 {
                return Err(anyhow::anyhow!("xdotool click failed: {}", stderr));
            }
        }

        let hold_field = hold_ms
            .map(|ms| format!(", \"hold_ms\": {}", ms))
            .unwrap_or_default();
        // `button_name` was validated against the three allowed names above, so it
        // needs no escaping here.
        Ok(format!(
            "{{\"success\": true, \"x\": {}, \"y\": {}, \"button\": \"{}\", \"double\": {}{}}}",
            x, y, button_name, double, hold_field
        ))
    }
}
/// Extract visible text from the desktop using AT-SPI or OCR.
pub struct GetText;

#[async_trait]
impl Tool for GetText {
    fn name(&self) -> &str {
        "desktop_get_text"
    }

    fn description(&self) -> &str {
        "Extract visible text from the virtual desktop. Uses the accessibility tree (AT-SPI) for structured output with element types, or falls back to OCR (Tesseract) for raw text. The accessibility tree provides better structure for web pages and applications."
    }

    fn parameters_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "display": {
                    "type": "string",
                    "description": "The display identifier (e.g., ':99')"
                },
                "method": {
                    "type": "string",
                    "enum": ["accessibility", "ocr", "both"],
                    "description": "Method to extract text. 'accessibility' uses AT-SPI (best for browsers/apps), 'ocr' uses Tesseract (works on any content), 'both' tries accessibility first then OCR (default: 'accessibility')"
                },
                "max_depth": {
                    "type": "integer",
                    "description": "Maximum depth to traverse in accessibility tree (default: 10)"
                }
            },
            "required": ["display"]
        })
    }

    /// Extract text from the display with the requested `method`.
    ///
    /// * `accessibility` (default): AT-SPI tree only. An extraction error is returned
    ///   as an `accessibility_error` section instead of failing the call.
    /// * `ocr`: OCR only. An error is returned as an `ocr_error` section.
    /// * `both`: AT-SPI first; OCR runs only when AT-SPI produced no text or failed.
    ///
    /// # Returns
    /// One or more sections of the form `--- <name> ---\n<content>`, with surrounding
    /// whitespace trimmed.
    ///
    /// # Errors
    /// Fails when `display` is missing, when `method` is not one of the three known
    /// values, or when no section was produced (for example an empty accessibility
    /// tree with method `accessibility`).
    async fn execute(&self, args: Value, working_dir: &Path) -> anyhow::Result<String> {
        let display_id = args["display"]
            .as_str()
            .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?;
        let method = args["method"].as_str().unwrap_or("accessibility");
        let max_depth = args["max_depth"].as_u64().unwrap_or(10);

        // Reject unknown methods up front; otherwise no extractor runs and the caller
        // only sees the generic "No text extraction method succeeded".
        if !matches!(method, "accessibility" | "ocr" | "both") {
            return Err(anyhow::anyhow!(
                "Invalid method: {} (expected 'accessibility', 'ocr' or 'both')",
                method
            ));
        }

        tracing::info!(display = %display_id, method = %method, "Extracting text");

        let mut results = Vec::new();

        // Try accessibility tree
        if method == "accessibility" || method == "both" {
            match get_accessibility_text(display_id, max_depth).await {
                Ok(text) if !text.trim().is_empty() => {
                    results.push(("accessibility", text));
                }
                Ok(_) => {
                    tracing::debug!("Accessibility tree returned empty");
                }
                Err(e) => {
                    tracing::warn!("Accessibility tree extraction failed: {}", e);
                    if method == "accessibility" {
                        // With "both" the OCR attempt below takes over; with
                        // "accessibility" alone the error text is what gets reported.
                        results.push(("accessibility_error", e.to_string()));
                    }
                }
            }
        }

        // Try OCR
        if method == "ocr" || (method == "both" && results.is_empty()) {
            match get_ocr_text(display_id, working_dir).await {
                Ok(text) => {
                    results.push(("ocr", text));
                }
                Err(e) => {
                    tracing::warn!("OCR extraction failed: {}", e);
                    results.push(("ocr_error", e.to_string()));
                }
            }
        }

        // Format output
        if results.is_empty() {
            return Err(anyhow::anyhow!("No text extraction method succeeded"));
        }

        // Successful and error sections share the same layout.
        let mut output = String::new();
        for (section_name, content) in &results {
            output.push_str(&format!("--- {} ---\n{}\n\n", section_name, content));
        }
        Ok(output.trim().to_string())
    }
}
/// Dump the text of the AT-SPI accessibility tree for `display`.
///
/// Runs an inline Python script (via `python3 -c`, with a 30 second limit) that walks
/// every application on the AT-SPI desktop down to `max_depth` levels and prints one
/// `[role] content` line per element that has a name or text, indented by depth and
/// with the content cut to 500 characters.
///
/// # Errors
/// Fails when the script cannot be run or exits with a non-zero code; the script's
/// stderr is included in the message.
async fn get_accessibility_text(display: &str, max_depth: u64) -> anyhow::Result<String> {
    // Braces that belong to Python (f-strings) are doubled; `{depth_limit}` is the only
    // placeholder and is substituted in both places where the limit is needed.
    let script = format!(
        r#"
import gi
import sys
gi.require_version('Atspi', '2.0')
from gi.repository import Atspi
def get_text(obj, depth=0, max_depth={depth_limit}):
    if depth > max_depth:
        return ""
    result = []
    try:
        name = obj.get_name() or ""
        role = obj.get_role_name()
        # Get text content if available
        text = ""
        try:
            text_iface = obj.get_text()
            if text_iface:
                text = text_iface.get_text(0, text_iface.get_character_count())
        except:
            pass
        # Include meaningful content
        if name or text:
            indent = " " * depth
            content = text or name
            if content.strip():
                result.append(f"{{indent}}[{{role}}] {{content[:500]}}")
        # Recurse into children
        for i in range(obj.get_child_count()):
            child = obj.get_child_at_index(i)
            if child:
                child_text = get_text(child, depth + 1, max_depth)
                if child_text:
                    result.append(child_text)
    except Exception as e:
        pass
    return "\n".join(result)
try:
    desktop = Atspi.get_desktop(0)
    output = []
    for i in range(desktop.get_child_count()):
        app = desktop.get_child_at_index(i)
        if app:
            app_text = get_text(app, 0, {depth_limit})
            if app_text.strip():
                output.append(app_text)
    print("\n".join(output))
except Exception as e:
    print(f"Error: {{e}}", file=sys.stderr)
    sys.exit(1)
"#,
        depth_limit = max_depth
    );

    let (tree_dump, errors, status) =
        run_with_display(display, "python3", &["-c", &script], 30).await?;

    match status {
        0 => Ok(tree_dump),
        _ => Err(anyhow::anyhow!("AT-SPI extraction failed: {}", errors)),
    }
}
/// Extract text using OCR (Tesseract).
///
/// Captures the given X `display` with `scrot` into
/// `<working_dir>/screenshots/_ocr_temp.png`, runs
/// `tesseract <file> stdout -l eng` on that file and returns the recognised
/// text (lossily decoded as UTF-8).
///
/// The temporary screenshot is removed on a best-effort basis once tesseract
/// has been attempted, whether it succeeded, failed, or could not be started.
///
/// # Errors
/// Returns an error if the screenshots directory cannot be created, if
/// `run_with_display` fails or `scrot` exits with a non-zero status, if
/// `tesseract` cannot be spawned, or if it exits unsuccessfully (the error
/// then carries tesseract's stderr).
async fn get_ocr_text(display: &str, working_dir: &Path) -> anyhow::Result<String> {
    // Take a screenshot first
    let screenshots_dir = working_dir.join("screenshots");
    std::fs::create_dir_all(&screenshots_dir)?;
    let screenshot_path = screenshots_dir.join("_ocr_temp.png");
    // Convert the path once; both external commands take it as a plain string.
    let screenshot_arg = screenshot_path.to_string_lossy();

    // `-o` lets scrot overwrite a stale file left behind by an earlier run.
    let (_, stderr, exit_code) =
        run_with_display(display, "scrot", &["-o", &*screenshot_arg], 30).await?;
    if exit_code != 0 {
        return Err(anyhow::anyhow!("Failed to take screenshot for OCR: {}", stderr));
    }

    // Run tesseract, but do not propagate a spawn failure yet: the temporary
    // screenshot must be removed first, otherwise it would be leaked whenever
    // tesseract is missing or cannot be started.
    let ocr_result = Command::new("tesseract")
        .args([&*screenshot_arg, "stdout", "-l", "eng"])
        .output()
        .await;

    // Clean up temp screenshot (best effort; a failure here is not fatal).
    let _ = std::fs::remove_file(&screenshot_path);

    let output = ocr_result.map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?;
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(anyhow::anyhow!("Tesseract failed: {}", stderr));
    }

    Ok(String::from_utf8_lossy(&output.stdout).into_owned())
}
/// Tool that repositions the mouse pointer on a display without pressing any button.
pub struct MouseMove;

#[async_trait]
impl Tool for MouseMove {
    /// Identifier under which this tool is exposed.
    fn name(&self) -> &str {
        "desktop_mouse_move"
    }

    /// Short description of the tool's purpose.
    fn description(&self) -> &str {
        "Move the mouse cursor to a specific position without clicking. Useful for hover effects or preparing for drag operations."
    }

    /// JSON schema of the accepted arguments: `display`, `x` and `y`, all required.
    fn parameters_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "display": {
                    "type": "string",
                    "description": "The display identifier (e.g., ':99')"
                },
                "x": {
                    "type": "integer",
                    "description": "X coordinate in pixels from left edge"
                },
                "y": {
                    "type": "integer",
                    "description": "Y coordinate in pixels from top edge"
                }
            },
            "required": ["display", "x", "y"]
        })
    }

    /// Runs `xdotool mousemove <x> <y>` for the requested display.
    ///
    /// Returns a small JSON string echoing the coordinates on success.
    ///
    /// # Errors
    /// Fails when `display`, `x` or `y` is missing or has the wrong JSON type,
    /// when `run_with_display` itself fails, or when xdotool exits with a
    /// non-zero status (the error then carries xdotool's stderr).
    async fn execute(&self, args: Value, _working_dir: &Path) -> anyhow::Result<String> {
        // Validate the arguments in the same order as they are declared in the schema.
        let Some(display) = args["display"].as_str() else {
            return Err(anyhow::anyhow!("Missing 'display' argument"));
        };
        let Some(target_x) = args["x"].as_i64() else {
            return Err(anyhow::anyhow!("Missing 'x' argument"));
        };
        let Some(target_y) = args["y"].as_i64() else {
            return Err(anyhow::anyhow!("Missing 'y' argument"));
        };

        tracing::info!(display = %display, x = target_x, y = target_y, "Moving mouse");

        // xdotool takes the coordinates as plain decimal strings.
        let x_arg = target_x.to_string();
        let y_arg = target_y.to_string();
        let xdotool_args = ["mousemove", x_arg.as_str(), y_arg.as_str()];

        // NOTE(review): the trailing `10` is presumably a timeout in seconds — confirm
        // against `run_with_display`.
        let (_, stderr, exit_code) =
            run_with_display(display, "xdotool", &xdotool_args, 10).await?;

        match exit_code {
            0 => Ok(format!("{{\"success\": true, \"x\": {}, \"y\": {}}}", target_x, target_y)),
            _ => Err(anyhow::anyhow!("xdotool mousemove failed: {}", stderr)),
        }
    }
}
/// Scroll the mouse wheel, optionally moving the pointer to a position first.
pub struct Scroll;

#[async_trait]
impl Tool for Scroll {
    /// Identifier under which this tool is exposed.
    fn name(&self) -> &str {
        "desktop_scroll"
    }

    /// Short description of the tool's purpose.
    fn description(&self) -> &str {
        "Scroll the mouse wheel at the current position or at specified coordinates. Positive amount scrolls down, negative scrolls up."
    }

    /// JSON schema of the accepted arguments: `display` and `amount` are
    /// required, `x` / `y` are optional.
    fn parameters_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "display": {
                    "type": "string",
                    "description": "The display identifier (e.g., ':99')"
                },
                "amount": {
                    "type": "integer",
                    "description": "Scroll amount. Positive = down, negative = up. Each unit is typically one 'click' of the scroll wheel."
                },
                "x": {
                    "type": "integer",
                    "description": "Optional: X coordinate to scroll at (moves mouse first)"
                },
                "y": {
                    "type": "integer",
                    "description": "Optional: Y coordinate to scroll at (moves mouse first)"
                }
            },
            "required": ["display", "amount"]
        })
    }

    /// Scrolls by issuing one `xdotool click` per wheel step.
    ///
    /// When both `x` and `y` are given, the pointer is first moved there with
    /// `xdotool mousemove`. Returns a JSON string echoing the amount and the
    /// direction (`"down"` for `amount >= 0`, `"up"` otherwise).
    ///
    /// # Errors
    /// Fails when `display` or `amount` is missing or has the wrong JSON type,
    /// when only one of `x` / `y` is supplied or either is not an integer,
    /// when `run_with_display` fails, or when xdotool exits with a non-zero
    /// status (the error then carries xdotool's stderr).
    async fn execute(&self, args: Value, _working_dir: &Path) -> anyhow::Result<String> {
        let display_id = args["display"]
            .as_str()
            .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?;
        let amount = args["amount"]
            .as_i64()
            .ok_or_else(|| anyhow::anyhow!("Missing 'amount' argument"))?;

        // `x` and `y` only make sense as a pair. Silently ignoring a lone or
        // non-integer coordinate would scroll at whatever position the pointer
        // happens to be, so reject that explicitly. Absent and `null` values
        // both count as "omitted".
        let target = match (args["x"].as_i64(), args["y"].as_i64()) {
            (Some(x), Some(y)) => Some((x, y)),
            (None, None) if args["x"].is_null() && args["y"].is_null() => None,
            _ => {
                return Err(anyhow::anyhow!(
                    "'x' and 'y' must both be integers, or both be omitted"
                ));
            }
        };

        // Move to position if specified
        if let Some((x, y)) = target {
            let (_, stderr, exit_code) = run_with_display(
                display_id,
                "xdotool",
                &["mousemove", &x.to_string(), &y.to_string()],
                10,
            )
            .await?;
            if exit_code != 0 {
                return Err(anyhow::anyhow!("xdotool mousemove failed: {}", stderr));
            }
            // Short pause after the move before the wheel events are sent.
            tokio::time::sleep(std::time::Duration::from_millis(50)).await;
        }

        tracing::info!(display = %display_id, amount = amount, "Scrolling");

        // xdotool uses button 4 for scroll up, button 5 for scroll down.
        let (button, direction) = if amount >= 0 { ("5", "down") } else { ("4", "up") };
        // The number of wheel steps is the magnitude of `amount`; iterating
        // over the `u64` directly avoids a truncating cast to `usize`.
        let clicks = amount.unsigned_abs();

        for _ in 0..clicks {
            let (_, stderr, exit_code) =
                run_with_display(display_id, "xdotool", &["click", button], 10).await?;
            if exit_code != 0 {
                return Err(anyhow::anyhow!("xdotool scroll failed: {}", stderr));
            }
            // Small gap between consecutive wheel steps.
            tokio::time::sleep(std::time::Duration::from_millis(20)).await;
        }

        Ok(format!(
            "{{\"success\": true, \"amount\": {}, \"direction\": \"{}\"}}",
            amount, direction
        ))
    }
}

View File

@@ -7,6 +7,7 @@
//! and search anywhere on the machine. The `working_dir` parameter is the default directory
//! for relative paths (typically `/root` in production).
mod desktop;
mod directory;
mod file_ops;
mod git;
@@ -97,6 +98,33 @@ impl ToolRegistry {
tools.insert("ui_optionList".to_string(), Arc::new(ui::UiOptionList));
tools.insert("ui_dataTable".to_string(), Arc::new(ui::UiDataTable));
// Desktop automation (conditional on DESKTOP_ENABLED)
if std::env::var("DESKTOP_ENABLED")
.map(|v| v.to_lowercase() == "true" || v == "1")
.unwrap_or(false)
{
tools.insert(
"desktop_start_session".to_string(),
Arc::new(desktop::StartSession),
);
tools.insert(
"desktop_stop_session".to_string(),
Arc::new(desktop::StopSession),
);
tools.insert(
"desktop_screenshot".to_string(),
Arc::new(desktop::Screenshot),
);
tools.insert("desktop_type".to_string(), Arc::new(desktop::TypeText));
tools.insert("desktop_click".to_string(), Arc::new(desktop::Click));
tools.insert("desktop_get_text".to_string(), Arc::new(desktop::GetText));
tools.insert(
"desktop_mouse_move".to_string(),
Arc::new(desktop::MouseMove),
);
tools.insert("desktop_scroll".to_string(), Arc::new(desktop::Scroll));
}
Self { tools }
}