diff --git a/.cursor/rules/project.mdc b/.cursor/rules/project.mdc index 354118c..7fd8dda 100644 --- a/.cursor/rules/project.mdc +++ b/.cursor/rules/project.mdc @@ -37,7 +37,7 @@ src/ ├── memory/ # Supabase + pgvector persistence ├── mcp/ # MCP server registry + config ├── llm/ # OpenRouter client -├── tools/ # File ops, terminal, git, web, search +├── tools/ # File ops, terminal, git, web, search, desktop ├── task/ # Task types + verification └── api/ # HTTP routes (axum) ``` @@ -81,6 +81,23 @@ Analyzes failure signals to decide action: | External error | API/network issues | **Retry** same config | | Infeasible | consistent failures | **Stop** | +## Desktop Automation + +When `DESKTOP_ENABLED=true`, the agent has access to desktop automation tools: + +| Tool | Purpose | +|------|---------| +| `desktop_start_session` | Start Xvfb + i3 virtual desktop | +| `desktop_stop_session` | Clean up desktop session | +| `desktop_screenshot` | Capture screen (returns PNG path) | +| `desktop_type` | Type text or send key combos | +| `desktop_click` | Mouse click at coordinates | +| `desktop_get_text` | Extract text via AT-SPI or OCR | +| `desktop_mouse_move` | Move mouse without clicking | +| `desktop_scroll` | Scroll wheel at position | + +Setup: See `docs/DESKTOP_SETUP.md` and run `scripts/install_desktop.sh` on server. + ## After Significant Changes When you make architectural changes to this codebase, **update the Cursor rules**: diff --git a/.cursor/rules/secrets.mdc b/.cursor/rules/secrets.mdc index b37e05a..5638aac 100644 --- a/.cursor/rules/secrets.mdc +++ b/.cursor/rules/secrets.mdc @@ -69,6 +69,14 @@ Template file for local credentials. Copy to `secrets.json` and fill in values. | `PORT` | `3000` | Server port | | `MAX_ITERATIONS` | `50` | Max agent loop iterations | +### Desktop Automation (Optional) + +| Variable | Default | Description | +|----------|---------|-------------| +| `DESKTOP_ENABLED` | `false` | Enable desktop_* tools (Xvfb, xdotool, etc.) 
| +| `DESKTOP_RESOLUTION` | `1920x1080` | Virtual display resolution | +| `DESKTOP_DISPLAY_START` | `99` | Starting X display number | + ### Dashboard | Variable | Description | diff --git a/Cargo.toml b/Cargo.toml index 0717e56..4006c79 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,9 @@ regex = "1" # For memory/storage chrono = { version = "0.4", features = ["serde"] } +# For desktop tools (process management on Unix) +libc = "0.2" + # Auth (JWT) jsonwebtoken = "9" diff --git a/dashboard/public/favicon.svg b/dashboard/public/favicon.svg index 93cdd89..768c95b 100644 --- a/dashboard/public/favicon.svg +++ b/dashboard/public/favicon.svg @@ -1,5 +1,10 @@ - - - + + + + + + + + diff --git a/dashboard/public/line_logo.svg b/dashboard/public/line_logo.svg new file mode 100644 index 0000000..93cdd89 --- /dev/null +++ b/dashboard/public/line_logo.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/docs/DESKTOP_SETUP.md b/docs/DESKTOP_SETUP.md new file mode 100644 index 0000000..94207ba --- /dev/null +++ b/docs/DESKTOP_SETUP.md @@ -0,0 +1,255 @@ +# Desktop Environment Setup + +This guide covers setting up a headless desktop environment for the Open Agent to control browsers and graphical applications. 
+ +## Overview + +The desktop automation stack consists of: +- **Xvfb**: Virtual framebuffer for headless X11 +- **i3**: Minimal, deterministic window manager +- **xdotool**: Keyboard and mouse automation +- **scrot**: Screenshot capture +- **Chromium**: Web browser +- **AT-SPI2**: Accessibility tree extraction +- **Tesseract**: OCR fallback for text extraction + +## Installation (Ubuntu/Debian) + +```bash +# Update package list +apt update + +# Install core X11 and window manager +apt install -y xvfb i3 x11-utils + +# Install automation tools +apt install -y xdotool scrot imagemagick + +# Install Chromium browser +apt install -y chromium chromium-sandbox + +# Install accessibility tools (AT-SPI2) +apt install -y at-spi2-core libatspi2.0-0 python3-gi python3-gi-cairo gir1.2-atspi-2.0 + +# Install OCR +apt install -y tesseract-ocr + +# Install fonts for proper rendering +apt install -y fonts-liberation fonts-dejavu-core +``` + +## i3 Configuration + +Create a minimal, deterministic i3 config at `/root/.config/i3/config`: + +```bash +mkdir -p /root/.config/i3 +cat > /root/.config/i3/config << 'EOF' +# Open Agent i3 Config - Minimal and Deterministic +# No decorations, no animations, simple layout + +# Use Super (Mod4) as modifier +set $mod Mod4 + +# Font for window titles (not shown due to no decorations) +font pango:DejaVu Sans Mono 10 + +# Remove window decorations +default_border none +default_floating_border none + +# No gaps (the gaps directives require i3 >= 4.22; delete both lines on older i3, where gaps are always 0) +gaps inner 0 +gaps outer 0 + +# Focus does NOT follow the mouse (predictable behavior: only clicks change focus) +focus_follows_mouse no + +# Disable window titlebars completely +for_window [class=".*"] border pixel 0 + +# Make all windows float by default for easier positioning +# (uncomment the next line to enable; leave it commented if you prefer tiling) +# for_window [class=".*"] floating enable + +# Chromium-specific: remove window borders (both class spellings) +for_window [class="Chromium"] border pixel 0 +for_window [class="chromium"] border pixel 0 + +# Keybindings (minimal set) +bindsym $mod+Return exec chromium 
--no-sandbox --disable-gpu +bindsym $mod+Shift+q kill +bindsym $mod+d exec dmenu_run + +# Focus movement +bindsym $mod+h focus left +bindsym $mod+j focus down +bindsym $mod+k focus up +bindsym $mod+l focus right + +# Exit i3 +bindsym $mod+Shift+e exit + +# Reload config +bindsym $mod+Shift+r reload + +# Workspace setup (just workspace 1) +workspace 1 output primary +EOF +``` + +## Environment Variables + +Add these to `/etc/open_agent/open_agent.env`: + +```bash +# Enable desktop automation tools +DESKTOP_ENABLED=true + +# Xvfb resolution (width x height) +DESKTOP_RESOLUTION=1920x1080 + +# Starting display number (will increment for concurrent sessions) +DESKTOP_DISPLAY_START=99 +``` + +## Manual Testing + +Test the setup manually before enabling for the agent: + +```bash +# Start Xvfb on display :99 +Xvfb :99 -screen 0 1920x1080x24 & +export DISPLAY=:99 + +# Start i3 window manager +i3 & + +# Launch Chromium +chromium --no-sandbox --disable-gpu & + +# Take a screenshot +sleep 2 +scrot /tmp/test_screenshot.png + +# Verify screenshot exists +ls -la /tmp/test_screenshot.png + +# Test xdotool +xdotool getactivewindow + +# Clean up +pkill -f "Xvfb :99" +``` + +## AT-SPI Accessibility Tree + +Test accessibility tree extraction: + +```bash +export DISPLAY=:99 +export DBUS_SESSION_BUS_ADDRESS=unix:path=/tmp/dbus-session-$$ + +# Start dbus session (required for AT-SPI) +dbus-daemon --session --fork --address=$DBUS_SESSION_BUS_ADDRESS + +# Python script to dump accessibility tree +python3 << 'EOF' +import gi +gi.require_version('Atspi', '2.0') +from gi.repository import Atspi + +def print_tree(obj, indent=0): + try: + name = obj.get_name() or "" + role = obj.get_role_name() + if name or role != "unknown": + print(" " * indent + f"[{role}] {name}") + for i in range(obj.get_child_count()): + child = obj.get_child_at_index(i) + if child: + print_tree(child, indent + 1) + except Exception as e: + pass + +desktop = Atspi.get_desktop(0) +for i in range(desktop.get_child_count()): 
+ app = desktop.get_child_at_index(i) + if app: + print_tree(app) +EOF +``` + +## OCR with Tesseract + +Test OCR on a screenshot: + +```bash +# Take screenshot and run OCR +DISPLAY=:99 scrot /tmp/screen.png +tesseract /tmp/screen.png stdout + +# With language hint +tesseract /tmp/screen.png stdout -l eng +``` + +## Troubleshooting + +### Xvfb won't start +```bash +# Check if display is already in use +ls -la /tmp/.X*-lock +# Remove stale lock files +rm -f /tmp/.X99-lock /tmp/.X11-unix/X99 +``` + +### Chromium sandbox issues +Always use `--no-sandbox` flag when running as root: +```bash +chromium --no-sandbox --disable-gpu +``` + +### xdotool can't find windows +```bash +# List all windows +xdotool search --name "" + +# Ensure DISPLAY is set +echo $DISPLAY +``` + +### AT-SPI not working +```bash +# Ensure dbus is running +export $(dbus-launch) + +# Enable AT-SPI for Chromium +chromium --force-renderer-accessibility --no-sandbox +``` + +### No fonts rendering +```bash +# Install additional fonts +apt install -y fonts-noto fonts-freefont-ttf + +# Rebuild font cache +fc-cache -fv +``` + +## Security Considerations + +- The agent runs with full system access +- Xvfb sessions are isolated per-task +- Sessions are cleaned up when tasks complete +- Chromium runs with `--no-sandbox` (required for root, but limits isolation) +- Consider running in a container for additional isolation + +## Session Lifecycle + +1. **Task starts**: Agent calls `desktop_start_session` +2. **Xvfb starts**: Virtual display created at `:99` (or next available) +3. **i3 starts**: Window manager provides predictable layout +4. **Browser launches**: Chromium opens (if requested) +5. **Agent works**: Screenshots, clicks, typing via desktop_* tools +6. **Task ends**: `desktop_stop_session` kills Xvfb and children +7. 
**Cleanup**: Any orphaned sessions killed on task failure diff --git a/scripts/install_desktop.sh b/scripts/install_desktop.sh new file mode 100755 index 0000000..23640fe --- /dev/null +++ b/scripts/install_desktop.sh @@ -0,0 +1,138 @@
#!/bin/bash
# Install desktop automation dependencies for Open Agent.
# Run this on the production server: bash scripts/install_desktop.sh
#
# What it does:
#   1. Installs Xvfb, i3, xdotool, scrot, Chromium, AT-SPI2, Tesseract and fonts via apt.
#   2. Writes a minimal i3 config to /root/.config/i3/config.
#   3. Appends the DESKTOP_* variables to the Open Agent environment file
#      (only if DESKTOP_ENABLED is not already present there).
#   4. Creates the working directories and reports which binaries are present.

set -e

ENV_FILE=/etc/open_agent/open_agent.env

# The script writes under /root and /etc and calls apt, so fail early with a
# clear message instead of half-way through.
if [ "$(id -u)" -ne 0 ]; then
    echo "This script must be run as root" >&2
    exit 1
fi

# Print "<label>: <path> OK" or "<label>: MISSING" for the first binary found.
check_binary() {
    local label="$1"
    shift
    echo -n "${label}: "
    local candidate
    for candidate in "$@"; do
        if command -v "$candidate"; then
            echo "OK"
            return 0
        fi
    done
    echo "MISSING"
}

echo "=== Installing desktop automation packages ==="

# Update package list
apt update

# Install core X11 and window manager
echo "Installing Xvfb and i3..."
apt install -y xvfb i3 x11-utils

# Install automation tools
echo "Installing xdotool and screenshot tools..."
apt install -y xdotool scrot imagemagick

# Install Chromium browser (package name differs between Debian and Ubuntu)
echo "Installing Chromium..."
apt install -y chromium chromium-sandbox || apt install -y chromium-browser

# Install accessibility tools (AT-SPI2)
echo "Installing AT-SPI2 for accessibility tree..."
apt install -y at-spi2-core libatspi2.0-0 python3-gi python3-gi-cairo gir1.2-atspi-2.0

# Install OCR
echo "Installing Tesseract OCR..."
apt install -y tesseract-ocr

# Install fonts for proper rendering
echo "Installing fonts..."
apt install -y fonts-liberation fonts-dejavu-core fonts-noto

# Create i3 config directory
echo "Creating i3 configuration..."
mkdir -p /root/.config/i3

# Write i3 config
cat > /root/.config/i3/config << 'EOF'
# Open Agent i3 Config - Minimal and Deterministic
# No decorations, no animations, simple layout

# Use Super (Mod4) as modifier
set $mod Mod4

# Font for window titles (not shown due to no decorations)
font pango:DejaVu Sans Mono 10

# Remove window decorations
default_border none
default_floating_border none

# No gaps: i3 defaults to zero gaps. The explicit "gaps inner/outer 0"
# directives are omitted because i3 < 4.22 rejects them as config errors
# and shows an error bar that would appear in every screenshot.

# Focus does NOT follow the mouse (predictable behavior: only clicks change focus)
focus_follows_mouse no

# Disable window titlebars completely
for_window [class=".*"] border pixel 0

# Chromium-specific: remove window borders (both class spellings)
for_window [class="Chromium"] border pixel 0
for_window [class="chromium"] border pixel 0

# Keybindings (minimal set)
bindsym $mod+Return exec chromium --no-sandbox --disable-gpu
bindsym $mod+Shift+q kill
bindsym $mod+d exec dmenu_run

# Focus movement
bindsym $mod+h focus left
bindsym $mod+j focus down
bindsym $mod+k focus up
bindsym $mod+l focus right

# Exit i3
bindsym $mod+Shift+e exit

# Reload config
bindsym $mod+Shift+r reload

# Workspace setup (just workspace 1)
workspace 1 output primary
EOF

echo "i3 configuration written to /root/.config/i3/config"

# Add DESKTOP_* settings to the environment file.
# The directory may not exist yet on a fresh host; without mkdir the append
# below fails and `set -e` aborts the whole installation.
echo "Enabling desktop in environment..."
mkdir -p "$(dirname "$ENV_FILE")"
if ! grep -q "DESKTOP_ENABLED" "$ENV_FILE" 2>/dev/null; then
    {
        echo ""
        echo "# Desktop automation"
        echo "DESKTOP_ENABLED=true"
        echo "DESKTOP_RESOLUTION=1920x1080"
        echo "DESKTOP_DISPLAY_START=99"
    } >> "$ENV_FILE"
fi

# Create work and screenshots directories
echo "Creating working directories..."
mkdir -p /root/work/screenshots
mkdir -p /root/tools

# Test installation
echo ""
echo "=== Testing installation ==="

check_binary "Xvfb" Xvfb
check_binary "i3" i3
check_binary "xdotool" xdotool
check_binary "scrot" scrot
check_binary "chromium" chromium chromium-browser
check_binary "tesseract" tesseract

echo -n "python3 with gi: "
python3 -c "import gi; print('OK')" 2>/dev/null || echo "MISSING"

echo ""
echo "=== Installation complete ==="
echo "Run: systemctl restart open_agent"
echo "To test manually:"
echo "  Xvfb :99 -screen 0 1920x1080x24 &"
echo "  DISPLAY=:99 i3 &"
echo "  DISPLAY=:99 chromium --no-sandbox &"
echo "  DISPLAY=:99 scrot /tmp/test.png"
diff --git a/src/tools/desktop.rs b/src/tools/desktop.rs new file mode 100644 index 0000000..2f2b6d0 --- /dev/null +++ b/src/tools/desktop.rs @@ -0,0 +1,965 @@ +//! Desktop automation tools for controlling graphical applications. +//! +//! This module provides tools for: +//! - Managing Xvfb virtual display sessions +//! - Taking screenshots +//! - Keyboard input (typing) +//! - Mouse operations (clicking) +//! - Extracting visible text (AT-SPI + OCR) +//! +//! Requires: Xvfb, i3, xdotool, scrot, tesseract, AT-SPI2 +//! 
Only available when DESKTOP_ENABLED=true + +use std::path::Path; +use std::process::Stdio; +use std::sync::atomic::{AtomicU32, Ordering}; + +use async_trait::async_trait; +use serde_json::{json, Value}; +use tokio::process::Command; + +use super::Tool; + +/// Global counter for display numbers to avoid conflicts +static DISPLAY_COUNTER: AtomicU32 = AtomicU32::new(99); + +/// Check if desktop tools are enabled +fn desktop_enabled() -> bool { + std::env::var("DESKTOP_ENABLED") + .map(|v| v.to_lowercase() == "true" || v == "1") + .unwrap_or(false) +} + +/// Get the configured resolution +fn get_resolution() -> String { + std::env::var("DESKTOP_RESOLUTION").unwrap_or_else(|_| "1920x1080".to_string()) +} + +/// Run a command with DISPLAY environment variable set +async fn run_with_display( + display: &str, + program: &str, + args: &[&str], + timeout_secs: u64, +) -> anyhow::Result<(String, String, i32)> { + let output = match tokio::time::timeout( + std::time::Duration::from_secs(timeout_secs), + Command::new(program) + .args(args) + .env("DISPLAY", display) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output(), + ) + .await + { + Ok(Ok(output)) => output, + Ok(Err(e)) => return Err(anyhow::anyhow!("Failed to execute {}: {}", program, e)), + Err(_) => return Err(anyhow::anyhow!("Command {} timed out", program)), + }; + + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let exit_code = output.status.code().unwrap_or(-1); + + Ok((stdout, stderr, exit_code)) +} + +/// Start a new desktop session with Xvfb and i3. +/// +/// Creates a virtual X11 display and starts the i3 window manager. +/// Returns the display identifier (e.g., ":99") for use with other desktop tools. 
+pub struct StartSession; + +#[async_trait] +impl Tool for StartSession { + fn name(&self) -> &str { + "desktop_start_session" + } + + fn description(&self) -> &str { + "Start a virtual desktop session (Xvfb + i3 window manager). Returns the DISPLAY identifier (e.g., ':99') needed for other desktop_* tools. Call this before using any other desktop tools. Optionally launches Chromium browser." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "launch_browser": { + "type": "boolean", + "description": "If true, automatically launch Chromium browser after starting the session (default: false)" + }, + "url": { + "type": "string", + "description": "Optional URL to open in Chromium (only used if launch_browser is true)" + } + }, + "required": [] + }) + } + + async fn execute(&self, args: Value, working_dir: &Path) -> anyhow::Result { + if !desktop_enabled() { + return Err(anyhow::anyhow!( + "Desktop tools are disabled. Set DESKTOP_ENABLED=true to enable." + )); + } + + // Get next display number + let display_num = DISPLAY_COUNTER.fetch_add(1, Ordering::SeqCst); + let display_id = format!(":{}", display_num); + let resolution = get_resolution(); + + tracing::info!(display = %display_id, resolution = %resolution, "Starting desktop session"); + + // Clean up any stale lock files + let lock_file = format!("/tmp/.X{}-lock", display_num); + let socket_file = format!("/tmp/.X11-unix/X{}", display_num); + let _ = std::fs::remove_file(&lock_file); + let _ = std::fs::remove_file(&socket_file); + + // Start Xvfb + let xvfb_args = format!("{} -screen 0 {}x24", display_id, resolution); + let mut xvfb = Command::new("Xvfb") + .args(xvfb_args.split_whitespace()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .map_err(|e| anyhow::anyhow!("Failed to start Xvfb: {}. 
Is Xvfb installed?", e))?; + + let xvfb_pid = xvfb.id().unwrap_or(0); + + // Wait for Xvfb to be ready + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + + // Verify Xvfb is running + if let Ok(Some(status)) = xvfb.try_wait() { + return Err(anyhow::anyhow!( + "Xvfb exited immediately with status: {:?}", + status + )); + } + + // Start i3 window manager + let i3 = Command::new("i3") + .env("DISPLAY", &display_id) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .map_err(|e| anyhow::anyhow!("Failed to start i3: {}. Is i3 installed?", e))?; + + let i3_pid = i3.id().unwrap_or(0); + + // Wait for i3 to initialize + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + + // Create screenshots directory in working dir + let screenshots_dir = working_dir.join("screenshots"); + std::fs::create_dir_all(&screenshots_dir)?; + + // Save session info to a file for cleanup + let session_file = working_dir.join(format!(".desktop_session_{}", display_num)); + let session_info = json!({ + "display": display_id, + "display_num": display_num, + "xvfb_pid": xvfb_pid, + "i3_pid": i3_pid, + "resolution": resolution, + "screenshots_dir": screenshots_dir.to_string_lossy() + }); + std::fs::write(&session_file, serde_json::to_string_pretty(&session_info)?)?; + + // Optionally launch browser + let launch_browser = args["launch_browser"].as_bool().unwrap_or(false); + let browser_info = if launch_browser { + let url = args["url"].as_str().unwrap_or("about:blank"); + + let chromium = Command::new("chromium") + .args([ + "--no-sandbox", + "--disable-gpu", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + "--force-renderer-accessibility", + url, + ]) + .env("DISPLAY", &display_id) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .map_err(|e| anyhow::anyhow!("Failed to start Chromium: {}", e))?; + + let chromium_pid = chromium.id().unwrap_or(0); + + // Wait for browser to load + 
tokio::time::sleep(std::time::Duration::from_secs(2)).await; + + format!(", \"browser\": \"chromium\", \"browser_pid\": {}, \"url\": \"{}\"", chromium_pid, url) + } else { + String::new() + }; + + Ok(format!( + "{{\"success\": true, \"display\": \"{}\", \"resolution\": \"{}\", \"xvfb_pid\": {}, \"i3_pid\": {}, \"screenshots_dir\": \"{}\"{}}}", + display_id, + resolution, + xvfb_pid, + i3_pid, + screenshots_dir.display(), + browser_info + )) + } +} + +/// Stop a desktop session and clean up resources. +pub struct StopSession; + +#[async_trait] +impl Tool for StopSession { + fn name(&self) -> &str { + "desktop_stop_session" + } + + fn description(&self) -> &str { + "Stop a virtual desktop session. Kills Xvfb and all associated processes. Call this when done with desktop automation." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "display": { + "type": "string", + "description": "The display identifier (e.g., ':99') returned by desktop_start_session" + } + }, + "required": ["display"] + }) + } + + async fn execute(&self, args: Value, working_dir: &Path) -> anyhow::Result { + let display_id = args["display"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?; + + // Extract display number + let display_num: u32 = display_id + .trim_start_matches(':') + .parse() + .map_err(|_| anyhow::anyhow!("Invalid display format: {}", display_id))?; + + tracing::info!(display = %display_id, "Stopping desktop session"); + + // Read session file if it exists + let session_file = working_dir.join(format!(".desktop_session_{}", display_num)); + let mut killed_pids = Vec::new(); + + if session_file.exists() { + if let Ok(content) = std::fs::read_to_string(&session_file) { + if let Ok(session_info) = serde_json::from_str::(&content) { + // Kill processes by PID + for pid_key in ["xvfb_pid", "i3_pid", "browser_pid"] { + if let Some(pid) = session_info[pid_key].as_u64() { + let pid = pid as i32; + unsafe { + 
libc::kill(pid, libc::SIGTERM); + } + killed_pids.push(pid); + } + } + } + } + let _ = std::fs::remove_file(&session_file); + } + + // Also kill by display pattern (fallback) + let _ = Command::new("pkill") + .args(["-f", &format!("Xvfb {}", display_id)]) + .output() + .await; + + // Clean up lock files + let lock_file = format!("/tmp/.X{}-lock", display_num); + let socket_file = format!("/tmp/.X11-unix/X{}", display_num); + let _ = std::fs::remove_file(&lock_file); + let _ = std::fs::remove_file(&socket_file); + + Ok(format!( + "{{\"success\": true, \"display\": \"{}\", \"killed_pids\": {:?}}}", + display_id, killed_pids + )) + } +} + +/// Take a screenshot of the desktop. +pub struct Screenshot; + +#[async_trait] +impl Tool for Screenshot { + fn name(&self) -> &str { + "desktop_screenshot" + } + + fn description(&self) -> &str { + "Take a screenshot of the virtual desktop. Returns the file path to the saved PNG image. You can then use read_file to view the image (supports vision)." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "display": { + "type": "string", + "description": "The display identifier (e.g., ':99') from desktop_start_session" + }, + "filename": { + "type": "string", + "description": "Optional filename for the screenshot (default: auto-generated with timestamp)" + }, + "region": { + "type": "object", + "description": "Optional region to capture (x, y, width, height)", + "properties": { + "x": { "type": "integer" }, + "y": { "type": "integer" }, + "width": { "type": "integer" }, + "height": { "type": "integer" } + } + } + }, + "required": ["display"] + }) + } + + async fn execute(&self, args: Value, working_dir: &Path) -> anyhow::Result { + let display_id = args["display"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?; + + // Generate filename + let filename = args["filename"].as_str().map(|s| s.to_string()).unwrap_or_else(|| { + let timestamp = 
chrono::Utc::now().format("%Y%m%d_%H%M%S"); + format!("screenshot_{}.png", timestamp) + }); + + // Ensure screenshots directory exists + let screenshots_dir = working_dir.join("screenshots"); + std::fs::create_dir_all(&screenshots_dir)?; + + let filepath = screenshots_dir.join(&filename); + + tracing::info!(display = %display_id, path = %filepath.display(), "Taking screenshot"); + + // Build scrot command + let mut scrot_args = vec!["-o".to_string(), filepath.to_string_lossy().to_string()]; + + // Add region if specified + if let Some(region) = args.get("region") { + if region.is_object() { + let x = region["x"].as_i64().unwrap_or(0); + let y = region["y"].as_i64().unwrap_or(0); + let w = region["width"].as_i64().unwrap_or(100); + let h = region["height"].as_i64().unwrap_or(100); + scrot_args.push("-a".to_string()); + scrot_args.push(format!("{},{},{},{}", x, y, w, h)); + } + } + + let (_stdout, stderr, exit_code) = run_with_display( + display_id, + "scrot", + &scrot_args.iter().map(|s| s.as_str()).collect::>(), + 30, + ) + .await?; + + if exit_code != 0 { + // Try import as fallback + let import_result = run_with_display( + display_id, + "import", + &["-window", "root", filepath.to_string_lossy().as_ref()], + 30, + ) + .await; + + if let Err(e) = import_result { + return Err(anyhow::anyhow!( + "Screenshot failed. scrot error: {}. import error: {}", + stderr, + e + )); + } + } + + // Verify file exists + if !filepath.exists() { + return Err(anyhow::anyhow!("Screenshot file was not created")); + } + + let metadata = std::fs::metadata(&filepath)?; + + Ok(format!( + "{{\"success\": true, \"path\": \"{}\", \"size_bytes\": {}}}", + filepath.display(), + metadata.len() + )) + } +} + +/// Send keyboard input to the desktop. +pub struct TypeText; + +#[async_trait] +impl Tool for TypeText { + fn name(&self) -> &str { + "desktop_type" + } + + fn description(&self) -> &str { + "Send keyboard input to the virtual desktop. 
Can type text or send special keys (Return, Tab, Escape, ctrl+a, alt+F4, etc.). Text is typed into the currently focused window." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "display": { + "type": "string", + "description": "The display identifier (e.g., ':99')" + }, + "text": { + "type": "string", + "description": "Text to type. For special keys, use key names: 'Return', 'Tab', 'Escape', 'BackSpace', 'Delete', 'Up', 'Down', 'Left', 'Right', 'Home', 'End', 'Page_Up', 'Page_Down', 'F1'-'F12'" + }, + "key": { + "type": "string", + "description": "Send a key combination instead of typing text. Examples: 'Return', 'ctrl+a', 'alt+F4', 'ctrl+shift+t', 'super+Return'" + }, + "delay_ms": { + "type": "integer", + "description": "Delay between keystrokes in milliseconds (default: 12, increase for slow applications)" + } + }, + "required": ["display"], + "oneOf": [ + { "required": ["text"] }, + { "required": ["key"] } + ] + }) + } + + async fn execute(&self, args: Value, _working_dir: &Path) -> anyhow::Result { + let display_id = args["display"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?; + + let delay_ms = args["delay_ms"].as_u64().unwrap_or(12); + + let (command, input) = if let Some(text) = args["text"].as_str() { + // Type text character by character + ("type", text.to_string()) + } else if let Some(key) = args["key"].as_str() { + // Send key combination + ("key", key.to_string()) + } else { + return Err(anyhow::anyhow!("Either 'text' or 'key' must be provided")); + }; + + tracing::info!(display = %display_id, command = %command, "Sending keyboard input"); + + let (_stdout, stderr, exit_code) = run_with_display( + display_id, + "xdotool", + &[command, "--delay", &delay_ms.to_string(), &input], + 30, + ) + .await?; + + if exit_code != 0 { + return Err(anyhow::anyhow!("xdotool failed: {}", stderr)); + } + + Ok(format!( + "{{\"success\": true, \"command\": \"{}\", \"input\": \"{}\"}}", + 
command, + input.replace('\"', "\\\"").replace('\n', "\\n") + )) + } +} + +/// Click at a position on the desktop. +pub struct Click; + +#[async_trait] +impl Tool for Click { + fn name(&self) -> &str { + "desktop_click" + } + + fn description(&self) -> &str { + "Click at a specific position on the virtual desktop. Supports left, middle, right click and double-click. Coordinates are in pixels from top-left (0,0)." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "display": { + "type": "string", + "description": "The display identifier (e.g., ':99')" + }, + "x": { + "type": "integer", + "description": "X coordinate in pixels from left edge" + }, + "y": { + "type": "integer", + "description": "Y coordinate in pixels from top edge" + }, + "button": { + "type": "string", + "enum": ["left", "middle", "right"], + "description": "Mouse button to click (default: 'left')" + }, + "double": { + "type": "boolean", + "description": "If true, perform a double-click (default: false)" + }, + "hold_ms": { + "type": "integer", + "description": "Hold the click for this many milliseconds (for drag operations, use with move)" + } + }, + "required": ["display", "x", "y"] + }) + } + + async fn execute(&self, args: Value, _working_dir: &Path) -> anyhow::Result { + let display_id = args["display"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?; + + let x = args["x"] + .as_i64() + .ok_or_else(|| anyhow::anyhow!("Missing 'x' argument"))?; + let y = args["y"] + .as_i64() + .ok_or_else(|| anyhow::anyhow!("Missing 'y' argument"))?; + + let button = match args["button"].as_str().unwrap_or("left") { + "left" => "1", + "middle" => "2", + "right" => "3", + other => return Err(anyhow::anyhow!("Invalid button: {}", other)), + }; + + let double = args["double"].as_bool().unwrap_or(false); + let repeat = if double { "2" } else { "1" }; + + tracing::info!(display = %display_id, x = x, y = y, button = button, "Clicking"); + + // 
Move to position first + let (_, stderr, exit_code) = run_with_display( + display_id, + "xdotool", + &["mousemove", &x.to_string(), &y.to_string()], + 10, + ) + .await?; + + if exit_code != 0 { + return Err(anyhow::anyhow!("xdotool mousemove failed: {}", stderr)); + } + + // Small delay to ensure move completes + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + + // Click + let (_, stderr, exit_code) = run_with_display( + display_id, + "xdotool", + &["click", "--repeat", repeat, button], + 10, + ) + .await?; + + if exit_code != 0 { + return Err(anyhow::anyhow!("xdotool click failed: {}", stderr)); + } + + Ok(format!( + "{{\"success\": true, \"x\": {}, \"y\": {}, \"button\": \"{}\", \"double\": {}}}", + x, + y, + args["button"].as_str().unwrap_or("left"), + double + )) + } +} + +/// Extract visible text from the desktop using AT-SPI or OCR. +pub struct GetText; + +#[async_trait] +impl Tool for GetText { + fn name(&self) -> &str { + "desktop_get_text" + } + + fn description(&self) -> &str { + "Extract visible text from the virtual desktop. Uses the accessibility tree (AT-SPI) for structured output with element types, or falls back to OCR (Tesseract) for raw text. The accessibility tree provides better structure for web pages and applications." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "display": { + "type": "string", + "description": "The display identifier (e.g., ':99')" + }, + "method": { + "type": "string", + "enum": ["accessibility", "ocr", "both"], + "description": "Method to extract text. 
'accessibility' uses AT-SPI (best for browsers/apps), 'ocr' uses Tesseract (works on any content), 'both' tries accessibility first then OCR (default: 'accessibility')" + }, + "max_depth": { + "type": "integer", + "description": "Maximum depth to traverse in accessibility tree (default: 10)" + } + }, + "required": ["display"] + }) + } + + async fn execute(&self, args: Value, working_dir: &Path) -> anyhow::Result { + let display_id = args["display"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?; + + let method = args["method"].as_str().unwrap_or("accessibility"); + let max_depth = args["max_depth"].as_u64().unwrap_or(10); + + tracing::info!(display = %display_id, method = %method, "Extracting text"); + + let mut results = Vec::new(); + + // Try accessibility tree + if method == "accessibility" || method == "both" { + match get_accessibility_text(display_id, max_depth).await { + Ok(text) if !text.trim().is_empty() => { + results.push(("accessibility", text)); + } + Ok(_) => { + tracing::debug!("Accessibility tree returned empty"); + } + Err(e) => { + tracing::warn!("Accessibility tree extraction failed: {}", e); + if method == "accessibility" { + // Only fail if accessibility was the only method + results.push(("accessibility_error", e.to_string())); + } + } + } + } + + // Try OCR + if method == "ocr" || (method == "both" && results.is_empty()) { + match get_ocr_text(display_id, working_dir).await { + Ok(text) => { + results.push(("ocr", text)); + } + Err(e) => { + tracing::warn!("OCR extraction failed: {}", e); + results.push(("ocr_error", e.to_string())); + } + } + } + + // Format output + if results.is_empty() { + return Err(anyhow::anyhow!("No text extraction method succeeded")); + } + + let mut output = String::new(); + for (method_name, content) in results { + if method_name.ends_with("_error") { + output.push_str(&format!("--- {} ---\n{}\n\n", method_name, content)); + } else { + output.push_str(&format!("--- {} ---\n{}\n\n", 
method_name, content)); + } + } + + Ok(output.trim().to_string()) + } +} + +/// Extract text using AT-SPI accessibility tree +async fn get_accessibility_text(display: &str, max_depth: u64) -> anyhow::Result { + // Python script to extract accessibility tree + let python_script = format!( + r#" +import gi +import sys +gi.require_version('Atspi', '2.0') +from gi.repository import Atspi + +def get_text(obj, depth=0, max_depth={}): + if depth > max_depth: + return "" + + result = [] + try: + name = obj.get_name() or "" + role = obj.get_role_name() + + # Get text content if available + text = "" + try: + text_iface = obj.get_text() + if text_iface: + text = text_iface.get_text(0, text_iface.get_character_count()) + except: + pass + + # Include meaningful content + if name or text: + indent = " " * depth + content = text or name + if content.strip(): + result.append(f"{{indent}}[{{role}}] {{content[:500]}}") + + # Recurse into children + for i in range(obj.get_child_count()): + child = obj.get_child_at_index(i) + if child: + child_text = get_text(child, depth + 1, max_depth) + if child_text: + result.append(child_text) + except Exception as e: + pass + + return "\n".join(result) + +try: + desktop = Atspi.get_desktop(0) + output = [] + for i in range(desktop.get_child_count()): + app = desktop.get_child_at_index(i) + if app: + app_text = get_text(app, 0, {}) + if app_text.strip(): + output.append(app_text) + print("\n".join(output)) +except Exception as e: + print(f"Error: {{e}}", file=sys.stderr) + sys.exit(1) +"#, + max_depth, max_depth + ); + + let (stdout, stderr, exit_code) = run_with_display(display, "python3", &["-c", &python_script], 30).await?; + + if exit_code != 0 { + return Err(anyhow::anyhow!("AT-SPI extraction failed: {}", stderr)); + } + + Ok(stdout) +} + +/// Extract text using OCR (Tesseract) +async fn get_ocr_text(display: &str, working_dir: &Path) -> anyhow::Result { + // Take a screenshot first + let screenshots_dir = working_dir.join("screenshots"); + 
std::fs::create_dir_all(&screenshots_dir)?; + + let screenshot_path = screenshots_dir.join("_ocr_temp.png"); + + // Take screenshot + let (_, stderr, exit_code) = run_with_display( + display, + "scrot", + &["-o", screenshot_path.to_string_lossy().as_ref()], + 30, + ) + .await?; + + if exit_code != 0 { + return Err(anyhow::anyhow!("Failed to take screenshot for OCR: {}", stderr)); + } + + // Run tesseract + let output = Command::new("tesseract") + .args([screenshot_path.to_string_lossy().as_ref(), "stdout", "-l", "eng"]) + .output() + .await + .map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?; + + // Clean up temp screenshot + let _ = std::fs::remove_file(&screenshot_path); + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(anyhow::anyhow!("Tesseract failed: {}", stderr)); + } + + let text = String::from_utf8_lossy(&output.stdout).to_string(); + Ok(text) +} + +/// Move the mouse to a position (without clicking). +pub struct MouseMove; + +#[async_trait] +impl Tool for MouseMove { + fn name(&self) -> &str { + "desktop_mouse_move" + } + + fn description(&self) -> &str { + "Move the mouse cursor to a specific position without clicking. Useful for hover effects or preparing for drag operations." 
+ } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "display": { + "type": "string", + "description": "The display identifier (e.g., ':99')" + }, + "x": { + "type": "integer", + "description": "X coordinate in pixels from left edge" + }, + "y": { + "type": "integer", + "description": "Y coordinate in pixels from top edge" + } + }, + "required": ["display", "x", "y"] + }) + } + + async fn execute(&self, args: Value, _working_dir: &Path) -> anyhow::Result { + let display_id = args["display"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?; + + let x = args["x"] + .as_i64() + .ok_or_else(|| anyhow::anyhow!("Missing 'x' argument"))?; + let y = args["y"] + .as_i64() + .ok_or_else(|| anyhow::anyhow!("Missing 'y' argument"))?; + + tracing::info!(display = %display_id, x = x, y = y, "Moving mouse"); + + let (_, stderr, exit_code) = run_with_display( + display_id, + "xdotool", + &["mousemove", &x.to_string(), &y.to_string()], + 10, + ) + .await?; + + if exit_code != 0 { + return Err(anyhow::anyhow!("xdotool mousemove failed: {}", stderr)); + } + + Ok(format!("{{\"success\": true, \"x\": {}, \"y\": {}}}", x, y)) + } +} + +/// Scroll the mouse wheel. +pub struct Scroll; + +#[async_trait] +impl Tool for Scroll { + fn name(&self) -> &str { + "desktop_scroll" + } + + fn description(&self) -> &str { + "Scroll the mouse wheel at the current position or at specified coordinates. Positive amount scrolls down, negative scrolls up." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "display": { + "type": "string", + "description": "The display identifier (e.g., ':99')" + }, + "amount": { + "type": "integer", + "description": "Scroll amount. Positive = down, negative = up. Each unit is typically one 'click' of the scroll wheel." 
+ }, + "x": { + "type": "integer", + "description": "Optional: X coordinate to scroll at (moves mouse first)" + }, + "y": { + "type": "integer", + "description": "Optional: Y coordinate to scroll at (moves mouse first)" + } + }, + "required": ["display", "amount"] + }) + } + + async fn execute(&self, args: Value, _working_dir: &Path) -> anyhow::Result { + let display_id = args["display"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?; + + let amount = args["amount"] + .as_i64() + .ok_or_else(|| anyhow::anyhow!("Missing 'amount' argument"))?; + + // Move to position if specified + if let (Some(x), Some(y)) = (args["x"].as_i64(), args["y"].as_i64()) { + let (_, stderr, exit_code) = run_with_display( + display_id, + "xdotool", + &["mousemove", &x.to_string(), &y.to_string()], + 10, + ) + .await?; + + if exit_code != 0 { + return Err(anyhow::anyhow!("xdotool mousemove failed: {}", stderr)); + } + + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + } + + tracing::info!(display = %display_id, amount = amount, "Scrolling"); + + // xdotool uses button 4 for scroll up, button 5 for scroll down + let (button, clicks) = if amount >= 0 { + ("5", amount.unsigned_abs() as usize) + } else { + ("4", amount.unsigned_abs() as usize) + }; + + for _ in 0..clicks { + let (_, stderr, exit_code) = + run_with_display(display_id, "xdotool", &["click", button], 10).await?; + + if exit_code != 0 { + return Err(anyhow::anyhow!("xdotool scroll failed: {}", stderr)); + } + + tokio::time::sleep(std::time::Duration::from_millis(20)).await; + } + + Ok(format!( + "{{\"success\": true, \"amount\": {}, \"direction\": \"{}\"}}", + amount, + if amount >= 0 { "down" } else { "up" } + )) + } +} diff --git a/src/tools/mod.rs b/src/tools/mod.rs index c6de310..d0dba33 100644 --- a/src/tools/mod.rs +++ b/src/tools/mod.rs @@ -7,6 +7,7 @@ //! and search anywhere on the machine. The `working_dir` parameter is the default directory //! 
for relative paths (typically `/root` in production). +mod desktop; mod directory; mod file_ops; mod git; @@ -97,6 +98,33 @@ impl ToolRegistry { tools.insert("ui_optionList".to_string(), Arc::new(ui::UiOptionList)); tools.insert("ui_dataTable".to_string(), Arc::new(ui::UiDataTable)); + // Desktop automation (conditional on DESKTOP_ENABLED) + if std::env::var("DESKTOP_ENABLED") + .map(|v| v.to_lowercase() == "true" || v == "1") + .unwrap_or(false) + { + tools.insert( + "desktop_start_session".to_string(), + Arc::new(desktop::StartSession), + ); + tools.insert( + "desktop_stop_session".to_string(), + Arc::new(desktop::StopSession), + ); + tools.insert( + "desktop_screenshot".to_string(), + Arc::new(desktop::Screenshot), + ); + tools.insert("desktop_type".to_string(), Arc::new(desktop::TypeText)); + tools.insert("desktop_click".to_string(), Arc::new(desktop::Click)); + tools.insert("desktop_get_text".to_string(), Arc::new(desktop::GetText)); + tools.insert( + "desktop_mouse_move".to_string(), + Arc::new(desktop::MouseMove), + ); + tools.insert("desktop_scroll".to_string(), Arc::new(desktop::Scroll)); + } + Self { tools } }