feat: better icon

2025-12-17 14:44:08 +00:00
parent 932f463f0d
commit 7ce644b77a
9 changed files with 1428 additions and 4 deletions
--- a/.cursor/rules/project.mdc
+++ b/.cursor/rules/project.mdc
@@ -37,7 +37,7 @@ src/
 ├── memory/           # Supabase + pgvector persistence
 ├── mcp/              # MCP server registry + config
 ├── llm/              # OpenRouter client
-├── tools/            # File ops, terminal, git, web, search
+├── tools/            # File ops, terminal, git, web, search, desktop
 ├── task/             # Task types + verification
 └── api/              # HTTP routes (axum)
 ```
@@ -81,6 +81,23 @@ Analyzes failure signals to decide action:
 | External error | API/network issues | **Retry** same config |
 | Infeasible | consistent failures | **Stop** |

+## Desktop Automation
+
+When `DESKTOP_ENABLED=true`, the agent has access to desktop automation tools:
+
+| Tool | Purpose |
+|------|---------|
+| `desktop_start_session` | Start Xvfb + i3 virtual desktop |
+| `desktop_stop_session` | Clean up desktop session |
+| `desktop_screenshot` | Capture screen (returns PNG path) |
+| `desktop_type` | Type text or send key combos |
+| `desktop_click` | Mouse click at coordinates |
+| `desktop_get_text` | Extract text via AT-SPI or OCR |
+| `desktop_mouse_move` | Move mouse without clicking |
+| `desktop_scroll` | Scroll wheel at position |
+
+Setup: See `docs/DESKTOP_SETUP.md` and run `scripts/install_desktop.sh` on server.
+
 ## After Significant Changes

 When you make architectural changes to this codebase, **update the Cursor rules**:
--- a/.cursor/rules/secrets.mdc
+++ b/.cursor/rules/secrets.mdc
@@ -69,6 +69,14 @@ Template file for local credentials. Copy to `secrets.json` and fill in values.
 | `PORT` | `3000` | Server port |
 | `MAX_ITERATIONS` | `50` | Max agent loop iterations |

+### Desktop Automation (Optional)
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `DESKTOP_ENABLED` | `false` | Enable desktop_* tools (Xvfb, xdotool, etc.) |
+| `DESKTOP_RESOLUTION` | `1920x1080` | Virtual display resolution |
+| `DESKTOP_DISPLAY_START` | `99` | Starting X display number |
+
 ### Dashboard

 | Variable | Description |
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,6 +41,9 @@ regex = "1"
 # For memory/storage
 chrono = { version = "0.4", features = ["serde"] }

+# For desktop tools (process management on Unix)
+libc = "0.2"
+
 # Auth (JWT)
 jsonwebtoken = "9"

--- a/dashboard/public/favicon.svg
+++ b/dashboard/public/favicon.svg
--- a/dashboard/public/line_logo.svg
+++ b/dashboard/public/line_logo.svg
--- a/docs/DESKTOP_SETUP.md
+++ b/docs/DESKTOP_SETUP.md
@@ -0,0 +1,255 @@
+# Desktop Environment Setup
+
+This guide covers setting up a headless desktop environment for the Open Agent to control browsers and graphical applications.
+
+## Overview
+
+The desktop automation stack consists of:
+- **Xvfb**: Virtual framebuffer for headless X11
+- **i3**: Minimal, deterministic window manager
+- **xdotool**: Keyboard and mouse automation
+- **scrot**: Screenshot capture
+- **Chromium**: Web browser
+- **AT-SPI2**: Accessibility tree extraction
+- **Tesseract**: OCR fallback for text extraction
+
+## Installation (Ubuntu/Debian)
+
+```bash
+# Update package list
+apt update
+
+# Install core X11 and window manager
+apt install -y xvfb i3 x11-utils
+
+# Install automation tools
+apt install -y xdotool scrot imagemagick
+
+# Install Chromium browser
+apt install -y chromium chromium-sandbox
+
+# Install accessibility tools (AT-SPI2)
+apt install -y at-spi2-core libatspi2.0-0 python3-gi python3-gi-cairo gir1.2-atspi-2.0
+
+# Install OCR
+apt install -y tesseract-ocr
+
+# Install fonts for proper rendering
+apt install -y fonts-liberation fonts-dejavu-core
+```
+
+## i3 Configuration
+
+Create a minimal, deterministic i3 config at `/root/.config/i3/config`:
+
+```bash
+mkdir -p /root/.config/i3
+cat > /root/.config/i3/config << 'EOF'
+# Open Agent i3 Config - Minimal and Deterministic
+# No decorations, no animations, simple layout
+
+# Use Super (Mod4) as modifier
+set $mod Mod4
+
+# Font for window titles (not shown due to no decorations)
+font pango:DejaVu Sans Mono 10
+
+# Remove window decorations
+default_border none
+default_floating_border none
+
+# No gaps
+gaps inner 0
+gaps outer 0
+
+# Disable focus-follows-mouse so focus only changes explicitly (predictable behavior)
+focus_follows_mouse no
+
+# Disable window titlebars completely
+for_window [class=".*"] border pixel 0
+
+# Make all windows float by default for easier positioning
+# (comment out if you prefer tiling)
+# for_window [class=".*"] floating enable
+
+# Chromium-specific: make sure Chromium windows are drawn without a border
+for_window [class="Chromium"] border pixel 0
+for_window [class="chromium"] border pixel 0
+
+# Keybindings (minimal set)
+bindsym $mod+Return exec chromium --no-sandbox --disable-gpu
+bindsym $mod+Shift+q kill
+bindsym $mod+d exec dmenu_run
+
+# Focus movement
+bindsym $mod+h focus left
+bindsym $mod+j focus down
+bindsym $mod+k focus up
+bindsym $mod+l focus right
+
+# Exit i3
+bindsym $mod+Shift+e exit
+
+# Reload config
+bindsym $mod+Shift+r reload
+
+# Workspace setup (just workspace 1)
+workspace 1 output primary
+EOF
+```
+
+## Environment Variables
+
+Add these to `/etc/open_agent/open_agent.env`:
+
+```bash
+# Enable desktop automation tools
+DESKTOP_ENABLED=true
+
+# Xvfb resolution (width x height)
+DESKTOP_RESOLUTION=1920x1080
+
+# Starting display number (will increment for concurrent sessions)
+DESKTOP_DISPLAY_START=99
+```
+
+## Manual Testing
+
+Test the setup manually before enabling for the agent:
+
+```bash
+# Start Xvfb on display :99
+Xvfb :99 -screen 0 1920x1080x24 &
+export DISPLAY=:99
+
+# Start i3 window manager
+i3 &
+
+# Launch Chromium
+chromium --no-sandbox --disable-gpu &
+
+# Take a screenshot
+sleep 2
+scrot /tmp/test_screenshot.png
+
+# Verify screenshot exists
+ls -la /tmp/test_screenshot.png
+
+# Test xdotool
+xdotool getactivewindow
+
+# Clean up
+pkill -f "Xvfb :99"
+```
+
+## AT-SPI Accessibility Tree
+
+Test accessibility tree extraction:
+
+```bash
+export DISPLAY=:99
+export DBUS_SESSION_BUS_ADDRESS=unix:path=/tmp/dbus-session-$$
+
+# Start dbus session (required for AT-SPI)
+dbus-daemon --session --fork --address=$DBUS_SESSION_BUS_ADDRESS
+
+# Python script to dump accessibility tree
+python3 << 'EOF'
+import gi
+gi.require_version('Atspi', '2.0')
+from gi.repository import Atspi
+
+def print_tree(obj, indent=0):
+    try:
+        name = obj.get_name() or ""
+        role = obj.get_role_name()
+        if name or role != "unknown":
+            print("  " * indent + f"[{role}] {name}")
+        for i in range(obj.get_child_count()):
+            child = obj.get_child_at_index(i)
+            if child:
+                print_tree(child, indent + 1)
+    except Exception as e:
+        pass
+
+desktop = Atspi.get_desktop(0)
+for i in range(desktop.get_child_count()):
+    app = desktop.get_child_at_index(i)
+    if app:
+        print_tree(app)
+EOF
+```
+
+## OCR with Tesseract
+
+Test OCR on a screenshot:
+
+```bash
+# Take screenshot and run OCR
+DISPLAY=:99 scrot /tmp/screen.png
+tesseract /tmp/screen.png stdout
+
+# With language hint
+tesseract /tmp/screen.png stdout -l eng
+```
+
+## Troubleshooting
+
+### Xvfb won't start
+```bash
+# Check if display is already in use
+ls -la /tmp/.X*-lock
+# Remove stale lock files
+rm -f /tmp/.X99-lock /tmp/.X11-unix/X99
+```
+
+### Chromium sandbox issues
+Always use `--no-sandbox` flag when running as root:
+```bash
+chromium --no-sandbox --disable-gpu
+```
+
+### xdotool can't find windows
+```bash
+# List all windows
+xdotool search --name ""
+
+# Ensure DISPLAY is set
+echo $DISPLAY
+```
+
+### AT-SPI not working
+```bash
+# Ensure dbus is running (dbus-launch is provided by the dbus-x11 package)
+export $(dbus-launch)
+
+# Enable AT-SPI for Chromium
+chromium --force-renderer-accessibility --no-sandbox
+```
+
+### No fonts rendering
+```bash
+# Install additional fonts
+apt install -y fonts-noto fonts-freefont-ttf
+
+# Rebuild font cache
+fc-cache -fv
+```
+
+## Security Considerations
+
+- The agent runs with full system access
+- Xvfb sessions are isolated per-task
+- Sessions are cleaned up when tasks complete
+- Chromium runs with `--no-sandbox` (required for root, but limits isolation)
+- Consider running in a container for additional isolation
+
+## Session Lifecycle
+
+1. **Task starts**: Agent calls `desktop_start_session`
+2. **Xvfb starts**: Virtual display created at `:99` (or next available)
+3. **i3 starts**: Window manager provides predictable layout
+4. **Browser launches**: Chromium opens (if requested)
+5. **Agent works**: Screenshots, clicks, typing via desktop_* tools
+6. **Task ends**: `desktop_stop_session` kills Xvfb and children
+7. **Cleanup**: Any orphaned sessions killed on task failure
--- a/scripts/install_desktop.sh
+++ b/scripts/install_desktop.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+# Install desktop automation dependencies for Open Agent
+# Run this on the production server: bash scripts/install_desktop.sh
+
+set -e
+
+echo "=== Installing desktop automation packages ==="
+
+# Update package list
+apt update
+
+# Install core X11 and window manager
+echo "Installing Xvfb and i3..."
+apt install -y xvfb i3 x11-utils
+
+# Install automation tools
+echo "Installing xdotool and screenshot tools..."
+apt install -y xdotool scrot imagemagick
+
+# Install Chromium browser
+echo "Installing Chromium..."
+apt install -y chromium chromium-sandbox || apt install -y chromium-browser
+
+# Install accessibility tools (AT-SPI2)
+echo "Installing AT-SPI2 for accessibility tree..."
+apt install -y at-spi2-core libatspi2.0-0 python3-gi python3-gi-cairo gir1.2-atspi-2.0
+
+# Install OCR
+echo "Installing Tesseract OCR..."
+apt install -y tesseract-ocr
+
+# Install fonts for proper rendering
+echo "Installing fonts..."
+apt install -y fonts-liberation fonts-dejavu-core fonts-noto
+
+# Create i3 config directory
+echo "Creating i3 configuration..."
+mkdir -p /root/.config/i3
+
+# Write i3 config
+cat > /root/.config/i3/config << 'EOF'
+# Open Agent i3 Config - Minimal and Deterministic
+# No decorations, no animations, simple layout
+
+# Use Super (Mod4) as modifier
+set $mod Mod4
+
+# Font for window titles (not shown due to no decorations)
+font pango:DejaVu Sans Mono 10
+
+# Remove window decorations
+default_border none
+default_floating_border none
+
+# No gaps
+gaps inner 0
+gaps outer 0
+
+# Focus follows mouse (predictable behavior)
+focus_follows_mouse no
+
+# Disable window titlebars completely
+for_window [class=".*"] border pixel 0
+
+# Chromium-specific: maximize and remove sandbox issues
+for_window [class="Chromium"] border pixel 0
+for_window [class="chromium"] border pixel 0
+
+# Keybindings (minimal set)
+bindsym $mod+Return exec chromium --no-sandbox --disable-gpu
+bindsym $mod+Shift+q kill
+bindsym $mod+d exec dmenu_run
+
+# Focus movement
+bindsym $mod+h focus left
+bindsym $mod+j focus down
+bindsym $mod+k focus up
+bindsym $mod+l focus right
+
+# Exit i3
+bindsym $mod+Shift+e exit
+
+# Reload config
+bindsym $mod+Shift+r reload
+
+# Workspace setup (just workspace 1)
+workspace 1 output primary
+EOF
+
+echo "i3 configuration written to /root/.config/i3/config"
+
+# Add DESKTOP_ENABLED to environment file
+echo "Enabling desktop in environment..."
+if ! grep -q "DESKTOP_ENABLED" /etc/open_agent/open_agent.env 2>/dev/null; then
+    echo "" >> /etc/open_agent/open_agent.env
+    echo "# Desktop automation" >> /etc/open_agent/open_agent.env
+    echo "DESKTOP_ENABLED=true" >> /etc/open_agent/open_agent.env
+    echo "DESKTOP_RESOLUTION=1920x1080" >> /etc/open_agent/open_agent.env
+fi
+
+# Create work and screenshots directories
+echo "Creating working directories..."
+mkdir -p /root/work/screenshots
+mkdir -p /root/tools
+
+# Test installation
+echo ""
+echo "=== Testing installation ==="
+
+echo -n "Xvfb: "
+which Xvfb && echo "OK" || echo "MISSING"
+
+echo -n "i3: "
+which i3 && echo "OK" || echo "MISSING"
+
+echo -n "xdotool: "
+which xdotool && echo "OK" || echo "MISSING"
+
+echo -n "scrot: "
+which scrot && echo "OK" || echo "MISSING"
+
+echo -n "chromium: "
+(which chromium || which chromium-browser) && echo "OK" || echo "MISSING"
+
+echo -n "tesseract: "
+which tesseract && echo "OK" || echo "MISSING"
+
+echo -n "python3 with gi: "
+python3 -c "import gi; print('OK')" 2>/dev/null || echo "MISSING"
+
+echo ""
+echo "=== Installation complete ==="
+echo "Run: systemctl restart open_agent"
+echo "To test manually:"
+echo "  Xvfb :99 -screen 0 1920x1080x24 &"
+echo "  DISPLAY=:99 i3 &"
+echo "  DISPLAY=:99 chromium --no-sandbox &"
+echo "  DISPLAY=:99 scrot /tmp/test.png"
--- a/src/tools/desktop.rs
+++ b/src/tools/desktop.rs
@@ -0,0 +1,965 @@
+//! Desktop automation tools for controlling graphical applications.
+//!
+//! This module provides tools for:
+//! - Managing Xvfb virtual display sessions
+//! - Taking screenshots
+//! - Keyboard input (typing)
+//! - Mouse operations (clicking)
+//! - Extracting visible text (AT-SPI + OCR)
+//!
+//! Requires: Xvfb, i3, xdotool, scrot, tesseract, AT-SPI2
+//! Only available when DESKTOP_ENABLED=true
+
+use std::path::Path;
+use std::process::Stdio;
+use std::sync::atomic::{AtomicU32, Ordering};
+
+use async_trait::async_trait;
+use serde_json::{json, Value};
+use tokio::process::Command;
+
+use super::Tool;
+
+/// Global counter for display numbers to avoid conflicts.
+///
+/// Initialised to 99; `StartSession::execute` takes the current value and
+/// increments it with `fetch_add(1)` for every new session.
+/// NOTE(review): the docs describe a `DESKTOP_DISPLAY_START` variable, but nothing in the
+/// visible code reads it — confirm whether the initial value should come from the environment.
+static DISPLAY_COUNTER: AtomicU32 = AtomicU32::new(99);
+
+/// Report whether the desktop tools are switched on.
+///
+/// True only when `DESKTOP_ENABLED` is set to `1` or to `true` (any letter case);
+/// an unset or unreadable variable counts as disabled.
+fn desktop_enabled() -> bool {
+    match std::env::var("DESKTOP_ENABLED") {
+        Ok(flag) => flag == "1" || flag.to_lowercase() == "true",
+        Err(_) => false,
+    }
+}
+
+/// Resolution for the virtual display, taken from `DESKTOP_RESOLUTION`.
+///
+/// Falls back to `1920x1080` when the variable is unset or unreadable.
+fn get_resolution() -> String {
+    match std::env::var("DESKTOP_RESOLUTION") {
+        Ok(configured) => configured,
+        Err(_) => "1920x1080".to_string(),
+    }
+}
+
+/// Run a command with the DISPLAY environment variable set.
+///
+/// Returns `(stdout, stderr, exit_code)`. Both streams are decoded lossily as UTF-8 and
+/// `exit_code` is `-1` when the process ended without an exit code.
+///
+/// Fails when the program cannot be executed or does not finish within `timeout_secs`.
+/// On timeout the child process is killed instead of being left running.
+async fn run_with_display(
+    display: &str,
+    program: &str,
+    args: &[&str],
+    timeout_secs: u64,
+) -> anyhow::Result<(String, String, i32)> {
+    let mut command = Command::new(program);
+    command
+        .args(args)
+        .env("DISPLAY", display)
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        // When the timeout fires the output future is dropped; without this flag
+        // the spawned process would keep running unattended.
+        .kill_on_drop(true);
+
+    let output = match tokio::time::timeout(
+        std::time::Duration::from_secs(timeout_secs),
+        command.output(),
+    )
+    .await
+    {
+        Ok(Ok(output)) => output,
+        Ok(Err(e)) => return Err(anyhow::anyhow!("Failed to execute {}: {}", program, e)),
+        Err(_) => return Err(anyhow::anyhow!("Command {} timed out", program)),
+    };
+
+    let stdout = String::from_utf8_lossy(&output.stdout).to_string();
+    let stderr = String::from_utf8_lossy(&output.stderr).to_string();
+    let exit_code = output.status.code().unwrap_or(-1);
+
+    Ok((stdout, stderr, exit_code))
+}
+
+/// Start a new desktop session with Xvfb and i3.
+///
+/// Creates a virtual X11 display and starts the i3 window manager.
+/// Returns the display identifier (e.g., ":99") for use with other desktop tools.
+pub struct StartSession;
+
+impl StartSession {
+    /// Best-effort termination of processes belonging to a partially started session.
+    fn kill_all(children: &mut [&mut tokio::process::Child]) {
+        for child in children.iter_mut() {
+            let _ = child.start_kill();
+        }
+    }
+}
+
+#[async_trait]
+impl Tool for StartSession {
+    fn name(&self) -> &str {
+        "desktop_start_session"
+    }
+
+    fn description(&self) -> &str {
+        "Start a virtual desktop session (Xvfb + i3 window manager). Returns the DISPLAY identifier (e.g., ':99') needed for other desktop_* tools. Call this before using any other desktop tools. Optionally launches Chromium browser."
+    }
+
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "launch_browser": {
+                    "type": "boolean",
+                    "description": "If true, automatically launch Chromium browser after starting the session (default: false)"
+                },
+                "url": {
+                    "type": "string",
+                    "description": "Optional URL to open in Chromium (only used if launch_browser is true)"
+                }
+            },
+            "required": []
+        })
+    }
+
+    /// Starts Xvfb, i3 and (optionally) Chromium, records their PIDs in
+    /// `.desktop_session_<n>` inside `working_dir`, and returns a JSON description
+    /// of the session.
+    ///
+    /// Errors when desktop tools are disabled, when a process cannot be started or
+    /// exits immediately, or when the session file cannot be written. Processes that
+    /// were already started are killed before an error is returned.
+    async fn execute(&self, args: Value, working_dir: &Path) -> anyhow::Result<String> {
+        if !desktop_enabled() {
+            return Err(anyhow::anyhow!(
+                "Desktop tools are disabled. Set DESKTOP_ENABLED=true to enable."
+            ));
+        }
+
+        // Get next display number
+        let display_num = DISPLAY_COUNTER.fetch_add(1, Ordering::SeqCst);
+        let display_id = format!(":{}", display_num);
+        let resolution = get_resolution();
+
+        tracing::info!(display = %display_id, resolution = %resolution, "Starting desktop session");
+
+        // Create the screenshots directory before spawning anything, so a filesystem
+        // error cannot leave orphaned processes behind.
+        let screenshots_dir = working_dir.join("screenshots");
+        std::fs::create_dir_all(&screenshots_dir)?;
+
+        // Clean up any stale lock files
+        let lock_file = format!("/tmp/.X{}-lock", display_num);
+        let socket_file = format!("/tmp/.X11-unix/X{}", display_num);
+        let _ = std::fs::remove_file(&lock_file);
+        let _ = std::fs::remove_file(&socket_file);
+
+        // Start Xvfb
+        let mut xvfb = Command::new("Xvfb")
+            .arg(&display_id)
+            .args(["-screen", "0"])
+            .arg(format!("{}x24", resolution))
+            .stdout(Stdio::null())
+            .stderr(Stdio::null())
+            .spawn()
+            .map_err(|e| anyhow::anyhow!("Failed to start Xvfb: {}. Is Xvfb installed?", e))?;
+
+        let xvfb_pid = xvfb.id();
+
+        // Wait for Xvfb to be ready
+        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+
+        // Verify Xvfb is running
+        if let Ok(Some(status)) = xvfb.try_wait() {
+            return Err(anyhow::anyhow!(
+                "Xvfb exited immediately with status: {:?}",
+                status
+            ));
+        }
+
+        // Start i3 window manager
+        let mut i3 = match Command::new("i3")
+            .env("DISPLAY", &display_id)
+            .stdout(Stdio::null())
+            .stderr(Stdio::null())
+            .spawn()
+        {
+            Ok(child) => child,
+            Err(e) => {
+                Self::kill_all(&mut [&mut xvfb]);
+                return Err(anyhow::anyhow!(
+                    "Failed to start i3: {}. Is i3 installed?",
+                    e
+                ));
+            }
+        };
+
+        let i3_pid = i3.id();
+
+        // Wait for i3 to initialize
+        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+
+        // A window manager that is already gone means the session is unusable.
+        if let Ok(Some(status)) = i3.try_wait() {
+            Self::kill_all(&mut [&mut xvfb]);
+            return Err(anyhow::anyhow!(
+                "i3 exited immediately with status: {:?}",
+                status
+            ));
+        }
+
+        // Optionally launch browser
+        let mut browser: Option<(tokio::process::Child, String)> = None;
+        if args["launch_browser"].as_bool().unwrap_or(false) {
+            let url = args["url"].as_str().unwrap_or("about:blank").to_string();
+
+            let spawned = Command::new("chromium")
+                .args([
+                    "--no-sandbox",
+                    "--disable-gpu",
+                    "--disable-software-rasterizer",
+                    "--disable-dev-shm-usage",
+                    "--force-renderer-accessibility",
+                ])
+                .arg(&url)
+                .env("DISPLAY", &display_id)
+                .stdout(Stdio::null())
+                .stderr(Stdio::null())
+                .spawn();
+
+            match spawned {
+                Ok(child) => {
+                    // Wait for browser to load
+                    tokio::time::sleep(std::time::Duration::from_secs(2)).await;
+                    browser = Some((child, url));
+                }
+                Err(e) => {
+                    Self::kill_all(&mut [&mut xvfb, &mut i3]);
+                    return Err(anyhow::anyhow!("Failed to start Chromium: {}", e));
+                }
+            }
+        }
+
+        // Save session info for cleanup. This is written after the browser launch so
+        // that `browser_pid` is recorded and desktop_stop_session can terminate it.
+        let session_file = working_dir.join(format!(".desktop_session_{}", display_num));
+        let mut session_info = json!({
+            "display": display_id,
+            "display_num": display_num,
+            "xvfb_pid": xvfb_pid,
+            "i3_pid": i3_pid,
+            "resolution": resolution,
+            "screenshots_dir": screenshots_dir.to_string_lossy()
+        });
+        if let Some((child, _)) = &browser {
+            session_info["browser_pid"] = json!(child.id());
+        }
+
+        let persisted = serde_json::to_string_pretty(&session_info)
+            .map_err(anyhow::Error::from)
+            .and_then(|body| std::fs::write(&session_file, body).map_err(anyhow::Error::from));
+        if let Err(e) = persisted {
+            if let Some((child, _)) = browser.as_mut() {
+                let _ = child.start_kill();
+            }
+            Self::kill_all(&mut [&mut xvfb, &mut i3]);
+            return Err(e);
+        }
+
+        // Build the result with serde_json so the URL and paths are escaped correctly.
+        let mut response = json!({
+            "success": true,
+            "display": display_id,
+            "resolution": resolution,
+            "xvfb_pid": xvfb_pid,
+            "i3_pid": i3_pid,
+            "screenshots_dir": screenshots_dir.to_string_lossy()
+        });
+        if let Some((child, url)) = &browser {
+            response["browser"] = json!("chromium");
+            response["browser_pid"] = json!(child.id());
+            response["url"] = json!(url);
+        }
+
+        Ok(response.to_string())
+    }
+}
+
+/// Stop a desktop session and clean up resources.
+pub struct StopSession;
+
+#[async_trait]
+impl Tool for StopSession {
+    fn name(&self) -> &str {
+        "desktop_stop_session"
+    }
+
+    fn description(&self) -> &str {
+        "Stop a virtual desktop session. Kills Xvfb and all associated processes. Call this when done with desktop automation."
+    }
+
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "display": {
+                    "type": "string",
+                    "description": "The display identifier (e.g., ':99') returned by desktop_start_session"
+                }
+            },
+            "required": ["display"]
+        })
+    }
+
+    /// Sends SIGTERM to the PIDs recorded in `.desktop_session_<n>` (if that file
+    /// exists in `working_dir`), removes the file, runs a `pkill` fallback for the
+    /// display's Xvfb and deletes the X lock and socket files.
+    ///
+    /// Returns JSON with the display and the PIDs that were successfully signalled.
+    /// Errors when `display` is missing or is not a display number.
+    async fn execute(&self, args: Value, working_dir: &Path) -> anyhow::Result<String> {
+        let display_id = args["display"]
+            .as_str()
+            .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?;
+
+        // Extract display number
+        let display_num: u32 = display_id
+            .trim_start_matches(':')
+            .parse()
+            .map_err(|_| anyhow::anyhow!("Invalid display format: {}", display_id))?;
+
+        tracing::info!(display = %display_id, "Stopping desktop session");
+
+        // Read session file if it exists
+        let session_file = working_dir.join(format!(".desktop_session_{}", display_num));
+        let mut killed_pids: Vec<i32> = Vec::new();
+
+        if session_file.exists() {
+            if let Ok(content) = std::fs::read_to_string(&session_file) {
+                if let Ok(session_info) = serde_json::from_str::<Value>(&content) {
+                    // Kill processes by PID
+                    for pid_key in ["xvfb_pid", "i3_pid", "browser_pid"] {
+                        // The file lives in the working directory, so its contents are not
+                        // trusted. kill(2) treats pid 0 and negative pids as process groups
+                        // (-1: every process), and pid 1 is init, so only plain pids > 1
+                        // that fit in an i32 are signalled.
+                        let pid = session_info[pid_key]
+                            .as_u64()
+                            .and_then(|raw| i32::try_from(raw).ok())
+                            .filter(|pid| *pid > 1);
+
+                        if let Some(pid) = pid {
+                            let rc = unsafe { libc::kill(pid, libc::SIGTERM) };
+                            // Only report pids the signal was actually delivered to.
+                            if rc == 0 {
+                                killed_pids.push(pid);
+                            }
+                        }
+                    }
+                }
+            }
+            let _ = std::fs::remove_file(&session_file);
+        }
+
+        // Also kill by display pattern (fallback). The display number is anchored with
+        // "( |$)" so that ":9" does not also match ":99" or ":990".
+        let xvfb_pattern = format!("Xvfb :{}( |$)", display_num);
+        let _ = Command::new("pkill")
+            .args(["-f", xvfb_pattern.as_str()])
+            .output()
+            .await;
+
+        // Clean up lock files
+        let lock_file = format!("/tmp/.X{}-lock", display_num);
+        let socket_file = format!("/tmp/.X11-unix/X{}", display_num);
+        let _ = std::fs::remove_file(&lock_file);
+        let _ = std::fs::remove_file(&socket_file);
+
+        Ok(json!({
+            "success": true,
+            "display": display_id,
+            "killed_pids": killed_pids
+        })
+        .to_string())
+    }
+}
+
+/// Take a screenshot of the desktop.
+pub struct Screenshot;
+
+#[async_trait]
+impl Tool for Screenshot {
+    fn name(&self) -> &str {
+        "desktop_screenshot"
+    }
+
+    fn description(&self) -> &str {
+        "Take a screenshot of the virtual desktop. Returns the file path to the saved PNG image. You can then use read_file to view the image (supports vision)."
+    }
+
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "display": {
+                    "type": "string",
+                    "description": "The display identifier (e.g., ':99') from desktop_start_session"
+                },
+                "filename": {
+                    "type": "string",
+                    "description": "Optional filename for the screenshot (default: auto-generated with timestamp)"
+                },
+                "region": {
+                    "type": "object",
+                    "description": "Optional region to capture (x, y, width, height)",
+                    "properties": {
+                        "x": { "type": "integer" },
+                        "y": { "type": "integer" },
+                        "width": { "type": "integer" },
+                        "height": { "type": "integer" }
+                    }
+                }
+            },
+            "required": ["display"]
+        })
+    }
+
+    /// Captures the display with `scrot`, falling back to ImageMagick `import`, and
+    /// stores the image in `<working_dir>/screenshots`.
+    ///
+    /// Returns JSON with the saved path and its size in bytes. Errors when `display`
+    /// is missing, `filename` has no usable file name, both capture tools fail, or
+    /// the file does not exist afterwards.
+    async fn execute(&self, args: Value, working_dir: &Path) -> anyhow::Result<String> {
+        let display_id = args["display"]
+            .as_str()
+            .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?;
+
+        // Generate filename. Only the final path component of a caller-supplied name
+        // is used, so the screenshot always lands inside the screenshots directory.
+        let filename = match args["filename"].as_str() {
+            Some(requested) => Path::new(requested)
+                .file_name()
+                .map(|name| name.to_string_lossy().into_owned())
+                .ok_or_else(|| anyhow::anyhow!("Invalid 'filename': {}", requested))?,
+            None => {
+                let timestamp = chrono::Utc::now().format("%Y%m%d_%H%M%S");
+                format!("screenshot_{}.png", timestamp)
+            }
+        };
+
+        // Ensure screenshots directory exists
+        let screenshots_dir = working_dir.join("screenshots");
+        std::fs::create_dir_all(&screenshots_dir)?;
+
+        let filepath = screenshots_dir.join(&filename);
+        let path_arg = filepath.to_string_lossy().to_string();
+
+        tracing::info!(display = %display_id, path = %filepath.display(), "Taking screenshot");
+
+        // Optional capture region as (x, y, width, height)
+        let region = args.get("region").filter(|r| r.is_object()).map(|r| {
+            (
+                r["x"].as_i64().unwrap_or(0),
+                r["y"].as_i64().unwrap_or(0),
+                r["width"].as_i64().unwrap_or(100),
+                r["height"].as_i64().unwrap_or(100),
+            )
+        });
+
+        // Build scrot command
+        let mut scrot_args = vec!["-o".to_string(), path_arg.clone()];
+        if let Some((x, y, w, h)) = region {
+            scrot_args.push("-a".to_string());
+            scrot_args.push(format!("{},{},{},{}", x, y, w, h));
+        }
+        let scrot_argv: Vec<&str> = scrot_args.iter().map(String::as_str).collect();
+
+        // A scrot that cannot be executed at all is treated like a failed run so the
+        // import fallback still gets a chance.
+        let scrot_error = match run_with_display(display_id, "scrot", &scrot_argv, 30).await {
+            Ok((_, _, 0)) => None,
+            Ok((_, stderr, code)) => Some(format!("exit code {}: {}", code, stderr.trim())),
+            Err(e) => Some(e.to_string()),
+        };
+
+        if let Some(scrot_error) = scrot_error {
+            // Try import as fallback, cropping to the requested region if there is one
+            let mut import_args = vec!["-window".to_string(), "root".to_string()];
+            if let Some((x, y, w, h)) = region {
+                import_args.push("-crop".to_string());
+                import_args.push(format!("{}x{}{:+}{:+}", w, h, x, y));
+            }
+            import_args.push(path_arg.clone());
+            let import_argv: Vec<&str> = import_args.iter().map(String::as_str).collect();
+
+            // A non-zero exit must count as failure; otherwise a stale file with the
+            // same name could be reported as a fresh screenshot.
+            let import_error =
+                match run_with_display(display_id, "import", &import_argv, 30).await {
+                    Ok((_, _, 0)) => None,
+                    Ok((_, stderr, code)) => {
+                        Some(format!("exit code {}: {}", code, stderr.trim()))
+                    }
+                    Err(e) => Some(e.to_string()),
+                };
+
+            if let Some(import_error) = import_error {
+                return Err(anyhow::anyhow!(
+                    "Screenshot failed. scrot error: {}. import error: {}",
+                    scrot_error,
+                    import_error
+                ));
+            }
+        }
+
+        // Verify file exists
+        if !filepath.exists() {
+            return Err(anyhow::anyhow!("Screenshot file was not created"));
+        }
+
+        let metadata = std::fs::metadata(&filepath)?;
+
+        // serde_json escapes quotes and backslashes that may occur in the path.
+        Ok(json!({
+            "success": true,
+            "path": path_arg,
+            "size_bytes": metadata.len()
+        })
+        .to_string())
+    }
+}
+
+/// Send keyboard input to the desktop.
+pub struct TypeText;
+
+#[async_trait]
+impl Tool for TypeText {
+    fn name(&self) -> &str {
+        "desktop_type"
+    }
+
+    fn description(&self) -> &str {
+        "Send keyboard input to the virtual desktop. Can type text or send special keys (Return, Tab, Escape, ctrl+a, alt+F4, etc.). Text is typed into the currently focused window."
+    }
+
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "display": {
+                    "type": "string",
+                    "description": "The display identifier (e.g., ':99')"
+                },
+                "text": {
+                    "type": "string",
+                    "description": "Text to type literally into the focused window. Key names are NOT interpreted here; to press special keys such as 'Return', 'Tab', 'Escape', 'BackSpace', 'Delete', 'Up', 'Down', 'Left', 'Right', 'Home', 'End', 'Page_Up', 'Page_Down', 'F1'-'F12' use the 'key' parameter instead"
+                },
+                "key": {
+                    "type": "string",
+                    "description": "Send a key combination instead of typing text. Examples: 'Return', 'ctrl+a', 'alt+F4', 'ctrl+shift+t', 'super+Return'"
+                },
+                "delay_ms": {
+                    "type": "integer",
+                    "description": "Delay between keystrokes in milliseconds (default: 12, increase for slow applications)"
+                }
+            },
+            "required": ["display"],
+            "oneOf": [
+                { "required": ["text"] },
+                { "required": ["key"] }
+            ]
+        })
+    }
+
+    /// Runs `xdotool type` (for `text`) or `xdotool key` (for `key`) on the display.
+    /// `text` takes precedence when both are supplied.
+    ///
+    /// Returns JSON echoing the xdotool command and the input. Errors when `display`
+    /// is missing, when neither `text` nor `key` is given, or when xdotool fails.
+    async fn execute(&self, args: Value, _working_dir: &Path) -> anyhow::Result<String> {
+        let display_id = args["display"]
+            .as_str()
+            .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?;
+
+        let delay_ms = args["delay_ms"].as_u64().unwrap_or(12);
+
+        let (command, input) = if let Some(text) = args["text"].as_str() {
+            // Type text character by character
+            ("type", text.to_string())
+        } else if let Some(key) = args["key"].as_str() {
+            // Send key combination
+            ("key", key.to_string())
+        } else {
+            return Err(anyhow::anyhow!("Either 'text' or 'key' must be provided"));
+        };
+
+        tracing::info!(display = %display_id, command = %command, "Sending keyboard input");
+
+        // Long text with a per-keystroke delay can legitimately take longer than the
+        // 30 s base timeout, so extend it by the estimated typing time (capped).
+        let keystrokes = input.chars().count() as u64;
+        let typing_secs = keystrokes.saturating_mul(delay_ms) / 1000;
+        let timeout_secs = 30u64.saturating_add(typing_secs).min(600);
+
+        let delay_arg = delay_ms.to_string();
+        let (_stdout, stderr, exit_code) = run_with_display(
+            display_id,
+            "xdotool",
+            // "--" ends option parsing so input starting with '-' is not read as a flag.
+            &[command, "--delay", &delay_arg, "--", &input],
+            timeout_secs,
+        )
+        .await?;
+
+        if exit_code != 0 {
+            return Err(anyhow::anyhow!("xdotool failed: {}", stderr));
+        }
+
+        // serde_json escapes quotes, backslashes and control characters in the input.
+        Ok(json!({
+            "success": true,
+            "command": command,
+            "input": input
+        })
+        .to_string())
+    }
+}
+
+/// Click at a position on the desktop.
+///
+/// The pointer is first moved to `(x, y)` with `xdotool mousemove`. A plain
+/// click (or double-click) is then sent with `xdotool click --repeat`. When a
+/// positive `hold_ms` is supplied, the button is instead pressed with
+/// `xdotool mousedown`, held for that many milliseconds, and released with
+/// `xdotool mouseup`.
+pub struct Click;
+
+#[async_trait]
+impl Tool for Click {
+    /// Tool identifier.
+    fn name(&self) -> &str {
+        "desktop_click"
+    }
+
+    /// Human-readable summary of what the tool does.
+    fn description(&self) -> &str {
+        "Click at a specific position on the virtual desktop. Supports left, middle, right click and double-click. Coordinates are in pixels from top-left (0,0)."
+    }
+
+    /// JSON schema for the arguments: `display`, `x` and `y` are required;
+    /// `button`, `double` and `hold_ms` are optional.
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "display": {
+                    "type": "string",
+                    "description": "The display identifier (e.g., ':99')"
+                },
+                "x": {
+                    "type": "integer",
+                    "description": "X coordinate in pixels from left edge"
+                },
+                "y": {
+                    "type": "integer",
+                    "description": "Y coordinate in pixels from top edge"
+                },
+                "button": {
+                    "type": "string",
+                    "enum": ["left", "middle", "right"],
+                    "description": "Mouse button to click (default: 'left')"
+                },
+                "double": {
+                    "type": "boolean",
+                    "description": "If true, perform a double-click (default: false)"
+                },
+                "hold_ms": {
+                    "type": "integer",
+                    "description": "Hold the click for this many milliseconds (for drag operations, use with move)"
+                }
+            },
+            "required": ["display", "x", "y"]
+        })
+    }
+
+    /// Move the pointer to the requested position and click (or press-and-hold).
+    ///
+    /// Returns a small JSON object echoing the coordinates, button and
+    /// double-click flag; `hold_ms` is included when a hold was performed.
+    ///
+    /// # Errors
+    /// Fails when a required argument is missing, `button` is not one of the
+    /// supported names, `hold_ms` exceeds the allowed maximum or is combined
+    /// with `double`, or any `xdotool` invocation exits non-zero.
+    async fn execute(&self, args: Value, _working_dir: &Path) -> anyhow::Result<String> {
+        // Upper bound on `hold_ms` so a bad argument cannot stall the tool call.
+        const MAX_HOLD_MS: u64 = 60_000;
+
+        let display_id = args["display"]
+            .as_str()
+            .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?;
+
+        let x = args["x"]
+            .as_i64()
+            .ok_or_else(|| anyhow::anyhow!("Missing 'x' argument"))?;
+        let y = args["y"]
+            .as_i64()
+            .ok_or_else(|| anyhow::anyhow!("Missing 'y' argument"))?;
+
+        // Map the symbolic button name to the numeric button xdotool expects.
+        let button_name = args["button"].as_str().unwrap_or("left");
+        let button = match button_name {
+            "left" => "1",
+            "middle" => "2",
+            "right" => "3",
+            other => return Err(anyhow::anyhow!("Invalid button: {}", other)),
+        };
+
+        let double = args["double"].as_bool().unwrap_or(false);
+        let repeat = if double { "2" } else { "1" };
+
+        // Absent, non-integer, or zero `hold_ms` means an ordinary click.
+        let hold_ms = args["hold_ms"].as_u64().filter(|ms| *ms > 0);
+        if let Some(ms) = hold_ms {
+            if ms > MAX_HOLD_MS {
+                return Err(anyhow::anyhow!(
+                    "'hold_ms' too large: {} (max {})",
+                    ms,
+                    MAX_HOLD_MS
+                ));
+            }
+            if double {
+                return Err(anyhow::anyhow!(
+                    "'double' and 'hold_ms' cannot be combined"
+                ));
+            }
+        }
+
+        tracing::info!(display = %display_id, x = x, y = y, button = button, "Clicking");
+
+        // Move to position first
+        let (_, stderr, exit_code) = run_with_display(
+            display_id,
+            "xdotool",
+            &["mousemove", &x.to_string(), &y.to_string()],
+            10,
+        )
+        .await?;
+
+        if exit_code != 0 {
+            return Err(anyhow::anyhow!("xdotool mousemove failed: {}", stderr));
+        }
+
+        // Small delay to ensure move completes
+        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
+
+        if let Some(ms) = hold_ms {
+            // Press, hold, release: the schema's `hold_ms` contract.
+            let (_, stderr, exit_code) =
+                run_with_display(display_id, "xdotool", &["mousedown", button], 10).await?;
+
+            if exit_code != 0 {
+                return Err(anyhow::anyhow!("xdotool mousedown failed: {}", stderr));
+            }
+
+            tokio::time::sleep(std::time::Duration::from_millis(ms)).await;
+
+            let (_, stderr, exit_code) =
+                run_with_display(display_id, "xdotool", &["mouseup", button], 10).await?;
+
+            if exit_code != 0 {
+                return Err(anyhow::anyhow!("xdotool mouseup failed: {}", stderr));
+            }
+
+            return Ok(format!(
+                "{{\"success\": true, \"x\": {}, \"y\": {}, \"button\": \"{}\", \"double\": {}, \"hold_ms\": {}}}",
+                x, y, button_name, double, ms
+            ));
+        }
+
+        // Click
+        let (_, stderr, exit_code) = run_with_display(
+            display_id,
+            "xdotool",
+            &["click", "--repeat", repeat, button],
+            10,
+        )
+        .await?;
+
+        if exit_code != 0 {
+            return Err(anyhow::anyhow!("xdotool click failed: {}", stderr));
+        }
+
+        Ok(format!(
+            "{{\"success\": true, \"x\": {}, \"y\": {}, \"button\": \"{}\", \"double\": {}}}",
+            x, y, button_name, double
+        ))
+    }
+}
+
+/// Extract visible text from the desktop using AT-SPI or OCR.
+///
+/// Delegates to `get_accessibility_text` and/or `get_ocr_text` depending on
+/// the requested `method`, and returns the collected sections as plain text,
+/// each introduced by a `--- <source> ---` header line.
+pub struct GetText;
+
+#[async_trait]
+impl Tool for GetText {
+    /// Tool identifier.
+    fn name(&self) -> &str {
+        "desktop_get_text"
+    }
+
+    /// Human-readable summary of what the tool does.
+    fn description(&self) -> &str {
+        "Extract visible text from the virtual desktop. Uses the accessibility tree (AT-SPI) for structured output with element types, or falls back to OCR (Tesseract) for raw text. The accessibility tree provides better structure for web pages and applications."
+    }
+
+    /// JSON schema for the arguments: `display` is required; `method` and
+    /// `max_depth` are optional.
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "display": {
+                    "type": "string",
+                    "description": "The display identifier (e.g., ':99')"
+                },
+                "method": {
+                    "type": "string",
+                    "enum": ["accessibility", "ocr", "both"],
+                    "description": "Method to extract text. 'accessibility' uses AT-SPI (best for browsers/apps), 'ocr' uses Tesseract (works on any content), 'both' tries accessibility first then OCR (default: 'accessibility')"
+                },
+                "max_depth": {
+                    "type": "integer",
+                    "description": "Maximum depth to traverse in accessibility tree (default: 10)"
+                }
+            },
+            "required": ["display"]
+        })
+    }
+
+    /// Run the requested extraction method(s) and format the results.
+    ///
+    /// Extraction failures are reported inside the returned text under an
+    /// `accessibility_error` / `ocr_error` section rather than as `Err`.
+    ///
+    /// # Errors
+    /// Fails when `display` is missing, `method` is not one of the supported
+    /// values, or no section (result or error) was produced at all.
+    async fn execute(&self, args: Value, working_dir: &Path) -> anyhow::Result<String> {
+        let display_id = args["display"]
+            .as_str()
+            .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?;
+
+        let method = args["method"].as_str().unwrap_or("accessibility");
+        // Reject unknown methods up front; otherwise neither branch below runs
+        // and the caller only sees a generic "no method succeeded" error.
+        if !matches!(method, "accessibility" | "ocr" | "both") {
+            return Err(anyhow::anyhow!(
+                "Invalid method: {} (expected 'accessibility', 'ocr' or 'both')",
+                method
+            ));
+        }
+        let max_depth = args["max_depth"].as_u64().unwrap_or(10);
+
+        tracing::info!(display = %display_id, method = %method, "Extracting text");
+
+        // (section name, section body) pairs in the order they were produced.
+        let mut results = Vec::new();
+
+        // Try accessibility tree
+        if method == "accessibility" || method == "both" {
+            match get_accessibility_text(display_id, max_depth).await {
+                Ok(text) if !text.trim().is_empty() => {
+                    results.push(("accessibility", text));
+                }
+                Ok(_) => {
+                    tracing::debug!("Accessibility tree returned empty");
+                }
+                Err(e) => {
+                    tracing::warn!("Accessibility tree extraction failed: {}", e);
+                    if method == "accessibility" {
+                        // Report the error in the output only when accessibility
+                        // was the sole method; in 'both' mode OCR runs next.
+                        results.push(("accessibility_error", e.to_string()));
+                    }
+                }
+            }
+        }
+
+        // Try OCR (in 'both' mode only when accessibility produced nothing)
+        if method == "ocr" || (method == "both" && results.is_empty()) {
+            match get_ocr_text(display_id, working_dir).await {
+                Ok(text) => {
+                    results.push(("ocr", text));
+                }
+                Err(e) => {
+                    tracing::warn!("OCR extraction failed: {}", e);
+                    results.push(("ocr_error", e.to_string()));
+                }
+            }
+        }
+
+        // Format output
+        if results.is_empty() {
+            return Err(anyhow::anyhow!("No text extraction method succeeded"));
+        }
+
+        // Result and error sections share the same layout.
+        let mut output = String::new();
+        for (section, content) in results {
+            output.push_str(&format!("--- {} ---\n{}\n\n", section, content));
+        }
+
+        Ok(output.trim().to_string())
+    }
+}
+
+/// Extract text using AT-SPI accessibility tree
+///
+/// Builds an inline Python script and runs it with `python3 -c` through
+/// `run_with_display` on the given `display`. The script iterates over the
+/// children of `Atspi.get_desktop(0)` and recursively emits one line per
+/// element that has a name or text, formatted as `[role] content`, indented by
+/// two spaces per tree level and with the content cut to 500 characters.
+/// Recursion stops once the depth exceeds `max_depth`. Exceptions raised while
+/// visiting an element are swallowed inside the script.
+///
+/// Returns the script's stdout. If the process exits non-zero, its stderr is
+/// wrapped in an "AT-SPI extraction failed" error.
+async fn get_accessibility_text(display: &str, max_depth: u64) -> anyhow::Result<String> {
+    // Python script to extract accessibility tree
+    // `max_depth` is substituted twice: as the default of the `max_depth`
+    // parameter and as the explicit argument of the top-level `get_text` call.
+    // Doubled braces (`{{`, `}}`) are `format!` escapes that yield literal
+    // braces for the Python f-strings.
+    let python_script = format!(
+        r#"
+import gi
+import sys
+gi.require_version('Atspi', '2.0')
+from gi.repository import Atspi
+
+def get_text(obj, depth=0, max_depth={}):
+    if depth > max_depth:
+        return ""
+    
+    result = []
+    try:
+        name = obj.get_name() or ""
+        role = obj.get_role_name()
+        
+        # Get text content if available
+        text = ""
+        try:
+            text_iface = obj.get_text()
+            if text_iface:
+                text = text_iface.get_text(0, text_iface.get_character_count())
+        except:
+            pass
+        
+        # Include meaningful content
+        if name or text:
+            indent = "  " * depth
+            content = text or name
+            if content.strip():
+                result.append(f"{{indent}}[{{role}}] {{content[:500]}}")
+        
+        # Recurse into children
+        for i in range(obj.get_child_count()):
+            child = obj.get_child_at_index(i)
+            if child:
+                child_text = get_text(child, depth + 1, max_depth)
+                if child_text:
+                    result.append(child_text)
+    except Exception as e:
+        pass
+    
+    return "\n".join(result)
+
+try:
+    desktop = Atspi.get_desktop(0)
+    output = []
+    for i in range(desktop.get_child_count()):
+        app = desktop.get_child_at_index(i)
+        if app:
+            app_text = get_text(app, 0, {})
+            if app_text.strip():
+                output.append(app_text)
+    print("\n".join(output))
+except Exception as e:
+    print(f"Error: {{e}}", file=sys.stderr)
+    sys.exit(1)
+"#,
+        max_depth, max_depth
+    );
+
+    // The trailing `30` is the timeout argument of `run_with_display`
+    // (presumably seconds — confirm against its definition).
+    let (stdout, stderr, exit_code) = run_with_display(display, "python3", &["-c", &python_script], 30).await?;
+
+    // The script exits with status 1 when its top-level `try` block fails.
+    if exit_code != 0 {
+        return Err(anyhow::anyhow!("AT-SPI extraction failed: {}", stderr));
+    }
+
+    Ok(stdout)
+}
+
+/// Extract text using OCR (Tesseract)
+///
+/// Captures the display into a temporary PNG under `<working_dir>/screenshots`
+/// with `scrot`, runs `tesseract` on it, and returns the recognized text.
+///
+/// The temporary file name includes the process id and a timestamp so that
+/// concurrent calls do not overwrite each other's capture, and the file is
+/// removed (best effort) on every exit path, including failures.
+///
+/// # Errors
+/// Fails when the screenshots directory cannot be created, `scrot` or
+/// `tesseract` cannot be run, or either exits with a non-zero status.
+async fn get_ocr_text(display: &str, working_dir: &Path) -> anyhow::Result<String> {
+    let screenshots_dir = working_dir.join("screenshots");
+    std::fs::create_dir_all(&screenshots_dir)?;
+
+    // Unique per call: pid + nanosecond timestamp.
+    let stamp = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .map(|elapsed| elapsed.as_nanos())
+        .unwrap_or(0);
+    let screenshot_path =
+        screenshots_dir.join(format!("_ocr_temp_{}_{}.png", std::process::id(), stamp));
+
+    let outcome = ocr_capture_and_recognize(display, &screenshot_path).await;
+
+    // Clean up temp screenshot regardless of how the OCR step ended.
+    let _ = std::fs::remove_file(&screenshot_path);
+
+    outcome
+}
+
+/// Take a screenshot of `display` into `screenshot_path` and run Tesseract on it.
+async fn ocr_capture_and_recognize(
+    display: &str,
+    screenshot_path: &Path,
+) -> anyhow::Result<String> {
+    let screenshot_arg = screenshot_path.to_string_lossy();
+
+    // Take screenshot (`-o` lets scrot overwrite an existing file)
+    let (_, stderr, exit_code) = run_with_display(
+        display,
+        "scrot",
+        &["-o", screenshot_arg.as_ref()],
+        30,
+    )
+    .await?;
+
+    if exit_code != 0 {
+        return Err(anyhow::anyhow!("Failed to take screenshot for OCR: {}", stderr));
+    }
+
+    // Run tesseract, writing the recognized text to stdout
+    let output = Command::new("tesseract")
+        .args([screenshot_arg.as_ref(), "stdout", "-l", "eng"])
+        .output()
+        .await
+        .map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?;
+
+    if !output.status.success() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        return Err(anyhow::anyhow!("Tesseract failed: {}", stderr));
+    }
+
+    Ok(String::from_utf8_lossy(&output.stdout).to_string())
+}
+
+/// Reposition the mouse pointer on the virtual desktop; no button is pressed.
+pub struct MouseMove;
+
+#[async_trait]
+impl Tool for MouseMove {
+    /// Tool identifier.
+    fn name(&self) -> &str {
+        "desktop_mouse_move"
+    }
+
+    /// Human-readable summary of what the tool does.
+    fn description(&self) -> &str {
+        "Move the mouse cursor to a specific position without clicking. Useful for hover effects or preparing for drag operations."
+    }
+
+    /// JSON schema for the arguments; `display`, `x` and `y` are all required.
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "display": {
+                    "type": "string",
+                    "description": "The display identifier (e.g., ':99')"
+                },
+                "x": {
+                    "type": "integer",
+                    "description": "X coordinate in pixels from left edge"
+                },
+                "y": {
+                    "type": "integer",
+                    "description": "Y coordinate in pixels from top edge"
+                }
+            },
+            "required": ["display", "x", "y"]
+        })
+    }
+
+    /// Run `xdotool mousemove <x> <y>` on the given display.
+    ///
+    /// Returns a JSON object echoing the target coordinates. Fails when an
+    /// argument is missing or `xdotool` exits with a non-zero status.
+    async fn execute(&self, args: Value, _working_dir: &Path) -> anyhow::Result<String> {
+        let display_id = match args["display"].as_str() {
+            Some(id) => id,
+            None => return Err(anyhow::anyhow!("Missing 'display' argument")),
+        };
+
+        let target_x = match args["x"].as_i64() {
+            Some(value) => value,
+            None => return Err(anyhow::anyhow!("Missing 'x' argument")),
+        };
+        let target_y = match args["y"].as_i64() {
+            Some(value) => value,
+            None => return Err(anyhow::anyhow!("Missing 'y' argument")),
+        };
+
+        tracing::info!(display = %display_id, x = target_x, y = target_y, "Moving mouse");
+
+        // xdotool takes the coordinates as separate string arguments.
+        let x_arg = target_x.to_string();
+        let y_arg = target_y.to_string();
+        let move_args = ["mousemove", x_arg.as_str(), y_arg.as_str()];
+
+        let (_, stderr, exit_code) =
+            run_with_display(display_id, "xdotool", &move_args, 10).await?;
+
+        match exit_code {
+            0 => Ok(format!(
+                "{{\"success\": true, \"x\": {}, \"y\": {}}}",
+                target_x, target_y
+            )),
+            _ => Err(anyhow::anyhow!("xdotool mousemove failed: {}", stderr)),
+        }
+    }
+}
+
+/// Scroll the mouse wheel.
+///
+/// Optionally moves the pointer to `(x, y)` first, then sends the requested
+/// number of wheel clicks in a single `xdotool click --repeat` invocation
+/// (button 5 for down, button 4 for up).
+pub struct Scroll;
+
+#[async_trait]
+impl Tool for Scroll {
+    /// Tool identifier.
+    fn name(&self) -> &str {
+        "desktop_scroll"
+    }
+
+    /// Human-readable summary of what the tool does.
+    fn description(&self) -> &str {
+        "Scroll the mouse wheel at the current position or at specified coordinates. Positive amount scrolls down, negative scrolls up."
+    }
+
+    /// JSON schema for the arguments: `display` and `amount` are required;
+    /// `x` and `y` are optional and must be given together.
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "display": {
+                    "type": "string",
+                    "description": "The display identifier (e.g., ':99')"
+                },
+                "amount": {
+                    "type": "integer",
+                    "description": "Scroll amount. Positive = down, negative = up. Each unit is typically one 'click' of the scroll wheel."
+                },
+                "x": {
+                    "type": "integer",
+                    "description": "Optional: X coordinate to scroll at (moves mouse first)"
+                },
+                "y": {
+                    "type": "integer",
+                    "description": "Optional: Y coordinate to scroll at (moves mouse first)"
+                }
+            },
+            "required": ["display", "amount"]
+        })
+    }
+
+    /// Scroll by `amount` wheel clicks, optionally at a given position.
+    ///
+    /// Returns a JSON object with the requested amount and the direction.
+    ///
+    /// # Errors
+    /// Fails when `display` or `amount` is missing, `amount` exceeds the
+    /// per-call limit, only one of `x`/`y` is supplied, or `xdotool` exits
+    /// with a non-zero status.
+    async fn execute(&self, args: Value, _working_dir: &Path) -> anyhow::Result<String> {
+        // Upper bound on wheel clicks per call so a bad argument cannot keep
+        // the tool busy indefinitely.
+        const MAX_SCROLL_CLICKS: u64 = 500;
+        // Pause between wheel clicks in milliseconds (`xdotool click --delay`).
+        const CLICK_DELAY_MS: &str = "20";
+
+        let display_id = args["display"]
+            .as_str()
+            .ok_or_else(|| anyhow::anyhow!("Missing 'display' argument"))?;
+
+        let amount = args["amount"]
+            .as_i64()
+            .ok_or_else(|| anyhow::anyhow!("Missing 'amount' argument"))?;
+
+        let clicks = amount.unsigned_abs();
+        if clicks > MAX_SCROLL_CLICKS {
+            return Err(anyhow::anyhow!(
+                "Scroll amount too large: {} (max {})",
+                amount,
+                MAX_SCROLL_CLICKS
+            ));
+        }
+
+        // Validate the optional position before producing any side effects.
+        let position = match (args["x"].as_i64(), args["y"].as_i64()) {
+            (Some(x), Some(y)) => Some((x, y)),
+            (None, None) => None,
+            _ => {
+                return Err(anyhow::anyhow!(
+                    "'x' and 'y' must be provided together"
+                ))
+            }
+        };
+
+        // Move to position if specified
+        if let Some((x, y)) = position {
+            let (_, stderr, exit_code) = run_with_display(
+                display_id,
+                "xdotool",
+                &["mousemove", &x.to_string(), &y.to_string()],
+                10,
+            )
+            .await?;
+
+            if exit_code != 0 {
+                return Err(anyhow::anyhow!("xdotool mousemove failed: {}", stderr));
+            }
+
+            // Small delay to ensure move completes
+            tokio::time::sleep(std::time::Duration::from_millis(50)).await;
+        }
+
+        tracing::info!(display = %display_id, amount = amount, "Scrolling");
+
+        // xdotool uses button 4 for scroll up, button 5 for scroll down
+        let button = if amount >= 0 { "5" } else { "4" };
+
+        // One process for all clicks instead of one process per click.
+        if clicks > 0 {
+            let repeat = clicks.to_string();
+            let (_, stderr, exit_code) = run_with_display(
+                display_id,
+                "xdotool",
+                &[
+                    "click",
+                    "--repeat",
+                    repeat.as_str(),
+                    "--delay",
+                    CLICK_DELAY_MS,
+                    button,
+                ],
+                30,
+            )
+            .await?;
+
+            if exit_code != 0 {
+                return Err(anyhow::anyhow!("xdotool scroll failed: {}", stderr));
+            }
+        }
+
+        Ok(format!(
+            "{{\"success\": true, \"amount\": {}, \"direction\": \"{}\"}}",
+            amount,
+            if amount >= 0 { "down" } else { "up" }
+        ))
+    }
+}
--- a/src/tools/mod.rs
+++ b/src/tools/mod.rs
@@ -7,6 +7,7 @@
 //! and search anywhere on the machine. The `working_dir` parameter is the default directory
 //! for relative paths (typically `/root` in production).

+mod desktop;
 mod directory;
 mod file_ops;
 mod git;
@@ -97,6 +98,33 @@ impl ToolRegistry {
        tools.insert("ui_optionList".to_string(), Arc::new(ui::UiOptionList));
        tools.insert("ui_dataTable".to_string(), Arc::new(ui::UiDataTable));

+        // Desktop automation (conditional on DESKTOP_ENABLED)
+        if std::env::var("DESKTOP_ENABLED")
+            .map(|v| v.to_lowercase() == "true" || v == "1")
+            .unwrap_or(false)
+        {
+            tools.insert(
+                "desktop_start_session".to_string(),
+                Arc::new(desktop::StartSession),
+            );
+            tools.insert(
+                "desktop_stop_session".to_string(),
+                Arc::new(desktop::StopSession),
+            );
+            tools.insert(
+                "desktop_screenshot".to_string(),
+                Arc::new(desktop::Screenshot),
+            );
+            tools.insert("desktop_type".to_string(), Arc::new(desktop::TypeText));
+            tools.insert("desktop_click".to_string(), Arc::new(desktop::Click));
+            tools.insert("desktop_get_text".to_string(), Arc::new(desktop::GetText));
+            tools.insert(
+                "desktop_mouse_move".to_string(),
+                Arc::new(desktop::MouseMove),
+            );
+            tools.insert("desktop_scroll".to_string(), Arc::new(desktop::Scroll));
+        }
+
        Self { tools }
    }