diff --git a/Cargo.toml b/Cargo.toml index 4006c79..e7470fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,6 +52,7 @@ base64 = "0.22" bytes = "1" portable-pty = "0.9" tokio-util = { version = "0.7", features = ["io"] } +chromiumoxide = { version = "0.8.0", features = ["tokio-runtime"] } [dev-dependencies] tokio-test = "0.4" diff --git a/dev-browser b/dev-browser new file mode 160000 index 0000000..0845959 --- /dev/null +++ b/dev-browser @@ -0,0 +1 @@ +Subproject commit 0845959b22ab6e3b07f134f64e2ead3f4b39d4da diff --git a/src/tools/browser.rs b/src/tools/browser.rs new file mode 100644 index 0000000..57593b5 --- /dev/null +++ b/src/tools/browser.rs @@ -0,0 +1,684 @@ +//! Browser automation tools using Chrome DevTools Protocol (CDP). +//! +//! These tools connect to a Chrome/Chromium browser running with remote debugging enabled. +//! Start Chrome with: `google-chrome --remote-debugging-port=9222` +//! +//! Environment variables: +//! - `BROWSER_CDP_URL`: CDP WebSocket URL (default: `http://127.0.0.1:9222`) +//! - `BROWSER_ENABLED`: Set to `true` to enable browser tools (default: false) + +use std::path::Path; +use std::sync::Arc; +use std::time::Duration; + +use async_trait::async_trait; +use chromiumoxide::browser::Browser; +use chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat; +use chromiumoxide::page::ScreenshotParams; +use chromiumoxide::Page; +use futures::StreamExt; +use serde_json::{json, Value}; +use tokio::sync::Mutex; + +use super::Tool; + +/// Default CDP endpoint +const DEFAULT_CDP_URL: &str = "http://127.0.0.1:9222"; + +/// Shared browser state (lazy initialization) +static BROWSER_STATE: std::sync::LazyLock>>> = + std::sync::LazyLock::new(|| Arc::new(Mutex::new(None))); + +/// Browser session holding the browser and current page +struct BrowserSession { + #[allow(dead_code)] + browser: Browser, + page: Page, +} + +/// Get or create a browser session +async fn get_browser_session() -> anyhow::Result>>> { + let state = BROWSER_STATE.clone(); + let mut guard = state.lock().await; + + if guard.is_none() { + let cdp_url = std::env::var("BROWSER_CDP_URL").unwrap_or_else(|_| DEFAULT_CDP_URL.to_string()); + + // Connect to existing Chrome with remote debugging + let (browser, mut handler) = Browser::connect(&cdp_url).await.map_err(|e| { + anyhow::anyhow!( + "Failed to connect to Chrome at {}. Make sure Chrome is running with --remote-debugging-port=9222. Error: {}", + cdp_url, + e + ) + })?; + + // Spawn handler in background + tokio::spawn(async move { + while let Some(event) = handler.next().await { + if let Err(e) = event { + tracing::warn!("Browser event error: {}", e); + } + } + }); + + // Get or create a page + let page = browser.new_page("about:blank").await?; + + *guard = Some(BrowserSession { browser, page }); + } + + drop(guard); + Ok(state) +} + +/// Get the current page, creating a new session if needed +async fn with_page(f: F) -> anyhow::Result +where + F: FnOnce(Page) -> Fut, + Fut: std::future::Future>, +{ + let state = get_browser_session().await?; + let guard = state.lock().await; + + if let Some(session) = guard.as_ref() { + f(session.page.clone()).await + } else { + Err(anyhow::anyhow!("Browser session not initialized")) + } +} + +// ============================================================================ +// Browser Navigate Tool +// ============================================================================ + +/// Navigate to a URL and wait for page load +pub struct BrowserNavigate; + +#[async_trait] +impl Tool for BrowserNavigate { + fn name(&self) -> &str { + "browser_navigate" + } + + fn description(&self) -> &str { + "Navigate the browser to a URL. Waits for the page to fully load. Use this to open websites for scraping, testing, or interaction." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "The URL to navigate to" + }, + "wait_selector": { + "type": "string", + "description": "Optional CSS selector to wait for after navigation" + } + }, + "required": ["url"] + }) + } + + async fn execute(&self, args: Value, _workspace: &Path) -> anyhow::Result { + let url = args["url"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'url' argument"))?; + let wait_selector = args["wait_selector"].as_str(); + + with_page(|page| async move { + // Navigate to URL + page.goto(url).await?; + + // Wait for network idle or timeout + tokio::time::sleep(Duration::from_millis(1000)).await; + + // Optionally wait for a specific element + if let Some(selector) = wait_selector { + page.wait_for_navigation().await.ok(); + page.find_element(selector) + .await + .map_err(|e| anyhow::anyhow!("Element '{}' not found: {}", selector, e))?; + } + + // Get page info + let title = page.get_title().await?.unwrap_or_default(); + let current_url = page.url().await?.map(|u| u.to_string()).unwrap_or_default(); + + Ok(format!( + "Navigated to: {}\nTitle: {}\nCurrent URL: {}", + url, title, current_url + )) + }) + .await + } +} + +// ============================================================================ +// Browser Screenshot Tool +// ============================================================================ + +/// Take a screenshot of the current page +pub struct BrowserScreenshot; + +#[async_trait] +impl Tool for BrowserScreenshot { + fn name(&self) -> &str { + "browser_screenshot" + } + + fn description(&self) -> &str { + "Take a screenshot of the current browser page. Returns the path to the saved PNG image. Use after navigating to see what's on the page." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "filename": { + "type": "string", + "description": "Optional filename (without path). Defaults to screenshot_.png" + }, + "full_page": { + "type": "boolean", + "description": "Capture the full scrollable page (default: false, captures viewport only)" + } + } + }) + } + + async fn execute(&self, args: Value, workspace: &Path) -> anyhow::Result { + let filename = args["filename"].as_str().map(|s| s.to_string()).unwrap_or_else(|| { + format!("screenshot_{}.png", chrono::Utc::now().format("%Y%m%d_%H%M%S")) + }); + let full_page = args["full_page"].as_bool().unwrap_or(false); + + with_page(|page| async move { + // Configure screenshot + let params = ScreenshotParams::builder() + .format(CaptureScreenshotFormat::Png) + .full_page(full_page) + .build(); + + // Take screenshot + let screenshot = page.screenshot(params).await?; + + // Save to workspace/temp directory + let temp_dir = workspace.join("temp"); + std::fs::create_dir_all(&temp_dir)?; + let file_path = temp_dir.join(&filename); + + std::fs::write(&file_path, &screenshot)?; + + Ok(format!( + "Screenshot saved to: {}\nSize: {} bytes", + file_path.display(), + screenshot.len() + )) + }) + .await + } +} + +// ============================================================================ +// Browser Get Content Tool +// ============================================================================ + +/// Get the text content of the current page +pub struct BrowserGetContent; + +#[async_trait] +impl Tool for BrowserGetContent { + fn name(&self) -> &str { + "browser_get_content" + } + + fn description(&self) -> &str { + "Extract the text content from the current page. Returns readable text with structure preserved. Use after navigating to read the page content." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "selector": { + "type": "string", + "description": "Optional CSS selector to get content from a specific element" + }, + "include_html": { + "type": "boolean", + "description": "Return HTML instead of text (default: false)" + } + } + }) + } + + async fn execute(&self, args: Value, _workspace: &Path) -> anyhow::Result { + let selector = args["selector"].as_str(); + let include_html = args["include_html"].as_bool().unwrap_or(false); + + with_page(|page| async move { + let content: String = if let Some(sel) = selector { + // Get content from specific element + let element = page.find_element(sel).await + .map_err(|e| anyhow::anyhow!("Element '{}' not found: {}", sel, e))?; + + if include_html { + element.inner_html().await?.unwrap_or_default() + } else { + element.inner_text().await?.unwrap_or_default() + } + } else { + // Get full page content + if include_html { + page.content().await? + } else { + // Execute JS to get text content + let result = page.evaluate("document.body.innerText").await?; + result.into_value::().unwrap_or_default() + } + }; + + // Truncate if too long + let max_len = 50000; + if content.len() > max_len { + Ok(format!( + "{}\n\n... [truncated, {} total characters]", + &content[..max_len], + content.len() + )) + } else { + Ok(content) + } + }) + .await + } +} + +// ============================================================================ +// Browser Click Tool +// ============================================================================ + +/// Click on an element +pub struct BrowserClick; + +#[async_trait] +impl Tool for BrowserClick { + fn name(&self) -> &str { + "browser_click" + } + + fn description(&self) -> &str { + "Click on an element in the browser. Use CSS selectors like '#id', '.class', 'button', 'a[href*=login]', etc." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "selector": { + "type": "string", + "description": "CSS selector for the element to click" + } + }, + "required": ["selector"] + }) + } + + async fn execute(&self, args: Value, _workspace: &Path) -> anyhow::Result { + let selector = args["selector"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'selector' argument"))?; + + with_page(|page| async move { + let element = page.find_element(selector).await + .map_err(|e| anyhow::anyhow!("Element '{}' not found: {}", selector, e))?; + + element.click().await?; + + // Wait a bit for any navigation or dynamic updates + tokio::time::sleep(Duration::from_millis(500)).await; + + Ok(format!("Clicked element: {}", selector)) + }) + .await + } +} + +// ============================================================================ +// Browser Type Tool +// ============================================================================ + +/// Type text into an input element +pub struct BrowserType; + +#[async_trait] +impl Tool for BrowserType { + fn name(&self) -> &str { + "browser_type" + } + + fn description(&self) -> &str { + "Type text into an input field. First clicks the element to focus it, then types the text." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "selector": { + "type": "string", + "description": "CSS selector for the input element" + }, + "text": { + "type": "string", + "description": "Text to type" + }, + "clear_first": { + "type": "boolean", + "description": "Clear the input before typing (default: true)" + } + }, + "required": ["selector", "text"] + }) + } + + async fn execute(&self, args: Value, _workspace: &Path) -> anyhow::Result { + let selector = args["selector"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'selector' argument"))?; + let text = args["text"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'text' argument"))?; + let clear_first = args["clear_first"].as_bool().unwrap_or(true); + + with_page(|page| async move { + let element = page.find_element(selector).await + .map_err(|e| anyhow::anyhow!("Element '{}' not found: {}", selector, e))?; + + // Click to focus + element.click().await?; + + // Clear if requested + if clear_first { + // Select all and delete + element.type_str("").await?; // Focus + page.evaluate("document.activeElement.value = ''").await.ok(); + } + + // Type the text + element.type_str(text).await?; + + Ok(format!("Typed '{}' into: {}", text, selector)) + }) + .await + } +} + +// ============================================================================ +// Browser Evaluate Tool +// ============================================================================ + +/// Execute JavaScript in the browser +pub struct BrowserEvaluate; + +#[async_trait] +impl Tool for BrowserEvaluate { + fn name(&self) -> &str { + "browser_evaluate" + } + + fn description(&self) -> &str { + "Execute JavaScript code in the browser and return the result. Useful for complex interactions, extracting data, or debugging." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "script": { + "type": "string", + "description": "JavaScript code to execute. The result of the last expression is returned." + } + }, + "required": ["script"] + }) + } + + async fn execute(&self, args: Value, _workspace: &Path) -> anyhow::Result { + let script = args["script"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'script' argument"))?; + + with_page(|page| async move { + let result = page.evaluate(script).await?; + + // Try to serialize the result + let value = result.value(); + match value { + Some(v) => Ok(serde_json::to_string_pretty(&v)?), + None => Ok("(no return value)".to_string()), + } + }) + .await + } +} + +// ============================================================================ +// Browser Wait Tool +// ============================================================================ + +/// Wait for an element or condition +pub struct BrowserWait; + +#[async_trait] +impl Tool for BrowserWait { + fn name(&self) -> &str { + "browser_wait" + } + + fn description(&self) -> &str { + "Wait for an element to appear or a condition to be met. Use after clicking or navigating when content loads dynamically." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "selector": { + "type": "string", + "description": "CSS selector to wait for" + }, + "timeout_ms": { + "type": "integer", + "description": "Maximum time to wait in milliseconds (default: 10000)" + } + }, + "required": ["selector"] + }) + } + + async fn execute(&self, args: Value, _workspace: &Path) -> anyhow::Result { + let selector = args["selector"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("Missing 'selector' argument"))?; + let timeout_ms = args["timeout_ms"].as_u64().unwrap_or(10000); + + with_page(|page| async move { + let start = std::time::Instant::now(); + let timeout = Duration::from_millis(timeout_ms); + + loop { + if page.find_element(selector).await.is_ok() { + return Ok(format!( + "Element '{}' found after {} ms", + selector, + start.elapsed().as_millis() + )); + } + + if start.elapsed() > timeout { + return Err(anyhow::anyhow!( + "Timeout waiting for element '{}' after {} ms", + selector, + timeout_ms + )); + } + + tokio::time::sleep(Duration::from_millis(100)).await; + } + }) + .await + } +} + +// ============================================================================ +// Browser Close Tool +// ============================================================================ + +/// Close the browser session +pub struct BrowserClose; + +#[async_trait] +impl Tool for BrowserClose { + fn name(&self) -> &str { + "browser_close" + } + + fn description(&self) -> &str { + "Close the current browser page/tab. Use when done with browser automation to free resources." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": {} + }) + } + + async fn execute(&self, _args: Value, _workspace: &Path) -> anyhow::Result { + let state = BROWSER_STATE.clone(); + let mut guard = state.lock().await; + + if let Some(session) = guard.take() { + // Close the page + session.page.close().await.ok(); + // Browser will be dropped + Ok("Browser session closed".to_string()) + } else { + Ok("No active browser session".to_string()) + } + } +} + +// ============================================================================ +// Browser List Elements Tool +// ============================================================================ + +/// List interactive elements on the page +pub struct BrowserListElements; + +#[async_trait] +impl Tool for BrowserListElements { + fn name(&self) -> &str { + "browser_list_elements" + } + + fn description(&self) -> &str { + "List interactive elements on the current page (links, buttons, inputs, etc.). Useful for understanding page structure before interacting." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "selector": { + "type": "string", + "description": "Optional CSS selector to filter elements (default: 'a, button, input, select, textarea, [onclick]')" + }, + "limit": { + "type": "integer", + "description": "Maximum number of elements to return (default: 50)" + } + } + }) + } + + async fn execute(&self, args: Value, _workspace: &Path) -> anyhow::Result { + let selector = args["selector"].as_str() + .unwrap_or("a, button, input, select, textarea, [onclick], [role='button']"); + let limit = args["limit"].as_u64().unwrap_or(50) as usize; + + with_page(|page| async move { + // Use JavaScript to get element info + let script = format!(r#" + (() => {{ + const elements = document.querySelectorAll('{}'); + const results = []; + for (let i = 0; i < Math.min(elements.length, {}); i++) {{ + const el = elements[i]; + const rect = el.getBoundingClientRect(); + results.push({{ + tag: el.tagName.toLowerCase(), + id: el.id || null, + class: el.className || null, + text: (el.innerText || el.value || '').slice(0, 100).trim(), + href: el.href || null, + type: el.type || null, + name: el.name || null, + visible: rect.width > 0 && rect.height > 0 + }}); + }} + return results; + }})() + "#, selector, limit); + + let result = page.evaluate(script.as_str()).await?; + let elements: Vec = result.into_value().unwrap_or_default(); + + if elements.is_empty() { + return Ok(format!("No elements found matching: {}", selector)); + } + + let mut output = format!("Found {} elements:\n\n", elements.len()); + for (i, el) in elements.iter().enumerate() { + let tag = el["tag"].as_str().unwrap_or("?"); + let id = el["id"].as_str().filter(|s| !s.is_empty()); + let class = el["class"].as_str().filter(|s| !s.is_empty()); + let text = el["text"].as_str().filter(|s| !s.is_empty()); + let href = el["href"].as_str().filter(|s| !s.is_empty()); + let visible = el["visible"].as_bool().unwrap_or(true); + + // Build selector hint + let selector_hint = if let Some(id) = id { + format!("#{}", id) + } else if let Some(cls) = class { + let first_class = cls.split_whitespace().next().unwrap_or(""); + format!("{}.{}", tag, first_class) + } else { + tag.to_string() + }; + + output.push_str(&format!( + "{}. [{}] {}", + i + 1, + if visible { "✓" } else { "hidden" }, + selector_hint + )); + + if let Some(t) = text { + output.push_str(&format!(" - \"{}\"", t.chars().take(60).collect::())); + } + if let Some(h) = href { + output.push_str(&format!(" → {}", h.chars().take(50).collect::())); + } + output.push('\n'); + } + + Ok(output) + }) + .await + } +} diff --git a/src/tools/mod.rs b/src/tools/mod.rs index 374b20c..3f7ddd3 100644 --- a/src/tools/mod.rs +++ b/src/tools/mod.rs @@ -12,6 +12,7 @@ //! This encourages agents to stay within their assigned workspace while preserving //! flexibility for tasks that require broader access. +mod browser; mod desktop; mod directory; mod file_ops; @@ -201,6 +202,50 @@ impl ToolRegistry { // Storage (image upload - requires Supabase) tools.insert("upload_image".to_string(), Arc::new(storage::UploadImage)); + // Browser automation (conditional on BROWSER_ENABLED) + if std::env::var("BROWSER_ENABLED") + .map(|v| v.to_lowercase() == "true" || v == "1") + .unwrap_or(false) + { + tracing::info!("Registering browser automation tools (BROWSER_ENABLED=true)"); + tools.insert( + "browser_navigate".to_string(), + Arc::new(browser::BrowserNavigate), + ); + tools.insert( + "browser_screenshot".to_string(), + Arc::new(browser::BrowserScreenshot), + ); + tools.insert( + "browser_get_content".to_string(), + Arc::new(browser::BrowserGetContent), + ); + tools.insert( + "browser_click".to_string(), + Arc::new(browser::BrowserClick), + ); + tools.insert( + "browser_type".to_string(), + Arc::new(browser::BrowserType), + ); + tools.insert( + "browser_evaluate".to_string(), + Arc::new(browser::BrowserEvaluate), + ); + tools.insert( + "browser_wait".to_string(), + Arc::new(browser::BrowserWait), + ); + tools.insert( + "browser_close".to_string(), + Arc::new(browser::BrowserClose), + ); + tools.insert( + "browser_list_elements".to_string(), + Arc::new(browser::BrowserListElements), + ); + } + // Desktop automation (conditional on DESKTOP_ENABLED) if std::env::var("DESKTOP_ENABLED") .map(|v| v.to_lowercase() == "true" || v == "1")