Add an optional WebDriver-backed `browser` tool (fantoccini + chromedriver).

NOTE(review): unchanged context lines are reproduced as found. A few of them (for
example `storage: Arc,` and `) -> Result {` in src/session/session.rs) appear to have
lost their generic arguments; confirm them against the target files before applying.

diff --git a/Cargo.toml b/Cargo.toml
index 9214972..bfe3209 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,6 +48,7 @@ http = "1"
 encoding_rs = "0.8"
 zstd = "0.13"
 tar = "0.4"
+fantoccini = { version = "0.22", default-features = false, features = ["rustls-tls"] }
 
 [build-dependencies]
 zstd = "0.13"
diff --git a/src/config/mod.rs b/src/config/mod.rs
index 2abe7e3..7c3bc14 100644
--- a/src/config/mod.rs
+++ b/src/config/mod.rs
@@ -55,6 +55,8 @@ pub struct Config {
     pub workspace_dir: String,
     #[serde(default)]
     pub mcp: McpConfig,
+    #[serde(default)]
+    pub browser: BrowserConfig,
 }
 
 fn default_workspace_dir() -> String {
@@ -324,6 +326,44 @@ fn default_mcp_tool_timeout_secs() -> u64 {
     180
 }
 
+/// Settings for the optional WebDriver-backed `browser` tool.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct BrowserConfig {
+    /// Registers the browser tool when `true` (default: `false`).
+    #[serde(default)]
+    pub enabled: bool,
+    /// WebDriver (chromedriver) endpoint to connect to.
+    #[serde(default = "default_webdriver_url")]
+    pub webdriver_url: String,
+    /// Runs Chrome without a visible window (default: `true`).
+    #[serde(default = "default_true")]
+    pub headless: bool,
+    /// Explicit Chrome/Chromium binary path; when unset, chromedriver picks the browser.
+    #[serde(default)]
+    pub chrome_path: Option<String>,
+}
+
+fn default_webdriver_url() -> String {
+    "http://127.0.0.1:9515".to_string()
+}
+
+fn default_true() -> bool {
+    true
+}
+
+impl Default for BrowserConfig {
+    /// Mirrors the per-field serde defaults so an absent `browser` section and an
+    /// empty one deserialize to the same value.
+    fn default() -> Self {
+        Self {
+            enabled: false,
+            webdriver_url: default_webdriver_url(),
+            headless: default_true(),
+            chrome_path: None,
+        }
+    }
+}
+
 fn default_recall_limit() -> usize { 5 }
 fn default_idle_consolidation_minutes() -> u64 { 10 }
 fn default_timeline_retention_days() -> u64 { 90 }
diff --git a/src/gateway/mod.rs b/src/gateway/mod.rs
index cf9354c..af5d1dc 100644
--- a/src/gateway/mod.rs
+++ b/src/gateway/mod.rs
@@ -78,12 +78,16 @@ impl GatewayState {
         // Create MessageBus first (shared by SessionManager and ChannelManager)
         let bus = MessageBus::new(100);
 
+        // Hand the browser settings to the session layer only when the tool is enabled.
+        let browser_config = config.browser.enabled.then(|| config.browser.clone());
+
         // Create SessionManager with bus injection
         let session_manager = SessionManager::new(
             provider_config.clone(),
             storage.clone(),
             bus.clone(),
             memory_manager,
+            browser_config,
         )?;
         let session_manager = Arc::new(session_manager);
 
diff --git a/src/session/session.rs b/src/session/session.rs
index 0920060..511c4e6 100644
--- a/src/session/session.rs
+++ b/src/session/session.rs
@@ -21,6 +21,7 @@ pub enum HandleResult {
 }
 use crate::channels::slash_command::parse_slash_command;
 use crate::config::LLMProviderConfig;
+use crate::config::BrowserConfig;
 use crate::agent::{AgentLoop, AgentError, ContextCompressor};
 use crate::agent::system_prompt::build_system_prompt;
 use crate::agent::context_compressor::ContextCompressionConfig;
@@ -813,13 +814,18 @@ impl SessionManager {
         storage: Arc,
         bus: Arc,
         memory_manager: Arc,
+        browser_config: Option<BrowserConfig>,
     ) -> Result {
         let mut skills_loader = SkillsLoader::new();
         skills_loader.load_skills();
         skills_loader.set_workspace_skills_dir(provider_config.workspace_dir.clone());
         let skills_loader = Arc::new(skills_loader);
 
-        let tools = Arc::new(create_default_tools(skills_loader.clone(), memory_manager.clone()));
+        let tools = Arc::new(create_default_tools(
+            skills_loader.clone(),
+            memory_manager.clone(),
+            browser_config.as_ref(),
+        ));
 
         Ok(Self {
             inner: Arc::new(Mutex::new(SessionManagerInner {
diff --git a/src/tools/browser.rs b/src/tools/browser.rs
new file mode 100644
index 0000000..b1c25f2
--- /dev/null
+++ b/src/tools/browser.rs
@@ -0,0 +1,986 @@
+//! WebDriver-backed browser automation tool.
+//!
+//! Drives Chrome/Chromium through a chromedriver endpoint using `fantoccini`. A single
+//! session is created lazily by the `open` action and reused by later actions until
+//! `close` is called or the session is reset after a recoverable WebDriver error.
+
+use std::net::{TcpStream, ToSocketAddrs};
+use std::process::Stdio;
+use std::sync::PoisonError;
+use std::time::Duration;
+
+use anyhow::Context;
+use async_trait::async_trait;
+use base64::Engine;
+use fantoccini::actions::{InputSource, MouseActions, PointerAction};
+use fantoccini::key::Key;
+use fantoccini::{Client, ClientBuilder, Locator};
+use serde::{Deserialize, Serialize};
+use serde_json::{Map, Value, json};
+
+use crate::config::BrowserConfig;
+use crate::tools::traits::{Tool, ToolResult};
+
+/// Executable names probed on `PATH` to confirm that Chrome/Chromium is installed.
+const CHROME_CANDIDATES: &[&str] = &[
+    "google-chrome",
+    "chromium-browser",
+    "chromium",
+    "google-chrome-stable",
+    "chrome",
+];
+
+/// Executable names probed on `PATH` to locate chromedriver.
+const CHROMEDRIVER_CANDIDATES: &[&str] = &["chromedriver"];
+
+/// Port assumed when the configured WebDriver URL does not determine one.
+const DEFAULT_WEBDRIVER_PORT: u16 = 9515;
+
+/// Slot holding the chromedriver child process launched by this tool, if any.
+type DriverSlot = std::sync::Mutex<Option<tokio::process::Child>>;
+
+/// Tool that exposes browser automation actions backed by a WebDriver session.
+pub struct BrowserTool {
+    webdriver_url: String,
+    headless: bool,
+    chrome_path: Option<String>,
+    state: tokio::sync::Mutex<BrowserState>,
+    driver: DriverSlot,
+}
+
+/// Session state; `client` stays `None` until an `open` action creates a session.
+struct BrowserState {
+    client: Option<Client>,
+}
+
+impl Drop for BrowserTool {
+    fn drop(&mut self) {
+        // Do not let a chromedriver we spawned outlive the tool.
+        kill_driver_guard(&self.driver);
+    }
+}
+
+impl BrowserTool {
+    /// Builds the tool from `config`. No browser is started until the first `open`.
+    pub fn new(config: &BrowserConfig) -> Self {
+        Self {
+            webdriver_url: config.webdriver_url.clone(),
+            headless: config.headless,
+            chrome_path: config.chrome_path.clone(),
+            state: tokio::sync::Mutex::new(BrowserState { client: None }),
+            driver: std::sync::Mutex::new(None),
+        }
+    }
+}
+
+/// A browser action together with its arguments.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum BrowserAction {
+    Open { url: String },
+    Snapshot {
+        #[serde(default)]
+        interactive_only: bool,
+        #[serde(default)]
+        compact: bool,
+        #[serde(default)]
+        depth: Option<u32>,
+    },
+    Click { selector: String },
+    Fill { selector: String, value: String },
+    Type { selector: String, text: String },
+    GetText { selector: String },
+    GetTitle,
+    GetUrl,
+    Screenshot {
+        #[serde(default)]
+        path: Option<String>,
+        #[serde(default)]
+        full_page: bool,
+    },
+    Wait {
+        #[serde(default)]
+        selector: Option<String>,
+        #[serde(default)]
+        ms: Option<u64>,
+        #[serde(default)]
+        text: Option<String>,
+    },
+    Press { key: String },
+    Hover { selector: String },
+    Scroll {
+        direction: String,
+        #[serde(default)]
+        pixels: Option<u32>,
+    },
+    Close,
+}
+
+/// Wraps `output` in a successful [`ToolResult`].
+fn ok_result(output: impl Into<String>) -> ToolResult {
+    ToolResult {
+        success: true,
+        output: output.into(),
+        error: None,
+    }
+}
+
+/// Returns the string argument `key`, or a `Missing '<key>' for <action>` error.
+fn required_str<'a>(args: &'a Value, key: &str, action: &str) -> anyhow::Result<&'a str> {
+    args.get(key)
+        .and_then(Value::as_str)
+        .ok_or_else(|| anyhow::anyhow!("Missing '{}' for {}", key, action))
+}
+
+/// Returns the string argument `key` as an owned `String`, if present.
+fn optional_string(args: &Value, key: &str) -> Option<String> {
+    args.get(key).and_then(Value::as_str).map(String::from)
+}
+
+/// Returns the unsigned integer argument `key`, saturating at `u32::MAX`.
+fn optional_u32(args: &Value, key: &str) -> Option<u32> {
+    args.get(key)
+        .and_then(Value::as_u64)
+        .map(|n| u32::try_from(n).unwrap_or(u32::MAX))
+}
+
+/// Returns the boolean argument `key`, or `default` when it is absent or not a boolean.
+fn bool_or(args: &Value, key: &str, default: bool) -> bool {
+    args.get(key).and_then(Value::as_bool).unwrap_or(default)
+}
+
+/// Converts an action name plus the raw tool arguments into a [`BrowserAction`].
+///
+/// # Errors
+/// Fails when the action is unknown or a required string argument is missing.
+fn parse_browser_action(action_str: &str, args: &Value) -> anyhow::Result<BrowserAction> {
+    let action = match action_str {
+        "open" => BrowserAction::Open {
+            url: required_str(args, "url", "open action")?.to_string(),
+        },
+        "snapshot" => BrowserAction::Snapshot {
+            interactive_only: bool_or(args, "interactive_only", true),
+            compact: bool_or(args, "compact", true),
+            depth: optional_u32(args, "depth"),
+        },
+        "click" => BrowserAction::Click {
+            selector: required_str(args, "selector", "click")?.to_string(),
+        },
+        "fill" => BrowserAction::Fill {
+            selector: required_str(args, "selector", "fill")?.to_string(),
+            value: required_str(args, "value", "fill")?.to_string(),
+        },
+        "type" => BrowserAction::Type {
+            selector: required_str(args, "selector", "type")?.to_string(),
+            text: required_str(args, "text", "type")?.to_string(),
+        },
+        "get_text" => BrowserAction::GetText {
+            selector: required_str(args, "selector", "get_text")?.to_string(),
+        },
+        "get_title" => BrowserAction::GetTitle,
+        "get_url" => BrowserAction::GetUrl,
+        "screenshot" => BrowserAction::Screenshot {
+            path: optional_string(args, "path"),
+            full_page: bool_or(args, "full_page", false),
+        },
+        "wait" => BrowserAction::Wait {
+            selector: optional_string(args, "selector"),
+            ms: args.get("ms").and_then(Value::as_u64),
+            text: optional_string(args, "text"),
+        },
+        "press" => BrowserAction::Press {
+            key: required_str(args, "key", "press")?.to_string(),
+        },
+        "hover" => BrowserAction::Hover {
+            selector: required_str(args, "selector", "hover")?.to_string(),
+        },
+        "scroll" => BrowserAction::Scroll {
+            direction: required_str(args, "direction", "scroll")?.to_string(),
+            pixels: optional_u32(args, "pixels"),
+        },
+        "close" => BrowserAction::Close,
+        other => anyhow::bail!("Unsupported browser action: {}", other),
+    };
+    Ok(action)
+}
+
+#[async_trait]
+impl Tool for BrowserTool {
+    fn name(&self) -> &str {
+        "browser"
+    }
+
+    fn description(&self) -> &str {
+        "Automate browser interactions using WebDriver. \
+         First call open to navigate to a URL, then use other actions. \
+         Use snapshot to get an accessibility tree of the page. \
+         Selectors can be CSS, @e1 refs (from snapshot), text=..., or label=..."
+    }
+
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "description": "Browser action to perform",
+                    "enum": [
+                        "open", "snapshot", "click", "fill", "type",
+                        "get_text", "get_title", "get_url", "screenshot",
+                        "wait", "press", "hover", "scroll", "close"
+                    ]
+                },
+                "url": {
+                    "type": "string",
+                    "description": "URL to navigate to (required for open)"
+                },
+                "selector": {
+                    "type": "string",
+                    "description": "CSS selector, @e1 ref, text=..., or label=..."
+                },
+                "value": {
+                    "type": "string",
+                    "description": "Value to fill into a form field"
+                },
+                "text": {
+                    "type": "string",
+                    "description": "Text to type or wait for"
+                },
+                "key": {
+                    "type": "string",
+                    "description": "Key to press (Enter, Tab, Escape, Backspace, Delete, ArrowUp, ArrowDown, etc.)"
+                },
+                "direction": {
+                    "type": "string",
+                    "description": "Scroll direction",
+                    "enum": ["up", "down", "left", "right"]
+                },
+                "pixels": {
+                    "type": "integer",
+                    "description": "Pixels to scroll (default 600)"
+                },
+                "ms": {
+                    "type": "integer",
+                    "description": "Milliseconds to wait (for wait action)"
+                },
+                "path": {
+                    "type": "string",
+                    "description": "File path to save screenshot to. If omitted, returns base64."
+                },
+                "full_page": {
+                    "type": "boolean",
+                    "description": "Take full-page screenshot"
+                },
+                "interactive_only": {
+                    "type": "boolean",
+                    "description": "Only show interactive elements in snapshot (default true)"
+                },
+                "compact": {
+                    "type": "boolean",
+                    "description": "Compact snapshot output (default true)"
+                },
+                "depth": {
+                    "type": "integer",
+                    "description": "Max depth for snapshot traversal"
+                }
+            },
+            "required": ["action"]
+        })
+    }
+
+    fn exclusive(&self) -> bool {
+        true
+    }
+
+    async fn execute(&self, args: Value) -> anyhow::Result<ToolResult> {
+        let action_str = args
+            .get("action")
+            .and_then(Value::as_str)
+            .ok_or_else(|| anyhow::anyhow!("Missing required parameter: action"))?;
+        let action = parse_browser_action(action_str, &args)?;
+
+        // The async mutex serializes actions so they never interleave on one session.
+        let mut state = self.state.lock().await;
+        state
+            .execute_action(
+                action,
+                self.headless,
+                &self.webdriver_url,
+                self.chrome_path.as_deref(),
+                &self.driver,
+            )
+            .await
+    }
+}
+
+impl BrowserState {
+    /// Runs `action`, resetting the session when a recoverable WebDriver error occurs.
+    ///
+    /// Only `open` is retried after a reset: every other action needs an existing
+    /// session, so a retry could never succeed and would hide the original error.
+    async fn execute_action(
+        &mut self,
+        action: BrowserAction,
+        headless: bool,
+        webdriver_url: &str,
+        chrome_path: Option<&str>,
+        driver: &DriverSlot,
+    ) -> anyhow::Result<ToolResult> {
+        let first_attempt = self
+            .try_execute(action.clone(), headless, webdriver_url, chrome_path, driver)
+            .await;
+
+        let err = match first_attempt {
+            Ok(result) => return Ok(result),
+            Err(err) if is_recoverable_error(&err) => err,
+            Err(err) => return Err(err),
+        };
+
+        tracing::warn!("Recoverable browser session error, resetting session: {:#}", err);
+        self.reset_session(driver).await;
+
+        if matches!(action, BrowserAction::Open { .. }) {
+            self.try_execute(action, headless, webdriver_url, chrome_path, driver)
+                .await
+        } else {
+            Err(err.context(
+                "Browser session was lost and has been reset. Run action='open' again",
+            ))
+        }
+    }
+
+    /// Executes a single action against the current session.
+    ///
+    /// Only `open` creates a session; actions that need the page fail with
+    /// "No active browser session" until one exists.
+    #[allow(clippy::too_many_lines)] // One flat dispatch over every action variant.
+    async fn try_execute(
+        &mut self,
+        action: BrowserAction,
+        headless: bool,
+        webdriver_url: &str,
+        chrome_path: Option<&str>,
+        driver: &DriverSlot,
+    ) -> anyhow::Result<ToolResult> {
+        match action {
+            BrowserAction::Open { url } => {
+                self.ensure_session(headless, webdriver_url, chrome_path, driver)
+                    .await?;
+                let client = self.active_client()?;
+                client.goto(&url).await?;
+                let current = client.current_url().await?;
+                Ok(ok_result(format!("Opened {}", current)))
+            }
+            BrowserAction::Snapshot {
+                interactive_only,
+                compact,
+                depth,
+            } => {
+                let client = self.active_client()?;
+                let script = snapshot_script(interactive_only, compact, depth.map(i64::from));
+                let result: Value = client.execute(&script, vec![]).await?;
+                Ok(ok_result(serde_json::to_string_pretty(&result)?))
+            }
+            BrowserAction::Click { selector } => {
+                let client = self.active_client()?;
+                find_element(client, &selector).await?.click().await?;
+                Ok(ok_result(format!("Clicked {}", selector)))
+            }
+            BrowserAction::Fill { selector, value } => {
+                let client = self.active_client()?;
+                let el = find_element(client, &selector).await?;
+                // Best effort: some elements cannot be cleared but still accept keys.
+                let _ = el.clear().await;
+                el.send_keys(&value).await?;
+                Ok(ok_result(format!("Filled {} with {}", selector, value)))
+            }
+            BrowserAction::Type { selector, text } => {
+                let client = self.active_client()?;
+                find_element(client, &selector)
+                    .await?
+                    .send_keys(&text)
+                    .await?;
+                Ok(ok_result(format!(
+                    "Typed {} chars into {}",
+                    text.chars().count(),
+                    selector
+                )))
+            }
+            BrowserAction::GetText { selector } => {
+                let client = self.active_client()?;
+                let text = find_element(client, &selector).await?.text().await?;
+                Ok(ok_result(text))
+            }
+            BrowserAction::GetTitle => {
+                let client = self.active_client()?;
+                Ok(ok_result(client.title().await?))
+            }
+            BrowserAction::GetUrl => {
+                let client = self.active_client()?;
+                Ok(ok_result(client.current_url().await?.to_string()))
+            }
+            BrowserAction::Screenshot { path, full_page } => {
+                let client = self.active_client()?;
+                if full_page {
+                    // `full_page` is accepted but not implemented yet.
+                    tracing::debug!("full_page is not implemented; taking a regular screenshot");
+                }
+                let png = client.screenshot().await?;
+
+                match path {
+                    Some(p) => {
+                        tokio::fs::write(&p, &png).await?;
+                        Ok(ok_result(format!("Screenshot saved to {}", p)))
+                    }
+                    None => {
+                        let b64 = base64::engine::general_purpose::STANDARD.encode(&png);
+                        Ok(ok_result(format!("data:image/png;base64,{}", b64)))
+                    }
+                }
+            }
+            BrowserAction::Wait { selector, ms, text } => {
+                // Precedence when several are given: selector, then ms, then text.
+                if let Some(sel) = selector {
+                    let client = self.active_client()?;
+                    wait_for_selector(client, &sel).await?;
+                    Ok(ok_result(format!("Element found: {}", sel)))
+                } else if let Some(duration_ms) = ms {
+                    tokio::time::sleep(Duration::from_millis(duration_ms)).await;
+                    Ok(ok_result(format!("Waited {} ms", duration_ms)))
+                } else if let Some(needle) = text {
+                    let client = self.active_client()?;
+                    let xpath = xpath_contains_text(&needle);
+                    client.wait().for_element(Locator::XPath(&xpath)).await?;
+                    Ok(ok_result(format!("Text appeared: {}", needle)))
+                } else {
+                    tokio::time::sleep(Duration::from_millis(250)).await;
+                    Ok(ok_result("Waited 250 ms"))
+                }
+            }
+            BrowserAction::Press { key } => {
+                let client = self.active_client()?;
+                let key_input = webdriver_key(&key);
+                // Prefer the focused element; fall back to <body> if none is reported.
+                let target = match client.active_element().await {
+                    Ok(el) => el,
+                    Err(_) => find_element(client, "body").await?,
+                };
+                target.send_keys(&key_input).await?;
+                Ok(ok_result(format!("Pressed {}", key)))
+            }
+            BrowserAction::Hover { selector } => {
+                let client = self.active_client()?;
+                let el = find_element(client, &selector).await?;
+                hover_element(client, &el).await?;
+                Ok(ok_result(format!("Hovered {}", selector)))
+            }
+            BrowserAction::Scroll { direction, pixels } => {
+                let client = self.active_client()?;
+                let amount = i64::from(pixels.unwrap_or(600));
+                let (dx, dy) = match direction.as_str() {
+                    "up" => (0, -amount),
+                    "down" => (0, amount),
+                    "left" => (-amount, 0),
+                    "right" => (amount, 0),
+                    _ => anyhow::bail!(
+                        "Unsupported scroll direction '{}'. Use up/down/left/right",
+                        direction
+                    ),
+                };
+                let position: Value = client
+                    .execute(
+                        "window.scrollBy(arguments[0], arguments[1]); \
+                         return { x: window.scrollX, y: window.scrollY };",
+                        vec![json!(dx), json!(dy)],
+                    )
+                    .await?;
+                Ok(ok_result(format!(
+                    "Scrolled {} by {} -> {}",
+                    direction,
+                    amount,
+                    serde_json::to_string(&position).unwrap_or_default()
+                )))
+            }
+            BrowserAction::Close => {
+                self.reset_session(driver).await;
+                Ok(ok_result("Browser session closed"))
+            }
+        }
+    }
+
+    /// Closes the WebDriver session (if any) and stops a chromedriver we launched.
+    pub async fn reset_session(&mut self, driver: &DriverSlot) {
+        if let Some(client) = self.client.take() {
+            // Best effort: the session may already be gone on the driver side.
+            let _ = client.close().await;
+        }
+        kill_driver_guard(driver);
+    }
+
+    /// Ensures a WebDriver session exists, launching chromedriver first when nothing
+    /// is listening at `webdriver_url`.
+    ///
+    /// # Errors
+    /// Fails when Chrome or chromedriver cannot be found, chromedriver does not start
+    /// listening in time, or the WebDriver session cannot be created.
+    async fn ensure_session(
+        &mut self,
+        headless: bool,
+        webdriver_url: &str,
+        chrome_path: Option<&str>,
+        driver: &DriverSlot,
+    ) -> anyhow::Result<()> {
+        if self.client.is_some() {
+            return Ok(());
+        }
+
+        // An explicit, non-blank chrome_path wins; otherwise require Chrome on PATH.
+        let chrome_binary = match chrome_path.map(str::trim) {
+            Some(path) if !path.is_empty() => Some(path.to_string()),
+            _ => {
+                verify_chrome_installed()?;
+                None
+            }
+        };
+
+        if !webdriver_endpoint_reachable(webdriver_url) {
+            // chromedriver is only required locally when the endpoint is not served yet.
+            let chromedriver_binary = find_chromedriver_binary()?;
+            let port = extract_port(webdriver_url);
+            tracing::info!(
+                "chromedriver not running at {}, launching: {} --port={}",
+                webdriver_url,
+                chromedriver_binary,
+                port
+            );
+            launch_chromedriver(driver, &chromedriver_binary, port)?;
+            wait_for_webdriver_ready(webdriver_url).await.map_err(|e| {
+                kill_driver_guard(driver);
+                e.context("chromedriver failed to start. Is Chrome installed? Check browser.chrome_path in config.json")
+            })?;
+        }
+
+        let mut args: Vec<Value> = Vec::new();
+        if headless {
+            let headless_flags = [
+                "--headless=new",
+                "--disable-gpu",
+                "--no-sandbox",
+                "--disable-dev-shm-usage",
+            ];
+            args.extend(headless_flags.iter().map(|flag| Value::from(*flag)));
+        }
+        args.push(Value::from("--window-size=1280,720"));
+
+        let mut chrome_options: Map<String, Value> = Map::new();
+        if let Some(binary) = chrome_binary {
+            chrome_options.insert("binary".to_string(), Value::String(binary));
+        }
+        chrome_options.insert("args".to_string(), Value::Array(args));
+
+        let mut capabilities: Map<String, Value> = Map::new();
+        capabilities.insert(
+            "goog:chromeOptions".to_string(),
+            Value::Object(chrome_options),
+        );
+
+        let client = ClientBuilder::rustls()?
+            .capabilities(capabilities)
+            .connect(webdriver_url)
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to connect to WebDriver at {}. \
+                     Make sure chromedriver is installed and running.",
+                    webdriver_url
+                )
+            })?;
+
+        self.client = Some(client);
+        Ok(())
+    }
+
+    /// Returns the live client, or an error telling the caller to `open` first.
+    fn active_client(&self) -> anyhow::Result<&Client> {
+        self.client
+            .as_ref()
+            .ok_or_else(|| anyhow::anyhow!("No active browser session. Run action='open' first"))
+    }
+}
+
+/// Spawns chromedriver on `port` and stores the child process in `driver`.
+fn launch_chromedriver(
+    driver: &DriverSlot,
+    chromedriver_binary: &str,
+    port: u16,
+) -> anyhow::Result<()> {
+    let child = tokio::process::Command::new(chromedriver_binary)
+        .arg(format!("--port={}", port))
+        .stdout(Stdio::null())
+        .stderr(Stdio::null())
+        // Safety net so the process is also killed if the handle is merely dropped.
+        .kill_on_drop(true)
+        .spawn()
+        .with_context(|| format!("Failed to launch chromedriver: {}", chromedriver_binary))?;
+
+    // A poisoned lock only means another holder panicked; the slot itself stays usable,
+    // and panicking here would leak the process that was just spawned.
+    *driver.lock().unwrap_or_else(PoisonError::into_inner) = Some(child);
+    Ok(())
+}
+
+/// Kills and forgets the chromedriver process held in `driver`, if there is one.
+fn kill_driver_guard(driver: &DriverSlot) {
+    let mut guard = driver.lock().unwrap_or_else(PoisonError::into_inner);
+    if let Some(mut child) = guard.take() {
+        tracing::debug!("Stopping chromedriver process");
+        // Best effort: the process may already have exited.
+        let _ = child.start_kill();
+    }
+}
+
+/// Polls every 200 ms, for up to 10 seconds, until `webdriver_url` accepts connections.
+async fn wait_for_webdriver_ready(webdriver_url: &str) -> anyhow::Result<()> {
+    let deadline = tokio::time::Instant::now() + Duration::from_secs(10);
+
+    loop {
+        if webdriver_endpoint_reachable(webdriver_url) {
+            return Ok(());
+        }
+
+        if tokio::time::Instant::now() >= deadline {
+            anyhow::bail!(
+                "Timed out waiting for chromedriver to start listening on {}",
+                webdriver_url
+            );
+        }
+
+        tokio::time::sleep(Duration::from_millis(200)).await;
+    }
+}
+
+/// Checks that at least one known Chrome/Chromium executable is on `PATH`.
+fn verify_chrome_installed() -> anyhow::Result<()> {
+    if CHROME_CANDIDATES
+        .iter()
+        .any(|name| which::which(name).is_ok())
+    {
+        return Ok(());
+    }
+
+    anyhow::bail!(
+        "Chrome/Chromium not found. Install with:\n\
+         Ubuntu/Debian: apt install chromium-browser\n\
+         macOS: brew install chromium\n\
+         Or set browser.chrome_path in config.json"
+    )
+}
+
+/// Returns the path of the first chromedriver executable found on `PATH`.
+fn find_chromedriver_binary() -> anyhow::Result<String> {
+    CHROMEDRIVER_CANDIDATES
+        .iter()
+        .find_map(|name| which::which(name).ok())
+        .map(|path| path.to_string_lossy().into_owned())
+        .ok_or_else(|| {
+            anyhow::anyhow!(
+                "chromedriver not found. Install with:\n\
+                 Ubuntu/Debian: apt install chromium-chromedriver\n\
+                 macOS: brew install chromedriver\n\
+                 Or download from https://chromedriver.chromium.org/"
+            )
+        })
+}
+
+/// Port chromedriver must listen on for `webdriver_url` to reach it.
+///
+/// Uses the same rule as [`webdriver_endpoint_reachable`] (explicit port, else the
+/// scheme default) so the probe and the launched process agree on the port.
+fn extract_port(webdriver_url: &str) -> u16 {
+    reqwest::Url::parse(webdriver_url)
+        .ok()
+        .and_then(|url| url.port_or_known_default())
+        .unwrap_or(DEFAULT_WEBDRIVER_PORT)
+}
+
+/// Returns `true` when a TCP connection to the WebDriver endpoint succeeds.
+///
+/// NOTE: name resolution and the connect attempt (500 ms timeout per address) are
+/// blocking calls, so this briefly stalls the calling async task.
+fn webdriver_endpoint_reachable(webdriver_url: &str) -> bool {
+    let parsed = match reqwest::Url::parse(webdriver_url) {
+        Ok(url) => url,
+        Err(_) => return false,
+    };
+
+    if !matches!(parsed.scheme(), "http" | "https") {
+        return false;
+    }
+
+    let host = match parsed.host_str() {
+        Some(h) if !h.is_empty() => h,
+        _ => return false,
+    };
+    let port = parsed
+        .port_or_known_default()
+        .unwrap_or(DEFAULT_WEBDRIVER_PORT);
+
+    // Resolve host names as well as IP literals so the probe targets the configured
+    // host instead of silently falling back to the loopback address.
+    let timeout = Duration::from_millis(500);
+    match format!("{}:{}", host, port).to_socket_addrs() {
+        Ok(mut addrs) => addrs.any(|addr| TcpStream::connect_timeout(&addr, timeout).is_ok()),
+        Err(_) => false,
+    }
+}
+
+/// Heuristically decides, from the error text, whether the session is dead and
+/// should be discarded.
+fn is_recoverable_error(err: &anyhow::Error) -> bool {
+    let message = format!("{:#}", err).to_ascii_lowercase();
+
+    message.contains("invalid session id")
+        || message.contains("no such window")
+        || message.contains("session not created")
+        || message.contains("connection reset")
+        || message.contains("broken pipe")
+        || (message.contains("webdriver")
+            && (message.contains("timed out") || message.contains("timeout")))
+}
+
+/// A user selector translated into the locator flavor WebDriver understands.
+enum SelectorKind {
+    Css(String),
+    XPath(String),
+}
+
+/// Translates the tool's selector syntax into a CSS or XPath locator.
+///
+/// * `text=...` matches the innermost element containing the text.
+/// * `label=...` matches a form control following a matching `<label>`, an element
+///   with a matching `aria-label`, or the label itself.
+/// * `@eN` matches the element carrying that `data-zc-ref` value.
+/// * Anything else is passed through as a CSS selector.
+fn parse_selector(selector: &str) -> SelectorKind {
+    let trimmed = selector.trim();
+
+    if let Some(text_query) = trimmed.strip_prefix("text=") {
+        return SelectorKind::XPath(xpath_contains_text(text_query));
+    }
+
+    if let Some(label_query) = trimmed.strip_prefix("label=") {
+        let literal = xpath_literal(label_query);
+        return SelectorKind::XPath(format!(
+            "(//label[contains(normalize-space(.), {literal})] \
+             /following::*[self::input or self::textarea or self::select][1] \
+             | //*[@aria-label and contains(normalize-space(@aria-label), {literal})] \
+             | //label[contains(normalize-space(.), {literal})])"
+        ));
+    }
+
+    if trimmed.starts_with('@') {
+        let escaped = css_attr_escape(trimmed);
+        return SelectorKind::Css(format!(r#"[data-zc-ref="{escaped}"]"#));
+    }
+
+    SelectorKind::Css(trimmed.to_string())
+}
+
+/// Finds the first element matching `selector` (see [`parse_selector`]).
+async fn find_element(
+    client: &Client,
+    selector: &str,
+) -> anyhow::Result<fantoccini::elements::Element> {
+    let element = match parse_selector(selector) {
+        SelectorKind::Css(css) => client.find(Locator::Css(&css)).await?,
+        SelectorKind::XPath(xpath) => client.find(Locator::XPath(&xpath)).await?,
+    };
+    Ok(element)
+}
+
+/// Waits until an element matching `selector` is present on the page.
+async fn wait_for_selector(client: &Client, selector: &str) -> anyhow::Result<()> {
+    match parse_selector(selector) {
+        SelectorKind::Css(css) => {
+            client.wait().for_element(Locator::Css(&css)).await?;
+        }
+        SelectorKind::XPath(xpath) => {
+            client.wait().for_element(Locator::XPath(&xpath)).await?;
+        }
+    }
+    Ok(())
+}
+
+/// Moves the virtual mouse pointer over `element`.
+async fn hover_element(
+    client: &Client,
+    element: &fantoccini::elements::Element,
+) -> anyhow::Result<()> {
+    let actions = MouseActions::new("mouse".to_string()).then(PointerAction::MoveToElement {
+        element: element.clone(),
+        duration: Some(Duration::from_millis(150)),
+        x: 0.0,
+        y: 0.0,
+    });
+
+    client.perform_actions(actions).await?;
+    // Best effort: releasing input state must not turn a successful hover into an error.
+    let _ = client.release_actions().await;
+    Ok(())
+}
+
+/// Escapes `input` for use inside a double-quoted CSS attribute value.
+fn css_attr_escape(input: &str) -> String {
+    input
+        .replace('\\', "\\\\")
+        .replace('"', "\\\"")
+        .replace('\n', " ")
+}
+
+/// XPath selecting the innermost elements under `<body>` whose text contains `text`.
+///
+/// Ancestors such as `<html>` contain all page text, so a plain `contains` test would
+/// return them first; requiring that no descendant also matches avoids that.
+fn xpath_contains_text(text: &str) -> String {
+    let literal = xpath_literal(text);
+    // Per-element test; script/style contents are not page text.
+    let hit = format!(
+        "not(self::script or self::style) and contains(normalize-space(.), {literal})"
+    );
+    format!("//body/descendant-or-self::*[{hit} and not(.//*[{hit}])]")
+}
+
+/// Quotes `input` as an XPath 1.0 string literal.
+///
+/// XPath 1.0 has no escape sequences, so text containing both quote characters is
+/// assembled with `concat(...)` from double-quoted pieces and `'"'`.
+fn xpath_literal(input: &str) -> String {
+    if !input.contains('"') {
+        return format!("\"{}\"", input);
+    }
+    if !input.contains('\'') {
+        return format!("'{}'", input);
+    }
+
+    let segments: Vec<&str> = input.split('"').collect();
+    let mut parts: Vec<String> = Vec::new();
+    for (index, part) in segments.iter().enumerate() {
+        if !part.is_empty() {
+            parts.push(format!("\"{}\"", part));
+        }
+        if index + 1 < segments.len() {
+            parts.push("'\"'".to_string());
+        }
+    }
+
+    if parts.is_empty() {
+        "\"\"".to_string()
+    } else {
+        format!("concat({})", parts.join(","))
+    }
+}
+
+/// Maps a key name (case-insensitive, e.g. `Enter`, `ArrowUp`) to the text WebDriver
+/// expects; anything that is not a known key name is sent as typed.
+fn webdriver_key(key: &str) -> String {
+    let trimmed = key.trim();
+    match trimmed.to_ascii_lowercase().as_str() {
+        "enter" => Key::Enter.to_string(),
+        "return" => Key::Return.to_string(),
+        "tab" => Key::Tab.to_string(),
+        "escape" | "esc" => Key::Escape.to_string(),
+        "backspace" => Key::Backspace.to_string(),
+        "delete" => Key::Delete.to_string(),
+        "arrowup" => Key::Up.to_string(),
+        "arrowdown" => Key::Down.to_string(),
+        "arrowleft" => Key::Left.to_string(),
+        "arrowright" => Key::Right.to_string(),
+        "home" => Key::Home.to_string(),
+        "end" => Key::End.to_string(),
+        "pageup" => Key::PageUp.to_string(),
+        "pagedown" => Key::PageDown.to_string(),
+        "space" => " ".to_string(),
+        // Literal keys keep the caller's casing ("A" must not become "a").
+        _ if trimmed.is_empty() => key.to_string(),
+        _ => trimmed.to_string(),
+    }
+}
+
+/// Builds the JavaScript that lists visible page elements and tags each one with a
+/// `data-zc-ref` attribute (`@e1`, `@e2`, ...) that later selectors can target.
+///
+/// The script is a function body for WebDriver's "execute script" command, so it must
+/// `return` its result; it yields a plain object for the driver to serialize.
+fn snapshot_script(interactive_only: bool, compact: bool, depth: Option<i64>) -> String {
+    let depth_literal = depth.map_or_else(|| "null".to_string(), |level| level.to_string());
+
+    format!(
+        r#"return (() => {{
+  const interactiveOnly = {interactive_only};
+  const compact = {compact};
+  const maxDepth = {depth_literal};
+  const nodes = [];
+  const root = document.body || document.documentElement;
+  let counter = 0;
+
+  // Drop refs from earlier snapshots so every @eN selector stays unique.
+  for (const stale of document.querySelectorAll('[data-zc-ref]')) {{
+    stale.removeAttribute('data-zc-ref');
+  }}
+
+  const isVisible = (el) => {{
+    const style = window.getComputedStyle(el);
+    if (style.display === 'none' || style.visibility === 'hidden' || Number(style.opacity || 1) === 0) {{
+      return false;
+    }}
+    const rect = el.getBoundingClientRect();
+    return rect.width > 0 && rect.height > 0;
+  }};
+
+  const isInteractive = (el) => {{
+    if (el.matches('a,button,input,select,textarea,summary,[role],*[tabindex]')) return true;
+    return typeof el.onclick === 'function';
+  }};
+
+  const describe = (el, depth) => {{
+    const interactive = isInteractive(el);
+    const text = (el.innerText || el.textContent || '').trim().replace(/\s+/g, ' ').slice(0, 140);
+    if (interactiveOnly && !interactive) return;
+    if (compact && !interactive && !text) return;
+
+    const ref = '@e' + (++counter);
+    el.setAttribute('data-zc-ref', ref);
+    nodes.push({{
+      ref,
+      depth,
+      tag: el.tagName.toLowerCase(),
+      id: el.id || null,
+      role: el.getAttribute('role'),
+      text,
+      interactive,
+    }});
+  }};
+
+  const walk = (el, depth) => {{
+    if (!(el instanceof Element)) return;
+    if (maxDepth !== null && depth > maxDepth) return;
+    if (isVisible(el)) {{
+      describe(el, depth);
+    }}
+    for (const child of el.children) {{
+      walk(child, depth + 1);
+      if (nodes.length >= 400) return;
+    }}
+  }};
+
+  if (root) walk(root, 0);
+
+  return {{
+    title: document.title,
+    url: window.location.href,
+    count: nodes.length,
+    nodes,
+  }};
+}})();"#
+    )
+}
diff --git a/src/tools/mod.rs b/src/tools/mod.rs
index 03a572d..b1608e9 100644
--- a/src/tools/mod.rs
+++ b/src/tools/mod.rs
@@ -1,4 +1,5 @@
 pub mod bash;
+pub mod browser;
 pub mod calculator;
 pub mod chat_manager;
 pub mod content_search;
@@ -18,6 +19,7 @@ pub mod traits;
 pub mod web_fetch;
 
 pub use bash::BashTool;
+pub use browser::BrowserTool;
 pub use calculator::CalculatorTool;
 pub use chat_manager::ChatManagerTool;
 pub use content_search::ContentSearchTool;
@@ -34,6 +36,7 @@ pub use traits::{OutboundMessenger, Tool, ToolResult};
 pub use web_fetch::WebFetchTool;
 
 use std::sync::Arc;
+use crate::config::BrowserConfig;
 use crate::memory::MemoryManager;
 use crate::skills::SkillsLoader;
 
@@ -43,6 +46,7 @@ use crate::skills::SkillsLoader;
 pub fn create_default_tools(
     skills_loader: Arc,
     memory: Arc,
+    browser_config: Option<&BrowserConfig>,
 ) -> ToolRegistry {
     let registry = ToolRegistry::new();
     registry.register(CalculatorTool::new());
@@ -66,5 +70,10 @@ pub fn create_default_tools(
     registry.register(TimelineRecallTool::new(memory.clone()));
     registry.register(MemoryForgetTool::new(memory.clone()));
 
+    // Register the browser tool only when it is configured and enabled.
+    if let Some(cfg) = browser_config.filter(|cfg| cfg.enabled) {
+        registry.register(BrowserTool::new(cfg));
+    }
+
     registry
 }