use std::net::TcpStream; use std::process::Stdio; use std::time::Duration; use anyhow::Context; use async_trait::async_trait; use base64::Engine; use fantoccini::actions::{InputSource, MouseActions, PointerAction}; use fantoccini::key::Key; use fantoccini::{Client, ClientBuilder, Locator}; use serde::{Deserialize, Serialize}; use serde_json::{Map, Value, json}; use tracing; use crate::config::BrowserConfig; use crate::tools::traits::{Tool, ToolResult}; const CHROME_CANDIDATES: &[&str] = &[ "google-chrome", "chromium-browser", "chromium", "google-chrome-stable", "chrome", ]; const CHROMEDRIVER_CANDIDATES: &[&str] = &["chromedriver"]; pub struct BrowserTool { webdriver_url: String, headless: bool, chrome_path: Option, state: tokio::sync::Mutex, driver: std::sync::Mutex>, } struct BrowserState { client: Option, } impl Drop for BrowserTool { fn drop(&mut self) { if let Ok(mut driver) = self.driver.lock() { if let Some(ref mut child) = driver.take() { tracing::debug!("Stopping chromedriver process"); let _ = child.start_kill(); } } } } impl BrowserTool { pub fn new(config: &BrowserConfig) -> Self { Self { webdriver_url: config.webdriver_url.clone(), headless: config.headless, chrome_path: config.chrome_path.clone(), state: tokio::sync::Mutex::new(BrowserState { client: None }), driver: std::sync::Mutex::new(None), } } } #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum BrowserAction { Open { url: String }, Snapshot { #[serde(default)] interactive_only: bool, #[serde(default)] compact: bool, #[serde(default)] depth: Option, }, Click { selector: String }, Fill { selector: String, value: String }, Type { selector: String, text: String }, GetText { selector: String }, GetTitle, GetUrl, Screenshot { #[serde(default)] path: Option, #[serde(default)] full_page: bool, }, Wait { #[serde(default)] selector: Option, #[serde(default)] ms: Option, #[serde(default)] text: Option, }, Press { key: String }, Hover { selector: String }, Scroll { direction: String, #[serde(default)] pixels: Option, }, Close, } fn parse_browser_action(action_str: &str, args: &Value) -> anyhow::Result { match action_str { "open" => { let url = args .get("url") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing 'url' for open action"))?; Ok(BrowserAction::Open { url: url.to_string(), }) } "snapshot" => Ok(BrowserAction::Snapshot { interactive_only: args .get("interactive_only") .and_then(Value::as_bool) .unwrap_or(true), compact: args .get("compact") .and_then(Value::as_bool) .unwrap_or(true), depth: args .get("depth") .and_then(|v| v.as_u64()) .map(|d| u32::try_from(d).unwrap_or(u32::MAX)), }), "click" => { let selector = args .get("selector") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing 'selector' for click"))?; Ok(BrowserAction::Click { selector: selector.to_string(), }) } "fill" => { let selector = args .get("selector") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing 'selector' for fill"))?; let value = args .get("value") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing 'value' for fill"))?; Ok(BrowserAction::Fill { selector: selector.to_string(), value: value.to_string(), }) } "type" => { let selector = args .get("selector") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing 'selector' for type"))?; let text = args .get("text") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing 'text' for type"))?; Ok(BrowserAction::Type { selector: selector.to_string(), text: text.to_string(), }) } "get_text" => { let selector = args .get("selector") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing 'selector' for get_text"))?; Ok(BrowserAction::GetText { selector: selector.to_string(), }) } "get_title" => Ok(BrowserAction::GetTitle), "get_url" => Ok(BrowserAction::GetUrl), "screenshot" => Ok(BrowserAction::Screenshot { path: args.get("path").and_then(|v| v.as_str()).map(String::from), full_page: args .get("full_page") .and_then(Value::as_bool) .unwrap_or(false), }), "wait" => Ok(BrowserAction::Wait { selector: args .get("selector") .and_then(|v| v.as_str()) .map(String::from), ms: args.get("ms").and_then(|v| v.as_u64()), text: args .get("text") .and_then(|v| v.as_str()) .map(String::from), }), "press" => { let key = args .get("key") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing 'key' for press"))?; Ok(BrowserAction::Press { key: key.to_string(), }) } "hover" => { let selector = args .get("selector") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing 'selector' for hover"))?; Ok(BrowserAction::Hover { selector: selector.to_string(), }) } "scroll" => { let direction = args .get("direction") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing 'direction' for scroll"))?; Ok(BrowserAction::Scroll { direction: direction.to_string(), pixels: args .get("pixels") .and_then(|v| v.as_u64()) .map(|p| u32::try_from(p).unwrap_or(u32::MAX)), }) } "close" => Ok(BrowserAction::Close), other => anyhow::bail!("Unsupported browser action: {}", other), } } #[async_trait] impl Tool for BrowserTool { fn name(&self) -> &str { "browser" } fn description(&self) -> &str { "Automate browser interactions using WebDriver. \ First call open to navigate to a URL, then use other actions. \ Use snapshot to get an accessibility tree of the page. \ Selectors can be CSS, @e1 refs (from snapshot), text=..., or label=..." } fn parameters_schema(&self) -> Value { json!({ "type": "object", "properties": { "action": { "type": "string", "description": "Browser action to perform", "enum": [ "open", "snapshot", "click", "fill", "type", "get_text", "get_title", "get_url", "screenshot", "wait", "press", "hover", "scroll", "close" ] }, "url": { "type": "string", "description": "URL to navigate to (required for open)" }, "selector": { "type": "string", "description": "CSS selector, @e1 ref, text=..., or label=..." }, "value": { "type": "string", "description": "Value to fill into a form field" }, "text": { "type": "string", "description": "Text to type or wait for" }, "key": { "type": "string", "description": "Key to press (Enter, Tab, Escape, Backspace, Delete, ArrowUp, ArrowDown, etc.)" }, "direction": { "type": "string", "description": "Scroll direction", "enum": ["up", "down", "left", "right"] }, "pixels": { "type": "integer", "description": "Pixels to scroll (default 600)" }, "ms": { "type": "integer", "description": "Milliseconds to wait (for wait action)" }, "path": { "type": "string", "description": "File path to save screenshot to. If omitted, returns base64." }, "full_page": { "type": "boolean", "description": "Take full-page screenshot" }, "interactive_only": { "type": "boolean", "description": "Only show interactive elements in snapshot (default true)" }, "compact": { "type": "boolean", "description": "Compact snapshot output (default true)" }, "depth": { "type": "integer", "description": "Max depth for snapshot traversal" } }, "required": ["action"] }) } fn exclusive(&self) -> bool { true } async fn execute(&self, args: Value) -> anyhow::Result { let action_str = args .get("action") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing required parameter: action"))?; let action = parse_browser_action(action_str, &args)?; let mut state = self.state.lock().await; state .execute_action( action, self.headless, &self.webdriver_url, self.chrome_path.as_deref(), &self.driver, ) .await } } impl BrowserState { async fn execute_action( &mut self, action: BrowserAction, headless: bool, webdriver_url: &str, chrome_path: Option<&str>, driver: &std::sync::Mutex>, ) -> anyhow::Result { let action_clone = action.clone(); let result = self .try_execute(action, headless, webdriver_url, chrome_path, driver) .await; match result { Ok(r) => Ok(r), Err(e) => { if is_recoverable_error(&e) { tracing::warn!("Recoverable browser session error, retrying: {:#}", e); self.reset_session(driver).await; self.try_execute(action_clone, headless, webdriver_url, chrome_path, driver) .await } else { Err(e) } } } } #[allow(clippy::too_many_lines)] async fn try_execute( &mut self, action: BrowserAction, headless: bool, webdriver_url: &str, chrome_path: Option<&str>, driver: &std::sync::Mutex>, ) -> anyhow::Result { match action { BrowserAction::Open { url } => { self.ensure_session(headless, webdriver_url, chrome_path, driver) .await?; let client = self.active_client()?; client.goto(&url).await?; let current = client.current_url().await?; Ok(ToolResult { success: true, output: format!("Opened {}", current), error: None, }) } BrowserAction::Snapshot { interactive_only, compact, depth, } => { let client = self.active_client()?; let result: Value = client .execute( &snapshot_script(interactive_only, compact, depth.map(i64::from)), vec![], ) .await?; let output = serde_json::to_string_pretty(&result)?; Ok(ToolResult { success: true, output, error: None, }) } BrowserAction::Click { selector } => { let client = self.active_client()?; find_element(client, &selector).await?.click().await?; Ok(ToolResult { success: true, output: format!("Clicked {}", selector), error: None, }) } BrowserAction::Fill { selector, value } => { let client = self.active_client()?; let el = find_element(client, &selector).await?; let _ = el.clear().await; el.send_keys(&value).await?; Ok(ToolResult { success: true, output: format!("Filled {} with {}", selector, value), error: None, }) } BrowserAction::Type { selector, text } => { let client = self.active_client()?; find_element(client, &selector) .await? .send_keys(&text) .await?; Ok(ToolResult { success: true, output: format!("Typed {} chars into {}", text.len(), selector), error: None, }) } BrowserAction::GetText { selector } => { let client = self.active_client()?; let text = find_element(client, &selector).await?.text().await?; Ok(ToolResult { success: true, output: text, error: None, }) } BrowserAction::GetTitle => { let client = self.active_client()?; let title = client.title().await?; Ok(ToolResult { success: true, output: title, error: None, }) } BrowserAction::GetUrl => { let client = self.active_client()?; let url = client.current_url().await?; Ok(ToolResult { success: true, output: url.to_string(), error: None, }) } BrowserAction::Screenshot { path, full_page } => { let client = self.active_client()?; let png = client.screenshot().await?; let _ = full_page; match path { Some(p) => { tokio::fs::write(&p, &png).await?; Ok(ToolResult { success: true, output: format!("Screenshot saved to {}", p), error: None, }) } None => { let b64 = base64::engine::general_purpose::STANDARD.encode(&png); Ok(ToolResult { success: true, output: format!("data:image/png;base64,{}", b64), error: None, }) } } } BrowserAction::Wait { selector, ms, text, } => { if let Some(sel) = selector { let client = self.active_client()?; wait_for_selector(client, &sel).await?; Ok(ToolResult { success: true, output: format!("Element found: {}", sel), error: None, }) } else if let Some(duration_ms) = ms { tokio::time::sleep(Duration::from_millis(duration_ms)).await; Ok(ToolResult { success: true, output: format!("Waited {} ms", duration_ms), error: None, }) } else if let Some(needle) = text { let client = self.active_client()?; let xpath = xpath_contains_text(&needle); client.wait().for_element(Locator::XPath(&xpath)).await?; Ok(ToolResult { success: true, output: format!("Text appeared: {}", needle), error: None, }) } else { tokio::time::sleep(Duration::from_millis(250)).await; Ok(ToolResult { success: true, output: "Waited 250 ms".to_string(), error: None, }) } } BrowserAction::Press { key } => { let client = self.active_client()?; let key_input = webdriver_key(&key); match client.active_element().await { Ok(el) => { el.send_keys(&key_input).await?; } Err(_) => { find_element(client, "body") .await? .send_keys(&key_input) .await?; } } Ok(ToolResult { success: true, output: format!("Pressed {}", key), error: None, }) } BrowserAction::Hover { selector } => { let client = self.active_client()?; let el = find_element(client, &selector).await?; hover_element(client, &el).await?; Ok(ToolResult { success: true, output: format!("Hovered {}", selector), error: None, }) } BrowserAction::Scroll { direction, pixels } => { let client = self.active_client()?; let amount = i64::from(pixels.unwrap_or(600)); let (dx, dy) = match direction.as_str() { "up" => (0, -amount), "down" => (0, amount), "left" => (-amount, 0), "right" => (amount, 0), _ => anyhow::bail!( "Unsupported scroll direction '{}'. Use up/down/left/right", direction ), }; let position: Value = client .execute( "window.scrollBy(arguments[0], arguments[1]); \ return { x: window.scrollX, y: window.scrollY };", vec![json!(dx), json!(dy)], ) .await?; Ok(ToolResult { success: true, output: format!( "Scrolled {} by {} -> {}", direction, amount, serde_json::to_string(&position).unwrap_or_default() ), error: None, }) } BrowserAction::Close => { self.reset_session(driver).await; Ok(ToolResult { success: true, output: "Browser session closed".to_string(), error: None, }) } } } pub async fn reset_session( &mut self, driver: &std::sync::Mutex>, ) { if let Some(client) = self.client.take() { let _ = client.close().await; } if let Ok(mut guard) = driver.lock() { if let Some(ref mut child) = guard.take() { tracing::debug!("Stopping chromedriver process"); let _ = child.start_kill(); } } } async fn ensure_session( &mut self, headless: bool, webdriver_url: &str, chrome_path: Option<&str>, driver: &std::sync::Mutex>, ) -> anyhow::Result<()> { if self.client.is_some() { return Ok(()); } let chrome_binary = match chrome_path { Some(path) if !path.trim().is_empty() => Some(path.trim().to_string()), _ => { verify_chrome_installed()?; None } }; let chromedriver_binary = find_chromedriver_binary()?; let port = extract_port(webdriver_url); let launched = if !webdriver_endpoint_reachable(webdriver_url) { tracing::info!( "chromedriver not running at {}, launching: {} --port={}", webdriver_url, chromedriver_binary, port ); launch_chromedriver(driver, &chromedriver_binary, port)?; true } else { false }; if launched { wait_for_webdriver_ready(webdriver_url).await .map_err(|e| { kill_driver_guard(driver); e.context("chromedriver failed to start. Is Chrome installed? Check browser.chrome_path in config.json") })?; } let mut capabilities: Map = Map::new(); let mut chrome_options: Map = Map::new(); let mut args: Vec = Vec::new(); if headless { args.push(Value::String("--headless=new".to_string())); args.push(Value::String("--disable-gpu".to_string())); args.push(Value::String("--no-sandbox".to_string())); args.push(Value::String("--disable-dev-shm-usage".to_string())); } args.push(Value::String("--window-size=1280,720".to_string())); if let Some(ref binary) = chrome_binary { chrome_options.insert("binary".to_string(), Value::String(binary.clone())); } if !args.is_empty() { chrome_options.insert("args".to_string(), Value::Array(args)); } capabilities.insert( "goog:chromeOptions".to_string(), Value::Object(chrome_options), ); let client = ClientBuilder::rustls()? .capabilities(capabilities) .connect(webdriver_url) .await .with_context(|| { format!( "Failed to connect to WebDriver at {}. \ Make sure chromedriver is installed and running.", webdriver_url ) })?; self.client = Some(client); Ok(()) } fn active_client(&self) -> anyhow::Result<&Client> { self.client .as_ref() .ok_or_else(|| anyhow::anyhow!("No active browser session. Run action='open' first")) } } fn launch_chromedriver( driver: &std::sync::Mutex>, chromedriver_binary: &str, port: u16, ) -> anyhow::Result<()> { let child = tokio::process::Command::new(chromedriver_binary) .arg(format!("--port={}", port)) .stdout(Stdio::null()) .stderr(Stdio::null()) .spawn() .with_context(|| format!("Failed to launch chromedriver: {}", chromedriver_binary))?; *driver.lock().unwrap() = Some(child); Ok(()) } fn kill_driver_guard(driver: &std::sync::Mutex>) { if let Ok(mut guard) = driver.lock() { if let Some(ref mut child) = guard.take() { let _ = child.start_kill(); } } } async fn wait_for_webdriver_ready(webdriver_url: &str) -> anyhow::Result<()> { let deadline = tokio::time::Instant::now() + Duration::from_secs(10); loop { if webdriver_endpoint_reachable(webdriver_url) { return Ok(()); } if tokio::time::Instant::now() >= deadline { anyhow::bail!( "Timed out waiting for chromedriver to start listening on {}", webdriver_url ); } tokio::time::sleep(Duration::from_millis(200)).await; } } fn verify_chrome_installed() -> anyhow::Result<()> { for name in CHROME_CANDIDATES { if which::which(name).is_ok() { return Ok(()); } } anyhow::bail!( "Chrome/Chromium not found. Install with:\n\ Ubuntu/Debian: apt install chromium-browser\n\ macOS: brew install chromium\n\ Or set browser.chrome_path in config.json" ) } fn find_chromedriver_binary() -> anyhow::Result { for name in CHROMEDRIVER_CANDIDATES { if let Ok(path) = which::which(name) { return Ok(path.to_string_lossy().to_string()); } } anyhow::bail!( "chromedriver not found. Install with:\n\ Ubuntu/Debian: apt install chromium-chromedriver\n\ macOS: brew install chromedriver\n\ Or download from https://chromedriver.chromium.org/" ) } fn extract_port(webdriver_url: &str) -> u16 { reqwest::Url::parse(webdriver_url) .ok() .and_then(|u| u.port()) .unwrap_or(9515) } fn webdriver_endpoint_reachable(webdriver_url: &str) -> bool { let parsed = match reqwest::Url::parse(webdriver_url) { Ok(url) => url, Err(_) => return false, }; if parsed.scheme() != "http" && parsed.scheme() != "https" { return false; } let host = match parsed.host_str() { Some(h) if !h.is_empty() => h, _ => return false, }; let port = parsed.port_or_known_default().unwrap_or(9515); let addr = format!("{}:{}", host, port); TcpStream::connect_timeout( &addr .parse::() .unwrap_or_else(|_| ([127, 0, 0, 1], port).into()), Duration::from_millis(500), ) .is_ok() } fn is_recoverable_error(err: &anyhow::Error) -> bool { let message = format!("{:#}", err).to_ascii_lowercase(); message.contains("invalid session id") || message.contains("no such window") || message.contains("session not created") || message.contains("connection reset") || message.contains("broken pipe") || (message.contains("webdriver") && (message.contains("timed out") || message.contains("timeout"))) } enum SelectorKind { Css(String), XPath(String), } fn parse_selector(selector: &str) -> SelectorKind { let trimmed = selector.trim(); if let Some(text_query) = trimmed.strip_prefix("text=") { return SelectorKind::XPath(xpath_contains_text(text_query)); } if let Some(label_query) = trimmed.strip_prefix("label=") { let literal = xpath_literal(label_query); return SelectorKind::XPath(format!( "(//label[contains(normalize-space(.), {literal})] \ /following::*[self::input or self::textarea or self::select][1] \ | //*[@aria-label and contains(normalize-space(@aria-label), {literal})] \ | //label[contains(normalize-space(.), {literal})])" )); } if trimmed.starts_with('@') { let escaped = css_attr_escape(trimmed); return SelectorKind::Css(format!(r#"[data-zc-ref="{escaped}"]"#)); } SelectorKind::Css(trimmed.to_string()) } async fn find_element( client: &Client, selector: &str, ) -> anyhow::Result { match parse_selector(selector) { SelectorKind::Css(css) => Ok(client.find(Locator::Css(&css)).await?), SelectorKind::XPath(xpath) => Ok(client.find(Locator::XPath(&xpath)).await?), } } async fn wait_for_selector(client: &Client, selector: &str) -> anyhow::Result<()> { match parse_selector(selector) { SelectorKind::Css(css) => { client.wait().for_element(Locator::Css(&css)).await?; } SelectorKind::XPath(xpath) => { client.wait().for_element(Locator::XPath(&xpath)).await?; } } Ok(()) } async fn hover_element( client: &Client, element: &fantoccini::elements::Element, ) -> anyhow::Result<()> { let actions = MouseActions::new("mouse".to_string()).then(PointerAction::MoveToElement { element: element.clone(), duration: Some(Duration::from_millis(150)), x: 0.0, y: 0.0, }); client.perform_actions(actions).await?; let _ = client.release_actions().await; Ok(()) } fn css_attr_escape(input: &str) -> String { input .replace('\\', "\\\\") .replace('"', "\\\"") .replace('\n', " ") } fn xpath_contains_text(text: &str) -> String { format!( "//*[contains(normalize-space(.), {})]", xpath_literal(text) ) } fn xpath_literal(input: &str) -> String { if !input.contains('"') { return format!("\"{}\"", input); } if !input.contains('\'') { return format!("'{}'", input); } let segments: Vec<&str> = input.split('"').collect(); let mut parts: Vec = Vec::new(); for (index, part) in segments.iter().enumerate() { if !part.is_empty() { parts.push(format!("\"{}\"", part)); } if index + 1 < segments.len() { parts.push("'\"'".to_string()); } } if parts.is_empty() { "\"\"".to_string() } else { format!("concat({})", parts.join(",")) } } fn webdriver_key(key: &str) -> String { match key.trim().to_ascii_lowercase().as_str() { "enter" => Key::Enter.to_string(), "return" => Key::Return.to_string(), "tab" => Key::Tab.to_string(), "escape" | "esc" => Key::Escape.to_string(), "backspace" => Key::Backspace.to_string(), "delete" => Key::Delete.to_string(), "arrowup" => Key::Up.to_string(), "arrowdown" => Key::Down.to_string(), "arrowleft" => Key::Left.to_string(), "arrowright" => Key::Right.to_string(), "home" => Key::Home.to_string(), "end" => Key::End.to_string(), "pageup" => Key::PageUp.to_string(), "pagedown" => Key::PageDown.to_string(), "space" => " ".to_string(), other => other.to_string(), } } fn snapshot_script(interactive_only: bool, compact: bool, depth: Option) -> String { let depth_literal = depth .map(|level| level.to_string()) .unwrap_or_else(|| "null".to_string()); format!( r#"(() => {{ const interactiveOnly = {interactive_only}; const compact = {compact}; const maxDepth = {depth_literal}; const nodes = []; const root = document.body || document.documentElement; let counter = 0; const isVisible = (el) => {{ const style = window.getComputedStyle(el); if (style.display === 'none' || style.visibility === 'hidden' || Number(style.opacity || 1) === 0) {{ return false; }} const rect = el.getBoundingClientRect(); return rect.width > 0 && rect.height > 0; }}; const isInteractive = (el) => {{ if (el.matches('a,button,input,select,textarea,summary,[role],*[tabindex]')) return true; return typeof el.onclick === 'function'; }}; const describe = (el, depth) => {{ const interactive = isInteractive(el); const text = (el.innerText || el.textContent || '').trim().replace(/\s+/g, ' ').slice(0, 140); if (interactiveOnly && !interactive) return; if (compact && !interactive && !text) return; const ref = '@e' + (++counter); el.setAttribute('data-zc-ref', ref); nodes.push({{ ref, depth, tag: el.tagName.toLowerCase(), id: el.id || null, role: el.getAttribute('role'), text, interactive, }}); }}; const walk = (el, depth) => {{ if (!(el instanceof Element)) return; if (maxDepth !== null && depth > maxDepth) return; if (isVisible(el)) {{ describe(el, depth); }} for (const child of el.children) {{ walk(child, depth + 1); if (nodes.length >= 400) return; }} }}; if (root) walk(root, 0); return JSON.stringify({{ title: document.title, url: window.location.href, count: nodes.length, nodes, }}); }})();"# ) }