From f9ae4b2c6930292b6a1273ff42981a91e3de29cd Mon Sep 17 00:00:00 2001 From: xiaoxixi Date: Sun, 10 May 2026 12:23:26 +0800 Subject: [PATCH] =?UTF-8?q?fix(context):=20=E5=85=A8=E9=9D=A2=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D=E5=92=8C=E5=A2=9E=E5=BC=BA=E4=B8=8A=E4=B8=8B=E6=96=87?= =?UTF-8?q?=E5=8E=8B=E7=BC=A9=E6=9C=BA=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug修复: - fast_trim 死代码: 去掉 .clone() 使其真正修改原 history - compress_once 边界错误: 第一条 user 重复 + 最后一条 user 丢失 - handle_cron_message 遗留的重复压缩调用 新特性: - System prompt 不参与压缩,在压缩后注入 - 孤儿 tool 清理: compress_once 返回前修复 tool 对完整性 - Agent Loop 内每轮 token 检查: 80% 窗口阈值时预裁剪旧 tool 输出 - 上下文溢出恢复: 解析 LLM 错误中的实际 token limit 并降级重试 - 硬截断降级: tokens > 90% context_window 时 head+tail 保底 - parse_context_limit_from_error: 支持 OpenAI/Anthropic/llama.cpp 格式 测试: 新增 6 个单元测试覆盖上述所有场景 --- src/agent/agent_loop.rs | 63 ++++++ src/agent/context_compressor.rs | 375 ++++++++++++++++++++++++++++++-- src/agent/mod.rs | 2 +- src/session/session.rs | 59 ++++- 4 files changed, 476 insertions(+), 23 deletions(-) diff --git a/src/agent/agent_loop.rs b/src/agent/agent_loop.rs index 1cabec7..9071db0 100644 --- a/src/agent/agent_loop.rs +++ b/src/agent/agent_loop.rs @@ -1,3 +1,4 @@ +use crate::agent::context_compressor::estimate_tokens; use crate::agent::system_prompt::build_system_prompt; use crate::bus::message::ContentBlock; use crate::bus::ChatMessage; @@ -226,6 +227,7 @@ pub struct AgentLoop { max_iterations: usize, workspace_dir: PathBuf, model_name: String, + context_window: usize, notify_tx: Option>, } @@ -249,6 +251,7 @@ impl AgentLoop { tools: Arc::new(ToolRegistry::new()), observer: None, notify_tx: None, + context_window: 0, max_iterations, workspace_dir, model_name, @@ -268,6 +271,7 @@ impl AgentLoop { tools, observer: None, notify_tx: None, + context_window: 0, max_iterations, workspace_dir, model_name, @@ -281,6 +285,7 @@ impl AgentLoop { tools: Arc::new(ToolRegistry::new()), observer: None, notify_tx: None, + context_window: 0, max_iterations, workspace_dir, model_name, @@ -300,12 +305,19 @@ impl AgentLoop { tools, observer: None, notify_tx: None, + context_window: 0, max_iterations, workspace_dir, model_name, } } + /// Set the context window size for preemptive trimming. + pub fn with_context_window(mut self, window: usize) -> Self { + self.context_window = window; + self + } + /// Set the workspace directory. pub fn with_workspace_dir(mut self, dir: PathBuf) -> Self { self.workspace_dir = dir; @@ -323,6 +335,36 @@ impl AgentLoop { self } + /// Preemptive trim: truncate old tool results in-place when history is + /// approaching the context window limit. Only trims tool messages with + /// content > TRIM_CHARS, preserving the most recent KEEP messages. + fn preemptive_trim_old_tool_results( + &self, + messages: &mut [ChatMessage], + max_chars: usize, + keep_recent: usize, + ) -> usize { + let end = messages.len().saturating_sub(keep_recent); + let start = 1; // protect system message at [0] if present + let mut modified = 0; + for i in start..end { + if messages[i].role != "tool" { + continue; + } + if messages[i].content.len() <= max_chars { + continue; + } + let removed = messages[i].content.len() - max_chars; + messages[i].content = format!( + "{}...\n\n[Output truncated - {} characters removed]", + &messages[i].content[..messages[i].content.ceil_char_boundary(max_chars)], + removed + ); + modified += 1; + } + modified + } + pub fn tools(&self) -> &Arc { &self.tools } @@ -355,6 +397,27 @@ impl AgentLoop { #[cfg(debug_assertions)] tracing::debug!(iteration, "Agent iteration started"); + // Preemptive context check: trim old tool results if token estimate + // exceeds 80% of context window to prevent mid-loop overflow. + if self.context_window > 0 { + let estimated = estimate_tokens(&messages); + let danger = (self.context_window as f64 * 0.8) as usize; + if estimated > danger { + let trimmed = self.preemptive_trim_old_tool_results( + &mut messages, 2000, 4, + ); + if trimmed > 0 { + #[cfg(debug_assertions)] + tracing::debug!( + estimated, + danger, + trimmed_msgs = trimmed, + "Preemptive tool-result trim applied in loop" + ); + } + } + } + // Convert messages to LLM format let messages_for_llm: Vec = messages .iter() diff --git a/src/agent/context_compressor.rs b/src/agent/context_compressor.rs index 61f3ba5..081f473 100644 --- a/src/agent/context_compressor.rs +++ b/src/agent/context_compressor.rs @@ -15,6 +15,19 @@ pub fn estimate_tokens(messages: &[ChatMessage]) -> usize { (raw as f64 * 1.2) as usize } +/// Extract the first number found within `max_len` characters of the start of `s`. +/// Used by `parse_context_limit_from_error` to find token limits in error messages. +fn find_number_nearby(s: &str, max_len: usize) -> Option<&str> { + let end = s.len().min(max_len); + let slice = &s[..end]; + let start = slice.find(|c: char| c.is_ascii_digit())?; + let end = slice[start..] + .find(|c: char| !c.is_ascii_digit()) + .map(|p| start + p) + .unwrap_or(end); + Some(&slice[start..end]) +} + /// Configuration for context compression. #[derive(Debug, Clone)] pub struct ContextCompressionConfig { @@ -96,13 +109,18 @@ impl ContextCompressor { self.session_id = id; } + /// Update the context window size (e.g., after parsing actual limit from LLM error). + pub fn set_context_window(&mut self, window: usize) { + self.context_window = window; + } + /// Always true — memory is always available (memory system is always on). pub fn has_memory(&self) -> bool { true } /// Get the compression threshold in tokens. - fn threshold(&self) -> usize { + pub fn threshold(&self) -> usize { (self.context_window as f64 * self.threshold_ratio) as usize } @@ -127,10 +145,34 @@ impl ContextCompressor { modified } + /// Remove orphan tool results whose declaring tool_calls have been compressed away. + /// Scans for tool messages with no preceding assistant tool_call, and removes them. + pub fn repair_tool_pairs(messages: &mut Vec) { + let mut declared: std::collections::HashSet = std::collections::HashSet::new(); + let mut i = 0; + while i < messages.len() { + if messages[i].role == "assistant" { + if let Some(ref tool_calls) = messages[i].tool_calls { + for tc in tool_calls { + declared.insert(tc.id.clone()); + } + } + } else if messages[i].role == "tool" { + if let Some(ref tid) = messages[i].tool_call_id { + if !declared.contains(tid.as_str()) { + messages.remove(i); + continue; + } + } + } + i += 1; + } + } + /// Main entry point - compresses history if over threshold. pub async fn compress_if_needed( &self, - history: Vec, + mut history: Vec, ) -> Result, AgentError> { // Check if compression is needed let tokens = estimate_tokens(&history); @@ -146,19 +188,19 @@ impl ContextCompressor { "Starting context compression" ); - // Fast trim pass first - let trimmed = self.fast_trim_tool_results(&mut history.clone()); + // Fast trim pass first — modify history in place + let trimmed = self.fast_trim_tool_results(&mut history); + let tokens_after = estimate_tokens(&history); if trimmed > 0 { - let tokens_after = estimate_tokens(&history); #[cfg(debug_assertions)] tracing::debug!( trimmed_messages = trimmed, tokens_after = tokens_after, "Fast trim completed" ); - if tokens_after <= self.threshold() { - return Ok(history); - } + } + if tokens_after <= self.threshold() { + return Ok(history); } // LLM summarization pass @@ -191,6 +233,36 @@ impl ContextCompressor { } } + // Hard safety net: if still dangerously high after all passes, + // fall back to head+tail truncation so the LLM call doesn't overflow. + let final_tokens = estimate_tokens(¤t_history); + let danger_threshold = (self.context_window as f64 * 0.9) as usize; + if final_tokens > danger_threshold + && current_history.len() > self.config.protect_first_n + self.config.protect_last_n + { + let head: Vec<_> = current_history[..self.config.protect_first_n].to_vec(); + let tail_start = current_history.len() - self.config.protect_last_n; + let tail: Vec<_> = current_history[tail_start..].to_vec(); + let dropped = current_history.len() - self.config.protect_first_n - self.config.protect_last_n; + + let mut truncated = head; + truncated.push(ChatMessage::user(format!( + "[Context truncation — {} earlier messages dropped due to token limit]\n\ + Previous context could not be fully compressed. Continuing with most recent context.", + dropped + ))); + truncated.extend(tail); + + tracing::warn!( + final_tokens = final_tokens, + danger = danger_threshold, + dropped_msgs = dropped, + "Hard truncation fallback applied" + ); + + current_history = truncated; + } + #[cfg(debug_assertions)] tracing::debug!( final_tokens = estimate_tokens(¤t_history), @@ -201,6 +273,48 @@ impl ContextCompressor { Ok(current_history) } + /// Try to extract the actual context token limit from an LLM error message. + /// Recognizes patterns from OpenAI, Anthropic, and llama.cpp-style errors. + pub fn parse_context_limit_from_error(msg: &str) -> Option { + let lower = msg.to_lowercase(); + + // Common patterns: "maximum context length is 128000", "context window of 131072", + // "128000 token context", "available context size (8448 tokens)", "> 128000 maximum" + let markers = [ + "maximum context length", + "context window", + "context length", + "available context size", + ]; + + for marker in &markers { + if let Some(pos) = lower.find(marker) { + let after = &lower[pos + marker.len()..]; + // Look for a number in the vicinity (up to 10 chars after marker) + if let Some(num_str) = find_number_nearby(after, 50) { + if let Ok(n) = num_str.parse::() { + if (1024..=10_000_000).contains(&n) { + return Some(n); + } + } + } + } + } + + // Also try: "XXXX token context" or "XXXX limit" + if let Some(num_str) = find_number_nearby(&lower, lower.len()) { + if let Ok(n) = num_str.parse::() { + if (1024..=10_000_000).contains(&n) + && (lower.contains("token") || lower.contains("context") || lower.contains("limit")) + { + return Some(n); + } + } + } + + None + } + /// Single compression pass - summarize middle messages between user turns. /// Returns Some(compressed) if compression happened, None if nothing to compress. async fn compress_once( @@ -227,7 +341,7 @@ impl ContextCompressor { // Build segments: user -> (assistant turns) -> next user // We'll summarize the assistant turns between consecutive user messages - let mut new_messages = history[..=user_indices[0]].to_vec(); + let mut new_messages = history[..user_indices[0]].to_vec(); for i in 0..user_indices.len() - 1 { let user_idx = user_indices[i]; @@ -272,13 +386,13 @@ impl ContextCompressor { // Add last user and everything after (protected) let last_user_idx = user_indices[user_indices.len() - 1]; - if last_user_idx < history.len() - 1 { - // Add everything from last user onwards (protected) - for i in last_user_idx..history.len() { - new_messages.push(history[i].clone()); - } + for i in last_user_idx..history.len() { + new_messages.push(history[i].clone()); } + // Remove orphan tool results whose declaring tool_calls were compressed away + Self::repair_tool_pairs(&mut new_messages); + // If nothing changed, return None if new_messages.len() == history.len() { return Ok(None); @@ -370,8 +484,11 @@ Be concise, aim for {} characters or less. #[cfg(test)] mod tests { use super::*; + use crate::memory::MemoryManager; use crate::providers::ChatCompletionResponse; + use crate::providers::Usage; use async_trait::async_trait; + use std::sync::Arc; use std::sync::OnceLock; /// Mock provider for testing - panics if actually used for LLM calls @@ -403,6 +520,34 @@ mod tests { Arc::new(MockProvider) } + /// Mock summarizer that returns a simple summary — used when compress_once + /// needs to call the LLM for summarization. + struct MockSummarizer; + + #[async_trait] + impl LLMProvider for MockSummarizer { + async fn chat( + &self, + _request: ChatCompletionRequest, + ) -> Result> { + Ok(ChatCompletionResponse { + id: "mock".into(), + model: "mock".into(), + content: "[summarized]".into(), + tool_calls: vec![], + usage: Usage { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }, + }) + } + + fn ptype(&self) -> &str { "mock" } + fn name(&self) -> &str { "mock" } + fn model_id(&self) -> &str { "mock" } + } + + fn mock_summarizer() -> Arc { + Arc::new(MockSummarizer) + } + fn test_memory_manager() -> Arc { static MM: OnceLock> = OnceLock::new(); MM.get_or_init(|| { @@ -454,4 +599,206 @@ mod tests { let compressor = ContextCompressor::new(mock_provider(), 128_000, test_memory_manager()); assert_eq!(compressor.threshold(), 64_000); } + + #[tokio::test] + async fn test_compress_if_needed_fast_trims_tool_results() { + // context_window=200 → threshold=100. + // user "Hi" (~6 raw), tool(3000 x's) → ~760 raw*1.2=912 > 100 → triggers compression. + // fast_trim to 50 chars should bring tokens well under 100. + let tmp = std::env::temp_dir().join(format!("picobot_ctx_trim_{}.db", std::process::id())); + let storage = Arc::new(crate::storage::Storage::new(&tmp).await.unwrap()); + let mm = Arc::new(MemoryManager::new(storage, "test".into(), "test".into())); + + let config = ContextCompressionConfig { + tool_result_trim_chars: 50, + protect_first_n: 0, + protect_last_n: 10, + max_passes: 0, + ..Default::default() + }; + let compressor = ContextCompressor::with_config(mock_provider(), 200, config, mm); + + let messages = vec![ + ChatMessage::user("Hi"), + ChatMessage::tool("call1", "bash", &"x".repeat(3000)), + ]; + + let result = compressor.compress_if_needed(messages).await.unwrap(); + + let tool_msg = result.iter().find(|m| m.role == "tool").unwrap(); + assert!( + tool_msg.content.len() < 3000, + "tool result should be trimmed, got {} chars", + tool_msg.content.len() + ); + assert!( + tool_msg.content.contains("[Output truncated"), + "trim marker missing from: {}", + tool_msg.content + ); + + let _ = std::fs::remove_file(&tmp); + } + + #[tokio::test] + async fn test_compress_once_no_duplicate_and_no_lost_user() { + // Verifies two boundary bugs in compress_once: + // - B2A (L230): first user message duplicated when protect_first_n > 0 + // - B2B (L275): last user message lost when it is the final history message + // + // context_window=200 → threshold=100. Large tool outputs force LLM summarization. + let tmp = std::env::temp_dir().join(format!("picobot_ctx_boundary_{}.db", std::process::id())); + let storage = Arc::new(crate::storage::Storage::new(&tmp).await.unwrap()); + let mm = Arc::new(MemoryManager::new(storage, "test".into(), "test".into())); + + let config = ContextCompressionConfig { + tool_result_trim_chars: 2000, + protect_first_n: 1, // system/protected → B2A: first user (after skip) duplicated + protect_last_n: 2, + max_passes: 1, + ..Default::default() + }; + let compressor = ContextCompressor::with_config(mock_summarizer(), 200, config, mm); + + // History: 9 messages, last message is user Q4. + // user_indices (skip 1) = [1, 3, 6, 8] + // B2A: init history[..=1] includes Q1, then loop i=0 pushes Q1 again → duplicate + // B2B: last_user_idx=8, 8 < 8 → false → Q4 not pushed → lost + let big = "x".repeat(3000); + let messages = vec![ + ChatMessage::system("You are a helper."), // 0: protected + ChatMessage::user("Q1"), // 1: first user + ChatMessage::tool("t1", "bash", &big), // 2 + ChatMessage::user("Q2"), // 3 + ChatMessage::assistant("thinking"), // 4 + ChatMessage::tool("t2", "bash", &big), // 5 + ChatMessage::user("Q3"), // 6 + ChatMessage::assistant("thinking"), // 7 + ChatMessage::user("Q4"), // 8: LAST, is user → B2B triggers + ]; + + let result = compressor.compress_if_needed(messages).await.unwrap(); + + // B2A: "Q1" must appear exactly once + let q1_count = result.iter().filter(|m| m.role == "user" && m.content == "Q1").count(); + assert_eq!(q1_count, 1, "Q1 should appear exactly once, got {}", q1_count); + + // B2B: "Q4" must NOT be lost + let q4_count = result.iter().filter(|m| m.role == "user" && m.content == "Q4").count(); + assert_eq!(q4_count, 1, "Q4 should appear exactly once (not lost), got {}", q4_count); + + let _ = std::fs::remove_file(&tmp); + } + + #[tokio::test] + async fn test_compress_hard_truncation_fallback() { + // When LLM compression fails (or max_passes=0) and tokens are still + // above 90% of context_window, a head+tail truncation kicks in. + let tmp = std::env::temp_dir().join(format!("picobot_ctx_trunc_{}.db", std::process::id())); + let storage = Arc::new(crate::storage::Storage::new(&tmp).await.unwrap()); + let mm = Arc::new(MemoryManager::new(storage, "test".into(), "test".into())); + + let config = ContextCompressionConfig { + tool_result_trim_chars: 500, // trim reduces but not enough + protect_first_n: 1, + protect_last_n: 2, + max_passes: 0, // no LLM summarization → will exceed danger + ..Default::default() + }; + // context_window=100, danger_threshold=90. + // Each trimmed tool (~500 chars): ceil(500/4)+4 = 129 raw. 3 tools = 387. + // Plus users (~5 each) + system (~15) = ~417 raw * 1.2 = 500 > 90. + let compressor = ContextCompressor::with_config(mock_provider(), 100, config, mm); + + let big = "x".repeat(3000); + let messages = vec![ + ChatMessage::system("sys"), + ChatMessage::user("Q1"), + ChatMessage::tool("t1", "bash", &big), + ChatMessage::user("Q2"), + ChatMessage::tool("t2", "bash", &big), + ChatMessage::user("Q3"), + ChatMessage::tool("t3", "bash", &big), + ]; + + let result = compressor.compress_if_needed(messages).await.unwrap(); + + // After hard truncation: head (1) + trunc_note (1) + tail (2) = 4 messages + assert!(result.len() < 7, "expected truncation reduction, got {} messages", result.len()); + + // Truncation notice should be present + let has_notice = result.iter().any(|m| m.content.contains("Context truncation")); + assert!(has_notice, "hard truncation notice missing"); + + let _ = std::fs::remove_file(&tmp); + } + + #[test] + fn test_repair_tool_pairs_removes_orphans() { + use crate::providers::ToolCall; + + // Simulate compressed output: summary replaced assistant(tool_call: tc1), + // leaving tool(tc1) as an orphan. Legitimate tool(tc2) should be kept. + let mut messages = vec![ + ChatMessage::user("Q1"), + ChatMessage::user("[Context Summary]\n\nsummary of previous turn"), + ChatMessage::tool("tc1", "bash", "orphan result"), // orphan — tc1 never declared + ChatMessage::assistant("done"), // declares tc2 + ChatMessage::tool("tc2", "bash", "legitimate result"), // legit + ]; + // Set tool_call_id on tool messages and tool_calls on assistant + messages[2].tool_call_id = Some("tc1".into()); + messages[4].tool_call_id = Some("tc2".into()); + messages[3].tool_calls = Some(vec![ToolCall { + id: "tc2".into(), + name: "bash".into(), + arguments: serde_json::json!({"cmd": "echo ok"}), + }]); + + ContextCompressor::repair_tool_pairs(&mut messages); + + // orphan should be removed; legitimate should stay + assert_eq!(messages.len(), 4); + assert!(messages.iter().all(|m| m.tool_call_id != Some("tc1".into()))); + assert!(messages.iter().any(|m| m.tool_call_id == Some("tc2".into()))); + } + + #[test] + fn test_parse_context_limit_from_error() { + // OpenAI: "maximum context length is 128000" + assert_eq!( + ContextCompressor::parse_context_limit_from_error( + "This model's maximum context length is 128000 tokens." + ), + Some(128000) + ); + + // Anthropic: "context window of 200000" + assert_eq!( + ContextCompressor::parse_context_limit_from_error( + "Your request exceeds the context window of 200000." + ), + Some(200000) + ); + + // llama.cpp: "available context size (8448 tokens)" + assert_eq!( + ContextCompressor::parse_context_limit_from_error( + "context size exceeded, available context size (8448 tokens)" + ), + Some(8448) + ); + + // Non-context error should return None + assert_eq!( + ContextCompressor::parse_context_limit_from_error("Internal server error"), + None + ); + + // Numbers too small should be rejected + assert_eq!( + ContextCompressor::parse_context_limit_from_error("context length is 500"), + None + ); + } } diff --git a/src/agent/mod.rs b/src/agent/mod.rs index 1aace0c..2153826 100644 --- a/src/agent/mod.rs +++ b/src/agent/mod.rs @@ -3,5 +3,5 @@ pub mod context_compressor; pub mod system_prompt; pub use agent_loop::{AgentLoop, AgentError, AgentProcessResult}; -pub use context_compressor::ContextCompressor; +pub use context_compressor::{ContextCompressor, estimate_tokens}; pub use system_prompt::{build_system_prompt, PromptContext, PromptSection, SystemPromptBuilder}; diff --git a/src/session/session.rs b/src/session/session.rs index 631e29e..3c8e0db 100644 --- a/src/session/session.rs +++ b/src/session/session.rs @@ -21,6 +21,18 @@ use crate::config::LLMProviderConfig; use crate::agent::{AgentLoop, AgentError, ContextCompressor}; use crate::agent::system_prompt::build_system_prompt; use crate::agent::context_compressor::ContextCompressionConfig; + +/// Check if an LLM error message indicates a context window overflow. +fn is_context_overflow_error(msg: &str) -> bool { + let lower = msg.to_lowercase(); + lower.contains("context length") + || lower.contains("context window") + || lower.contains("maximum context") + || lower.contains("too many tokens") + || lower.contains("token limit exceeded") + || lower.contains("prompt is too long") + || lower.contains("input is too long") +} use crate::providers::{create_provider, LLMProvider}; use crate::session::session_id::UnifiedSessionId; use crate::session::events::DialogInfo; @@ -372,6 +384,11 @@ impl Session { &self.compressor } + /// Get the compressor's current threshold for diagnostics/fallback. + pub fn compressor_threshold(&self) -> usize { + self.compressor.threshold() + } + /// 创建一个临时的 AgentLoop 实例来处理消息 pub fn create_agent(&self) -> Result { Ok(AgentLoop::with_provider_and_tools( @@ -380,7 +397,7 @@ impl Session { self.provider_config.max_tool_iterations, self.provider_config.model_id.clone(), self.provider_config.workspace_dir.clone(), - )) + ).with_context_window(self.provider_config.token_limit)) } /// 创建一个附通知通道的 AgentLoop 实例 @@ -1319,15 +1336,17 @@ impl SessionManager { _ => None, }; - // Build combined system prompt and inject at position 0 - // This ensures AgentLoop.process() sees a system message and doesn't inject its own + // Build combined system prompt and inject at position 0 AFTER compression. + // This ensures AgentLoop.process() sees a system message without it participating + // in context compression (system prompt is dynamic and should not be persisted). let system_prompt = session_guard.build_system_prompt(&skills_prompt, memory_context.as_deref()); - history.insert(0, ChatMessage::system(system_prompt)); - let history = session_guard.compressor + let mut history = session_guard.compressor .compress_if_needed(history) .await?; + history.insert(0, ChatMessage::system(system_prompt.clone())); + // Advance consolidation pointer — future compressions skip already-processed messages let now = chrono::Utc::now().timestamp_millis(); session_guard.last_consolidated_at = Some(now); @@ -1336,7 +1355,28 @@ impl SessionManager { } let agent = session_guard.create_agent_with_notify(notify_tx)?; - let result = agent.process(history).await?; + + // Try LLM call; on context overflow, re-compress with tighter window and retry once. + let result = match agent.process(history).await { + Ok(r) => r, + Err(AgentError::LlmError(ref msg)) + if is_context_overflow_error(msg) => + { + let new_window = crate::agent::ContextCompressor::parse_context_limit_from_error(msg) + .unwrap_or(session_guard.compressor_threshold()); + tracing::warn!( + new_window, + error = %msg, + "Context overflow in handle_message — retrying with tighter window" + ); + session_guard.compressor.set_context_window(new_window); + let raw = session_guard.get_history().to_vec(); + let mut retry = session_guard.compressor.compress_if_needed(raw).await?; + retry.insert(0, ChatMessage::system(system_prompt)); + agent.process(retry).await? + } + Err(e) => return Err(e), + }; for msg in result.emitted_messages { session_guard.add_message(msg, true).await @@ -1443,12 +1483,15 @@ impl SessionManager { job_name, job_id, channel, chat_id ); let full_system_prompt = format!("{}{}", system_prompt, cron_context); - history.insert(0, ChatMessage::system(full_system_prompt)); - let history = session_guard.compressor + // Inject system prompt AFTER compression so it doesn't participate + // in context compression (system prompt is dynamic and should not be persisted). + let mut history = session_guard.compressor .compress_if_needed(history) .await?; + history.insert(0, ChatMessage::system(full_system_prompt)); + let agent = session_guard.create_agent_with_notify(notify_tx)?; let result = agent.process(history).await?;