fix(context): 全面修复和增强上下文压缩机制

Bug修复: - fast_trim 死代码: 去掉 .clone() 使其真正修改原 history - compress_once 边界错误: 第一条 user 重复 + 最后一条 user 丢失 - handle_cron_message 遗留的重复压缩调用新特性: - System prompt 不参与压缩，在压缩后注入 - 孤儿 tool 清理: compress_once 返回前修复 tool 对完整性 - Agent Loop 内每轮 token 检查: 80% 窗口阈值时预裁剪旧 tool 输出 - 上下文溢出恢复: 解析 LLM 错误中的实际 token limit 并降级重试 - 硬截断降级: tokens > 90% context_window 时 head+tail 保底 - parse_context_limit_from_error: 支持 OpenAI/Anthropic/llama.cpp 格式测试: 新增 6 个单元测试覆盖上述所有场景
2026-05-10 12:23:26 +08:00 · 2026-05-10 12:23:26 +08:00 · f9ae4b2c69
commit f9ae4b2c69
parent 488e10dceb
4 changed files with 476 additions and 23 deletions
--- a/src/agent/agent_loop.rs
+++ b/src/agent/agent_loop.rs
@ -1,3 +1,4 @@
+use crate::agent::context_compressor::estimate_tokens;
 use crate::agent::system_prompt::build_system_prompt;
 use crate::bus::message::ContentBlock;
 use crate::bus::ChatMessage;
@ -226,6 +227,7 @@ pub struct AgentLoop {
    max_iterations: usize,
    workspace_dir: PathBuf,
    model_name: String,
+    context_window: usize,
    notify_tx: Option<tokio::sync::mpsc::UnboundedSender<String>>,
 }

@ -249,6 +251,7 @@ impl AgentLoop {
            tools: Arc::new(ToolRegistry::new()),
            observer: None,
            notify_tx: None,
+            context_window: 0,
            max_iterations,
            workspace_dir,
            model_name,
@ -268,6 +271,7 @@ impl AgentLoop {
            tools,
            observer: None,
            notify_tx: None,
+            context_window: 0,
            max_iterations,
            workspace_dir,
            model_name,
@ -281,6 +285,7 @@ impl AgentLoop {
            tools: Arc::new(ToolRegistry::new()),
            observer: None,
            notify_tx: None,
+            context_window: 0,
            max_iterations,
            workspace_dir,
            model_name,
@ -300,12 +305,19 @@ impl AgentLoop {
            tools,
            observer: None,
            notify_tx: None,
+            context_window: 0,
            max_iterations,
            workspace_dir,
            model_name,
        }
    }

+    /// Set the context window size for preemptive trimming.
+    pub fn with_context_window(mut self, window: usize) -> Self {
+        self.context_window = window;
+        self
+    }
+
    /// Set the workspace directory.
    pub fn with_workspace_dir(mut self, dir: PathBuf) -> Self {
        self.workspace_dir = dir;
@ -323,6 +335,36 @@ impl AgentLoop {
        self
    }

+    /// Preemptive trim: truncate old tool results in-place when history is
+    /// approaching the context window limit. Only trims tool messages with
+    /// content > TRIM_CHARS, preserving the most recent KEEP messages.
+    fn preemptive_trim_old_tool_results(
+        &self,
+        messages: &mut [ChatMessage],
+        max_chars: usize,
+        keep_recent: usize,
+    ) -> usize {
+        let end = messages.len().saturating_sub(keep_recent);
+        let start = 1; // protect system message at [0] if present
+        let mut modified = 0;
+        for i in start..end {
+            if messages[i].role != "tool" {
+                continue;
+            }
+            if messages[i].content.len() <= max_chars {
+                continue;
+            }
+            let removed = messages[i].content.len() - max_chars;
+            messages[i].content = format!(
+                "{}...\n\n[Output truncated - {} characters removed]",
+                &messages[i].content[..messages[i].content.ceil_char_boundary(max_chars)],
+                removed
+            );
+            modified += 1;
+        }
+        modified
+    }
+
    pub fn tools(&self) -> &Arc<ToolRegistry> {
        &self.tools
    }
@ -355,6 +397,27 @@ impl AgentLoop {
            #[cfg(debug_assertions)]
            tracing::debug!(iteration, "Agent iteration started");

+            // Preemptive context check: trim old tool results if token estimate
+            // exceeds 80% of context window to prevent mid-loop overflow.
+            if self.context_window > 0 {
+                let estimated = estimate_tokens(&messages);
+                let danger = (self.context_window as f64 * 0.8) as usize;
+                if estimated > danger {
+                    let trimmed = self.preemptive_trim_old_tool_results(
+                        &mut messages, 2000, 4,
+                    );
+                    if trimmed > 0 {
+                        #[cfg(debug_assertions)]
+                        tracing::debug!(
+                            estimated,
+                            danger,
+                            trimmed_msgs = trimmed,
+                            "Preemptive tool-result trim applied in loop"
+                        );
+                    }
+                }
+            }
+
            // Convert messages to LLM format
            let messages_for_llm: Vec<Message> = messages
                .iter()
--- a/src/agent/context_compressor.rs
+++ b/src/agent/context_compressor.rs
@ -15,6 +15,19 @@ pub fn estimate_tokens(messages: &[ChatMessage]) -> usize {
    (raw as f64 * 1.2) as usize
 }

+/// Extract the first number found within `max_len` characters of the start of `s`.
+/// Used by `parse_context_limit_from_error` to find token limits in error messages.
+fn find_number_nearby(s: &str, max_len: usize) -> Option<&str> {
+    let end = s.len().min(max_len);
+    let slice = &s[..end];
+    let start = slice.find(|c: char| c.is_ascii_digit())?;
+    let end = slice[start..]
+        .find(|c: char| !c.is_ascii_digit())
+        .map(|p| start + p)
+        .unwrap_or(end);
+    Some(&slice[start..end])
+}
+
 /// Configuration for context compression.
 #[derive(Debug, Clone)]
 pub struct ContextCompressionConfig {
@ -96,13 +109,18 @@ impl ContextCompressor {
        self.session_id = id;
    }

+    /// Update the context window size (e.g., after parsing actual limit from LLM error).
+    pub fn set_context_window(&mut self, window: usize) {
+        self.context_window = window;
+    }
+
    /// Always true — memory is always available (memory system is always on).
    pub fn has_memory(&self) -> bool {
        true
    }

    /// Get the compression threshold in tokens.
-    fn threshold(&self) -> usize {
+    pub fn threshold(&self) -> usize {
        (self.context_window as f64 * self.threshold_ratio) as usize
    }

@ -127,10 +145,34 @@ impl ContextCompressor {
        modified
    }

+    /// Remove orphan tool results whose declaring tool_calls have been compressed away.
+    /// Scans for tool messages with no preceding assistant tool_call, and removes them.
+    pub fn repair_tool_pairs(messages: &mut Vec<ChatMessage>) {
+        let mut declared: std::collections::HashSet<String> = std::collections::HashSet::new();
+        let mut i = 0;
+        while i < messages.len() {
+            if messages[i].role == "assistant" {
+                if let Some(ref tool_calls) = messages[i].tool_calls {
+                    for tc in tool_calls {
+                        declared.insert(tc.id.clone());
+                    }
+                }
+            } else if messages[i].role == "tool" {
+                if let Some(ref tid) = messages[i].tool_call_id {
+                    if !declared.contains(tid.as_str()) {
+                        messages.remove(i);
+                        continue;
+                    }
+                }
+            }
+            i += 1;
+        }
+    }
+
    /// Main entry point - compresses history if over threshold.
    pub async fn compress_if_needed(
        &self,
-        history: Vec<ChatMessage>,
+        mut history: Vec<ChatMessage>,
    ) -> Result<Vec<ChatMessage>, AgentError> {
        // Check if compression is needed
        let tokens = estimate_tokens(&history);
@ -146,19 +188,19 @@ impl ContextCompressor {
            "Starting context compression"
        );

-        // Fast trim pass first
-        let trimmed = self.fast_trim_tool_results(&mut history.clone());
+        // Fast trim pass first — modify history in place
+        let trimmed = self.fast_trim_tool_results(&mut history);
+        let tokens_after = estimate_tokens(&history);
        if trimmed > 0 {
-            let tokens_after = estimate_tokens(&history);
            #[cfg(debug_assertions)]
            tracing::debug!(
                trimmed_messages = trimmed,
                tokens_after = tokens_after,
                "Fast trim completed"
            );
-            if tokens_after <= self.threshold() {
-                return Ok(history);
-            }
+        }
+        if tokens_after <= self.threshold() {
+            return Ok(history);
        }

        // LLM summarization pass
@ -191,6 +233,36 @@ impl ContextCompressor {
            }
        }

+        // Hard safety net: if still dangerously high after all passes,
+        // fall back to head+tail truncation so the LLM call doesn't overflow.
+        let final_tokens = estimate_tokens(&current_history);
+        let danger_threshold = (self.context_window as f64 * 0.9) as usize;
+        if final_tokens > danger_threshold
+            && current_history.len() > self.config.protect_first_n + self.config.protect_last_n
+        {
+            let head: Vec<_> = current_history[..self.config.protect_first_n].to_vec();
+            let tail_start = current_history.len() - self.config.protect_last_n;
+            let tail: Vec<_> = current_history[tail_start..].to_vec();
+            let dropped = current_history.len() - self.config.protect_first_n - self.config.protect_last_n;
+
+            let mut truncated = head;
+            truncated.push(ChatMessage::user(format!(
+                "[Context truncation — {} earlier messages dropped due to token limit]\n\
+                 Previous context could not be fully compressed. Continuing with most recent context.",
+                dropped
+            )));
+            truncated.extend(tail);
+
+            tracing::warn!(
+                final_tokens = final_tokens,
+                danger = danger_threshold,
+                dropped_msgs = dropped,
+                "Hard truncation fallback applied"
+            );
+
+            current_history = truncated;
+        }
+
        #[cfg(debug_assertions)]
        tracing::debug!(
            final_tokens = estimate_tokens(&current_history),
@ -201,6 +273,48 @@ impl ContextCompressor {
        Ok(current_history)
    }

+    /// Try to extract the actual context token limit from an LLM error message.
+    /// Recognizes patterns from OpenAI, Anthropic, and llama.cpp-style errors.
+    pub fn parse_context_limit_from_error(msg: &str) -> Option<usize> {
+        let lower = msg.to_lowercase();
+
+        // Common patterns: "maximum context length is 128000", "context window of 131072",
+        // "128000 token context", "available context size (8448 tokens)", "> 128000 maximum"
+        let markers = [
+            "maximum context length",
+            "context window",
+            "context length",
+            "available context size",
+        ];
+
+        for marker in &markers {
+            if let Some(pos) = lower.find(marker) {
+                let after = &lower[pos + marker.len()..];
+                // Look for a number in the vicinity (up to 10 chars after marker)
+                if let Some(num_str) = find_number_nearby(after, 50) {
+                    if let Ok(n) = num_str.parse::<usize>() {
+                        if (1024..=10_000_000).contains(&n) {
+                            return Some(n);
+                        }
+                    }
+                }
+            }
+        }
+
+        // Also try: "XXXX token context" or "XXXX limit"
+        if let Some(num_str) = find_number_nearby(&lower, lower.len()) {
+            if let Ok(n) = num_str.parse::<usize>() {
+                if (1024..=10_000_000).contains(&n)
+                    && (lower.contains("token") || lower.contains("context") || lower.contains("limit"))
+                {
+                    return Some(n);
+                }
+            }
+        }
+
+        None
+    }
+
    /// Single compression pass - summarize middle messages between user turns.
    /// Returns Some(compressed) if compression happened, None if nothing to compress.
    async fn compress_once(
@ -227,7 +341,7 @@ impl ContextCompressor {

        // Build segments: user -> (assistant turns) -> next user
        // We'll summarize the assistant turns between consecutive user messages
-        let mut new_messages = history[..=user_indices[0]].to_vec();
+        let mut new_messages = history[..user_indices[0]].to_vec();

        for i in 0..user_indices.len() - 1 {
            let user_idx = user_indices[i];
@ -272,13 +386,13 @@ impl ContextCompressor {

        // Add last user and everything after (protected)
        let last_user_idx = user_indices[user_indices.len() - 1];
-        if last_user_idx < history.len() - 1 {
-            // Add everything from last user onwards (protected)
-            for i in last_user_idx..history.len() {
-                new_messages.push(history[i].clone());
-            }
+        for i in last_user_idx..history.len() {
+            new_messages.push(history[i].clone());
        }

+        // Remove orphan tool results whose declaring tool_calls were compressed away
+        Self::repair_tool_pairs(&mut new_messages);
+
        // If nothing changed, return None
        if new_messages.len() == history.len() {
            return Ok(None);
@ -370,8 +484,11 @@ Be concise, aim for {} characters or less.
 #[cfg(test)]
 mod tests {
    use super::*;
+    use crate::memory::MemoryManager;
    use crate::providers::ChatCompletionResponse;
+    use crate::providers::Usage;
    use async_trait::async_trait;
+    use std::sync::Arc;
    use std::sync::OnceLock;

    /// Mock provider for testing - panics if actually used for LLM calls
@ -403,6 +520,34 @@ mod tests {
        Arc::new(MockProvider)
    }

+    /// Mock summarizer that returns a simple summary — used when compress_once
+    /// needs to call the LLM for summarization.
+    struct MockSummarizer;
+
+    #[async_trait]
+    impl LLMProvider for MockSummarizer {
+        async fn chat(
+            &self,
+            _request: ChatCompletionRequest,
+        ) -> Result<ChatCompletionResponse, Box<dyn std::error::Error + Send + Sync>> {
+            Ok(ChatCompletionResponse {
+                id: "mock".into(),
+                model: "mock".into(),
+                content: "[summarized]".into(),
+                tool_calls: vec![],
+                usage: Usage { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 },
+            })
+        }
+
+        fn ptype(&self) -> &str { "mock" }
+        fn name(&self) -> &str { "mock" }
+        fn model_id(&self) -> &str { "mock" }
+    }
+
+    fn mock_summarizer() -> Arc<dyn LLMProvider> {
+        Arc::new(MockSummarizer)
+    }
+
    fn test_memory_manager() -> Arc<MemoryManager> {
        static MM: OnceLock<Arc<MemoryManager>> = OnceLock::new();
        MM.get_or_init(|| {
@ -454,4 +599,206 @@ mod tests {
        let compressor = ContextCompressor::new(mock_provider(), 128_000, test_memory_manager());
        assert_eq!(compressor.threshold(), 64_000);
    }
+
+    #[tokio::test]
+    async fn test_compress_if_needed_fast_trims_tool_results() {
+        // context_window=200 → threshold=100.
+        // user "Hi" (~6 raw), tool(3000 x's) → ~760 raw*1.2=912 > 100 → triggers compression.
+        // fast_trim to 50 chars should bring tokens well under 100.
+        let tmp = std::env::temp_dir().join(format!("picobot_ctx_trim_{}.db", std::process::id()));
+        let storage = Arc::new(crate::storage::Storage::new(&tmp).await.unwrap());
+        let mm = Arc::new(MemoryManager::new(storage, "test".into(), "test".into()));
+
+        let config = ContextCompressionConfig {
+            tool_result_trim_chars: 50,
+            protect_first_n: 0,
+            protect_last_n: 10,
+            max_passes: 0,
+            ..Default::default()
+        };
+        let compressor = ContextCompressor::with_config(mock_provider(), 200, config, mm);
+
+        let messages = vec![
+            ChatMessage::user("Hi"),
+            ChatMessage::tool("call1", "bash", &"x".repeat(3000)),
+        ];
+
+        let result = compressor.compress_if_needed(messages).await.unwrap();
+
+        let tool_msg = result.iter().find(|m| m.role == "tool").unwrap();
+        assert!(
+            tool_msg.content.len() < 3000,
+            "tool result should be trimmed, got {} chars",
+            tool_msg.content.len()
+        );
+        assert!(
+            tool_msg.content.contains("[Output truncated"),
+            "trim marker missing from: {}",
+            tool_msg.content
+        );
+
+        let _ = std::fs::remove_file(&tmp);
+    }
+
+    #[tokio::test]
+    async fn test_compress_once_no_duplicate_and_no_lost_user() {
+        // Verifies two boundary bugs in compress_once:
+        //  - B2A (L230): first user message duplicated when protect_first_n > 0
+        //  - B2B (L275): last user message lost when it is the final history message
+        //
+        // context_window=200 → threshold=100. Large tool outputs force LLM summarization.
+        let tmp = std::env::temp_dir().join(format!("picobot_ctx_boundary_{}.db", std::process::id()));
+        let storage = Arc::new(crate::storage::Storage::new(&tmp).await.unwrap());
+        let mm = Arc::new(MemoryManager::new(storage, "test".into(), "test".into()));
+
+        let config = ContextCompressionConfig {
+            tool_result_trim_chars: 2000,
+            protect_first_n: 1,   // system/protected → B2A: first user (after skip) duplicated
+            protect_last_n: 2,
+            max_passes: 1,
+            ..Default::default()
+        };
+        let compressor = ContextCompressor::with_config(mock_summarizer(), 200, config, mm);
+
+        // History: 9 messages, last message is user Q4.
+        // user_indices (skip 1) = [1, 3, 6, 8]
+        // B2A: init history[..=1] includes Q1, then loop i=0 pushes Q1 again → duplicate
+        // B2B: last_user_idx=8, 8 < 8 → false → Q4 not pushed → lost
+        let big = "x".repeat(3000);
+        let messages = vec![
+            ChatMessage::system("You are a helper."), // 0: protected
+            ChatMessage::user("Q1"),                   // 1: first user
+            ChatMessage::tool("t1", "bash", &big),     // 2
+            ChatMessage::user("Q2"),                   // 3
+            ChatMessage::assistant("thinking"),         // 4
+            ChatMessage::tool("t2", "bash", &big),     // 5
+            ChatMessage::user("Q3"),                   // 6
+            ChatMessage::assistant("thinking"),         // 7
+            ChatMessage::user("Q4"),                   // 8: LAST, is user → B2B triggers
+        ];
+
+        let result = compressor.compress_if_needed(messages).await.unwrap();
+
+        // B2A: "Q1" must appear exactly once
+        let q1_count = result.iter().filter(|m| m.role == "user" && m.content == "Q1").count();
+        assert_eq!(q1_count, 1, "Q1 should appear exactly once, got {}", q1_count);
+
+        // B2B: "Q4" must NOT be lost
+        let q4_count = result.iter().filter(|m| m.role == "user" && m.content == "Q4").count();
+        assert_eq!(q4_count, 1, "Q4 should appear exactly once (not lost), got {}", q4_count);
+
+        let _ = std::fs::remove_file(&tmp);
+    }
+
+    #[tokio::test]
+    async fn test_compress_hard_truncation_fallback() {
+        // When LLM compression fails (or max_passes=0) and tokens are still
+        // above 90% of context_window, a head+tail truncation kicks in.
+        let tmp = std::env::temp_dir().join(format!("picobot_ctx_trunc_{}.db", std::process::id()));
+        let storage = Arc::new(crate::storage::Storage::new(&tmp).await.unwrap());
+        let mm = Arc::new(MemoryManager::new(storage, "test".into(), "test".into()));
+
+        let config = ContextCompressionConfig {
+            tool_result_trim_chars: 500,  // trim reduces but not enough
+            protect_first_n: 1,
+            protect_last_n: 2,
+            max_passes: 0,                // no LLM summarization → will exceed danger
+            ..Default::default()
+        };
+        // context_window=100, danger_threshold=90.
+        // Each trimmed tool (~500 chars): ceil(500/4)+4 = 129 raw. 3 tools = 387.
+        // Plus users (~5 each) + system (~15) = ~417 raw * 1.2 = 500 > 90.
+        let compressor = ContextCompressor::with_config(mock_provider(), 100, config, mm);
+
+        let big = "x".repeat(3000);
+        let messages = vec![
+            ChatMessage::system("sys"),
+            ChatMessage::user("Q1"),
+            ChatMessage::tool("t1", "bash", &big),
+            ChatMessage::user("Q2"),
+            ChatMessage::tool("t2", "bash", &big),
+            ChatMessage::user("Q3"),
+            ChatMessage::tool("t3", "bash", &big),
+        ];
+
+        let result = compressor.compress_if_needed(messages).await.unwrap();
+
+        // After hard truncation: head (1) + trunc_note (1) + tail (2) = 4 messages
+        assert!(result.len() < 7, "expected truncation reduction, got {} messages", result.len());
+
+        // Truncation notice should be present
+        let has_notice = result.iter().any(|m| m.content.contains("Context truncation"));
+        assert!(has_notice, "hard truncation notice missing");
+
+        let _ = std::fs::remove_file(&tmp);
+    }
+
+    #[test]
+    fn test_repair_tool_pairs_removes_orphans() {
+        use crate::providers::ToolCall;
+
+        // Simulate compressed output: summary replaced assistant(tool_call: tc1),
+        // leaving tool(tc1) as an orphan. Legitimate tool(tc2) should be kept.
+        let mut messages = vec![
+            ChatMessage::user("Q1"),
+            ChatMessage::user("[Context Summary]\n\nsummary of previous turn"),
+            ChatMessage::tool("tc1", "bash", "orphan result"),         // orphan — tc1 never declared
+            ChatMessage::assistant("done"),                            // declares tc2
+            ChatMessage::tool("tc2", "bash", "legitimate result"),     // legit
+        ];
+        // Set tool_call_id on tool messages and tool_calls on assistant
+        messages[2].tool_call_id = Some("tc1".into());
+        messages[4].tool_call_id = Some("tc2".into());
+        messages[3].tool_calls = Some(vec![ToolCall {
+            id: "tc2".into(),
+            name: "bash".into(),
+            arguments: serde_json::json!({"cmd": "echo ok"}),
+        }]);
+
+        ContextCompressor::repair_tool_pairs(&mut messages);
+
+        // orphan should be removed; legitimate should stay
+        assert_eq!(messages.len(), 4);
+        assert!(messages.iter().all(|m| m.tool_call_id != Some("tc1".into())));
+        assert!(messages.iter().any(|m| m.tool_call_id == Some("tc2".into())));
+    }
+
+    #[test]
+    fn test_parse_context_limit_from_error() {
+        // OpenAI: "maximum context length is 128000"
+        assert_eq!(
+            ContextCompressor::parse_context_limit_from_error(
+                "This model's maximum context length is 128000 tokens."
+            ),
+            Some(128000)
+        );
+
+        // Anthropic: "context window of 200000"
+        assert_eq!(
+            ContextCompressor::parse_context_limit_from_error(
+                "Your request exceeds the context window of 200000."
+            ),
+            Some(200000)
+        );
+
+        // llama.cpp: "available context size (8448 tokens)"
+        assert_eq!(
+            ContextCompressor::parse_context_limit_from_error(
+                "context size exceeded, available context size (8448 tokens)"
+            ),
+            Some(8448)
+        );
+
+        // Non-context error should return None
+        assert_eq!(
+            ContextCompressor::parse_context_limit_from_error("Internal server error"),
+            None
+        );
+
+        // Numbers too small should be rejected
+        assert_eq!(
+            ContextCompressor::parse_context_limit_from_error("context length is 500"),
+            None
+        );
+    }
 }
--- a/src/agent/mod.rs
+++ b/src/agent/mod.rs
@ -3,5 +3,5 @@ pub mod context_compressor;
 pub mod system_prompt;

 pub use agent_loop::{AgentLoop, AgentError, AgentProcessResult};
-pub use context_compressor::ContextCompressor;
+pub use context_compressor::{ContextCompressor, estimate_tokens};
 pub use system_prompt::{build_system_prompt, PromptContext, PromptSection, SystemPromptBuilder};
--- a/src/session/session.rs
+++ b/src/session/session.rs
@ -21,6 +21,18 @@ use crate::config::LLMProviderConfig;
 use crate::agent::{AgentLoop, AgentError, ContextCompressor};
 use crate::agent::system_prompt::build_system_prompt;
 use crate::agent::context_compressor::ContextCompressionConfig;
+
+/// Check if an LLM error message indicates a context window overflow.
+fn is_context_overflow_error(msg: &str) -> bool {
+    let lower = msg.to_lowercase();
+    lower.contains("context length")
+        || lower.contains("context window")
+        || lower.contains("maximum context")
+        || lower.contains("too many tokens")
+        || lower.contains("token limit exceeded")
+        || lower.contains("prompt is too long")
+        || lower.contains("input is too long")
+}
 use crate::providers::{create_provider, LLMProvider};
 use crate::session::session_id::UnifiedSessionId;
 use crate::session::events::DialogInfo;
@ -372,6 +384,11 @@ impl Session {
        &self.compressor
    }

+    /// Get the compressor's current threshold for diagnostics/fallback.
+    pub fn compressor_threshold(&self) -> usize {
+        self.compressor.threshold()
+    }
+
    /// 创建一个临时的 AgentLoop 实例来处理消息
    pub fn create_agent(&self) -> Result<AgentLoop, AgentError> {
        Ok(AgentLoop::with_provider_and_tools(
@ -380,7 +397,7 @@ impl Session {
            self.provider_config.max_tool_iterations,
            self.provider_config.model_id.clone(),
            self.provider_config.workspace_dir.clone(),
-        ))
+        ).with_context_window(self.provider_config.token_limit))
    }

    /// 创建一个附通知通道的 AgentLoop 实例
@ -1319,15 +1336,17 @@ impl SessionManager {
                _ => None,
            };

-            // Build combined system prompt and inject at position 0
-            // This ensures AgentLoop.process() sees a system message and doesn't inject its own
+            // Build combined system prompt and inject at position 0 AFTER compression.
+            // This ensures AgentLoop.process() sees a system message without it participating
+            // in context compression (system prompt is dynamic and should not be persisted).
            let system_prompt = session_guard.build_system_prompt(&skills_prompt, memory_context.as_deref());
-            history.insert(0, ChatMessage::system(system_prompt));

-            let history = session_guard.compressor
+            let mut history = session_guard.compressor
                .compress_if_needed(history)
                .await?;

+            history.insert(0, ChatMessage::system(system_prompt.clone()));
+
            // Advance consolidation pointer — future compressions skip already-processed messages
            let now = chrono::Utc::now().timestamp_millis();
            session_guard.last_consolidated_at = Some(now);
@ -1336,7 +1355,28 @@ impl SessionManager {
            }

            let agent = session_guard.create_agent_with_notify(notify_tx)?;
-            let result = agent.process(history).await?;
+
+            // Try LLM call; on context overflow, re-compress with tighter window and retry once.
+            let result = match agent.process(history).await {
+                Ok(r) => r,
+                Err(AgentError::LlmError(ref msg))
+                    if is_context_overflow_error(msg) =>
+                {
+                    let new_window = crate::agent::ContextCompressor::parse_context_limit_from_error(msg)
+                        .unwrap_or(session_guard.compressor_threshold());
+                    tracing::warn!(
+                        new_window,
+                        error = %msg,
+                        "Context overflow in handle_message — retrying with tighter window"
+                    );
+                    session_guard.compressor.set_context_window(new_window);
+                    let raw = session_guard.get_history().to_vec();
+                    let mut retry = session_guard.compressor.compress_if_needed(raw).await?;
+                    retry.insert(0, ChatMessage::system(system_prompt));
+                    agent.process(retry).await?
+                }
+                Err(e) => return Err(e),
+            };

            for msg in result.emitted_messages {
                session_guard.add_message(msg, true).await
@ -1443,12 +1483,15 @@ impl SessionManager {
                job_name, job_id, channel, chat_id
            );
            let full_system_prompt = format!("{}{}", system_prompt, cron_context);
-            history.insert(0, ChatMessage::system(full_system_prompt));

-            let history = session_guard.compressor
+            // Inject system prompt AFTER compression so it doesn't participate
+            // in context compression (system prompt is dynamic and should not be persisted).
+            let mut history = session_guard.compressor
                .compress_if_needed(history)
                .await?;

+            history.insert(0, ChatMessage::system(full_system_prompt));
+
            let agent = session_guard.create_agent_with_notify(notify_tx)?;
            let result = agent.process(history).await?;