feat: 优化文本分割逻辑，避免在 markdown 表格和代码块中间拆分

2026-06-06 10:32:28 +08:00 · 2026-06-06 10:32:28 +08:00 · 39072f724e
commit 39072f724e
parent c3bfe32fa3
1 changed files with 109 additions and 19 deletions
--- a/src/channels/wechat.rs
+++ b/src/channels/wechat.rs
@ -379,7 +379,8 @@ impl Channel for WechatChannel {
 }

 /// Split text into chunks suitable for WeChat delivery.
-/// Prefers splitting at paragraph breaks, then newlines, then sentence boundaries.
+/// - Prefers splitting at paragraph breaks, then newlines, then sentence boundaries.
+/// - Avoids splitting in the middle of markdown tables and code blocks.
 const MAX_WECHAT_CHUNK_CHARS: usize = 2000;
 const CHUNK_SEND_INTERVAL_MS: u64 = 500;

@ -396,25 +397,11 @@ fn split_text(text: &str, limit: usize) -> Vec<String> {
        }
        let end = remaining.floor_char_boundary(limit);
        let window = &remaining[..end];
-        let cut = window
-            .rfind("\n\n")
-            .filter(|&i| i > end * 3 / 10)
-            .map(|i| i + 2)
-            .or_else(|| {
-                window
-                    .rfind('\n')
-                    .filter(|&i| i > end * 3 / 10)
-                    .map(|i| i + 1)
-            })
-            .or_else(|| {
-                window
-                    .rfind('。')
-                    .filter(|&i| i > end * 3 / 10)
-                    .map(|i| i + 3)
-            })
-            .unwrap_or(end);
+
+        // Find a safe split point, avoiding table/code-block interiors
+        let cut = find_split_point(window, limit);
        chunks.push(remaining[..cut].to_string());
-        remaining = &remaining[cut..];
+        remaining = remaining[cut..].trim_start();
    }
    if chunks.is_empty() {
        vec![String::new()]
@ -423,6 +410,109 @@ fn split_text(text: &str, limit: usize) -> Vec<String> {
    }
 }

+/// Find the best byte offset at which to cut `window`, avoiding cuts inside
+/// markdown tables and fenced code blocks.
+///
+/// Candidates are tried in order of preference: paragraph break, newline,
+/// Chinese full stop (each only when the delimiter lies in the last 70% of the
+/// window, so chunks do not become tiny), then any newline at all. Within a
+/// pass the latest acceptable candidate wins.
+///
+/// Returns the offset just past the chosen delimiter, or `window.len()` when
+/// no candidate qualifies (which may break a table or code block). `_limit` is
+/// unused and kept only so the signature stays stable.
+fn find_split_point(window: &str, _limit: usize) -> usize {
+    let end = window.len();
+    let unsafe_starts = find_unsafe_line_starts(window);
+    let threshold = end * 3 / 10;

+    // A cut at `at` is protected when the line it lands on is protected, or,
+    // for a mid-line cut, when the following line continues a protected block
+    // (e.g. the cut is inside a table header or an opening code fence).
+    let is_protected = |at: usize| {
+        let line_start = window[..at].rfind('\n').map_or(0, |i| i + 1);
+        if unsafe_starts.contains(&line_start) {
+            return true;
+        }
+        line_start != at
+            && window[at..]
+                .find('\n')
+                .is_some_and(|i| unsafe_starts.contains(&(at + i + 1)))
+    };

+    // (delimiter, minimum delimiter position) from best to worst.
+    let passes = [
+        ("\n\n", threshold), // paragraph break (best)
+        ("\n", threshold),   // newline
+        ("。", threshold),   // Chinese full stop
+        ("\n", 0),           // any newline, so an early one still beats a hard cut
+    ];
+    for (delim, min_pos) in passes {
+        let cut = window
+            .rmatch_indices(delim)
+            .take_while(|&(pos, _)| pos >= min_pos)
+            .map(|(pos, _)| pos + delim.len())
+            .find(|&after| !is_protected(after));
+        if let Some(after) = cut {
+            return after;
+        }
+    }

+    // Last resort: cut at the window end (may break a table or code block).
+    end
+}
+
+/// Returns the byte offsets of line starts in `window` that are unsafe to
+/// split before, because the cut would land inside a fenced code block or
+/// between two rows of a markdown table.
+///
+/// An opening code fence and the first row of a table are not reported, so a
+/// cut immediately before the whole block stays allowed. A closing fence is
+/// reported, since it must stay with the code it terminates. Offsets are
+/// returned in ascending order.
+fn find_unsafe_line_starts(window: &str) -> Vec<usize> {
+    let mut unsafe_starts = Vec::new();
+    let mut in_code_block = false;
+    let mut in_table = false;
+    let mut pos = 0;

+    for line in window.split_inclusive('\n') {
+        let start = pos;
+        pos += line.len();
+        let trimmed = line.trim();

+        // Track code blocks
+        if trimmed.starts_with("```") {
+            // Cutting before a closing fence would strand it in the next
+            // chunk; cutting before an opening fence keeps the block whole.
+            if in_code_block {
+                unsafe_starts.push(start);
+            }
+            in_code_block = !in_code_block;
+            in_table = false;
+            continue;
+        }

+        if in_code_block {
+            unsafe_starts.push(start);
+            continue;
+        }

+        // Track markdown table rows. The window may be cut mid-line, so a
+        // final line without '\n' may be a row that lost its closing pipe;
+        // a leading pipe alone is accepted there.
+        let truncated = !line.ends_with('\n');
+        let is_table_row = trimmed.starts_with('|') && (truncated || trimmed.ends_with('|'));
+        // Only a row that continues a table is unsafe; the first row is not.
+        if is_table_row && in_table {
+            unsafe_starts.push(start);
+        }
+        in_table = is_table_row;
+    }

+    unsafe_starts
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;