feat: 优化文本分割逻辑，避免在 markdown 表格和代码块中间拆分

2026-06-06 10:32:28 +08:00 · 2026-06-06 10:32:28 +08:00 · 39072f724e
commit 39072f724e
parent c3bfe32fa3
1 changed files with 109 additions and 19 deletions
--- a/src/channels/wechat.rs
+++ b/src/channels/wechat.rs
@ -379,7 +379,8 @@ impl Channel for WechatChannel {
 }
 /// Split text into chunks suitable for WeChat delivery.
-/// Prefers splitting at paragraph breaks, then newlines, then sentence boundaries.
+/// - Prefers splitting at paragraph breaks, then newlines, then sentence boundaries.
 /// - Avoids splitting in the middle of markdown tables and code blocks.
 const MAX_WECHAT_CHUNK_CHARS: usize = 2000;
 const CHUNK_SEND_INTERVAL_MS: u64 = 500;
@ -396,25 +397,11 @@ fn split_text(text: &str, limit: usize) -> Vec<String> {
        }
        let end = remaining.floor_char_boundary(limit);
        let window = &remaining[..end];
-        let cut = window
+
-            .rfind("\n\n")
+        // Find a safe split point, avoiding table/code-block interiors
-            .filter(|&i| i > end * 3 / 10)
+        let cut = find_split_point(window, limit);
            .map(|i| i + 2)
            .or_else(|| {
                window
                    .rfind('\n')
                    .filter(|&i| i > end * 3 / 10)
                    .map(|i| i + 1)
            })
            .or_else(|| {
                window
                    .rfind('。')
                    .filter(|&i| i > end * 3 / 10)
                    .map(|i| i + 3)
            })
            .unwrap_or(end);
        chunks.push(remaining[..cut].to_string());
-        remaining = &remaining[cut..];
+        remaining = remaining[cut..].trim_start();
    }
    if chunks.is_empty() {
        vec![String::new()]
@ -423,6 +410,109 @@ fn split_text(text: &str, limit: usize) -> Vec<String> {
    }
 }
 /// Find the best split point in `window`, avoiding markdown table rows and code fences.
 ///
 /// Returns a byte offset into `window` at which the text should be cut; the
 /// offset always lies on a character boundary. Candidates are tried in order
 /// of preference, and within each pass from the end of the window backwards:
 ///
 /// 1. after a paragraph break (`"\n\n"`),
 /// 2. after a newline,
 /// 3. after a Chinese full stop (`。`),
 /// 4. after any newline, this time without the minimum-size threshold.
 ///
 /// Passes 1-3 only accept delimiters located at or beyond 30% of the window
 /// so that chunks do not become needlessly small. A candidate is rejected when
 /// `find_unsafe_line_starts` reports it as unsafe, or, for a cut that lands in
 /// the middle of a line, when that line or the line following it is reported
 /// as unsafe.
 ///
 /// If no pass yields a safe candidate, `window.len()` is returned, which may
 /// split a protected block. `_limit` is currently unused.
 fn find_split_point(window: &str, _limit: usize) -> usize {
    let end = window.len();
    // Delimiters before this offset would produce a chunk under ~30% of the window.
    let preferred_floor = end * 3 / 10;
    let unsafe_starts = find_unsafe_line_starts(window);

    // Decide whether cutting at byte offset `cut` keeps protected blocks intact.
    let is_safe_cut = |cut: usize| -> bool {
        if unsafe_starts.contains(&cut) {
            return false;
        }
        // A cut that sits exactly on a line start is fully described by the
        // lookup above.
        if cut == 0 || window[..cut].ends_with('\n') {
            return true;
        }
        // Mid-line cut (e.g. after `。`): the cut effectively happens inside the
        // current line, so neither this line nor the next one may be flagged.
        let line_start = window[..cut].rfind('\n').map_or(0, |i| i + 1);
        let next_line_start = window[cut..].find('\n').map(|i| cut + i + 1);
        !unsafe_starts.contains(&line_start)
            && next_line_start.map_or(true, |next| !unsafe_starts.contains(&next))
    };

    // (delimiter, lowest acceptable delimiter position), best candidates first.
    let passes: [(&str, usize); 4] = [
        ("\n\n", preferred_floor), // paragraph break (best)
        ("\n", preferred_floor),   // newline
        ("。", preferred_floor),   // Chinese period
        ("\n", 0),                 // any newline (relaxed threshold)
    ];

    for &(delim, floor) in &passes {
        // Matches arrive right-to-left, so the scan can stop at the first one
        // that falls below the floor.
        let candidate = window
            .rmatch_indices(delim)
            .take_while(|&(pos, _)| pos >= floor)
            .map(|(pos, matched)| pos + matched.len())
            .find(|&cut| is_safe_cut(cut));
        if let Some(cut) = candidate {
            return cut;
        }
    }

    // Last resort: cut at the end of the window (may break a table, but better than nothing).
    end
 }
 /// Returns byte offsets at which cutting `window` would tear apart a markdown
 /// table or a fenced code block.
 ///
 /// An offset is reported when it is the start of:
 /// - a line inside a fenced code block, including the closing fence (cutting
 ///   before the closing fence would leave the block unterminated);
 /// - a table row that directly follows another table row.
 ///
 /// The start of an opening fence and the start of the first row of a table are
 /// not reported: cutting there keeps the whole block together in the next chunk.
 ///
 /// When `window` ends with a newline while a code block is still open or the
 /// last line was a table row, `window.len()` is reported as well, because the
 /// block may continue past the end of the window.
 ///
 /// A line counts as a fence when, after trimming, it starts with three
 /// backticks, and as a table row when, after trimming, it both starts and ends
 /// with `|`. Offsets are returned in ascending order.
 fn find_unsafe_line_starts(window: &str) -> Vec<usize> {
    let mut unsafe_starts = Vec::new();
    let mut in_code_block = false;
    let mut in_table = false;
    let mut pos = 0;

    for line in window.split_inclusive('\n') {
        let trimmed = line.trim();

        if trimmed.starts_with("```") {
            if in_code_block {
                // The closing fence still belongs to the block it terminates.
                unsafe_starts.push(pos);
            }
            in_code_block = !in_code_block;
            in_table = false;
        } else if in_code_block {
            unsafe_starts.push(pos);
        } else if trimmed.starts_with('|') && trimmed.ends_with('|') {
            // Only rows after the first one are interior to the table.
            if in_table {
                unsafe_starts.push(pos);
            }
            in_table = true;
        } else {
            // Blank lines and ordinary text both end a table.
            in_table = false;
        }

        pos += line.len();
    }

    // `pos` now equals `window.len()`. If the window stops right after a line
    // that is part of a still-open block, cutting at the very end is unsafe too.
    if (in_code_block || in_table) && window.ends_with('\n') {
        unsafe_starts.push(pos);
    }

    unsafe_starts
 }
 #[cfg(test)]
 mod tests {
    use super::*;