feat: 优化文本分割逻辑,避免在 markdown 表格和代码块中间拆分

This commit is contained in:
ooodc 2026-06-06 10:32:28 +08:00
parent c3bfe32fa3
commit 39072f724e

View File

@ -379,7 +379,8 @@ impl Channel for WechatChannel {
}
/// Split text into chunks suitable for WeChat delivery.
/// Prefers splitting at paragraph breaks, then newlines, then sentence boundaries.
/// - Prefers splitting at paragraph breaks, then newlines, then sentence boundaries.
/// - Avoids splitting in the middle of markdown tables and code blocks.
const MAX_WECHAT_CHUNK_CHARS: usize = 2000;
const CHUNK_SEND_INTERVAL_MS: u64 = 500;
@ -396,25 +397,11 @@ fn split_text(text: &str, limit: usize) -> Vec<String> {
}
let end = remaining.floor_char_boundary(limit);
let window = &remaining[..end];
let cut = window
.rfind("\n\n")
.filter(|&i| i > end * 3 / 10)
.map(|i| i + 2)
.or_else(|| {
window
.rfind('\n')
.filter(|&i| i > end * 3 / 10)
.map(|i| i + 1)
})
.or_else(|| {
window
.rfind('。')
.filter(|&i| i > end * 3 / 10)
.map(|i| i + 3)
})
.unwrap_or(end);
// Find a safe split point, avoiding table/code-block interiors
let cut = find_split_point(window, limit);
chunks.push(remaining[..cut].to_string());
remaining = &remaining[cut..];
remaining = remaining[cut..].trim_start();
}
if chunks.is_empty() {
vec![String::new()]
@ -423,6 +410,109 @@ fn split_text(text: &str, limit: usize) -> Vec<String> {
}
}
/// Find the best byte offset at which to cut `window`, avoiding the interior of
/// markdown tables and fenced code blocks.
///
/// Candidate delimiters are tried from most to least preferred:
///
/// 1. paragraph break (`"\n\n"`),
/// 2. single newline,
/// 3. Chinese full stop (`。`),
/// 4. single newline again, this time with no minimum-position threshold.
///
/// For passes 1-3 the delimiter must start at or after 30% of the window so
/// that a chunk is not cut unreasonably short. Within each pass every
/// occurrence is examined from right to left, and the first one whose cut
/// position is not protected wins.
///
/// A cut position is considered protected when the line it falls on (or, for a
/// cut placed right after a newline, the line that begins there) is reported by
/// `find_unsafe_line_starts`.
///
/// # Parameters
/// - `window`: the candidate text; the returned offset indexes into it.
/// - `_limit`: unused; kept so the signature stays compatible with callers.
///
/// # Returns
/// A byte offset in `1..=window.len()` that lies on a `char` boundary, or
/// `window.len()` when no acceptable delimiter exists (hard cut).
fn find_split_point(window: &str, _limit: usize) -> usize {
    let end = window.len();
    let unsafe_starts = find_unsafe_line_starts(window);
    let threshold = end * 3 / 10;

    // A cut is safe when the line containing it is not flagged. For a cut that
    // sits directly after a '\n', the "containing line" is the one starting at
    // the cut itself, so the same lookup covers both line-start and mid-line cuts.
    let is_safe_cut = |cut: usize| {
        let line_start = window[..cut].rfind('\n').map_or(0, |i| i + 1);
        !unsafe_starts.contains(&line_start)
    };

    // (delimiter, minimum byte offset at which the delimiter may start)
    let passes: [(&str, usize); 4] = [
        ("\n\n", threshold), // paragraph break (best)
        ("\n", threshold),   // newline
        ("。", threshold),   // Chinese full stop
        ("\n", 0),           // any newline, threshold relaxed
    ];

    for (delim, min_pos) in passes {
        let found = window
            .rmatch_indices(delim)
            // Matches arrive right-to-left, so stop at the first one below the threshold.
            .take_while(|&(pos, _)| pos >= min_pos)
            // Cut after the delimiter; its byte length comes from the match itself.
            .map(|(pos, matched)| pos + matched.len())
            .find(|&cut| is_safe_cut(cut));
        if let Some(cut) = found {
            return cut;
        }
    }

    // Last resort: cut at the end of the window (may break a table or code
    // block, but guarantees forward progress).
    end
}
/// Returns the byte offsets of line starts in `window` that are unsafe to
/// split before, because doing so would cut a markdown table or a fenced code
/// block in two.
///
/// A line start is reported when the line is:
///
/// - inside a fenced code block (after the opening ``` line), including the
///   closing fence itself, since cutting before it would separate the fence
///   from the code it closes;
/// - a table row (trimmed line starts and ends with `|`) that directly follows
///   another table row.
///
/// The opening fence line and the first row of a table are *not* reported:
/// cutting immediately before them keeps the whole block together in the next
/// chunk. The line following a closing fence is likewise safe.
///
/// Offsets are returned in ascending order. An empty `window` yields an empty
/// vector.
fn find_unsafe_line_starts(window: &str) -> Vec<usize> {
    let mut unsafe_starts = Vec::new();
    let mut in_code_block = false;
    let mut in_table = false;
    let mut pos = 0;

    for line in window.split_inclusive('\n') {
        let trimmed = line.trim();

        if trimmed.starts_with("```") {
            if in_code_block {
                // Closing fence: cutting here would orphan it from its block.
                unsafe_starts.push(pos);
            }
            in_code_block = !in_code_block;
            // A fence line always terminates any table that preceded it.
            in_table = false;
        } else if in_code_block {
            unsafe_starts.push(pos);
        } else if trimmed.starts_with('|') && trimmed.ends_with('|') {
            // Only rows after the first are interior to the table.
            if in_table {
                unsafe_starts.push(pos);
            }
            in_table = true;
        } else {
            // Blank or ordinary text line: any table has ended.
            in_table = false;
        }

        pos += line.len();
    }

    unsafe_starts
}
#[cfg(test)]
mod tests {
use super::*;