feat: 优化文本分割逻辑,避免在 markdown 表格和代码块中间拆分
This commit is contained in:
parent
c3bfe32fa3
commit
39072f724e
@ -379,7 +379,8 @@ impl Channel for WechatChannel {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Split text into chunks suitable for WeChat delivery.
|
/// Split text into chunks suitable for WeChat delivery.
|
||||||
/// Prefers splitting at paragraph breaks, then newlines, then sentence boundaries.
|
/// - Prefers splitting at paragraph breaks, then newlines, then sentence boundaries.
|
||||||
|
/// - Avoids splitting in the middle of markdown tables and code blocks.
|
||||||
const MAX_WECHAT_CHUNK_CHARS: usize = 2000;
|
const MAX_WECHAT_CHUNK_CHARS: usize = 2000;
|
||||||
const CHUNK_SEND_INTERVAL_MS: u64 = 500;
|
const CHUNK_SEND_INTERVAL_MS: u64 = 500;
|
||||||
|
|
||||||
@ -396,25 +397,11 @@ fn split_text(text: &str, limit: usize) -> Vec<String> {
|
|||||||
}
|
}
|
||||||
let end = remaining.floor_char_boundary(limit);
|
let end = remaining.floor_char_boundary(limit);
|
||||||
let window = &remaining[..end];
|
let window = &remaining[..end];
|
||||||
let cut = window
|
|
||||||
.rfind("\n\n")
|
// Find a safe split point, avoiding table/code-block interiors
|
||||||
.filter(|&i| i > end * 3 / 10)
|
let cut = find_split_point(window, limit);
|
||||||
.map(|i| i + 2)
|
|
||||||
.or_else(|| {
|
|
||||||
window
|
|
||||||
.rfind('\n')
|
|
||||||
.filter(|&i| i > end * 3 / 10)
|
|
||||||
.map(|i| i + 1)
|
|
||||||
})
|
|
||||||
.or_else(|| {
|
|
||||||
window
|
|
||||||
.rfind('。')
|
|
||||||
.filter(|&i| i > end * 3 / 10)
|
|
||||||
.map(|i| i + 3)
|
|
||||||
})
|
|
||||||
.unwrap_or(end);
|
|
||||||
chunks.push(remaining[..cut].to_string());
|
chunks.push(remaining[..cut].to_string());
|
||||||
remaining = &remaining[cut..];
|
remaining = remaining[cut..].trim_start();
|
||||||
}
|
}
|
||||||
if chunks.is_empty() {
|
if chunks.is_empty() {
|
||||||
vec![String::new()]
|
vec![String::new()]
|
||||||
@ -423,6 +410,109 @@ fn split_text(text: &str, limit: usize) -> Vec<String> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Find the best split point in `window`, avoiding markdown table rows and code fences.
|
||||||
|
fn find_split_point(window: &str, _limit: usize) -> usize {
|
||||||
|
let end = window.len();
|
||||||
|
|
||||||
|
// Build a set of line-start indices that are "unsafe" to split before
|
||||||
|
// because they're inside a markdown table or code block.
|
||||||
|
let unsafe_starts = find_unsafe_line_starts(window);
|
||||||
|
|
||||||
|
// Try split points from best to worst, skipping unsafe ones
|
||||||
|
for (delim, len) in &[
|
||||||
|
("\n\n", 2), // paragraph break (best)
|
||||||
|
("\n", 1), // newline
|
||||||
|
("。", 3), // Chinese period
|
||||||
|
("\n", 1), // any newline (retry with relaxed threshold)
|
||||||
|
] {
|
||||||
|
let min_pos = if *delim == "\n" && *len == 1 && end > 0 {
|
||||||
|
// For the relaxed newline pass, accept any position
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
end * 3 / 10
|
||||||
|
};
|
||||||
|
|
||||||
|
match window.rfind(delim) {
|
||||||
|
Some(pos) if pos >= min_pos => {
|
||||||
|
let after = pos + len;
|
||||||
|
// Check that the line starting at `after` is not inside a protected block
|
||||||
|
if !unsafe_starts.contains(&after) {
|
||||||
|
return after;
|
||||||
|
}
|
||||||
|
// If this split point is inside a protected block, keep looking earlier
|
||||||
|
if let Some(prev) = window[..pos].rfind(delim) {
|
||||||
|
let prev_after = prev + len;
|
||||||
|
if prev_after >= min_pos && !unsafe_starts.contains(&prev_after) {
|
||||||
|
return prev_after;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// If still inside protected block, try earlier .find
|
||||||
|
if let Some(earlier) = window[..pos].rfind("\n\n") {
|
||||||
|
let earlier_after = earlier + 2;
|
||||||
|
if !unsafe_starts.contains(&earlier_after) {
|
||||||
|
return earlier_after;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Last resort: just cut at the character boundary (may break a table, but better than nothing)
|
||||||
|
end
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the byte offsets of line starts in `window` that are unsafe to
/// split before, because cutting there would separate lines that belong to
/// the same markdown table or fenced code block.
///
/// Offsets are returned in ascending order. A line start is flagged when:
///
/// - the line is inside a fenced code block, or is the closing fence itself
///   (splitting before the closing fence would detach it from its block);
/// - the line is a table row that directly follows another table row.
///
/// The opening fence and the first row of a table are *not* flagged: cutting
/// immediately before them keeps the whole block together in the next chunk.
///
/// A line counts as a fence when its trimmed text starts with three backticks,
/// and as a table row when its trimmed text both starts and ends with `|`.
fn find_unsafe_line_starts(window: &str) -> Vec<usize> {
    let mut unsafe_starts = Vec::new();
    let mut in_code_block = false;
    let mut in_table = false;
    let mut pos = 0;

    for line in window.split_inclusive('\n') {
        let line_start = pos;
        pos += line.len();
        let trimmed = line.trim();

        if trimmed.starts_with("```") {
            if in_code_block {
                // The closing fence still belongs to the block it terminates.
                unsafe_starts.push(line_start);
            }
            in_code_block = !in_code_block;
            // A fence always interrupts any table that preceded it.
            in_table = false;
            continue;
        }

        if in_code_block {
            unsafe_starts.push(line_start);
            continue;
        }

        let is_table_row = trimmed.starts_with('|') && trimmed.ends_with('|');
        // Only continuation rows are unsafe; the first row marks a clean boundary.
        if is_table_row && in_table {
            unsafe_starts.push(line_start);
        }
        // Any non-row line (blank or not) ends the current table.
        in_table = is_table_row;
    }

    unsafe_starts
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user