/// Find the byte offset at which `window` should be cut.
///
/// `window` is the prefix of the remaining text that fits into one chunk. The
/// returned offset is always on a UTF-8 character boundary, is never `0` for a
/// non-empty window, and is at most `window.len()`.
///
/// Candidate cut points are tried from best to worst:
///
/// 1. after a paragraph break (`"\n\n"`),
/// 2. after a newline,
/// 3. after a Chinese full stop (`'。'`),
/// 4. after *any* newline, however early in the window.
///
/// Passes 1-3 only accept delimiters located at or beyond 30% of the window so
/// that chunks do not become needlessly small. Within a pass, delimiters are
/// examined from the end of the window backwards, and every candidate that would
/// land inside a markdown table or fenced code block is skipped.
///
/// If no acceptable candidate exists the whole window is used (`window.len()`),
/// which may break a table or code block but guarantees forward progress.
///
/// `_limit` is unused; it is kept so existing call sites keep compiling.
fn find_split_point(window: &str, _limit: usize) -> usize {
    let end = window.len();
    let unsafe_starts = find_unsafe_line_starts(window);
    let threshold = end * 3 / 10;

    // (delimiter, minimum byte position of the delimiter)
    let passes: [(&str, usize); 4] = [
        ("\n\n", threshold), // paragraph break (best)
        ("\n", threshold),   // newline
        ("。", threshold),   // Chinese full stop
        ("\n", 0),           // any newline, threshold relaxed
    ];

    for &(delim, min_pos) in passes.iter() {
        // Matches are yielded right-to-left, so positions are descending and
        // `take_while` stops as soon as we drop below the threshold.
        let found = window
            .rmatch_indices(delim)
            .take_while(|&(pos, _)| pos >= min_pos)
            .map(|(pos, _)| pos + delim.len())
            .find(|&cut| is_safe_cut(window, cut, &unsafe_starts));

        if let Some(cut) = found {
            return cut;
        }
    }

    // Last resort: use the whole window.
    end
}

/// Decide whether cutting `window` at byte offset `cut` keeps tables and code
/// blocks intact.
///
/// `unsafe_starts` must be the (ascending) result of
/// [`find_unsafe_line_starts`] for the same `window`.
fn is_safe_cut(window: &str, cut: usize, unsafe_starts: &[usize]) -> bool {
    // Start of the line that `cut` falls on (equals `cut` when the cut is
    // directly after a newline).
    let line_start = window[..cut].rfind('\n').map_or(0, |i| i + 1);

    // Covers both "cut before a protected line" and "cut in the middle of a
    // protected line".
    if unsafe_starts.binary_search(&line_start).is_ok() {
        return false;
    }
    if cut == line_start {
        return true;
    }

    // Mid-line cut on a line that is safe to split *before*: it may still be
    // the first row of a table or a fence line, which must not be torn apart.
    let line = window[line_start..].trim_start();
    !(line.starts_with('|') || line.starts_with("```"))
}

/// Return, in ascending order, the byte offsets of line starts in `window`
/// that must not be used as a cut point.
///
/// A line start is reported when cutting directly before it would split a
/// protected block:
///
/// * every line inside a fenced code block, **including the closing fence**
///   (the opening fence is safe: cutting before it keeps the block whole);
/// * every markdown table row that directly follows another table row (the
///   first row of a table is safe to cut before);
/// * `window.len()` itself when the window ends with a newline while a code
///   block is still open.
///
/// A table row is a line whose trimmed text starts and ends with `|`. The final
/// line of the window may have been truncated by the chunk limit, so when it has
/// no trailing newline a leading `|` alone is enough.
///
/// Fence state always starts as "outside a code block"; it is not carried over
/// from earlier windows.
fn find_unsafe_line_starts(window: &str) -> Vec<usize> {
    let mut unsafe_starts = Vec::new();
    let mut in_code_block = false;
    let mut prev_was_table_row = false;
    let mut pos = 0;

    for line in window.split_inclusive('\n') {
        let trimmed = line.trim();

        if trimmed.starts_with("```") {
            if in_code_block {
                // Cutting before the closing fence would orphan it from its code.
                unsafe_starts.push(pos);
            }
            in_code_block = !in_code_block;
            prev_was_table_row = false;
        } else if in_code_block {
            unsafe_starts.push(pos);
        } else {
            // Only the last line can lack a trailing newline.
            let truncated = !line.ends_with('\n');
            let is_table_row =
                trimmed.starts_with('|') && (trimmed.ends_with('|') || truncated);
            if is_table_row && prev_was_table_row {
                unsafe_starts.push(pos);
            }
            prev_was_table_row = is_table_row;
        }

        pos += line.len();
    }

    // The window stops right after a newline inside an open code block: the
    // end offset is itself a line start within that block.
    if in_code_block && window.ends_with('\n') {
        unsafe_starts.push(pos);
    }

    unsafe_starts
}