fix: send_session_message 增加详细 debug 日志和文件名归一化匹配

- filename_matches_target 关键日志从 trace 升级到 debug
- 增加 on-disk bytes hex dump 输出,便于定位编码差异
- UTF-8 解码成功但不匹配时继续尝试 GBK 解码
- 新增 normalize_filename() 去除空白/零宽字符后模糊比对
  解决 LLM 在中文文件名中多插空格的问题

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
oudecheng 2026-06-11 09:44:47 +08:00
parent 0de0b93896
commit 694b3ce0e0

View File

@ -272,28 +272,36 @@ fn filename_matches_target(on_disk_name: &std::ffi::OsStr, target: &str) -> bool
use std::os::unix::ffi::OsStrExt; use std::os::unix::ffi::OsStrExt;
let bytes = on_disk_name.as_bytes(); let bytes = on_disk_name.as_bytes();
tracing::trace!( tracing::debug!(
on_disk_bytes = ?bytes,
on_disk_bytes_hex = %format_bytes_hex(bytes), on_disk_bytes_hex = %format_bytes_hex(bytes),
target = %target, target = %target,
"filename_matches_target: comparing" target_bytes_hex = %format_bytes_hex(target.as_bytes()),
"filename_matches_target: comparing on-disk bytes with target"
); );
// 直接 UTF-8 匹配 // 直接 UTF-8 匹配
match std::str::from_utf8(bytes) { match std::str::from_utf8(bytes) {
Ok(decoded) => { Ok(decoded) => {
let matches = decoded == target; let matches = decoded == target;
tracing::trace!( tracing::debug!(
decoded_utf8 = %decoded, decoded_utf8 = %decoded,
decoded_len = decoded.len(),
target_len = target.len(),
matches = matches, matches = matches,
"filename_matches_target: UTF-8 decode result" "filename_matches_target: UTF-8 decode result"
); );
return matches; if matches {
return true;
}
// UTF-8 匹配失败,继续尝试其他编码 — 可能磁盘上是 GBK
tracing::debug!(
"filename_matches_target: UTF-8 matched but strings differ, trying GBK decode"
);
} }
Err(e) => { Err(e) => {
tracing::trace!( tracing::debug!(
utf8_error = %e, utf8_error = %e,
"filename_matches_target: not valid UTF-8, trying GBK" "filename_matches_target: not valid UTF-8, trying GBK decode"
); );
} }
} }
@ -304,15 +312,26 @@ fn filename_matches_target(on_disk_name: &std::ffi::OsStr, target: &str) -> bool
let matches = gbk_decoded == target; let matches = gbk_decoded == target;
tracing::debug!( tracing::debug!(
gbk_decoded = %gbk_decoded, gbk_decoded = %gbk_decoded,
gbk_decoded_len = gbk_decoded.len(),
target = %target, target = %target,
target_len = target.len(),
matches = matches, matches = matches,
gbk_decoded_bytes = ?gbk_decoded.as_bytes(),
target_bytes = ?target.as_bytes(),
"filename_matches_target: GBK decode result" "filename_matches_target: GBK decode result"
); );
if matches { if matches {
return true; return true;
} }
// GBK 解码成功但不匹配,尝试归一化后比对
let normalized_disk = normalize_filename(&gbk_decoded);
let normalized_target = normalize_filename(target);
if normalized_disk == normalized_target {
tracing::debug!(
normalized_disk = %normalized_disk,
normalized_target = %normalized_target,
"filename_matches_target: matched after normalization"
);
return true;
}
} else { } else {
tracing::debug!( tracing::debug!(
gbk_decoded_lossy = %gbk_decoded, gbk_decoded_lossy = %gbk_decoded,
@ -330,7 +349,43 @@ fn filename_matches_target(on_disk_name: &std::ffi::OsStr, target: &str) -> bool
matches = matches, matches = matches,
"filename_matches_target: lossy fallback result" "filename_matches_target: lossy fallback result"
); );
matches if matches {
return true;
}
// 最后尝试lossy 归一化比对
let normalized_lossy = normalize_filename(&lossy);
let normalized_target = normalize_filename(target);
if normalized_lossy == normalized_target {
tracing::debug!(
normalized_lossy = %normalized_lossy,
normalized_target = %normalized_target,
"filename_matches_target: matched after lossy normalization"
);
return true;
}
false
}
/// Normalizes a file name for fuzzy comparison by removing every whitespace
/// character and every zero-width / invisible formatting character.
///
/// LLMs sometimes insert stray spaces into Chinese file names (for example
/// "139 邮箱" instead of "139邮箱"). Applying this function to both sides of a
/// comparison makes the match depend only on the meaningful text.
///
/// Characters removed:
/// - anything for which [`char::is_whitespace`] returns `true` (the Unicode
///   `White_Space` property): ASCII space/tab/newline/carriage return,
///   no-break space U+00A0, ideographic space U+3000, and also thin, en/em
///   and narrow no-break spaces, line/paragraph separators, etc.;
/// - zero-width characters that are not classified as whitespace:
///   U+200B..=U+200F (zero-width space, ZWNJ, ZWJ, LRM, RLM),
///   U+2060 (word joiner) and U+FEFF (BOM / zero-width no-break space).
///
/// All other characters are kept unchanged and in their original order.
/// Returns a newly allocated `String`; an empty input yields an empty string.
#[cfg(unix)]
fn normalize_filename(s: &str) -> String {
    s.chars()
        .filter(|&c| {
            // These code points are invisible but are NOT part of the Unicode
            // White_Space property, so they need an explicit check.
            let is_zero_width =
                matches!(c, '\u{200B}'..='\u{200F}' | '\u{2060}' | '\u{FEFF}');
            !(c.is_whitespace() || is_zero_width)
        })
        .collect()
}
/// 将字节切片格式化为十六进制字符串,用于调试日志。 /// 将字节切片格式化为十六进制字符串,用于调试日志。