fix: send_session_message 增加详细 debug 日志和文件名归一化匹配
- filename_matches_target 关键日志从 trace 升级到 debug - 增加 on-disk bytes hex dump 输出,便于定位编码差异 - UTF-8 解码成功但不匹配时继续尝试 GBK 解码 - 新增 normalize_filename() 去除空白/零宽字符后模糊比对 解决 LLM 在中文文件名中多插空格的问题 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
0de0b93896
commit
694b3ce0e0
@ -272,28 +272,36 @@ fn filename_matches_target(on_disk_name: &std::ffi::OsStr, target: &str) -> bool
|
||||
use std::os::unix::ffi::OsStrExt;
|
||||
let bytes = on_disk_name.as_bytes();
|
||||
|
||||
tracing::trace!(
|
||||
on_disk_bytes = ?bytes,
|
||||
tracing::debug!(
|
||||
on_disk_bytes_hex = %format_bytes_hex(bytes),
|
||||
target = %target,
|
||||
"filename_matches_target: comparing"
|
||||
target_bytes_hex = %format_bytes_hex(target.as_bytes()),
|
||||
"filename_matches_target: comparing on-disk bytes with target"
|
||||
);
|
||||
|
||||
// 直接 UTF-8 匹配
|
||||
match std::str::from_utf8(bytes) {
|
||||
Ok(decoded) => {
|
||||
let matches = decoded == target;
|
||||
tracing::trace!(
|
||||
tracing::debug!(
|
||||
decoded_utf8 = %decoded,
|
||||
decoded_len = decoded.len(),
|
||||
target_len = target.len(),
|
||||
matches = matches,
|
||||
"filename_matches_target: UTF-8 decode result"
|
||||
);
|
||||
return matches;
|
||||
if matches {
|
||||
return true;
|
||||
}
|
||||
// UTF-8 匹配失败,继续尝试其他编码 — 可能磁盘上是 GBK
|
||||
tracing::debug!(
|
||||
"filename_matches_target: UTF-8 matched but strings differ, trying GBK decode"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::trace!(
|
||||
tracing::debug!(
|
||||
utf8_error = %e,
|
||||
"filename_matches_target: not valid UTF-8, trying GBK"
|
||||
"filename_matches_target: not valid UTF-8, trying GBK decode"
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -304,15 +312,26 @@ fn filename_matches_target(on_disk_name: &std::ffi::OsStr, target: &str) -> bool
|
||||
let matches = gbk_decoded == target;
|
||||
tracing::debug!(
|
||||
gbk_decoded = %gbk_decoded,
|
||||
gbk_decoded_len = gbk_decoded.len(),
|
||||
target = %target,
|
||||
target_len = target.len(),
|
||||
matches = matches,
|
||||
gbk_decoded_bytes = ?gbk_decoded.as_bytes(),
|
||||
target_bytes = ?target.as_bytes(),
|
||||
"filename_matches_target: GBK decode result"
|
||||
);
|
||||
if matches {
|
||||
return true;
|
||||
}
|
||||
// GBK 解码成功但不匹配,尝试归一化后比对
|
||||
let normalized_disk = normalize_filename(&gbk_decoded);
|
||||
let normalized_target = normalize_filename(target);
|
||||
if normalized_disk == normalized_target {
|
||||
tracing::debug!(
|
||||
normalized_disk = %normalized_disk,
|
||||
normalized_target = %normalized_target,
|
||||
"filename_matches_target: matched after normalization"
|
||||
);
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
tracing::debug!(
|
||||
gbk_decoded_lossy = %gbk_decoded,
|
||||
@ -330,7 +349,43 @@ fn filename_matches_target(on_disk_name: &std::ffi::OsStr, target: &str) -> bool
|
||||
matches = matches,
|
||||
"filename_matches_target: lossy fallback result"
|
||||
);
|
||||
matches
|
||||
if matches {
|
||||
return true;
|
||||
}
|
||||
|
||||
// 最后尝试:lossy 归一化比对
|
||||
let normalized_lossy = normalize_filename(&lossy);
|
||||
let normalized_target = normalize_filename(target);
|
||||
if normalized_lossy == normalized_target {
|
||||
tracing::debug!(
|
||||
normalized_lossy = %normalized_lossy,
|
||||
normalized_target = %normalized_target,
|
||||
"filename_matches_target: matched after lossy normalization"
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Normalizes a file name for fuzzy comparison by removing every whitespace
/// character and every zero-width / invisible formatting character.
///
/// LLMs sometimes insert spaces into Chinese file names (for example
/// "139 邮箱" instead of "139邮箱"). Comparing the normalized forms of two
/// names ignores such differences and only compares the meaningful text.
///
/// Characters removed:
/// - anything for which [`char::is_whitespace`] is true (ASCII space, tab,
///   newlines, U+00A0, U+3000, and the remaining Unicode `White_Space`
///   code points such as U+2009 and U+202F);
/// - U+200B..=U+200F (zero-width space / non-joiner / joiner and the
///   left-to-right / right-to-left marks), U+2060 (word joiner) and
///   U+FEFF (zero-width no-break space / BOM), which are invisible but are
///   not classified as whitespace.
///
/// All other characters are kept, in their original order.
#[cfg(unix)]
fn normalize_filename(s: &str) -> String {
    s.chars()
        .filter(|&c| {
            // Zero-width characters are not `White_Space`, so they need an
            // explicit check in addition to `is_whitespace`.
            !c.is_whitespace()
                && !matches!(c, '\u{200B}'..='\u{200F}' | '\u{2060}' | '\u{FEFF}')
        })
        .collect()
}
|
||||
|
||||
/// 将字节切片格式化为十六进制字符串,用于调试日志。
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user