feat: 添加附件路径解析功能,支持非 UTF-8 编码文件名的回退机制

This commit is contained in:
oudecheng 2026-06-10 18:07:17 +08:00
parent a4cdb31ba0
commit 0646a17073

View File

@ -1,16 +1,20 @@
use std::io::Read; use std::io::Read;
use std::path::Path; use std::path::{Path, PathBuf};
use std::sync::Arc; use std::sync::Arc;
use anyhow::anyhow; use anyhow::anyhow;
use async_trait::async_trait; use async_trait::async_trait;
use base64::Engine; use base64::Engine;
use encoding_rs::GBK;
use serde_json::json; use serde_json::json;
use crate::bus::MediaItem; use crate::bus::MediaItem;
use super::traits::{Tool, ToolContext, ToolResult}; use super::traits::{Tool, ToolContext, ToolResult};
#[cfg(unix)]
use std::os::unix::ffi::OsStringExt;
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct SessionSendRequest { pub struct SessionSendRequest {
pub text: Option<String>, pub text: Option<String>,
@ -165,6 +169,42 @@ fn validate_context(context: &ToolContext) -> anyhow::Result<()> {
Ok(()) Ok(())
} }
/// Resolves an attachment path, falling back to a GBK-encoded file name.
///
/// On Linux a file name on disk may be stored in a non-UTF-8 encoding such as
/// GBK, so a path spelled as a UTF-8 string can fail to match it. The lookup
/// order is:
///
/// 1. Probe `raw_path` exactly as given; if it exists, return it unchanged.
/// 2. Otherwise re-encode only the final path component as GBK and, on Unix,
///    probe `parent/<gbk bytes>`; if that exists, return it.
/// 3. Otherwise return the original path so the caller reports the failure.
///
/// Parent directories are never re-encoded. On non-Unix targets step 2 never
/// produces a candidate, which is why unused bindings are allowed there.
#[cfg_attr(not(unix), allow(unused_variables))]
fn resolve_attachment_path(raw_path: &str) -> PathBuf {
    let original = Path::new(raw_path);

    if !original.exists() {
        // Both a parent and a final component are required for the fallback.
        if let Some((dir, name)) = original.parent().zip(original.file_name()) {
            let name_text = name.to_string_lossy();
            let (encoded, _, unmappable) = GBK.encode(&name_text);

            // Only probe when every character of the name is representable in GBK.
            if !unmappable {
                #[cfg(unix)]
                {
                    // Raw bytes can only become an `OsString` on Unix.
                    let candidate = dir.join(std::ffi::OsString::from_vec(encoded.into_owned()));
                    if candidate.exists() {
                        return candidate;
                    }
                }
            }
        }
    }

    // Either the path exists as given or no fallback matched.
    original.to_path_buf()
}
fn parse_attachments(value: &serde_json::Value) -> anyhow::Result<Vec<MediaItem>> { fn parse_attachments(value: &serde_json::Value) -> anyhow::Result<Vec<MediaItem>> {
// 支持两种格式:实际数组 或 字符串化的 JSON 数组 // 支持两种格式:实际数组 或 字符串化的 JSON 数组
let paths = if let Some(arr) = value.as_array() { let paths = if let Some(arr) = value.as_array() {
@ -198,7 +238,11 @@ fn parse_attachments(value: &serde_json::Value) -> anyhow::Result<Vec<MediaItem>
let mut attachments = Vec::with_capacity(paths.len()); let mut attachments = Vec::with_capacity(paths.len());
for raw_path in paths { for raw_path in paths {
let metadata = std::fs::metadata(&raw_path) // 解析路径(含编码回退),确保能正确访问非 UTF-8 文件名的文件
let resolved_path = resolve_attachment_path(&raw_path);
let resolved_path_str = resolved_path.to_string_lossy().to_string();
let metadata = std::fs::metadata(&resolved_path)
.map_err(|err| anyhow!("failed to access attachment '{}': {}", raw_path, err))?; .map_err(|err| anyhow!("failed to access attachment '{}': {}", raw_path, err))?;
if !metadata.is_file() { if !metadata.is_file() {
return Err(anyhow!("attachment path is not a file: {}", raw_path)); return Err(anyhow!("attachment path is not a file: {}", raw_path));
@ -209,21 +253,20 @@ fn parse_attachments(value: &serde_json::Value) -> anyhow::Result<Vec<MediaItem>
let content_base64 = (metadata.len() <= 50 * 1024 * 1024) let content_base64 = (metadata.len() <= 50 * 1024 * 1024)
.then(|| { .then(|| {
let mut file = std::fs::File::open(&raw_path)?; let mut file = std::fs::File::open(&resolved_path)?;
let mut buf = Vec::with_capacity(metadata.len() as usize); let mut buf = Vec::with_capacity(metadata.len() as usize);
file.read_to_end(&mut buf)?; file.read_to_end(&mut buf)?;
Ok::<_, anyhow::Error>(base64::engine::general_purpose::STANDARD.encode(&buf)) Ok::<_, anyhow::Error>(base64::engine::general_purpose::STANDARD.encode(&buf))
}) })
.transpose()?; .transpose()?;
let file_name = Path::new(&raw_path) let file_name = Path::new(&resolved_path)
.file_name() .file_name()
.and_then(|n| n.to_str()) .map(|n| n.to_string_lossy().to_string());
.map(ToOwned::to_owned);
let media_type = infer_media_type(&raw_path); let media_type = infer_media_type(&resolved_path_str);
let mut item = MediaItem::new(raw_path.to_string(), media_type); let mut item = MediaItem::new(resolved_path_str, media_type);
item.mime_type = mime_guess::from_path(&raw_path) item.mime_type = mime_guess::from_path(&resolved_path)
.first_raw() .first_raw()
.map(ToOwned::to_owned); .map(ToOwned::to_owned);
item.content_base64 = content_base64; item.content_base64 = content_base64;
@ -371,4 +414,36 @@ mod tests {
assert_eq!(attachments.len(), 1); assert_eq!(attachments.len(), 1);
assert_eq!(attachments[0].media_type, "file"); assert_eq!(attachments[0].media_type, "file");
} }
#[test]
#[cfg(unix)]
fn parse_attachments_resolves_gbk_encoded_filename() {
    use encoding_rs::GBK;
    use std::os::unix::ffi::OsStringExt;

    // The name as a caller would spell it (UTF-8), and its GBK byte form.
    let display_name = "测试文件.txt";
    let (encoded, _, unmappable) = GBK.encode(display_name);
    assert!(!unmappable);

    // Create a file on disk whose name consists of the raw GBK bytes.
    let scratch = tempfile::tempdir().unwrap();
    let on_disk = scratch
        .path()
        .join(std::ffi::OsString::from_vec(encoded.into_owned()));
    std::fs::write(&on_disk, b"test content").unwrap();

    // Ask for the attachment using the UTF-8 spelling of the same name.
    let requested = scratch
        .path()
        .join(display_name)
        .to_string_lossy()
        .to_string();
    let attachments = parse_attachments(&json!([requested])).unwrap();

    assert_eq!(attachments.len(), 1);
    assert_eq!(attachments[0].media_type, "file");
    // The on-disk name is GBK, so it is extracted lossily; it must still be present.
    assert!(attachments[0].file_name.is_some());
}
} }