feat(file_read): enhance file reading with encoding detection and support for binary files
This commit is contained in:
parent
1a3cfbb0af
commit
db24d42076
@ -45,3 +45,4 @@ rmcp = { version = "1.6", default-features = false, features = [
|
||||
"which-command",
|
||||
] }
|
||||
http = "1"
|
||||
encoding_rs = "0.8"
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
use async_trait::async_trait;
|
||||
use encoding_rs::*;
|
||||
use serde_json::json;
|
||||
|
||||
use crate::tools::path_utils;
|
||||
@ -117,10 +118,23 @@ impl Tool for FileReadTool {
|
||||
});
|
||||
}
|
||||
|
||||
// Try to read as text
|
||||
match std::fs::read_to_string(&resolved) {
|
||||
Ok(content) => {
|
||||
let all_lines: Vec<&str> = content.lines().collect();
|
||||
// Read raw bytes and try multiple encodings
|
||||
let bytes = match std::fs::read(&resolved) {
|
||||
Ok(b) => b,
|
||||
Err(e) => {
|
||||
return Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some(format!("Failed to read file: {}", e)),
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
let (content, encoding_label) = decode_text(&bytes);
|
||||
|
||||
match content {
|
||||
Some(text) => {
|
||||
let all_lines: Vec<&str> = text.lines().collect();
|
||||
let total = all_lines.len();
|
||||
|
||||
if offset < 1 {
|
||||
@ -182,7 +196,14 @@ impl Tool for FileReadTool {
|
||||
end + 1
|
||||
));
|
||||
} else {
|
||||
result.push_str(&format!("\n\n(End of file — {} lines total)", total));
|
||||
result.push_str(&format!(
|
||||
"\n\n(End of file — {} lines total)",
|
||||
total
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(label) = encoding_label {
|
||||
result.insert_str(0, &format!("(编码: {})\n", label));
|
||||
}
|
||||
|
||||
Ok(ToolResult {
|
||||
@ -191,37 +212,69 @@ impl Tool for FileReadTool {
|
||||
error: None,
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
// Try to read as binary and encode as base64
|
||||
match std::fs::read(&resolved) {
|
||||
Ok(bytes) => {
|
||||
use base64::{engine::general_purpose::STANDARD, Engine};
|
||||
let encoded = STANDARD.encode(&bytes);
|
||||
let mime = mime_guess::from_path(&resolved)
|
||||
.first_or_octet_stream()
|
||||
.to_string();
|
||||
Ok(ToolResult {
|
||||
success: true,
|
||||
output: format!(
|
||||
"(Binary file: {}, {} bytes, base64 encoded)\n{}",
|
||||
mime,
|
||||
bytes.len(),
|
||||
encoded
|
||||
),
|
||||
error: None,
|
||||
})
|
||||
}
|
||||
Err(_) => Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some(format!("Failed to read file: {}", e)),
|
||||
}),
|
||||
}
|
||||
None => {
|
||||
// Truly binary file — base64 encode
|
||||
use base64::{engine::general_purpose::STANDARD, Engine};
|
||||
let encoded = STANDARD.encode(&bytes);
|
||||
let mime = mime_guess::from_path(&resolved)
|
||||
.first_or_octet_stream()
|
||||
.to_string();
|
||||
Ok(ToolResult {
|
||||
success: true,
|
||||
output: format!(
|
||||
"(Binary file: {}, {} bytes, base64 encoded)\n{}",
|
||||
mime,
|
||||
bytes.len(),
|
||||
encoded
|
||||
),
|
||||
error: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Decodes raw file bytes into text and reports which encoding was used.
///
/// Returns a pair `(text, label)`:
/// - `(Some(text), None)` when `bytes` is valid UTF-8;
/// - `(Some(text), Some(label))` when a fallback decoding was needed, where
///   `label` names the encoding that actually produced `text`;
/// - `(None, None)` when the bytes do not look like text, so the caller
///   should treat the file as binary.
fn decode_text(bytes: &[u8]) -> (Option<String>, Option<&'static str>) {
    // Fast path: valid UTF-8 (a leading UTF-8 BOM is valid UTF-8 too).
    if let Ok(text) = std::str::from_utf8(bytes) {
        return (Some(text.to_string()), None);
    }

    // `Encoding::decode` performs BOM sniffing, so input that starts with a
    // UTF-16 BOM is decoded as UTF-16 even though GB18030 was requested.
    // Report the encoding that was actually used instead of assuming GB18030.
    //
    // No separate GBK attempt is made: encoding_rs uses the gb18030 decoder
    // for GBK, so GBK can never succeed once this call has reported errors.
    // For the same reason a second pass through `UTF_8.decode` is pointless:
    // the bytes are already known to be invalid UTF-8, and any BOM-sniffed
    // success would have been returned from this call.
    let (decoded, actual_encoding, had_errors) = GB18030.decode(bytes);
    if !had_errors {
        let label = if actual_encoding == GB18030 {
            "GB18030"
        } else {
            actual_encoding.name()
        };
        return (Some(decoded.into_owned()), Some(label));
    }

    // Last resort: if more than half of the bytes are printable ASCII or
    // common whitespace, treat the content as text and decode it lossily.
    // The integer comparison is equivalent to `printable / len > 0.5` and
    // yields `false` for empty input.
    let printable = bytes
        .iter()
        .filter(|&&b| b.is_ascii_graphic() || matches!(b, b' ' | b'\n' | b'\r' | b'\t'))
        .count();

    if printable * 2 > bytes.len() {
        let text = String::from_utf8_lossy(bytes);
        return (Some(text.into_owned()), Some("UTF-8(lossy)"));
    }

    (None, None)
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user