PicoBot/src/tools/file_read.rs
2026-06-16 22:56:01 +08:00

418 lines
13 KiB
Rust

use async_trait::async_trait;
use encoding_rs::*;
use serde_json::json;
use crate::tools::path_utils;
use crate::tools::traits::{Tool, ToolResult};
/// Upper bound on the rendered text output. Note that it is compared against
/// `String::len()` (a byte length), not a count of Unicode characters.
const MAX_CHARS: usize = 128_000;
/// Files whose metadata reports a larger size than this are rejected before
/// their contents are read.
const MAX_FILE_BYTES: u64 = 5 * 1024 * 1024;
/// Largest binary (non-text) payload that will be inlined as base64.
const MAX_BINARY_BYTES: usize = 512 * 1024;
/// Number of lines returned when the caller does not supply `limit`.
const DEFAULT_LIMIT: usize = 2000;
/// Tool that reads a file and returns its contents as numbered lines
/// (or base64 for binary data).
///
/// `Debug` and `Clone` are derived so the tool can be logged and duplicated
/// like any other plain configuration value.
#[derive(Debug, Clone)]
pub struct FileReadTool {
    /// Optional directory handed to `path_utils::resolve_path` when resolving
    /// the requested path; `None` means no directory is supplied.
    allowed_dir: Option<String>,
}
impl FileReadTool {
pub fn new() -> Self {
Self { allowed_dir: None }
}
pub fn with_allowed_dir(dir: String) -> Self {
Self {
allowed_dir: Some(dir),
}
}
}
impl Default for FileReadTool {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Tool for FileReadTool {
    /// Returns the tool's name, `file_read`.
    fn name(&self) -> &str {
        "file_read"
    }

    /// Returns the human-readable description of the tool.
    fn description(&self) -> &str {
        "Read the contents of a file. Returns numbered lines. Use offset and limit to paginate through large files."
    }

    /// Returns the JSON schema of the accepted arguments: a required `path`
    /// and optional 1-indexed `offset` and `limit`.
    fn parameters_schema(&self) -> serde_json::Value {
        json!({
            "type": "object",
            "properties": {
                "path": {
                    "type": "string",
                    "description": "The file path to read"
                },
                "offset": {
                    "type": "integer",
                    "description": "Line number to start reading from (1-indexed, default 1)",
                    "minimum": 1
                },
                "limit": {
                    "type": "integer",
                    "description": "Maximum number of lines to read (default 2000)",
                    "minimum": 1
                }
            },
            "required": ["path"]
        })
    }

    /// Reports that this tool is read-only.
    fn read_only(&self) -> bool {
        true
    }

    /// Reads the file named by `args["path"]`.
    ///
    /// Text files are returned as numbered lines starting at `offset`
    /// (1-indexed, default 1) and spanning at most `limit` lines (default
    /// `DEFAULT_LIMIT`), followed by a pagination footer. Files that
    /// `decode_text` cannot interpret as text are returned base64-encoded,
    /// provided they are no larger than `MAX_BINARY_BYTES`.
    ///
    /// Every failure handled here (missing parameter, unresolved path, missing
    /// or oversized file, invalid `offset`/`limit`) is reported as
    /// `Ok(ToolResult { success: false, .. })`; this function never builds an
    /// `Err` itself.
    async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult> {
        let path = match args.get("path").and_then(|v| v.as_str()) {
            Some(p) => p,
            None => {
                return Ok(read_failure(
                    "Missing required parameter: path".to_string(),
                ));
            }
        };

        // Saturate instead of truncating with `as`, so an out-of-range value on
        // a 32-bit target cannot wrap around to a small number.
        let offset = args
            .get("offset")
            .and_then(|v| v.as_u64())
            .map(|v| v.min(usize::MAX as u64) as usize)
            .unwrap_or(1);
        let limit = args
            .get("limit")
            .and_then(|v| v.as_u64())
            .map(|v| v.min(usize::MAX as u64) as usize)
            .unwrap_or(DEFAULT_LIMIT);

        let resolved = match path_utils::resolve_path(path, self.allowed_dir.as_deref()) {
            Ok(p) => p,
            Err(e) => return Ok(read_failure(e)),
        };

        // A single metadata call replaces the former exists()/is_file()/metadata()
        // sequence: it avoids racing against concurrent file-system changes and
        // lets errors other than "not found" (e.g. permission denied) be
        // reported as what they are.
        // NOTE(review): the std::fs calls below are synchronous; if this runs
        // on a shared async runtime, consider moving them to a blocking task.
        let metadata = match std::fs::metadata(&resolved) {
            Ok(m) => m,
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                return Ok(read_failure(format!("File not found: {}", path)));
            }
            Err(e) => {
                return Ok(read_failure(format!("Failed to inspect file: {}", e)));
            }
        };
        if !metadata.is_file() {
            return Ok(read_failure(format!("Not a file: {}", path)));
        }
        if metadata.len() > MAX_FILE_BYTES {
            return Ok(read_failure(format!(
                "File too large to read safely: {} bytes (max {} bytes). Use a narrower tool or inspect a smaller excerpt.",
                metadata.len(),
                MAX_FILE_BYTES
            )));
        }

        // Read raw bytes; `decode_text` decides whether they are text and in
        // which encoding.
        let bytes = match std::fs::read(&resolved) {
            Ok(b) => b,
            Err(e) => {
                return Ok(read_failure(format!("Failed to read file: {}", e)));
            }
        };

        let (content, encoding_label) = decode_text(&bytes);
        match content {
            Some(text) => {
                let mut output = match render_text_window(&text, offset, limit) {
                    Ok(rendered) => rendered,
                    Err(message) => return Ok(read_failure(message)),
                };
                // Tell the caller when the text was not plain UTF-8.
                if let Some(label) = encoding_label {
                    output.insert_str(0, &format!("(编码: {})\n", label));
                }
                Ok(ToolResult {
                    success: true,
                    output,
                    error: None,
                })
            }
            None => {
                // Not decodable as text: inline as base64 if small enough.
                use base64::{Engine, engine::general_purpose::STANDARD};
                let mime = mime_guess::from_path(&resolved)
                    .first_or_octet_stream()
                    .to_string();
                if bytes.len() > MAX_BINARY_BYTES {
                    return Ok(read_failure(format!(
                        "Binary file too large to inline: {}, {} bytes (max {} bytes).",
                        mime,
                        bytes.len(),
                        MAX_BINARY_BYTES
                    )));
                }
                let encoded = STANDARD.encode(&bytes);
                Ok(ToolResult {
                    success: true,
                    output: format!(
                        "(Binary file: {}, {} bytes, base64 encoded)\n{}",
                        mime,
                        bytes.len(),
                        encoded
                    ),
                    error: None,
                })
            }
        }
    }
}

/// Builds the unsuccessful `ToolResult` used for every failure path of
/// `file_read`: empty output plus the given error message.
fn read_failure(message: String) -> ToolResult {
    ToolResult {
        success: false,
        output: String::new(),
        error: Some(message),
    }
}

/// Renders lines `offset..offset + limit` (1-indexed) of `text` as
/// `"<line number>| <line>"` rows followed by a pagination footer.
///
/// The rows are capped at `MAX_CHARS` bytes. When the cap is hit, only whole
/// lines that fit are kept and the footer points at the first line that was
/// *not* shown, so a caller following the hint never skips content.
///
/// Returns `Err` with a user-facing message when `offset` or `limit` is zero
/// or `offset` lies beyond the last line.
fn render_text_window(text: &str, offset: usize, limit: usize) -> Result<String, String> {
    if offset < 1 {
        return Err(format!("offset must be at least 1, got {}", offset));
    }
    if limit < 1 {
        // A zero limit would yield an empty, inverted range and a footer that
        // asks the caller to retry the same offset forever.
        return Err(format!("limit must be at least 1, got {}", limit));
    }

    let all_lines: Vec<&str> = text.lines().collect();
    let total = all_lines.len();
    if total == 0 && offset == 1 {
        // Reading an empty file from the start is a valid request, not an error.
        return Ok("(Empty file — 0 lines total)".to_string());
    }
    if offset > total {
        return Err(format!(
            "offset {} is beyond end of file ({} lines)",
            offset, total
        ));
    }

    let start = offset - 1;
    // saturating_add: a huge `limit` must clamp to the file end, not overflow.
    let requested_end = start.saturating_add(limit).min(total);
    let numbered: Vec<String> = all_lines[start..requested_end]
        .iter()
        .enumerate()
        .map(|(i, line)| format!("{}| {}", start + i + 1, line))
        .collect();

    // Byte length of all rows joined with '\n'. `numbered` is never empty here
    // because `start < total` and `limit >= 1`.
    let full_len = numbered.iter().map(|row| row.len()).sum::<usize>() + numbered.len() - 1;

    let (mut result, shown) = if full_len > MAX_CHARS {
        // Count how many whole rows (each charged one separator byte) fit.
        let mut used = 0usize;
        let mut fit = 0usize;
        for row in &numbered {
            used += row.len() + 1;
            if used > MAX_CHARS {
                break;
            }
            fit += 1;
        }
        let mut kept = if fit == 0 {
            // The first row alone is over budget: emit a prefix cut on a char
            // boundary so the caller still sees something and pagination can
            // move on to the next line.
            let first = &numbered[0];
            let mut cut = MAX_CHARS.min(first.len());
            while !first.is_char_boundary(cut) {
                cut -= 1;
            }
            fit = 1;
            first[..cut].to_string()
        } else {
            numbered[..fit].join("\n")
        };
        let truncated = full_len - kept.len();
        kept.push_str(&format!("\n\n... ({} chars truncated) ...", truncated));
        (kept, fit)
    } else {
        (numbered.join("\n"), numbered.len())
    };

    // Last line actually shown (1-indexed), which may be earlier than the
    // requested end when the byte budget cut the output short.
    let end = start + shown;
    if end < total {
        result.push_str(&format!(
            "\n\n(Showing lines {}-{} of {}. Use offset={} to continue.)",
            offset,
            end,
            total,
            end + 1
        ));
    } else {
        result.push_str(&format!("\n\n(End of file — {} lines total)", total));
    }
    Ok(result)
}
/// Tries to interpret `bytes` as text.
///
/// Returns `(Some(text), label)` when the bytes decode as text, where `label`
/// is `None` for plain UTF-8 and otherwise names the encoding that was used.
/// Returns `(None, None)` when the data should be treated as binary.
///
/// Decoding order:
/// 1. Any NUL byte marks the input as binary.
/// 2. Strict UTF-8.
/// 3. GB18030 via `encoding_rs` (which also covers GBK input, see below).
/// 4. Lossy UTF-8, but only if more than half of the bytes are printable ASCII
///    or common whitespace.
fn decode_text(bytes: &[u8]) -> (Option<String>, Option<&'static str>) {
    if bytes.contains(&0) {
        return (None, None);
    }

    // Strict UTF-8 first.
    if let Ok(text) = std::str::from_utf8(bytes) {
        return (Some(text.to_string()), None);
    }

    // `Encoding::decode` performs BOM sniffing and returns the encoding it
    // actually used, so report that one rather than assuming GB18030.
    //
    // No separate GBK attempt follows: encoding_rs documents GBK's decoder as
    // being the same as gb18030's, so it could never succeed where this call
    // failed. A retry through `UTF_8.decode` is equally pointless, because it
    // can only succeed on input that `from_utf8` or this call already accepted.
    let (decoded, used, had_errors) = GB18030.decode(bytes);
    if !had_errors {
        let label = if used == GB18030 {
            "GB18030"
        } else {
            used.name()
        };
        return (Some(decoded.into_owned()), Some(label));
    }

    // Last resort: accept lossy UTF-8 when the content looks text-like
    // (more than 50% printable ASCII / whitespace bytes).
    let printable = bytes
        .iter()
        .filter(|&&b| b.is_ascii_graphic() || b == b' ' || b == b'\n' || b == b'\r' || b == b'\t')
        .count();
    let printable_ratio = printable as f64 / bytes.len().max(1) as f64;
    if printable_ratio > 0.5 {
        let text = String::from_utf8_lossy(bytes);
        return (Some(text.into_owned()), Some("UTF-8(lossy)"));
    }

    (None, None)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Executes a freshly constructed `FileReadTool` with `args` and unwraps
    /// the outer `anyhow::Result`.
    async fn run(args: serde_json::Value) -> ToolResult {
        FileReadTool::new().execute(args).await.unwrap()
    }

    /// Returns the temp file's path as `&str`.
    fn path_of(file: &NamedTempFile) -> &str {
        file.path().to_str().unwrap()
    }

    /// A small text file is returned in full.
    #[tokio::test]
    async fn test_read_simple_file() {
        let mut file = NamedTempFile::new().unwrap();
        for n in 1..=3 {
            writeln!(file, "Line {}", n).unwrap();
        }

        let result = run(json!({ "path": path_of(&file) })).await;

        assert!(result.success);
        for expected in ["Line 1", "Line 2", "Line 3"].iter() {
            assert!(result.output.contains(expected));
        }
    }

    /// `offset` and `limit` select a window of lines.
    #[tokio::test]
    async fn test_read_with_offset_limit() {
        let mut file = NamedTempFile::new().unwrap();
        for i in 1..=10 {
            writeln!(file, "Line {}", i).unwrap();
        }

        let args = json!({
            "path": path_of(&file),
            "offset": 3,
            "limit": 2
        });
        let result = run(args).await;

        assert!(result.success);
        assert!(result.output.contains("Line 3"));
        assert!(result.output.contains("Line 4"));
        assert!(!result.output.contains("Line 2"));
    }

    /// A path that does not exist is reported as "not found".
    #[tokio::test]
    async fn test_file_not_found() {
        let result = run(json!({ "path": "/nonexistent/file.txt" })).await;

        assert!(!result.success);
        assert!(result.error.unwrap().contains("not found"));
    }

    /// A directory is rejected with "Not a file".
    #[tokio::test]
    async fn test_is_directory() {
        let result = run(json!({ "path": "." })).await;

        assert!(!result.success);
        assert!(result.error.unwrap().contains("Not a file"));
    }

    /// A file one byte over `MAX_FILE_BYTES` is rejected as too large.
    #[tokio::test]
    async fn test_rejects_large_file_before_reading() {
        let mut file = NamedTempFile::new().unwrap();
        file.as_file_mut()
            .set_len(MAX_FILE_BYTES + 1)
            .expect("set large file length");

        let result = run(json!({ "path": path_of(&file) })).await;

        assert!(!result.success);
        assert!(result.error.unwrap().contains("too large"));
    }

    /// Binary content one byte over `MAX_BINARY_BYTES` is not inlined.
    #[tokio::test]
    async fn test_rejects_large_binary_inline() {
        let mut file = NamedTempFile::new().unwrap();
        let payload = vec![0_u8; MAX_BINARY_BYTES + 1];
        file.write_all(&payload).unwrap();

        let result = run(json!({ "path": path_of(&file) })).await;

        assert!(!result.success);
        assert!(result.error.unwrap().contains("Binary file too large"));
    }
}