diff --git a/src/tools/mod.rs b/src/tools/mod.rs
index 07028d5..0be616d 100644
--- a/src/tools/mod.rs
+++ b/src/tools/mod.rs
@@ -1,7 +1,9 @@
 pub mod calculator;
 pub mod registry;
+pub mod schema;
 pub mod traits;
 
 pub use calculator::CalculatorTool;
 pub use registry::ToolRegistry;
+pub use schema::{CleaningStrategy, SchemaCleanr};
 pub use traits::{Tool, ToolResult};
diff --git a/src/tools/schema.rs b/src/tools/schema.rs
new file mode 100644
index 0000000..91bca47
--- /dev/null
+++ b/src/tools/schema.rs
@@ -0,0 +1,721 @@
+//! JSON Schema cleaning and validation for LLM tool-calling compatibility.
+//!
+//! Different providers support different subsets of JSON Schema. This module
+//! normalizes tool schemas to improve cross-provider compatibility while
+//! preserving semantic intent.
+//!
+//! ## What this module does
+//!
+//! 1. Removes unsupported keywords per provider strategy
+//! 2. Resolves local `$ref` entries from the root-level `$defs` and `definitions`
+//! 3. Flattens literal `anyOf` / `oneOf` unions into `enum`
+//! 4. Strips nullable variants from unions and `type` arrays
+//! 5. Converts `const` to single-value `enum`
+//! 6. Detects circular references and stops recursion safely
+
+use serde_json::{Map, Value, json};
+use std::collections::{HashMap, HashSet};
+
+/// Keywords that Gemini rejects for tool schemas.
+///
+/// This is the deny-list returned for [`CleaningStrategy::Gemini`]; any key in
+/// it is dropped from every object the cleaner visits.
+pub const GEMINI_UNSUPPORTED_KEYWORDS: &[&str] = &[
+    // Schema composition
+    "$ref",
+    "$schema",
+    "$id",
+    "$defs",
+    "definitions",
+    // Property constraints
+    "additionalProperties",
+    "patternProperties",
+    // String constraints
+    "minLength",
+    "maxLength",
+    "pattern",
+    "format",
+    // Number constraints
+    "minimum",
+    "maximum",
+    "multipleOf",
+    // Array constraints
+    "minItems",
+    "maxItems",
+    "uniqueItems",
+    // Object constraints
+    "minProperties",
+    "maxProperties",
+    // Annotations (sample instances, not constraints)
+    "examples",
+];
+
+/// Metadata keywords carried over when a schema node is replaced, e.g. by a
+/// resolved `$ref` or a simplified union.
+const SCHEMA_META_KEYWORDS: &[&str] = &["description", "title", "default"];
+
+/// Provider-specific cleaning profiles.
+///
+/// Each variant selects the keyword deny-list returned by
+/// [`CleaningStrategy::unsupported_keywords`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CleaningStrategy {
+    /// Gemini (Google AI / Vertex AI) - Most restrictive
+    Gemini,
+    /// Anthropic Claude - Moderately permissive
+    Anthropic,
+    /// OpenAI GPT - Most permissive
+    OpenAI,
+    /// Conservative: Remove only universally unsupported keywords
+    Conservative,
+}
+
+impl CleaningStrategy {
+    /// Keywords that are dropped from every schema object under this strategy.
+    ///
+    /// The returned slice is static; `OpenAI` yields an empty list.
+    pub fn unsupported_keywords(self) -> &'static [&'static str] {
+        match self {
+            Self::Gemini => GEMINI_UNSUPPORTED_KEYWORDS,
+            Self::Anthropic => &["$ref", "$defs", "definitions"],
+            Self::OpenAI => &[],
+            Self::Conservative => &["$ref", "$defs", "definitions", "additionalProperties"],
+        }
+    }
+}
+
+/// Stateless entry point for cleaning and validating tool schemas.
+///
+/// All functionality is exposed as associated functions; the type carries no
+/// data.
+pub struct SchemaCleanr;
+
+impl SchemaCleanr {
+    /// Shorthand for [`Self::clean`] with [`CleaningStrategy::Gemini`].
+    pub fn clean_for_gemini(schema: Value) -> Value {
+        Self::clean(schema, CleaningStrategy::Gemini)
+    }
+
+    /// Shorthand for [`Self::clean`] with [`CleaningStrategy::Anthropic`].
+    pub fn clean_for_anthropic(schema: Value) -> Value {
+        Self::clean(schema, CleaningStrategy::Anthropic)
+    }
+
+    /// Shorthand for [`Self::clean`] with [`CleaningStrategy::OpenAI`].
+    pub fn clean_for_openai(schema: Value) -> Value {
+        Self::clean(schema, CleaningStrategy::OpenAI)
+    }
+
+    /// Clean `schema` according to `strategy` and return the rewritten value.
+    ///
+    /// Definitions are collected from the root object only (`$defs` and
+    /// `definitions`); a non-object root contributes none.
+    pub fn clean(schema: Value, strategy: CleaningStrategy) -> Value {
+        let defs = schema
+            .as_object()
+            .map(Self::extract_defs)
+            .unwrap_or_default();
+        // Tracks the `$ref` strings currently being expanded (cycle guard).
+        let mut ref_stack = HashSet::new();
+        Self::clean_with_defs(schema, &defs, strategy, &mut ref_stack)
+    }
+
+    /// Validate that a schema is suitable for LLM tool calling.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when the schema is not a JSON object or has no
+    /// `type` key.
+    pub fn validate(schema: &Value) -> anyhow::Result<()> {
+        let obj = schema
+            .as_object()
+            .ok_or_else(|| anyhow::anyhow!("Schema must be an object"))?;
+
+        // A single lookup of `type` serves both the hard requirement and the
+        // advisory `properties` check below.
+        match obj.get("type") {
+            None => anyhow::bail!("Schema missing required 'type' field"),
+            Some(Value::String(kind)) if kind == "object" && !obj.contains_key("properties") => {
+                // Advisory only: the schema is still accepted.
+                tracing::warn!("Object schema without 'properties' field may cause issues");
+            }
+            Some(_) => {}
+        }
+
+        Ok(())
+    }
+
+    /// Collect the entries of `$defs` and `definitions` into one lookup map.
+    ///
+    /// Both sections are keyed by bare definition name. `definitions` is read
+    /// second, so on a name clash its entry replaces the one from `$defs`.
+    /// Sections that are missing or not objects are ignored.
+    fn extract_defs(obj: &Map<String, Value>) -> HashMap<String, Value> {
+        let mut defs = HashMap::new();
+
+        for section in ["$defs", "definitions"] {
+            if let Some(Value::Object(entries)) = obj.get(section) {
+                defs.extend(
+                    entries
+                        .iter()
+                        .map(|(name, definition)| (name.clone(), definition.clone())),
+                );
+            }
+        }
+
+        defs
+    }
+
+    /// Recursively clean a schema value.
+    ///
+    /// Objects are handed to `clean_object`, arrays are cleaned element by
+    /// element, and every other JSON value is returned unchanged.
+    ///
+    /// `ref_stack` holds the `$ref` strings currently being expanded and is
+    /// shared across the whole traversal.
+    fn clean_with_defs(
+        schema: Value,
+        defs: &HashMap<String, Value>,
+        strategy: CleaningStrategy,
+        ref_stack: &mut HashSet<String>,
+    ) -> Value {
+        match schema {
+            Value::Object(obj) => Self::clean_object(obj, defs, strategy, ref_stack),
+            Value::Array(arr) => Value::Array(
+                arr.into_iter()
+                    .map(|v| Self::clean_with_defs(v, defs, strategy, ref_stack))
+                    .collect(),
+            ),
+            other => other,
+        }
+    }
+
+    /// Clean an object schema.
+    fn clean_object(
+        obj: Map<String, Value>,
+        defs: &HashMap<String, Value>,
+        strategy: CleaningStrategy,
+        ref_stack: &mut HashSet<String>,
+    ) -> Value {
+        // A string `$ref` replaces the whole node with its resolved target;
+        // only the metadata keywords of the referring node are carried over.
+        if let Some(Value::String(ref_value)) = obj.get("$ref") {
+            return Self::resolve_ref(ref_value, &obj, defs, strategy, ref_stack);
+        }
+
+        let has_union = obj.contains_key("anyOf") || obj.contains_key("oneOf");
+
+        // Prefer collapsing the union into a plain schema; fall through to the
+        // keyword-by-keyword copy when that is not possible.
+        if has_union {
+            if let Some(simplified) = Self::try_simplify_union(&obj, defs, strategy, ref_stack) {
+                return simplified;
+            }
+        }
+
+        // The deny-lists are short static slices, so a linear scan avoids
+        // building a set for every visited node.
+        let unsupported = strategy.unsupported_keywords();
+        let mut cleaned = Map::new();
+
+        for (key, value) in obj {
+            // Skip unsupported keywords
+            if unsupported.contains(&key.as_str()) {
+                continue;
+            }
+
+            match key.as_str() {
+                // Convert const to enum
+                "const" => {
+                    cleaned.insert("enum".to_string(), json!([value]));
+                }
+                // Skip type if we have anyOf/oneOf
+                "type" if has_union => {}
+                // Handle type arrays (remove null)
+                "type" if matches!(value, Value::Array(_)) => {
+                    let cleaned_value = Self::clean_type_array(value);
+                    cleaned.insert(key, cleaned_value);
+                }
+                // Recursively clean nested schemas
+                "properties" => {
+                    let cleaned_value = Self::clean_properties(value, defs, strategy, ref_stack);
+                    cleaned.insert(key, cleaned_value);
+                }
+                "items" => {
+                    let cleaned_value = Self::clean_with_defs(value, defs, strategy, ref_stack);
+                    cleaned.insert(key, cleaned_value);
+                }
+                "anyOf" | "oneOf" | "allOf" => {
+                    let cleaned_value = Self::clean_union(value, defs, strategy, ref_stack);
+                    cleaned.insert(key, cleaned_value);
+                }
+                // These hold literal instance data, not schemas. Cleaning them
+                // would drop or rewrite keys inside object-valued literals
+                // (e.g. a default of `{"format": "x"}`), so copy them verbatim.
+                "enum" | "default" | "examples" => {
+                    cleaned.insert(key, value);
+                }
+                _ => {
+                    let cleaned_value = match value {
+                        Value::Object(_) | Value::Array(_) => {
+                            Self::clean_with_defs(value, defs, strategy, ref_stack)
+                        }
+                        other => other,
+                    };
+                    cleaned.insert(key, cleaned_value);
+                }
+            }
+        }
+
+        Value::Object(cleaned)
+    }
+
+    /// Resolve a $ref to its definition.
+    ///
+    /// Circular or unresolvable references become an empty object schema that
+    /// keeps the referring node's metadata keywords.
+    fn resolve_ref(
+        ref_value: &str,
+        obj: &Map<String, Value>,
+        defs: &HashMap<String, Value>,
+        strategy: CleaningStrategy,
+        ref_stack: &mut HashSet<String>,
+    ) -> Value {
+        // A reference that is already being expanded would recurse forever.
+        if ref_stack.contains(ref_value) {
+            tracing::warn!("Circular $ref detected: {}", ref_value);
+            return Self::preserve_meta(obj, Value::Object(Map::new()));
+        }
+
+        let definition =
+            Self::parse_local_ref(ref_value).and_then(|name| defs.get(name.as_str()));
+
+        match definition {
+            Some(definition) => {
+                // Mark the reference as in progress only while its target is
+                // being cleaned, so sibling uses of the same `$ref` still resolve.
+                ref_stack.insert(ref_value.to_string());
+                let cleaned = Self::clean_with_defs(definition.clone(), defs, strategy, ref_stack);
+                ref_stack.remove(ref_value);
+                Self::preserve_meta(obj, cleaned)
+            }
+            None => {
+                // Either not a local `#/$defs/…` / `#/definitions/…` pointer or
+                // the named definition does not exist.
+                tracing::warn!("Cannot resolve $ref: {}", ref_value);
+                Self::preserve_meta(obj, Value::Object(Map::new()))
+            }
+        }
+    }
+
+    /// Parse a local JSON Pointer ref (#/$defs/Name or #/definitions/Name).
+    ///
+    /// Returns the decoded text after the prefix, or `None` when `ref_value`
+    /// starts with neither prefix. The remainder is treated as one definition
+    /// name; deeper pointer paths are not split.
+    fn parse_local_ref(ref_value: &str) -> Option<String> {
+        ref_value
+            .strip_prefix("#/$defs/")
+            .or_else(|| ref_value.strip_prefix("#/definitions/"))
+            .map(Self::decode_json_pointer)
+    }
+
+    /// Decode JSON Pointer escaping (`~0` = `~`, `~1` = `/`).
+    ///
+    /// Decoding is a single left-to-right pass, so `~01` yields `~1` rather
+    /// than `/`. A `~` not followed by `0` or `1` is kept literally.
+    fn decode_json_pointer(segment: &str) -> String {
+        // Fast path: nothing to decode.
+        if !segment.contains('~') {
+            return segment.to_string();
+        }
+
+        let mut decoded = String::with_capacity(segment.len());
+        let mut chars = segment.chars().peekable();
+
+        while let Some(ch) = chars.next() {
+            if ch == '~' {
+                match chars.peek().copied() {
+                    Some('0') => {
+                        chars.next();
+                        decoded.push('~');
+                    }
+                    Some('1') => {
+                        chars.next();
+                        decoded.push('/');
+                    }
+                    _ => decoded.push('~'),
+                }
+            } else {
+                decoded.push(ch);
+            }
+        }
+
+        decoded
+    }
+
+    /// Try to simplify anyOf/oneOf to a simpler form.
+    ///
+    /// Returns `None` when the union can be reduced neither to a single
+    /// schema nor to a literal `enum`.
+    fn try_simplify_union(
+        obj: &Map<String, Value>,
+        defs: &HashMap<String, Value>,
+        strategy: CleaningStrategy,
+        ref_stack: &mut HashSet<String>,
+    ) -> Option<Value> {
+        // `anyOf` wins when both keywords are present; a non-array value
+        // under the chosen keyword means there is nothing to simplify.
+        let variants = obj
+            .get("anyOf")
+            .or_else(|| obj.get("oneOf"))?
+            .as_array()?;
+
+        // Clean each variant, then drop those that only describe `null`.
+        let mut non_null: Vec<Value> = variants
+            .iter()
+            .map(|variant| Self::clean_with_defs(variant.clone(), defs, strategy, ref_stack))
+            .filter(|variant| !Self::is_null_schema(variant))
+            .collect();
+
+        // If only one variant remains after stripping nulls, return it
+        if non_null.len() == 1 {
+            return non_null
+                .pop()
+                .map(|only| Self::preserve_meta(obj, only));
+        }
+
+        // Try to flatten to enum if all variants are literals
+        Self::try_flatten_literal_union(&non_null)
+            .map(|flattened| Self::preserve_meta(obj, flattened))
+    }
+
+    /// Check if a schema represents null type.
+    ///
+    /// Recognized forms are `{"const": null}`, `{"enum": [null]}` and
+    /// `{"type": "null"}`; anything that is not an object is not a null schema.
+    fn is_null_schema(value: &Value) -> bool {
+        match value.as_object() {
+            Some(obj) => {
+                matches!(obj.get("const"), Some(Value::Null))
+                    || matches!(
+                        obj.get("enum"),
+                        Some(Value::Array(options)) if options.len() == 1 && options[0].is_null()
+                    )
+                    || matches!(obj.get("type"), Some(Value::String(kind)) if kind == "null")
+            }
+            None => false,
+        }
+    }
+
+    /// Try to flatten anyOf/oneOf with only literal values to enum.
+    ///
+    /// Every variant must be an object carrying a `const` or a single-element
+    /// `enum`, and all variants must share the same string `type`.
+    fn try_flatten_literal_union(variants: &[Value]) -> Option<Value> {
+        if variants.is_empty() {
+            return None;
+        }
+
+        let mut all_values: Vec<Value> = Vec::with_capacity(variants.len());
+        // Borrowed from the first variant; every later variant must match it.
+        let mut common_type: Option<&str> = None;
+
+        for variant in variants {
+            let obj = variant.as_object()?;
+
+            // `const` takes precedence over `enum`; an `enum` only counts as a
+            // literal when it has exactly one element.
+            let literal_value = match (obj.get("const"), obj.get("enum")) {
+                (Some(const_val), _) => const_val.clone(),
+                (None, Some(Value::Array(options))) if options.len() == 1 => options[0].clone(),
+                _ => return None,
+            };
+
+            let variant_type = obj.get("type")?.as_str()?;
+            match common_type {
+                None => common_type = Some(variant_type),
+                Some(seen) if seen != variant_type => return None,
+                Some(_) => {}
+            }
+
+            // Repeated variants would otherwise produce duplicate `enum`
+            // entries, which JSON Schema says should be unique.
+            if !all_values.contains(&literal_value) {
+                all_values.push(literal_value);
+            }
+        }
+
+        common_type.map(|t| {
+            json!({
+                "type": t,
+                "enum": all_values
+            })
+        })
+    }
+
+    /// Clean type array, removing null.
+    ///
+    /// A single remaining entry is unwrapped to a plain value, several stay an
+    /// array, and an array that held only `"null"` becomes the string `"null"`.
+    /// Non-array input is returned unchanged.
+    fn clean_type_array(value: Value) -> Value {
+        match value {
+            Value::Array(types) => {
+                let mut non_null: Vec<Value> = types
+                    .into_iter()
+                    .filter(|v| v.as_str() != Some("null"))
+                    .collect();
+
+                match non_null.len() {
+                    0 => Value::String("null".to_string()),
+                    1 => non_null.remove(0),
+                    _ => Value::Array(non_null),
+                }
+            }
+            other => other,
+        }
+    }
+
+    /// Clean properties object.
+    ///
+    /// Keys are property names, not schema keywords, so only the values are
+    /// cleaned. A non-object value is returned unchanged.
+    fn clean_properties(
+        value: Value,
+        defs: &HashMap<String, Value>,
+        strategy: CleaningStrategy,
+        ref_stack: &mut HashSet<String>,
+    ) -> Value {
+        match value {
+            Value::Object(props) => Value::Object(
+                props
+                    .into_iter()
+                    .map(|(name, schema)| {
+                        (name, Self::clean_with_defs(schema, defs, strategy, ref_stack))
+                    })
+                    .collect(),
+            ),
+            other => other,
+        }
+    }
+
+    /// Clean union (anyOf/oneOf/allOf).
+    ///
+    /// Each variant is cleaned when the value is an array; any other value is
+    /// returned as-is.
+    fn clean_union(
+        value: Value,
+        defs: &HashMap<String, Value>,
+        strategy: CleaningStrategy,
+        ref_stack: &mut HashSet<String>,
+    ) -> Value {
+        if let Value::Array(variants) = value {
+            let cleaned: Vec<Value> = variants
+                .into_iter()
+                .map(|v| Self::clean_with_defs(v, defs, strategy, ref_stack))
+                .collect();
+            Value::Array(cleaned)
+        } else {
+            value
+        }
+    }
+
+    /// Preserve metadata (description, title, default) from source to target.
+    ///
+    /// Values from `source` overwrite same-named keys already in `target`.
+    /// A `target` that is not an object is returned untouched.
+    fn preserve_meta(source: &Map<String, Value>, mut target: Value) -> Value {
+        if let Value::Object(target_obj) = &mut target {
+            for &key in SCHEMA_META_KEYWORDS {
+                if let Some(value) = source.get(key) {
+                    target_obj.insert(key.to_string(), value.clone());
+                }
+            }
+        }
+        target
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_remove_unsupported_keywords() {
+        let schema = json!({
+            "type": "string",
+            "minLength": 1,
+            "maxLength": 100,
+            "pattern": "^[a-z]+$",
+            "description": "A lowercase string"
+        });
+
+        let cleaned = SchemaCleanr::clean_for_gemini(schema);
+
+        assert_eq!(cleaned["type"], "string");
+        assert_eq!(cleaned["description"], "A lowercase string");
+        assert!(cleaned.get("minLength").is_none());
+        assert!(cleaned.get("maxLength").is_none());
+        assert!(cleaned.get("pattern").is_none());
+    }
+
+    #[test]
+    fn test_resolve_ref() {
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "age": {
+                    "$ref": "#/$defs/Age"
+                }
+            },
+            "$defs": {
+                "Age": {
+                    "type": "integer",
+                    "minimum": 0
+                }
+            }
+        });
+
+        let cleaned = SchemaCleanr::clean_for_gemini(schema);
+
+        assert_eq!(cleaned["properties"]["age"]["type"], "integer");
+        assert!(cleaned["properties"]["age"].get("minimum").is_none());
+        assert!(cleaned.get("$defs").is_none());
+    }
+
+    #[test]
+    fn test_resolve_ref_with_escaped_pointer() {
+        // `~1` decodes to `/` and `~0` to `~` in the definition name.
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "path": {
+                    "$ref": "#/definitions/a~1b~0c"
+                }
+            },
+            "definitions": {
+                "a/b~c": {
+                    "type": "string"
+                }
+            }
+        });
+
+        let cleaned = SchemaCleanr::clean_for_gemini(schema);
+
+        assert_eq!(cleaned["properties"]["path"]["type"], "string");
+    }
+
+    #[test]
+    fn test_unresolvable_ref_keeps_metadata() {
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "remote": {
+                    "$ref": "https://example.com/schema.json",
+                    "description": "External"
+                }
+            }
+        });
+
+        let cleaned = SchemaCleanr::clean_for_gemini(schema);
+
+        assert_eq!(
+            cleaned["properties"]["remote"],
+            json!({ "description": "External" })
+        );
+    }
+
+    #[test]
+    fn test_flatten_literal_union() {
+        let schema = json!({
+            "anyOf": [
+                { "const": "admin", "type": "string" },
+                { "const": "user", "type": "string" },
+                { "const": "guest", "type": "string" }
+            ]
+        });
+
+        let cleaned = SchemaCleanr::clean_for_gemini(schema);
+
+        assert_eq!(cleaned["type"], "string");
+        assert!(cleaned["enum"].is_array());
+        let enum_values = cleaned["enum"].as_array().unwrap();
+        assert_eq!(enum_values.len(), 3);
+        assert!(enum_values.contains(&json!("admin")));
+        assert!(enum_values.contains(&json!("user")));
+        assert!(enum_values.contains(&json!("guest")));
+    }
+
+    #[test]
+    fn test_strip_null_from_union() {
+        let schema = json!({
+            "oneOf": [
+                { "type": "string" },
+                { "type": "null" }
+            ]
+        });
+
+        let cleaned = SchemaCleanr::clean_for_gemini(schema);
+
+        assert_eq!(cleaned["type"], "string");
+        assert!(cleaned.get("oneOf").is_none());
+    }
+
+    #[test]
+    fn test_const_to_enum() {
+        let schema = json!({
+            "const": "fixed_value",
+            "description": "A constant"
+        });
+
+        let cleaned = SchemaCleanr::clean_for_gemini(schema);
+
+        assert_eq!(cleaned["enum"], json!(["fixed_value"]));
+        assert_eq!(cleaned["description"], "A constant");
+        assert!(cleaned.get("const").is_none());
+    }
+
+    #[test]
+    fn test_preserve_metadata() {
+        let schema = json!({
+            "$ref": "#/$defs/Name",
+            "description": "User's name",
+            "title": "Name Field",
+            "default": "Anonymous",
+            "$defs": {
+                "Name": {
+                    "type": "string"
+                }
+            }
+        });
+
+        let cleaned = SchemaCleanr::clean_for_gemini(schema);
+
+        assert_eq!(cleaned["type"], "string");
+        assert_eq!(cleaned["description"], "User's name");
+        assert_eq!(cleaned["title"], "Name Field");
+        assert_eq!(cleaned["default"], "Anonymous");
+    }
+
+    #[test]
+    fn test_circular_ref_prevention() {
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "parent": {
+                    "$ref": "#/$defs/Node"
+                }
+            },
+            "$defs": {
+                "Node": {
+                    "type": "object",
+                    "properties": {
+                        "child": {
+                            "$ref": "#/$defs/Node"
+                        }
+                    }
+                }
+            }
+        });
+
+        let cleaned = SchemaCleanr::clean_for_gemini(schema);
+
+        assert_eq!(cleaned["properties"]["parent"]["type"], "object");
+    }
+
+    #[test]
+    fn test_validate_schema() {
+        let valid = json!({
+            "type": "object",
+            "properties": {
+                "name": { "type": "string" }
+            }
+        });
+
+        assert!(SchemaCleanr::validate(&valid).is_ok());
+
+        let invalid = json!({
+            "properties": {
+                "name": { "type": "string" }
+            }
+        });
+
+        assert!(SchemaCleanr::validate(&invalid).is_err());
+    }
+
+    #[test]
+    fn test_strategy_differences() {
+        let schema = json!({
+            "type": "string",
+            "minLength": 1,
+            "description": "A string field"
+        });
+
+        // Gemini: Most restrictive (removes minLength)
+        let gemini = SchemaCleanr::clean_for_gemini(schema.clone());
+        assert!(gemini.get("minLength").is_none());
+        assert_eq!(gemini["type"], "string");
+        assert_eq!(gemini["description"], "A string field");
+
+        // OpenAI: Most permissive (keeps minLength)
+        let openai = SchemaCleanr::clean_for_openai(schema.clone());
+        assert_eq!(openai["minLength"], 1);
+        assert_eq!(openai["type"], "string");
+    }
+
+    #[test]
+    fn test_nested_properties() {
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "user": {
+                    "type": "object",
+                    "properties": {
+                        "name": {
+                            "type": "string",
+                            "minLength": 1
+                        }
+                    },
+                    "additionalProperties": false
+                }
+            }
+        });
+
+        let cleaned = SchemaCleanr::clean_for_gemini(schema);
+
+        assert!(
+            cleaned["properties"]["user"]["properties"]["name"]
+                .get("minLength")
+                .is_none()
+        );
+        assert!(
+            cleaned["properties"]["user"]
+                .get("additionalProperties")
+                .is_none()
+        );
+    }
+
+    #[test]
+    fn test_type_array_null_removal() {
+        let schema = json!({
+            "type": ["string", "null"]
+        });
+
+        let cleaned = SchemaCleanr::clean_for_gemini(schema);
+
+        assert_eq!(cleaned["type"], "string");
+    }
+
+    #[test]
+    fn test_skip_type_when_non_simplifiable_union_exists() {
+        let schema = json!({
+            "type": "object",
+            "oneOf": [
+                {
+                    "type": "object",
+                    "properties": {
+                        "a": { "type": "string" }
+                    }
+                },
+                {
+                    "type": "object",
+                    "properties": {
+                        "b": { "type": "number" }
+                    }
+                }
+            ]
+        });
+
+        let cleaned = SchemaCleanr::clean_for_gemini(schema);
+
+        assert!(cleaned.get("type").is_none());
+        assert!(cleaned.get("oneOf").is_some());
+    }
+}
diff --git a/src/tools/traits.rs b/src/tools/traits.rs
index 10f0140..f3ffdc4 100644
--- a/src/tools/traits.rs
+++ b/src/tools/traits.rs
@@ -13,4 +13,19 @@ pub trait Tool: Send + Sync + 'static {
     fn description(&self) -> &str;
     fn parameters_schema(&self) -> serde_json::Value;
     async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult>;
+
+    /// Whether this tool is side-effect free and safe to parallelize.
+    fn read_only(&self) -> bool {
+        false
+    }
+
+    /// Whether this tool can run alongside other concurrency-safe tools.
+    fn concurrency_safe(&self) -> bool {
+        self.read_only() && !self.exclusive()
+    }
+
+    /// Whether this tool should run alone even if concurrency is enabled.
+    fn exclusive(&self) -> bool {
+        false
+    }
 }