feat(tools): add SchemaCleanr for cross-provider schema normalization

- Add SchemaCleanr with CleaningStrategy enum (Gemini, Anthropic, OpenAI, Conservative)
- Support cleaning JSON schemas for different LLM provider compatibility
- Add $ref resolution, anyOf/oneOf flattening, const-to-enum conversion
- Add read_only, concurrency_safe, exclusive methods to Tool trait
- Add comprehensive unit tests for all schema cleaning features
This commit is contained in:
xiaoski 2026-04-07 23:41:20 +08:00
parent 2dada36bc6
commit d5b6cd24fc
3 changed files with 738 additions and 0 deletions

View File

@ -1,7 +1,9 @@
pub mod calculator;
pub mod registry;
pub mod schema;
pub mod traits;
pub use calculator::CalculatorTool;
pub use registry::ToolRegistry;
pub use schema::{CleaningStrategy, SchemaCleanr};
pub use traits::{Tool, ToolResult};

721
src/tools/schema.rs Normal file
View File

@ -0,0 +1,721 @@
//! JSON Schema cleaning and validation for LLM tool-calling compatibility.
//!
//! Different providers support different subsets of JSON Schema. This module
//! normalizes tool schemas to improve cross-provider compatibility while
//! preserving semantic intent.
//!
//! ## What this module does
//!
//! 1. Removes unsupported keywords per provider strategy
//! 2. Resolves local `$ref` entries from `$defs` and `definitions`
//! 3. Flattens literal `anyOf` / `oneOf` unions into `enum`
//! 4. Strips nullable variants from unions and `type` arrays
//! 5. Converts `const` to single-value `enum`
//! 6. Detects circular references and stops recursion safely
use serde_json::{Map, Value, json};
use std::collections::{HashMap, HashSet};
/// Keywords that Gemini rejects for tool schemas.
///
/// Used by [`CleaningStrategy::Gemini`]: every key listed here is dropped
/// from a schema during cleaning. These are validation-only constraints;
/// removing them loses strictness but not the semantic shape of the tool
/// arguments (types, enums, descriptions are kept).
/// NOTE(review): list reflects observed Gemini behavior — confirm against
/// the current Gemini function-calling docs before extending.
pub const GEMINI_UNSUPPORTED_KEYWORDS: &[&str] = &[
// Schema composition (refs are resolved/inlined by the cleaner instead)
"$ref",
"$schema",
"$id",
"$defs",
"definitions",
// Property constraints
"additionalProperties",
"patternProperties",
// String constraints
"minLength",
"maxLength",
"pattern",
"format",
// Number constraints
"minimum",
"maximum",
"multipleOf",
// Array constraints
"minItems",
"maxItems",
"uniqueItems",
// Object constraints
"minProperties",
"maxProperties",
// Non-standard
"examples",
];
/// Keywords that should be preserved during cleaning (metadata).
///
/// Copied from a source node onto its replacement whenever a node is
/// rewritten wholesale (after `$ref` resolution or union flattening),
/// so human-facing documentation survives the transformation.
const SCHEMA_META_KEYWORDS: &[&str] = &["description", "title", "default"];
/// Schema cleaning strategies for different LLM providers.
///
/// Ordered roughly from most to least restrictive: Gemini removes the
/// largest keyword set, OpenAI removes nothing. `Conservative` is a
/// provider-agnostic middle ground for unknown backends.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CleaningStrategy {
/// Gemini (Google AI / Vertex AI) - Most restrictive
Gemini,
/// Anthropic Claude - Moderately permissive
Anthropic,
/// OpenAI GPT - Most permissive
OpenAI,
/// Conservative: Remove only universally unsupported keywords
Conservative,
}
impl CleaningStrategy {
    /// Get the list of unsupported keywords for this strategy.
    ///
    /// Returns static data; Gemini strips the most keywords, OpenAI none.
    pub fn unsupported_keywords(self) -> &'static [&'static str] {
        // Reference-resolution keywords are unsupported by several providers,
        // so they are shared between the Anthropic and Conservative lists.
        const REF_KEYWORDS: &[&str] = &["$ref", "$defs", "definitions"];
        const CONSERVATIVE_KEYWORDS: &[&str] =
            &["$ref", "$defs", "definitions", "additionalProperties"];
        match self {
            Self::OpenAI => &[],
            Self::Anthropic => REF_KEYWORDS,
            Self::Conservative => CONSERVATIVE_KEYWORDS,
            Self::Gemini => GEMINI_UNSUPPORTED_KEYWORDS,
        }
    }
}
/// JSON Schema cleaner optimized for LLM tool calling.
///
/// Stateless: all behavior is exposed through associated functions.
/// NOTE(review): the name is missing an 'e' ("Cleanr" vs "Cleaner"); it is
/// re-exported from the crate's public API, so renaming would be a breaking
/// change — flagging rather than fixing.
pub struct SchemaCleanr;
impl SchemaCleanr {
/// Clean schema for Gemini compatibility (strictest).
///
/// Convenience wrapper over [`Self::clean`] with [`CleaningStrategy::Gemini`].
pub fn clean_for_gemini(schema: Value) -> Value {
Self::clean(schema, CleaningStrategy::Gemini)
}
/// Clean schema for Anthropic compatibility.
///
/// Convenience wrapper over [`Self::clean`] with [`CleaningStrategy::Anthropic`].
pub fn clean_for_anthropic(schema: Value) -> Value {
Self::clean(schema, CleaningStrategy::Anthropic)
}
/// Clean schema for OpenAI compatibility (most permissive).
///
/// Convenience wrapper over [`Self::clean`] with [`CleaningStrategy::OpenAI`].
/// Note: `$ref` entries are still resolved/inlined even for OpenAI, since
/// resolution happens unconditionally during the recursive walk.
pub fn clean_for_openai(schema: Value) -> Value {
Self::clean(schema, CleaningStrategy::OpenAI)
}
/// Clean schema with specified strategy.
///
/// Collects top-level `$defs` / `definitions` up front so that local
/// `$ref` entries can be inlined during the recursive walk.
pub fn clean(schema: Value, strategy: CleaningStrategy) -> Value {
    let defs = schema
        .as_object()
        .map(Self::extract_defs)
        .unwrap_or_default();
    Self::clean_with_defs(schema, &defs, strategy, &mut HashSet::new())
}
/// Validate that a schema is suitable for LLM tool calling.
pub fn validate(schema: &Value) -> anyhow::Result<()> {
let obj = schema
.as_object()
.ok_or_else(|| anyhow::anyhow!("Schema must be an object"))?;
if !obj.contains_key("type") {
anyhow::bail!("Schema missing required 'type' field");
}
if let Some(Value::String(t)) = obj.get("type") {
if t == "object" && !obj.contains_key("properties") {
tracing::warn!("Object schema without 'properties' field may cause issues");
}
}
Ok(())
}
/// Extract $defs and definitions into a flat map for reference resolution.
///
/// When the same name appears in both sections, the `definitions` entry
/// wins (it is processed second).
fn extract_defs(obj: &Map<String, Value>) -> HashMap<String, Value> {
    let mut defs = HashMap::new();
    for section in ["$defs", "definitions"] {
        if let Some(Value::Object(entries)) = obj.get(section) {
            defs.extend(entries.iter().map(|(name, def)| (name.clone(), def.clone())));
        }
    }
    defs
}
/// Recursively clean a schema value.
///
/// Objects are dispatched to [`Self::clean_object`]; arrays are cleaned
/// element-wise; scalars pass through untouched.
fn clean_with_defs(
    schema: Value,
    defs: &HashMap<String, Value>,
    strategy: CleaningStrategy,
    ref_stack: &mut HashSet<String>,
) -> Value {
    match schema {
        Value::Object(obj) => Self::clean_object(obj, defs, strategy, ref_stack),
        Value::Array(items) => {
            let cleaned = items
                .into_iter()
                .map(|item| Self::clean_with_defs(item, defs, strategy, ref_stack))
                .collect();
            Value::Array(cleaned)
        }
        scalar => scalar,
    }
}
/// Clean an object schema.
///
/// Order of operations matters:
/// 1. A `$ref` node is resolved first (its target is cleaned recursively);
///    sibling keys other than metadata are discarded with it.
/// 2. `anyOf`/`oneOf` unions are simplified when possible (null-stripping,
///    literal-to-enum flattening); on success the simplified node replaces
///    this object entirely.
/// 3. Otherwise keys are copied one by one, dropping the strategy's
///    unsupported keywords and recursing into nested schemas.
fn clean_object(
obj: Map<String, Value>,
defs: &HashMap<String, Value>,
strategy: CleaningStrategy,
ref_stack: &mut HashSet<String>,
) -> Value {
// Handle $ref resolution
if let Some(Value::String(ref_value)) = obj.get("$ref") {
return Self::resolve_ref(ref_value, &obj, defs, strategy, ref_stack);
}
// Handle anyOf/oneOf simplification
if obj.contains_key("anyOf") || obj.contains_key("oneOf") {
if let Some(simplified) = Self::try_simplify_union(&obj, defs, strategy, ref_stack) {
return simplified;
}
}
// Build cleaned object
let mut cleaned = Map::new();
let unsupported: HashSet<&str> = strategy.unsupported_keywords().iter().copied().collect();
// True only when try_simplify_union above returned None (non-simplifiable union).
let has_union = obj.contains_key("anyOf") || obj.contains_key("oneOf");
for (key, value) in obj {
// Skip unsupported keywords
if unsupported.contains(key.as_str()) {
continue;
}
match key.as_str() {
// Convert const to enum (a single-value enum is more widely supported)
"const" => {
cleaned.insert("enum".to_string(), json!([value]));
}
// Skip type if we have anyOf/oneOf: a sibling `type` alongside a
// union is dropped so the union alone describes the value.
"type" if has_union => {}
// Handle type arrays (remove null, e.g. ["string", "null"] -> "string")
"type" if matches!(value, Value::Array(_)) => {
let cleaned_value = Self::clean_type_array(value);
cleaned.insert(key, cleaned_value);
}
// Recursively clean nested schemas
"properties" => {
let cleaned_value = Self::clean_properties(value, defs, strategy, ref_stack);
cleaned.insert(key, cleaned_value);
}
"items" => {
let cleaned_value = Self::clean_with_defs(value, defs, strategy, ref_stack);
cleaned.insert(key, cleaned_value);
}
"anyOf" | "oneOf" | "allOf" => {
let cleaned_value = Self::clean_union(value, defs, strategy, ref_stack);
cleaned.insert(key, cleaned_value);
}
// Any other keyword: recurse into container values, keep scalars as-is.
_ => {
let cleaned_value = match value {
Value::Object(_) | Value::Array(_) => {
Self::clean_with_defs(value, defs, strategy, ref_stack)
}
other => other,
};
cleaned.insert(key, cleaned_value);
}
}
}
Value::Object(cleaned)
}
/// Resolve a $ref to its definition.
///
/// Circular references and unresolvable refs both degrade to an empty
/// object schema (with the source node's metadata preserved) after
/// logging a warning — cleaning never fails hard.
fn resolve_ref(
    ref_value: &str,
    obj: &Map<String, Value>,
    defs: &HashMap<String, Value>,
    strategy: CleaningStrategy,
    ref_stack: &mut HashSet<String>,
) -> Value {
    // Prevent infinite recursion on self-referential definitions.
    if ref_stack.contains(ref_value) {
        tracing::warn!("Circular $ref detected: {}", ref_value);
        return Self::preserve_meta(obj, Value::Object(Map::new()));
    }
    let definition = Self::parse_local_ref(ref_value)
        .and_then(|name| defs.get(name.as_str()).cloned());
    match definition {
        Some(definition) => {
            // Track this ref while cleaning its target, then pop it so the
            // same definition may legitimately appear on sibling branches.
            ref_stack.insert(ref_value.to_string());
            let cleaned = Self::clean_with_defs(definition, defs, strategy, ref_stack);
            ref_stack.remove(ref_value);
            Self::preserve_meta(obj, cleaned)
        }
        None => {
            tracing::warn!("Cannot resolve $ref: {}", ref_value);
            Self::preserve_meta(obj, Value::Object(Map::new()))
        }
    }
}
/// Parse a local JSON Pointer ref (#/$defs/Name or #/definitions/Name).
///
/// Returns `None` for external or non-local refs, which the caller treats
/// as unresolvable.
fn parse_local_ref(ref_value: &str) -> Option<String> {
    let name = ref_value
        .strip_prefix("#/$defs/")
        .or_else(|| ref_value.strip_prefix("#/definitions/"))?;
    Some(Self::decode_json_pointer(name))
}
/// Decode JSON Pointer escaping (`~0` = `~`, `~1` = `/`).
///
/// Per RFC 6901 the `~1` sequence must be decoded before `~0`, so that
/// `~01` yields the literal string `~1` rather than `/`.
fn decode_json_pointer(segment: &str) -> String {
    if segment.contains('~') {
        segment.replace("~1", "/").replace("~0", "~")
    } else {
        // Fast path: nothing to decode.
        segment.to_string()
    }
}
/// Try to simplify anyOf/oneOf to a simpler form.
///
/// Two simplifications are attempted, in order:
/// 1. If stripping null variants leaves exactly one schema, return it.
/// 2. If every remaining variant is a literal of one common type, flatten
///    the union into a single `enum` schema.
/// Returns `None` when the union must be kept as-is.
fn try_simplify_union(
    obj: &Map<String, Value>,
    defs: &HashMap<String, Value>,
    strategy: CleaningStrategy,
    ref_stack: &mut HashSet<String>,
) -> Option<Value> {
    let union_key = ["anyOf", "oneOf"]
        .into_iter()
        .find(|key| obj.contains_key(*key))?;
    let variants = obj.get(union_key)?.as_array()?;
    // Clean each variant, discarding null-typed ones as we go.
    let mut non_null = Vec::with_capacity(variants.len());
    for variant in variants {
        let cleaned = Self::clean_with_defs(variant.clone(), defs, strategy, ref_stack);
        if !Self::is_null_schema(&cleaned) {
            non_null.push(cleaned);
        }
    }
    // A single surviving variant replaces the union outright.
    if let [only] = non_null.as_slice() {
        return Some(Self::preserve_meta(obj, only.clone()));
    }
    Self::try_flatten_literal_union(&non_null)
        .map(|flattened| Self::preserve_meta(obj, flattened))
}
/// Check if a schema represents null type.
///
/// Recognizes three encodings: `"const": null`, `"enum": [null]`, and
/// `"type": "null"`. Non-object values are never considered null schemas.
fn is_null_schema(value: &Value) -> bool {
    let Some(obj) = value.as_object() else {
        return false;
    };
    if matches!(obj.get("const"), Some(Value::Null)) {
        return true;
    }
    let single_null_enum = obj
        .get("enum")
        .and_then(Value::as_array)
        .map_or(false, |arr| arr.len() == 1 && arr[0].is_null());
    single_null_enum || obj.get("type").and_then(Value::as_str) == Some("null")
}
/// Try to flatten anyOf/oneOf with only literal values to enum.
///
/// Succeeds only when every variant is a literal (`const` or single-value
/// `enum`) carrying an explicit string `type`, and all variants share that
/// same type. The result is `{ "type": T, "enum": [...] }`.
fn try_flatten_literal_union(variants: &[Value]) -> Option<Value> {
    if variants.is_empty() {
        return None;
    }
    let mut values = Vec::with_capacity(variants.len());
    let mut shared_type: Option<&str> = None;
    for variant in variants {
        let obj = variant.as_object()?;
        // A literal variant is either `const: v` or `enum: [v]`; `const`
        // takes precedence when both are present.
        let literal = match (obj.get("const"), obj.get("enum")) {
            (Some(value), _) => value.clone(),
            (None, Some(Value::Array(arr))) if arr.len() == 1 => arr[0].clone(),
            _ => return None,
        };
        let variant_type = obj.get("type")?.as_str()?;
        match shared_type {
            None => shared_type = Some(variant_type),
            // Mixed-type literal unions cannot collapse to one enum.
            Some(seen) if seen != variant_type => return None,
            Some(_) => {}
        }
        values.push(literal);
    }
    shared_type.map(|ty| {
        json!({
            "type": ty,
            "enum": values
        })
    })
}
/// Clean type array, removing null.
///
/// `["string", "null"]` becomes `"string"`; a multi-type array keeps its
/// remaining entries; an all-null array degrades to the string `"null"`.
/// Non-array input passes through unchanged.
fn clean_type_array(value: Value) -> Value {
    match value {
        Value::Array(types) => {
            let mut non_null: Vec<Value> = types
                .into_iter()
                .filter(|t| t.as_str() != Some("null"))
                .collect();
            match non_null.len() {
                // Everything was null: keep an explicit null type.
                0 => Value::String("null".to_string()),
                1 => non_null.pop().expect("length checked above"),
                _ => Value::Array(non_null),
            }
        }
        other => other,
    }
}
/// Clean properties object.
fn clean_properties(
value: Value,
defs: &HashMap<String, Value>,
strategy: CleaningStrategy,
ref_stack: &mut HashSet<String>,
) -> Value {
if let Value::Object(props) = value {
let cleaned: Map<String, Value> = props
.into_iter()
.map(|(k, v)| (k, Self::clean_with_defs(v, defs, strategy, ref_stack)))
.collect();
Value::Object(cleaned)
} else {
value
}
}
/// Clean union (anyOf/oneOf/allOf).
///
/// Cleans each union variant individually without attempting any
/// simplification (that happens earlier in `try_simplify_union`).
/// Non-array input passes through unchanged.
fn clean_union(
    value: Value,
    defs: &HashMap<String, Value>,
    strategy: CleaningStrategy,
    ref_stack: &mut HashSet<String>,
) -> Value {
    match value {
        Value::Array(variants) => {
            let mut cleaned = Vec::with_capacity(variants.len());
            for variant in variants {
                cleaned.push(Self::clean_with_defs(variant, defs, strategy, ref_stack));
            }
            Value::Array(cleaned)
        }
        other => other,
    }
}
/// Preserve metadata (description, title, default) from source to target.
///
/// Source metadata overwrites any same-named keys already on the target,
/// so the referencing site's documentation wins over the definition's.
/// Non-object targets are returned untouched.
fn preserve_meta(source: &Map<String, Value>, mut target: Value) -> Value {
    if let Value::Object(target_obj) = &mut target {
        let meta = SCHEMA_META_KEYWORDS
            .iter()
            .filter_map(|&key| source.get(key).map(|v| (key.to_string(), v.clone())));
        target_obj.extend(meta);
    }
    target
}
}
#[cfg(test)]
mod tests {
use super::*;
// Gemini strategy strips validation-only keywords but keeps `type`
// and metadata such as `description`.
#[test]
fn test_remove_unsupported_keywords() {
let schema = json!({
"type": "string",
"minLength": 1,
"maxLength": 100,
"pattern": "^[a-z]+$",
"description": "A lowercase string"
});
let cleaned = SchemaCleanr::clean_for_gemini(schema);
assert_eq!(cleaned["type"], "string");
assert_eq!(cleaned["description"], "A lowercase string");
assert!(cleaned.get("minLength").is_none());
assert!(cleaned.get("maxLength").is_none());
assert!(cleaned.get("pattern").is_none());
}
// A local `#/$defs/...` ref is inlined; the inlined definition is itself
// cleaned (its `minimum` dropped) and `$defs` disappears from the output.
#[test]
fn test_resolve_ref() {
let schema = json!({
"type": "object",
"properties": {
"age": {
"$ref": "#/$defs/Age"
}
},
"$defs": {
"Age": {
"type": "integer",
"minimum": 0
}
}
});
let cleaned = SchemaCleanr::clean_for_gemini(schema);
assert_eq!(cleaned["properties"]["age"]["type"], "integer");
assert!(cleaned["properties"]["age"].get("minimum").is_none());
assert!(cleaned.get("$defs").is_none());
}
// An anyOf of same-typed `const` literals collapses to one enum schema.
#[test]
fn test_flatten_literal_union() {
let schema = json!({
"anyOf": [
{ "const": "admin", "type": "string" },
{ "const": "user", "type": "string" },
{ "const": "guest", "type": "string" }
]
});
let cleaned = SchemaCleanr::clean_for_gemini(schema);
assert_eq!(cleaned["type"], "string");
assert!(cleaned["enum"].is_array());
let enum_values = cleaned["enum"].as_array().unwrap();
assert_eq!(enum_values.len(), 3);
assert!(enum_values.contains(&json!("admin")));
assert!(enum_values.contains(&json!("user")));
assert!(enum_values.contains(&json!("guest")));
}
// A nullable union (oneOf string|null) reduces to the non-null variant.
#[test]
fn test_strip_null_from_union() {
let schema = json!({
"oneOf": [
{ "type": "string" },
{ "type": "null" }
]
});
let cleaned = SchemaCleanr::clean_for_gemini(schema);
assert_eq!(cleaned["type"], "string");
assert!(cleaned.get("oneOf").is_none());
}
// `const` becomes a single-value `enum`, keeping the description.
#[test]
fn test_const_to_enum() {
let schema = json!({
"const": "fixed_value",
"description": "A constant"
});
let cleaned = SchemaCleanr::clean_for_gemini(schema);
assert_eq!(cleaned["enum"], json!(["fixed_value"]));
assert_eq!(cleaned["description"], "A constant");
assert!(cleaned.get("const").is_none());
}
// Metadata on the $ref node survives resolution onto the inlined target.
#[test]
fn test_preserve_metadata() {
let schema = json!({
"$ref": "#/$defs/Name",
"description": "User's name",
"title": "Name Field",
"default": "Anonymous",
"$defs": {
"Name": {
"type": "string"
}
}
});
let cleaned = SchemaCleanr::clean_for_gemini(schema);
assert_eq!(cleaned["type"], "string");
assert_eq!(cleaned["description"], "User's name");
assert_eq!(cleaned["title"], "Name Field");
assert_eq!(cleaned["default"], "Anonymous");
}
// A self-referential definition must not recurse forever; the outer level
// still resolves, the inner circular ref degrades to an empty schema.
#[test]
fn test_circular_ref_prevention() {
let schema = json!({
"type": "object",
"properties": {
"parent": {
"$ref": "#/$defs/Node"
}
},
"$defs": {
"Node": {
"type": "object",
"properties": {
"child": {
"$ref": "#/$defs/Node"
}
}
}
}
});
let cleaned = SchemaCleanr::clean_for_gemini(schema);
assert_eq!(cleaned["properties"]["parent"]["type"], "object");
}
// validate() requires a top-level `type`; a schema without one errors.
#[test]
fn test_validate_schema() {
let valid = json!({
"type": "object",
"properties": {
"name": { "type": "string" }
}
});
assert!(SchemaCleanr::validate(&valid).is_ok());
let invalid = json!({
"properties": {
"name": { "type": "string" }
}
});
assert!(SchemaCleanr::validate(&invalid).is_err());
}
// Same input, different strategies: Gemini strips `minLength`, OpenAI keeps it.
#[test]
fn test_strategy_differences() {
let schema = json!({
"type": "string",
"minLength": 1,
"description": "A string field"
});
// Gemini: Most restrictive (removes minLength)
let gemini = SchemaCleanr::clean_for_gemini(schema.clone());
assert!(gemini.get("minLength").is_none());
assert_eq!(gemini["type"], "string");
assert_eq!(gemini["description"], "A string field");
// OpenAI: Most permissive (keeps minLength)
let openai = SchemaCleanr::clean_for_openai(schema.clone());
assert_eq!(openai["minLength"], 1);
assert_eq!(openai["type"], "string");
}
// Cleaning recurses through nested `properties`, removing unsupported
// keywords at every depth (here `minLength` and `additionalProperties`).
#[test]
fn test_nested_properties() {
let schema = json!({
"type": "object",
"properties": {
"user": {
"type": "object",
"properties": {
"name": {
"type": "string",
"minLength": 1
}
},
"additionalProperties": false
}
}
});
let cleaned = SchemaCleanr::clean_for_gemini(schema);
assert!(
cleaned["properties"]["user"]["properties"]["name"]
.get("minLength")
.is_none()
);
assert!(
cleaned["properties"]["user"]
.get("additionalProperties")
.is_none()
);
}
// A nullable type array collapses to its single non-null member.
#[test]
fn test_type_array_null_removal() {
let schema = json!({
"type": ["string", "null"]
});
let cleaned = SchemaCleanr::clean_for_gemini(schema);
assert_eq!(cleaned["type"], "string");
}
// When a union cannot be simplified, the sibling `type` key is dropped
// and the union itself is kept (cleaned variant-by-variant).
#[test]
fn test_skip_type_when_non_simplifiable_union_exists() {
let schema = json!({
"type": "object",
"oneOf": [
{
"type": "object",
"properties": {
"a": { "type": "string" }
}
},
{
"type": "object",
"properties": {
"b": { "type": "number" }
}
}
]
});
let cleaned = SchemaCleanr::clean_for_gemini(schema);
assert!(cleaned.get("type").is_none());
assert!(cleaned.get("oneOf").is_some());
}
}

View File

@ -13,4 +13,19 @@ pub trait Tool: Send + Sync + 'static {
fn description(&self) -> &str;
fn parameters_schema(&self) -> serde_json::Value;
async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult>;
/// Whether this tool is side-effect free and safe to parallelize.
///
/// Defaults to `false`; read-only tools should override this to opt in
/// to concurrent scheduling.
fn read_only(&self) -> bool {
false
}
/// Whether this tool can run alongside other concurrency-safe tools.
///
/// Default derivation: a tool is concurrency-safe iff it is read-only
/// and not exclusive. Overriders should keep this invariant in mind.
fn concurrency_safe(&self) -> bool {
self.read_only() && !self.exclusive()
}
/// Whether this tool should run alone even if concurrency is enabled.
///
/// Defaults to `false`. Returning `true` also makes the default
/// `concurrency_safe` return `false`.
fn exclusive(&self) -> bool {
false
}
}