Tool Order
The Tool Order scorer is an agentic scorer that evaluates whether tools are called in the correct sequence and optionally with the correct parameters.
Scorer Breakdown
The Tool Order scorer offers two distinct scoring modes that can be configured to match your evaluation needs.
Ordering Match (Default)
Checks that the tools are called in the same order as the expected tools, comparing tool names only. Returns a score of 1.0 if the sequence matches, otherwise 0.0.
scorer = ToolOrderScorer()
Exact Match
Checks that the tools called exactly match the expected tools, including the parameters passed to each tool. Returns a score of 1.0 if they match, otherwise 0.0.
scorer = ToolOrderScorer(exact_match=True)
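To make the difference concrete, here is an illustrative sketch (not the library's internals) of how the two modes treat the same trace, based on the descriptions above:

# Expected tool sequence for an example
expected = [
    {"tool_name": "get_attractions", "parameters": {"destination": "Paris"}},
    {"tool_name": "get_weather", "parameters": {"destination": "Paris"}},
]

# Tools the agent actually called, in order
actual = [
    {"tool_name": "get_attractions", "parameters": {"destination": "paris"}},  # lowercase "paris"
    {"tool_name": "get_weather", "parameters": {"destination": "Paris"}},
]

# Ordering match (default): tool names appear in the expected order -> 1.0
# Exact match: parameters must also match, and "paris" != "Paris" -> 0.0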
Example Agent Tool Structure
Here's how to structure your agent and tools with the @judgment.observe decorator:
from judgeval.tracer import Tracer

judgment = Tracer(project_name="my_agent")

class MyAgent:  # sample agent, replace with your own
    @judgment.observe(span_type="tool")
    def get_attractions(self, destination: str) -> str:
        """Get attractions for a destination"""
        pass

    @judgment.observe(span_type="tool")
    def get_weather(self, destination: str, start_date: str, end_date: str) -> str:
        """Get weather forecast for a destination"""
        pass

    @judgment.observe(span_type="function")
    def run_agent(self, prompt: str) -> str:
        """Run the agent with the given prompt"""
        attractions = self.get_attractions("Paris")  # replace with your own tool-calling logic
        weather = self.get_weather("Paris", "2025-06-01", "2025-06-02")  # replace with your own args
        return f"Attractions: {attractions}\nWeather: {weather}"
If you're using LangGraph instead, attach a JudgevalCallbackHandler so tool calls made during the graph run are captured:

from typing import Annotated, List

from langchain_openai import ChatOpenAI
from langchain_core.messages import BaseMessage, HumanMessage
from typing_extensions import TypedDict
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode
from judgeval.common.tracer import Tracer
from judgeval.integrations.langgraph import JudgevalCallbackHandler
from judgeval.scorers import ToolOrderScorer

class State(TypedDict):
    messages: Annotated[List[BaseMessage], add_messages]

# integration code
judgment = Tracer(project_name="LangGraphTravelAgent")
handler = JudgevalCallbackHandler(judgment)

def get_attractions(destination: str) -> str:
    """Get attractions for a destination"""
    pass

def get_weather(destination: str, start_date: str, end_date: str) -> str:
    """Get weather forecast for a destination"""
    pass

def run_agent(prompt: str):
    tools = [get_attractions, get_weather]
    llm = ChatOpenAI(model="gpt-4.1")
    graph_builder = StateGraph(State)

    def assistant(state: State):
        llm_with_tools = llm.bind_tools(tools)
        response = llm_with_tools.invoke(state["messages"])
        return {"messages": [response]}

    tool_node = ToolNode(tools)
    graph_builder.add_node("assistant", assistant)
    graph_builder.add_node("tools", tool_node)
    graph_builder.set_entry_point("assistant")
    graph_builder.add_conditional_edges(
        "assistant",
        lambda state: "tools" if state["messages"][-1].tool_calls else END,
    )
    graph_builder.add_edge("tools", "assistant")
    graph = graph_builder.compile()

    config_with_callbacks = {"callbacks": [handler]}
    result = graph.invoke(
        {"messages": [HumanMessage(content=prompt)]},
        config_with_callbacks,
    )
    return result, handler
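For reference, a minimal sketch of invoking the graph directly (the assert_test call below runs this for you):

# Hypothetical direct run; run_agent returns the final graph state
# and the callback handler used during execution.
final_state, handler = run_agent("What's the attraction and weather in Paris for early June 2025?")
print(final_state["messages"][-1].content)  # the assistant's final reply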
Sample Implementation
Then you can use the ToolOrderScorer to evaluate the agent's tool selection and ordering:
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import ToolOrderScorer

client = JudgmentClient()

# Define an example with the expected tool sequence
example = Example(
    input={"prompt": "What's the attraction and weather in Paris for early June 2025 (1st - 2nd)?"},
    expected_tools=[
        {
            "tool_name": "get_attractions",
            "parameters": {"destination": "Paris"}
        },
        {
            "tool_name": "get_weather",
            "parameters": {"destination": "Paris", "start_date": "2025-06-01", "end_date": "2025-06-02"}
        }
    ]
)

agent = MyAgent()
results = client.assert_test(
    examples=[example],
    scorers=[ToolOrderScorer(exact_match=True)],
    function=agent.run_agent,
    tracer=judgment
)
For the LangGraph agent, reuse the same example and pass the callback handler in place of the tracer:

results = client.assert_test(
    examples=[example],
    scorers=[ToolOrderScorer(exact_match=True)],
    function=run_agent,
    tracer=handler,  # the callback handler instead of the tracer
)
You can also define your test cases in a YAML file:
# tests.yaml
examples:
  - input:
      prompt: "What's the attraction and weather in Paris for early June 2025 (1st - 2nd)?"
    expected_tools:
      - tool_name: "get_attractions"
        parameters:
          destination: "Paris"
      - tool_name: "get_weather"
        parameters:
          destination: "Paris"
          start_date: "2025-06-01"
          end_date: "2025-06-02"
Then run the evaluation using the YAML file:
from judgeval.utils.file_utils import get_examples_from_yaml

client.assert_test(
    examples=get_examples_from_yaml("tests.yaml"),
    scorers=[ToolOrderScorer(exact_match=True)],
    function=agent.run_agent,
    tracer=judgment
)
Or, for the LangGraph agent:

client.assert_test(
    examples=get_examples_from_yaml("tests.yaml"),
    scorers=[ToolOrderScorer(exact_match=True)],
    function=run_agent,
    tracer=handler,
)