Tool Order
The Tool Order scorer is an agentic scorer that evaluates whether tools are called in the correct sequence and optionally with the correct parameters.
Scorer Breakdown
The Tool Order scorer offers two distinct scoring modes that can be configured to match your evaluation needs.
Ordering Match (Default)
Checks that the tools are called in the same order as the expected tools, comparing tool names only. Returns a score of 1.0 if the sequence matches, otherwise 0.0.
scorer = ToolOrderScorer()
Exact Match
Checks that the tools called exactly match the expected tools, including the parameters passed to each tool. Returns a score of 1.0 if they match, otherwise 0.0.
scorer = ToolOrderScorer(exact_match=True)
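To make the difference concrete, here is an illustrative sketch (not the library's internals) of how the two modes treat the same trace, based on the descriptions above:

# Expected tool sequence for an example
expected = [
    {"tool_name": "get_attractions", "parameters": {"destination": "Paris"}},
    {"tool_name": "get_weather", "parameters": {"destination": "Paris"}},
]

# Tools the agent actually called, in order
actual = [
    {"tool_name": "get_attractions", "parameters": {"destination": "paris"}},  # lowercase "paris"
    {"tool_name": "get_weather", "parameters": {"destination": "Paris"}},
]

# Ordering match (default): tool names appear in the expected order -> 1.0
# Exact match: parameters must also match, and "paris" != "Paris" -> 0.0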
Example Agent Tool Structure
Here's how to structure your agent and tools with the @judgment.observe decorator:
from judgeval.tracer import Tracer

judgment = Tracer(project_name="my_agent")

class MyAgent:  # sample agent, replace with your own
    @judgment.observe(span_type="tool")
    def get_attractions(self, destination: str) -> str:
        """Get attractions for a destination"""
        pass

    @judgment.observe(span_type="tool")
    def get_weather(self, destination: str, start_date: str, end_date: str) -> str:
        """Get weather forecast for a destination"""
        pass

    @judgment.observe(span_type="function")
    def run_agent(self, prompt: str) -> str:
        """Run the agent with the given prompt"""
        attractions = self.get_attractions("Paris")  # replace with your own tool-calling logic
        weather = self.get_weather("Paris", "2025-06-01", "2025-06-02")  # replace with your own args
        return f"Attractions: {attractions}\nWeather: {weather}"
If you're using LangGraph instead, attach a JudgevalCallbackHandler so tool calls made during the graph run are captured:

from typing import Annotated, List

from langchain_openai import ChatOpenAI
from langchain_core.messages import BaseMessage, HumanMessage
from typing_extensions import TypedDict
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode
from judgeval.common.tracer import Tracer
from judgeval.integrations.langgraph import JudgevalCallbackHandler
from judgeval.scorers import ToolOrderScorer

class State(TypedDict):
    messages: Annotated[List[BaseMessage], add_messages]

# integration code
judgment = Tracer(project_name="LangGraphTravelAgent")
handler = JudgevalCallbackHandler(judgment)

def get_attractions(destination: str) -> str:
    """Get attractions for a destination"""
    pass

def get_weather(destination: str, start_date: str, end_date: str) -> str:
    """Get weather forecast for a destination"""
    pass

def run_agent(prompt: str):
    tools = [get_attractions, get_weather]
    llm = ChatOpenAI(model="gpt-4.1")
    graph_builder = StateGraph(State)

    def assistant(state: State):
        llm_with_tools = llm.bind_tools(tools)
        response = llm_with_tools.invoke(state["messages"])
        return {"messages": [response]}

    tool_node = ToolNode(tools)
    graph_builder.add_node("assistant", assistant)
    graph_builder.add_node("tools", tool_node)
    graph_builder.set_entry_point("assistant")
    graph_builder.add_conditional_edges(
        "assistant",
        lambda state: "tools" if state["messages"][-1].tool_calls else END,
    )
    graph_builder.add_edge("tools", "assistant")
    graph = graph_builder.compile()

    config_with_callbacks = {"callbacks": [handler]}
    result = graph.invoke(
        {"messages": [HumanMessage(content=prompt)]},
        config_with_callbacks,
    )
    return result, handler
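For reference, a minimal sketch of invoking the graph directly (the assert_test call below runs this for you):

# Hypothetical direct run; run_agent returns the final graph state
# and the callback handler used during execution.
final_state, handler = run_agent("What's the attraction and weather in Paris for early June 2025?")
print(final_state["messages"][-1].content)  # the assistant's final reply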
Sample Implementation
Then you can use the ToolOrderScorer to evaluate the agent's tool selection and ordering:
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import ToolOrderScorer

client = JudgmentClient()

# Define an example with the expected tool sequence
example = Example(
    input={"prompt": "What's the attraction and weather in Paris for early June 2025 (1st - 2nd)?"},
    expected_tools=[
        {
            "tool_name": "get_attractions",
            "parameters": {"destination": "Paris"}
        },
        {
            "tool_name": "get_weather",
            "parameters": {"destination": "Paris", "start_date": "2025-06-01", "end_date": "2025-06-02"}
        }
    ]
)

agent = MyAgent()
results = client.assert_test(
    examples=[example],
    scorers=[ToolOrderScorer(exact_match=True)],
    function=agent.run_agent,
    tracer=judgment
)
For the LangGraph agent, reuse the same example and pass the callback handler in place of the tracer:

results = client.assert_test(
    examples=[example],
    scorers=[ToolOrderScorer(exact_match=True)],
    function=run_agent,
    tracer=handler,  # the callback handler instead of the tracer
)
You can also define your test cases in a YAML file:
# tests.yaml
examples:
  - input:
      prompt: "What's the attraction and weather in Paris for early June 2025 (1st - 2nd)?"
    expected_tools:
      - tool_name: "get_attractions"
        parameters:
          destination: "Paris"
      - tool_name: "get_weather"
        parameters:
          destination: "Paris"
          start_date: "2025-06-01"
          end_date: "2025-06-02"
Then run the evaluation using the YAML file:
from judgeval.utils.file_utils import get_examples_from_yaml

client.assert_test(
    examples=get_examples_from_yaml("tests.yaml"),
    scorers=[ToolOrderScorer(exact_match=True)],
    function=agent.run_agent,
    tracer=judgment
)
Or, for the LangGraph agent:

client.assert_test(
    examples=get_examples_from_yaml("tests.yaml"),
    scorers=[ToolOrderScorer(exact_match=True)],
    function=run_agent,
    tracer=handler,
)