
Offline Testing

Run your agent over eval sets with OfflineTracer, collect traces, and score them in batch.

Offline testing lets you run your agent against a fixed eval set, collect full traces for each run, and batch-score them with judges — without touching your live production monitoring. It's the standard workflow for iterating on prompts, catching regressions, and validating behavior before shipping.


How It Works

OfflineTracer is a drop-in replacement for Tracer that routes spans to an isolated offline endpoint instead of the live monitoring pipeline.

Key differences from Tracer:

  • Spans land in offline storage and do not appear on the live monitoring page
  • Each completed root span appends an Example to a caller-owned dataset list
  • The Example carries the offline_trace_id plus any static example_fields you configure (e.g. input, golden_output)
  • After the loop, pass dataset to evaluation.create().run() to score all traces in batch

Use the offline_tracer() convenience method on your Judgeval client — credentials are reused automatically.
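
To make the collected data concrete, here's a minimal sketch of inspecting one entry after a run. The attribute names are assumptions based on the list above, not the exact Example schema; check the SDK Reference for the real field names.

# Hedged sketch: attribute names below are assumptions, not the exact SDK schema.
example = dataset[0]  # dataset is the caller-owned list you passed to offline_tracer()
print(example.offline_trace_id)  # assumed: links this Example back to its offline trace
print(example.input)             # assumed: static field supplied via example_fields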

See Tracing for general tracing concepts and the SDK Reference for full API details.


Quickstart

offline_test.py
from judgeval import Judgeval, Tracer
from judgeval.data import Example

client = Judgeval(project_name="default_project")
dataset: list[Example] = []

inputs = [
    "What is the capital of France?",
    "What is the capital of Japan?",
    "What is the capital of Brazil?",
]

for question in inputs:
    client.offline_tracer(
        dataset=dataset,
        example_fields={"input": question},
    )
    try:
        result = my_agent(question)  # my_agent: your agent's entry point (not shown here)
    finally:
        Tracer.force_flush()  # flush spans before next tracer registers

Tracer.shutdown()

# dataset now contains one Example per question, each linked to an offline trace
results = client.evaluation.create().run(
    examples=dataset,
    scorers=["AccuracyScorer"],
    eval_run_name="offline-test-run",
)
offlineTest.ts
import { Judgeval, Tracer, type Example } from "judgeval";

const client = await Judgeval.create({ projectName: "default_project" });
const dataset: Example[] = [];

const inputs = [
    "What is the capital of France?",
    "What is the capital of Japan?",
    "What is the capital of Brazil?",
];

for (const question of inputs) {
    await client.offlineTracer({
        dataset,
        exampleFields: { input: question },
    });
    try {
        const result = await myAgent(question); // myAgent: your agent's entry point (not shown here)
    } finally {
        await Tracer.forceFlush(); // flush spans before next tracer registers
    }
}

await Tracer.shutdown();

// dataset now contains one Example per question, each linked to an offline trace
const evaluation = client.evaluation.create();
const results = await evaluation.run({
    examples: dataset,
    scorers: ["AccuracyScorer"],
    evalRunName: "offline-test-run",
});

Results are saved to your project on the Judgment platform where you can compare runs across prompt versions.
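
One way to keep those comparisons easy is to encode the prompt version in the run name. A small sketch, assuming you track prompt versions yourself (PROMPT_VERSION below is a placeholder, not an SDK concept):

# Hedged sketch: PROMPT_VERSION is a placeholder for however you version prompts.
PROMPT_VERSION = "v2"

client.evaluation.create().run(
    examples=dataset,
    scorers=["AccuracyScorer"],
    eval_run_name=f"offline-test-{PROMPT_VERSION}",  # one named run per prompt version
)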


Agent Testing Use Case

A common pattern is running an agent over a golden dataset — a fixed set of inputs with known expected outputs — then scoring each trace with a judge. This lets you compare prompt versions, catch hallucinations, or measure behavioral drift without affecting production.

The example below runs an FAQ agent over three questions. example_fields (exampleFields in TypeScript) attaches the input and golden_output to each collected Example. A hosted judge then uses the offline_trace_id to fetch the agent's actual output server-side and compares it to golden_output.

agent_offline_test.py
from openai import OpenAI
from judgeval import Judgeval, Tracer, wrap
from judgeval.data import Example

client = Judgeval(project_name="default_project")
openai = wrap(OpenAI())

golden_dataset = [
    {"input": "How much does the Pro plan cost?", "golden_output": "$29/month"},
    {"input": "What formats can reports be exported as?", "golden_output": "PDF, CSV, Excel"},
    {"input": "Does FlowState support SSO?", "golden_output": "Yes, via SAML 2.0"},
]


@Tracer.observe(span_type="agent")
def answer_question(question: str) -> str:
    response = openai.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": question}],
    )
    return response.choices[0].message.content or ""


dataset: list[Example] = []

for item in golden_dataset:
    client.offline_tracer(
        dataset=dataset,
        example_fields={
            "input": item["input"],
            "golden_output": item["golden_output"],
        },
    )
    try:
        answer_question(item["input"])
    finally:
        Tracer.force_flush()

Tracer.shutdown()

print(f"Collected {len(dataset)} offline traces.")

results = client.evaluation.create().run(
    examples=dataset,
    scorers=["Hallucination Judge"],
    eval_run_name="agent-hallucination-test",
)

for result in results:
    for scorer in result.scorers_data or []:
        status = "PASS" if scorer.success else "FAIL"
        print(f"[{status}] {scorer.name}: {scorer.reason}")
agentOfflineTest.ts
import OpenAI from "openai";
import { Judgeval, Tracer, wrap, type Example } from "judgeval";

const client = await Judgeval.create({ projectName: "default_project" });
const openai = wrap(new OpenAI());

const goldenDataset = [
    { input: "How much does the Pro plan cost?", golden_output: "$29/month" },
    { input: "What formats can reports be exported as?", golden_output: "PDF, CSV, Excel" },
    { input: "Does FlowState support SSO?", golden_output: "Yes, via SAML 2.0" },
];

const answerQuestion = Tracer.observe(
    async function answerQuestion(question: string): Promise<string> {
        const response = await openai.chat.completions.create({
            model: "gpt-4.1-mini",
            messages: [{ role: "user", content: question }],
        });
        return response.choices[0]?.message.content ?? "";
    },
    { spanType: "agent" },
);

const dataset: Example[] = [];

for (const item of goldenDataset) {
    await client.offlineTracer({
        dataset,
        exampleFields: {
            input: item.input,
            golden_output: item.golden_output,
        },
    });
    try {
        await answerQuestion(item.input);
    } finally {
        await Tracer.forceFlush();
    }
}

await Tracer.shutdown();

console.log(`Collected ${dataset.length} offline traces.`);

const evaluation = client.evaluation.create();
const results = await evaluation.run({
    examples: dataset,
    scorers: ["Hallucination Judge"],
    evalRunName: "agent-hallucination-test",
});

for (const result of results) {
    for (const scorer of result.scorers ?? []) {
        const status = scorer.success ? "PASS" : "FAIL";
        console.log(`[${status}] ${scorer.name}: ${scorer.reason}`);
    }
}


CI Integration

Run offline tests as assertions in CI by setting assert_test=True. If any scorer fails, an exception is raised and the test run fails.

test_agent.py
from judgeval import Judgeval, Tracer
from judgeval.data import Example
import pytest

client = Judgeval(project_name="default_project")

def collect_dataset() -> list[Example]:
    dataset: list[Example] = []
    inputs = ["What is the capital of France?", "What is the capital of Japan?"]
    for question in inputs:
        client.offline_tracer(dataset=dataset, example_fields={"input": question})
        try:
            my_agent(question)
        finally:
            Tracer.force_flush()
    Tracer.shutdown()
    return dataset

def test_agent_accuracy():
    dataset = collect_dataset()
    client.evaluation.create().run(
        examples=dataset,
        scorers=["AccuracyScorer"],
        eval_run_name="ci-offline-test",
        assert_test=True,  # raises JudgmentTestError on failure
    )

Run with pytest test_agent.py.
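
If some CI environments don't have Judgment credentials configured, you can skip the test there instead of failing it. A minimal sketch, assuming credentials are supplied through a JUDGMENT_API_KEY environment variable (the variable name is an assumption; check the SDK Reference for the names your setup uses):

import os
import pytest

# Hedged sketch: the environment variable name is an assumption; adjust to your setup.
requires_judgment = pytest.mark.skipif(
    "JUDGMENT_API_KEY" not in os.environ,
    reason="Judgment credentials are not configured in this environment",
)

@requires_judgment
def test_agent_accuracy():
    ...  # same test body as shown above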


Next Steps

  • Tracing - Learn how Tracer and span collection work
  • Agent Judges - Create judges to score your offline traces
  • Code Judges - Write custom scoring logic in Python
  • Datasets - Manage and share eval sets across your team
