Evaluation

EvaluatorRunner

Abstract base for evaluation runners.

Type variable

S = TypeVar('S', str, Judge)

Concrete implementations handle either hosted (server-side) or local (in-process) scorer execution. The generic parameter S is str for hosted scorers or Judge for local scorers.
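To make the str-versus-Judge split concrete, here is a minimal self-contained sketch of how two specializations could bind the type variable. The class bodies are stand-ins written for illustration; they are not the SDK's actual definitions.

```python
from abc import ABC
from typing import Generic, TypeVar

class Judge:                                  # stand-in for the SDK's Judge class
    ...

S = TypeVar("S", str, Judge)                  # matches the documented type variable

class EvaluatorRunner(ABC, Generic[S]):       # simplified stand-in for the documented base
    ...

class HostedRunner(EvaluatorRunner[str]):     # hosted: scorers referenced by name (S = str)
    ...

class LocalRunner(EvaluatorRunner[Judge]):    # local: scorers are Judge instances (S = Judge)
    ...
```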

__init__()

def __init__(client, project_id, project_name):

Parameters

client (required): JudgmentSyncClient
project_id (required): Optional[str]
project_name (required): str

_build_payload()

Build the ExampleEvaluationRun payload for the evaluation.

def _build_payload(eval_id, project_id, eval_run_name, created_at, examples, scorers) -> ExampleEvaluationRun:

Parameters

eval_id (required): str
project_id (required): str
eval_run_name (required): str
created_at (required): str
examples (required): List[Example]
scorers (required): List[S]

Returns: ExampleEvaluationRun
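As an illustration of what this step produces, the sketch below assembles a payload-like object from the documented parameters. The field names mirror the parameter list above, but the stub class and the ISO 8601 timestamp format are assumptions, not the SDK's real ExampleEvaluationRun model.

```python
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, List, Union

@dataclass
class ExampleEvaluationRunStub:               # illustrative shape only; the SDK model may differ
    eval_id: str
    project_id: str
    eval_run_name: str
    created_at: str
    examples: List[Any]                       # List[Example] in the real signature
    scorers: List[Union[str, Any]]            # List[S]: scorer names or Judge instances

def build_payload_sketch(eval_id: str, project_id: str, eval_run_name: str,
                         examples: List[Any], scorers: List[Any]) -> ExampleEvaluationRunStub:
    # ISO 8601 is an assumption; the docs only say created_at is a str.
    created_at = datetime.now(timezone.utc).isoformat()
    return ExampleEvaluationRunStub(
        eval_id=eval_id,
        project_id=project_id,
        eval_run_name=eval_run_name,
        created_at=created_at,
        examples=examples,
        scorers=scorers,
    )
```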


_submit()

Run the evaluation and save the results to the server. Returns the number of unique examples to expect results for.

def _submit(console, project_id, eval_id, examples, scorers, payload, progress) -> int:

Parameters

console (required): Console
project_id (required): str
eval_id (required): str
examples (required): List[Example]
scorers (required): List[S]
payload (required): ExampleEvaluationRun
progress (required): Progress

Returns: int


_poll()

Poll the server for the results of the evaluation.

def _poll(console, project_id, eval_id, expected_count, timeout_seconds, progress) -> tuple[list[ExperimentRunItem], str]:

Parameters

console (required): Console
project_id (required): str
eval_id (required): str
expected_count (required): int
timeout_seconds (required): int
progress (required): Progress

Returns: tuple[list[ExperimentRunItem], str]
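The polling contract (keep asking the server until expected_count results are available or timeout_seconds elapses) can be sketched as a generic loop. The fetch_results callable is a hypothetical placeholder rather than the SDK's client call, and this sketch returns a completion flag instead of the results URL string the real method returns.

```python
import time
from typing import Callable, List, Tuple

def poll_sketch(
    fetch_results: Callable[[], List[dict]],  # hypothetical: returns whatever results exist so far
    expected_count: int,
    timeout_seconds: int,
    interval_seconds: float = 2.0,
) -> Tuple[List[dict], bool]:
    """Wait until expected_count results are available or the timeout elapses."""
    deadline = time.monotonic() + timeout_seconds
    results: List[dict] = []
    while time.monotonic() < deadline:
        results = fetch_results()
        if len(results) >= expected_count:
            return results, True        # all expected results arrived
        time.sleep(interval_seconds)    # back off before the next poll
    return results, False               # timed out with partial (or no) results
```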


_display_results()

Display the results of the evaluation.

def _display_results(console, examples, results_data, url, assert_test) -> List[ScoringResult]:

Parameters

console (required): Console
examples (required): List[Example]
results_data (required): List[ExperimentRunItem]
url (required): str
assert_test (required): bool

Returns: List[ScoringResult]
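The assert_test flag ties into the threshold check described under run() below. Here is a minimal sketch of that check, assuming each result exposes a boolean success attribute (an assumption; the real ScoringResult may differ).

```python
from typing import List

class ScoringResultStub:
    """Stand-in for the SDK's ScoringResult; only the fields needed for the check."""
    def __init__(self, name: str, success: bool):
        self.name = name
        self.success = success  # assumed attribute: True when the scorer met its threshold

def assert_all_passed(results: List[ScoringResultStub], url: str) -> None:
    failures = [r.name for r in results if not r.success]
    if failures:
        raise AssertionError(
            f"{len(failures)} scorer(s) below threshold: {', '.join(failures)}. See {url}"
        )
```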


run()

Execute an evaluation run and return results.

def run(examples, scorers, eval_run_name, assert_test=False, timeout_seconds=300) -> typing.List:

Parameters

examples (required): List[Example]
    Examples to evaluate.
scorers (required): List[S]
    Scorers to run (strings or Judge instances).
eval_run_name (required): str
    Name for this evaluation run.
assert_test: bool (default: False)
    When True, raises AssertionError if any scorer fails its threshold.
timeout_seconds: int (default: 300)
    Maximum time to wait for results.

Returns: typing.List of ScoringResult objects, one per example.
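As a usage illustration, a call through a hypothetical concrete subclass (such as the HostedRunner sketched earlier) might look like this. The client, project identifiers, example list, and scorer names are all placeholders.

```python
# Placeholder setup: client and examples are assumed to exist already.
runner = HostedRunner(client=client, project_id=None, project_name="my-project")

results = runner.run(
    examples=examples,                      # List[Example] prepared elsewhere
    scorers=["faithfulness", "relevancy"],  # placeholder scorer names; hosted scorers are strings
    eval_run_name="nightly-regression",
    assert_test=True,        # raise AssertionError if any scorer misses its threshold
    timeout_seconds=120,     # stop polling for results after two minutes
)

for result in results:       # one ScoringResult per example
    print(result)
```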