PythonEvaluation

HostedEvaluatorRunner

_build_payload()

def _build_payload(eval_id, project_id, eval_run_name, created_at, examples, scorers) -> ExampleEvaluationRun:

Parameters

- eval_id (str, required)
- project_id (str, required)
- eval_run_name (str, required)
- created_at (str, required)
- examples (List[Example], required)
- scorers (List[str], required)

Returns

ExampleEvaluationRun


_submit()

def _submit(console, project_id, eval_id, examples, scorers, payload, progress) -> int:

Parameters

- console (Console, required)
- project_id (str, required)
- eval_id (str, required)
- examples (List[Example], required)
- scorers (List[str], required)
- payload (ExampleEvaluationRun, required)
- progress (Progress, required)

Returns

int


run()

Execute an evaluation run and return results.

def run(examples, scorers, eval_run_name, assert_test=False, timeout_seconds=300) -> typing.List:

Parameters

- examples (List[Example], required): Examples to evaluate.
- scorers (List[S], required): Scorers to run (strings or Judge instances).
- eval_run_name (str, required): Name for this evaluation run.
- assert_test (bool, default: False): When True, raises AssertionError if any scorer fails its threshold.
- timeout_seconds (int, default: 300): Maximum time, in seconds, to wait for results.

Returns

typing.List — A list of ScoringResult objects, one per example.