EvaluatorRunner
Abstract base for evaluation runners.
S = TypeVar('S', str, Judge)
Concrete implementations handle either hosted (server-side) or local
(in-process) scorer execution. The generic parameter S is str
for hosted scorers or Judge for local scorers.
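The constraint on S means each concrete runner pins it to one of the two scorer types. A rough sketch of the pattern; the placeholder Judge class and the empty class bodies are illustrative, not the SDK's actual definitions:

```python
from typing import Generic, TypeVar


class Judge:
    """Placeholder standing in for the SDK's Judge type (illustration only)."""


# S is constrained to exactly str or Judge: a hosted runner works with scorer
# names (str), while a local runner works with in-process Judge instances.
S = TypeVar("S", str, Judge)


class EvaluatorRunner(Generic[S]):
    """Sketch of the generic base; the real class also defines the abstract
    methods documented below."""
```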
__init__()
def __init__(client, project_id, project_name)

Parameters
client (JudgmentSyncClient, required)
project_id (Optional[str], required)
project_name (str, required)
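A minimal sketch of what the constructor likely does with these arguments, assuming plain attribute assignment; the reference does not show the body:

```python
class RunnerSketch:
    """Illustrative only; mirrors the documented __init__ signature."""

    def __init__(self, client, project_id, project_name):
        self.client = client              # JudgmentSyncClient used for server calls
        self.project_id = project_id      # Optional[str]
        self.project_name = project_name  # str naming the project the run belongs to
```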
_build_payload()
Build the ExampleEvaluationRun payload for the evaluation.
def _build_payload(eval_id, project_id, eval_run_name, created_at, examples, scorers) -> ExampleEvaluationRun

Parameters
eval_id (str, required)
project_id (str, required)
eval_run_name (str, required)
created_at (str, required)
examples (List[Example], required)
scorers (List[S], required)
Returns
ExampleEvaluationRun
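The field names of ExampleEvaluationRun are not spelled out here; the sketch below assumes they mirror the parameter names and that eval_id and created_at follow common UUID and ISO-8601 conventions, which this reference does not confirm:

```python
import uuid
from datetime import datetime, timezone

# Conventions a caller might use for the id and timestamp fields
# (illustrative; the actual formats are not documented here).
eval_id = str(uuid.uuid4())                          # UUID string identifying the run
created_at = datetime.now(timezone.utc).isoformat()  # ISO-8601 UTC timestamp

# The payload presumably carries these plus the remaining arguments, e.g.
# ExampleEvaluationRun(eval_id=..., project_id=..., eval_run_name=...,
#                      created_at=..., examples=..., scorers=...).
```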
_submit()
Run the evaluation and save the results to the server. Returns the number of unique examples for which results are expected.
def _submit(console, project_id, eval_id, examples, scorers, payload, progress) -> int

Parameters
console (Console, required)
project_id (str, required)
eval_id (str, required)
examples (List[Example], required)
scorers (List[S], required)
payload (ExampleEvaluationRun, required)
progress (Progress, required)
Returns
int
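The documented return value is only a count. A sketch of that semantics, where example_id is a hypothetical attribute name; the Example model is not described in this section:

```python
from typing import List


def expected_result_count(examples: List["Example"]) -> int:
    # Number of distinct examples submitted; _poll later waits until this many
    # results have arrived. `example_id` is a hypothetical attribute name.
    return len({ex.example_id for ex in examples})
```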
_poll()
Poll the server for the results of the evaluation.
def _poll(console, project_id, eval_id, expected_count, timeout_seconds, progress) -> tuple[list[ExperimentRunItem], str]

Parameters
console (Console, required)
project_id (str, required)
eval_id (str, required)
expected_count (int, required)
timeout_seconds (int, required)
progress (Progress, required)
Returns
tuple[list[ExperimentRunItem], str]
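A sketch of the polling shape implied by the parameters: repeatedly fetch results until expected_count items have arrived or timeout_seconds elapses. The fetch_results callable, the one-second interval, and the timeout error are assumptions, not the SDK's actual behavior:

```python
import time


def poll_until_complete(fetch_results, expected_count: int, timeout_seconds: int):
    """fetch_results stands in for whatever server call the runner actually makes."""
    deadline = time.monotonic() + timeout_seconds
    results = []
    while time.monotonic() < deadline:
        results = fetch_results()           # hypothetical call returning scored items
        if len(results) >= expected_count:  # every submitted example has a result
            return results
        time.sleep(1)                       # assumed one-second polling interval
    raise TimeoutError(
        f"Received {len(results)}/{expected_count} results after {timeout_seconds}s"
    )
```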
_display_results()
Display the results of the evaluation.
def _display_results(console, examples, results_data, url, assert_test) -> List[ScoringResult]

Parameters
console (Console, required)
examples (List[Example], required)
results_data (List[ExperimentRunItem], required)
url (str, required)
assert_test (bool, required)
Returns
List[ScoringResult]
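Taken together, the private methods suggest how run() orchestrates a full evaluation. A sketch of that flow, assuming run() simply chains them in order; how console, progress, eval_id, and created_at are created is not shown in this reference:

```python
def run_flow(runner, console, progress, project_id, eval_id, created_at,
             examples, scorers, eval_run_name, assert_test, timeout_seconds):
    """Sketch only: chains the documented private steps in their implied order."""
    payload = runner._build_payload(eval_id, project_id, eval_run_name,
                                    created_at, examples, scorers)
    expected = runner._submit(console, project_id, eval_id,
                              examples, scorers, payload, progress)
    results_data, url = runner._poll(console, project_id, eval_id,
                                     expected, timeout_seconds, progress)
    return runner._display_results(console, examples, results_data, url, assert_test)
```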
run()
Execute an evaluation run and return results.
def run(examples, scorers, eval_run_name, assert_test=False, timeout_seconds=300) -> typing.List

Parameters
examples (List[Example], required): Examples to evaluate.
scorers (List[S], required): Scorers to run (strings or Judge instances).
eval_run_name (str, required): Name for this evaluation run.
assert_test (bool, default False): When True, raises AssertionError if any scorer fails its threshold.
timeout_seconds (int, default 300): Maximum time to wait for results.
Returns
List[ScoringResult]: one ScoringResult object per example.
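A hypothetical end-to-end usage sketch. The runner and examples objects, and the scorer name "faithfulness", are assumptions for illustration; only run()'s signature and defaults come from this reference:

```python
# `runner` is assumed to be a concrete EvaluatorRunner subclass instance
# (hosted in this sketch, so scorers are plain names) and `examples` a
# prepared List[Example]; neither construction is shown in this reference.
results = runner.run(
    examples=examples,
    scorers=["faithfulness"],          # hypothetical hosted scorer name (S = str)
    eval_run_name="nightly-regression",
    assert_test=True,                  # raise AssertionError on any threshold failure
    timeout_seconds=600,               # wait up to 10 minutes for results
)

for result in results:                 # one ScoringResult per example
    print(result)
```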