
evaluate_model

TextEvolve/Evaluate data models and types.

Classes

BaseEvaluateTurnRetryStrategy

Bases: BaseDataModel, ABC

Abstract base class for Evaluate turn retry strategies.

Functions

do_retry abstractmethod
do_retry(
    profile: EvaluateProfile,
    xi: List[DebateHistoryEntry],
    i: Agent,
    round_num: int,
    m_i: GetMemoriesResponse | None = None,
) -> RetryAction

DebateHistoryEntry

Bases: BaseModel

Agent debate history entry.

Attributes

confidence_scores class-attribute instance-attribute
confidence_scores: ndarray = Field(
    ...,
    title="Confidence",
    description="Unweighted confidence scores from the turn",
)
model_config class-attribute instance-attribute
model_config = ConfigDict(
    populate_by_name=True,
    use_enum_values=False,
    extra="forbid",
    strict=False,
    arbitrary_types_allowed=True,
)
r class-attribute instance-attribute
r: int = Field(
    ..., title="Round", description="The round number"
)
response class-attribute instance-attribute
response: str = Field(
    ..., title="Response", description="The agent response"
)
role class-attribute instance-attribute
role: str = Field(
    ..., title="Role", description="The agent role"
)
scores class-attribute instance-attribute
scores: ndarray = Field(
    ...,
    title="Scores",
    description="Unweighted scores from the turn",
)

Functions

format_llm
format_llm() -> str

Format the debate history entry as an LLM-safe string.

Returns:

  • str ( str ) –

    The formatted debate history entry.

serialize_confidence_scores
serialize_confidence_scores(value: ndarray) -> List[float]
serialize_scores
serialize_scores(value: ndarray) -> List[List[float]]
validate_confidence_scores classmethod
validate_confidence_scores(
    value: Union[ndarray, List[float]]
) -> ndarray
validate_scores classmethod
validate_scores(
    value: Union[ndarray, List[List[float]]]
) -> ndarray
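The paired `validate_*` / `serialize_*` methods above follow a common pattern for ndarray fields in Pydantic models: validators coerce JSON-friendly lists into numpy arrays on input, and serializers convert them back to lists on dump. A minimal sketch of that round trip, with the function bodies assumed (the actual implementations are not shown in this reference):

```python
import numpy as np

# Hedged sketch of the validate_* / serialize_* pattern: coerce a
# List[float] (or pass through an ndarray) on the way in, and emit a
# plain list on the way out so the model serializes cleanly to JSON.

def validate_confidence_scores(value):
    # Accept either an ndarray or a List[float]; always return an ndarray.
    if isinstance(value, np.ndarray):
        return value
    return np.asarray(value, dtype=float)

def serialize_confidence_scores(value: np.ndarray):
    # ndarray -> plain list, safe for JSON serialization.
    return value.tolist()

arr = validate_confidence_scores([0.9, 0.7, 0.85])
round_trip = serialize_confidence_scores(arr)
```

The same shape of code would apply to `scores`, with nested `List[List[float]]` in place of the flat list.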

EvaluateTaskInput

Bases: BaseDataModel

TextEvolve/Evaluate internal task definition used by the evaluation engine.

Attributes

probabilistic_selection class-attribute instance-attribute
probabilistic_selection: bool = Field(
    default=False,
    title="Probabilistic Selection",
    description="When true, candidate selection uses the softmax scores rather than the normalized scores.",
)
x class-attribute instance-attribute
x: str = Field(
    ...,
    title="x",
    description="Input context in natural language",
)
x_name class-attribute instance-attribute
x_name: str = Field(
    default="input",
    title="X Name",
    description="Natural language name of the x input, ex: 'prompt', 'document', etc. This should be one word that can be used to identify the context of the evaluation to the LLM.",
)
y class-attribute instance-attribute
y: List[str] = Field(
    default_factory=list,
    title="y",
    description="List of candidate responses in natural language. Each will be scored against 'x'. Examples may include answers to a question, prompt variations, reasoning, etc.",
)
y_name class-attribute instance-attribute
y_name: str = Field(
    default="output",
    title="Y Name",
    description="Name of the y outputs that are to be scored. This should be one word that can be used to identify what is being evaluated.",
)

Functions

validate_y classmethod
validate_y(value: List[str]) -> List[str]
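Put together, a task input pairs one `x` context with a list of `y` candidates to score. A sketch of such a payload as a plain dict, using the field names above (the values themselves are invented for illustration):

```python
# Hypothetical EvaluateTaskInput payload; field names come from the model
# above, the content is made up for illustration.
task_input = {
    "x": "Summarize the following article about solar energy...",
    "x_name": "prompt",
    "y": [
        "Solar panels convert sunlight into electricity...",
        "The article discusses photovoltaic technology and its costs...",
    ],
    "y_name": "summary",
    "probabilistic_selection": False,
}
```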

EvaluateTaskResults

Bases: BaseModel

When a TextEvolve/Evaluate task completes, this object is populated with the final results, which are intended to be presented back to the user.

Attributes

agent_Sc class-attribute instance-attribute
agent_Sc: Dict[str, float] = Field(
    ...,
    title="Agent Confidence",
    description="Agent confidence scores",
)
input_task class-attribute instance-attribute
input_task: EvaluateTaskInput = Field(
    ...,
    title="Input Task",
    description="The Evaluate input task",
)
last_cv class-attribute instance-attribute
last_cv: float = Field(
    ..., title="Last CV", description="The last CV value"
)
last_round class-attribute instance-attribute
last_round: int = Field(
    ...,
    title="Last Round",
    description="The last round number",
)
model_config class-attribute instance-attribute
model_config = ConfigDict(
    populate_by_name=True,
    use_enum_values=False,
    extra="forbid",
    strict=False,
    arbitrary_types_allowed=True,
)
scores_norm class-attribute instance-attribute
scores_norm: ndarray = Field(
    default_factory=lambda: array([]),
    title="Normalized Scores",
    description="The normalized (weighted) score vector of shape (j). Normalized scores have confidence applied.",
)
scores_norm_unweighted class-attribute instance-attribute
scores_norm_unweighted: ndarray = Field(
    default_factory=lambda: array([]),
    title="Unweighted Normalized Scores",
    description="The unweighted normalized score vector of shape (j). Unweighted scores do not have confidence applied.",
)
scores_softmax class-attribute instance-attribute
scores_softmax: ndarray = Field(
    default_factory=lambda: array([]),
    title="Softmax Scores",
    description="The softmax score vector of shape (j)",
)
trace_id class-attribute instance-attribute
trace_id: str = Field(
    ..., title="Trace ID", description="LLM trace_id"
)
y_debate class-attribute instance-attribute
y_debate: str = Field(
    default="",
    title="y Debate",
    description="The formatted debate transcript used to determine the final scores.",
)
y_index class-attribute instance-attribute
y_index: int = Field(
    default=-1,
    title="y Index",
    description="The index of the highest scoring response candidate",
)
y_response class-attribute instance-attribute
y_response: str = Field(
    default="",
    title="y Response",
    description="The raw text of the highest scoring response candidate",
)
y_score class-attribute instance-attribute
y_score: float = Field(
    default=-1.0,
    title="y Score",
    description="The score of the highest scoring response candidate. When probabilistic selection is enabled, this is the softmax score, otherwise it is the normalized score.",
)

Functions

__eq__
__eq__(other: Any) -> bool
serialize_scores_norm
serialize_scores_norm(value: ndarray) -> List[float]
serialize_scores_norm_unweighted
serialize_scores_norm_unweighted(
    value: ndarray,
) -> List[float]
serialize_scores_softmax
serialize_scores_softmax(value: ndarray) -> List[float]
validate_scores_norm classmethod
validate_scores_norm(
    value: Union[ndarray, List[float]]
) -> ndarray
validate_scores_norm_unweighted classmethod
validate_scores_norm_unweighted(
    value: Union[ndarray, List[float]]
) -> ndarray
validate_scores_softmax classmethod
validate_scores_softmax(
    value: Union[ndarray, List[float]]
) -> ndarray
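The relationship between `scores_norm`, `scores_softmax`, `y_index`, and `y_score` can be sketched with plain numpy. This is an illustration, not the engine's implementation: the score values are invented, and the selection logic is assumed from the field descriptions above (argmax over normalized scores, with the softmax score reported instead when probabilistic selection is enabled):

```python
import numpy as np

def softmax(scores: np.ndarray) -> np.ndarray:
    # Numerically stable softmax over the candidate axis.
    z = scores - scores.max()
    e = np.exp(z)
    return e / e.sum()

# Hypothetical weighted (confidence-applied) scores of shape (j,).
scores_norm = np.array([0.62, 0.71, 0.55])
scores_softmax = softmax(scores_norm)

# Deterministic selection: the highest-scoring candidate.
y_index = int(np.argmax(scores_norm))
y_score = float(scores_norm[y_index])
# With probabilistic_selection=True, y_score would instead be
# float(scores_softmax[y_index]).
```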

EvaluateTaskState

Bases: BaseModel

TextEvolve/Evaluate compute state data model used internally by the CheckMate engine. This object represents a single task / unit of work.

Attributes

S class-attribute instance-attribute
S: ndarray = Field(
    default_factory=lambda: array([]),
    title="S",
    description="The scoring tensor of shape (r x i x j x k)",
)
S_round class-attribute instance-attribute
S_round: ndarray = Field(
    default_factory=lambda: array([]),
    title="S, Current Round",
    description="The scoring tensor for the current round of shape (i x j x k)",
)
Sc class-attribute instance-attribute
Sc: ndarray = Field(
    default_factory=lambda: array([]),
    title="Sc",
    description="The confidence scoring tensor of shape (r x i x j)",
)
Sc_round class-attribute instance-attribute
Sc_round: ndarray = Field(
    default_factory=lambda: array([]),
    title="Sc, Current Round",
    description="The confidence scoring tensor for the current round of shape (i x j)",
)
debate_history class-attribute instance-attribute
debate_history: List[DebateHistoryEntry] = Field(
    default_factory=list,
    title="Debate History",
    description="The full Evaluate debate history",
)
evaluate_result class-attribute instance-attribute
evaluate_result: EvaluateTaskResults | None = Field(
    default=None,
    title="Evaluate Result",
    description="The final evaluation result for the task, populated when the evaluate operation completes successfully.",
)
exception class-attribute instance-attribute
exception: str | None = Field(
    default=None,
    title="Exception",
    description="The exception raised during evaluation on failure",
)
model_config class-attribute instance-attribute
model_config = ConfigDict(
    populate_by_name=True,
    use_enum_values=False,
    extra="forbid",
    strict=False,
    arbitrary_types_allowed=True,
)
status class-attribute instance-attribute
status: RecordStatus = Field(
    default=UNPROCESSED,
    title="Status",
    description="Evaluate processing status",
)
task class-attribute instance-attribute
task: EvaluateTaskInput = Field(
    ..., title="Task", description="The Evaluate input task"
)
trace_id class-attribute instance-attribute
trace_id: str | None = Field(
    default=None,
    title="Trace ID",
    description="LLM trace_id",
)

Functions

__eq__
__eq__(other: Any) -> bool
serialize_S
serialize_S(
    value: ndarray,
) -> List[List[List[List[float]]]]
serialize_S_round
serialize_S_round(
    value: ndarray,
) -> List[List[List[float]]]
serialize_Sc
serialize_Sc(value: ndarray) -> List[List[List[float]]]
serialize_Sc_round
serialize_Sc_round(value: ndarray) -> List[List[float]]
validate_S classmethod
validate_S(
    value: Union[ndarray, List[List[List[List[float]]]]]
) -> ndarray
validate_S_round classmethod
validate_S_round(
    value: Union[ndarray, List[List[List[float]]]]
) -> ndarray
validate_Sc classmethod
validate_Sc(
    value: Union[ndarray, List[List[List[float]]]]
) -> ndarray
validate_Sc_round classmethod
validate_Sc_round(
    value: Union[ndarray, List[List[float]]]
) -> ndarray
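The tensor shapes above follow one convention: `r` rounds, `i` agents, `j` candidates, and `k` scoring criteria. A sketch of how a per-round tensor would be stacked onto the accumulated history (the sizes are hypothetical, and the stacking step is an assumption about how the engine accumulates state):

```python
import numpy as np

# Hypothetical sizes: r rounds, i agents, j candidates, k criteria.
r, i, j, k = 2, 3, 4, 5
S = np.zeros((r, i, j, k))   # full scoring tensor (r x i x j x k)
Sc = np.zeros((r, i, j))     # confidence tensor (r x i x j)

# A new round produces S_round (i x j x k) and Sc_round (i x j),
# which would be stacked onto the history along the round axis.
S_round = np.ones((i, j, k))
Sc_round = np.ones((i, j))
S = np.concatenate([S, S_round[None, ...]], axis=0)
Sc = np.concatenate([Sc, Sc_round[None, ...]], axis=0)
```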

HardLimitRetryStrategy

Bases: BaseEvaluateTurnRetryStrategy

Retries turn up to a hard failure limit.

Attributes:

  • fail_count (int) –

    Number of Evaluate turn failures.

  • max_fail_count (int) –

    Maximum number of failures before a fatal error.

Attributes

fail_count class-attribute instance-attribute
fail_count: int = Field(
    default=0,
    title="Fail Count",
    description="Number of Evaluate turn failures",
)
max_fail_count class-attribute instance-attribute
max_fail_count: int = Field(
    default=3,
    title="Max Fail Count",
    description="Maximum number of failures before a fatal error",
)

Functions

do_retry
do_retry(
    profile: EvaluateProfile,
    xi: List[DebateHistoryEntry],
    i: Agent,
    round_num: int,
    m_i: GetMemoriesResponse | None = None,
) -> RetryAction

Retry the operation up to a hard limit.

Parameters:

  • profile (EvaluateProfile) –

    The Evaluate profile.

  • xi (List[DebateHistoryEntry]) –

    The debate history entries for the turn.

  • i (Agent) –

    The agent executing the turn.

  • round_num (int) –

    The round number.

  • m_i (GetMemoriesResponse | None) –

    Optional agent memories.

Returns:

  • RetryAction –

    The retry action to take.

RetryAction

Bases: Enum

Retry action enumeration.

Attributes:

  • RETRY (str) –

    Retry the operation.

  • STOP (str) –

    Stop the operation and use the current scores.

  • ERROR (str) –

    Stop the operation and (re)raise the triggering error.

Attributes

ERROR class-attribute instance-attribute
ERROR = 'error'
RETRY class-attribute instance-attribute
RETRY = 'retry'
STOP class-attribute instance-attribute
STOP = 'stop'
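Taken together, `HardLimitRetryStrategy` and `RetryAction` suggest a simple escalation loop: retry on each failure until the hard limit is hit, then escalate. A minimal sketch with the `do_retry` signature simplified (the real method also receives the profile, debate history, agent, round number, and memories, and the exact threshold comparison is an assumption):

```python
from enum import Enum

class RetryAction(str, Enum):
    # Values mirror the enum members documented above.
    RETRY = "retry"
    STOP = "stop"
    ERROR = "error"

class HardLimitRetryStrategy:
    # Hedged sketch: count failures and escalate to ERROR once
    # max_fail_count is reached.
    def __init__(self, max_fail_count: int = 3):
        self.fail_count = 0
        self.max_fail_count = max_fail_count

    def do_retry(self) -> RetryAction:
        self.fail_count += 1
        if self.fail_count >= self.max_fail_count:
            return RetryAction.ERROR
        return RetryAction.RETRY

strategy = HardLimitRetryStrategy(max_fail_count=3)
```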