import pytest
from eval_protocol import (
    evaluation_test,
    aha_judge,
    multi_turn_assistant_to_ground_truth,
    EvaluationRow,
    SingleTurnRolloutProcessor,
    DynamicDataLoader,
    create_braintrust_adapter,
)
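

# NOTE: create_braintrust_adapter() is assumed here to pick up Braintrust
# credentials from the environment (e.g. BRAINTRUST_API_KEY); export them
# before running this file.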
def braintrust_data_generator() -> list[EvaluationRow]:
    """Execute a BTQL query and convert the results to EvaluationRow objects."""
    adapter = create_braintrust_adapter()
    # Replace 'your_project_id' with the ID of the Braintrust project to pull logs from.
    btql_query = """
    select: *
    from: project_logs('your_project_id') traces
    limit: 50
    """
    return adapter.get_evaluation_rows(btql_query)
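
# A BTQL `filter:` clause can narrow which logs are pulled; the sketch below
# is illustrative (the date expression is an assumption, not taken from this
# project's docs):
#
#     select: *
#     from: project_logs('your_project_id') traces
#     filter: created > '2024-01-01'
#     limit: 50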


# Each dict below becomes a separate pytest parameter set, so the same
# Braintrust rows are re-rolled and judged once per model configuration.
@pytest.mark.parametrize(
    "completion_params",
    [
        {"model": "gpt-4.1"},
        {
            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
            "max_tokens": 131000,
            "extra_body": {"reasoning_effort": "medium"},
        },
        {
            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
            "max_tokens": 131000,
            "extra_body": {"reasoning_effort": "low"},
        },
    ],
)
@evaluation_test(
    # Build rows at collection time from the Braintrust query above.
    data_loaders=DynamicDataLoader(
        generators=[braintrust_data_generator],
    ),
    # Each row is rolled out as a single turn against the parametrized model.
    rollout_processor=SingleTurnRolloutProcessor(),
    # Per the helper's name: convert logged assistant turns into ground truth.
    preprocess_fn=multi_turn_assistant_to_ground_truth,
    max_concurrent_evaluations=2,
)
async def test_braintrust_data(row: EvaluationRow) -> EvaluationRow:
    # Score each rolled-out row with aha_judge and return the judged row.
    return await aha_judge(row)
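
# Run the evaluation across all three parameter sets (filename is illustrative):
#
#     pytest test_braintrust_data.py -v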