from eval_protocol.adapters import create_huggingface_adapter
from eval_protocol import evaluate_rows
from eval_protocol.rewards.accuracy import accuracy_reward
# Custom transformation for GSM8K
def custom_gsm8k_transform(row):
    """Convert one raw GSM8K record into an evaluation-row dictionary.

    Args:
        row: A record supporting string-key lookup that provides the
            'question' and 'answer' fields.

    Returns:
        A new dict with three keys:
          * 'messages': a fixed system prompt followed by a user message
            whose content is ``row['question']``.
          * 'ground_truth': ``row['answer']``, passed through unchanged.
          * 'metadata': static tags ('source' and 'difficulty').

    Raises:
        KeyError: If ``row`` lacks 'question' or 'answer'.

    NOTE(review): in the public GSM8K dataset the 'answer' field holds the
    worked solution followed by ``#### <final answer>``; confirm that the
    reward function consuming 'ground_truth' expects that form.
    """
    return {
        'messages': [
            {
                'role': 'system',
                'content': 'You are a math expert. Solve the following problem step by step.',
            },
            {'role': 'user', 'content': row['question']},
        ],
        'ground_truth': row['answer'],
        'metadata': {
            'source': 'gsm8k',
            'difficulty': 'challenging',
        },
    }
# Build the adapter for the "main" config of the "gsm8k" dataset, routing
# every record through the custom transform defined above.
adapter = create_huggingface_adapter(
    dataset_id="gsm8k",
    config_name="main",
    transform_fn=custom_gsm8k_transform,
)

# Materialize the evaluation rows requested from the "test" split (limit=20)
# so they can be handed to the evaluator as a list.
rows = list(adapter.get_evaluation_rows(split="test", limit=20))

# Score the rows with the accuracy reward function.
results = evaluate_rows(rows, accuracy_reward)

# Mean of the per-result scores; falls back to 0 when there are no results
# so the division never sees an empty collection.
if results:
    avg_score = sum(r.score for r in results) / len(results)
else:
    avg_score = 0
print(f"Average accuracy score: {avg_score:.2f}")