summaryrefslogtreecommitdiff
path: root/kernel/bpf/verifier.c
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@kernel.org>2025-03-03 17:40:14 -0800
committerAlexei Starovoitov <ast@kernel.org>2025-03-15 11:48:28 -0700
commit3a6fa573c50f31d6ab8c8c3318a68d511c79f8fb (patch)
treeb109c094280af113cda230fce761407b94cd9127 /kernel/bpf/verifier.c
parent2941e215376399d5e71eddcd720f185e28ba2dbb (diff)
parent2fb761823eadf2fdfb6fdf146c4b94807b4bc3ba (diff)
Merge branch 'timed-may_goto'
Kumar Kartikeya Dwivedi says: ==================== Timed may_goto This series replaces the current implementation of cond_break, which uses the may_goto instruction, and counts 8 million iterations per stack frame, with an implementation based on sampling time locally on the CPU. This is done to permit a longer time for a given loop per-program invocation. The accounting is still done per-stack frame, but the count is used to instead amortize the cost of the logic to sample and check the time spent since the start. This is needed for expressing more complicated algorithms (spin locks, waiting loops, etc.) in BPF programs without false positive expiration of the loop. For instance, the plan is to make use of this for implementing spin locks for BPF arena [0]. For the loop as follows: for (int i = 0;; i++) {} Testing on a bare-metal Sapphire Rapids Intel server yields the following table (taking an average of 25 runs). +-----------------------------+--------------+--------------+------------------+ | Loop type | Iterations | Time (ms) | Time/iter (ns) | +-----------------------------|--------------+--------------+------------------+ | may_goto | 8388608 | 3 | 0.36 | | timed_may_goto (count=65535)| 589674932 | 250 | 0.42 | | bpf_for | 8388608 | 10 | 1.19 | +-----------------------------+--------------+--------------+------------------+ Here, count is used to amortize the time sampling and checking logic. Obviously, this is the limit of an empty loop. Given the complexity of the loop body, the time spent in the loop can be longer. Cancellations will address the task of imposing an upper bound on program runtime. For now, the implementation only supports x86. [0]: https://lore.kernel.org/bpf/20250118162238.2621311-1-memxor@gmail.com Changelog: ---------- v1 -> v2 v1: https://lore.kernel.org/bpf/20250302201348.940234-1-memxor@gmail.com * Address comments from Alexei * Use kernel comment style for new code. * Remove p->count == 0 check in bpf_check_timed_may_goto. * Add comments on AX as argument/retval calling convention. * Add comments describing how the counting logic works. * Use BPF_EMIT_CALL instead of open-coding instruction encoding. * Change if ax != 1 goto pc+X condition to if ax != 0 goto pc+X. ==================== Link: https://patch.msgid.link/20250304003239.2390751-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'kernel/bpf/verifier.c')
-rw-r--r--kernel/bpf/verifier.c69
1 files changed, 61 insertions, 8 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 22c4edc8695c..4ec1d1aa25ea 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21572,7 +21572,50 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto next_insn;
}
- if (is_may_goto_insn(insn)) {
+ if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
+ int stack_off_cnt = -stack_depth - 16;
+
+ /*
+ * Two 8 byte slots, depth-16 stores the count, and
+ * depth-8 stores the start timestamp of the loop.
+ *
+ * The starting value of count is BPF_MAX_TIMED_LOOPS
+ * (0xffff). Every iteration loads it and subs it by 1,
+ * until the value becomes 0 in AX (thus, 1 in stack),
+ * after which we call arch_bpf_timed_may_goto, which
+ * either sets AX to 0xffff to keep looping, or to 0
+ * upon timeout. AX is then stored into the stack. In
+ * the next iteration, we either see 0 and break out, or
+ * continue iterating until the next time value is 0
+ * after subtraction, rinse and repeat.
+ */
+ stack_depth_extra = 16;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
+ /*
+ * AX is used as an argument to pass in stack_off_cnt
+ * (to add to r10/fp), and also as the return value of
+ * the call to arch_bpf_timed_may_goto.
+ */
+ insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
+ insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
+ insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
+ cnt = 7;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ } else if (is_may_goto_insn(insn)) {
int stack_off = -stack_depth - 8;
stack_depth_extra = 8;
@@ -22113,23 +22156,33 @@ next_insn:
env->prog->aux->stack_depth = subprogs[0].stack_depth;
for (i = 0; i < env->subprog_cnt; i++) {
+ int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
int subprog_start = subprogs[i].start;
int stack_slots = subprogs[i].stack_extra / 8;
+ int slots = delta, cnt = 0;
if (!stack_slots)
continue;
- if (stack_slots > 1) {
+ /* We need two slots in case timed may_goto is supported. */
+ if (stack_slots > slots) {
verbose(env, "verifier bug: stack_slots supports may_goto only\n");
return -EFAULT;
}
- /* Add ST insn to subprog prologue to init extra stack */
- insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
- -subprogs[i].stack_depth, BPF_MAX_LOOPS);
+ stack_depth = subprogs[i].stack_depth;
+ if (bpf_jit_supports_timed_may_goto()) {
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_TIMED_LOOPS);
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0);
+ } else {
+ /* Add ST insn to subprog prologue to init extra stack */
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_LOOPS);
+ }
/* Copy first actual insn to preserve it */
- insn_buf[1] = env->prog->insnsi[subprog_start];
+ insn_buf[cnt++] = env->prog->insnsi[subprog_start];
- new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
+ new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
env->prog = prog = new_prog;
@@ -22139,7 +22192,7 @@ next_insn:
* to insn after BPF_ST that inits may_goto count.
* Adjustment will succeed because bpf_patch_insn_data() didn't fail.
*/
- WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1));
+ WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
}
/* Since poke tab is now finalized, publish aux to tracker. */