diff options
256 files changed, 5509 insertions, 3535 deletions
diff --git a/arch/arm/boot/dts/da850-lcdk.dts b/arch/arm/boot/dts/da850-lcdk.dts index eed89e659143..a1f4d6d5a569 100644 --- a/arch/arm/boot/dts/da850-lcdk.dts +++ b/arch/arm/boot/dts/da850-lcdk.dts @@ -293,12 +293,12 @@ label = "u-boot env"; reg = <0 0x020000>; }; - partition@0x020000 { + partition@20000 { /* The LCDK defaults to booting from this partition */ label = "u-boot"; reg = <0x020000 0x080000>; }; - partition@0x0a0000 { + partition@a0000 { label = "free space"; reg = <0x0a0000 0>; }; diff --git a/arch/arm/boot/dts/kirkwood-openblocks_a7.dts b/arch/arm/boot/dts/kirkwood-openblocks_a7.dts index cf2f5240e176..27cc913ca0f5 100644 --- a/arch/arm/boot/dts/kirkwood-openblocks_a7.dts +++ b/arch/arm/boot/dts/kirkwood-openblocks_a7.dts @@ -53,7 +53,8 @@ }; pinctrl: pin-controller@10000 { - pinctrl-0 = <&pmx_dip_switches &pmx_gpio_header>; + pinctrl-0 = <&pmx_dip_switches &pmx_gpio_header + &pmx_gpio_header_gpo>; pinctrl-names = "default"; pmx_uart0: pmx-uart0 { @@ -85,11 +86,16 @@ * ground. */ pmx_gpio_header: pmx-gpio-header { - marvell,pins = "mpp17", "mpp7", "mpp29", "mpp28", + marvell,pins = "mpp17", "mpp29", "mpp28", "mpp35", "mpp34", "mpp40"; marvell,function = "gpio"; }; + pmx_gpio_header_gpo: pxm-gpio-header-gpo { + marvell,pins = "mpp7"; + marvell,function = "gpo"; + }; + pmx_gpio_init: pmx-init { marvell,pins = "mpp38"; marvell,function = "gpio"; diff --git a/arch/arm/boot/dts/sun4i-a10.dtsi b/arch/arm/boot/dts/sun4i-a10.dtsi index 5840f5c75c3b..4f2f2eea0755 100644 --- a/arch/arm/boot/dts/sun4i-a10.dtsi +++ b/arch/arm/boot/dts/sun4i-a10.dtsi @@ -1104,7 +1104,7 @@ be1_out_tcon0: endpoint@0 { reg = <0>; - remote-endpoint = <&tcon1_in_be0>; + remote-endpoint = <&tcon0_in_be1>; }; be1_out_tcon1: endpoint@1 { diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi index 59655e42e4b0..bd0cd3204273 100644 --- a/arch/arm/boot/dts/sun7i-a20.dtsi +++ b/arch/arm/boot/dts/sun7i-a20.dtsi @@ -1354,7 +1354,7 @@ be1_out_tcon0: endpoint@0 { reg = <0>; - remote-endpoint = <&tcon1_in_be0>; + remote-endpoint = <&tcon0_in_be1>; }; be1_out_tcon1: endpoint@1 { diff --git a/arch/arm/configs/sunxi_defconfig b/arch/arm/configs/sunxi_defconfig index 5caaf971fb50..df433abfcb02 100644 --- a/arch/arm/configs/sunxi_defconfig +++ b/arch/arm/configs/sunxi_defconfig @@ -10,6 +10,7 @@ CONFIG_SMP=y CONFIG_NR_CPUS=8 CONFIG_AEABI=y CONFIG_HIGHMEM=y +CONFIG_CMA=y CONFIG_ARM_APPENDED_DTB=y CONFIG_ARM_ATAG_DTB_COMPAT=y CONFIG_CPU_FREQ=y @@ -33,6 +34,7 @@ CONFIG_CAN_SUN4I=y # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y +CONFIG_DMA_CMA=y CONFIG_BLK_DEV_SD=y CONFIG_ATA=y CONFIG_AHCI_SUNXI=y diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 4425189bb24c..41e2feb0cf4f 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -25,16 +25,58 @@ #include "bpf_jit_32.h" -int bpf_jit_enable __read_mostly; +/* + * eBPF prog stack layout: + * + * high + * original ARM_SP => +-----+ + * | | callee saved registers + * +-----+ <= (BPF_FP + SCRATCH_SIZE) + * | ... | eBPF JIT scratch space + * eBPF fp register => +-----+ + * (BPF_FP) | ... | eBPF prog stack + * +-----+ + * |RSVD | JIT scratchpad + * current ARM_SP => +-----+ <= (BPF_FP - STACK_SIZE + SCRATCH_SIZE) + * | | + * | ... | Function call stack + * | | + * +-----+ + * low + * + * The callee saved registers depends on whether frame pointers are enabled. + * With frame pointers (to be compliant with the ABI): + * + * high + * original ARM_SP => +------------------+ \ + * | pc | | + * current ARM_FP => +------------------+ } callee saved registers + * |r4-r8,r10,fp,ip,lr| | + * +------------------+ / + * low + * + * Without frame pointers: + * + * high + * original ARM_SP => +------------------+ + * | r4-r8,r10,fp,lr | callee saved registers + * current ARM_FP => +------------------+ + * low + * + * When popping registers off the stack at the end of a BPF function, we + * reference them via the current ARM_FP register. + */ +#define CALLEE_MASK (1 << ARM_R4 | 1 << ARM_R5 | 1 << ARM_R6 | \ + 1 << ARM_R7 | 1 << ARM_R8 | 1 << ARM_R10 | \ + 1 << ARM_FP) +#define CALLEE_PUSH_MASK (CALLEE_MASK | 1 << ARM_LR) +#define CALLEE_POP_MASK (CALLEE_MASK | 1 << ARM_PC) #define STACK_OFFSET(k) (k) #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) /* TEMP Register 1 */ #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* TEMP Register 2 */ #define TCALL_CNT (MAX_BPF_JIT_REG + 2) /* Tail Call Count */ -/* Flags used for JIT optimization */ -#define SEEN_CALL (1 << 0) - #define FLAG_IMM_OVERFLOW (1 << 0) /* @@ -95,7 +137,6 @@ static const u8 bpf2a32[][2] = { * idx : index of current last JITed instruction. * prologue_bytes : bytes used in prologue. * epilogue_offset : offset of epilogue starting. - * seen : bit mask used for JIT optimization. * offsets : array of eBPF instruction offsets in * JITed code. * target : final JITed code. @@ -110,7 +151,6 @@ struct jit_ctx { unsigned int idx; unsigned int prologue_bytes; unsigned int epilogue_offset; - u32 seen; u32 flags; u32 *offsets; u32 *target; @@ -179,8 +219,13 @@ static void jit_fill_hole(void *area, unsigned int size) *ptr++ = __opcode_to_mem_arm(ARM_INST_UDF); } -/* Stack must be multiples of 16 Bytes */ -#define STACK_ALIGN(sz) (((sz) + 3) & ~3) +#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5) +/* EABI requires the stack to be aligned to 64-bit boundaries */ +#define STACK_ALIGNMENT 8 +#else +/* Stack must be aligned to 32-bit boundaries */ +#define STACK_ALIGNMENT 4 +#endif /* Stack space for BPF_REG_2, BPF_REG_3, BPF_REG_4, * BPF_REG_5, BPF_REG_7, BPF_REG_8, BPF_REG_9, @@ -194,7 +239,7 @@ static void jit_fill_hole(void *area, unsigned int size) + SCRATCH_SIZE + \ + 4 /* extra for skb_copy_bits buffer */) -#define STACK_SIZE STACK_ALIGN(_STACK_SIZE) +#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT) /* Get the offset of eBPF REGISTERs stored on scratch space. */ #define STACK_VAR(off) (STACK_SIZE-off-4) @@ -285,16 +330,19 @@ static inline void emit_mov_i(const u8 rd, u32 val, struct jit_ctx *ctx) emit_mov_i_no8m(rd, val, ctx); } -static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx) +static void emit_bx_r(u8 tgt_reg, struct jit_ctx *ctx) { - ctx->seen |= SEEN_CALL; -#if __LINUX_ARM_ARCH__ < 5 - emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx); - if (elf_hwcap & HWCAP_THUMB) emit(ARM_BX(tgt_reg), ctx); else emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx); +} + +static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx) +{ +#if __LINUX_ARM_ARCH__ < 5 + emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx); + emit_bx_r(tgt_reg, ctx); #else emit(ARM_BLX_R(tgt_reg), ctx); #endif @@ -354,7 +402,6 @@ static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) } /* Call appropriate function */ - ctx->seen |= SEEN_CALL; emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_udiv32 : (u32)jit_mod32, ctx); emit_blx_r(ARM_IP, ctx); @@ -620,8 +667,6 @@ static inline void emit_a32_lsh_r64(const u8 dst[], const u8 src[], bool dstk, /* Do LSH operation */ emit(ARM_SUB_I(ARM_IP, rt, 32), ctx); emit(ARM_RSB_I(tmp2[0], rt, 32), ctx); - /* As we are using ARM_LR */ - ctx->seen |= SEEN_CALL; emit(ARM_MOV_SR(ARM_LR, rm, SRTYPE_ASL, rt), ctx); emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd, SRTYPE_ASL, ARM_IP), ctx); emit(ARM_ORR_SR(ARM_IP, ARM_LR, rd, SRTYPE_LSR, tmp2[0]), ctx); @@ -656,8 +701,6 @@ static inline void emit_a32_arsh_r64(const u8 dst[], const u8 src[], bool dstk, /* Do the ARSH operation */ emit(ARM_RSB_I(ARM_IP, rt, 32), ctx); emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); - /* As we are using ARM_LR */ - ctx->seen |= SEEN_CALL; emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx); emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx); _emit(ARM_COND_MI, ARM_B(0), ctx); @@ -692,8 +735,6 @@ static inline void emit_a32_lsr_r64(const u8 dst[], const u8 src[], bool dstk, /* Do LSH operation */ emit(ARM_RSB_I(ARM_IP, rt, 32), ctx); emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); - /* As we are using ARM_LR */ - ctx->seen |= SEEN_CALL; emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx); emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx); emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_LSR, tmp2[0]), ctx); @@ -828,8 +869,6 @@ static inline void emit_a32_mul_r64(const u8 dst[], const u8 src[], bool dstk, /* Do Multiplication */ emit(ARM_MUL(ARM_IP, rd, rn), ctx); emit(ARM_MUL(ARM_LR, rm, rt), ctx); - /* As we are using ARM_LR */ - ctx->seen |= SEEN_CALL; emit(ARM_ADD_R(ARM_LR, ARM_IP, ARM_LR), ctx); emit(ARM_UMULL(ARM_IP, rm, rd, rt), ctx); @@ -872,33 +911,53 @@ static inline void emit_str_r(const u8 dst, const u8 src, bool dstk, } /* dst = *(size*)(src + off) */ -static inline void emit_ldx_r(const u8 dst, const u8 src, bool dstk, - const s32 off, struct jit_ctx *ctx, const u8 sz){ +static inline void emit_ldx_r(const u8 dst[], const u8 src, bool dstk, + s32 off, struct jit_ctx *ctx, const u8 sz){ const u8 *tmp = bpf2a32[TMP_REG_1]; - u8 rd = dstk ? tmp[1] : dst; + const u8 *rd = dstk ? tmp : dst; u8 rm = src; + s32 off_max; - if (off) { + if (sz == BPF_H) + off_max = 0xff; + else + off_max = 0xfff; + + if (off < 0 || off > off_max) { emit_a32_mov_i(tmp[0], off, false, ctx); emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); rm = tmp[0]; + off = 0; + } else if (rd[1] == rm) { + emit(ARM_MOV_R(tmp[0], rm), ctx); + rm = tmp[0]; } switch (sz) { - case BPF_W: - /* Load a Word */ - emit(ARM_LDR_I(rd, rm, 0), ctx); + case BPF_B: + /* Load a Byte */ + emit(ARM_LDRB_I(rd[1], rm, off), ctx); + emit_a32_mov_i(dst[0], 0, dstk, ctx); break; case BPF_H: /* Load a HalfWord */ - emit(ARM_LDRH_I(rd, rm, 0), ctx); + emit(ARM_LDRH_I(rd[1], rm, off), ctx); + emit_a32_mov_i(dst[0], 0, dstk, ctx); break; - case BPF_B: - /* Load a Byte */ - emit(ARM_LDRB_I(rd, rm, 0), ctx); + case BPF_W: + /* Load a Word */ + emit(ARM_LDR_I(rd[1], rm, off), ctx); + emit_a32_mov_i(dst[0], 0, dstk, ctx); + break; + case BPF_DW: + /* Load a Double Word */ + emit(ARM_LDR_I(rd[1], rm, off), ctx); + emit(ARM_LDR_I(rd[0], rm, off + 4), ctx); break; } if (dstk) - emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst)), ctx); + emit(ARM_STR_I(rd[1], ARM_SP, STACK_VAR(dst[1])), ctx); + if (dstk && sz == BPF_DW) + emit(ARM_STR_I(rd[0], ARM_SP, STACK_VAR(dst[0])), ctx); } /* Arithmatic Operation */ @@ -906,7 +965,6 @@ static inline void emit_ar_r(const u8 rd, const u8 rt, const u8 rm, const u8 rn, struct jit_ctx *ctx, u8 op) { switch (op) { case BPF_JSET: - ctx->seen |= SEEN_CALL; emit(ARM_AND_R(ARM_IP, rt, rn), ctx); emit(ARM_AND_R(ARM_LR, rd, rm), ctx); emit(ARM_ORRS_R(ARM_IP, ARM_LR, ARM_IP), ctx); @@ -945,7 +1003,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) const u8 *tcc = bpf2a32[TCALL_CNT]; const int idx0 = ctx->idx; #define cur_offset (ctx->idx - idx0) -#define jmp_offset (out_offset - (cur_offset)) +#define jmp_offset (out_offset - (cur_offset) - 2) u32 off, lo, hi; /* if (index >= array->map.max_entries) @@ -956,7 +1014,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) emit_a32_mov_i(tmp[1], off, false, ctx); emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r2[1])), ctx); emit(ARM_LDR_R(tmp[1], tmp2[1], tmp[1]), ctx); - /* index (64 bit) */ + /* index is 32-bit for arrays */ emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r3[1])), ctx); /* index >= array->map.max_entries */ emit(ARM_CMP_R(tmp2[1], tmp[1]), ctx); @@ -997,7 +1055,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) emit_a32_mov_i(tmp2[1], off, false, ctx); emit(ARM_LDR_R(tmp[1], tmp[1], tmp2[1]), ctx); emit(ARM_ADD_I(tmp[1], tmp[1], ctx->prologue_bytes), ctx); - emit(ARM_BX(tmp[1]), ctx); + emit_bx_r(tmp[1], ctx); /* out: */ if (out_offset == -1) @@ -1070,54 +1128,22 @@ static void build_prologue(struct jit_ctx *ctx) const u8 r2 = bpf2a32[BPF_REG_1][1]; const u8 r3 = bpf2a32[BPF_REG_1][0]; const u8 r4 = bpf2a32[BPF_REG_6][1]; - const u8 r5 = bpf2a32[BPF_REG_6][0]; - const u8 r6 = bpf2a32[TMP_REG_1][1]; - const u8 r7 = bpf2a32[TMP_REG_1][0]; - const u8 r8 = bpf2a32[TMP_REG_2][1]; - const u8 r10 = bpf2a32[TMP_REG_2][0]; const u8 fplo = bpf2a32[BPF_REG_FP][1]; const u8 fphi = bpf2a32[BPF_REG_FP][0]; - const u8 sp = ARM_SP; const u8 *tcc = bpf2a32[TCALL_CNT]; - u16 reg_set = 0; - - /* - * eBPF prog stack layout - * - * high - * original ARM_SP => +-----+ eBPF prologue - * |FP/LR| - * current ARM_FP => +-----+ - * | ... | callee saved registers - * eBPF fp register => +-----+ <= (BPF_FP) - * | ... | eBPF JIT scratch space - * | | eBPF prog stack - * +-----+ - * |RSVD | JIT scratchpad - * current A64_SP => +-----+ <= (BPF_FP - STACK_SIZE) - * | | - * | ... | Function call stack - * | | - * +-----+ - * low - */ - /* Save callee saved registers. */ - reg_set |= (1<<r4) | (1<<r5) | (1<<r6) | (1<<r7) | (1<<r8) | (1<<r10); #ifdef CONFIG_FRAME_POINTER - reg_set |= (1<<ARM_FP) | (1<<ARM_IP) | (1<<ARM_LR) | (1<<ARM_PC); - emit(ARM_MOV_R(ARM_IP, sp), ctx); + u16 reg_set = CALLEE_PUSH_MASK | 1 << ARM_IP | 1 << ARM_PC; + emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx); emit(ARM_PUSH(reg_set), ctx); emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx); #else - /* Check if call instruction exists in BPF body */ - if (ctx->seen & SEEN_CALL) - reg_set |= (1<<ARM_LR); - emit(ARM_PUSH(reg_set), ctx); + emit(ARM_PUSH(CALLEE_PUSH_MASK), ctx); + emit(ARM_MOV_R(ARM_FP, ARM_SP), ctx); #endif /* Save frame pointer for later */ - emit(ARM_SUB_I(ARM_IP, sp, SCRATCH_SIZE), ctx); + emit(ARM_SUB_I(ARM_IP, ARM_SP, SCRATCH_SIZE), ctx); ctx->stack_size = imm8m(STACK_SIZE); @@ -1140,33 +1166,19 @@ static void build_prologue(struct jit_ctx *ctx) /* end of prologue */ } +/* restore callee saved registers. */ static void build_epilogue(struct jit_ctx *ctx) { - const u8 r4 = bpf2a32[BPF_REG_6][1]; - const u8 r5 = bpf2a32[BPF_REG_6][0]; - const u8 r6 = bpf2a32[TMP_REG_1][1]; - const u8 r7 = bpf2a32[TMP_REG_1][0]; - const u8 r8 = bpf2a32[TMP_REG_2][1]; - const u8 r10 = bpf2a32[TMP_REG_2][0]; - u16 reg_set = 0; - - /* unwind function call stack */ - emit(ARM_ADD_I(ARM_SP, ARM_SP, ctx->stack_size), ctx); - - /* restore callee saved registers. */ - reg_set |= (1<<r4) | (1<<r5) | (1<<r6) | (1<<r7) | (1<<r8) | (1<<r10); #ifdef CONFIG_FRAME_POINTER - /* the first instruction of the prologue was: mov ip, sp */ - reg_set |= (1<<ARM_FP) | (1<<ARM_SP) | (1<<ARM_PC); + /* When using frame pointers, some additional registers need to + * be loaded. */ + u16 reg_set = CALLEE_POP_MASK | 1 << ARM_SP; + emit(ARM_SUB_I(ARM_SP, ARM_FP, hweight16(reg_set) * 4), ctx); emit(ARM_LDM(ARM_SP, reg_set), ctx); #else - if (ctx->seen & SEEN_CALL) - reg_set |= (1<<ARM_PC); /* Restore callee saved registers. */ - emit(ARM_POP(reg_set), ctx); - /* Return back to the callee function */ - if (!(ctx->seen & SEEN_CALL)) - emit(ARM_BX(ARM_LR), ctx); + emit(ARM_MOV_R(ARM_SP, ARM_FP), ctx); + emit(ARM_POP(CALLEE_POP_MASK), ctx); #endif } @@ -1394,8 +1406,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) emit_rev32(rt, rt, ctx); goto emit_bswap_uxt; case 64: - /* Because of the usage of ARM_LR */ - ctx->seen |= SEEN_CALL; emit_rev32(ARM_LR, rt, ctx); emit_rev32(rt, rd, ctx); emit(ARM_MOV_R(rd, ARM_LR), ctx); @@ -1448,22 +1458,7 @@ exit: rn = sstk ? tmp2[1] : src_lo; if (sstk) emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx); - switch (BPF_SIZE(code)) { - case BPF_W: - /* Load a Word */ - case BPF_H: - /* Load a Half-Word */ - case BPF_B: - /* Load a Byte */ - emit_ldx_r(dst_lo, rn, dstk, off, ctx, BPF_SIZE(code)); - emit_a32_mov_i(dst_hi, 0, dstk, ctx); - break; - case BPF_DW: - /* Load a double word */ - emit_ldx_r(dst_lo, rn, dstk, off, ctx, BPF_W); - emit_ldx_r(dst_hi, rn, dstk, off+4, ctx, BPF_W); - break; - } + emit_ldx_r(dst, rn, dstk, off, ctx, BPF_SIZE(code)); break; /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */ case BPF_LD | BPF_ABS | BPF_W: diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi index 7c9bdc7ab50b..9db19314c60c 100644 --- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi +++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi @@ -66,6 +66,7 @@ <&cpu1>, <&cpu2>, <&cpu3>; + interrupt-parent = <&intc>; }; psci { diff --git a/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi index e3b64d03fbd8..9c7724e82aff 100644 --- a/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi @@ -63,8 +63,10 @@ cpm_ethernet: ethernet@0 { compatible = "marvell,armada-7k-pp22"; reg = <0x0 0x100000>, <0x129000 0xb000>; - clocks = <&cpm_clk 1 3>, <&cpm_clk 1 9>, <&cpm_clk 1 5>; - clock-names = "pp_clk", "gop_clk", "mg_clk"; + clocks = <&cpm_clk 1 3>, <&cpm_clk 1 9>, + <&cpm_clk 1 5>, <&cpm_clk 1 18>; + clock-names = "pp_clk", "gop_clk", + "mg_clk","axi_clk"; marvell,system-controller = <&cpm_syscon0>; status = "disabled"; dma-coherent; @@ -155,7 +157,8 @@ #size-cells = <0>; compatible = "marvell,orion-mdio"; reg = <0x12a200 0x10>; - clocks = <&cpm_clk 1 9>, <&cpm_clk 1 5>; + clocks = <&cpm_clk 1 9>, <&cpm_clk 1 5>, + <&cpm_clk 1 6>, <&cpm_clk 1 18>; status = "disabled"; }; @@ -338,8 +341,8 @@ compatible = "marvell,armada-cp110-sdhci"; reg = <0x780000 0x300>; interrupts = <ICU_GRP_NSR 27 IRQ_TYPE_LEVEL_HIGH>; - clock-names = "core"; - clocks = <&cpm_clk 1 4>; + clock-names = "core","axi"; + clocks = <&cpm_clk 1 4>, <&cpm_clk 1 18>; dma-coherent; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi index 0d51096c69f8..87ac68b2cf37 100644 --- a/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi @@ -63,8 +63,10 @@ cps_ethernet: ethernet@0 { compatible = "marvell,armada-7k-pp22"; reg = <0x0 0x100000>, <0x129000 0xb000>; - clocks = <&cps_clk 1 3>, <&cps_clk 1 9>, <&cps_clk 1 5>; - clock-names = "pp_clk", "gop_clk", "mg_clk"; + clocks = <&cps_clk 1 3>, <&cps_clk 1 9>, + <&cps_clk 1 5>, <&cps_clk 1 18>; + clock-names = "pp_clk", "gop_clk", + "mg_clk", "axi_clk"; marvell,system-controller = <&cps_syscon0>; status = "disabled"; dma-coherent; @@ -155,7 +157,8 @@ #size-cells = <0>; compatible = "marvell,orion-mdio"; reg = <0x12a200 0x10>; - clocks = <&cps_clk 1 9>, <&cps_clk 1 5>; + clocks = <&cps_clk 1 9>, <&cps_clk 1 5>, + <&cps_clk 1 6>, <&cps_clk 1 18>; status = "disabled"; }; diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index acaa935ed977..0775d5ab8ee9 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -31,8 +31,6 @@ #include "bpf_jit.h" -int bpf_jit_enable __read_mostly; - #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) #define TCALL_CNT (MAX_BPF_JIT_REG + 2) @@ -162,7 +160,8 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) /* Stack must be multiples of 16B */ #define STACK_ALIGN(sz) (((sz) + 15) & ~15) -#define PROLOGUE_OFFSET 8 +/* Tail call offset to jump into */ +#define PROLOGUE_OFFSET 7 static int build_prologue(struct jit_ctx *ctx) { @@ -214,19 +213,19 @@ static int build_prologue(struct jit_ctx *ctx) /* Initialize tail_call_cnt */ emit(A64_MOVZ(1, tcc, 0, 0), ctx); - /* 4 byte extra for skb_copy_bits buffer */ - ctx->stack_size = prog->aux->stack_depth + 4; - ctx->stack_size = STACK_ALIGN(ctx->stack_size); - - /* Set up function call stack */ - emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); - cur_offset = ctx->idx - idx0; if (cur_offset != PROLOGUE_OFFSET) { pr_err_once("PROLOGUE_OFFSET = %d, expected %d!\n", cur_offset, PROLOGUE_OFFSET); return -1; } + + /* 4 byte extra for skb_copy_bits buffer */ + ctx->stack_size = prog->aux->stack_depth + 4; + ctx->stack_size = STACK_ALIGN(ctx->stack_size); + + /* Set up function call stack */ + emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); return 0; } @@ -274,11 +273,12 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) emit(A64_LDR64(prg, tmp, prg), ctx); emit(A64_CBZ(1, prg, jmp_offset), ctx); - /* goto *(prog->bpf_func + prologue_size); */ + /* goto *(prog->bpf_func + prologue_offset); */ off = offsetof(struct bpf_prog, bpf_func); emit_a64_mov_i64(tmp, off, ctx); emit(A64_LDR64(tmp, prg, tmp), ctx); emit(A64_ADD_I(1, tmp, tmp, sizeof(u32) * PROLOGUE_OFFSET), ctx); + emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); emit(A64_BR(tmp), ctx); /* out: */ diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 28e02c99be6d..762eeb0fcc1d 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -65,29 +65,30 @@ ia64_atomic_fetch_##op (int i, atomic_t *v) \ ATOMIC_OPS(add, +) ATOMIC_OPS(sub, -) -#define atomic_add_return(i,v) \ +#ifdef __OPTIMIZE__ +#define __ia64_atomic_const(i) __builtin_constant_p(i) ? \ + ((i) == 1 || (i) == 4 || (i) == 8 || (i) == 16 || \ + (i) == -1 || (i) == -4 || (i) == -8 || (i) == -16) : 0 + +#define atomic_add_return(i, v) \ ({ \ - int __ia64_aar_i = (i); \ - (__builtin_constant_p(i) \ - && ( (__ia64_aar_i == 1) || (__ia64_aar_i == 4) \ - || (__ia64_aar_i == 8) || (__ia64_aar_i == 16) \ - || (__ia64_aar_i == -1) || (__ia64_aar_i == -4) \ - || (__ia64_aar_i == -8) || (__ia64_aar_i == -16))) \ - ? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter) \ - : ia64_atomic_add(__ia64_aar_i, v); \ + int __i = (i); \ + static const int __ia64_atomic_p = __ia64_atomic_const(i); \ + __ia64_atomic_p ? ia64_fetch_and_add(__i, &(v)->counter) : \ + ia64_atomic_add(__i, v); \ }) -#define atomic_sub_return(i,v) \ +#define atomic_sub_return(i, v) \ ({ \ - int __ia64_asr_i = (i); \ - (__builtin_constant_p(i) \ - && ( (__ia64_asr_i == 1) || (__ia64_asr_i == 4) \ - || (__ia64_asr_i == 8) || (__ia64_asr_i == 16) \ - || (__ia64_asr_i == -1) || (__ia64_asr_i == -4) \ - || (__ia64_asr_i == -8) || (__ia64_asr_i == -16))) \ - ? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter) \ - : ia64_atomic_sub(__ia64_asr_i, v); \ + int __i = (i); \ + static const int __ia64_atomic_p = __ia64_atomic_const(i); \ + __ia64_atomic_p ? ia64_fetch_and_add(-__i, &(v)->counter) : \ + ia64_atomic_sub(__i, v); \ }) +#else +#define atomic_add_return(i, v) ia64_atomic_add(i, v) +#define atomic_sub_return(i, v) ia64_atomic_sub(i, v) +#endif #define atomic_fetch_add(i,v) \ ({ \ diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 44b925005dd3..4d8cb9bb8365 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1207,8 +1207,6 @@ jmp_cmp: return 0; } -int bpf_jit_enable __read_mostly; - void bpf_jit_compile(struct bpf_prog *fp) { struct jit_ctx ctx; diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 97069a1b6f43..4e347030ed2c 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -177,8 +177,6 @@ static u32 b_imm(unsigned int tgt, struct jit_ctx *ctx) (ctx->idx * 4) - 4; } -int bpf_jit_enable __read_mostly; - enum which_ebpf_reg { src_reg, src_reg_no_fp, diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c51e6ce42e7a..2ed525a44734 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -166,6 +166,7 @@ config PPC select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_VULNERABILITIES if PPC_BOOK3S_64 select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_SMP_IDLE_THREAD diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index f0461618bf7b..eca3f9c68907 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -353,6 +353,7 @@ #define PROC_TABLE_GTSE 0x01 #ifndef __ASSEMBLY__ +#include <linux/types.h> /** * plpar_hcall_norets: - Make a pseries hypervisor call with no return arguments diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 9d213542a48b..8fd3a70047f1 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -242,14 +242,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) unsigned short maj; unsigned short min; - /* We only show online cpus: disable preempt (overzealous, I - * knew) to prevent cpu going down. */ - preempt_disable(); - if (!cpu_online(cpu_id)) { - preempt_enable(); - return 0; - } - #ifdef CONFIG_SMP pvr = per_cpu(cpu_pvr, cpu_id); #else @@ -358,9 +350,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) #ifdef CONFIG_SMP seq_printf(m, "\n"); #endif - - preempt_enable(); - /* If this is the last cpu, print the summary */ if (cpumask_next(cpu_id, cpu_online_mask) >= nr_cpu_ids) show_cpuinfo_summary(m); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 491be4179ddd..e67413f4a8f0 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -38,6 +38,7 @@ #include <linux/memory.h> #include <linux/nmi.h> +#include <asm/debugfs.h> #include <asm/io.h> #include <asm/kdump.h> #include <asm/prom.h> @@ -901,4 +902,41 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) if (!no_rfi_flush) rfi_flush_enable(enable); } + +#ifdef CONFIG_DEBUG_FS +static int rfi_flush_set(void *data, u64 val) +{ + if (val == 1) + rfi_flush_enable(true); + else if (val == 0) + rfi_flush_enable(false); + else + return -EINVAL; + + return 0; +} + +static int rfi_flush_get(void *data, u64 *val) +{ + *val = rfi_flush ? 1 : 0; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n"); + +static __init int rfi_flush_debugfs_init(void) +{ + debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush); + return 0; +} +device_initcall(rfi_flush_debugfs_init); +#endif + +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (rfi_flush) + return sprintf(buf, "Mitigation: RFI Flush\n"); + + return sprintf(buf, "Vulnerable\n"); +} #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index f9941b3b5770..872d1f6dd11e 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -18,8 +18,6 @@ #include "bpf_jit32.h" -int bpf_jit_enable __read_mostly; - static inline void bpf_flush_icache(void *start, void *end) { smp_wmb(); diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 6771c63b2bec..217a78e84865 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -21,8 +21,6 @@ #include "bpf_jit64.h" -int bpf_jit_enable __read_mostly; - static void bpf_jit_fill_ill_insns(void *area, unsigned int size) { memset32(area, BREAKPOINT_INSTRUCTION, size/4); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index cab24f549e7c..0ddc7ac6c5f1 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2344,10 +2344,10 @@ static void dump_one_paca(int cpu) DUMP(p, kernel_toc, "lx"); DUMP(p, kernelbase, "lx"); DUMP(p, kernel_msr, "lx"); - DUMP(p, emergency_sp, "p"); + DUMP(p, emergency_sp, "px"); #ifdef CONFIG_PPC_BOOK3S_64 - DUMP(p, nmi_emergency_sp, "p"); - DUMP(p, mc_emergency_sp, "p"); + DUMP(p, nmi_emergency_sp, "px"); + DUMP(p, mc_emergency_sp, "px"); DUMP(p, in_nmi, "x"); DUMP(p, in_mce, "x"); DUMP(p, hmi_event_available, "x"); @@ -2375,17 +2375,21 @@ static void dump_one_paca(int cpu) DUMP(p, slb_cache_ptr, "x"); for (i = 0; i < SLB_CACHE_ENTRIES; i++) printf(" slb_cache[%d]: = 0x%016lx\n", i, p->slb_cache[i]); + + DUMP(p, rfi_flush_fallback_area, "px"); + DUMP(p, l1d_flush_congruence, "llx"); + DUMP(p, l1d_flush_sets, "llx"); #endif DUMP(p, dscr_default, "llx"); #ifdef CONFIG_PPC_BOOK3E - DUMP(p, pgd, "p"); - DUMP(p, kernel_pgd, "p"); - DUMP(p, tcd_ptr, "p"); - DUMP(p, mc_kstack, "p"); - DUMP(p, crit_kstack, "p"); - DUMP(p, dbg_kstack, "p"); + DUMP(p, pgd, "px"); + DUMP(p, kernel_pgd, "px"); + DUMP(p, tcd_ptr, "px"); + DUMP(p, mc_kstack, "px"); + DUMP(p, crit_kstack, "px"); + DUMP(p, dbg_kstack, "px"); #endif - DUMP(p, __current, "p"); + DUMP(p, __current, "px"); DUMP(p, kstack, "lx"); printf(" kstack_base = 0x%016lx\n", p->kstack & ~(THREAD_SIZE - 1)); DUMP(p, stab_rr, "lx"); @@ -2403,7 +2407,7 @@ static void dump_one_paca(int cpu) #endif #ifdef CONFIG_PPC_POWERNV - DUMP(p, core_idle_state_ptr, "p"); + DUMP(p, core_idle_state_ptr, "px"); DUMP(p, thread_idle_state, "x"); DUMP(p, thread_mask, "x"); DUMP(p, subcore_sibling_mask, "x"); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 1dfadbd126f3..e50188773ff3 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -28,8 +28,6 @@ #include <asm/set_memory.h> #include "bpf_jit.h" -int bpf_jit_enable __read_mostly; - struct bpf_jit { u32 seen; /* Flags to remember seen eBPF instructions */ u32 seen_reg[16]; /* Array to remember which registers are used */ diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c index 09e318eb34ee..3bd8ca95e521 100644 --- a/arch/sparc/net/bpf_jit_comp_32.c +++ b/arch/sparc/net/bpf_jit_comp_32.c @@ -11,8 +11,6 @@ #include "bpf_jit_32.h" -int bpf_jit_enable __read_mostly; - static inline bool is_simm13(unsigned int value) { return value + 0x1000 < 0x2000; diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 635fdefd4ae2..50a24d7bd4c5 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -12,8 +12,6 @@ #include "bpf_jit_64.h" -int bpf_jit_enable __read_mostly; - static inline bool is_simm13(unsigned int value) { return value + 0x1000 < 0x2000; diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a1f28a54f23a..60c4c342316c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -244,6 +244,17 @@ ENTRY(__switch_to_asm) movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset #endif +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ + FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +#endif + /* restore callee-saved registers */ popl %esi popl %edi diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4f8e1d35a97c..aa15b4c0e3d1 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -491,6 +491,17 @@ ENTRY(__switch_to_asm) movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset #endif +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ + FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +#endif + /* restore callee-saved registers */ popq %r15 popq %r14 diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 005908ee9333..a2efb490f743 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -755,14 +755,14 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsx_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsx_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_rapl_init), diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a9e57f08bfa6..98722773391d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -136,6 +136,7 @@ extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); extern void lapic_shutdown(void); extern void sync_Arb_IDs(void); +extern void init_bsp_APIC(void); extern void apic_intr_mode_init(void); extern void setup_local_APIC(void); extern void init_apic_mappings(void); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index f275447862f4..25b9375c1484 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -206,11 +206,11 @@ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ -#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -245,6 +245,7 @@ #define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +#define X86_FEATURE_INTEL_PT ( 9*32+25) /* Intel Processor Trace */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index c9459a4c3c68..22c5f3e6f820 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -39,7 +39,7 @@ void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); -void __init sme_encrypt_kernel(void); +void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_enable(struct boot_params *bp); int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); @@ -67,7 +67,7 @@ static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } -static inline void __init sme_encrypt_kernel(void) { } +static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } static inline bool sme_active(void) { return false; } diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 402a11c803c3..7b45d8424150 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -11,7 +11,7 @@ * Fill the CPU return stack buffer. * * Each entry in the RSB, if used for a speculative 'ret', contains an - * infinite 'pause; jmp' loop to capture speculative execution. + * infinite 'pause; lfence; jmp' loop to capture speculative execution. * * This is required in various cases for retpoline and IBRS-based * mitigations for the Spectre variant 2 vulnerability. Sometimes to @@ -38,11 +38,13 @@ call 772f; \ 773: /* speculation trap */ \ pause; \ + lfence; \ jmp 773b; \ 772: \ call 774f; \ 775: /* speculation trap */ \ pause; \ + lfence; \ jmp 775b; \ 774: \ dec reg; \ @@ -73,6 +75,7 @@ call .Ldo_rop_\@ .Lspec_trap_\@: pause + lfence jmp .Lspec_trap_\@ .Ldo_rop_\@: mov \reg, (%_ASM_SP) @@ -165,6 +168,7 @@ " .align 16\n" \ "901: call 903f;\n" \ "902: pause;\n" \ + " lfence;\n" \ " jmp 902b;\n" \ " .align 16\n" \ "903: addl $4, %%esp;\n" \ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 880441f24146..25ddf02598d2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1286,6 +1286,55 @@ static int __init apic_intr_mode_select(void) return APIC_SYMMETRIC_IO; } +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ + unsigned int value; + + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC)) + return; + + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + + /* + * Enable APIC. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + +#ifdef CONFIG_X86_32 + /* This bit is reserved on P4/Xeon and should be cleared */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) + value &= ~APIC_SPIV_FOCUS_DISABLED; + else +#endif + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write(APIC_SPIV, value); + + /* + * Set up the virtual wire mode. + */ + apic_write(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + if (apic_extnmi == APIC_EXTNMI_NONE) + value |= APIC_LVT_MASKED; + apic_write(APIC_LVT1, value); +} + /* Init the interrupt delivery mode for the BSP */ void __init apic_intr_mode_init(void) { diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index f8b03bb8e725..3cc471beb50b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -542,14 +542,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, err = assign_irq_vector_policy(irqd, info); trace_vector_setup(virq + i, false, err); - if (err) + if (err) { + irqd->chip_data = NULL; + free_apic_chip_data(apicd); goto error; + } } return 0; error: - x86_vector_free_irqs(domain, virq, i + 1); + x86_vector_free_irqs(domain, virq, i); return err; } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e4dc26185aa7..390b3dc3d438 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -23,6 +23,7 @@ #include <asm/alternative.h> #include <asm/pgtable.h> #include <asm/set_memory.h> +#include <asm/intel-family.h> static void __init spectre_v2_select_mitigation(void); @@ -155,6 +156,23 @@ disable: return SPECTRE_V2_CMD_NONE; } +/* Check for Skylake-like CPUs (for RSB handling) */ +static bool __init is_skylake_era(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6) { + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: + return true; + } + } + return false; +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -213,6 +231,24 @@ retpoline_auto: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); + + /* + * If neither SMEP or KPTI are available, there is a risk of + * hitting userspace addresses in the RSB after a context switch + * from a shallow call stack to a deeper one. To prevent this fill + * the entire RSB, even when using IBRS. + * + * Skylake era CPUs have a separate issue with *underflow* of the + * RSB, when they will predict 'ret' targets from the generic BTB. + * The proper mitigation for this is IBRS. If IBRS is not supported + * or deactivated in favour of retpolines the RSB fill on context + * switch is required. + */ + if ((!boot_cpu_has(X86_FEATURE_PTI) && + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Filling RSB on context switch\n"); + } } #undef pr_fmt diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 88dcf8479013..99442370de40 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -525,10 +525,6 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) */ if (static_branch_unlikely(&rdt_mon_enable_key)) rmdir_mondata_subdir_allrdtgrp(r, d->id); - kfree(d->ctrl_val); - kfree(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); list_del(&d->list); if (is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); @@ -545,6 +541,10 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cancel_delayed_work(&d->cqm_limbo); } + kfree(d->ctrl_val); + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); kfree(d); return; } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 05459ad3db46..d0e69769abfd 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -21,7 +21,6 @@ struct cpuid_bit { static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6a5d757b9cfd..7ba5d819ebe3 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -157,8 +157,8 @@ unsigned long __head __startup_64(unsigned long physaddr, p = fixup_pointer(&phys_base, physaddr); *p += load_delta - sme_get_me_mask(); - /* Encrypt the kernel (if SME is active) */ - sme_encrypt_kernel(); + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); /* * Return the SME encryption mask (if SME is active) to be used as a diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index d985cef3984f..56d99be3706a 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -56,7 +56,7 @@ struct idt_data { * Early traps running on the DEFAULT_STACK because the other interrupt * stacks work only after cpu_init(). */ -static const __initdata struct idt_data early_idts[] = { +static const __initconst struct idt_data early_idts[] = { INTG(X86_TRAP_DB, debug), SYSG(X86_TRAP_BP, int3), #ifdef CONFIG_X86_32 @@ -70,7 +70,7 @@ static const __initdata struct idt_data early_idts[] = { * the traps which use them are reinitialized with IST after cpu_init() has * set up TSS. */ -static const __initdata struct idt_data def_idts[] = { +static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DE, divide_error), INTG(X86_TRAP_NMI, nmi), INTG(X86_TRAP_BR, bounds), @@ -108,7 +108,7 @@ static const __initdata struct idt_data def_idts[] = { /* * The APIC and SMP idt entries */ -static const __initdata struct idt_data apic_idts[] = { +static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_SMP INTG(RESCHEDULE_VECTOR, reschedule_interrupt), INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), @@ -150,7 +150,7 @@ static const __initdata struct idt_data apic_idts[] = { * Early traps running on the DEFAULT_STACK because the other interrupt * stacks work only after cpu_init(). */ -static const __initdata struct idt_data early_pf_idts[] = { +static const __initconst struct idt_data early_pf_idts[] = { INTG(X86_TRAP_PF, page_fault), }; @@ -158,7 +158,7 @@ static const __initdata struct idt_data early_pf_idts[] = { * Override for the debug_idt. Same as the default, but with interrupt * stack set to DEFAULT_STACK (0). Required for NMI trap handling. */ -static const __initdata struct idt_data dbg_idts[] = { +static const __initconst struct idt_data dbg_idts[] = { INTG(X86_TRAP_DB, debug), INTG(X86_TRAP_BP, int3), }; @@ -180,7 +180,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * The exceptions which use Interrupt stacks. They are setup after * cpu_init() when the TSS has been initialized. */ -static const __initdata struct idt_data ist_idts[] = { +static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_DB, debug, DEBUG_STACK), ISTG(X86_TRAP_NMI, nmi, NMI_STACK), SISTG(X86_TRAP_BP, int3, DEBUG_STACK), diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 8da3e909e967..a539410c4ea9 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -61,6 +61,9 @@ void __init init_ISA_irqs(void) struct irq_chip *chip = legacy_pic->chip; int i; +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + init_bsp_APIC(); +#endif legacy_pic->init(0); for (i = 0; i < nr_legacy_irqs(); i++) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 145810b0edf6..68d7ab81c62f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -364,16 +364,6 @@ static void __init reserve_initrd(void) !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ - /* - * If SME is active, this memory will be marked encrypted by the - * kernel when it is accessed (including relocation). However, the - * ramdisk image was loaded decrypted by the bootloader, so make - * sure that it is encrypted before accessing it. For SEV the - * ramdisk will already be encrypted, so only do this for SME. - */ - if (sme_active()) - sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); - initrd_start = 0; mapped_size = memblock_mem_size(max_pfn_mapped); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 8ea117f8142e..e169e85db434 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -602,7 +602,6 @@ unsigned long native_calibrate_tsc(void) case INTEL_FAM6_KABYLAKE_DESKTOP: crystal_khz = 24000; /* 24.0 MHz */ break; - case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_ATOM_DENVERTON: crystal_khz = 25000; /* 25.0 MHz */ break; @@ -612,6 +611,8 @@ unsigned long native_calibrate_tsc(void) } } + if (crystal_khz == 0) + return 0; /* * TSC frequency determined by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. This @@ -1315,6 +1316,12 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); + if (cpu_khz != tsc_khz) { + pr_info("Detected %lu.%03lu MHz TSC", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + } + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 06fe3d51d385..b3e40773dce0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -172,14 +172,15 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ -static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) +static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info, + u32 *pkey) { /* This is effectively an #ifdef */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return; /* Fault not from Protection Keys: nothing to do */ - if (si_code != SEGV_PKUERR) + if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV)) return; /* * force_sig_info_fault() is called from a number of @@ -218,7 +219,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; - fill_sig_info_pkey(si_code, &info, pkey); + fill_sig_info_pkey(si_signo, si_code, &info, pkey); force_sig_info(si_signo, &info, tsk); } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 47388f0c0e59..af6f2f9c6a26 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -21,10 +21,14 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -static __init void *early_alloc(size_t size, int nid) +static __init void *early_alloc(size_t size, int nid, bool panic) { - return memblock_virt_alloc_try_nid_nopanic(size, size, - __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); + if (panic) + return memblock_virt_alloc_try_nid(size, size, + __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); + else + return memblock_virt_alloc_try_nid_nopanic(size, size, + __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); } static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, @@ -38,14 +42,14 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, if (boot_cpu_has(X86_FEATURE_PSE) && ((end - addr) == PMD_SIZE) && IS_ALIGNED(addr, PMD_SIZE)) { - p = early_alloc(PMD_SIZE, nid); + p = early_alloc(PMD_SIZE, nid, false); if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) return; else if (p) memblock_free(__pa(p), PMD_SIZE); } - p = early_alloc(PAGE_SIZE, nid); + p = early_alloc(PAGE_SIZE, nid, true); pmd_populate_kernel(&init_mm, pmd, p); } @@ -57,7 +61,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, if (!pte_none(*pte)) continue; - p = early_alloc(PAGE_SIZE, nid); + p = early_alloc(PAGE_SIZE, nid, true); entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); set_pte_at(&init_mm, addr, pte, entry); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -75,14 +79,14 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, if (boot_cpu_has(X86_FEATURE_GBPAGES) && ((end - addr) == PUD_SIZE) && IS_ALIGNED(addr, PUD_SIZE)) { - p = early_alloc(PUD_SIZE, nid); + p = early_alloc(PUD_SIZE, nid, false); if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) return; else if (p) memblock_free(__pa(p), PUD_SIZE); } - p = early_alloc(PAGE_SIZE, nid); + p = early_alloc(PAGE_SIZE, nid, true); pud_populate(&init_mm, pud, p); } @@ -101,7 +105,7 @@ static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, unsigned long next; if (p4d_none(*p4d)) { - void *p = early_alloc(PAGE_SIZE, nid); + void *p = early_alloc(PAGE_SIZE, nid, true); p4d_populate(&init_mm, p4d, p); } @@ -122,7 +126,7 @@ static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, unsigned long next; if (pgd_none(*pgd)) { - p = early_alloc(PAGE_SIZE, nid); + p = early_alloc(PAGE_SIZE, nid, true); pgd_populate(&init_mm, pgd, p); } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 391b13402e40..3ef362f598e3 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -464,37 +464,62 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); } -static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start, - unsigned long end) +struct sme_populate_pgd_data { + void *pgtable_area; + pgd_t *pgd; + + pmdval_t pmd_flags; + pteval_t pte_flags; + unsigned long paddr; + + unsigned long vaddr; + unsigned long vaddr_end; +}; + +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) { unsigned long pgd_start, pgd_end, pgd_size; pgd_t *pgd_p; - pgd_start = start & PGDIR_MASK; - pgd_end = end & PGDIR_MASK; + pgd_start = ppd->vaddr & PGDIR_MASK; + pgd_end = ppd->vaddr_end & PGDIR_MASK; - pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1); - pgd_size *= sizeof(pgd_t); + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); - pgd_p = pgd_base + pgd_index(start); + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); memset(pgd_p, 0, pgd_size); } -#define PGD_FLAGS _KERNPG_TABLE_NOENC -#define P4D_FLAGS _KERNPG_TABLE_NOENC -#define PUD_FLAGS _KERNPG_TABLE_NOENC -#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS _KERNPG_TABLE_NOENC + +#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +#define PMD_FLAGS_DEC PMD_FLAGS_LARGE +#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) + +#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) + +#define PTE_FLAGS_DEC PTE_FLAGS +#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) -static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, - unsigned long vaddr, pmdval_t pmd_val) +static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) { pgd_t *pgd_p; p4d_t *p4d_p; pud_t *pud_p; pmd_t *pmd_p; - pgd_p = pgd_base + pgd_index(vaddr); + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); if (native_pgd_val(*pgd_p)) { if (IS_ENABLED(CONFIG_X86_5LEVEL)) p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); @@ -504,15 +529,15 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, pgd_t pgd; if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p = pgtable_area; + p4d_p = ppd->pgtable_area; memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); - pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; + ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); } else { - pud_p = pgtable_area; + pud_p = ppd->pgtable_area; memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); } @@ -520,58 +545,160 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, } if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p += p4d_index(vaddr); + p4d_p += p4d_index(ppd->vaddr); if (native_p4d_val(*p4d_p)) { pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); } else { p4d_t p4d; - pud_p = pgtable_area; + pud_p = ppd->pgtable_area; memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); native_set_p4d(p4d_p, p4d); } } - pud_p += pud_index(vaddr); + pud_p += pud_index(ppd->vaddr); if (native_pud_val(*pud_p)) { if (native_pud_val(*pud_p) & _PAGE_PSE) - goto out; + return NULL; pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); } else { pud_t pud; - pmd_p = pgtable_area; + pmd_p = ppd->pgtable_area; memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); - pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; + ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); native_set_pud(pud_p, pud); } - pmd_p += pmd_index(vaddr); + return pmd_p; +} + +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +{ + pmd_t *pmd_p; + + pmd_p = sme_prepare_pgd(ppd); + if (!pmd_p) + return; + + pmd_p += pmd_index(ppd->vaddr); if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) - native_set_pmd(pmd_p, native_make_pmd(pmd_val)); + native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags)); +} -out: - return pgtable_area; +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) +{ + pmd_t *pmd_p; + pte_t *pte_p; + + pmd_p = sme_prepare_pgd(ppd); + if (!pmd_p) + return; + + pmd_p += pmd_index(ppd->vaddr); + if (native_pmd_val(*pmd_p)) { + if (native_pmd_val(*pmd_p) & _PAGE_PSE) + return; + + pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK); + } else { + pmd_t pmd; + + pte_p = ppd->pgtable_area; + memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE); + ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE; + + pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS); + native_set_pmd(pmd_p, pmd); + } + + pte_p += pte_index(ppd->vaddr); + if (!native_pte_val(*pte_p)) + native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags)); +} + +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd_large(ppd); + + ppd->vaddr += PMD_PAGE_SIZE; + ppd->paddr += PMD_PAGE_SIZE; + } +} + +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd(ppd); + + ppd->vaddr += PAGE_SIZE; + ppd->paddr += PAGE_SIZE; + } +} + +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, + pmdval_t pmd_flags, pteval_t pte_flags) +{ + unsigned long vaddr_end; + + ppd->pmd_flags = pmd_flags; + ppd->pte_flags = pte_flags; + + /* Save original end value since we modify the struct value */ + vaddr_end = ppd->vaddr_end; + + /* If start is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); + __sme_map_range_pte(ppd); + + /* Create PMD entries */ + ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; + __sme_map_range_pmd(ppd); + + /* If end is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = vaddr_end; + __sme_map_range_pte(ppd); +} + +static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); +} + +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); +} + +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); } static unsigned long __init sme_pgtable_calc(unsigned long len) { - unsigned long p4d_size, pud_size, pmd_size; + unsigned long p4d_size, pud_size, pmd_size, pte_size; unsigned long total; /* * Perform a relatively simplistic calculation of the pagetable - * entries that are needed. That mappings will be covered by 2MB - * PMD entries so we can conservatively calculate the required + * entries that are needed. Those mappings will be covered mostly + * by 2MB PMD entries so we can conservatively calculate the required * number of P4D, PUD and PMD structures needed to perform the - * mappings. Incrementing the count for each covers the case where - * the addresses cross entries. + * mappings. For mappings that are not 2MB aligned, PTE mappings + * would be needed for the start and end portion of the address range + * that fall outside of the 2MB alignment. This results in, at most, + * two extra pages to hold PTE entries for each range that is mapped. + * Incrementing the count for each covers the case where the addresses + * cross entries. */ if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; @@ -585,8 +712,9 @@ static unsigned long __init sme_pgtable_calc(unsigned long len) } pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE; - total = p4d_size + pud_size + pmd_size; + total = p4d_size + pud_size + pmd_size + pte_size; /* * Now calculate the added pagetable structures needed to populate @@ -610,29 +738,29 @@ static unsigned long __init sme_pgtable_calc(unsigned long len) return total; } -void __init sme_encrypt_kernel(void) +void __init sme_encrypt_kernel(struct boot_params *bp) { unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; unsigned long kernel_start, kernel_end, kernel_len; + unsigned long initrd_start, initrd_end, initrd_len; + struct sme_populate_pgd_data ppd; unsigned long pgtable_area_len; - unsigned long paddr, pmd_flags; unsigned long decrypted_base; - void *pgtable_area; - pgd_t *pgd; if (!sme_active()) return; /* - * Prepare for encrypting the kernel by building new pagetables with - * the necessary attributes needed to encrypt the kernel in place. + * Prepare for encrypting the kernel and initrd by building new + * pagetables with the necessary attributes needed to encrypt the + * kernel in place. * * One range of virtual addresses will map the memory occupied - * by the kernel as encrypted. + * by the kernel and initrd as encrypted. * * Another range of virtual addresses will map the memory occupied - * by the kernel as decrypted and write-protected. + * by the kernel and initrd as decrypted and write-protected. * * The use of write-protect attribute will prevent any of the * memory from being cached. @@ -643,6 +771,20 @@ void __init sme_encrypt_kernel(void) kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); kernel_len = kernel_end - kernel_start; + initrd_start = 0; + initrd_end = 0; + initrd_len = 0; +#ifdef CONFIG_BLK_DEV_INITRD + initrd_len = (unsigned long)bp->hdr.ramdisk_size | + ((unsigned long)bp->ext_ramdisk_size << 32); + if (initrd_len) { + initrd_start = (unsigned long)bp->hdr.ramdisk_image | + ((unsigned long)bp->ext_ramdisk_image << 32); + initrd_end = PAGE_ALIGN(initrd_start + initrd_len); + initrd_len = initrd_end - initrd_start; + } +#endif + /* Set the encryption workarea to be immediately after the kernel */ workarea_start = kernel_end; @@ -665,16 +807,21 @@ void __init sme_encrypt_kernel(void) */ pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + if (initrd_len) + pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; /* PUDs and PMDs needed in the current pagetables for the workarea */ pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); /* * The total workarea includes the executable encryption area and - * the pagetable area. + * the pagetable area. The start of the workarea is already 2MB + * aligned, align the end of the workarea on a 2MB boundary so that + * we don't try to create/allocate PTE entries from the workarea + * before it is mapped. */ workarea_len = execute_len + pgtable_area_len; - workarea_end = workarea_start + workarea_len; + workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); /* * Set the address to the start of where newly created pagetable @@ -683,45 +830,30 @@ void __init sme_encrypt_kernel(void) * pagetables and when the new encrypted and decrypted kernel * mappings are populated. */ - pgtable_area = (void *)execute_end; + ppd.pgtable_area = (void *)execute_end; /* * Make sure the current pagetable structure has entries for * addressing the workarea. */ - pgd = (pgd_t *)native_read_cr3_pa(); - paddr = workarea_start; - while (paddr < workarea_end) { - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr, - paddr + PMD_FLAGS); - - paddr += PMD_PAGE_SIZE; - } + ppd.pgd = (pgd_t *)native_read_cr3_pa(); + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); /* Flush the TLB - no globals so cr3 is enough */ native_write_cr3(__native_read_cr3()); /* * A new pagetable structure is being built to allow for the kernel - * to be encrypted. It starts with an empty PGD that will then be - * populated with new PUDs and PMDs as the encrypted and decrypted - * kernel mappings are created. + * and initrd to be encrypted. It starts with an empty PGD that will + * then be populated with new PUDs and PMDs as the encrypted and + * decrypted kernel mappings are created. */ - pgd = pgtable_area; - memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD); - pgtable_area += sizeof(*pgd) * PTRS_PER_PGD; - - /* Add encrypted kernel (identity) mappings */ - pmd_flags = PMD_FLAGS | _PAGE_ENC; - paddr = kernel_start; - while (paddr < kernel_end) { - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr, - paddr + pmd_flags); - - paddr += PMD_PAGE_SIZE; - } + ppd.pgd = ppd.pgtable_area; + memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); + ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; /* * A different PGD index/entry must be used to get different @@ -730,47 +862,79 @@ void __init sme_encrypt_kernel(void) * the base of the mapping. */ decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + if (initrd_len) { + unsigned long check_base; + + check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base = max(decrypted_base, check_base); + } decrypted_base <<= PGDIR_SHIFT; + /* Add encrypted kernel (identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start; + ppd.vaddr_end = kernel_end; + sme_map_range_encrypted(&ppd); + /* Add decrypted, write-protected kernel (non-identity) mappings */ - pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT); - paddr = kernel_start; - while (paddr < kernel_end) { - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr + decrypted_base, - paddr + pmd_flags); - - paddr += PMD_PAGE_SIZE; + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + + if (initrd_len) { + /* Add encrypted initrd (identity) mappings */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start; + ppd.vaddr_end = initrd_end; + sme_map_range_encrypted(&ppd); + /* + * Add decrypted, write-protected initrd (non-identity) mappings + */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); } /* Add decrypted workarea mappings to both kernel mappings */ - paddr = workarea_start; - while (paddr < workarea_end) { - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr, - paddr + PMD_FLAGS); + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr + decrypted_base, - paddr + PMD_FLAGS); - - paddr += PMD_PAGE_SIZE; - } + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_map_range_decrypted(&ppd); /* Perform the encryption */ sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, - kernel_len, workarea_start, (unsigned long)pgd); + kernel_len, workarea_start, (unsigned long)ppd.pgd); + + if (initrd_len) + sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, + initrd_len, workarea_start, + (unsigned long)ppd.pgd); /* * At this point we are running encrypted. Remove the mappings for * the decrypted areas - all that is needed for this is to remove * the PGD entry/entries. */ - sme_clear_pgd(pgd, kernel_start + decrypted_base, - kernel_end + decrypted_base); + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_clear_pgd(&ppd); + + if (initrd_len) { + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_clear_pgd(&ppd); + } - sme_clear_pgd(pgd, workarea_start + decrypted_base, - workarea_end + decrypted_base); + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_clear_pgd(&ppd); /* Flush the TLB - no globals so cr3 is enough */ native_write_cr3(__native_read_cr3()); diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 730e6d541df1..01f682cf77a8 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -22,9 +22,9 @@ ENTRY(sme_encrypt_execute) /* * Entry parameters: - * RDI - virtual address for the encrypted kernel mapping - * RSI - virtual address for the decrypted kernel mapping - * RDX - length of kernel + * RDI - virtual address for the encrypted mapping + * RSI - virtual address for the decrypted mapping + * RDX - length to encrypt * RCX - virtual address of the encryption workarea, including: * - stack page (PAGE_SIZE) * - encryption routine page (PAGE_SIZE) @@ -41,9 +41,9 @@ ENTRY(sme_encrypt_execute) addq $PAGE_SIZE, %rax /* Workarea encryption routine */ push %r12 - movq %rdi, %r10 /* Encrypted kernel */ - movq %rsi, %r11 /* Decrypted kernel */ - movq %rdx, %r12 /* Kernel length */ + movq %rdi, %r10 /* Encrypted area */ + movq %rsi, %r11 /* Decrypted area */ + movq %rdx, %r12 /* Area length */ /* Copy encryption routine into the workarea */ movq %rax, %rdi /* Workarea encryption routine */ @@ -52,10 +52,10 @@ ENTRY(sme_encrypt_execute) rep movsb /* Setup registers for call */ - movq %r10, %rdi /* Encrypted kernel */ - movq %r11, %rsi /* Decrypted kernel */ + movq %r10, %rdi /* Encrypted area */ + movq %r11, %rsi /* Decrypted area */ movq %r8, %rdx /* Pagetables used for encryption */ - movq %r12, %rcx /* Kernel length */ + movq %r12, %rcx /* Area length */ movq %rax, %r8 /* Workarea encryption routine */ addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ @@ -71,7 +71,7 @@ ENDPROC(sme_encrypt_execute) ENTRY(__enc_copy) /* - * Routine used to encrypt kernel. + * Routine used to encrypt memory in place. * This routine must be run outside of the kernel proper since * the kernel will be encrypted during the process. So this * routine is defined here and then copied to an area outside @@ -79,19 +79,19 @@ ENTRY(__enc_copy) * during execution. * * On entry the registers must be: - * RDI - virtual address for the encrypted kernel mapping - * RSI - virtual address for the decrypted kernel mapping + * RDI - virtual address for the encrypted mapping + * RSI - virtual address for the decrypted mapping * RDX - address of the pagetables to use for encryption - * RCX - length of kernel + * RCX - length of area * R8 - intermediate copy buffer * * RAX - points to this routine * - * The kernel will be encrypted by copying from the non-encrypted - * kernel space to an intermediate buffer and then copying from the - * intermediate buffer back to the encrypted kernel space. The physical - * addresses of the two kernel space mappings are the same which - * results in the kernel being encrypted "in place". + * The area will be encrypted by copying from the non-encrypted + * memory space to an intermediate buffer and then copying from the + * intermediate buffer back to the encrypted memory space. The physical + * addresses of the two mappings are the same which results in the area + * being encrypted "in place". */ /* Enable the new page tables */ mov %rdx, %cr3 @@ -103,47 +103,55 @@ ENTRY(__enc_copy) orq $X86_CR4_PGE, %rdx mov %rdx, %cr4 + push %r15 + push %r12 + + movq %rcx, %r9 /* Save area length */ + movq %rdi, %r10 /* Save encrypted area address */ + movq %rsi, %r11 /* Save decrypted area address */ + /* Set the PAT register PA5 entry to write-protect */ - push %rcx movl $MSR_IA32_CR_PAT, %ecx rdmsr - push %rdx /* Save original PAT value */ + mov %rdx, %r15 /* Save original PAT value */ andl $0xffff00ff, %edx /* Clear PA5 */ orl $0x00000500, %edx /* Set PA5 to WP */ wrmsr - pop %rdx /* RDX contains original PAT value */ - pop %rcx - - movq %rcx, %r9 /* Save kernel length */ - movq %rdi, %r10 /* Save encrypted kernel address */ - movq %rsi, %r11 /* Save decrypted kernel address */ wbinvd /* Invalidate any cache entries */ - /* Copy/encrypt 2MB at a time */ + /* Copy/encrypt up to 2MB at a time */ + movq $PMD_PAGE_SIZE, %r12 1: - movq %r11, %rsi /* Source - decrypted kernel */ + cmpq %r12, %r9 + jnb 2f + movq %r9, %r12 + +2: + movq %r11, %rsi /* Source - decrypted area */ movq %r8, %rdi /* Dest - intermediate copy buffer */ - movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + movq %r12, %rcx rep movsb movq %r8, %rsi /* Source - intermediate copy buffer */ - movq %r10, %rdi /* Dest - encrypted kernel */ - movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + movq %r10, %rdi /* Dest - encrypted area */ + movq %r12, %rcx rep movsb - addq $PMD_PAGE_SIZE, %r11 - addq $PMD_PAGE_SIZE, %r10 - subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */ + addq %r12, %r11 + addq %r12, %r10 + subq %r12, %r9 /* Kernel length decrement */ jnz 1b /* Kernel length not zero? */ /* Restore PAT register */ - push %rdx /* Save original PAT value */ movl $MSR_IA32_CR_PAT, %ecx rdmsr - pop %rdx /* Restore original PAT value */ + mov %r15, %rdx /* Restore original PAT value */ wrmsr + pop %r12 + pop %r15 + ret .L__enc_copy_end: ENDPROC(__enc_copy) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 87f214fbe66e..5acee5139e28 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -15,8 +15,6 @@ #include <asm/set_memory.h> #include <linux/bpf.h> -int bpf_jit_enable __read_mostly; - /* * assembly code in arch/x86/net/bpf_jit.S */ @@ -154,6 +152,11 @@ static bool is_ereg(u32 reg) BIT(BPF_REG_AX)); } +static bool is_axreg(u32 reg) +{ + return reg == BPF_REG_0; +} + /* add modifiers if 'reg' maps to x64 registers r8..r15 */ static u8 add_1mod(u8 byte, u32 reg) { @@ -447,16 +450,36 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, else if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); + /* b3 holds 'normal' opcode, b2 short form only valid + * in case dst is eax/rax. + */ switch (BPF_OP(insn->code)) { - case BPF_ADD: b3 = 0xC0; break; - case BPF_SUB: b3 = 0xE8; break; - case BPF_AND: b3 = 0xE0; break; - case BPF_OR: b3 = 0xC8; break; - case BPF_XOR: b3 = 0xF0; break; + case BPF_ADD: + b3 = 0xC0; + b2 = 0x05; + break; + case BPF_SUB: + b3 = 0xE8; + b2 = 0x2D; + break; + case BPF_AND: + b3 = 0xE0; + b2 = 0x25; + break; + case BPF_OR: + b3 = 0xC8; + b2 = 0x0D; + break; + case BPF_XOR: + b3 = 0xF0; + b2 = 0x35; + break; } if (is_imm8(imm32)) EMIT3(0x83, add_1reg(b3, dst_reg), imm32); + else if (is_axreg(dst_reg)) + EMIT1_off32(b2, imm32); else EMIT2_off32(0x81, add_1reg(b3, dst_reg), imm32); break; diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 8193b38a1cae..3c09122bf038 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4449,6 +4449,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { * https://bugzilla.kernel.org/show_bug.cgi?id=121671 */ { "LITEON CX1-JB*-HP", NULL, ATA_HORKAGE_MAX_SEC_1024 }, + { "LITEON EP1-*", NULL, ATA_HORKAGE_MAX_SEC_1024 }, /* Devices we expect to fail diagnostics */ diff --git a/drivers/bcma/Kconfig b/drivers/bcma/Kconfig index 02d78f6cecbb..ba8acca036df 100644 --- a/drivers/bcma/Kconfig +++ b/drivers/bcma/Kconfig @@ -55,7 +55,7 @@ config BCMA_DRIVER_PCI config BCMA_DRIVER_PCI_HOSTMODE bool "Driver for PCI core working in hostmode" - depends on MIPS && BCMA_DRIVER_PCI + depends on MIPS && BCMA_DRIVER_PCI && PCI_DRIVERS_LEGACY help PCI core hostmode operation (external PCI bus). diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index f9042bcc27a4..7b14d6280e44 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -152,14 +152,13 @@ static int bgpio_get_set_multiple(struct gpio_chip *gc, unsigned long *mask, { unsigned long get_mask = 0; unsigned long set_mask = 0; - int bit = 0; - while ((bit = find_next_bit(mask, gc->ngpio, bit)) != gc->ngpio) { - if (gc->bgpio_dir & BIT(bit)) - set_mask |= BIT(bit); - else - get_mask |= BIT(bit); - } + /* Make sure we first clear any bits that are zero when we read the register */ + *bits &= ~*mask; + + /* Exploit the fact that we know which directions are set */ + set_mask = *mask & gc->bgpio_dir; + get_mask = *mask & ~gc->bgpio_dir; if (set_mask) *bits |= gc->read_reg(gc->reg_set) & set_mask; @@ -176,13 +175,13 @@ static int bgpio_get(struct gpio_chip *gc, unsigned int gpio) /* * This only works if the bits in the GPIO register are in native endianness. - * It is dirt simple and fast in this case. (Also the most common case.) */ static int bgpio_get_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - - *bits = gc->read_reg(gc->reg_dat) & *mask; + /* Make sure we first clear any bits that are zero when we read the register */ + *bits &= ~*mask; + *bits |= gc->read_reg(gc->reg_dat) & *mask; return 0; } @@ -196,9 +195,12 @@ static int bgpio_get_multiple_be(struct gpio_chip *gc, unsigned long *mask, unsigned long val; int bit; + /* Make sure we first clear any bits that are zero when we read the register */ + *bits &= ~*mask; + /* Create a mirrored mask */ - bit = 0; - while ((bit = find_next_bit(mask, gc->ngpio, bit)) != gc->ngpio) + bit = -1; + while ((bit = find_next_bit(mask, gc->ngpio, bit + 1)) < gc->ngpio) readmask |= bgpio_line2mask(gc, bit); /* Read the register */ @@ -208,8 +210,8 @@ static int bgpio_get_multiple_be(struct gpio_chip *gc, unsigned long *mask, * Mirror the result into the "bits" result, this will give line 0 * in bit 0 ... line 31 in bit 31 for a 32bit register. */ - bit = 0; - while ((bit = find_next_bit(&val, gc->ngpio, bit)) != gc->ngpio) + bit = -1; + while ((bit = find_next_bit(&val, gc->ngpio, bit + 1)) < gc->ngpio) *bits |= bgpio_line2mask(gc, bit); return 0; diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 123585eeb87d..50f8443641b8 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -1211,23 +1211,6 @@ void assert_panel_unlocked(struct drm_i915_private *dev_priv, enum pipe pipe) pipe_name(pipe)); } -static void assert_cursor(struct drm_i915_private *dev_priv, - enum pipe pipe, bool state) -{ - bool cur_state; - - if (IS_I845G(dev_priv) || IS_I865G(dev_priv)) - cur_state = I915_READ(CURCNTR(PIPE_A)) & CURSOR_ENABLE; - else - cur_state = I915_READ(CURCNTR(pipe)) & CURSOR_MODE; - - I915_STATE_WARN(cur_state != state, - "cursor on pipe %c assertion failure (expected %s, current %s)\n", - pipe_name(pipe), onoff(state), onoff(cur_state)); -} -#define assert_cursor_enabled(d, p) assert_cursor(d, p, true) -#define assert_cursor_disabled(d, p) assert_cursor(d, p, false) - void assert_pipe(struct drm_i915_private *dev_priv, enum pipe pipe, bool state) { @@ -1255,77 +1238,25 @@ void assert_pipe(struct drm_i915_private *dev_priv, pipe_name(pipe), onoff(state), onoff(cur_state)); } -static void assert_plane(struct drm_i915_private *dev_priv, - enum plane plane, bool state) +static void assert_plane(struct intel_plane *plane, bool state) { - u32 val; - bool cur_state; + bool cur_state = plane->get_hw_state(plane); - val = I915_READ(DSPCNTR(plane)); - cur_state = !!(val & DISPLAY_PLANE_ENABLE); I915_STATE_WARN(cur_state != state, - "plane %c assertion failure (expected %s, current %s)\n", - plane_name(plane), onoff(state), onoff(cur_state)); + "%s assertion failure (expected %s, current %s)\n", + plane->base.name, onoff(state), onoff(cur_state)); } -#define assert_plane_enabled(d, p) assert_plane(d, p, true) -#define assert_plane_disabled(d, p) assert_plane(d, p, false) - -static void assert_planes_disabled(struct drm_i915_private *dev_priv, - enum pipe pipe) -{ - int i; - - /* Primary planes are fixed to pipes on gen4+ */ - if (INTEL_GEN(dev_priv) >= 4) { - u32 val = I915_READ(DSPCNTR(pipe)); - I915_STATE_WARN(val & DISPLAY_PLANE_ENABLE, - "plane %c assertion failure, should be disabled but not\n", - plane_name(pipe)); - return; - } +#define assert_plane_enabled(p) assert_plane(p, true) +#define assert_plane_disabled(p) assert_plane(p, false) - /* Need to check both planes against the pipe */ - for_each_pipe(dev_priv, i) { - u32 val = I915_READ(DSPCNTR(i)); - enum pipe cur_pipe = (val & DISPPLANE_SEL_PIPE_MASK) >> - DISPPLANE_SEL_PIPE_SHIFT; - I915_STATE_WARN((val & DISPLAY_PLANE_ENABLE) && pipe == cur_pipe, - "plane %c assertion failure, should be off on pipe %c but is still active\n", - plane_name(i), pipe_name(pipe)); - } -} - -static void assert_sprites_disabled(struct drm_i915_private *dev_priv, - enum pipe pipe) +static void assert_planes_disabled(struct intel_crtc *crtc) { - int sprite; + struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); + struct intel_plane *plane; - if (INTEL_GEN(dev_priv) >= 9) { - for_each_sprite(dev_priv, pipe, sprite) { - u32 val = I915_READ(PLANE_CTL(pipe, sprite)); - I915_STATE_WARN(val & PLANE_CTL_ENABLE, - "plane %d assertion failure, should be off on pipe %c but is still active\n", - sprite, pipe_name(pipe)); - } - } else if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) { - for_each_sprite(dev_priv, pipe, sprite) { - u32 val = I915_READ(SPCNTR(pipe, PLANE_SPRITE0 + sprite)); - I915_STATE_WARN(val & SP_ENABLE, - "sprite %c assertion failure, should be off on pipe %c but is still active\n", - sprite_name(pipe, sprite), pipe_name(pipe)); - } - } else if (INTEL_GEN(dev_priv) >= 7) { - u32 val = I915_READ(SPRCTL(pipe)); - I915_STATE_WARN(val & SPRITE_ENABLE, - "sprite %c assertion failure, should be off on pipe %c but is still active\n", - plane_name(pipe), pipe_name(pipe)); - } else if (INTEL_GEN(dev_priv) >= 5 || IS_G4X(dev_priv)) { - u32 val = I915_READ(DVSCNTR(pipe)); - I915_STATE_WARN(val & DVS_ENABLE, - "sprite %c assertion failure, should be off on pipe %c but is still active\n", - plane_name(pipe), pipe_name(pipe)); - } + for_each_intel_plane_on_crtc(&dev_priv->drm, crtc, plane) + assert_plane_disabled(plane); } static void assert_vblank_disabled(struct drm_crtc *crtc) @@ -1918,9 +1849,7 @@ static void intel_enable_pipe(struct intel_crtc *crtc) DRM_DEBUG_KMS("enabling pipe %c\n", pipe_name(pipe)); - assert_planes_disabled(dev_priv, pipe); - assert_cursor_disabled(dev_priv, pipe); - assert_sprites_disabled(dev_priv, pipe); + assert_planes_disabled(crtc); /* * A pipe without a PLL won't actually be able to drive bits from @@ -1989,9 +1918,7 @@ static void intel_disable_pipe(struct intel_crtc *crtc) * Make sure planes won't keep trying to pump pixels to us, * or we might hang the display. */ - assert_planes_disabled(dev_priv, pipe); - assert_cursor_disabled(dev_priv, pipe); - assert_sprites_disabled(dev_priv, pipe); + assert_planes_disabled(crtc); reg = PIPECONF(cpu_transcoder); val = I915_READ(reg); @@ -2820,6 +2747,23 @@ intel_set_plane_visible(struct intel_crtc_state *crtc_state, crtc_state->active_planes); } +static void intel_plane_disable_noatomic(struct intel_crtc *crtc, + struct intel_plane *plane) +{ + struct intel_crtc_state *crtc_state = + to_intel_crtc_state(crtc->base.state); + struct intel_plane_state *plane_state = + to_intel_plane_state(plane->base.state); + + intel_set_plane_visible(crtc_state, plane_state, false); + + if (plane->id == PLANE_PRIMARY) + intel_pre_disable_primary_noatomic(&crtc->base); + + trace_intel_disable_plane(&plane->base, crtc); + plane->disable_plane(plane, crtc); +} + static void intel_find_initial_plane_obj(struct intel_crtc *intel_crtc, struct intel_initial_plane_config *plane_config) @@ -2877,12 +2821,7 @@ intel_find_initial_plane_obj(struct intel_crtc *intel_crtc, * simplest solution is to just disable the primary plane now and * pretend the BIOS never had it enabled. */ - intel_set_plane_visible(to_intel_crtc_state(crtc_state), - to_intel_plane_state(plane_state), - false); - intel_pre_disable_primary_noatomic(&intel_crtc->base); - trace_intel_disable_plane(primary, intel_crtc); - intel_plane->disable_plane(intel_plane, intel_crtc); + intel_plane_disable_noatomic(intel_crtc, intel_plane); return; @@ -3385,6 +3324,31 @@ static void i9xx_disable_primary_plane(struct intel_plane *primary, spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); } +static bool i9xx_plane_get_hw_state(struct intel_plane *primary) +{ + + struct drm_i915_private *dev_priv = to_i915(primary->base.dev); + enum intel_display_power_domain power_domain; + enum plane plane = primary->plane; + enum pipe pipe = primary->pipe; + bool ret; + + /* + * Not 100% correct for planes that can move between pipes, + * but that's only the case for gen2-4 which don't have any + * display power wells. + */ + power_domain = POWER_DOMAIN_PIPE(pipe); + if (!intel_display_power_get_if_enabled(dev_priv, power_domain)) + return false; + + ret = I915_READ(DSPCNTR(plane)) & DISPLAY_PLANE_ENABLE; + + intel_display_power_put(dev_priv, power_domain); + + return ret; +} + static u32 intel_fb_stride_alignment(const struct drm_framebuffer *fb, int plane) { @@ -4866,7 +4830,8 @@ void hsw_enable_ips(struct intel_crtc *crtc) * a vblank wait. */ - assert_plane_enabled(dev_priv, crtc->plane); + assert_plane_enabled(to_intel_plane(crtc->base.primary)); + if (IS_BROADWELL(dev_priv)) { mutex_lock(&dev_priv->pcu_lock); WARN_ON(sandybridge_pcode_write(dev_priv, DISPLAY_IPS_CONTROL, @@ -4899,7 +4864,8 @@ void hsw_disable_ips(struct intel_crtc *crtc) if (!crtc->config->ips_enabled) return; - assert_plane_enabled(dev_priv, crtc->plane); + assert_plane_enabled(to_intel_plane(crtc->base.primary)); + if (IS_BROADWELL(dev_priv)) { mutex_lock(&dev_priv->pcu_lock); WARN_ON(sandybridge_pcode_write(dev_priv, DISPLAY_IPS_CONTROL, 0)); @@ -5899,6 +5865,7 @@ static void intel_crtc_disable_noatomic(struct drm_crtc *crtc, struct intel_crtc *intel_crtc = to_intel_crtc(crtc); struct drm_i915_private *dev_priv = to_i915(crtc->dev); enum intel_display_power_domain domain; + struct intel_plane *plane; u64 domains; struct drm_atomic_state *state; struct intel_crtc_state *crtc_state; @@ -5907,11 +5874,12 @@ static void intel_crtc_disable_noatomic(struct drm_crtc *crtc, if (!intel_crtc->active) return; - if (crtc->primary->state->visible) { - intel_pre_disable_primary_noatomic(crtc); + for_each_intel_plane_on_crtc(&dev_priv->drm, intel_crtc, plane) { + const struct intel_plane_state *plane_state = + to_intel_plane_state(plane->base.state); - intel_crtc_disable_planes(crtc, 1 << drm_plane_index(crtc->primary)); - crtc->primary->state->visible = false; + if (plane_state->base.visible) + intel_plane_disable_noatomic(intel_crtc, plane); } state = drm_atomic_state_alloc(crtc->dev); @@ -9477,6 +9445,23 @@ static void i845_disable_cursor(struct intel_plane *plane, i845_update_cursor(plane, NULL, NULL); } +static bool i845_cursor_get_hw_state(struct intel_plane *plane) +{ + struct drm_i915_private *dev_priv = to_i915(plane->base.dev); + enum intel_display_power_domain power_domain; + bool ret; + + power_domain = POWER_DOMAIN_PIPE(PIPE_A); + if (!intel_display_power_get_if_enabled(dev_priv, power_domain)) + return false; + + ret = I915_READ(CURCNTR(PIPE_A)) & CURSOR_ENABLE; + + intel_display_power_put(dev_priv, power_domain); + + return ret; +} + static u32 i9xx_cursor_ctl(const struct intel_crtc_state *crtc_state, const struct intel_plane_state *plane_state) { @@ -9670,6 +9655,28 @@ static void i9xx_disable_cursor(struct intel_plane *plane, i9xx_update_cursor(plane, NULL, NULL); } +static bool i9xx_cursor_get_hw_state(struct intel_plane *plane) +{ + struct drm_i915_private *dev_priv = to_i915(plane->base.dev); + enum intel_display_power_domain power_domain; + enum pipe pipe = plane->pipe; + bool ret; + + /* + * Not 100% correct for planes that can move between pipes, + * but that's only the case for gen2-3 which don't have any + * display power wells. + */ + power_domain = POWER_DOMAIN_PIPE(pipe); + if (!intel_display_power_get_if_enabled(dev_priv, power_domain)) + return false; + + ret = I915_READ(CURCNTR(pipe)) & CURSOR_MODE; + + intel_display_power_put(dev_priv, power_domain); + + return ret; +} /* VESA 640x480x72Hz mode to set on the pipe */ static const struct drm_display_mode load_detect_mode = { @@ -13205,6 +13212,7 @@ intel_primary_plane_create(struct drm_i915_private *dev_priv, enum pipe pipe) primary->update_plane = skl_update_plane; primary->disable_plane = skl_disable_plane; + primary->get_hw_state = skl_plane_get_hw_state; } else if (INTEL_GEN(dev_priv) >= 9) { intel_primary_formats = skl_primary_formats; num_formats = ARRAY_SIZE(skl_primary_formats); @@ -13215,6 +13223,7 @@ intel_primary_plane_create(struct drm_i915_private *dev_priv, enum pipe pipe) primary->update_plane = skl_update_plane; primary->disable_plane = skl_disable_plane; + primary->get_hw_state = skl_plane_get_hw_state; } else if (INTEL_GEN(dev_priv) >= 4) { intel_primary_formats = i965_primary_formats; num_formats = ARRAY_SIZE(i965_primary_formats); @@ -13222,6 +13231,7 @@ intel_primary_plane_create(struct drm_i915_private *dev_priv, enum pipe pipe) primary->update_plane = i9xx_update_primary_plane; primary->disable_plane = i9xx_disable_primary_plane; + primary->get_hw_state = i9xx_plane_get_hw_state; } else { intel_primary_formats = i8xx_primary_formats; num_formats = ARRAY_SIZE(i8xx_primary_formats); @@ -13229,6 +13239,7 @@ intel_primary_plane_create(struct drm_i915_private *dev_priv, enum pipe pipe) primary->update_plane = i9xx_update_primary_plane; primary->disable_plane = i9xx_disable_primary_plane; + primary->get_hw_state = i9xx_plane_get_hw_state; } if (INTEL_GEN(dev_priv) >= 9) @@ -13318,10 +13329,12 @@ intel_cursor_plane_create(struct drm_i915_private *dev_priv, if (IS_I845G(dev_priv) || IS_I865G(dev_priv)) { cursor->update_plane = i845_update_cursor; cursor->disable_plane = i845_disable_cursor; + cursor->get_hw_state = i845_cursor_get_hw_state; cursor->check_plane = i845_check_cursor; } else { cursor->update_plane = i9xx_update_cursor; cursor->disable_plane = i9xx_disable_cursor; + cursor->get_hw_state = i9xx_cursor_get_hw_state; cursor->check_plane = i9xx_check_cursor; } @@ -14671,8 +14684,11 @@ void i830_disable_pipe(struct drm_i915_private *dev_priv, enum pipe pipe) DRM_DEBUG_KMS("disabling pipe %c due to force quirk\n", pipe_name(pipe)); - assert_plane_disabled(dev_priv, PLANE_A); - assert_plane_disabled(dev_priv, PLANE_B); + WARN_ON(I915_READ(DSPCNTR(PLANE_A)) & DISPLAY_PLANE_ENABLE); + WARN_ON(I915_READ(DSPCNTR(PLANE_B)) & DISPLAY_PLANE_ENABLE); + WARN_ON(I915_READ(DSPCNTR(PLANE_C)) & DISPLAY_PLANE_ENABLE); + WARN_ON(I915_READ(CURCNTR(PIPE_A)) & CURSOR_MODE); + WARN_ON(I915_READ(CURCNTR(PIPE_B)) & CURSOR_MODE); I915_WRITE(PIPECONF(pipe), 0); POSTING_READ(PIPECONF(pipe)); @@ -14683,22 +14699,36 @@ void i830_disable_pipe(struct drm_i915_private *dev_priv, enum pipe pipe) POSTING_READ(DPLL(pipe)); } -static bool -intel_check_plane_mapping(struct intel_crtc *crtc) +static bool intel_plane_mapping_ok(struct intel_crtc *crtc, + struct intel_plane *primary) { struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); - u32 val; + enum plane plane = primary->plane; + u32 val = I915_READ(DSPCNTR(plane)); - if (INTEL_INFO(dev_priv)->num_pipes == 1) - return true; + return (val & DISPLAY_PLANE_ENABLE) == 0 || + (val & DISPPLANE_SEL_PIPE_MASK) == DISPPLANE_SEL_PIPE(crtc->pipe); +} - val = I915_READ(DSPCNTR(!crtc->plane)); +static void +intel_sanitize_plane_mapping(struct drm_i915_private *dev_priv) +{ + struct intel_crtc *crtc; - if ((val & DISPLAY_PLANE_ENABLE) && - (!!(val & DISPPLANE_SEL_PIPE_MASK) == crtc->pipe)) - return false; + if (INTEL_GEN(dev_priv) >= 4) + return; - return true; + for_each_intel_crtc(&dev_priv->drm, crtc) { + struct intel_plane *plane = + to_intel_plane(crtc->base.primary); + + if (intel_plane_mapping_ok(crtc, plane)) + continue; + + DRM_DEBUG_KMS("%s attached to the wrong pipe, disabling plane\n", + plane->base.name); + intel_plane_disable_noatomic(crtc, plane); + } } static bool intel_crtc_has_encoders(struct intel_crtc *crtc) @@ -14754,33 +14784,15 @@ static void intel_sanitize_crtc(struct intel_crtc *crtc, /* Disable everything but the primary plane */ for_each_intel_plane_on_crtc(dev, crtc, plane) { - if (plane->base.type == DRM_PLANE_TYPE_PRIMARY) - continue; + const struct intel_plane_state *plane_state = + to_intel_plane_state(plane->base.state); - trace_intel_disable_plane(&plane->base, crtc); - plane->disable_plane(plane, crtc); + if (plane_state->base.visible && + plane->base.type != DRM_PLANE_TYPE_PRIMARY) + intel_plane_disable_noatomic(crtc, plane); } } - /* We need to sanitize the plane -> pipe mapping first because this will - * disable the crtc (and hence change the state) if it is wrong. Note - * that gen4+ has a fixed plane -> pipe mapping. */ - if (INTEL_GEN(dev_priv) < 4 && !intel_check_plane_mapping(crtc)) { - bool plane; - - DRM_DEBUG_KMS("[CRTC:%d:%s] wrong plane connection detected!\n", - crtc->base.base.id, crtc->base.name); - - /* Pipe has the wrong plane attached and the plane is active. - * Temporarily change the plane mapping and disable everything - * ... */ - plane = crtc->plane; - crtc->base.primary->state->visible = true; - crtc->plane = !plane; - intel_crtc_disable_noatomic(&crtc->base, ctx); - crtc->plane = plane; - } - /* Adjust the state of the output pipe according to whether we * have active connectors/encoders. */ if (crtc->active && !intel_crtc_has_encoders(crtc)) @@ -14885,24 +14897,21 @@ void i915_redisable_vga(struct drm_i915_private *dev_priv) intel_display_power_put(dev_priv, POWER_DOMAIN_VGA); } -static bool primary_get_hw_state(struct intel_plane *plane) -{ - struct drm_i915_private *dev_priv = to_i915(plane->base.dev); - - return I915_READ(DSPCNTR(plane->plane)) & DISPLAY_PLANE_ENABLE; -} - /* FIXME read out full plane state for all planes */ static void readout_plane_state(struct intel_crtc *crtc) { - struct intel_plane *primary = to_intel_plane(crtc->base.primary); - bool visible; + struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); + struct intel_crtc_state *crtc_state = + to_intel_crtc_state(crtc->base.state); + struct intel_plane *plane; - visible = crtc->active && primary_get_hw_state(primary); + for_each_intel_plane_on_crtc(&dev_priv->drm, crtc, plane) { + struct intel_plane_state *plane_state = + to_intel_plane_state(plane->base.state); + bool visible = plane->get_hw_state(plane); - intel_set_plane_visible(to_intel_crtc_state(crtc->base.state), - to_intel_plane_state(primary->base.state), - visible); + intel_set_plane_visible(crtc_state, plane_state, visible); + } } static void intel_modeset_readout_hw_state(struct drm_device *dev) @@ -15100,6 +15109,8 @@ intel_modeset_setup_hw_state(struct drm_device *dev, /* HW state is read out, now we need to sanitize this mess. */ get_encoder_power_domains(dev_priv); + intel_sanitize_plane_mapping(dev_priv); + for_each_intel_encoder(dev, encoder) { intel_sanitize_encoder(encoder); } diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 6c7f8bca574e..5d77f75a9f9c 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -862,6 +862,7 @@ struct intel_plane { const struct intel_plane_state *plane_state); void (*disable_plane)(struct intel_plane *plane, struct intel_crtc *crtc); + bool (*get_hw_state)(struct intel_plane *plane); int (*check_plane)(struct intel_plane *plane, struct intel_crtc_state *crtc_state, struct intel_plane_state *state); @@ -1924,6 +1925,7 @@ void skl_update_plane(struct intel_plane *plane, const struct intel_crtc_state *crtc_state, const struct intel_plane_state *plane_state); void skl_disable_plane(struct intel_plane *plane, struct intel_crtc *crtc); +bool skl_plane_get_hw_state(struct intel_plane *plane); /* intel_tv.c */ void intel_tv_init(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c index 4fcf80ca91dd..4a8a5d918a83 100644 --- a/drivers/gpu/drm/i915/intel_sprite.c +++ b/drivers/gpu/drm/i915/intel_sprite.c @@ -329,6 +329,26 @@ skl_disable_plane(struct intel_plane *plane, struct intel_crtc *crtc) spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); } +bool +skl_plane_get_hw_state(struct intel_plane *plane) +{ + struct drm_i915_private *dev_priv = to_i915(plane->base.dev); + enum intel_display_power_domain power_domain; + enum plane_id plane_id = plane->id; + enum pipe pipe = plane->pipe; + bool ret; + + power_domain = POWER_DOMAIN_PIPE(pipe); + if (!intel_display_power_get_if_enabled(dev_priv, power_domain)) + return false; + + ret = I915_READ(PLANE_CTL(pipe, plane_id)) & PLANE_CTL_ENABLE; + + intel_display_power_put(dev_priv, power_domain); + + return ret; +} + static void chv_update_csc(struct intel_plane *plane, uint32_t format) { @@ -506,6 +526,26 @@ vlv_disable_plane(struct intel_plane *plane, struct intel_crtc *crtc) spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); } +static bool +vlv_plane_get_hw_state(struct intel_plane *plane) +{ + struct drm_i915_private *dev_priv = to_i915(plane->base.dev); + enum intel_display_power_domain power_domain; + enum plane_id plane_id = plane->id; + enum pipe pipe = plane->pipe; + bool ret; + + power_domain = POWER_DOMAIN_PIPE(pipe); + if (!intel_display_power_get_if_enabled(dev_priv, power_domain)) + return false; + + ret = I915_READ(SPCNTR(pipe, plane_id)) & SP_ENABLE; + + intel_display_power_put(dev_priv, power_domain); + + return ret; +} + static u32 ivb_sprite_ctl(const struct intel_crtc_state *crtc_state, const struct intel_plane_state *plane_state) { @@ -646,6 +686,25 @@ ivb_disable_plane(struct intel_plane *plane, struct intel_crtc *crtc) spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); } +static bool +ivb_plane_get_hw_state(struct intel_plane *plane) +{ + struct drm_i915_private *dev_priv = to_i915(plane->base.dev); + enum intel_display_power_domain power_domain; + enum pipe pipe = plane->pipe; + bool ret; + + power_domain = POWER_DOMAIN_PIPE(pipe); + if (!intel_display_power_get_if_enabled(dev_priv, power_domain)) + return false; + + ret = I915_READ(SPRCTL(pipe)) & SPRITE_ENABLE; + + intel_display_power_put(dev_priv, power_domain); + + return ret; +} + static u32 g4x_sprite_ctl(const struct intel_crtc_state *crtc_state, const struct intel_plane_state *plane_state) { @@ -777,6 +836,25 @@ g4x_disable_plane(struct intel_plane *plane, struct intel_crtc *crtc) spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); } +static bool +g4x_plane_get_hw_state(struct intel_plane *plane) +{ + struct drm_i915_private *dev_priv = to_i915(plane->base.dev); + enum intel_display_power_domain power_domain; + enum pipe pipe = plane->pipe; + bool ret; + + power_domain = POWER_DOMAIN_PIPE(pipe); + if (!intel_display_power_get_if_enabled(dev_priv, power_domain)) + return false; + + ret = I915_READ(DVSCNTR(pipe)) & DVS_ENABLE; + + intel_display_power_put(dev_priv, power_domain); + + return ret; +} + static int intel_check_sprite_plane(struct intel_plane *plane, struct intel_crtc_state *crtc_state, @@ -1232,6 +1310,7 @@ intel_sprite_plane_create(struct drm_i915_private *dev_priv, intel_plane->update_plane = skl_update_plane; intel_plane->disable_plane = skl_disable_plane; + intel_plane->get_hw_state = skl_plane_get_hw_state; plane_formats = skl_plane_formats; num_plane_formats = ARRAY_SIZE(skl_plane_formats); @@ -1242,6 +1321,7 @@ intel_sprite_plane_create(struct drm_i915_private *dev_priv, intel_plane->update_plane = skl_update_plane; intel_plane->disable_plane = skl_disable_plane; + intel_plane->get_hw_state = skl_plane_get_hw_state; plane_formats = skl_plane_formats; num_plane_formats = ARRAY_SIZE(skl_plane_formats); @@ -1252,6 +1332,7 @@ intel_sprite_plane_create(struct drm_i915_private *dev_priv, intel_plane->update_plane = vlv_update_plane; intel_plane->disable_plane = vlv_disable_plane; + intel_plane->get_hw_state = vlv_plane_get_hw_state; plane_formats = vlv_plane_formats; num_plane_formats = ARRAY_SIZE(vlv_plane_formats); @@ -1267,6 +1348,7 @@ intel_sprite_plane_create(struct drm_i915_private *dev_priv, intel_plane->update_plane = ivb_update_plane; intel_plane->disable_plane = ivb_disable_plane; + intel_plane->get_hw_state = ivb_plane_get_hw_state; plane_formats = snb_plane_formats; num_plane_formats = ARRAY_SIZE(snb_plane_formats); @@ -1277,6 +1359,7 @@ intel_sprite_plane_create(struct drm_i915_private *dev_priv, intel_plane->update_plane = g4x_update_plane; intel_plane->disable_plane = g4x_disable_plane; + intel_plane->get_hw_state = g4x_plane_get_hw_state; modifiers = i9xx_plane_format_modifiers; if (IS_GEN6(dev_priv)) { diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h index 0760b93e9d1f..baab93398e54 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h @@ -121,6 +121,7 @@ int nv41_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); int nv44_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); int nv50_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); int g84_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); +int mcp77_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); int gf100_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); int gk104_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); int gk20a_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index 435ff8662cfa..ef687414969e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -1447,11 +1447,13 @@ nouveau_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_reg *reg) args.nv50.ro = 0; args.nv50.kind = mem->kind; args.nv50.comp = mem->comp; + argc = sizeof(args.nv50); break; case NVIF_CLASS_MEM_GF100: args.gf100.version = 0; args.gf100.ro = 0; args.gf100.kind = mem->kind; + argc = sizeof(args.gf100); break; default: WARN_ON(1); @@ -1459,7 +1461,7 @@ nouveau_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_reg *reg) } ret = nvif_object_map_handle(&mem->mem.object, - &argc, argc, + &args, argc, &handle, &length); if (ret != 1) return ret ? ret : -EINVAL; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 00eeaaffeae5..08e77cd55e6e 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -1251,7 +1251,7 @@ nvaa_chipset = { .i2c = g94_i2c_new, .imem = nv50_instmem_new, .mc = g98_mc_new, - .mmu = g84_mmu_new, + .mmu = mcp77_mmu_new, .mxm = nv50_mxm_new, .pci = g94_pci_new, .therm = g84_therm_new, @@ -1283,7 +1283,7 @@ nvac_chipset = { .i2c = g94_i2c_new, .imem = nv50_instmem_new, .mc = g98_mc_new, - .mmu = g84_mmu_new, + .mmu = mcp77_mmu_new, .mxm = nv50_mxm_new, .pci = g94_pci_new, .therm = g84_therm_new, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/base.c index 9646adec57cb..243f0a5c8a62 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/base.c @@ -73,7 +73,8 @@ static int nvkm_bar_fini(struct nvkm_subdev *subdev, bool suspend) { struct nvkm_bar *bar = nvkm_bar(subdev); - bar->func->bar1.fini(bar); + if (bar->func->bar1.fini) + bar->func->bar1.fini(bar); return 0; } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gk20a.c index b10077d38839..35878fb538f2 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gk20a.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gk20a.c @@ -26,7 +26,6 @@ gk20a_bar_func = { .dtor = gf100_bar_dtor, .oneinit = gf100_bar_oneinit, .bar1.init = gf100_bar_bar1_init, - .bar1.fini = gf100_bar_bar1_fini, .bar1.wait = gf100_bar_bar1_wait, .bar1.vmm = gf100_bar_bar1_vmm, .flush = g84_bar_flush, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/Kbuild b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/Kbuild index 352a65f9371c..67ee983bb026 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/Kbuild @@ -4,6 +4,7 @@ nvkm-y += nvkm/subdev/mmu/nv41.o nvkm-y += nvkm/subdev/mmu/nv44.o nvkm-y += nvkm/subdev/mmu/nv50.o nvkm-y += nvkm/subdev/mmu/g84.o +nvkm-y += nvkm/subdev/mmu/mcp77.o nvkm-y += nvkm/subdev/mmu/gf100.o nvkm-y += nvkm/subdev/mmu/gk104.o nvkm-y += nvkm/subdev/mmu/gk20a.o @@ -22,6 +23,7 @@ nvkm-y += nvkm/subdev/mmu/vmmnv04.o nvkm-y += nvkm/subdev/mmu/vmmnv41.o nvkm-y += nvkm/subdev/mmu/vmmnv44.o nvkm-y += nvkm/subdev/mmu/vmmnv50.o +nvkm-y += nvkm/subdev/mmu/vmmmcp77.o nvkm-y += nvkm/subdev/mmu/vmmgf100.o nvkm-y += nvkm/subdev/mmu/vmmgk104.o nvkm-y += nvkm/subdev/mmu/vmmgk20a.o diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mcp77.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mcp77.c new file mode 100644 index 000000000000..0527b50730d9 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mcp77.c @@ -0,0 +1,41 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "mem.h" +#include "vmm.h" + +#include <nvif/class.h> + +static const struct nvkm_mmu_func +mcp77_mmu = { + .dma_bits = 40, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_NV50}}, + .mem = {{ -1, 0, NVIF_CLASS_MEM_NV50}, nv50_mem_new, nv50_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_NV50}, mcp77_vmm_new, false, 0x0200 }, + .kind = nv50_mmu_kind, + .kind_sys = true, +}; + +int +mcp77_mmu_new(struct nvkm_device *device, int index, struct nvkm_mmu **pmmu) +{ + return nvkm_mmu_new_(&mcp77_mmu, device, index, pmmu); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h index 6d8f61ea467a..da06e64d8a7d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h @@ -95,6 +95,9 @@ struct nvkm_vmm_desc { const struct nvkm_vmm_desc_func *func; }; +extern const struct nvkm_vmm_desc nv50_vmm_desc_12[]; +extern const struct nvkm_vmm_desc nv50_vmm_desc_16[]; + extern const struct nvkm_vmm_desc gk104_vmm_desc_16_12[]; extern const struct nvkm_vmm_desc gk104_vmm_desc_16_16[]; extern const struct nvkm_vmm_desc gk104_vmm_desc_17_12[]; @@ -169,6 +172,11 @@ int nv04_vmm_new_(const struct nvkm_vmm_func *, struct nvkm_mmu *, u32, const char *, struct nvkm_vmm **); int nv04_vmm_valid(struct nvkm_vmm *, void *, u32, struct nvkm_vmm_map *); +int nv50_vmm_join(struct nvkm_vmm *, struct nvkm_memory *); +void nv50_vmm_part(struct nvkm_vmm *, struct nvkm_memory *); +int nv50_vmm_valid(struct nvkm_vmm *, void *, u32, struct nvkm_vmm_map *); +void nv50_vmm_flush(struct nvkm_vmm *, int); + int gf100_vmm_new_(const struct nvkm_vmm_func *, const struct nvkm_vmm_func *, struct nvkm_mmu *, u64, u64, void *, u32, struct lock_class_key *, const char *, struct nvkm_vmm **); @@ -200,6 +208,8 @@ int nv44_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, struct lock_class_key *, const char *, struct nvkm_vmm **); int nv50_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, struct lock_class_key *, const char *, struct nvkm_vmm **); +int mcp77_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, struct nvkm_vmm **); int g84_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, struct lock_class_key *, const char *, struct nvkm_vmm **); int gf100_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmmcp77.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmmcp77.c new file mode 100644 index 000000000000..e63d984cbfd4 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmmcp77.c @@ -0,0 +1,45 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "vmm.h" + +static const struct nvkm_vmm_func +mcp77_vmm = { + .join = nv50_vmm_join, + .part = nv50_vmm_part, + .valid = nv50_vmm_valid, + .flush = nv50_vmm_flush, + .page_block = 1 << 29, + .page = { + { 16, &nv50_vmm_desc_16[0], NVKM_VMM_PAGE_xVxx }, + { 12, &nv50_vmm_desc_12[0], NVKM_VMM_PAGE_xVHx }, + {} + } +}; + +int +mcp77_vmm_new(struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + return nv04_vmm_new_(&mcp77_vmm, mmu, 0, addr, size, + argv, argc, key, name, pvmm); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv50.c index 863a2edd9861..64f75d906202 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv50.c @@ -32,7 +32,7 @@ static inline void nv50_vmm_pgt_pte(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes, struct nvkm_vmm_map *map, u64 addr) { - u64 next = addr | map->type, data; + u64 next = addr + map->type, data; u32 pten; int log2blk; @@ -69,7 +69,7 @@ nv50_vmm_pgt_dma(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, VMM_SPAM(vmm, "DMAA %08x %08x PTE(s)", ptei, ptes); nvkm_kmap(pt->memory); while (ptes--) { - const u64 data = *map->dma++ | map->type; + const u64 data = *map->dma++ + map->type; VMM_WO064(pt, vmm, ptei++ * 8, data); map->type += map->ctag; } @@ -163,21 +163,21 @@ nv50_vmm_pgd = { .pde = nv50_vmm_pgd_pde, }; -static const struct nvkm_vmm_desc +const struct nvkm_vmm_desc nv50_vmm_desc_12[] = { { PGT, 17, 8, 0x1000, &nv50_vmm_pgt }, { PGD, 11, 0, 0x0000, &nv50_vmm_pgd }, {} }; -static const struct nvkm_vmm_desc +const struct nvkm_vmm_desc nv50_vmm_desc_16[] = { { PGT, 13, 8, 0x1000, &nv50_vmm_pgt }, { PGD, 11, 0, 0x0000, &nv50_vmm_pgd }, {} }; -static void +void nv50_vmm_flush(struct nvkm_vmm *vmm, int level) { struct nvkm_subdev *subdev = &vmm->mmu->subdev; @@ -223,7 +223,7 @@ nv50_vmm_flush(struct nvkm_vmm *vmm, int level) mutex_unlock(&subdev->mutex); } -static int +int nv50_vmm_valid(struct nvkm_vmm *vmm, void *argv, u32 argc, struct nvkm_vmm_map *map) { @@ -321,7 +321,7 @@ nv50_vmm_valid(struct nvkm_vmm *vmm, void *argv, u32 argc, return 0; } -static void +void nv50_vmm_part(struct nvkm_vmm *vmm, struct nvkm_memory *inst) { struct nvkm_vmm_join *join; @@ -335,7 +335,7 @@ nv50_vmm_part(struct nvkm_vmm *vmm, struct nvkm_memory *inst) } } -static int +int nv50_vmm_join(struct nvkm_vmm *vmm, struct nvkm_memory *inst) { const u32 pd_offset = vmm->mmu->func->vmm.pd_offset; diff --git a/drivers/gpu/drm/sun4i/sun4i_hdmi_tmds_clk.c b/drivers/gpu/drm/sun4i/sun4i_hdmi_tmds_clk.c index dc332ea56f6c..3ecffa52c814 100644 --- a/drivers/gpu/drm/sun4i/sun4i_hdmi_tmds_clk.c +++ b/drivers/gpu/drm/sun4i/sun4i_hdmi_tmds_clk.c @@ -102,10 +102,13 @@ static int sun4i_tmds_determine_rate(struct clk_hw *hw, goto out; } - if (abs(rate - rounded / i) < - abs(rate - best_parent / best_div)) { + if (!best_parent || + abs(rate - rounded / i / j) < + abs(rate - best_parent / best_half / + best_div)) { best_parent = rounded; - best_div = i; + best_half = i; + best_div = j; } } } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 641294aef165..fcd58145d0da 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -1863,7 +1863,7 @@ u32 vmw_get_vblank_counter(struct drm_device *dev, unsigned int pipe) */ int vmw_enable_vblank(struct drm_device *dev, unsigned int pipe) { - return -ENOSYS; + return -EINVAL; } /** diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c index b8a09807c5de..3824595fece1 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c @@ -266,8 +266,8 @@ static const struct drm_connector_funcs vmw_legacy_connector_funcs = { .set_property = vmw_du_connector_set_property, .destroy = vmw_ldu_connector_destroy, .reset = vmw_du_connector_reset, - .atomic_duplicate_state = drm_atomic_helper_connector_duplicate_state, - .atomic_destroy_state = drm_atomic_helper_connector_destroy_state, + .atomic_duplicate_state = vmw_du_connector_duplicate_state, + .atomic_destroy_state = vmw_du_connector_destroy_state, .atomic_set_property = vmw_du_connector_atomic_set_property, .atomic_get_property = vmw_du_connector_atomic_get_property, }; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c index bc5f6026573d..63a4cd794b73 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c @@ -420,8 +420,8 @@ static const struct drm_connector_funcs vmw_sou_connector_funcs = { .set_property = vmw_du_connector_set_property, .destroy = vmw_sou_connector_destroy, .reset = vmw_du_connector_reset, - .atomic_duplicate_state = drm_atomic_helper_connector_duplicate_state, - .atomic_destroy_state = drm_atomic_helper_connector_destroy_state, + .atomic_duplicate_state = vmw_du_connector_duplicate_state, + .atomic_destroy_state = vmw_du_connector_destroy_state, .atomic_set_property = vmw_du_connector_atomic_set_property, .atomic_get_property = vmw_du_connector_atomic_get_property, }; diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index 706164b4c5be..f7829a74140c 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -821,8 +821,12 @@ void i2c_unregister_device(struct i2c_client *client) { if (!client) return; - if (client->dev.of_node) + + if (client->dev.of_node) { of_node_clear_flag(client->dev.of_node, OF_POPULATED); + of_node_put(client->dev.of_node); + } + if (ACPI_COMPANION(&client->dev)) acpi_device_clear_enumerated(ACPI_COMPANION(&client->dev)); device_unregister(&client->dev); diff --git a/drivers/i2c/i2c-core-smbus.c b/drivers/i2c/i2c-core-smbus.c index 4bb9927afd01..a1082c04ac5c 100644 --- a/drivers/i2c/i2c-core-smbus.c +++ b/drivers/i2c/i2c-core-smbus.c @@ -397,16 +397,17 @@ static s32 i2c_smbus_xfer_emulated(struct i2c_adapter *adapter, u16 addr, the underlying bus driver */ break; case I2C_SMBUS_I2C_BLOCK_DATA: + if (data->block[0] > I2C_SMBUS_BLOCK_MAX) { + dev_err(&adapter->dev, "Invalid block %s size %d\n", + read_write == I2C_SMBUS_READ ? "read" : "write", + data->block[0]); + return -EINVAL; + } + if (read_write == I2C_SMBUS_READ) { msg[1].len = data->block[0]; } else { msg[0].len = data->block[0] + 1; - if (msg[0].len > I2C_SMBUS_BLOCK_MAX + 1) { - dev_err(&adapter->dev, - "Invalid block write size %d\n", - data->block[0]); - return -EINVAL; - } for (i = 1; i <= data->block[0]; i++) msgbuf0[i] = data->block[i]; } diff --git a/drivers/input/misc/twl4030-vibra.c b/drivers/input/misc/twl4030-vibra.c index 6c51d404874b..c37aea9ac272 100644 --- a/drivers/input/misc/twl4030-vibra.c +++ b/drivers/input/misc/twl4030-vibra.c @@ -178,12 +178,14 @@ static SIMPLE_DEV_PM_OPS(twl4030_vibra_pm_ops, twl4030_vibra_suspend, twl4030_vibra_resume); static bool twl4030_vibra_check_coexist(struct twl4030_vibra_data *pdata, - struct device_node *node) + struct device_node *parent) { + struct device_node *node; + if (pdata && pdata->coexist) return true; - node = of_find_node_by_name(node, "codec"); + node = of_get_child_by_name(parent, "codec"); if (node) { of_node_put(node); return true; diff --git a/drivers/input/misc/twl6040-vibra.c b/drivers/input/misc/twl6040-vibra.c index 5690eb7ff954..15e0d352c4cc 100644 --- a/drivers/input/misc/twl6040-vibra.c +++ b/drivers/input/misc/twl6040-vibra.c @@ -248,8 +248,7 @@ static int twl6040_vibra_probe(struct platform_device *pdev) int vddvibr_uV = 0; int error; - of_node_get(twl6040_core_dev->of_node); - twl6040_core_node = of_find_node_by_name(twl6040_core_dev->of_node, + twl6040_core_node = of_get_child_by_name(twl6040_core_dev->of_node, "vibra"); if (!twl6040_core_node) { dev_err(&pdev->dev, "parent of node is missing?\n"); diff --git a/drivers/input/mouse/alps.c b/drivers/input/mouse/alps.c index 579b899add26..dbe57da8c1a1 100644 --- a/drivers/input/mouse/alps.c +++ b/drivers/input/mouse/alps.c @@ -1250,29 +1250,32 @@ static int alps_decode_ss4_v2(struct alps_fields *f, case SS4_PACKET_ID_MULTI: if (priv->flags & ALPS_BUTTONPAD) { if (IS_SS4PLUS_DEV(priv->dev_id)) { - f->mt[0].x = SS4_PLUS_BTL_MF_X_V2(p, 0); - f->mt[1].x = SS4_PLUS_BTL_MF_X_V2(p, 1); + f->mt[2].x = SS4_PLUS_BTL_MF_X_V2(p, 0); + f->mt[3].x = SS4_PLUS_BTL_MF_X_V2(p, 1); + no_data_x = SS4_PLUS_MFPACKET_NO_AX_BL; } else { f->mt[2].x = SS4_BTL_MF_X_V2(p, 0); f->mt[3].x = SS4_BTL_MF_X_V2(p, 1); + no_data_x = SS4_MFPACKET_NO_AX_BL; } + no_data_y = SS4_MFPACKET_NO_AY_BL; f->mt[2].y = SS4_BTL_MF_Y_V2(p, 0); f->mt[3].y = SS4_BTL_MF_Y_V2(p, 1); - no_data_x = SS4_MFPACKET_NO_AX_BL; - no_data_y = SS4_MFPACKET_NO_AY_BL; } else { if (IS_SS4PLUS_DEV(priv->dev_id)) { - f->mt[0].x = SS4_PLUS_STD_MF_X_V2(p, 0); - f->mt[1].x = SS4_PLUS_STD_MF_X_V2(p, 1); + f->mt[2].x = SS4_PLUS_STD_MF_X_V2(p, 0); + f->mt[3].x = SS4_PLUS_STD_MF_X_V2(p, 1); + no_data_x = SS4_PLUS_MFPACKET_NO_AX; } else { - f->mt[0].x = SS4_STD_MF_X_V2(p, 0); - f->mt[1].x = SS4_STD_MF_X_V2(p, 1); + f->mt[2].x = SS4_STD_MF_X_V2(p, 0); + f->mt[3].x = SS4_STD_MF_X_V2(p, 1); + no_data_x = SS4_MFPACKET_NO_AX; } + no_data_y = SS4_MFPACKET_NO_AY; + f->mt[2].y = SS4_STD_MF_Y_V2(p, 0); f->mt[3].y = SS4_STD_MF_Y_V2(p, 1); - no_data_x = SS4_MFPACKET_NO_AX; - no_data_y = SS4_MFPACKET_NO_AY; } f->first_mp = 0; diff --git a/drivers/input/mouse/alps.h b/drivers/input/mouse/alps.h index c80a7c76cb76..79b6d69d1486 100644 --- a/drivers/input/mouse/alps.h +++ b/drivers/input/mouse/alps.h @@ -141,10 +141,12 @@ enum SS4_PACKET_ID { #define SS4_TS_Z_V2(_b) (s8)(_b[4] & 0x7F) -#define SS4_MFPACKET_NO_AX 8160 /* X-Coordinate value */ -#define SS4_MFPACKET_NO_AY 4080 /* Y-Coordinate value */ -#define SS4_MFPACKET_NO_AX_BL 8176 /* Buttonless X-Coordinate value */ -#define SS4_MFPACKET_NO_AY_BL 4088 /* Buttonless Y-Coordinate value */ +#define SS4_MFPACKET_NO_AX 8160 /* X-Coordinate value */ +#define SS4_MFPACKET_NO_AY 4080 /* Y-Coordinate value */ +#define SS4_MFPACKET_NO_AX_BL 8176 /* Buttonless X-Coord value */ +#define SS4_MFPACKET_NO_AY_BL 4088 /* Buttonless Y-Coord value */ +#define SS4_PLUS_MFPACKET_NO_AX 4080 /* SS4 PLUS, X */ +#define SS4_PLUS_MFPACKET_NO_AX_BL 4088 /* Buttonless SS4 PLUS, X */ /* * enum V7_PACKET_ID - defines the packet type for V7 diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index ee5466a374bf..cd9f61cb3fc6 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -173,6 +173,7 @@ static const char * const smbus_pnp_ids[] = { "LEN0046", /* X250 */ "LEN004a", /* W541 */ "LEN200f", /* T450s */ + "LEN2018", /* T460p */ NULL }; diff --git a/drivers/input/rmi4/rmi_driver.c b/drivers/input/rmi4/rmi_driver.c index 4f2bb5947a4e..141ea228aac6 100644 --- a/drivers/input/rmi4/rmi_driver.c +++ b/drivers/input/rmi4/rmi_driver.c @@ -230,8 +230,10 @@ static irqreturn_t rmi_irq_fn(int irq, void *dev_id) rmi_dbg(RMI_DEBUG_CORE, &rmi_dev->dev, "Failed to process interrupt request: %d\n", ret); - if (count) + if (count) { kfree(attn_data.data); + attn_data.data = NULL; + } if (!kfifo_is_empty(&drvdata->attn_fifo)) return rmi_irq_fn(irq, dev_id); diff --git a/drivers/input/touchscreen/88pm860x-ts.c b/drivers/input/touchscreen/88pm860x-ts.c index 7ed828a51f4c..3486d9403805 100644 --- a/drivers/input/touchscreen/88pm860x-ts.c +++ b/drivers/input/touchscreen/88pm860x-ts.c @@ -126,7 +126,7 @@ static int pm860x_touch_dt_init(struct platform_device *pdev, int data, n, ret; if (!np) return -ENODEV; - np = of_find_node_by_name(np, "touch"); + np = of_get_child_by_name(np, "touch"); if (!np) { dev_err(&pdev->dev, "Can't find touch node\n"); return -EINVAL; @@ -144,13 +144,13 @@ static int pm860x_touch_dt_init(struct platform_device *pdev, if (data) { ret = pm860x_reg_write(i2c, PM8607_GPADC_MISC1, data); if (ret < 0) - return -EINVAL; + goto err_put_node; } /* set tsi prebias time */ if (!of_property_read_u32(np, "marvell,88pm860x-tsi-prebias", &data)) { ret = pm860x_reg_write(i2c, PM8607_TSI_PREBIAS, data); if (ret < 0) - return -EINVAL; + goto err_put_node; } /* set prebias & prechg time of pen detect */ data = 0; @@ -161,10 +161,18 @@ static int pm860x_touch_dt_init(struct platform_device *pdev, if (data) { ret = pm860x_reg_write(i2c, PM8607_PD_PREBIAS, data); if (ret < 0) - return -EINVAL; + goto err_put_node; } of_property_read_u32(np, "marvell,88pm860x-resistor-X", res_x); + + of_node_put(np); + return 0; + +err_put_node: + of_node_put(np); + + return -EINVAL; } #else #define pm860x_touch_dt_init(x, y, z) (-1) diff --git a/drivers/input/touchscreen/of_touchscreen.c b/drivers/input/touchscreen/of_touchscreen.c index 8d7f9c8f2771..9642f103b726 100644 --- a/drivers/input/touchscreen/of_touchscreen.c +++ b/drivers/input/touchscreen/of_touchscreen.c @@ -13,6 +13,7 @@ #include <linux/input.h> #include <linux/input/mt.h> #include <linux/input/touchscreen.h> +#include <linux/module.h> static bool touchscreen_get_prop_u32(struct device *dev, const char *property, @@ -185,3 +186,6 @@ void touchscreen_report_pos(struct input_dev *input, input_report_abs(input, multitouch ? ABS_MT_POSITION_Y : ABS_Y, y); } EXPORT_SYMBOL(touchscreen_report_pos); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Device-tree helpers functions for touchscreen devices"); diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index 85140c9af581..8b941f814472 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -687,6 +687,20 @@ static inline void esdhc_pltfm_set_clock(struct sdhci_host *host, return; } + /* For i.MX53 eSDHCv3, SYSCTL.SDCLKFS may not be set to 0. */ + if (is_imx53_esdhc(imx_data)) { + /* + * According to the i.MX53 reference manual, if DLLCTRL[10] can + * be set, then the controller is eSDHCv3, else it is eSDHCv2. + */ + val = readl(host->ioaddr + ESDHC_DLL_CTRL); + writel(val | BIT(10), host->ioaddr + ESDHC_DLL_CTRL); + temp = readl(host->ioaddr + ESDHC_DLL_CTRL); + writel(val, host->ioaddr + ESDHC_DLL_CTRL); + if (temp & BIT(10)) + pre_div = 2; + } + temp = sdhci_readl(host, ESDHC_SYSTEM_CONTROL); temp &= ~(ESDHC_CLOCK_IPGEN | ESDHC_CLOCK_HCKEN | ESDHC_CLOCK_PEREN | ESDHC_CLOCK_MASK); diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index 18ff127020c0..dd161c5eea8e 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -184,7 +184,7 @@ static int pcan_usb_fd_send_cmd(struct peak_usb_device *dev, void *cmd_tail) void *cmd_head = pcan_usb_fd_cmd_buffer(dev); int err = 0; u8 *packet_ptr; - int i, n = 1, packet_len; + int packet_len; ptrdiff_t cmd_len; /* usb device unregistered? */ @@ -201,17 +201,13 @@ static int pcan_usb_fd_send_cmd(struct peak_usb_device *dev, void *cmd_tail) } packet_ptr = cmd_head; + packet_len = cmd_len; /* firmware is not able to re-assemble 512 bytes buffer in full-speed */ - if ((dev->udev->speed != USB_SPEED_HIGH) && - (cmd_len > PCAN_UFD_LOSPD_PKT_SIZE)) { - packet_len = PCAN_UFD_LOSPD_PKT_SIZE; - n += cmd_len / packet_len; - } else { - packet_len = cmd_len; - } + if (unlikely(dev->udev->speed != USB_SPEED_HIGH)) + packet_len = min(packet_len, PCAN_UFD_LOSPD_PKT_SIZE); - for (i = 0; i < n; i++) { + do { err = usb_bulk_msg(dev->udev, usb_sndbulkpipe(dev->udev, PCAN_USBPRO_EP_CMDOUT), @@ -224,7 +220,12 @@ static int pcan_usb_fd_send_cmd(struct peak_usb_device *dev, void *cmd_tail) } packet_ptr += packet_len; - } + cmd_len -= packet_len; + + if (cmd_len < PCAN_UFD_LOSPD_PKT_SIZE) + packet_len = cmd_len; + + } while (packet_len > 0); return err; } diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 54cb00a27408..eb328bade225 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3999,9 +3999,11 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev) out_mdio: mv88e6xxx_mdios_unregister(chip); out_g1_vtu_prob_irq: - mv88e6xxx_g1_vtu_prob_irq_free(chip); + if (chip->irq > 0) + mv88e6xxx_g1_vtu_prob_irq_free(chip); out_g1_atu_prob_irq: - mv88e6xxx_g1_atu_prob_irq_free(chip); + if (chip->irq > 0) + mv88e6xxx_g1_atu_prob_irq_free(chip); out_g2_irq: if (chip->info->g2_irqs > 0 && chip->irq > 0) mv88e6xxx_g2_irq_free(chip); diff --git a/drivers/net/dsa/mv88e6xxx/global1_atu.c b/drivers/net/dsa/mv88e6xxx/global1_atu.c index b97de9d36337..20d941f4273b 100644 --- a/drivers/net/dsa/mv88e6xxx/global1_atu.c +++ b/drivers/net/dsa/mv88e6xxx/global1_atu.c @@ -377,7 +377,7 @@ int mv88e6xxx_g1_atu_prob_irq_setup(struct mv88e6xxx_chip *chip) chip->atu_prob_irq = irq_find_mapping(chip->g1_irq.domain, MV88E6XXX_G1_STS_IRQ_ATU_PROB); if (chip->atu_prob_irq < 0) - return chip->device_irq; + return chip->atu_prob_irq; err = request_threaded_irq(chip->atu_prob_irq, NULL, mv88e6xxx_g1_atu_prob_irq_thread_fn, diff --git a/drivers/net/dsa/mv88e6xxx/global1_vtu.c b/drivers/net/dsa/mv88e6xxx/global1_vtu.c index 53d58a01484a..7997961647de 100644 --- a/drivers/net/dsa/mv88e6xxx/global1_vtu.c +++ b/drivers/net/dsa/mv88e6xxx/global1_vtu.c @@ -570,7 +570,7 @@ int mv88e6xxx_g1_vtu_prob_irq_setup(struct mv88e6xxx_chip *chip) chip->vtu_prob_irq = irq_find_mapping(chip->g1_irq.domain, MV88E6XXX_G1_STS_IRQ_VTU_PROB); if (chip->vtu_prob_irq < 0) - return chip->device_irq; + return chip->vtu_prob_irq; err = request_threaded_irq(chip->vtu_prob_irq, NULL, mv88e6xxx_g1_vtu_prob_irq_thread_fn, diff --git a/drivers/net/ethernet/cortina/Kconfig b/drivers/net/ethernet/cortina/Kconfig index 0df743ea51f1..89bc4579724d 100644 --- a/drivers/net/ethernet/cortina/Kconfig +++ b/drivers/net/ethernet/cortina/Kconfig @@ -14,6 +14,7 @@ if NET_VENDOR_CORTINA config GEMINI_ETHERNET tristate "Gemini Gigabit Ethernet support" depends on OF + depends on HAS_IOMEM select PHYLIB select CRC32 ---help--- diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index 7892f2f0c6b5..2c2976a2dda6 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -613,9 +613,11 @@ static int fs_enet_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } -static void fs_timeout(struct net_device *dev) +static void fs_timeout_work(struct work_struct *work) { - struct fs_enet_private *fep = netdev_priv(dev); + struct fs_enet_private *fep = container_of(work, struct fs_enet_private, + timeout_work); + struct net_device *dev = fep->ndev; unsigned long flags; int wake = 0; @@ -627,7 +629,6 @@ static void fs_timeout(struct net_device *dev) phy_stop(dev->phydev); (*fep->ops->stop)(dev); (*fep->ops->restart)(dev); - phy_start(dev->phydev); } phy_start(dev->phydev); @@ -639,6 +640,13 @@ static void fs_timeout(struct net_device *dev) netif_wake_queue(dev); } +static void fs_timeout(struct net_device *dev) +{ + struct fs_enet_private *fep = netdev_priv(dev); + + schedule_work(&fep->timeout_work); +} + /*----------------------------------------------------------------------------- * generic link-change handler - should be sufficient for most cases *-----------------------------------------------------------------------------*/ @@ -759,6 +767,7 @@ static int fs_enet_close(struct net_device *dev) netif_stop_queue(dev); netif_carrier_off(dev); napi_disable(&fep->napi); + cancel_work_sync(&fep->timeout_work); phy_stop(dev->phydev); spin_lock_irqsave(&fep->lock, flags); @@ -1019,6 +1028,7 @@ static int fs_enet_probe(struct platform_device *ofdev) ndev->netdev_ops = &fs_enet_netdev_ops; ndev->watchdog_timeo = 2 * HZ; + INIT_WORK(&fep->timeout_work, fs_timeout_work); netif_napi_add(ndev, &fep->napi, fs_enet_napi, fpi->napi_weight); ndev->ethtool_ops = &fs_ethtool_ops; diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet.h b/drivers/net/ethernet/freescale/fs_enet/fs_enet.h index 92e06b37a199..195fae6aec4a 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet.h +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet.h @@ -125,6 +125,7 @@ struct fs_enet_private { spinlock_t lock; /* during all ops except TX pckt processing */ spinlock_t tx_lock; /* during fs_start_xmit and fs_tx */ struct fs_platform_info *fpi; + struct work_struct timeout_work; const struct fs_ops *ops; int rx_ring, tx_ring; dma_addr_t ring_mem_addr; diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 736df59c16f5..be2ce8dece4a 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1280,6 +1280,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) unsigned char *dst; u64 *handle_array; int index = 0; + u8 proto = 0; int ret = 0; if (adapter->resetting) { @@ -1368,17 +1369,18 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) } if (skb->protocol == htons(ETH_P_IP)) { - if (ip_hdr(skb)->version == 4) - tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_IPV4; - else if (ip_hdr(skb)->version == 6) - tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_IPV6; - - if (ip_hdr(skb)->protocol == IPPROTO_TCP) - tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_TCP; - else if (ip_hdr(skb)->protocol != IPPROTO_TCP) - tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_UDP; + tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_IPV4; + proto = ip_hdr(skb)->protocol; + } else if (skb->protocol == htons(ETH_P_IPV6)) { + tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_IPV6; + proto = ipv6_hdr(skb)->nexthdr; } + if (proto == IPPROTO_TCP) + tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_TCP; + else if (proto == IPPROTO_UDP) + tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_UDP; + if (skb->ip_summed == CHECKSUM_PARTIAL) { tx_crq.v1.flags1 |= IBMVNIC_TX_CHKSUM_OFFLOAD; hdrs += 2; @@ -3357,7 +3359,11 @@ static void handle_query_ip_offload_rsp(struct ibmvnic_adapter *adapter) return; } + adapter->ip_offload_ctrl.len = + cpu_to_be32(sizeof(adapter->ip_offload_ctrl)); adapter->ip_offload_ctrl.version = cpu_to_be32(INITIAL_VERSION_IOB); + adapter->ip_offload_ctrl.ipv4_chksum = buf->ipv4_chksum; + adapter->ip_offload_ctrl.ipv6_chksum = buf->ipv6_chksum; adapter->ip_offload_ctrl.tcp_ipv4_chksum = buf->tcp_ipv4_chksum; adapter->ip_offload_ctrl.udp_ipv4_chksum = buf->udp_ipv4_chksum; adapter->ip_offload_ctrl.tcp_ipv6_chksum = buf->tcp_ipv6_chksum; diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 7f605221a686..a434fecfdfeb 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -2463,7 +2463,6 @@ static int fm10k_handle_resume(struct fm10k_intfc *interface) return err; } -#ifdef CONFIG_PM /** * fm10k_resume - Generic PM resume hook * @dev: generic device structure @@ -2472,7 +2471,7 @@ static int fm10k_handle_resume(struct fm10k_intfc *interface) * suspend or hibernation. This function does not need to handle lower PCIe * device state as the stack takes care of that for us. **/ -static int fm10k_resume(struct device *dev) +static int __maybe_unused fm10k_resume(struct device *dev) { struct fm10k_intfc *interface = pci_get_drvdata(to_pci_dev(dev)); struct net_device *netdev = interface->netdev; @@ -2499,7 +2498,7 @@ static int fm10k_resume(struct device *dev) * system suspend or hibernation. This function does not need to handle lower * PCIe device state as the stack takes care of that for us. **/ -static int fm10k_suspend(struct device *dev) +static int __maybe_unused fm10k_suspend(struct device *dev) { struct fm10k_intfc *interface = pci_get_drvdata(to_pci_dev(dev)); struct net_device *netdev = interface->netdev; @@ -2511,8 +2510,6 @@ static int fm10k_suspend(struct device *dev) return 0; } -#endif /* CONFIG_PM */ - /** * fm10k_io_error_detected - called when PCI error is detected * @pdev: Pointer to PCI device @@ -2643,11 +2640,9 @@ static struct pci_driver fm10k_driver = { .id_table = fm10k_pci_tbl, .probe = fm10k_probe, .remove = fm10k_remove, -#ifdef CONFIG_PM .driver = { .pm = &fm10k_pm_ops, }, -#endif /* CONFIG_PM */ .sriov_configure = fm10k_iov_configure, .err_handler = &fm10k_err_handler }; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index bbe48917dcad..7e2b552c2237 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -76,12 +76,7 @@ #define MLXSW_FWREV_MAJOR 13 #define MLXSW_FWREV_MINOR 1530 #define MLXSW_FWREV_SUBMINOR 152 - -static const struct mlxsw_fw_rev mlxsw_sp_supported_fw_rev = { - .major = MLXSW_FWREV_MAJOR, - .minor = MLXSW_FWREV_MINOR, - .subminor = MLXSW_FWREV_SUBMINOR -}; +#define MLXSW_FWREV_MINOR_TO_BRANCH(minor) ((minor) / 100) #define MLXSW_SP_FW_FILENAME \ "mellanox/mlxsw_spectrum-" __stringify(MLXSW_FWREV_MAJOR) \ @@ -339,28 +334,25 @@ static int mlxsw_sp_firmware_flash(struct mlxsw_sp *mlxsw_sp, return mlxfw_firmware_flash(&mlxsw_sp_mlxfw_dev.mlxfw_dev, firmware); } -static bool mlxsw_sp_fw_rev_ge(const struct mlxsw_fw_rev *a, - const struct mlxsw_fw_rev *b) -{ - if (a->major != b->major) - return a->major > b->major; - if (a->minor != b->minor) - return a->minor > b->minor; - return a->subminor >= b->subminor; -} - static int mlxsw_sp_fw_rev_validate(struct mlxsw_sp *mlxsw_sp) { const struct mlxsw_fw_rev *rev = &mlxsw_sp->bus_info->fw_rev; const struct firmware *firmware; int err; - if (mlxsw_sp_fw_rev_ge(rev, &mlxsw_sp_supported_fw_rev)) + /* Validate driver & FW are compatible */ + if (rev->major != MLXSW_FWREV_MAJOR) { + WARN(1, "Mismatch in major FW version [%d:%d] is never expected; Please contact support\n", + rev->major, MLXSW_FWREV_MAJOR); + return -EINVAL; + } + if (MLXSW_FWREV_MINOR_TO_BRANCH(rev->minor) == + MLXSW_FWREV_MINOR_TO_BRANCH(MLXSW_FWREV_MINOR)) return 0; - dev_info(mlxsw_sp->bus_info->dev, "The firmware version %d.%d.%d is out of date\n", + dev_info(mlxsw_sp->bus_info->dev, "The firmware version %d.%d.%d is incompatible with the driver\n", rev->major, rev->minor, rev->subminor); - dev_info(mlxsw_sp->bus_info->dev, "Upgrading firmware using file %s\n", + dev_info(mlxsw_sp->bus_info->dev, "Flashing firmware using file %s\n", MLXSW_SP_FW_FILENAME); err = request_firmware_direct(&firmware, MLXSW_SP_FW_FILENAME, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 01ff5ba6796e..31891ae11c9b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -821,13 +821,18 @@ static int mlxsw_sp_vr_lpm_tree_replace(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_lpm_tree *old_tree = fib->lpm_tree; int err; - err = mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, fib, new_tree->id); - if (err) - return err; fib->lpm_tree = new_tree; mlxsw_sp_lpm_tree_hold(new_tree); + err = mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, fib, new_tree->id); + if (err) + goto err_tree_bind; mlxsw_sp_lpm_tree_put(mlxsw_sp, old_tree); return 0; + +err_tree_bind: + mlxsw_sp_lpm_tree_put(mlxsw_sp, new_tree); + fib->lpm_tree = old_tree; + return err; } static int mlxsw_sp_vrs_lpm_tree_replace(struct mlxsw_sp *mlxsw_sp, @@ -868,11 +873,14 @@ err_tree_replace: return err; no_replace: - err = mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, fib, new_tree->id); - if (err) - return err; fib->lpm_tree = new_tree; mlxsw_sp_lpm_tree_hold(new_tree); + err = mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, fib, new_tree->id); + if (err) { + mlxsw_sp_lpm_tree_put(mlxsw_sp, new_tree); + fib->lpm_tree = NULL; + return err; + } return 0; } diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile index 064f00e23a19..d5866d708dfa 100644 --- a/drivers/net/ethernet/netronome/nfp/Makefile +++ b/drivers/net/ethernet/netronome/nfp/Makefile @@ -22,6 +22,7 @@ nfp-objs := \ nfp_hwmon.o \ nfp_main.o \ nfp_net_common.o \ + nfp_net_ctrl.o \ nfp_net_debugdump.o \ nfp_net_ethtool.o \ nfp_net_main.o \ diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c index 71e6586acc36..80d3aa0fc9d3 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c @@ -157,7 +157,14 @@ nfp_bpf_cmsg_wait_reply(struct nfp_app_bpf *bpf, enum nfp_bpf_cmsg_type type, int tag) { struct sk_buff *skb; - int err; + int i, err; + + for (i = 0; i < 50; i++) { + udelay(4); + skb = nfp_bpf_reply(bpf, tag); + if (skb) + return skb; + } err = wait_event_interruptible_timeout(bpf->cmsg_wq, skb = nfp_bpf_reply(bpf, tag), diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 8823c8360047..4ee11bf2aed7 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -389,6 +389,8 @@ const struct nfp_app_type app_bpf = { .id = NFP_APP_BPF_NIC, .name = "ebpf", + .ctrl_cap_mask = 0, + .init = nfp_bpf_init, .clean = nfp_bpf_clean, diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index e2859b2e9c6a..1a357aacc444 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -127,6 +127,7 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog) struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv; unsigned int stack_size; unsigned int max_instr; + int err; stack_size = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64; if (prog->aux->stack_depth > stack_size) { @@ -143,7 +144,14 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog) if (!nfp_prog->prog) return -ENOMEM; - return nfp_bpf_jit(nfp_prog); + err = nfp_bpf_jit(nfp_prog); + if (err) + return err; + + prog->aux->offload->jited_len = nfp_prog->prog_len * sizeof(u64); + prog->aux->offload->jited_image = nfp_prog->prog; + + return 0; } static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog) @@ -168,6 +176,8 @@ nfp_bpf_map_get_next_key(struct bpf_offloaded_map *offmap, static int nfp_bpf_map_delete_elem(struct bpf_offloaded_map *offmap, void *key) { + if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) + return -EINVAL; return nfp_bpf_ctrl_del_entry(offmap, key); } diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c index 615314d9e7c6..baaea6f1a9d8 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c @@ -211,12 +211,6 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb) cmsg_hdr = nfp_flower_cmsg_get_hdr(skb); - if (unlikely(cmsg_hdr->version != NFP_FLOWER_CMSG_VER1)) { - nfp_flower_cmsg_warn(app, "Cannot handle repr control version %u\n", - cmsg_hdr->version); - goto out; - } - type = cmsg_hdr->type; switch (type) { case NFP_FLOWER_CMSG_TYPE_PORT_REIFY: @@ -225,9 +219,6 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb) case NFP_FLOWER_CMSG_TYPE_PORT_MOD: nfp_flower_cmsg_portmod_rx(app, skb); break; - case NFP_FLOWER_CMSG_TYPE_FLOW_STATS: - nfp_flower_rx_flow_stats(app, skb); - break; case NFP_FLOWER_CMSG_TYPE_NO_NEIGH: nfp_tunnel_request_route(app, skb); break; @@ -263,7 +254,23 @@ void nfp_flower_cmsg_process_rx(struct work_struct *work) void nfp_flower_cmsg_rx(struct nfp_app *app, struct sk_buff *skb) { struct nfp_flower_priv *priv = app->priv; + struct nfp_flower_cmsg_hdr *cmsg_hdr; + + cmsg_hdr = nfp_flower_cmsg_get_hdr(skb); + + if (unlikely(cmsg_hdr->version != NFP_FLOWER_CMSG_VER1)) { + nfp_flower_cmsg_warn(app, "Cannot handle repr control version %u\n", + cmsg_hdr->version); + dev_kfree_skb_any(skb); + return; + } - skb_queue_tail(&priv->cmsg_skbs, skb); - schedule_work(&priv->cmsg_work); + if (cmsg_hdr->type == NFP_FLOWER_CMSG_TYPE_FLOW_STATS) { + /* We need to deal with stats updates from HW asap */ + nfp_flower_rx_flow_stats(app, skb); + dev_consume_skb_any(skb); + } else { + skb_queue_tail(&priv->cmsg_skbs, skb); + schedule_work(&priv->cmsg_work); + } } diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index 67c406815365..742d6f1575b5 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -99,7 +99,7 @@ nfp_flower_repr_get(struct nfp_app *app, u32 port_id) if (port >= reprs->num_reprs) return NULL; - return reprs->reprs[port]; + return rcu_dereference(reprs->reprs[port]); } static int @@ -114,15 +114,19 @@ nfp_flower_reprs_reify(struct nfp_app *app, enum nfp_repr_type type, if (!reprs) return 0; - for (i = 0; i < reprs->num_reprs; i++) - if (reprs->reprs[i]) { - struct nfp_repr *repr = netdev_priv(reprs->reprs[i]); + for (i = 0; i < reprs->num_reprs; i++) { + struct net_device *netdev; + + netdev = nfp_repr_get_locked(app, reprs, i); + if (netdev) { + struct nfp_repr *repr = netdev_priv(netdev); err = nfp_flower_cmsg_portreify(repr, exists); if (err) return err; count++; } + } return count; } @@ -234,19 +238,21 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app, return -ENOMEM; for (i = 0; i < cnt; i++) { + struct net_device *repr; struct nfp_port *port; u32 port_id; - reprs->reprs[i] = nfp_repr_alloc(app); - if (!reprs->reprs[i]) { + repr = nfp_repr_alloc(app); + if (!repr) { err = -ENOMEM; goto err_reprs_clean; } + RCU_INIT_POINTER(reprs->reprs[i], repr); /* For now we only support 1 PF */ WARN_ON(repr_type == NFP_REPR_TYPE_PF && i); - port = nfp_port_alloc(app, port_type, reprs->reprs[i]); + port = nfp_port_alloc(app, port_type, repr); if (repr_type == NFP_REPR_TYPE_PF) { port->pf_id = i; port->vnic = priv->nn->dp.ctrl_bar; @@ -257,11 +263,11 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app, app->pf->vf_cfg_mem + i * NFP_NET_CFG_BAR_SZ; } - eth_hw_addr_random(reprs->reprs[i]); + eth_hw_addr_random(repr); port_id = nfp_flower_cmsg_pcie_port(nfp_pcie, vnic_type, i, queue); - err = nfp_repr_init(app, reprs->reprs[i], + err = nfp_repr_init(app, repr, port_id, port, priv->nn->dp.netdev); if (err) { nfp_port_free(port); @@ -270,7 +276,7 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app, nfp_info(app->cpp, "%s%d Representor(%s) created\n", repr_type == NFP_REPR_TYPE_PF ? "PF" : "VF", i, - reprs->reprs[i]->name); + repr->name); } nfp_app_reprs_set(app, repr_type, reprs); @@ -291,7 +297,7 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app, err_reprs_remove: reprs = nfp_app_reprs_set(app, repr_type, NULL); err_reprs_clean: - nfp_reprs_clean_and_free(reprs); + nfp_reprs_clean_and_free(app, reprs); return err; } @@ -329,17 +335,18 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) for (i = 0; i < eth_tbl->count; i++) { unsigned int phys_port = eth_tbl->ports[i].index; + struct net_device *repr; struct nfp_port *port; u32 cmsg_port_id; - reprs->reprs[phys_port] = nfp_repr_alloc(app); - if (!reprs->reprs[phys_port]) { + repr = nfp_repr_alloc(app); + if (!repr) { err = -ENOMEM; goto err_reprs_clean; } + RCU_INIT_POINTER(reprs->reprs[phys_port], repr); - port = nfp_port_alloc(app, NFP_PORT_PHYS_PORT, - reprs->reprs[phys_port]); + port = nfp_port_alloc(app, NFP_PORT_PHYS_PORT, repr); if (IS_ERR(port)) { err = PTR_ERR(port); goto err_reprs_clean; @@ -350,11 +357,11 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) goto err_reprs_clean; } - SET_NETDEV_DEV(reprs->reprs[phys_port], &priv->nn->pdev->dev); + SET_NETDEV_DEV(repr, &priv->nn->pdev->dev); nfp_net_get_mac_addr(app->pf, port); cmsg_port_id = nfp_flower_cmsg_phys_port(phys_port); - err = nfp_repr_init(app, reprs->reprs[phys_port], + err = nfp_repr_init(app, repr, cmsg_port_id, port, priv->nn->dp.netdev); if (err) { nfp_port_free(port); @@ -367,7 +374,7 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) phys_port); nfp_info(app->cpp, "Phys Port %d Representor(%s) created\n", - phys_port, reprs->reprs[phys_port]->name); + phys_port, repr->name); } nfp_app_reprs_set(app, NFP_REPR_TYPE_PHYS_PORT, reprs); @@ -397,7 +404,7 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) err_reprs_remove: reprs = nfp_app_reprs_set(app, NFP_REPR_TYPE_PHYS_PORT, NULL); err_reprs_clean: - nfp_reprs_clean_and_free(reprs); + nfp_reprs_clean_and_free(app, reprs); err_free_ctrl_skb: kfree_skb(ctrl_skb); return err; @@ -558,6 +565,8 @@ static void nfp_flower_stop(struct nfp_app *app) const struct nfp_app_type app_flower = { .id = NFP_APP_FLOWER_NIC, .name = "flower", + + .ctrl_cap_mask = ~0U, .ctrl_has_meta = true, .extra_cap = nfp_flower_extra_cap, diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.c b/drivers/net/ethernet/netronome/nfp/nfp_app.c index 955a9f44d244..6aedef0ad433 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.c @@ -32,6 +32,8 @@ */ #include <linux/bug.h> +#include <linux/lockdep.h> +#include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> @@ -99,13 +101,19 @@ nfp_app_ctrl_msg_alloc(struct nfp_app *app, unsigned int size, gfp_t priority) } struct nfp_reprs * +nfp_reprs_get_locked(struct nfp_app *app, enum nfp_repr_type type) +{ + return rcu_dereference_protected(app->reprs[type], + lockdep_is_held(&app->pf->lock)); +} + +struct nfp_reprs * nfp_app_reprs_set(struct nfp_app *app, enum nfp_repr_type type, struct nfp_reprs *reprs) { struct nfp_reprs *old; - old = rcu_dereference_protected(app->reprs[type], - lockdep_is_held(&app->pf->lock)); + old = nfp_reprs_get_locked(app, type); rcu_assign_pointer(app->reprs[type], reprs); return old; @@ -116,7 +124,7 @@ struct nfp_app *nfp_app_alloc(struct nfp_pf *pf, enum nfp_app_id id) struct nfp_app *app; if (id >= ARRAY_SIZE(apps) || !apps[id]) { - nfp_err(pf->cpp, "failed to find app with ID 0x%02hhx\n", id); + nfp_err(pf->cpp, "unknown FW app ID 0x%02hhx, driver too old or support for FW not built in\n", id); return ERR_PTR(-EINVAL); } diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index 6a6eb02b516e..7e474df90598 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -66,6 +66,9 @@ extern const struct nfp_app_type app_flower; * struct nfp_app_type - application definition * @id: application ID * @name: application name + * @ctrl_cap_mask: ctrl vNIC capability mask, allows disabling features like + * IRQMOD which are on by default but counter-productive for + * control messages which are often latency-sensitive * @ctrl_has_meta: control messages have prepend of type:5/port:CTRL * * Callbacks @@ -100,6 +103,7 @@ struct nfp_app_type { enum nfp_app_id id; const char *name; + u32 ctrl_cap_mask; bool ctrl_has_meta; int (*init)(struct nfp_app *app); @@ -385,6 +389,8 @@ static inline struct net_device *nfp_app_repr_get(struct nfp_app *app, u32 id) struct nfp_app *nfp_app_from_netdev(struct net_device *netdev); struct nfp_reprs * +nfp_reprs_get_locked(struct nfp_app *app, enum nfp_repr_type type); +struct nfp_reprs * nfp_app_reprs_set(struct nfp_app *app, enum nfp_repr_type type, struct nfp_reprs *reprs); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index 6c9f29c2e975..eb0fc614673d 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -152,18 +152,8 @@ out: static int nfp_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode) { struct nfp_pf *pf = devlink_priv(devlink); - int ret; - - mutex_lock(&pf->lock); - if (!pf->app) { - ret = -EBUSY; - goto out; - } - ret = nfp_app_eswitch_mode_get(pf->app, mode); -out: - mutex_unlock(&pf->lock); - return ret; + return nfp_app_eswitch_mode_get(pf->app, mode); } const struct devlink_ops nfp_devlink_ops = { diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c index 0953fa8f3109..c5b91040b12e 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c @@ -499,13 +499,9 @@ static int nfp_pci_probe(struct pci_dev *pdev, if (err) goto err_hwinfo_free; - err = devlink_register(devlink, &pdev->dev); - if (err) - goto err_hwinfo_free; - err = nfp_nsp_init(pdev, pf); if (err) - goto err_devlink_unreg; + goto err_hwinfo_free; pf->mip = nfp_mip_open(pf->cpp); pf->rtbl = __nfp_rtsym_table_read(pf->cpp, pf->mip); @@ -549,8 +545,6 @@ err_fw_unload: kfree(pf->eth_tbl); kfree(pf->nspi); vfree(pf->dumpspec); -err_devlink_unreg: - devlink_unregister(devlink); err_hwinfo_free: kfree(pf->hwinfo); nfp_cpp_free(pf->cpp); @@ -571,18 +565,13 @@ err_pci_disable: static void nfp_pci_remove(struct pci_dev *pdev) { struct nfp_pf *pf = pci_get_drvdata(pdev); - struct devlink *devlink; nfp_hwmon_unregister(pf); - devlink = priv_to_devlink(pf); - - nfp_net_pci_remove(pf); - nfp_pcie_sriov_disable(pdev); pci_sriov_set_totalvfs(pf->pdev, 0); - devlink_unregister(devlink); + nfp_net_pci_remove(pf); vfree(pf->dumpspec); kfree(pf->rtbl); @@ -598,7 +587,7 @@ static void nfp_pci_remove(struct pci_dev *pdev) kfree(pf->eth_tbl); kfree(pf->nspi); mutex_destroy(&pf->lock); - devlink_free(devlink); + devlink_free(priv_to_devlink(pf)); pci_release_regions(pdev); pci_disable_device(pdev); } diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index 6f6e3d6fd935..d88eda9707e6 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -578,6 +578,7 @@ struct nfp_net_dp { * @qcp_cfg: Pointer to QCP queue used for configuration notification * @tx_bar: Pointer to mapped TX queues * @rx_bar: Pointer to mapped FL/RX queues + * @tlv_caps: Parsed TLV capabilities * @debugfs_dir: Device directory in debugfs * @vnic_list: Entry on device vNIC list * @pdev: Backpointer to PCI device @@ -644,6 +645,8 @@ struct nfp_net { u8 __iomem *tx_bar; u8 __iomem *rx_bar; + struct nfp_net_tlv_caps tlv_caps; + struct dentry *debugfs_dir; struct list_head vnic_list; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 2b5cad3069a7..cdf52421eaca 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -293,9 +293,15 @@ int nfp_net_reconfig(struct nfp_net *nn, u32 update) */ static int nfp_net_reconfig_mbox(struct nfp_net *nn, u32 mbox_cmd) { + u32 mbox = nn->tlv_caps.mbox_off; int ret; - nn_writeq(nn, NFP_NET_CFG_MBOX_CMD, mbox_cmd); + if (!nfp_net_has_mbox(&nn->tlv_caps)) { + nn_err(nn, "no mailbox present, command: %u\n", mbox_cmd); + return -EIO; + } + + nn_writeq(nn, mbox + NFP_NET_CFG_MBOX_SIMPLE_CMD, mbox_cmd); ret = nfp_net_reconfig(nn, NFP_NET_CFG_UPDATE_MBOX); if (ret) { @@ -303,7 +309,7 @@ static int nfp_net_reconfig_mbox(struct nfp_net *nn, u32 mbox_cmd) return ret; } - return -nn_readl(nn, NFP_NET_CFG_MBOX_RET); + return -nn_readl(nn, mbox + NFP_NET_CFG_MBOX_SIMPLE_RET); } /* Interrupt configuration and handling @@ -2458,7 +2464,7 @@ void nfp_net_coalesce_write_cfg(struct nfp_net *nn) * ME timestamp ticks. There are 16 ME clock cycles for each timestamp * count. */ - factor = nn->me_freq_mhz / 16; + factor = nn->tlv_caps.me_freq_mhz / 16; /* copy RX interrupt coalesce parameters */ value = (nn->rx_coalesce_max_frames << 16) | @@ -3084,8 +3090,9 @@ nfp_net_vlan_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid) if (!vid) return 0; - nn_writew(nn, NFP_NET_CFG_VLAN_FILTER_VID, vid); - nn_writew(nn, NFP_NET_CFG_VLAN_FILTER_PROTO, ETH_P_8021Q); + nn_writew(nn, nn->tlv_caps.mbox_off + NFP_NET_CFG_VLAN_FILTER_VID, vid); + nn_writew(nn, nn->tlv_caps.mbox_off + NFP_NET_CFG_VLAN_FILTER_PROTO, + ETH_P_8021Q); return nfp_net_reconfig_mbox(nn, NFP_NET_CFG_MBOX_CMD_CTAG_FILTER_ADD); } @@ -3101,8 +3108,9 @@ nfp_net_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid) if (!vid) return 0; - nn_writew(nn, NFP_NET_CFG_VLAN_FILTER_VID, vid); - nn_writew(nn, NFP_NET_CFG_VLAN_FILTER_PROTO, ETH_P_8021Q); + nn_writew(nn, nn->tlv_caps.mbox_off + NFP_NET_CFG_VLAN_FILTER_VID, vid); + nn_writew(nn, nn->tlv_caps.mbox_off + NFP_NET_CFG_VLAN_FILTER_PROTO, + ETH_P_8021Q); return nfp_net_reconfig_mbox(nn, NFP_NET_CFG_MBOX_CMD_CTAG_FILTER_KILL); } @@ -3748,18 +3756,8 @@ static void nfp_net_netdev_init(struct nfp_net *nn) nfp_net_set_ethtool_ops(netdev); } -/** - * nfp_net_init() - Initialise/finalise the nfp_net structure - * @nn: NFP Net device structure - * - * Return: 0 on success or negative errno on error. - */ -int nfp_net_init(struct nfp_net *nn) +static int nfp_net_read_caps(struct nfp_net *nn) { - int err; - - nn->dp.rx_dma_dir = DMA_FROM_DEVICE; - /* Get some of the read-only fields from the BAR */ nn->cap = nn_readl(nn, NFP_NET_CFG_CAP); nn->max_mtu = nn_readl(nn, NFP_NET_CFG_MAX_MTU); @@ -3792,6 +3790,29 @@ int nfp_net_init(struct nfp_net *nn) nn->dp.rx_offset = NFP_NET_RX_OFFSET; } + /* For control vNICs mask out the capabilities app doesn't want. */ + if (!nn->dp.netdev) + nn->cap &= nn->app->type->ctrl_cap_mask; + + return 0; +} + +/** + * nfp_net_init() - Initialise/finalise the nfp_net structure + * @nn: NFP Net device structure + * + * Return: 0 on success or negative errno on error. + */ +int nfp_net_init(struct nfp_net *nn) +{ + int err; + + nn->dp.rx_dma_dir = DMA_FROM_DEVICE; + + err = nfp_net_read_caps(nn); + if (err) + return err; + /* Set default MTU and Freelist buffer size */ if (nn->max_mtu < NFP_NET_DEFAULT_MTU) nn->dp.mtu = nn->max_mtu; @@ -3815,6 +3836,11 @@ int nfp_net_init(struct nfp_net *nn) nn->dp.ctrl |= NFP_NET_CFG_CTRL_IRQMOD; } + err = nfp_net_tlv_caps_parse(&nn->pdev->dev, nn->dp.ctrl_bar, + &nn->tlv_caps); + if (err) + return err; + if (nn->dp.netdev) nfp_net_netdev_init(nn); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.c new file mode 100644 index 000000000000..ffb402746ad4 --- /dev/null +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.c @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2018 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. You have the + * option to license this software under the complete terms of either license. + * + * The BSD 2-Clause License: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/bitfield.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/types.h> + +#include "nfp_net_ctrl.h" +#include "nfp_net.h" + +static void nfp_net_tlv_caps_reset(struct nfp_net_tlv_caps *caps) +{ + memset(caps, 0, sizeof(*caps)); + caps->me_freq_mhz = 1200; + caps->mbox_off = NFP_NET_CFG_MBOX_BASE; + caps->mbox_len = NFP_NET_CFG_MBOX_VAL_MAX_SZ; +} + +int nfp_net_tlv_caps_parse(struct device *dev, u8 __iomem *ctrl_mem, + struct nfp_net_tlv_caps *caps) +{ + u8 __iomem *data = ctrl_mem + NFP_NET_CFG_TLV_BASE; + u8 __iomem *end = ctrl_mem + NFP_NET_CFG_BAR_SZ; + u32 hdr; + + nfp_net_tlv_caps_reset(caps); + + hdr = readl(data); + if (!hdr) + return 0; + + while (true) { + unsigned int length, offset; + u32 hdr = readl(data); + + length = FIELD_GET(NFP_NET_CFG_TLV_HEADER_LENGTH, hdr); + offset = data - ctrl_mem + NFP_NET_CFG_TLV_BASE; + + /* Advance past the header */ + data += 4; + + if (length % NFP_NET_CFG_TLV_LENGTH_INC) { + dev_err(dev, "TLV size not multiple of %u len:%u\n", + NFP_NET_CFG_TLV_LENGTH_INC, length); + return -EINVAL; + } + if (data + length > end) { + dev_err(dev, "oversized TLV offset:%u len:%u\n", + offset, length); + return -EINVAL; + } + + switch (FIELD_GET(NFP_NET_CFG_TLV_HEADER_TYPE, hdr)) { + case NFP_NET_CFG_TLV_TYPE_UNKNOWN: + dev_err(dev, "NULL TLV at offset:%u\n", offset); + return -EINVAL; + case NFP_NET_CFG_TLV_TYPE_RESERVED: + break; + case NFP_NET_CFG_TLV_TYPE_END: + if (!length) + return 0; + + dev_err(dev, "END TLV should be empty, has len:%d\n", + length); + return -EINVAL; + case NFP_NET_CFG_TLV_TYPE_ME_FREQ: + if (length != 4) { + dev_err(dev, + "ME FREQ TLV should be 4B, is %dB\n", + length); + return -EINVAL; + } + + caps->me_freq_mhz = readl(data); + break; + case NFP_NET_CFG_TLV_TYPE_MBOX: + if (!length) { + caps->mbox_off = 0; + caps->mbox_len = 0; + } else { + caps->mbox_off = data - ctrl_mem; + caps->mbox_len = length; + } + break; + default: + if (!FIELD_GET(NFP_NET_CFG_TLV_HEADER_REQUIRED, hdr)) + break; + + dev_err(dev, "unknown TLV type:%u offset:%u len:%u\n", + FIELD_GET(NFP_NET_CFG_TLV_HEADER_TYPE, hdr), + offset, length); + return -EINVAL; + } + + data += length; + if (data + 4 > end) { + dev_err(dev, "reached end of BAR without END TLV\n"); + return -EINVAL; + } + } + + /* Not reached */ + return -EINVAL; +} diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h index 25c36001bffa..eeecef2caac6 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h @@ -43,9 +43,7 @@ #ifndef _NFP_NET_CTRL_H_ #define _NFP_NET_CTRL_H_ -/* IMPORTANT: This header file is shared with the FW, - * no OS specific constructs, please! - */ +#include <linux/types.h> /** * Configuration BAR size. @@ -236,6 +234,12 @@ #define NFP_NET_CFG_RSS_CAP_HFUNC 0xff000000 /** + * TLV area start + * %NFP_NET_CFG_TLV_BASE: start anchor of the TLV area + */ +#define NFP_NET_CFG_TLV_BASE 0x0058 + +/** * VXLAN/UDP encap configuration * %NFP_NET_CFG_VXLAN_PORT: Base address of table of tunnels' UDP dst ports * %NFP_NET_CFG_VXLAN_SZ: Size of the UDP port table in bytes @@ -409,11 +413,14 @@ * 4B used for update command and 4B return code * followed by a max of 504B of variable length value */ -#define NFP_NET_CFG_MBOX_CMD 0x1800 -#define NFP_NET_CFG_MBOX_RET 0x1804 -#define NFP_NET_CFG_MBOX_VAL 0x1808 +#define NFP_NET_CFG_MBOX_BASE 0x1800 #define NFP_NET_CFG_MBOX_VAL_MAX_SZ 0x1F8 +#define NFP_NET_CFG_MBOX_SIMPLE_CMD 0x0 +#define NFP_NET_CFG_MBOX_SIMPLE_RET 0x4 +#define NFP_NET_CFG_MBOX_SIMPLE_VAL 0x8 +#define NFP_NET_CFG_MBOX_SIMPLE_LEN 0x12 + #define NFP_NET_CFG_MBOX_CMD_CTAG_FILTER_ADD 1 #define NFP_NET_CFG_MBOX_CMD_CTAG_FILTER_KILL 2 @@ -424,9 +431,87 @@ * %NFP_NET_CFG_VLAN_FILTER_PROTO: VLAN proto to filter * %NFP_NET_CFG_VXLAN_SZ: Size of the VLAN filter mailbox in bytes */ -#define NFP_NET_CFG_VLAN_FILTER NFP_NET_CFG_MBOX_VAL +#define NFP_NET_CFG_VLAN_FILTER NFP_NET_CFG_MBOX_SIMPLE_VAL #define NFP_NET_CFG_VLAN_FILTER_VID NFP_NET_CFG_VLAN_FILTER #define NFP_NET_CFG_VLAN_FILTER_PROTO (NFP_NET_CFG_VLAN_FILTER + 2) #define NFP_NET_CFG_VLAN_FILTER_SZ 0x0004 +/** + * TLV capabilities + * %NFP_NET_CFG_TLV_TYPE: Offset of type within the TLV + * %NFP_NET_CFG_TLV_TYPE_REQUIRED: Driver must be able to parse the TLV + * %NFP_NET_CFG_TLV_LENGTH: Offset of length within the TLV + * %NFP_NET_CFG_TLV_LENGTH_INC: TLV length increments + * %NFP_NET_CFG_TLV_VALUE: Offset of value with the TLV + * + * List of simple TLV structures, first one starts at %NFP_NET_CFG_TLV_BASE. + * Last structure must be of type %NFP_NET_CFG_TLV_TYPE_END. Presence of TLVs + * is indicated by %NFP_NET_CFG_TLV_BASE being non-zero. TLV structures may + * fill the entire remainder of the BAR or be shorter. FW must make sure TLVs + * don't conflict with other features which allocate space beyond + * %NFP_NET_CFG_TLV_BASE. %NFP_NET_CFG_TLV_TYPE_RESERVED should be used to wrap + * space used by such features. + * Note that the 4 byte TLV header is not counted in %NFP_NET_CFG_TLV_LENGTH. + */ +#define NFP_NET_CFG_TLV_TYPE 0x00 +#define NFP_NET_CFG_TLV_TYPE_REQUIRED 0x8000 +#define NFP_NET_CFG_TLV_LENGTH 0x02 +#define NFP_NET_CFG_TLV_LENGTH_INC 4 +#define NFP_NET_CFG_TLV_VALUE 0x04 + +#define NFP_NET_CFG_TLV_HEADER_REQUIRED 0x80000000 +#define NFP_NET_CFG_TLV_HEADER_TYPE 0x7fff0000 +#define NFP_NET_CFG_TLV_HEADER_LENGTH 0x0000ffff + +/** + * Capability TLV types + * + * %NFP_NET_CFG_TLV_TYPE_UNKNOWN: + * Special TLV type to catch bugs, should never be encountered. Drivers should + * treat encountering this type as error and refuse to probe. + * + * %NFP_NET_CFG_TLV_TYPE_RESERVED: + * Reserved space, may contain legacy fixed-offset fields, or be used for + * padding. The use of this type should be otherwise avoided. + * + * %NFP_NET_CFG_TLV_TYPE_END: + * Empty, end of TLV list. Must be the last TLV. Drivers will stop processing + * further TLVs when encountered. + * + * %NFP_NET_CFG_TLV_TYPE_ME_FREQ: + * Single word, ME frequency in MHz as used in calculation for + * %NFP_NET_CFG_RXR_IRQ_MOD and %NFP_NET_CFG_TXR_IRQ_MOD. + * + * %NFP_NET_CFG_TLV_TYPE_MBOX: + * Variable, mailbox area. Overwrites the default location which is + * %NFP_NET_CFG_MBOX_BASE and length %NFP_NET_CFG_MBOX_VAL_MAX_SZ. + */ +#define NFP_NET_CFG_TLV_TYPE_UNKNOWN 0 +#define NFP_NET_CFG_TLV_TYPE_RESERVED 1 +#define NFP_NET_CFG_TLV_TYPE_END 2 +#define NFP_NET_CFG_TLV_TYPE_ME_FREQ 3 +#define NFP_NET_CFG_TLV_TYPE_MBOX 4 + +struct device; + +/** + * struct nfp_net_tlv_caps - parsed control BAR TLV capabilities + * @me_freq_mhz: ME clock_freq (MHz) + * @mbox_off: vNIC mailbox area offset + * @mbox_len: vNIC mailbox area length + */ +struct nfp_net_tlv_caps { + u32 me_freq_mhz; + unsigned int mbox_off; + unsigned int mbox_len; +}; + +int nfp_net_tlv_caps_parse(struct device *dev, u8 __iomem *ctrl_mem, + struct nfp_net_tlv_caps *caps); + +static inline bool nfp_net_has_mbox(struct nfp_net_tlv_caps *caps) +{ + return caps->mbox_len >= NFP_NET_CFG_MBOX_SIMPLE_LEN; +} + #endif /* _NFP_NET_CTRL_H_ */ diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_debugdump.c b/drivers/net/ethernet/netronome/nfp/nfp_net_debugdump.c index 173646e17e94..e6f19f44b461 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_debugdump.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_debugdump.c @@ -518,16 +518,15 @@ nfp_dump_csr_range(struct nfp_pf *pf, struct nfp_dumpspec_csr *spec_csr, max_rd_addr = cpp_rd_addr + be32_to_cpu(spec_csr->cpp.dump_length); while (cpp_rd_addr < max_rd_addr) { - if (is_xpb_read(&spec_csr->cpp.cpp_id)) - bytes_read = nfp_xpb_readl(pf->cpp, cpp_rd_addr, - (u32 *)dest); - else + if (is_xpb_read(&spec_csr->cpp.cpp_id)) { + err = nfp_xpb_readl(pf->cpp, cpp_rd_addr, (u32 *)dest); + } else { bytes_read = nfp_cpp_read(pf->cpp, cpp_id, cpp_rd_addr, dest, reg_sz); - if (bytes_read != reg_sz) { - if (bytes_read >= 0) - bytes_read = -EIO; - dump_header->error = cpu_to_be32(bytes_read); + err = bytes_read == reg_sz ? 0 : -EIO; + } + if (err) { + dump_header->error = cpu_to_be32(err); dump_header->error_offset = cpu_to_be32(cpp_rd_addr); break; } @@ -555,8 +554,8 @@ nfp_read_indirect_csr(struct nfp_cpp *cpp, NFP_IND_ME_REFL_WR_SIG_INIT, cpp_params.token, cpp_params.island); result = nfp_cpp_writel(cpp, cpp_id, csr_ctx_ptr_offs, context); - if (result != sizeof(context)) - return result < 0 ? result : -EIO; + if (result) + return result; cpp_id = nfp_get_numeric_cpp_id(&cpp_params); result = nfp_cpp_read(cpp, cpp_id, csr_ctx_ptr_offs, dest, reg_sz); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c index c505014121c4..15fa47f622aa 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c @@ -208,12 +208,6 @@ nfp_net_pf_init_vnic(struct nfp_pf *pf, struct nfp_net *nn, unsigned int id) { int err; - /* Get ME clock frequency from ctrl BAR - * XXX for now frequency is hardcoded until we figure out how - * to get the value from nfp-hwinfo into ctrl bar - */ - nn->me_freq_mhz = 1200; - err = nfp_net_init(nn); if (err) return err; @@ -373,7 +367,9 @@ nfp_net_pf_app_init(struct nfp_pf *pf, u8 __iomem *qc_bar, unsigned int stride) if (IS_ERR(pf->app)) return PTR_ERR(pf->app); + mutex_lock(&pf->lock); err = nfp_app_init(pf->app); + mutex_unlock(&pf->lock); if (err) goto err_free; @@ -401,7 +397,9 @@ nfp_net_pf_app_init(struct nfp_pf *pf, u8 __iomem *qc_bar, unsigned int stride) err_unmap: nfp_cpp_area_release_free(pf->ctrl_vnic_bar); err_app_clean: + mutex_lock(&pf->lock); nfp_app_clean(pf->app); + mutex_unlock(&pf->lock); err_free: nfp_app_free(pf->app); pf->app = NULL; @@ -414,7 +412,11 @@ static void nfp_net_pf_app_clean(struct nfp_pf *pf) nfp_net_pf_free_vnic(pf, pf->ctrl_vnic); nfp_cpp_area_release_free(pf->ctrl_vnic_bar); } + + mutex_lock(&pf->lock); nfp_app_clean(pf->app); + mutex_unlock(&pf->lock); + nfp_app_free(pf->app); pf->app = NULL; } @@ -570,17 +572,6 @@ err_unmap_ctrl: return err; } -static void nfp_net_pci_remove_finish(struct nfp_pf *pf) -{ - nfp_net_pf_app_stop(pf); - /* stop app first, to avoid double free of ctrl vNIC's ddir */ - nfp_net_debugfs_dir_clean(&pf->ddir); - - nfp_net_pf_free_irqs(pf); - nfp_net_pf_app_clean(pf); - nfp_net_pci_unmap_mem(pf); -} - static int nfp_net_eth_port_update(struct nfp_cpp *cpp, struct nfp_port *port, struct nfp_eth_table *eth_table) @@ -655,9 +646,6 @@ int nfp_net_refresh_port_table_sync(struct nfp_pf *pf) nfp_net_pf_free_vnic(pf, nn); } - if (list_empty(&pf->vnics)) - nfp_net_pci_remove_finish(pf); - return 0; } @@ -707,6 +695,7 @@ int nfp_net_refresh_eth_port(struct nfp_port *port) */ int nfp_net_pci_probe(struct nfp_pf *pf) { + struct devlink *devlink = priv_to_devlink(pf); struct nfp_net_fw_version fw_ver; u8 __iomem *ctrl_bar, *qc_bar; int stride; @@ -720,16 +709,13 @@ int nfp_net_pci_probe(struct nfp_pf *pf) return -EINVAL; } - mutex_lock(&pf->lock); pf->max_data_vnics = nfp_net_pf_get_num_ports(pf); - if ((int)pf->max_data_vnics < 0) { - err = pf->max_data_vnics; - goto err_unlock; - } + if ((int)pf->max_data_vnics < 0) + return pf->max_data_vnics; err = nfp_net_pci_map_mem(pf); if (err) - goto err_unlock; + return err; ctrl_bar = nfp_cpp_area_iomem(pf->data_vnic_bar); qc_bar = nfp_cpp_area_iomem(pf->qc_area); @@ -768,6 +754,11 @@ int nfp_net_pci_probe(struct nfp_pf *pf) if (err) goto err_unmap; + err = devlink_register(devlink, &pf->pdev->dev); + if (err) + goto err_app_clean; + + mutex_lock(&pf->lock); pf->ddir = nfp_net_debugfs_device_add(pf->pdev); /* Allocate the vnics and do basic init */ @@ -799,32 +790,39 @@ err_free_vnics: nfp_net_pf_free_vnics(pf); err_clean_ddir: nfp_net_debugfs_dir_clean(&pf->ddir); + mutex_unlock(&pf->lock); + cancel_work_sync(&pf->port_refresh_work); + devlink_unregister(devlink); +err_app_clean: nfp_net_pf_app_clean(pf); err_unmap: nfp_net_pci_unmap_mem(pf); -err_unlock: - mutex_unlock(&pf->lock); - cancel_work_sync(&pf->port_refresh_work); return err; } void nfp_net_pci_remove(struct nfp_pf *pf) { - struct nfp_net *nn; + struct nfp_net *nn, *next; mutex_lock(&pf->lock); - if (list_empty(&pf->vnics)) - goto out; - - list_for_each_entry(nn, &pf->vnics, vnic_list) - if (nfp_net_is_data_vnic(nn)) - nfp_net_pf_clean_vnic(pf, nn); + list_for_each_entry_safe(nn, next, &pf->vnics, vnic_list) { + if (!nfp_net_is_data_vnic(nn)) + continue; + nfp_net_pf_clean_vnic(pf, nn); + nfp_net_pf_free_vnic(pf, nn); + } - nfp_net_pf_free_vnics(pf); + nfp_net_pf_app_stop(pf); + /* stop app first, to avoid double free of ctrl vNIC's ddir */ + nfp_net_debugfs_dir_clean(&pf->ddir); - nfp_net_pci_remove_finish(pf); -out: mutex_unlock(&pf->lock); + devlink_unregister(priv_to_devlink(pf)); + + nfp_net_pf_free_irqs(pf); + nfp_net_pf_app_clean(pf); + nfp_net_pci_unmap_mem(pf); + cancel_work_sync(&pf->port_refresh_work); } diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index 317f87cc3cc6..f67da6bde9da 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -46,6 +46,13 @@ #include "nfp_net_sriov.h" #include "nfp_port.h" +struct net_device * +nfp_repr_get_locked(struct nfp_app *app, struct nfp_reprs *set, unsigned int id) +{ + return rcu_dereference_protected(set->reprs[id], + lockdep_is_held(&app->pf->lock)); +} + static void nfp_repr_inc_tx_stats(struct net_device *netdev, unsigned int len, int tx_status) @@ -369,21 +376,24 @@ static void nfp_repr_clean_and_free(struct nfp_repr *repr) nfp_repr_free(repr); } -void nfp_reprs_clean_and_free(struct nfp_reprs *reprs) +void nfp_reprs_clean_and_free(struct nfp_app *app, struct nfp_reprs *reprs) { + struct net_device *netdev; unsigned int i; - for (i = 0; i < reprs->num_reprs; i++) - if (reprs->reprs[i]) - nfp_repr_clean_and_free(netdev_priv(reprs->reprs[i])); + for (i = 0; i < reprs->num_reprs; i++) { + netdev = nfp_repr_get_locked(app, reprs, i); + if (netdev) + nfp_repr_clean_and_free(netdev_priv(netdev)); + } kfree(reprs); } void -nfp_reprs_clean_and_free_by_type(struct nfp_app *app, - enum nfp_repr_type type) +nfp_reprs_clean_and_free_by_type(struct nfp_app *app, enum nfp_repr_type type) { + struct net_device *netdev; struct nfp_reprs *reprs; int i; @@ -395,14 +405,16 @@ nfp_reprs_clean_and_free_by_type(struct nfp_app *app, /* Preclean must happen before we remove the reprs reference from the * app below. */ - for (i = 0; i < reprs->num_reprs; i++) - if (reprs->reprs[i]) - nfp_app_repr_preclean(app, reprs->reprs[i]); + for (i = 0; i < reprs->num_reprs; i++) { + netdev = nfp_repr_get_locked(app, reprs, i); + if (netdev) + nfp_app_repr_preclean(app, netdev); + } reprs = nfp_app_reprs_set(app, type, NULL); synchronize_rcu(); - nfp_reprs_clean_and_free(reprs); + nfp_reprs_clean_and_free(app, reprs); } struct nfp_reprs *nfp_reprs_alloc(unsigned int num_reprs) @@ -420,48 +432,29 @@ struct nfp_reprs *nfp_reprs_alloc(unsigned int num_reprs) int nfp_reprs_resync_phys_ports(struct nfp_app *app) { - struct nfp_reprs *reprs, *old_reprs; + struct net_device *netdev; + struct nfp_reprs *reprs; struct nfp_repr *repr; int i; - old_reprs = - rcu_dereference_protected(app->reprs[NFP_REPR_TYPE_PHYS_PORT], - lockdep_is_held(&app->pf->lock)); - if (!old_reprs) - return 0; - - reprs = nfp_reprs_alloc(old_reprs->num_reprs); + reprs = nfp_reprs_get_locked(app, NFP_REPR_TYPE_PHYS_PORT); if (!reprs) - return -ENOMEM; - - for (i = 0; i < old_reprs->num_reprs; i++) { - if (!old_reprs->reprs[i]) - continue; - - repr = netdev_priv(old_reprs->reprs[i]); - if (repr->port->type == NFP_PORT_INVALID) { - nfp_app_repr_preclean(app, old_reprs->reprs[i]); - continue; - } - - reprs->reprs[i] = old_reprs->reprs[i]; - } - - old_reprs = nfp_app_reprs_set(app, NFP_REPR_TYPE_PHYS_PORT, reprs); - synchronize_rcu(); + return 0; - /* Now we free up removed representors */ - for (i = 0; i < old_reprs->num_reprs; i++) { - if (!old_reprs->reprs[i]) + for (i = 0; i < reprs->num_reprs; i++) { + netdev = nfp_repr_get_locked(app, reprs, i); + if (!netdev) continue; - repr = netdev_priv(old_reprs->reprs[i]); + repr = netdev_priv(netdev); if (repr->port->type != NFP_PORT_INVALID) continue; + nfp_app_repr_preclean(app, netdev); + rcu_assign_pointer(reprs->reprs[i], NULL); + synchronize_rcu(); nfp_repr_clean(repr); } - kfree(old_reprs); return 0; } diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h index cbc7badf40a0..a621e8ff528e 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h @@ -35,6 +35,7 @@ #define NFP_NET_REPR_H struct metadata_dst; +struct nfp_app; struct nfp_net; struct nfp_port; @@ -47,7 +48,7 @@ struct nfp_port; */ struct nfp_reprs { unsigned int num_reprs; - struct net_device *reprs[0]; + struct net_device __rcu *reprs[0]; }; /** @@ -114,16 +115,18 @@ static inline int nfp_repr_get_port_id(struct net_device *netdev) return priv->dst->u.port_info.port_id; } +struct net_device * +nfp_repr_get_locked(struct nfp_app *app, struct nfp_reprs *set, + unsigned int id); + void nfp_repr_inc_rx_stats(struct net_device *netdev, unsigned int len); int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, u32 cmsg_port_id, struct nfp_port *port, struct net_device *pf_netdev); struct net_device *nfp_repr_alloc(struct nfp_app *app); -void -nfp_reprs_clean_and_free(struct nfp_reprs *reprs); -void -nfp_reprs_clean_and_free_by_type(struct nfp_app *app, - enum nfp_repr_type type); +void nfp_reprs_clean_and_free(struct nfp_app *app, struct nfp_reprs *reprs); +void nfp_reprs_clean_and_free_by_type(struct nfp_app *app, + enum nfp_repr_type type); struct nfp_reprs *nfp_reprs_alloc(unsigned int num_reprs); int nfp_reprs_resync_phys_ports(struct nfp_app *app); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c b/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c index c879626e035b..b802a1d55449 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c @@ -277,12 +277,6 @@ static int nfp_netvf_pci_probe(struct pci_dev *pdev, } nfp_net_irqs_assign(nn, vf->irq_entries, num_irqs); - /* Get ME clock frequency from ctrl BAR - * XXX for now frequency is hardcoded until we figure out how - * to get the value from nfp-hwinfo into ctrl bar - */ - nn->me_freq_mhz = 1200; - err = nfp_net_init(nn); if (err) goto err_irqs_disable; diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c index 28262470dabf..ef30597aa319 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c @@ -674,18 +674,20 @@ void __iomem *nfp_cpp_area_iomem(struct nfp_cpp_area *area) * @offset: Offset into area * @value: Pointer to read buffer * - * Return: length of the io, or -ERRNO + * Return: 0 on success, or -ERRNO */ int nfp_cpp_area_readl(struct nfp_cpp_area *area, unsigned long offset, u32 *value) { u8 tmp[4]; - int err; + int n; - err = nfp_cpp_area_read(area, offset, &tmp, sizeof(tmp)); - *value = get_unaligned_le32(tmp); + n = nfp_cpp_area_read(area, offset, &tmp, sizeof(tmp)); + if (n != sizeof(tmp)) + return n < 0 ? n : -EIO; - return err; + *value = get_unaligned_le32(tmp); + return 0; } /** @@ -694,16 +696,18 @@ int nfp_cpp_area_readl(struct nfp_cpp_area *area, * @offset: Offset into area * @value: Value to write * - * Return: length of the io, or -ERRNO + * Return: 0 on success, or -ERRNO */ int nfp_cpp_area_writel(struct nfp_cpp_area *area, unsigned long offset, u32 value) { u8 tmp[4]; + int n; put_unaligned_le32(value, tmp); + n = nfp_cpp_area_write(area, offset, &tmp, sizeof(tmp)); - return nfp_cpp_area_write(area, offset, &tmp, sizeof(tmp)); + return n == sizeof(tmp) ? 0 : n < 0 ? n : -EIO; } /** @@ -712,18 +716,20 @@ int nfp_cpp_area_writel(struct nfp_cpp_area *area, * @offset: Offset into area * @value: Pointer to read buffer * - * Return: length of the io, or -ERRNO + * Return: 0 on success, or -ERRNO */ int nfp_cpp_area_readq(struct nfp_cpp_area *area, unsigned long offset, u64 *value) { u8 tmp[8]; - int err; + int n; - err = nfp_cpp_area_read(area, offset, &tmp, sizeof(tmp)); - *value = get_unaligned_le64(tmp); + n = nfp_cpp_area_read(area, offset, &tmp, sizeof(tmp)); + if (n != sizeof(tmp)) + return n < 0 ? n : -EIO; - return err; + *value = get_unaligned_le64(tmp); + return 0; } /** @@ -732,16 +738,18 @@ int nfp_cpp_area_readq(struct nfp_cpp_area *area, * @offset: Offset into area * @value: Value to write * - * Return: length of the io, or -ERRNO + * Return: 0 on success, or -ERRNO */ int nfp_cpp_area_writeq(struct nfp_cpp_area *area, unsigned long offset, u64 value) { u8 tmp[8]; + int n; put_unaligned_le64(value, tmp); + n = nfp_cpp_area_write(area, offset, &tmp, sizeof(tmp)); - return nfp_cpp_area_write(area, offset, &tmp, sizeof(tmp)); + return n == sizeof(tmp) ? 0 : n < 0 ? n : -EIO; } /** @@ -1080,7 +1088,7 @@ static u32 nfp_xpb_to_cpp(struct nfp_cpp *cpp, u32 *xpb_addr) * @xpb_addr: Address for operation * @value: Pointer to read buffer * - * Return: length of the io, or -ERRNO + * Return: 0 on success, or -ERRNO */ int nfp_xpb_readl(struct nfp_cpp *cpp, u32 xpb_addr, u32 *value) { @@ -1095,7 +1103,7 @@ int nfp_xpb_readl(struct nfp_cpp *cpp, u32 xpb_addr, u32 *value) * @xpb_addr: Address for operation * @value: Value to write * - * Return: length of the io, or -ERRNO + * Return: 0 on success, or -ERRNO */ int nfp_xpb_writel(struct nfp_cpp *cpp, u32 xpb_addr, u32 value) { @@ -1113,7 +1121,7 @@ int nfp_xpb_writel(struct nfp_cpp *cpp, u32 xpb_addr, u32 value) * * KERNEL: This operation is safe to call in interrupt or softirq context. * - * Return: length of the io, or -ERRNO + * Return: 0 on success, or -ERRNO */ int nfp_xpb_writelm(struct nfp_cpp *cpp, u32 xpb_tgt, u32 mask, u32 value) diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpplib.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpplib.c index ab86bceb93f2..20bad05e2e92 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpplib.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpplib.c @@ -64,18 +64,20 @@ * @address: Address for operation * @value: Pointer to read buffer * - * Return: length of the io, or -ERRNO + * Return: 0 on success, or -ERRNO */ int nfp_cpp_readl(struct nfp_cpp *cpp, u32 cpp_id, unsigned long long address, u32 *value) { u8 tmp[4]; - int err; + int n; - err = nfp_cpp_read(cpp, cpp_id, address, tmp, sizeof(tmp)); - *value = get_unaligned_le32(tmp); + n = nfp_cpp_read(cpp, cpp_id, address, tmp, sizeof(tmp)); + if (n != sizeof(tmp)) + return n < 0 ? n : -EIO; - return err; + *value = get_unaligned_le32(tmp); + return 0; } /** @@ -85,15 +87,18 @@ int nfp_cpp_readl(struct nfp_cpp *cpp, u32 cpp_id, * @address: Address for operation * @value: Value to write * - * Return: length of the io, or -ERRNO + * Return: 0 on success, or -ERRNO */ int nfp_cpp_writel(struct nfp_cpp *cpp, u32 cpp_id, unsigned long long address, u32 value) { u8 tmp[4]; + int n; put_unaligned_le32(value, tmp); - return nfp_cpp_write(cpp, cpp_id, address, tmp, sizeof(tmp)); + n = nfp_cpp_write(cpp, cpp_id, address, tmp, sizeof(tmp)); + + return n == sizeof(tmp) ? 0 : n < 0 ? n : -EIO; } /** @@ -103,18 +108,20 @@ int nfp_cpp_writel(struct nfp_cpp *cpp, u32 cpp_id, * @address: Address for operation * @value: Pointer to read buffer * - * Return: length of the io, or -ERRNO + * Return: 0 on success, or -ERRNO */ int nfp_cpp_readq(struct nfp_cpp *cpp, u32 cpp_id, unsigned long long address, u64 *value) { u8 tmp[8]; - int err; + int n; - err = nfp_cpp_read(cpp, cpp_id, address, tmp, sizeof(tmp)); - *value = get_unaligned_le64(tmp); + n = nfp_cpp_read(cpp, cpp_id, address, tmp, sizeof(tmp)); + if (n != sizeof(tmp)) + return n < 0 ? n : -EIO; - return err; + *value = get_unaligned_le64(tmp); + return 0; } /** @@ -124,15 +131,18 @@ int nfp_cpp_readq(struct nfp_cpp *cpp, u32 cpp_id, * @address: Address for operation * @value: Value to write * - * Return: length of the io, or -ERRNO + * Return: 0 on success, or -ERRNO */ int nfp_cpp_writeq(struct nfp_cpp *cpp, u32 cpp_id, unsigned long long address, u64 value) { u8 tmp[8]; + int n; put_unaligned_le64(value, tmp); - return nfp_cpp_write(cpp, cpp_id, address, tmp, sizeof(tmp)); + n = nfp_cpp_write(cpp, cpp_id, address, tmp, sizeof(tmp)); + + return n == sizeof(tmp) ? 0 : n < 0 ? n : -EIO; } /* NOTE: This code should not use nfp_xpb_* functions, diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c index ecda474ac7c3..46107aefad1c 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c @@ -277,10 +277,6 @@ u64 nfp_rtsym_read_le(struct nfp_rtsym_table *rtbl, const char *name, break; } - if (err == sym->size) - err = 0; - else if (err >= 0) - err = -EIO; exit: if (error) *error = err; diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index ed58c746e4af..f5a7eb22d0f5 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -715,7 +715,7 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp) /* warning!!!! We are retrieving the virtual ptr in the sw_data * field as a 32bit value. Will not work on 64bit machines */ - page = (struct page *)GET_SW_DATA0(desc); + page = (struct page *)GET_SW_DATA0(ndesc); if (likely(dma_buff && buf_len && page)) { dma_unmap_page(netcp->dev, dma_buff, PAGE_SIZE, diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 5134d5c1306c..b3851bbefad3 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -17,6 +17,7 @@ #include <linux/bpf_verifier.h> #include <linux/debugfs.h> #include <linux/kernel.h> +#include <linux/mutex.h> #include <linux/rtnetlink.h> #include <net/pkt_cls.h> @@ -31,6 +32,19 @@ struct nsim_bpf_bound_prog { struct list_head l; }; +#define NSIM_BPF_MAX_KEYS 2 + +struct nsim_bpf_bound_map { + struct netdevsim *ns; + struct bpf_offloaded_map *map; + struct mutex mutex; + struct nsim_map_entry { + void *key; + void *value; + } entry[NSIM_BPF_MAX_KEYS]; + struct list_head l; +}; + static int nsim_debugfs_bpf_string_read(struct seq_file *file, void *data) { const char **str = file->private; @@ -284,6 +298,224 @@ nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf) return 0; } +static bool +nsim_map_key_match(struct bpf_map *map, struct nsim_map_entry *e, void *key) +{ + return e->key && !memcmp(key, e->key, map->key_size); +} + +static int nsim_map_key_find(struct bpf_offloaded_map *offmap, void *key) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) + if (nsim_map_key_match(&offmap->map, &nmap->entry[i], key)) + return i; + + return -ENOENT; +} + +static int +nsim_map_alloc_elem(struct bpf_offloaded_map *offmap, unsigned int idx) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + + nmap->entry[idx].key = kmalloc(offmap->map.key_size, GFP_USER); + if (!nmap->entry[idx].key) + return -ENOMEM; + nmap->entry[idx].value = kmalloc(offmap->map.value_size, GFP_USER); + if (!nmap->entry[idx].value) { + kfree(nmap->entry[idx].key); + nmap->entry[idx].key = NULL; + return -ENOMEM; + } + + return 0; +} + +static int +nsim_map_get_next_key(struct bpf_offloaded_map *offmap, + void *key, void *next_key) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + int idx = -ENOENT; + + mutex_lock(&nmap->mutex); + + if (key) + idx = nsim_map_key_find(offmap, key); + if (idx == -ENOENT) + idx = 0; + else + idx++; + + for (; idx < ARRAY_SIZE(nmap->entry); idx++) { + if (nmap->entry[idx].key) { + memcpy(next_key, nmap->entry[idx].key, + offmap->map.key_size); + break; + } + } + + mutex_unlock(&nmap->mutex); + + if (idx == ARRAY_SIZE(nmap->entry)) + return -ENOENT; + return 0; +} + +static int +nsim_map_lookup_elem(struct bpf_offloaded_map *offmap, void *key, void *value) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + int idx; + + mutex_lock(&nmap->mutex); + + idx = nsim_map_key_find(offmap, key); + if (idx >= 0) + memcpy(value, nmap->entry[idx].value, offmap->map.value_size); + + mutex_unlock(&nmap->mutex); + + return idx < 0 ? idx : 0; +} + +static int +nsim_map_update_elem(struct bpf_offloaded_map *offmap, + void *key, void *value, u64 flags) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + int idx, err = 0; + + mutex_lock(&nmap->mutex); + + idx = nsim_map_key_find(offmap, key); + if (idx < 0 && flags == BPF_EXIST) { + err = idx; + goto exit_unlock; + } + if (idx >= 0 && flags == BPF_NOEXIST) { + err = -EEXIST; + goto exit_unlock; + } + + if (idx < 0) { + for (idx = 0; idx < ARRAY_SIZE(nmap->entry); idx++) + if (!nmap->entry[idx].key) + break; + if (idx == ARRAY_SIZE(nmap->entry)) { + err = -E2BIG; + goto exit_unlock; + } + + err = nsim_map_alloc_elem(offmap, idx); + if (err) + goto exit_unlock; + } + + memcpy(nmap->entry[idx].key, key, offmap->map.key_size); + memcpy(nmap->entry[idx].value, value, offmap->map.value_size); +exit_unlock: + mutex_unlock(&nmap->mutex); + + return err; +} + +static int nsim_map_delete_elem(struct bpf_offloaded_map *offmap, void *key) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + int idx; + + if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) + return -EINVAL; + + mutex_lock(&nmap->mutex); + + idx = nsim_map_key_find(offmap, key); + if (idx >= 0) { + kfree(nmap->entry[idx].key); + kfree(nmap->entry[idx].value); + memset(&nmap->entry[idx], 0, sizeof(nmap->entry[idx])); + } + + mutex_unlock(&nmap->mutex); + + return idx < 0 ? idx : 0; +} + +static const struct bpf_map_dev_ops nsim_bpf_map_ops = { + .map_get_next_key = nsim_map_get_next_key, + .map_lookup_elem = nsim_map_lookup_elem, + .map_update_elem = nsim_map_update_elem, + .map_delete_elem = nsim_map_delete_elem, +}; + +static int +nsim_bpf_map_alloc(struct netdevsim *ns, struct bpf_offloaded_map *offmap) +{ + struct nsim_bpf_bound_map *nmap; + unsigned int i; + int err; + + if (WARN_ON(offmap->map.map_type != BPF_MAP_TYPE_ARRAY && + offmap->map.map_type != BPF_MAP_TYPE_HASH)) + return -EINVAL; + if (offmap->map.max_entries > NSIM_BPF_MAX_KEYS) + return -ENOMEM; + if (offmap->map.map_flags) + return -EINVAL; + + nmap = kzalloc(sizeof(*nmap), GFP_USER); + if (!nmap) + return -ENOMEM; + + offmap->dev_priv = nmap; + nmap->ns = ns; + nmap->map = offmap; + mutex_init(&nmap->mutex); + + if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) { + for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { + u32 *key; + + err = nsim_map_alloc_elem(offmap, i); + if (err) + goto err_free; + key = nmap->entry[i].key; + *key = i; + } + } + + offmap->dev_ops = &nsim_bpf_map_ops; + list_add_tail(&nmap->l, &ns->bpf_bound_maps); + + return 0; + +err_free: + while (--i) { + kfree(nmap->entry[i].key); + kfree(nmap->entry[i].value); + } + kfree(nmap); + return err; +} + +static void nsim_bpf_map_free(struct bpf_offloaded_map *offmap) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { + kfree(nmap->entry[i].key); + kfree(nmap->entry[i].value); + } + list_del_init(&nmap->l); + mutex_destroy(&nmap->mutex); + kfree(nmap); +} + int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) { struct netdevsim *ns = netdev_priv(dev); @@ -328,6 +560,14 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) return err; return nsim_xdp_set_prog(ns, bpf); + case BPF_OFFLOAD_MAP_ALLOC: + if (!ns->bpf_map_accept) + return -EOPNOTSUPP; + + return nsim_bpf_map_alloc(ns, bpf->offmap); + case BPF_OFFLOAD_MAP_FREE: + nsim_bpf_map_free(bpf->offmap); + return 0; default: return -EINVAL; } @@ -336,6 +576,7 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) int nsim_bpf_init(struct netdevsim *ns) { INIT_LIST_HEAD(&ns->bpf_bound_progs); + INIT_LIST_HEAD(&ns->bpf_bound_maps); debugfs_create_u32("bpf_offloaded_id", 0400, ns->ddir, &ns->bpf_offloaded_id); @@ -362,12 +603,17 @@ int nsim_bpf_init(struct netdevsim *ns) debugfs_create_bool("bpf_xdpoffload_accept", 0600, ns->ddir, &ns->bpf_xdpoffload_accept); + ns->bpf_map_accept = true; + debugfs_create_bool("bpf_map_accept", 0600, ns->ddir, + &ns->bpf_map_accept); + return 0; } void nsim_bpf_uninit(struct netdevsim *ns) { WARN_ON(!list_empty(&ns->bpf_bound_progs)); + WARN_ON(!list_empty(&ns->bpf_bound_maps)); WARN_ON(ns->xdp_prog); WARN_ON(ns->bpf_offloaded); } diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index 32270de9395a..b80361200302 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -61,6 +61,9 @@ struct netdevsim { bool bpf_tc_non_bound_accept; bool bpf_xdpdrv_accept; bool bpf_xdpoffload_accept; + + bool bpf_map_accept; + struct list_head bpf_bound_maps; }; extern struct dentry *nsim_ddir; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 170a3e89b5af..698874684b4e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -679,6 +679,15 @@ static void tun_queue_purge(struct tun_file *tfile) skb_queue_purge(&tfile->sk.sk_error_queue); } +static void tun_cleanup_tx_ring(struct tun_file *tfile) +{ + if (tfile->tx_ring.queue) { + ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free); + xdp_rxq_info_unreg(&tfile->xdp_rxq); + memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring)); + } +} + static void __tun_detach(struct tun_file *tfile, bool clean) { struct tun_file *ntfile; @@ -725,10 +734,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean) tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } - if (tun) { - ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free); - xdp_rxq_info_unreg(&tfile->xdp_rxq); - } + tun_cleanup_tx_ring(tfile); sock_put(&tfile->sk); } } @@ -770,12 +776,14 @@ static void tun_detach_all(struct net_device *dev) tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); + tun_cleanup_tx_ring(tfile); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_enable_queue(tfile); tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); + tun_cleanup_tx_ring(tfile); } BUG_ON(tun->numdisabled != 0); @@ -3145,6 +3153,8 @@ static int tun_chr_open(struct inode *inode, struct file * file) sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); + memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring)); + return 0; } diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index d51d9abf7986..0657203ffb91 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -606,6 +606,7 @@ enum rtl8152_flags { PHY_RESET, SCHEDULE_NAPI, GREEN_ETHERNET, + DELL_TB_RX_AGG_BUG, }; /* Define these values to match your device */ @@ -1798,6 +1799,9 @@ static int r8152_tx_agg_fill(struct r8152 *tp, struct tx_agg *agg) dev_kfree_skb_any(skb); remain = agg_buf_sz - (int)(tx_agg_align(tx_data) - agg->head); + + if (test_bit(DELL_TB_RX_AGG_BUG, &tp->flags)) + break; } if (!skb_queue_empty(&skb_head)) { @@ -4133,6 +4137,9 @@ static void r8153_init(struct r8152 *tp) /* rx aggregation */ ocp_data = ocp_read_word(tp, MCU_TYPE_USB, USB_USB_CTRL); ocp_data &= ~(RX_AGG_DISABLE | RX_ZERO_EN); + if (test_bit(DELL_TB_RX_AGG_BUG, &tp->flags)) + ocp_data |= RX_AGG_DISABLE; + ocp_write_word(tp, MCU_TYPE_USB, USB_USB_CTRL, ocp_data); rtl_tally_reset(tp); @@ -5207,6 +5214,12 @@ static int rtl8152_probe(struct usb_interface *intf, netdev->hw_features &= ~NETIF_F_RXCSUM; } + if (le16_to_cpu(udev->descriptor.bcdDevice) == 0x3011 && + udev->serial && !strcmp(udev->serial, "000001000000")) { + dev_info(&udev->dev, "Dell TB16 Dock, disable RX aggregation"); + set_bit(DELL_TB_RX_AGG_BUG, &tp->flags); + } + netdev->ethtool_ops = &ops; netif_set_gso_max_size(netdev, RTL_LIMITED_TSO_SIZE); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c index 6a59d0609d30..9be0b051066a 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c @@ -182,12 +182,9 @@ static int brcmf_c_process_clm_blob(struct brcmf_if *ifp) err = request_firmware(&clm, clm_name, dev); if (err) { - if (err == -ENOENT) { - brcmf_dbg(INFO, "continue with CLM data currently present in firmware\n"); - return 0; - } - brcmf_err("request CLM blob file failed (%d)\n", err); - return err; + brcmf_info("no clm_blob available(err=%d), device may have limited channels available\n", + err); + return 0; } chunk_buf = kzalloc(sizeof(*chunk_buf) + MAX_CHUNK_LEN - 1, GFP_KERNEL); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d53550e612bc..4276ebfff22b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -451,10 +451,13 @@ static void **nvme_pci_iod_list(struct request *req) static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + int nseg = blk_rq_nr_phys_segments(req); unsigned int avg_seg_size; - avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), - blk_rq_nr_phys_segments(req)); + if (nseg == 0) + return false; + + avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) return false; @@ -722,20 +725,19 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, } static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmd) + struct request *req, struct nvme_rw_command *cmd, int entries) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - int length = blk_rq_payload_bytes(req); struct dma_pool *pool; struct nvme_sgl_desc *sg_list; struct scatterlist *sg = iod->sg; - int entries = iod->nents, i = 0; dma_addr_t sgl_dma; + int i = 0; /* setting the transfer type as SGL */ cmd->flags = NVME_CMD_SGL_METABUF; - if (length == sg_dma_len(sg)) { + if (entries == 1) { nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); return BLK_STS_OK; } @@ -775,13 +777,9 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, } nvme_pci_sgl_set_data(&sg_list[i++], sg); - - length -= sg_dma_len(sg); sg = sg_next(sg); - entries--; - } while (length > 0); + } while (--entries > 0); - WARN_ON(entries > 0); return BLK_STS_OK; } @@ -793,6 +791,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, enum dma_data_direction dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; blk_status_t ret = BLK_STS_IOERR; + int nr_mapped; sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); iod->nents = blk_rq_map_sg(q, req, iod->sg); @@ -800,12 +799,13 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, goto out; ret = BLK_STS_RESOURCE; - if (!dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, - DMA_ATTR_NO_WARN)) + nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, + DMA_ATTR_NO_WARN); + if (!nr_mapped) goto out; if (iod->use_sgl) - ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); + ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped); else ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index b4964b067aec..8f6e8e28996d 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -410,6 +410,10 @@ static struct phy *_of_phy_get(struct device_node *np, int index) if (ret) return ERR_PTR(-ENODEV); + /* This phy type handled by the usb-phy subsystem for now */ + if (of_device_is_compatible(args.np, "usb-nop-xceiv")) + return ERR_PTR(-ENODEV); + mutex_lock(&phy_provider_mutex); phy_provider = of_phy_provider_lookup(args.np); if (IS_ERR(phy_provider) || !try_module_get(phy_provider->owner)) { diff --git a/drivers/ssb/Kconfig b/drivers/ssb/Kconfig index f48a2ee587a4..ee18428a051f 100644 --- a/drivers/ssb/Kconfig +++ b/drivers/ssb/Kconfig @@ -31,7 +31,7 @@ config SSB_BLOCKIO config SSB_PCIHOST_POSSIBLE bool - depends on SSB && (PCI = y || PCI = SSB) + depends on SSB && (PCI = y || PCI = SSB) && PCI_DRIVERS_LEGACY default y config SSB_PCIHOST diff --git a/fs/proc/array.c b/fs/proc/array.c index 79375fc115d2..d67a72dcb92c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -430,8 +430,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * safe because the task has stopped executing permanently. */ if (permitted && (task->flags & PF_DUMPCORE)) { - eip = KSTK_EIP(task); - esp = KSTK_ESP(task); + if (try_get_task_stack(task)) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + put_task_stack(task); + } } } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5c2c104dc2c5..66df387106de 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -234,6 +234,8 @@ struct bpf_prog_offload { struct list_head offloads; bool dev_state; const struct bpf_prog_offload_ops *dev_ops; + void *jited_image; + u32 jited_len; }; struct bpf_prog_aux { @@ -584,6 +586,8 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map); + int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_map_offload_update_elem(struct bpf_map *map, void *key, void *value, u64 flags); diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 2272ded07496..631354acfa72 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -219,7 +219,7 @@ /* Mark a function definition as prohibited from being cloned. */ #define __noclone __attribute__((__noclone__, __optimize__("no-tracer"))) -#ifdef RANDSTRUCT_PLUGIN +#if defined(RANDSTRUCT_PLUGIN) && !defined(__CHECKER__) #define __randomize_layout __attribute__((randomize_layout)) #define __no_randomize_layout __attribute__((no_randomize_layout)) #endif diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 4178d2493547..5e335b6203f4 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -71,7 +71,7 @@ extern void delayacct_init(void); extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); -extern void __delayacct_blkio_end(void); +extern void __delayacct_blkio_end(struct task_struct *); extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); @@ -122,10 +122,10 @@ static inline void delayacct_blkio_start(void) __delayacct_blkio_start(); } -static inline void delayacct_blkio_end(void) +static inline void delayacct_blkio_end(struct task_struct *p) { if (current->delays) - __delayacct_blkio_end(); + __delayacct_blkio_end(p); delayacct_clear_flag(DELAYACCT_PF_BLKIO); } @@ -169,7 +169,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk) {} static inline void delayacct_blkio_start(void) {} -static inline void delayacct_blkio_end(void) +static inline void delayacct_blkio_end(struct task_struct *p) {} static inline int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index bae807eb2933..853291714ae0 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -31,11 +31,17 @@ #else #define MODULE_RANDSTRUCT_PLUGIN #endif +#ifdef RETPOLINE +#define MODULE_VERMAGIC_RETPOLINE "retpoline " +#else +#define MODULE_VERMAGIC_RETPOLINE "" +#endif #define VERMAGIC_STRING \ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ MODULE_ARCH_VERMAGIC \ - MODULE_RANDSTRUCT_PLUGIN + MODULE_RANDSTRUCT_PLUGIN \ + MODULE_VERMAGIC_RETPOLINE diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index dd238950df81..663b015dace5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -143,22 +143,22 @@ static inline void nft_data_debug(const struct nft_data *data) * struct nft_ctx - nf_tables rule/set context * * @net: net namespace - * @afi: address family info * @table: the table the chain is contained in * @chain: the chain the rule is contained in * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number + * @family: protocol family * @report: notify via unicast netlink message */ struct nft_ctx { struct net *net; - struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; const struct nlattr * const *nla; u32 portid; u32 seq; + u8 family; bool report; }; @@ -374,6 +374,7 @@ void nft_unregister_set(struct nft_set_type *type); * @list: table set list node * @bindings: list of set bindings * @name: name of the set + * @handle: unique handle of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) @@ -396,6 +397,7 @@ struct nft_set { struct list_head list; struct list_head bindings; char *name; + u64 handle; u32 ktype; u32 dtype; u32 objtype; @@ -946,9 +948,11 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); * @objects: stateful objects in the table * @flowtables: flow tables in the table * @hgenerator: handle generator state + * @handle: table handle * @use: number of chain references to this table * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask + * @afinfo: address family info * @name: name of the table */ struct nft_table { @@ -958,38 +962,14 @@ struct nft_table { struct list_head objects; struct list_head flowtables; u64 hgenerator; + u64 handle; u32 use; - u16 flags:14, + u16 family:6, + flags:8, genmask:2; char *name; }; -enum nft_af_flags { - NFT_AF_NEEDS_DEV = (1 << 0), -}; - -/** - * struct nft_af_info - nf_tables address family info - * - * @list: used internally - * @family: address family - * @nhooks: number of hooks in this family - * @owner: module owner - * @tables: used internally - * @flags: family flags - */ -struct nft_af_info { - struct list_head list; - int family; - unsigned int nhooks; - struct module *owner; - struct list_head tables; - u32 flags; -}; - -int nft_register_afinfo(struct net *, struct nft_af_info *); -void nft_unregister_afinfo(struct net *, struct nft_af_info *); - int nft_register_chain_type(const struct nf_chain_type *); void nft_unregister_chain_type(const struct nf_chain_type *); @@ -1007,9 +987,9 @@ int nft_verdict_dump(struct sk_buff *skb, int type, * @name: name of this stateful object * @genmask: generation mask * @use: number of references to this stateful object - * @data: object data, layout depends on type + * @handle: unique object handle * @ops: object operations - * @data: pointer to object data + * @data: object data, layout depends on type */ struct nft_object { struct list_head list; @@ -1017,6 +997,7 @@ struct nft_object { struct nft_table *table; u32 genmask:2, use:30; + u64 handle; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] @@ -1098,6 +1079,7 @@ void nft_unregister_obj(struct nft_object_type *obj_type); * @ops_len: number of hooks in array * @genmask: generation mask * @use: number of references to this flow table + * @handle: unique object handle * @data: rhashtable and garbage collector * @ops: array of hooks */ @@ -1110,6 +1092,7 @@ struct nft_flowtable { int ops_len; u32 genmask:2, use:30; + u64 handle; /* runtime data below here */ struct nf_hook_ops *ops ____cacheline_aligned; struct nf_flowtable data; @@ -1154,9 +1137,6 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, void nft_trace_notify(struct nft_traceinfo *info); -#define MODULE_ALIAS_NFT_FAMILY(family) \ - MODULE_ALIAS("nft-afinfo-" __stringify(family)) - #define MODULE_ALIAS_NFT_CHAIN(family, name) \ MODULE_ALIAS("nft-chain-" __stringify(family) "-" name) diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 4109b5f3010f..48134353411d 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -7,14 +7,8 @@ struct nft_af_info; struct netns_nftables { - struct list_head af_info; + struct list_head tables; struct list_head commit_list; - struct nft_af_info *ipv4; - struct nft_af_info *ipv6; - struct nft_af_info *inet; - struct nft_af_info *arp; - struct nft_af_info *bridge; - struct nft_af_info *netdev; unsigned int base_seq; u8 gencursor; }; diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 789d818c4a61..2e4b8e436d25 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -376,7 +376,8 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, - struct tcf_exts *exts, bool ovr); + struct tcf_exts *exts, bool ovr, + struct netlink_ext_ack *extack); void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); @@ -556,13 +557,16 @@ static inline int tcf_valid_offset(const struct sk_buff *skb, #include <net/net_namespace.h> static inline int -tcf_change_indev(struct net *net, struct nlattr *indev_tlv) +tcf_change_indev(struct net *net, struct nlattr *indev_tlv, + struct netlink_ext_ack *extack) { char indev[IFNAMSIZ]; struct net_device *dev; - if (nla_strlcpy(indev, indev_tlv, IFNAMSIZ) >= IFNAMSIZ) + if (nla_strlcpy(indev, indev_tlv, IFNAMSIZ) >= IFNAMSIZ) { + NL_SET_ERR_MSG(extack, "Interface name too long"); return -EINVAL; + } dev = __dev_get_by_name(net, indev); if (!dev) return -ENODEV; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cfc19d0ba2ad..cd1be1f25c36 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -239,8 +239,11 @@ struct tcf_proto_ops { int (*change)(struct net *net, struct sk_buff *, struct tcf_proto*, unsigned long, u32 handle, struct nlattr **, - void **, bool); - int (*delete)(struct tcf_proto*, void *, bool*); + void **, bool, + struct netlink_ext_ack *); + int (*delete)(struct tcf_proto *tp, void *arg, + bool *last, + struct netlink_ext_ack *); void (*walk)(struct tcf_proto*, struct tcf_walker *arg); void (*bind_class)(void *, u32, unsigned long); diff --git a/include/net/tcp.h b/include/net/tcp.h index 6939e69d3c37..5a1d26a18599 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -953,6 +953,7 @@ struct rate_sample { u32 prior_in_flight; /* in flight before this ACK */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ }; struct tcp_congestion_ops { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7c2259e8bc54..406c19d6016b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -17,7 +17,7 @@ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ -#define BPF_DW 0x18 /* double word */ +#define BPF_DW 0x18 /* double word (64-bit) */ #define BPF_XADD 0xc0 /* exclusive add */ /* alu/jmp fields */ @@ -938,6 +938,9 @@ struct bpf_map_info { __u32 max_entries; __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/include/uapi/linux/bpf_common.h b/include/uapi/linux/bpf_common.h index 18be90725ab0..ee97668bdadb 100644 --- a/include/uapi/linux/bpf_common.h +++ b/include/uapi/linux/bpf_common.h @@ -15,9 +15,10 @@ /* ld/ldx fields */ #define BPF_SIZE(code) ((code) & 0x18) -#define BPF_W 0x00 -#define BPF_H 0x08 -#define BPF_B 0x10 +#define BPF_W 0x00 /* 32-bit */ +#define BPF_H 0x08 /* 16-bit */ +#define BPF_B 0x10 /* 8-bit */ +/* eBPF BPF_DW 0x18 64-bit */ #define BPF_MODE(code) ((code) & 0xe0) #define BPF_IMM 0x00 #define BPF_ABS 0x20 diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 53e8dd2a3a03..66dceee0ae30 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -174,6 +174,8 @@ enum nft_table_attributes { NFTA_TABLE_NAME, NFTA_TABLE_FLAGS, NFTA_TABLE_USE, + NFTA_TABLE_HANDLE, + NFTA_TABLE_PAD, __NFTA_TABLE_MAX }; #define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) @@ -317,6 +319,7 @@ enum nft_set_desc_attributes { * @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32) * @NFTA_SET_USERDATA: user data (NLA_BINARY) * @NFTA_SET_OBJ_TYPE: stateful object type (NLA_U32: NFT_OBJECT_*) + * @NFTA_SET_HANDLE: set handle (NLA_U64) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -335,6 +338,7 @@ enum nft_set_attributes { NFTA_SET_USERDATA, NFTA_SET_PAD, NFTA_SET_OBJ_TYPE, + NFTA_SET_HANDLE, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) @@ -1314,6 +1318,7 @@ enum nft_ct_helper_attributes { * @NFTA_OBJ_TYPE: stateful object type (NLA_U32) * @NFTA_OBJ_DATA: stateful object data (NLA_NESTED) * @NFTA_OBJ_USE: number of references to this expression (NLA_U32) + * @NFTA_OBJ_HANDLE: object handle (NLA_U64) */ enum nft_object_attributes { NFTA_OBJ_UNSPEC, @@ -1322,6 +1327,8 @@ enum nft_object_attributes { NFTA_OBJ_TYPE, NFTA_OBJ_DATA, NFTA_OBJ_USE, + NFTA_OBJ_HANDLE, + NFTA_OBJ_PAD, __NFTA_OBJ_MAX }; #define NFTA_OBJ_MAX (__NFTA_OBJ_MAX - 1) @@ -1333,6 +1340,7 @@ enum nft_object_attributes { * @NFTA_FLOWTABLE_NAME: name of this flow table (NLA_STRING) * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32) * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32) + * @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64) */ enum nft_flowtable_attributes { NFTA_FLOWTABLE_UNSPEC, @@ -1340,6 +1348,8 @@ enum nft_flowtable_attributes { NFTA_FLOWTABLE_NAME, NFTA_FLOWTABLE_HOOK, NFTA_FLOWTABLE_USE, + NFTA_FLOWTABLE_HANDLE, + NFTA_FLOWTABLE_PAD, __NFTA_FLOWTABLE_MAX }; #define NFTA_FLOWTABLE_MAX (__NFTA_FLOWTABLE_MAX - 1) diff --git a/include/uapi/linux/netfilter_ipv4.h b/include/uapi/linux/netfilter_ipv4.h index e6b1a84f5dd3..c3b060775e13 100644 --- a/include/uapi/linux/netfilter_ipv4.h +++ b/include/uapi/linux/netfilter_ipv4.h @@ -57,6 +57,7 @@ enum nf_ip_hook_priorities { NF_IP_PRI_FIRST = INT_MIN, + NF_IP_PRI_RAW_BEFORE_DEFRAG = -450, NF_IP_PRI_CONNTRACK_DEFRAG = -400, NF_IP_PRI_RAW = -300, NF_IP_PRI_SELINUX_FIRST = -225, diff --git a/include/uapi/linux/netfilter_ipv6.h b/include/uapi/linux/netfilter_ipv6.h index 2f9724611cc2..dc624fd24d25 100644 --- a/include/uapi/linux/netfilter_ipv6.h +++ b/include/uapi/linux/netfilter_ipv6.h @@ -62,6 +62,7 @@ enum nf_ip6_hook_priorities { NF_IP6_PRI_FIRST = INT_MIN, + NF_IP6_PRI_RAW_BEFORE_DEFRAG = -450, NF_IP6_PRI_CONNTRACK_DEFRAG = -400, NF_IP6_PRI_RAW = -300, NF_IP6_PRI_SELINUX_FIRST = -225, diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h new file mode 100644 index 000000000000..f3cc0ef514a7 --- /dev/null +++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _IP6T_SRH_H +#define _IP6T_SRH_H + +#include <linux/types.h> +#include <linux/netfilter.h> + +/* Values for "mt_flags" field in struct ip6t_srh */ +#define IP6T_SRH_NEXTHDR 0x0001 +#define IP6T_SRH_LEN_EQ 0x0002 +#define IP6T_SRH_LEN_GT 0x0004 +#define IP6T_SRH_LEN_LT 0x0008 +#define IP6T_SRH_SEGS_EQ 0x0010 +#define IP6T_SRH_SEGS_GT 0x0020 +#define IP6T_SRH_SEGS_LT 0x0040 +#define IP6T_SRH_LAST_EQ 0x0080 +#define IP6T_SRH_LAST_GT 0x0100 +#define IP6T_SRH_LAST_LT 0x0200 +#define IP6T_SRH_TAG 0x0400 +#define IP6T_SRH_MASK 0x07FF + +/* Values for "mt_invflags" field in struct ip6t_srh */ +#define IP6T_SRH_INV_NEXTHDR 0x0001 +#define IP6T_SRH_INV_LEN_EQ 0x0002 +#define IP6T_SRH_INV_LEN_GT 0x0004 +#define IP6T_SRH_INV_LEN_LT 0x0008 +#define IP6T_SRH_INV_SEGS_EQ 0x0010 +#define IP6T_SRH_INV_SEGS_GT 0x0020 +#define IP6T_SRH_INV_SEGS_LT 0x0040 +#define IP6T_SRH_INV_LAST_EQ 0x0080 +#define IP6T_SRH_INV_LAST_GT 0x0100 +#define IP6T_SRH_INV_LAST_LT 0x0200 +#define IP6T_SRH_INV_TAG 0x0400 +#define IP6T_SRH_INV_MASK 0x07FF + +/** + * struct ip6t_srh - SRH match options + * @ next_hdr: Next header field of SRH + * @ hdr_len: Extension header length field of SRH + * @ segs_left: Segments left field of SRH + * @ last_entry: Last entry field of SRH + * @ tag: Tag field of SRH + * @ mt_flags: match options + * @ mt_invflags: Invert the sense of match options + */ + +struct ip6t_srh { + __u8 next_hdr; + __u8 hdr_len; + __u8 segs_left; + __u8 last_entry; + __u16 tag; + __u16 mt_flags; + __u16 mt_invflags; +}; + +#endif /*_IP6T_SRH_H*/ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index ab94d304a634..b1f66480135b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -49,27 +49,35 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static struct bpf_map *array_map_alloc(union bpf_attr *attr) +static int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); - u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); - struct bpf_array *array; - u64 array_size, mask64; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || (percpu && numa_node != NUMA_NO_NODE)) - return ERR_PTR(-EINVAL); + return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. */ - return ERR_PTR(-E2BIG); + return -E2BIG; + + return 0; +} + +static struct bpf_map *array_map_alloc(union bpf_attr *attr) +{ + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); + struct bpf_array *array; + u64 array_size, mask64; elem_size = round_up(attr->value_size, 8); @@ -112,12 +120,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ - array->map.map_type = attr->map_type; - array->map.key_size = attr->key_size; - array->map.value_size = attr->value_size; - array->map.max_entries = attr->max_entries; - array->map.map_flags = attr->map_flags; - array->map.numa_node = numa_node; + bpf_map_init_from_attr(&array->map, attr); array->elem_size = elem_size; if (!percpu) @@ -327,6 +330,7 @@ static void array_map_free(struct bpf_map *map) } const struct bpf_map_ops array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -337,6 +341,7 @@ const struct bpf_map_ops array_map_ops = { }; const struct bpf_map_ops percpu_array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -345,12 +350,12 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) +static int fd_array_map_alloc_check(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return array_map_alloc(attr); + return -EINVAL; + return array_map_alloc_check(attr); } static void fd_array_map_free(struct bpf_map *map) @@ -474,7 +479,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map) } const struct bpf_map_ops prog_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -561,7 +567,8 @@ static void perf_event_fd_array_release(struct bpf_map *map, } const struct bpf_map_ops perf_event_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -592,7 +599,8 @@ static void cgroup_fd_array_free(struct bpf_map *map) } const struct bpf_map_ops cgroup_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -610,7 +618,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return inner_map_meta; - map = fd_array_map_alloc(attr); + map = array_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -673,6 +681,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, } const struct bpf_map_ops array_of_maps_map_ops = { + .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = array_map_get_next_key, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 25e723b0dfd4..3aa0658add76 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -300,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +/* All BPF JIT sysctl knobs here. */ +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_harden __read_mostly; +int bpf_jit_kallsyms __read_mostly; + static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, unsigned long *symbol_start, @@ -381,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; -int bpf_jit_kallsyms __read_mostly; - static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) { WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); @@ -563,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } -int bpf_jit_harden __read_mostly; - static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) @@ -970,7 +971,7 @@ select_insn: DST = tmp; CONT; ALU_MOD_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); @@ -989,7 +990,7 @@ select_insn: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); @@ -1379,9 +1380,13 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) } #else -static unsigned int __bpf_prog_ret0(const void *ctx, - const struct bpf_insn *insn) +static unsigned int __bpf_prog_ret0_warn(const void *ctx, + const struct bpf_insn *insn) { + /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON + * is not working properly, so warn about it! + */ + WARN_ON_ONCE(1); return 0; } #endif @@ -1441,7 +1446,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; #else - fp->bpf_func = __bpf_prog_ret0; + fp->bpf_func = __bpf_prog_ret0_warn; #endif /* eBPF JITs can rewrite the program in case constant diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 584e02227671..d7ea96218516 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -591,9 +591,100 @@ unlock: raw_spin_unlock(&trie->lock); } -static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) +static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { - return -ENOTSUPP; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; + struct lpm_trie_node *node, *next_node = NULL, *parent; + struct lpm_trie_node **node_stack = NULL; + struct lpm_trie_node __rcu **root; + int err = 0, stack_ptr = -1; + unsigned int next_bit; + size_t matchlen; + + /* The get_next_key follows postorder. For the 4 node example in + * the top of this file, the trie_get_next_key() returns the following + * one after another: + * 192.168.0.0/24 + * 192.168.1.0/24 + * 192.168.128.0/24 + * 192.168.0.0/16 + * + * The idea is to return more specific keys before less specific ones. + */ + + /* Empty trie */ + if (!rcu_dereference(trie->root)) + return -ENOENT; + + /* For invalid key, find the leftmost node in the trie */ + if (!key || key->prefixlen > trie->max_prefixlen) { + root = &trie->root; + goto find_leftmost; + } + + node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), + GFP_USER | __GFP_NOWARN); + if (!node_stack) + return -ENOMEM; + + /* Try to find the exact node for the given key */ + for (node = rcu_dereference(trie->root); node;) { + node_stack[++stack_ptr] = node; + matchlen = longest_prefix_match(trie, node, key); + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) { + root = &trie->root; + goto find_leftmost; + } + + /* The node with the exactly-matching key has been found, + * find the first node in postorder after the matched node. + */ + node = node_stack[stack_ptr]; + while (stack_ptr > 0) { + parent = node_stack[stack_ptr - 1]; + if (rcu_dereference(parent->child[0]) == node && + rcu_dereference(parent->child[1])) { + root = &parent->child[1]; + goto find_leftmost; + } + if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { + next_node = parent; + goto do_copy; + } + + node = parent; + stack_ptr--; + } + + /* did not find anything */ + err = -ENOENT; + goto free_stack; + +find_leftmost: + /* Find the leftmost non-intermediate node, all intermediate nodes + * have exact two children, so this function will never return NULL. + */ + for (node = rcu_dereference(*root); node;) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + next_node = node; + node = rcu_dereference(node->child[0]); + } +do_copy: + next_key->prefixlen = next_node->prefixlen; + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), + next_node->data, trie->data_size); +free_stack: + kfree(node_stack); + return err; } const struct bpf_map_ops trie_map_ops = { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index a88cebf368bf..c9401075b58c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -230,9 +230,12 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, .prog = prog, .info = info, }; + struct bpf_prog_aux *aux = prog->aux; struct inode *ns_inode; struct path ns_path; + char __user *uinsns; void *res; + u32 ulen; res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); if (IS_ERR(res)) { @@ -241,6 +244,26 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, return PTR_ERR(res); } + down_read(&bpf_devs_lock); + + if (!aux->offload) { + up_read(&bpf_devs_lock); + return -ENODEV; + } + + ulen = info->jited_prog_len; + info->jited_prog_len = aux->offload->jited_len; + if (info->jited_prog_len & ulen) { + uinsns = u64_to_user_ptr(info->jited_prog_insns); + ulen = min_t(u32, info->jited_prog_len, ulen); + if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { + up_read(&bpf_devs_lock); + return -EFAULT; + } + } + + up_read(&bpf_devs_lock); + ns_inode = ns_path.dentry->d_inode; info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); info->netns_ino = ns_inode->i_ino; @@ -276,7 +299,8 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_type != BPF_MAP_TYPE_HASH) + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_type != BPF_MAP_TYPE_HASH) return ERR_PTR(-EINVAL); offmap = kzalloc(sizeof(*offmap), GFP_USER); @@ -389,6 +413,61 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) return ret; } +struct ns_get_path_bpf_map_args { + struct bpf_offloaded_map *offmap; + struct bpf_map_info *info; +}; + +static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_map_args *args = private_data; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (args->offmap->netdev) { + args->info->ifindex = args->offmap->netdev->ifindex; + net = dev_net(args->offmap->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) +{ + struct ns_get_path_bpf_map_args args = { + .offmap = map_to_offmap(map), + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c691b9e972e3..5bdb0cc84ad2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1504,6 +1504,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -1724,19 +1726,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, goto done; } - ulen = info.jited_prog_len; - info.jited_prog_len = prog->jited_len; - if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { - uinsns = u64_to_user_ptr(info.jited_prog_insns); - ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; - } else { - info.jited_prog_insns = 0; - } - } - ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { @@ -1762,6 +1751,24 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, err = bpf_prog_offload_info_fill(&info, prog); if (err) return err; + goto done; + } + + /* NOTE: the following code is supposed to be skipped for offload. + * bpf_prog_offload_info_fill() is the place to fill similar fields + * for offload. + */ + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + if (bpf_dump_raw_ok()) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } else { + info.jited_prog_insns = 0; + } } done: @@ -1794,6 +1801,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.map_flags = map->map_flags; memcpy(info.name, map->name, sizeof(map->name)); + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_info_fill(&info, map); + if (err) + return err; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2e7a43edf264..dfb138b46488 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1349,6 +1349,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } +static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = cur_regs(env) + regno; + + return reg->type == PTR_TO_CTX; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -1728,6 +1735,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return -EACCES; } + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_XADD stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); @@ -1837,6 +1850,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +static bool arg_type_is_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_MEM || + type == ARG_PTR_TO_MEM_OR_NULL || + type == ARG_PTR_TO_UNINIT_MEM; +} + +static bool arg_type_is_mem_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_SIZE || + type == ARG_CONST_SIZE_OR_ZERO; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -1886,9 +1912,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_MEM || - arg_type == ARG_PTR_TO_MEM_OR_NULL || - arg_type == ARG_PTR_TO_UNINIT_MEM) { + } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a SCALAR_VALUE type. Final test @@ -1949,25 +1973,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* bpf_xxx(..., buf, len) call will access 'len' bytes - * from stack pointer 'buf'. Check it - * note: regno == len, regno - 1 == buf - */ - if (regno == 0) { - /* kernel subsystem misconfigured verifier */ - verbose(env, - "ARG_CONST_SIZE cannot be first argument\n"); - return -EACCES; - } - /* The register is SCALAR_VALUE; the access check * happens using its boundaries. */ - if (!tnum_is_const(reg->var_off)) /* For unprivileged variable accesses, disable raw * mode so that the program is required to @@ -2111,7 +2122,7 @@ error: return -EINVAL; } -static int check_raw_mode(const struct bpf_func_proto *fn) +static bool check_raw_mode_ok(const struct bpf_func_proto *fn) { int count = 0; @@ -2126,7 +2137,44 @@ static int check_raw_mode(const struct bpf_func_proto *fn) if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; - return count > 1 ? -EINVAL : 0; + /* We only support one arg being in raw mode at the moment, + * which is sufficient for the helper functions we have + * right now. + */ + return count <= 1; +} + +static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, + enum bpf_arg_type arg_next) +{ + return (arg_type_is_mem_ptr(arg_curr) && + !arg_type_is_mem_size(arg_next)) || + (!arg_type_is_mem_ptr(arg_curr) && + arg_type_is_mem_size(arg_next)); +} + +static bool check_arg_pair_ok(const struct bpf_func_proto *fn) +{ + /* bpf_xxx(..., buf, len) call will access 'len' + * bytes from memory 'buf'. Both arg types need + * to be paired, so make sure there's no buggy + * helper function specification. + */ + if (arg_type_is_mem_size(fn->arg1_type) || + arg_type_is_mem_ptr(fn->arg5_type) || + check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || + check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || + check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || + check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) + return false; + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn) +{ + return check_raw_mode_ok(fn) && + check_arg_pair_ok(fn) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2282,7 +2330,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id); - if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); @@ -2306,10 +2353,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - /* We only support one arg being in raw mode at the moment, which - * is sufficient for the helper functions we have right now. - */ - err = check_raw_mode(fn); + err = check_func_proto(fn); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); @@ -2478,17 +2522,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; - if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, state); - verbose(env, - "verifier internal error: known but bad sbounds\n"); - return -EINVAL; - } - if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, state); - verbose(env, - "verifier internal error: known but bad ubounds\n"); - return -EINVAL; + if ((known && (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds derived from + * e.g. dead branches. + */ + __mark_reg_unknown(dst_reg); + return 0; } if (BPF_CLASS(insn->code) != BPF_ALU64) { @@ -2680,6 +2720,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, src_known = tnum_is_const(src_reg.var_off); dst_known = tnum_is_const(dst_reg->var_off); + if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds derived from + * e.g. dead branches. + */ + __mark_reg_unknown(dst_reg); + return 0; + } + if (!src_known && opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { __mark_reg_unknown(dst_reg); @@ -4661,6 +4710,12 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_ST stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, @@ -4779,7 +4834,8 @@ process_bpf_exit: insn_idx++; } - verbose(env, "processed %d insns, stack depth ", insn_processed); + verbose(env, "processed %d insns (limit %d), stack depth ", + insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); for (i = 0; i < env->subprog_cnt + 1; i++) { u32 depth = env->subprog_stack_depth[i]; @@ -5330,6 +5386,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + /* due to JIT bugs clear upper 32-bits of src register + * before div/mod operation + */ + insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); + insn_buf[1] = *insn; + cnt = 2; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; if (insn->src_reg == BPF_PSEUDO_CALL) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 2cf06c274e4c..7e4c44538119 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4447,6 +4447,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.threads", + .flags = CFTYPE_NS_DELEGATABLE, .release = cgroup_procs_release, .seq_start = cgroup_threads_start, .seq_next = cgroup_procs_next, diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 4a1c33416b6a..e2764d767f18 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk) * Finish delay accounting for a statistic using its timestamps (@start), * accumalator (@total) and @count */ -static void delayacct_end(u64 *start, u64 *total, u32 *count) +static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count) { s64 ns = ktime_get_ns() - *start; unsigned long flags; if (ns > 0) { - spin_lock_irqsave(¤t->delays->lock, flags); + spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - spin_unlock_irqrestore(¤t->delays->lock, flags); + spin_unlock_irqrestore(lock, flags); } } @@ -69,17 +69,25 @@ void __delayacct_blkio_start(void) current->delays->blkio_start = ktime_get_ns(); } -void __delayacct_blkio_end(void) +/* + * We cannot rely on the `current` macro, as we haven't yet switched back to + * the process being woken. + */ +void __delayacct_blkio_end(struct task_struct *p) { - if (current->delays->flags & DELAYACCT_PF_SWAPIN) - /* Swapin block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->swapin_delay, - ¤t->delays->swapin_count); - else /* Other block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_delay, - ¤t->delays->blkio_count); + struct task_delay_info *delays = p->delays; + u64 *total; + u32 *count; + + if (p->delays->flags & DELAYACCT_PF_SWAPIN) { + total = &delays->swapin_delay; + count = &delays->swapin_count; + } else { + total = &delays->blkio_delay; + count = &delays->blkio_count; + } + + delayacct_end(&delays->lock, &delays->blkio_start, total, count); } int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -153,8 +161,10 @@ void __delayacct_freepages_start(void) void __delayacct_freepages_end(void) { - delayacct_end(¤t->delays->freepages_start, - ¤t->delays->freepages_delay, - ¤t->delays->freepages_count); + delayacct_end( + ¤t->delays->lock, + ¤t->delays->freepages_start, + ¤t->delays->freepages_delay, + ¤t->delays->freepages_count); } diff --git a/kernel/futex.c b/kernel/futex.c index 57d0b3657e16..8c5424dd5924 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + if (nr_wake < 0 || nr_requeue < 0) + return -EINVAL; + /* * When PI not supported: return -ENOSYS if requeue_pi is true, * consequently the compiler knows requeue_pi is always false past @@ -2294,21 +2297,17 @@ static void unqueue_me_pi(struct futex_q *q) spin_unlock(q->lock_ptr); } -/* - * Fixup the pi_state owner with the new owner. - * - * Must be called with hash bucket lock held and mm->sem held for non - * private futexes. - */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *argowner) { - u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; u32 uval, uninitialized_var(curval), newval; - struct task_struct *oldowner; + struct task_struct *oldowner, *newowner; + u32 newtid; int ret; + lockdep_assert_held(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); oldowner = pi_state->owner; @@ -2317,11 +2316,17 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, newtid |= FUTEX_OWNER_DIED; /* - * We are here either because we stole the rtmutex from the - * previous highest priority waiter or we are the highest priority - * waiter but have failed to get the rtmutex the first time. + * We are here because either: + * + * - we stole the lock and pi_state->owner needs updating to reflect + * that (@argowner == current), + * + * or: + * + * - someone stole our lock and we need to fix things to point to the + * new owner (@argowner == NULL). * - * We have to replace the newowner TID in the user space variable. + * Either way, we have to replace the TID in the user space variable. * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state @@ -2334,6 +2339,42 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * in the PID check in lookup_pi_state. */ retry: + if (!argowner) { + if (oldowner != current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + + if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { + /* We got the lock after all, nothing to fix. */ + ret = 0; + goto out_unlock; + } + + /* + * Since we just failed the trylock; there must be an owner. + */ + newowner = rt_mutex_owner(&pi_state->pi_mutex); + BUG_ON(!newowner); + } else { + WARN_ON_ONCE(argowner != current); + if (oldowner == current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + newowner = argowner; + } + + newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; + if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; @@ -2434,9 +2475,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case: * - * We can safely read pi_state->owner without holding wait_lock - * because we now own the rt_mutex, only the owner will attempt - * to change it. + * Speculative pi_state->owner read (we don't hold wait_lock); + * since we own the lock pi_state->owner == current is the + * stable state, anything else needs more attention. */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); @@ -2444,6 +2485,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) } /* + * If we didn't get the lock; check if anybody stole it from us. In + * that case, we need to fix up the uval to point to them instead of + * us, otherwise bad things happen. [10] + * + * Another speculative read; pi_state->owner == current is unstable + * but needs our attention. + */ + if (q->pi_state->owner == current) { + ret = fixup_pi_state_owner(uaddr, q, NULL); + goto out; + } + + /* * Paranoia check. If we did not take the lock, then we should not be * the owner of the rt_mutex. */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6f3dba6e4e9e..65cc0cb984e6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } +static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock) +{ + int ret = try_to_take_rt_mutex(lock, current, NULL); + + /* + * try_to_take_rt_mutex() sets the lock waiters bit + * unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + + return ret; +} + /* * Slow path try-lock function: */ @@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ raw_spin_lock_irqsave(&lock->wait_lock, flags); - ret = try_to_take_rt_mutex(lock, current, NULL); - - /* - * try_to_take_rt_mutex() sets the lock waiters bit - * unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); + ret = __rt_mutex_slowtrylock(lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); @@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) return rt_mutex_slowtrylock(lock); } +int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) +{ + return __rt_mutex_slowtrylock(lock); +} + /** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 124e98ca0b17..68686b3ec3c1 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter); extern int rt_mutex_futex_trylock(struct rt_mutex *l); +extern int __rt_mutex_futex_trylock(struct rt_mutex *l); extern void rt_mutex_futex_unlock(struct rt_mutex *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 644fa2e3d993..a7bf32aabfda 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->state = TASK_WAKING; if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2069,7 +2069,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #else /* CONFIG_SMP */ if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2122,7 +2122,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) if (!task_on_rq_queued(p)) { if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&rq->nr_iowait); } ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 89a9e1b4264a..0bcf00e3ce48 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1696,7 +1696,7 @@ void run_local_timers(void) hrtimer_run_queues(); /* Raise the softirq only if required. */ if (time_before(jiffies, base->clk)) { - if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) return; /* CPU is awake, so check the deferrable base. */ base++; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f274468cbc45..fc2838ac8b78 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -245,7 +245,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, */ #define __BPF_TP_EMIT() __BPF_ARG3_TP() #define __BPF_TP(...) \ - __trace_printk(1 /* Fake ip will not be printed. */, \ + __trace_printk(0 /* Fake ip */, \ fmt, ##__VA_ARGS__) #define __BPF_ARG1_TP(...) \ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0cddf60186da..5af2842dea96 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2579,8 +2579,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = RB_CTX_NORMAL; else bit = pc & NMI_MASK ? RB_CTX_NMI : - pc & HARDIRQ_MASK ? RB_CTX_IRQ : - pc & SOFTIRQ_OFFSET ? 2 : RB_CTX_SOFTIRQ; + pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; if (unlikely(val & (1 << bit))) return 1; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ec0f9aa4e151..1b87157edbff 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; + bool first = false; int last_i; int i; @@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) list_for_each_entry_safe(call, p, &ftrace_events, list) { /* events are usually grouped together with systems */ if (!last_system || call->class->system != last_system) { + first = true; last_i = 0; last_system = call->class->system; } + /* + * Since calls are grouped by systems, the likelyhood that the + * next call in the iteration belongs to the same system as the + * previous call is high. As an optimization, we skip seaching + * for a map[] that matches the call's system if the last call + * was from the same system. That's what last_i is for. If the + * call has the same system as the previous call, then last_i + * will be the index of the first map[] that has a matching + * system. + */ for (i = last_i; i < len; i++) { if (call->class->system == map[i]->system) { /* Save the first system if need be */ - if (!last_i) + if (first) { last_i = i; + first = false; + } update_event_printk(call, map[i]); } } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 43d18cb46308..f699122dab32 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,6 +48,7 @@ #include <linux/moduleparam.h> #include <linux/uaccess.h> #include <linux/sched/isolation.h> +#include <linux/nmi.h> #include "workqueue_internal.h" @@ -4463,6 +4464,12 @@ void show_workqueue_state(void) if (pwq->nr_active || !list_empty(&pwq->delayed_works)) show_pwq(pwq); spin_unlock_irqrestore(&pwq->pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_workqueue_state(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); } } @@ -4490,6 +4497,12 @@ void show_workqueue_state(void) pr_cont("\n"); next_pool: spin_unlock_irqrestore(&pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_workqueue_state(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); } rcu_read_unlock_sched(); diff --git a/lib/test_bpf.c b/lib/test_bpf.c index f369889e521d..e3938e395cba 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6109,6 +6109,110 @@ static struct bpf_test tests[] = { { { ETH_HLEN, 42 } }, .fill_helper = bpf_fill_ld_abs_vlan_push_pop2, }, + /* Checking interpreter vs JIT wrt signed extended imms. */ + { + "JNE signed compare, test 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R1, 0xfefbbc12), + BPF_ALU32_IMM(BPF_MOV, R3, 0xffff0000), + BPF_MOV64_REG(R2, R1), + BPF_ALU64_REG(BPF_AND, R2, R3), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_JMP_IMM(BPF_JNE, R2, -17104896, 1), + BPF_ALU32_IMM(BPF_MOV, R0, 2), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JNE signed compare, test 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R1, 0xfefbbc12), + BPF_ALU32_IMM(BPF_MOV, R3, 0xffff0000), + BPF_MOV64_REG(R2, R1), + BPF_ALU64_REG(BPF_AND, R2, R3), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_JMP_IMM(BPF_JNE, R2, 0xfefb0000, 1), + BPF_ALU32_IMM(BPF_MOV, R0, 2), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JNE signed compare, test 3", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R1, 0xfefbbc12), + BPF_ALU32_IMM(BPF_MOV, R3, 0xffff0000), + BPF_ALU32_IMM(BPF_MOV, R4, 0xfefb0000), + BPF_MOV64_REG(R2, R1), + BPF_ALU64_REG(BPF_AND, R2, R3), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_JMP_REG(BPF_JNE, R2, R4, 1), + BPF_ALU32_IMM(BPF_MOV, R0, 2), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 2 } }, + }, + { + "JNE signed compare, test 4", + .u.insns_int = { + BPF_LD_IMM64(R1, -17104896), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_JMP_IMM(BPF_JNE, R1, -17104896, 1), + BPF_ALU32_IMM(BPF_MOV, R0, 2), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 2 } }, + }, + { + "JNE signed compare, test 5", + .u.insns_int = { + BPF_LD_IMM64(R1, 0xfefb0000), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_JMP_IMM(BPF_JNE, R1, 0xfefb0000, 1), + BPF_ALU32_IMM(BPF_MOV, R0, 2), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JNE signed compare, test 6", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x7efb0000), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_JMP_IMM(BPF_JNE, R1, 0x7efb0000, 1), + BPF_ALU32_IMM(BPF_MOV, R0, 2), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 2 } }, + }, + { + "JNE signed compare, test 7", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffff0000), + BPF_STMT(BPF_MISC | BPF_TAX, 0), + BPF_STMT(BPF_LD | BPF_IMM, 0xfefbbc12), + BPF_STMT(BPF_ALU | BPF_AND | BPF_X, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0xfefb0000, 1, 0), + BPF_STMT(BPF_RET | BPF_K, 1), + BPF_STMT(BPF_RET | BPF_K, 2), + }, + CLASSIC | FLAG_NO_DATA, + {}, + { { 0, 2 } }, + }, }; static struct net_device dev; diff --git a/mm/memory.c b/mm/memory.c index ca5674cbaff2..793004608332 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2857,8 +2857,11 @@ int do_swap_page(struct vm_fault *vmf) int ret = 0; bool vma_readahead = swap_use_vma_readahead(); - if (vma_readahead) + if (vma_readahead) { page = swap_readahead_detect(vmf, &swap_ra); + swapcache = page; + } + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) { if (page) put_page(page); @@ -2889,9 +2892,12 @@ int do_swap_page(struct vm_fault *vmf) delayacct_set_flag(DELAYACCT_PF_SWAPIN); - if (!page) + if (!page) { page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, vmf->address); + swapcache = page; + } + if (!page) { struct swap_info_struct *si = swp_swap_info(entry); diff --git a/mm/page_owner.c b/mm/page_owner.c index 8592543a0f15..270a8219ccd0 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -616,7 +616,6 @@ static void init_early_allocated_pages(void) { pg_data_t *pgdat; - drain_all_pages(NULL); for_each_online_pgdat(pgdat) init_zones_in_node(pgdat); } diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 37817d25b63d..02c4b409d317 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -2445,7 +2445,6 @@ static int __init ebtables_init(void) return ret; } - printk(KERN_INFO "Ebtables v2.0 registered\n"); return 0; } @@ -2453,7 +2452,6 @@ static void __exit ebtables_fini(void) { nf_unregister_sockopt(&ebt_sockopts); xt_unregister_target(&ebt_standard_target); - printk(KERN_INFO "Ebtables v2.0 unregistered\n"); } EXPORT_SYMBOL(ebt_register_table); diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index 86774b5c3b73..5160cf614176 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -42,40 +42,6 @@ nft_do_chain_bridge(void *priv, return nft_do_chain(&pkt, priv); } -static struct nft_af_info nft_af_bridge __read_mostly = { - .family = NFPROTO_BRIDGE, - .nhooks = NF_BR_NUMHOOKS, - .owner = THIS_MODULE, -}; - -static int nf_tables_bridge_init_net(struct net *net) -{ - net->nft.bridge = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.bridge == NULL) - return -ENOMEM; - - memcpy(net->nft.bridge, &nft_af_bridge, sizeof(nft_af_bridge)); - - if (nft_register_afinfo(net, net->nft.bridge) < 0) - goto err; - - return 0; -err: - kfree(net->nft.bridge); - return -ENOMEM; -} - -static void nf_tables_bridge_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.bridge); - kfree(net->nft.bridge); -} - -static struct pernet_operations nf_tables_bridge_net_ops = { - .init = nf_tables_bridge_init_net, - .exit = nf_tables_bridge_exit_net, -}; - static const struct nf_chain_type filter_bridge = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -97,27 +63,11 @@ static const struct nf_chain_type filter_bridge = { static int __init nf_tables_bridge_init(void) { - int ret; - - ret = nft_register_chain_type(&filter_bridge); - if (ret < 0) - return ret; - - ret = register_pernet_subsys(&nf_tables_bridge_net_ops); - if (ret < 0) - goto err_register_subsys; - - return ret; - -err_register_subsys: - nft_unregister_chain_type(&filter_bridge); - - return ret; + return nft_register_chain_type(&filter_bridge); } static void __exit nf_tables_bridge_exit(void) { - unregister_pernet_subsys(&nf_tables_bridge_net_ops); nft_unregister_chain_type(&filter_bridge); } @@ -126,4 +76,4 @@ module_exit(nf_tables_bridge_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(AF_BRIDGE); +MODULE_ALIAS_NFT_CHAIN(AF_BRIDGE, "filter"); diff --git a/net/can/af_can.c b/net/can/af_can.c index f22b886ed081..6da324550eec 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -721,20 +721,16 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (WARN_ONCE(dev->type != ARPHRD_CAN || - skb->len != CAN_MTU || - cfd->len > CAN_MAX_DLEN, - "PF_CAN: dropped non conform CAN skbuf: " - "dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len)) - goto drop; + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU || + cfd->len > CAN_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN skbuf: dev type %d, len %d, datalen %d\n", + dev->type, skb->len, cfd->len); + kfree_skb(skb); + return NET_RX_DROP; + } can_receive(skb, dev); return NET_RX_SUCCESS; - -drop: - kfree_skb(skb); - return NET_RX_DROP; } static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, @@ -742,20 +738,16 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (WARN_ONCE(dev->type != ARPHRD_CAN || - skb->len != CANFD_MTU || - cfd->len > CANFD_MAX_DLEN, - "PF_CAN: dropped non conform CAN FD skbuf: " - "dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len)) - goto drop; + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU || + cfd->len > CANFD_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN FD skbuf: dev type %d, len %d, datalen %d\n", + dev->type, skb->len, cfd->len); + kfree_skb(skb); + return NET_RX_DROP; + } can_receive(skb, dev); return NET_RX_SUCCESS; - -drop: - kfree_skb(skb); - return NET_RX_DROP; } /* diff --git a/net/core/filter.c b/net/core/filter.c index db2ee8c7e1bd..18da42a81d0c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -458,6 +458,10 @@ do_pass: convert_bpf_extensions(fp, &insn)) break; + if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) + *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; @@ -2861,7 +2865,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -3150,7 +3154,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) @@ -3456,6 +3460,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: @@ -4526,6 +4532,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = { }; const struct bpf_prog_ops sk_filter_prog_ops = { + .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 02db7b122a73..559db9ea8d86 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1031,8 +1031,8 @@ ip_proto_again: out_good: ret = true; - key_control->thoff = (u16)nhoff; out: + key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; @@ -1040,7 +1040,6 @@ out: out_bad: ret = false; - key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); goto out; } EXPORT_SYMBOL(__skb_flow_dissect); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a47ad6cd41c0..f2d0462611c3 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -25,6 +25,7 @@ static int zero = 0; static int one = 1; +static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -250,6 +251,46 @@ static int proc_do_rss_key(struct ctl_table *table, int write, return proc_dostring(&fake_table, write, buffer, lenp, ppos); } +#ifdef CONFIG_BPF_JIT +static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, jit_enable = *(int *)table->data; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &jit_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (jit_enable < 2 || + (jit_enable == 2 && bpf_dump_raw_ok())) { + *(int *)table->data = jit_enable; + if (jit_enable == 2) + pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); + } else { + ret = -EPERM; + } + } + return ret; +} + +# ifdef CONFIG_HAVE_EBPF_JIT +static int +proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +# endif +#endif + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -325,13 +366,14 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - .proc_handler = proc_dointvec -#else - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax_bpf_enable, +# ifdef CONFIG_BPF_JIT_ALWAYS_ON .extra1 = &one, .extra2 = &one, -#endif +# else + .extra1 = &zero, + .extra2 = &two, +# endif }, # ifdef CONFIG_HAVE_EBPF_JIT { @@ -339,14 +381,18 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_harden, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "bpf_jit_kallsyms", .data = &bpf_jit_kallsyms, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &zero, + .extra2 = &one, }, # endif #endif diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 7d5d444964aa..5f52236780b4 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -79,8 +79,9 @@ config NF_TABLES_ARP endif # NF_TABLES config NF_FLOW_TABLE_IPV4 - select NF_FLOW_TABLE tristate "Netfilter flow table IPv4 module" + depends on NF_CONNTRACK && NF_TABLES + select NF_FLOW_TABLE help This option adds the flow table IPv4 support. @@ -157,6 +158,7 @@ config NF_NAT_SNMP_BASIC depends on NF_CONNTRACK_SNMP depends on NETFILTER_ADVANCED default NF_NAT && NF_CONNTRACK_SNMP + select ASN1 ---help--- This module implements an Application Layer Gateway (ALG) for @@ -342,6 +344,7 @@ config IP_NF_TARGET_CLUSTERIP depends on NF_CONNTRACK_IPV4 depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK + select NETFILTER_FAMILY_ARP help The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 8bb1f0c7a375..2dad20eefd26 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -27,9 +27,15 @@ obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o # NAT helpers (nf_conntrack) obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o + +nf_nat_snmp_basic-y := nf_nat_snmp_basic-asn1.o nf_nat_snmp_basic_main.o +nf_nat_snmp_basic-y : nf_nat_snmp_basic-asn1.h nf_nat_snmp_basic-asn1.c obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o +clean-files := nf_nat_snmp_basic-asn1.c nf_nat_snmp_basic-asn1.h + obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o + # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index bf8a5340f15e..5f7c0d643fb3 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1656,7 +1656,6 @@ static int __init arp_tables_init(void) if (ret < 0) goto err4; - pr_info("arp_tables: (C) 2002 David S. Miller\n"); return 0; err4: diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 0b975aa2d363..1f534aec22f0 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1939,7 +1939,6 @@ static int __init ip_tables_init(void) if (ret < 0) goto err5; - pr_info("(C) 2000-2006 Netfilter Core Team\n"); return 0; err5: diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index a869d1fea7d9..960625aabf04 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -3,6 +3,7 @@ * * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/slab.h> @@ -12,6 +13,10 @@ static int __net_init iptable_raw_table_init(struct net *net); +static bool raw_before_defrag __read_mostly; +MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); +module_param(raw_before_defrag, bool, 0000); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, @@ -21,6 +26,15 @@ static const struct xt_table packet_raw = { .table_init = iptable_raw_table_init, }; +static const struct xt_table packet_raw_before_defrag = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_RAW_BEFORE_DEFRAG, + .table_init = iptable_raw_table_init, +}; + /* The work comes in here from netfilter.c. */ static unsigned int iptable_raw_hook(void *priv, struct sk_buff *skb, @@ -34,15 +48,19 @@ static struct nf_hook_ops *rawtable_ops __read_mostly; static int __net_init iptable_raw_table_init(struct net *net) { struct ipt_replace *repl; + const struct xt_table *table = &packet_raw; int ret; + if (raw_before_defrag) + table = &packet_raw_before_defrag; + if (net->ipv4.iptable_raw) return 0; - repl = ipt_alloc_initial_table(&packet_raw); + repl = ipt_alloc_initial_table(table); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops, + ret = ipt_register_table(net, table, repl, rawtable_ops, &net->ipv4.iptable_raw); kfree(repl); return ret; @@ -63,8 +81,15 @@ static struct pernet_operations iptable_raw_net_ops = { static int __init iptable_raw_init(void) { int ret; + const struct xt_table *table = &packet_raw; + + if (raw_before_defrag) { + table = &packet_raw_before_defrag; + + pr_info("Enabling raw table before defrag\n"); + } - rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook); + rawtable_ops = xt_hook_ops_alloc(table, iptable_raw_hook); if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 37fe1616ca0b..a0d3ad60a411 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -78,6 +78,8 @@ static unsigned int ipv4_conntrack_defrag(void *priv, if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; #endif + if (skb->_nfct == IP_CT_UNTRACKED) + return NF_ACCEPT; #endif /* Gather fragments. */ if (ip_is_fragment(ip_hdr(skb))) { diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 new file mode 100644 index 000000000000..24b73268f362 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 @@ -0,0 +1,177 @@ +Message ::= + SEQUENCE { + version + INTEGER ({snmp_version}), + + community + OCTET STRING, + + pdu + PDUs + } + + +ObjectName ::= + OBJECT IDENTIFIER + +ObjectSyntax ::= + CHOICE { + simple + SimpleSyntax, + + application-wide + ApplicationSyntax + } + +SimpleSyntax ::= + CHOICE { + integer-value + INTEGER, + + string-value + OCTET STRING, + + objectID-value + OBJECT IDENTIFIER + } + +ApplicationSyntax ::= + CHOICE { + ipAddress-value + IpAddress, + + counter-value + Counter32, + + timeticks-value + TimeTicks, + + arbitrary-value + Opaque, + + big-counter-value + Counter64, + + unsigned-integer-value + Unsigned32 + } + +IpAddress ::= + [APPLICATION 0] + IMPLICIT OCTET STRING OPTIONAL ({snmp_helper}) + +Counter32 ::= + [APPLICATION 1] + IMPLICIT INTEGER OPTIONAL + +Unsigned32 ::= + [APPLICATION 2] + IMPLICIT INTEGER OPTIONAL + +Gauge32 ::= Unsigned32 OPTIONAL + +TimeTicks ::= + [APPLICATION 3] + IMPLICIT INTEGER OPTIONAL + +Opaque ::= + [APPLICATION 4] + IMPLICIT OCTET STRING OPTIONAL + +Counter64 ::= + [APPLICATION 6] + IMPLICIT INTEGER OPTIONAL + +PDUs ::= + CHOICE { + get-request + GetRequest-PDU, + + get-next-request + GetNextRequest-PDU, + + get-bulk-request + GetBulkRequest-PDU, + + response + Response-PDU, + + set-request + SetRequest-PDU, + + inform-request + InformRequest-PDU, + + snmpV2-trap + SNMPv2-Trap-PDU, + + report + Report-PDU + } + +GetRequest-PDU ::= + [0] IMPLICIT PDU OPTIONAL + +GetNextRequest-PDU ::= + [1] IMPLICIT PDU OPTIONAL + +Response-PDU ::= + [2] IMPLICIT PDU OPTIONAL + +SetRequest-PDU ::= + [3] IMPLICIT PDU OPTIONAL + +-- [4] is obsolete + +GetBulkRequest-PDU ::= + [5] IMPLICIT PDU OPTIONAL + +InformRequest-PDU ::= + [6] IMPLICIT PDU OPTIONAL + +SNMPv2-Trap-PDU ::= + [7] IMPLICIT PDU OPTIONAL + +Report-PDU ::= + [8] IMPLICIT PDU OPTIONAL + +PDU ::= + SEQUENCE { + request-id + INTEGER, + + error-status + INTEGER, + + error-index + INTEGER, + + variable-bindings + VarBindList + } + + +VarBind ::= + SEQUENCE { + name + ObjectName, + + CHOICE { + value + ObjectSyntax, + + unSpecified + NULL, + + noSuchObject + [0] IMPLICIT NULL, + + noSuchInstance + [1] IMPLICIT NULL, + + endOfMibView + [2] IMPLICIT NULL + } +} + +VarBindList ::= SEQUENCE OF VarBind diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c deleted file mode 100644 index d5b1e0b3f687..000000000000 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ /dev/null @@ -1,1286 +0,0 @@ -/* - * nf_nat_snmp_basic.c - * - * Basic SNMP Application Layer Gateway - * - * This IP NAT module is intended for use with SNMP network - * discovery and monitoring applications where target networks use - * conflicting private address realms. - * - * Static NAT is used to remap the networks from the view of the network - * management system at the IP layer, and this module remaps some application - * layer addresses to match. - * - * The simplest form of ALG is performed, where only tagged IP addresses - * are modified. The module does not need to be MIB aware and only scans - * messages at the ASN.1/BER level. - * - * Currently, only SNMPv1 and SNMPv2 are supported. - * - * More information on ALG and associated issues can be found in - * RFC 2962 - * - * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory - * McLean & Jochen Friedrich, stripped down for use in the kernel. - * - * Copyright (c) 2000 RP Internet (www.rpi.net.au). - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. - * - * Author: James Morris <jmorris@intercode.com.au> - * - * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> - */ -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <net/checksum.h> -#include <net/udp.h> - -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_nat_helper.h> -#include <linux/netfilter/nf_conntrack_snmp.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); -MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); -MODULE_ALIAS("ip_nat_snmp_basic"); - -#define SNMP_PORT 161 -#define SNMP_TRAP_PORT 162 -#define NOCT1(n) (*(u8 *)(n)) - -static int debug; -static DEFINE_SPINLOCK(snmp_lock); - -/* - * Application layer address mapping mimics the NAT mapping, but - * only for the first octet in this case (a more flexible system - * can be implemented if needed). - */ -struct oct1_map -{ - u_int8_t from; - u_int8_t to; -}; - - -/***************************************************************************** - * - * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) - * - *****************************************************************************/ - -/* Class */ -#define ASN1_UNI 0 /* Universal */ -#define ASN1_APL 1 /* Application */ -#define ASN1_CTX 2 /* Context */ -#define ASN1_PRV 3 /* Private */ - -/* Tag */ -#define ASN1_EOC 0 /* End Of Contents */ -#define ASN1_BOL 1 /* Boolean */ -#define ASN1_INT 2 /* Integer */ -#define ASN1_BTS 3 /* Bit String */ -#define ASN1_OTS 4 /* Octet String */ -#define ASN1_NUL 5 /* Null */ -#define ASN1_OJI 6 /* Object Identifier */ -#define ASN1_OJD 7 /* Object Description */ -#define ASN1_EXT 8 /* External */ -#define ASN1_SEQ 16 /* Sequence */ -#define ASN1_SET 17 /* Set */ -#define ASN1_NUMSTR 18 /* Numerical String */ -#define ASN1_PRNSTR 19 /* Printable String */ -#define ASN1_TEXSTR 20 /* Teletext String */ -#define ASN1_VIDSTR 21 /* Video String */ -#define ASN1_IA5STR 22 /* IA5 String */ -#define ASN1_UNITIM 23 /* Universal Time */ -#define ASN1_GENTIM 24 /* General Time */ -#define ASN1_GRASTR 25 /* Graphical String */ -#define ASN1_VISSTR 26 /* Visible String */ -#define ASN1_GENSTR 27 /* General String */ - -/* Primitive / Constructed methods*/ -#define ASN1_PRI 0 /* Primitive */ -#define ASN1_CON 1 /* Constructed */ - -/* - * Error codes. - */ -#define ASN1_ERR_NOERROR 0 -#define ASN1_ERR_DEC_EMPTY 2 -#define ASN1_ERR_DEC_EOC_MISMATCH 3 -#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 -#define ASN1_ERR_DEC_BADVALUE 5 - -/* - * ASN.1 context. - */ -struct asn1_ctx -{ - int error; /* Error condition */ - unsigned char *pointer; /* Octet just to be decoded */ - unsigned char *begin; /* First octet */ - unsigned char *end; /* Octet after last octet */ -}; - -/* - * Octet string (not null terminated) - */ -struct asn1_octstr -{ - unsigned char *data; - unsigned int len; -}; - -static void asn1_open(struct asn1_ctx *ctx, - unsigned char *buf, - unsigned int len) -{ - ctx->begin = buf; - ctx->end = buf + len; - ctx->pointer = buf; - ctx->error = ASN1_ERR_NOERROR; -} - -static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) -{ - if (ctx->pointer >= ctx->end) { - ctx->error = ASN1_ERR_DEC_EMPTY; - return 0; - } - *ch = *(ctx->pointer)++; - return 1; -} - -static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) -{ - unsigned char ch; - - *tag = 0; - - do - { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *tag <<= 7; - *tag |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static unsigned char asn1_id_decode(struct asn1_ctx *ctx, - unsigned int *cls, - unsigned int *con, - unsigned int *tag) -{ - unsigned char ch; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *cls = (ch & 0xC0) >> 6; - *con = (ch & 0x20) >> 5; - *tag = (ch & 0x1F); - - if (*tag == 0x1F) { - if (!asn1_tag_decode(ctx, tag)) - return 0; - } - return 1; -} - -static unsigned char asn1_length_decode(struct asn1_ctx *ctx, - unsigned int *def, - unsigned int *len) -{ - unsigned char ch, cnt; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch == 0x80) - *def = 0; - else { - *def = 1; - - if (ch < 0x80) - *len = ch; - else { - cnt = ch & 0x7F; - *len = 0; - - while (cnt > 0) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *len <<= 8; - *len |= ch; - cnt--; - } - } - } - - /* don't trust len bigger than ctx buffer */ - if (*len > ctx->end - ctx->pointer) - return 0; - - return 1; -} - -static unsigned char asn1_header_decode(struct asn1_ctx *ctx, - unsigned char **eoc, - unsigned int *cls, - unsigned int *con, - unsigned int *tag) -{ - unsigned int def, len; - - if (!asn1_id_decode(ctx, cls, con, tag)) - return 0; - - def = len = 0; - if (!asn1_length_decode(ctx, &def, &len)) - return 0; - - /* primitive shall be definite, indefinite shall be constructed */ - if (*con == ASN1_PRI && !def) - return 0; - - if (def) - *eoc = ctx->pointer + len; - else - *eoc = NULL; - return 1; -} - -static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) -{ - unsigned char ch; - - if (eoc == NULL) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - return 1; - } else { - if (ctx->pointer != eoc) { - ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; - return 0; - } - return 1; - } -} - -static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc) -{ - ctx->pointer = eoc; - return 1; -} - -static unsigned char asn1_long_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = (signed char) ch; - len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned int *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = ch; - if (ch == 0) len = 0; - else len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (unsigned int)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = ch; - if (ch == 0) len = 0; - else len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (unsigned long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned char **octets, - unsigned int *len) -{ - unsigned char *ptr; - - *len = 0; - - *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); - if (*octets == NULL) - return 0; - - ptr = *octets; - while (ctx->pointer < eoc) { - if (!asn1_octet_decode(ctx, ptr++)) { - kfree(*octets); - *octets = NULL; - return 0; - } - (*len)++; - } - return 1; -} - -static unsigned char asn1_subid_decode(struct asn1_ctx *ctx, - unsigned long *subid) -{ - unsigned char ch; - - *subid = 0; - - do { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *subid <<= 7; - *subid |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned long **oid, - unsigned int *len) -{ - unsigned long subid; - unsigned long *optr; - size_t size; - - size = eoc - ctx->pointer + 1; - - /* first subid actually encodes first two subids */ - if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) - return 0; - - *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); - if (*oid == NULL) - return 0; - - optr = *oid; - - if (!asn1_subid_decode(ctx, &subid)) { - kfree(*oid); - *oid = NULL; - return 0; - } - - if (subid < 40) { - optr[0] = 0; - optr[1] = subid; - } else if (subid < 80) { - optr[0] = 1; - optr[1] = subid - 40; - } else { - optr[0] = 2; - optr[1] = subid - 80; - } - - *len = 2; - optr += 2; - - while (ctx->pointer < eoc) { - if (++(*len) > size) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - kfree(*oid); - *oid = NULL; - return 0; - } - - if (!asn1_subid_decode(ctx, optr++)) { - kfree(*oid); - *oid = NULL; - return 0; - } - } - return 1; -} - -/***************************************************************************** - * - * SNMP decoding routines (gxsnmp author Dirk Wisse) - * - *****************************************************************************/ - -/* SNMP Versions */ -#define SNMP_V1 0 -#define SNMP_V2C 1 -#define SNMP_V2 2 -#define SNMP_V3 3 - -/* Default Sizes */ -#define SNMP_SIZE_COMM 256 -#define SNMP_SIZE_OBJECTID 128 -#define SNMP_SIZE_BUFCHR 256 -#define SNMP_SIZE_BUFINT 128 -#define SNMP_SIZE_SMALLOBJECTID 16 - -/* Requests */ -#define SNMP_PDU_GET 0 -#define SNMP_PDU_NEXT 1 -#define SNMP_PDU_RESPONSE 2 -#define SNMP_PDU_SET 3 -#define SNMP_PDU_TRAP1 4 -#define SNMP_PDU_BULK 5 -#define SNMP_PDU_INFORM 6 -#define SNMP_PDU_TRAP2 7 - -/* Errors */ -#define SNMP_NOERROR 0 -#define SNMP_TOOBIG 1 -#define SNMP_NOSUCHNAME 2 -#define SNMP_BADVALUE 3 -#define SNMP_READONLY 4 -#define SNMP_GENERROR 5 -#define SNMP_NOACCESS 6 -#define SNMP_WRONGTYPE 7 -#define SNMP_WRONGLENGTH 8 -#define SNMP_WRONGENCODING 9 -#define SNMP_WRONGVALUE 10 -#define SNMP_NOCREATION 11 -#define SNMP_INCONSISTENTVALUE 12 -#define SNMP_RESOURCEUNAVAILABLE 13 -#define SNMP_COMMITFAILED 14 -#define SNMP_UNDOFAILED 15 -#define SNMP_AUTHORIZATIONERROR 16 -#define SNMP_NOTWRITABLE 17 -#define SNMP_INCONSISTENTNAME 18 - -/* General SNMP V1 Traps */ -#define SNMP_TRAP_COLDSTART 0 -#define SNMP_TRAP_WARMSTART 1 -#define SNMP_TRAP_LINKDOWN 2 -#define SNMP_TRAP_LINKUP 3 -#define SNMP_TRAP_AUTFAILURE 4 -#define SNMP_TRAP_EQPNEIGHBORLOSS 5 -#define SNMP_TRAP_ENTSPECIFIC 6 - -/* SNMPv1 Types */ -#define SNMP_NULL 0 -#define SNMP_INTEGER 1 /* l */ -#define SNMP_OCTETSTR 2 /* c */ -#define SNMP_DISPLAYSTR 2 /* c */ -#define SNMP_OBJECTID 3 /* ul */ -#define SNMP_IPADDR 4 /* uc */ -#define SNMP_COUNTER 5 /* ul */ -#define SNMP_GAUGE 6 /* ul */ -#define SNMP_TIMETICKS 7 /* ul */ -#define SNMP_OPAQUE 8 /* c */ - -/* Additional SNMPv2 Types */ -#define SNMP_UINTEGER 5 /* ul */ -#define SNMP_BITSTR 9 /* uc */ -#define SNMP_NSAP 10 /* uc */ -#define SNMP_COUNTER64 11 /* ul */ -#define SNMP_NOSUCHOBJECT 12 -#define SNMP_NOSUCHINSTANCE 13 -#define SNMP_ENDOFMIBVIEW 14 - -union snmp_syntax -{ - unsigned char uc[0]; /* 8 bit unsigned */ - char c[0]; /* 8 bit signed */ - unsigned long ul[0]; /* 32 bit unsigned */ - long l[0]; /* 32 bit signed */ -}; - -struct snmp_object -{ - unsigned long *id; - unsigned int id_len; - unsigned short type; - unsigned int syntax_len; - union snmp_syntax syntax; -}; - -struct snmp_request -{ - unsigned long id; - unsigned int error_status; - unsigned int error_index; -}; - -struct snmp_v1_trap -{ - unsigned long *id; - unsigned int id_len; - unsigned long ip_address; /* pointer */ - unsigned int general; - unsigned int specific; - unsigned long time; -}; - -/* SNMP types */ -#define SNMP_IPA 0 -#define SNMP_CNT 1 -#define SNMP_GGE 2 -#define SNMP_TIT 3 -#define SNMP_OPQ 4 -#define SNMP_C64 6 - -/* SNMP errors */ -#define SERR_NSO 0 -#define SERR_NSI 1 -#define SERR_EOM 2 - -static inline void mangle_address(unsigned char *begin, - unsigned char *addr, - const struct oct1_map *map, - __sum16 *check); -struct snmp_cnv -{ - unsigned int class; - unsigned int tag; - int syntax; -}; - -static const struct snmp_cnv snmp_conv[] = { - {ASN1_UNI, ASN1_NUL, SNMP_NULL}, - {ASN1_UNI, ASN1_INT, SNMP_INTEGER}, - {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR}, - {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR}, - {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID}, - {ASN1_APL, SNMP_IPA, SNMP_IPADDR}, - {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */ - {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */ - {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS}, - {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE}, - - /* SNMPv2 data types and errors */ - {ASN1_UNI, ASN1_BTS, SNMP_BITSTR}, - {ASN1_APL, SNMP_C64, SNMP_COUNTER64}, - {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT}, - {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE}, - {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW}, - {0, 0, -1} -}; - -static unsigned char snmp_tag_cls2syntax(unsigned int tag, - unsigned int cls, - unsigned short *syntax) -{ - const struct snmp_cnv *cnv; - - cnv = snmp_conv; - - while (cnv->syntax != -1) { - if (cnv->tag == tag && cnv->class == cls) { - *syntax = cnv->syntax; - return 1; - } - cnv++; - } - return 0; -} - -static unsigned char snmp_object_decode(struct asn1_ctx *ctx, - struct snmp_object **obj) -{ - unsigned int cls, con, tag, len, idlen; - unsigned short type; - unsigned char *eoc, *end, *p; - unsigned long *lp, *id; - unsigned long ul; - long l; - - *obj = NULL; - id = NULL; - - if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) - return 0; - - if (!asn1_oid_decode(ctx, end, &id, &idlen)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) { - kfree(id); - return 0; - } - - if (con != ASN1_PRI) { - kfree(id); - return 0; - } - - type = 0; - if (!snmp_tag_cls2syntax(tag, cls, &type)) { - kfree(id); - return 0; - } - - l = 0; - switch (type) { - case SNMP_INTEGER: - len = sizeof(long); - if (!asn1_long_decode(ctx, end, &l)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - return 0; - } - (*obj)->syntax.l[0] = l; - break; - case SNMP_OCTETSTR: - case SNMP_OPAQUE: - if (!asn1_octets_decode(ctx, end, &p, &len)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(p); - kfree(id); - return 0; - } - memcpy((*obj)->syntax.c, p, len); - kfree(p); - break; - case SNMP_NULL: - case SNMP_NOSUCHOBJECT: - case SNMP_NOSUCHINSTANCE: - case SNMP_ENDOFMIBVIEW: - len = 0; - *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - return 0; - } - if (!asn1_null_decode(ctx, end)) { - kfree(id); - kfree(*obj); - *obj = NULL; - return 0; - } - break; - case SNMP_OBJECTID: - if (!asn1_oid_decode(ctx, end, &lp, &len)) { - kfree(id); - return 0; - } - len *= sizeof(unsigned long); - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(lp); - kfree(id); - return 0; - } - memcpy((*obj)->syntax.ul, lp, len); - kfree(lp); - break; - case SNMP_IPADDR: - if (!asn1_octets_decode(ctx, end, &p, &len)) { - kfree(id); - return 0; - } - if (len != 4) { - kfree(p); - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(p); - kfree(id); - return 0; - } - memcpy((*obj)->syntax.uc, p, len); - kfree(p); - break; - case SNMP_COUNTER: - case SNMP_GAUGE: - case SNMP_TIMETICKS: - len = sizeof(unsigned long); - if (!asn1_ulong_decode(ctx, end, &ul)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - return 0; - } - (*obj)->syntax.ul[0] = ul; - break; - default: - kfree(id); - return 0; - } - - (*obj)->syntax_len = len; - (*obj)->type = type; - (*obj)->id = id; - (*obj)->id_len = idlen; - - if (!asn1_eoc_decode(ctx, eoc)) { - kfree(id); - kfree(*obj); - *obj = NULL; - return 0; - } - return 1; -} - -static unsigned char noinline_for_stack -snmp_request_decode(struct asn1_ctx *ctx, struct snmp_request *request) -{ - unsigned int cls, con, tag; - unsigned char *end; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_ulong_decode(ctx, end, &request->id)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_uint_decode(ctx, end, &request->error_status)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_uint_decode(ctx, end, &request->error_index)) - return 0; - - return 1; -} - -/* - * Fast checksum update for possibly oddly-aligned UDP byte, from the - * code example in the draft. - */ -static void fast_csum(__sum16 *csum, - const unsigned char *optr, - const unsigned char *nptr, - int offset) -{ - unsigned char s[4]; - - if (offset & 1) { - s[0] = ~0; - s[1] = ~*optr; - s[2] = 0; - s[3] = *nptr; - } else { - s[0] = ~*optr; - s[1] = ~0; - s[2] = *nptr; - s[3] = 0; - } - - *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum))); -} - -/* - * Mangle IP address. - * - begin points to the start of the snmp messgae - * - addr points to the start of the address - */ -static inline void mangle_address(unsigned char *begin, - unsigned char *addr, - const struct oct1_map *map, - __sum16 *check) -{ - if (map->from == NOCT1(addr)) { - u_int32_t old; - - if (debug) - memcpy(&old, addr, sizeof(old)); - - *addr = map->to; - - /* Update UDP checksum if being used */ - if (*check) { - fast_csum(check, - &map->from, &map->to, addr - begin); - - } - - if (debug) - printk(KERN_DEBUG "bsalg: mapped %pI4 to %pI4\n", - &old, addr); - } -} - -static unsigned char noinline_for_stack -snmp_trap_decode(struct asn1_ctx *ctx, struct snmp_v1_trap *trap, - const struct oct1_map *map, - __sum16 *check) -{ - unsigned int cls, con, tag, len; - unsigned char *end; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) - return 0; - - if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_id_free; - - if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) || - (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS))) - goto err_id_free; - - if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len)) - goto err_id_free; - - /* IPv4 only */ - if (len != 4) - goto err_addr_free; - - mangle_address(ctx->begin, ctx->pointer - 4, map, check); - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - goto err_addr_free; - - if (!asn1_uint_decode(ctx, end, &trap->general)) - goto err_addr_free; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - goto err_addr_free; - - if (!asn1_uint_decode(ctx, end, &trap->specific)) - goto err_addr_free; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) || - (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT))) - goto err_addr_free; - - if (!asn1_ulong_decode(ctx, end, &trap->time)) - goto err_addr_free; - - return 1; - -err_addr_free: - kfree((unsigned long *)trap->ip_address); - -err_id_free: - kfree(trap->id); - - return 0; -} - -/***************************************************************************** - * - * Misc. routines - * - *****************************************************************************/ - -/* - * Parse and mangle SNMP message according to mapping. - * (And this is the fucking 'basic' method). - */ -static int snmp_parse_mangle(unsigned char *msg, - u_int16_t len, - const struct oct1_map *map, - __sum16 *check) -{ - unsigned char *eoc, *end; - unsigned int cls, con, tag, vers, pdutype; - struct asn1_ctx ctx; - struct asn1_octstr comm; - struct snmp_object *obj; - - if (debug > 1) - print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 16, 1, - msg, len, 0); - - asn1_open(&ctx, msg, len); - - /* - * Start of SNMP message. - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - /* - * Version 1 or 2 handled. - */ - if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - if (!asn1_uint_decode (&ctx, end, &vers)) - return 0; - if (debug > 1) - pr_debug("bsalg: snmp version: %u\n", vers + 1); - if (vers > 1) - return 1; - - /* - * Community. - */ - if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS) - return 0; - if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len)) - return 0; - if (debug > 1) { - unsigned int i; - - pr_debug("bsalg: community: "); - for (i = 0; i < comm.len; i++) - pr_cont("%c", comm.data[i]); - pr_cont("\n"); - } - kfree(comm.data); - - /* - * PDU type - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype)) - return 0; - if (cls != ASN1_CTX || con != ASN1_CON) - return 0; - if (debug > 1) { - static const unsigned char *const pdus[] = { - [SNMP_PDU_GET] = "get", - [SNMP_PDU_NEXT] = "get-next", - [SNMP_PDU_RESPONSE] = "response", - [SNMP_PDU_SET] = "set", - [SNMP_PDU_TRAP1] = "trapv1", - [SNMP_PDU_BULK] = "bulk", - [SNMP_PDU_INFORM] = "inform", - [SNMP_PDU_TRAP2] = "trapv2" - }; - - if (pdutype > SNMP_PDU_TRAP2) - pr_debug("bsalg: bad pdu type %u\n", pdutype); - else - pr_debug("bsalg: pdu: %s\n", pdus[pdutype]); - } - if (pdutype != SNMP_PDU_RESPONSE && - pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2) - return 1; - - /* - * Request header or v1 trap - */ - if (pdutype == SNMP_PDU_TRAP1) { - struct snmp_v1_trap trap; - unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check); - - if (ret) { - kfree(trap.id); - kfree((unsigned long *)trap.ip_address); - } else - return ret; - - } else { - struct snmp_request req; - - if (!snmp_request_decode(&ctx, &req)) - return 0; - - if (debug > 1) - pr_debug("bsalg: request: id=0x%lx error_status=%u " - "error_index=%u\n", req.id, req.error_status, - req.error_index); - } - - /* - * Loop through objects, look for IP addresses to mangle. - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - while (!asn1_eoc_decode(&ctx, eoc)) { - unsigned int i; - - if (!snmp_object_decode(&ctx, &obj)) { - if (obj) { - kfree(obj->id); - kfree(obj); - } - return 0; - } - - if (debug > 1) { - pr_debug("bsalg: object: "); - for (i = 0; i < obj->id_len; i++) { - if (i > 0) - pr_cont("."); - pr_cont("%lu", obj->id[i]); - } - pr_cont(": type=%u\n", obj->type); - - } - - if (obj->type == SNMP_IPADDR) - mangle_address(ctx.begin, ctx.pointer - 4, map, check); - - kfree(obj->id); - kfree(obj); - } - - if (!asn1_eoc_decode(&ctx, eoc)) - return 0; - - return 1; -} - -/***************************************************************************** - * - * NAT routines. - * - *****************************************************************************/ - -/* - * SNMP translation routine. - */ -static int snmp_translate(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - struct sk_buff *skb) -{ - struct iphdr *iph = ip_hdr(skb); - struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); - u_int16_t udplen = ntohs(udph->len); - u_int16_t paylen = udplen - sizeof(struct udphdr); - int dir = CTINFO2DIR(ctinfo); - struct oct1_map map; - - /* - * Determine mappping for application layer addresses based - * on NAT manipulations for the packet. - */ - if (dir == IP_CT_DIR_ORIGINAL) { - /* SNAT traps */ - map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip); - map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); - } else { - /* DNAT replies */ - map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip); - map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip); - } - - if (map.from == map.to) - return NF_ACCEPT; - - if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), - paylen, &map, &udph->check)) { - net_warn_ratelimited("bsalg: parser failed\n"); - return NF_DROP; - } - return NF_ACCEPT; -} - -/* We don't actually set up expectations, just adjust internal IP - * addresses if this is being NATted */ -static int help(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - int dir = CTINFO2DIR(ctinfo); - unsigned int ret; - const struct iphdr *iph = ip_hdr(skb); - const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); - - /* SNMP replies and originating SNMP traps get mangled */ - if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY) - return NF_ACCEPT; - if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) - return NF_ACCEPT; - - /* No NAT? */ - if (!(ct->status & IPS_NAT_MASK)) - return NF_ACCEPT; - - /* - * Make sure the packet length is ok. So far, we were only guaranteed - * to have a valid length IP header plus 8 bytes, which means we have - * enough room for a UDP header. Just verify the UDP length field so we - * can mess around with the payload. - */ - if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { - net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", - &iph->saddr, &iph->daddr); - return NF_DROP; - } - - if (!skb_make_writable(skb, skb->len)) - return NF_DROP; - - spin_lock_bh(&snmp_lock); - ret = snmp_translate(ct, ctinfo, skb); - spin_unlock_bh(&snmp_lock); - return ret; -} - -static const struct nf_conntrack_expect_policy snmp_exp_policy = { - .max_expected = 0, - .timeout = 180, -}; - -static struct nf_conntrack_helper snmp_trap_helper __read_mostly = { - .me = THIS_MODULE, - .help = help, - .expect_policy = &snmp_exp_policy, - .name = "snmp_trap", - .tuple.src.l3num = AF_INET, - .tuple.src.u.udp.port = cpu_to_be16(SNMP_TRAP_PORT), - .tuple.dst.protonum = IPPROTO_UDP, -}; - -/***************************************************************************** - * - * Module stuff. - * - *****************************************************************************/ - -static int __init nf_nat_snmp_basic_init(void) -{ - BUG_ON(nf_nat_snmp_hook != NULL); - RCU_INIT_POINTER(nf_nat_snmp_hook, help); - - return nf_conntrack_helper_register(&snmp_trap_helper); -} - -static void __exit nf_nat_snmp_basic_fini(void) -{ - RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); - synchronize_rcu(); - nf_conntrack_helper_unregister(&snmp_trap_helper); -} - -module_init(nf_nat_snmp_basic_init); -module_exit(nf_nat_snmp_basic_fini); - -module_param(debug, int, 0600); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c new file mode 100644 index 000000000000..b6e277093e7e --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c @@ -0,0 +1,235 @@ +/* + * nf_nat_snmp_basic.c + * + * Basic SNMP Application Layer Gateway + * + * This IP NAT module is intended for use with SNMP network + * discovery and monitoring applications where target networks use + * conflicting private address realms. + * + * Static NAT is used to remap the networks from the view of the network + * management system at the IP layer, and this module remaps some application + * layer addresses to match. + * + * The simplest form of ALG is performed, where only tagged IP addresses + * are modified. The module does not need to be MIB aware and only scans + * messages at the ASN.1/BER level. + * + * Currently, only SNMPv1 and SNMPv2 are supported. + * + * More information on ALG and associated issues can be found in + * RFC 2962 + * + * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory + * McLean & Jochen Friedrich, stripped down for use in the kernel. + * + * Copyright (c) 2000 RP Internet (www.rpi.net.au). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + * Author: James Morris <jmorris@intercode.com.au> + * + * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> + */ +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <net/checksum.h> +#include <net/udp.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <linux/netfilter/nf_conntrack_snmp.h> +#include "nf_nat_snmp_basic-asn1.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); +MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); +MODULE_ALIAS("ip_nat_snmp_basic"); + +#define SNMP_PORT 161 +#define SNMP_TRAP_PORT 162 + +static DEFINE_SPINLOCK(snmp_lock); + +struct snmp_ctx { + unsigned char *begin; + __sum16 *check; + __be32 from; + __be32 to; +}; + +static void fast_csum(struct snmp_ctx *ctx, unsigned char offset) +{ + unsigned char s[12] = {0,}; + int size; + + if (offset & 1) { + memcpy(&s[1], &ctx->from, 4); + memcpy(&s[7], &ctx->to, 4); + s[0] = ~0; + s[1] = ~s[1]; + s[2] = ~s[2]; + s[3] = ~s[3]; + s[4] = ~s[4]; + s[5] = ~0; + size = 12; + } else { + memcpy(&s[0], &ctx->from, 4); + memcpy(&s[4], &ctx->to, 4); + s[0] = ~s[0]; + s[1] = ~s[1]; + s[2] = ~s[2]; + s[3] = ~s[3]; + size = 8; + } + *ctx->check = csum_fold(csum_partial(s, size, + ~csum_unfold(*ctx->check))); +} + +int snmp_version(void *context, size_t hdrlen, unsigned char tag, + const void *data, size_t datalen) +{ + if (*(unsigned char *)data > 1) + return -ENOTSUPP; + return 1; +} + +int snmp_helper(void *context, size_t hdrlen, unsigned char tag, + const void *data, size_t datalen) +{ + struct snmp_ctx *ctx = (struct snmp_ctx *)context; + __be32 *pdata = (__be32 *)data; + + if (*pdata == ctx->from) { + pr_debug("%s: %pI4 to %pI4\n", __func__, + (void *)&ctx->from, (void *)&ctx->to); + + if (*ctx->check) + fast_csum(ctx, (unsigned char *)data - ctx->begin); + *pdata = ctx->to; + } + + return 1; +} + +static int snmp_translate(struct nf_conn *ct, int dir, struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); + u16 datalen = ntohs(udph->len) - sizeof(struct udphdr); + char *data = (unsigned char *)udph + sizeof(struct udphdr); + struct snmp_ctx ctx; + int ret; + + if (dir == IP_CT_DIR_ORIGINAL) { + ctx.from = ct->tuplehash[dir].tuple.src.u3.ip; + ctx.to = ct->tuplehash[!dir].tuple.dst.u3.ip; + } else { + ctx.from = ct->tuplehash[!dir].tuple.src.u3.ip; + ctx.to = ct->tuplehash[dir].tuple.dst.u3.ip; + } + + if (ctx.from == ctx.to) + return NF_ACCEPT; + + ctx.begin = (unsigned char *)udph + sizeof(struct udphdr); + ctx.check = &udph->check; + ret = asn1_ber_decoder(&nf_nat_snmp_basic_decoder, &ctx, data, datalen); + if (ret < 0) { + nf_ct_helper_log(skb, ct, "parser failed\n"); + return NF_DROP; + } + + return NF_ACCEPT; +} + +/* We don't actually set up expectations, just adjust internal IP + * addresses if this is being NATted + */ +static int help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + int dir = CTINFO2DIR(ctinfo); + unsigned int ret; + const struct iphdr *iph = ip_hdr(skb); + const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); + + /* SNMP replies and originating SNMP traps get mangled */ + if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY) + return NF_ACCEPT; + if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + /* No NAT? */ + if (!(ct->status & IPS_NAT_MASK)) + return NF_ACCEPT; + + /* Make sure the packet length is ok. So far, we were only guaranteed + * to have a valid length IP header plus 8 bytes, which means we have + * enough room for a UDP header. Just verify the UDP length field so we + * can mess around with the payload. + */ + if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { + nf_ct_helper_log(skb, ct, "dropping malformed packet\n"); + return NF_DROP; + } + + if (!skb_make_writable(skb, skb->len)) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + return NF_DROP; + } + + spin_lock_bh(&snmp_lock); + ret = snmp_translate(ct, dir, skb); + spin_unlock_bh(&snmp_lock); + return ret; +} + +static const struct nf_conntrack_expect_policy snmp_exp_policy = { + .max_expected = 0, + .timeout = 180, +}; + +static struct nf_conntrack_helper snmp_trap_helper __read_mostly = { + .me = THIS_MODULE, + .help = help, + .expect_policy = &snmp_exp_policy, + .name = "snmp_trap", + .tuple.src.l3num = AF_INET, + .tuple.src.u.udp.port = cpu_to_be16(SNMP_TRAP_PORT), + .tuple.dst.protonum = IPPROTO_UDP, +}; + +static int __init nf_nat_snmp_basic_init(void) +{ + BUG_ON(nf_nat_snmp_hook != NULL); + RCU_INIT_POINTER(nf_nat_snmp_hook, help); + + return nf_conntrack_helper_register(&snmp_trap_helper); +} + +static void __exit nf_nat_snmp_basic_fini(void) +{ + RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); + synchronize_rcu(); + nf_conntrack_helper_unregister(&snmp_trap_helper); +} + +module_init(nf_nat_snmp_basic_init); +module_exit(nf_nat_snmp_basic_fini); diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index f84c17763f6f..036c074736b0 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -27,40 +27,6 @@ nft_do_chain_arp(void *priv, return nft_do_chain(&pkt, priv); } -static struct nft_af_info nft_af_arp __read_mostly = { - .family = NFPROTO_ARP, - .nhooks = NF_ARP_NUMHOOKS, - .owner = THIS_MODULE, -}; - -static int nf_tables_arp_init_net(struct net *net) -{ - net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.arp== NULL) - return -ENOMEM; - - memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp)); - - if (nft_register_afinfo(net, net->nft.arp) < 0) - goto err; - - return 0; -err: - kfree(net->nft.arp); - return -ENOMEM; -} - -static void nf_tables_arp_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.arp); - kfree(net->nft.arp); -} - -static struct pernet_operations nf_tables_arp_net_ops = { - .init = nf_tables_arp_init_net, - .exit = nf_tables_arp_exit_net, -}; - static const struct nf_chain_type filter_arp = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -76,22 +42,11 @@ static const struct nf_chain_type filter_arp = { static int __init nf_tables_arp_init(void) { - int ret; - - ret = nft_register_chain_type(&filter_arp); - if (ret < 0) - return ret; - - ret = register_pernet_subsys(&nf_tables_arp_net_ops); - if (ret < 0) - nft_unregister_chain_type(&filter_arp); - - return ret; + return nft_register_chain_type(&filter_arp); } static void __exit nf_tables_arp_exit(void) { - unregister_pernet_subsys(&nf_tables_arp_net_ops); nft_unregister_chain_type(&filter_arp); } @@ -100,4 +55,4 @@ module_exit(nf_tables_arp_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */ +MODULE_ALIAS_NFT_CHAIN(3, "filter"); /* NFPROTO_ARP */ diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index f4675253f1e6..96f955496d5f 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -30,40 +30,6 @@ static unsigned int nft_do_chain_ipv4(void *priv, return nft_do_chain(&pkt, priv); } -static struct nft_af_info nft_af_ipv4 __read_mostly = { - .family = NFPROTO_IPV4, - .nhooks = NF_INET_NUMHOOKS, - .owner = THIS_MODULE, -}; - -static int nf_tables_ipv4_init_net(struct net *net) -{ - net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.ipv4 == NULL) - return -ENOMEM; - - memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4)); - - if (nft_register_afinfo(net, net->nft.ipv4) < 0) - goto err; - - return 0; -err: - kfree(net->nft.ipv4); - return -ENOMEM; -} - -static void nf_tables_ipv4_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.ipv4); - kfree(net->nft.ipv4); -} - -static struct pernet_operations nf_tables_ipv4_net_ops = { - .init = nf_tables_ipv4_init_net, - .exit = nf_tables_ipv4_exit_net, -}; - static const struct nf_chain_type filter_ipv4 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -85,22 +51,11 @@ static const struct nf_chain_type filter_ipv4 = { static int __init nf_tables_ipv4_init(void) { - int ret; - - ret = nft_register_chain_type(&filter_ipv4); - if (ret < 0) - return ret; - - ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); - if (ret < 0) - nft_unregister_chain_type(&filter_ipv4); - - return ret; + return nft_register_chain_type(&filter_ipv4); } static void __exit nf_tables_ipv4_exit(void) { - unregister_pernet_subsys(&nf_tables_ipv4_net_ops); nft_unregister_chain_type(&filter_ipv4); } @@ -109,4 +64,4 @@ module_exit(nf_tables_ipv4_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(AF_INET); +MODULE_ALIAS_NFT_CHAIN(AF_INET, "filter"); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 8322f26e770e..785712be5b0d 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -766,7 +766,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) filter_expired = after(tcp_jiffies32, bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); if (rs->rtt_us >= 0 && - (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { + (rs->rtt_us <= bbr->min_rtt_us || + (filter_expired && !rs->is_ack_delayed))) { bbr->min_rtt_us = rs->rtt_us; bbr->min_rtt_stamp = tcp_jiffies32; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ff71b18d9682..cfa51cfd2d99 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -97,6 +97,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ +#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -2857,11 +2858,18 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, *rexmit = REXMIT_LOST; } -static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) +static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) { u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; struct tcp_sock *tp = tcp_sk(sk); + if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { + /* If the remote keeps returning delayed ACKs, eventually + * the min filter would pick it up and overestimate the + * prop. delay when it expires. Skip suspected delayed ACKs. + */ + return; + } minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? : jiffies_to_usecs(1)); } @@ -2901,7 +2909,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, * always taken together with ACK, SACK, or TS-opts. Any negative * values will be skipped with the seq_rtt_us < 0 check above. */ - tcp_update_rtt_min(sk, ca_rtt_us); + tcp_update_rtt_min(sk, ca_rtt_us, flag); tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); @@ -3125,6 +3133,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); + + if (pkts_acked == 1 && last_in_flight < tp->mss_cache && + last_in_flight && !prior_sacked && fully_acked && + sack->rate->prior_delivered + 1 == tp->delivered && + !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { + /* Conservatively mark a delayed ACK. It's typically + * from a lone runt packet over the round trip to + * a receiver w/o out-of-order or CE events. + */ + flag |= FLAG_ACK_MAYBE_DELAYED; + } } if (sack->first_sackt) { sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); @@ -3614,6 +3633,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b7c4befe67ec..92b8d8c75eed 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1223,8 +1223,14 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } if (!rcu_access_pointer(fn->leaf)) { - atomic_inc(&rt->rt6i_ref); - rcu_assign_pointer(fn->leaf, rt); + if (fn->fn_flags & RTN_TL_ROOT) { + /* put back null_entry for root node */ + rcu_assign_pointer(fn->leaf, + info->nl_net->ipv6.ip6_null_entry); + } else { + atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(fn->leaf, rt); + } } fn = sn; } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index db99446e0276..a88480193d77 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -352,11 +352,12 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, nt->dev = dev; nt->net = dev_net(dev); - ip6gre_tnl_link_config(nt, 1); if (register_netdevice(dev) < 0) goto failed_free; + ip6gre_tnl_link_config(nt, 1); + /* Can use a lockless transmit, unless we generate output sequences */ if (!(nt->parms.o_flags & TUNNEL_SEQ)) dev->features |= NETIF_F_LLTX; @@ -1709,7 +1710,6 @@ static void ip6gre_netlink_parms(struct nlattr *data[], static int ip6gre_tap_init(struct net_device *dev) { - struct ip6_tnl *tunnel; int ret; ret = ip6gre_tunnel_init_common(dev); @@ -1718,10 +1718,6 @@ static int ip6gre_tap_init(struct net_device *dev) dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - tunnel = netdev_priv(dev); - - ip6gre_tnl_link_config(tunnel, 1); - return 0; } @@ -1872,12 +1868,16 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, nt->dev = dev; nt->net = dev_net(dev); - ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); err = register_netdevice(dev); if (err) goto out; + ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); + + if (tb[IFLA_MTU]) + ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + dev_hold(dev); ip6gre_tunnel_link(ign, nt); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 806e95375ec8..4a634b7a2c80 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -72,8 +72,9 @@ endif # NF_TABLES_IPV6 endif # NF_TABLES config NF_FLOW_TABLE_IPV6 - select NF_FLOW_TABLE tristate "Netfilter flow table IPv6 module" + depends on NF_CONNTRACK && NF_TABLES + select NF_FLOW_TABLE help This option adds the flow table IPv6 support. @@ -240,6 +241,15 @@ config IP6_NF_MATCH_RT To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_MATCH_SRH + tristate '"srh" Segment Routing header match support' + depends on NETFILTER_ADVANCED + help + srh matching allows you to match packets based on the segment + routing header of the packet. + + To compile it as a module, choose M here. If unsure, say N. + # The targets config IP6_NF_TARGET_HL tristate '"HL" hoplimit target support' diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 95611c4b39b0..d984057b8395 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o obj-$(CONFIG_IP6_NF_MATCH_RPFILTER) += ip6t_rpfilter.o obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o +obj-$(CONFIG_IP6_NF_MATCH_SRH) += ip6t_srh.o # targets obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 6ebbef2dfb60..37fa76ee5130 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1952,7 +1952,6 @@ static int __init ip6_tables_init(void) if (ret < 0) goto err5; - pr_info("(C) 2000-2006 Netfilter Core Team\n"); return 0; err5: diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c new file mode 100644 index 000000000000..9642164107ce --- /dev/null +++ b/net/ipv6/netfilter/ip6t_srh.c @@ -0,0 +1,161 @@ +/* Kernel module to match Segment Routing Header (SRH) parameters. */ + +/* Author: + * Ahmed Abdelsalam <amsalam20@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/ipv6.h> +#include <net/seg6.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv6/ip6t_srh.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +/* Test a struct->mt_invflags and a boolean for inequality */ +#define NF_SRH_INVF(ptr, flag, boolean) \ + ((boolean) ^ !!((ptr)->mt_invflags & (flag))) + +static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ip6t_srh *srhinfo = par->matchinfo; + struct ipv6_sr_hdr *srh; + struct ipv6_sr_hdr _srh; + int hdrlen, srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return false; + srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh); + if (!srh) + return false; + + hdrlen = ipv6_optlen(srh); + if (skb->len - srhoff < hdrlen) + return false; + + if (srh->type != IPV6_SRCRT_TYPE_4) + return false; + + if (srh->segments_left > srh->first_segment) + return false; + + /* Next Header matching */ + if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR, + !(srh->nexthdr == srhinfo->next_hdr))) + return false; + + /* Header Extension Length matching */ + if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ, + !(srh->hdrlen == srhinfo->hdr_len))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LEN_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT, + !(srh->hdrlen > srhinfo->hdr_len))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LEN_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT, + !(srh->hdrlen < srhinfo->hdr_len))) + return false; + + /* Segments Left matching */ + if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ, + !(srh->segments_left == srhinfo->segs_left))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT, + !(srh->segments_left > srhinfo->segs_left))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT, + !(srh->segments_left < srhinfo->segs_left))) + return false; + + /** + * Last Entry matching + * Last_Entry field was introduced in revision 6 of the SRH draft. + * It was called First_Segment in the previous revision + */ + if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ, + !(srh->first_segment == srhinfo->last_entry))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LAST_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT, + !(srh->first_segment > srhinfo->last_entry))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LAST_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT, + !(srh->first_segment < srhinfo->last_entry))) + return false; + + /** + * Tag matchig + * Tag field was introduced in revision 6 of the SRH draft. + */ + if (srhinfo->mt_flags & IP6T_SRH_TAG) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG, + !(srh->tag == srhinfo->tag))) + return false; + return true; +} + +static int srh_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_srh *srhinfo = par->matchinfo; + + if (srhinfo->mt_flags & ~IP6T_SRH_MASK) { + pr_err("unknown srh match flags %X\n", srhinfo->mt_flags); + return -EINVAL; + } + + if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) { + pr_err("unknown srh invflags %X\n", srhinfo->mt_invflags); + return -EINVAL; + } + + return 0; +} + +static struct xt_match srh_mt6_reg __read_mostly = { + .name = "srh", + .family = NFPROTO_IPV6, + .match = srh_mt6, + .matchsize = sizeof(struct ip6t_srh), + .checkentry = srh_mt6_check, + .me = THIS_MODULE, +}; + +static int __init srh_mt6_init(void) +{ + return xt_register_match(&srh_mt6_reg); +} + +static void __exit srh_mt6_exit(void) +{ + xt_unregister_match(&srh_mt6_reg); +} + +module_init(srh_mt6_init); +module_exit(srh_mt6_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 Segment Routing Header match"); +MODULE_AUTHOR("Ahmed Abdelsalam <amsalam20@gmail.com>"); diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index d4bc56443dc1..710fa0806c37 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -3,6 +3,7 @@ * * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/slab.h> @@ -11,6 +12,10 @@ static int __net_init ip6table_raw_table_init(struct net *net); +static bool raw_before_defrag __read_mostly; +MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); +module_param(raw_before_defrag, bool, 0000); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, @@ -20,6 +25,15 @@ static const struct xt_table packet_raw = { .table_init = ip6table_raw_table_init, }; +static const struct xt_table packet_raw_before_defrag = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG, + .table_init = ip6table_raw_table_init, +}; + /* The work comes in here from netfilter.c. */ static unsigned int ip6table_raw_hook(void *priv, struct sk_buff *skb, @@ -33,15 +47,19 @@ static struct nf_hook_ops *rawtable_ops __read_mostly; static int __net_init ip6table_raw_table_init(struct net *net) { struct ip6t_replace *repl; + const struct xt_table *table = &packet_raw; int ret; + if (raw_before_defrag) + table = &packet_raw_before_defrag; + if (net->ipv6.ip6table_raw) return 0; - repl = ip6t_alloc_initial_table(&packet_raw); + repl = ip6t_alloc_initial_table(table); if (repl == NULL) return -ENOMEM; - ret = ip6t_register_table(net, &packet_raw, repl, rawtable_ops, + ret = ip6t_register_table(net, table, repl, rawtable_ops, &net->ipv6.ip6table_raw); kfree(repl); return ret; @@ -62,9 +80,16 @@ static struct pernet_operations ip6table_raw_net_ops = { static int __init ip6table_raw_init(void) { int ret; + const struct xt_table *table = &packet_raw; + + if (raw_before_defrag) { + table = &packet_raw_before_defrag; + + pr_info("Enabling raw table before defrag\n"); + } /* Register hooks */ - rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook); + rawtable_ops = xt_hook_ops_alloc(table, ip6table_raw_hook); if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 977d8900cfd1..ce53dcfda88a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -231,7 +231,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, if ((unsigned int)end > IPV6_MAXPLEN) { pr_debug("offset is too large.\n"); - return -1; + return -EINVAL; } ecn = ip6_frag_ecn(ipv6_hdr(skb)); @@ -264,7 +264,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); - return -1; + return -EPROTO; } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ @@ -358,7 +358,7 @@ found: discard_fq: inet_frag_kill(&fq->q, &nf_frags); err: - return -1; + return -EINVAL; } /* @@ -567,6 +567,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { + u16 savethdr = skb->transport_header; struct net_device *dev = skb->dev; int fhoff, nhoff, ret; struct frag_hdr *fhdr; @@ -600,8 +601,12 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) spin_lock_bh(&fq->q.lock); - if (nf_ct_frag6_queue(fq, skb, fhdr, nhoff) < 0) { - ret = -EINVAL; + ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); + if (ret < 0) { + if (ret == -EPROTO) { + skb->transport_header = savethdr; + ret = 0; + } goto out_unlock; } diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index b326da59257f..c87b48359e8f 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -63,6 +63,9 @@ static unsigned int ipv6_defrag(void *priv, /* Previously seen (loopback)? */ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; + + if (skb->_nfct == IP_CT_UNTRACKED) + return NF_ACCEPT; #endif err = nf_ct_frag6_gather(state->net, skb, diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index 0c3b9d32f64f..fff21602875a 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -5,7 +5,6 @@ #include <linux/rhashtable.h> #include <linux/ipv6.h> #include <linux/netdevice.h> -#include <linux/ipv6.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/neighbour.h> diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index 9cd45b964123..17e03589331c 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -28,40 +28,6 @@ static unsigned int nft_do_chain_ipv6(void *priv, return nft_do_chain(&pkt, priv); } -static struct nft_af_info nft_af_ipv6 __read_mostly = { - .family = NFPROTO_IPV6, - .nhooks = NF_INET_NUMHOOKS, - .owner = THIS_MODULE, -}; - -static int nf_tables_ipv6_init_net(struct net *net) -{ - net->nft.ipv6 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.ipv6 == NULL) - return -ENOMEM; - - memcpy(net->nft.ipv6, &nft_af_ipv6, sizeof(nft_af_ipv6)); - - if (nft_register_afinfo(net, net->nft.ipv6) < 0) - goto err; - - return 0; -err: - kfree(net->nft.ipv6); - return -ENOMEM; -} - -static void nf_tables_ipv6_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.ipv6); - kfree(net->nft.ipv6); -} - -static struct pernet_operations nf_tables_ipv6_net_ops = { - .init = nf_tables_ipv6_init_net, - .exit = nf_tables_ipv6_exit_net, -}; - static const struct nf_chain_type filter_ipv6 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -83,22 +49,11 @@ static const struct nf_chain_type filter_ipv6 = { static int __init nf_tables_ipv6_init(void) { - int ret; - - ret = nft_register_chain_type(&filter_ipv6); - if (ret < 0) - return ret; - - ret = register_pernet_subsys(&nf_tables_ipv6_net_ops); - if (ret < 0) - nft_unregister_chain_type(&filter_ipv6); - - return ret; + return nft_register_chain_type(&filter_ipv6); } static void __exit nf_tables_ipv6_exit(void) { - unregister_pernet_subsys(&nf_tables_ipv6_net_ops); nft_unregister_chain_type(&filter_ipv6); } @@ -107,4 +62,4 @@ module_exit(nf_tables_ipv6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(AF_INET6); +MODULE_ALIAS_NFT_CHAIN(AF_INET6, "filter"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0ee0fcf3abbf..9019fa98003d 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -506,7 +506,7 @@ config NFT_CT connection tracking information such as the flow state. config NFT_FLOW_OFFLOAD - depends on NF_CONNTRACK + depends on NF_CONNTRACK && NF_FLOW_TABLE tristate "Netfilter nf_tables hardware flow offload module" help This option adds the "flow_offload" expression that you can use to @@ -665,8 +665,9 @@ endif # NF_TABLES_NETDEV endif # NF_TABLES config NF_FLOW_TABLE_INET - select NF_FLOW_TABLE tristate "Netfilter flow table mixed IPv4/IPv6 module" + depends on NF_FLOW_TABLE_IPV4 && NF_FLOW_TABLE_IPV6 + select NF_FLOW_TABLE help This option adds the flow table mixed IPv4/IPv6 support. @@ -674,6 +675,7 @@ config NF_FLOW_TABLE_INET config NF_FLOW_TABLE tristate "Netfilter flow table module" + depends on NF_CONNTRACK && NF_TABLES help This option adds the flow table core infrastructure. diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 997dd387d259..0f6b8172fb9a 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -140,7 +140,7 @@ nf_hook_entries_grow(const struct nf_hook_entries *old, if (reg->nat_hook && orig_ops[i]->nat_hook) { kvfree(new); - return ERR_PTR(-EEXIST); + return ERR_PTR(-EBUSY); } if (inserted || reg->priority > orig_ops[i]->priority) { @@ -377,8 +377,8 @@ static void nf_remove_net_hook(struct nf_hook_entries *old, } } -void __nf_unregister_net_hook(struct net *net, int pf, - const struct nf_hook_ops *reg) +static void __nf_unregister_net_hook(struct net *net, int pf, + const struct nf_hook_ops *reg) { struct nf_hook_entries __rcu **pp; struct nf_hook_entries *p; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 728bf31bb386..975a85a48d39 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -2122,7 +2122,6 @@ ip_set_init(void) return ret; } - pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL); return 0; } @@ -2138,3 +2137,5 @@ ip_set_fini(void) module_init(ip_set_init); module_exit(ip_set_fini); + +MODULE_DESCRIPTION("ip_set: protocol " __stringify(IPSET_PROTOCOL)); diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 299edc6add5a..1c98c907bc63 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -595,7 +595,6 @@ static int ip_vs_app_open(struct inode *inode, struct file *file) } static const struct file_operations ip_vs_app_fops = { - .owner = THIS_MODULE, .open = ip_vs_app_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index f489b8db2406..370abbf6f421 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1143,7 +1143,6 @@ static int ip_vs_conn_open(struct inode *inode, struct file *file) } static const struct file_operations ip_vs_conn_fops = { - .owner = THIS_MODULE, .open = ip_vs_conn_open, .read = seq_read, .llseek = seq_lseek, @@ -1221,7 +1220,6 @@ static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) } static const struct file_operations ip_vs_conn_sync_fops = { - .owner = THIS_MODULE, .open = ip_vs_conn_sync_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index fff213eacf2a..5ebde4b15810 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2116,7 +2116,6 @@ static int ip_vs_info_open(struct inode *inode, struct file *file) } static const struct file_operations ip_vs_info_fops = { - .owner = THIS_MODULE, .open = ip_vs_info_open, .read = seq_read, .llseek = seq_lseek, @@ -2161,7 +2160,6 @@ static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) } static const struct file_operations ip_vs_stats_fops = { - .owner = THIS_MODULE, .open = ip_vs_stats_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2230,7 +2228,6 @@ static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file) } static const struct file_operations ip_vs_stats_percpu_fops = { - .owner = THIS_MODULE, .open = ip_vs_stats_percpu_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index a95518261168..6d65389e308f 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -71,7 +71,7 @@ static inline bool already_closed(const struct nf_conn *conn) return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; else - return 0; + return false; } static int key_diff(const u32 *a, const u32 *b, unsigned int klen) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 6a64d528d076..3d72a0842c01 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -58,8 +58,6 @@ #include "nf_internals.h" -#define NF_CONNTRACK_VERSION "0.5.0" - int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) __read_mostly; @@ -2068,10 +2066,6 @@ int nf_conntrack_init_start(void) if (!nf_conntrack_cachep) goto err_cachep; - printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n", - NF_CONNTRACK_VERSION, nf_conntrack_htable_size, - nf_conntrack_max); - ret = nf_conntrack_expect_init(); if (ret < 0) goto err_expect; diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index d6748a8a79c5..8ef21d9f9a00 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -649,7 +649,6 @@ static int exp_open(struct inode *inode, struct file *file) } static const struct file_operations exp_file_ops = { - .owner = THIS_MODULE, .open = exp_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 7c7921a53b13..dd177ebee9aa 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -57,8 +57,6 @@ MODULE_LICENSE("GPL"); -static char __initdata version[] = "0.93"; - static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *l4proto) @@ -3425,7 +3423,6 @@ static int __init ctnetlink_init(void) { int ret; - pr_info("ctnetlink v%s: registering with nfnetlink.\n", version); ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { pr_err("ctnetlink_init: cannot register with nfnetlink.\n"); @@ -3459,8 +3456,6 @@ err_out: static void __exit ctnetlink_exit(void) { - pr_info("ctnetlink: unregistering from nfnetlink.\n"); - unregister_pernet_subsys(&ctnetlink_net_ops); nfnetlink_subsys_unregister(&ctnl_exp_subsys); nfnetlink_subsys_unregister(&ctnl_subsys); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 46d32baad095..9123fdec5e14 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -382,7 +382,6 @@ static int ct_open(struct inode *inode, struct file *file) } static const struct file_operations ct_file_ops = { - .owner = THIS_MODULE, .open = ct_open, .read = seq_read, .llseek = seq_lseek, @@ -475,7 +474,6 @@ static int ct_cpu_seq_open(struct inode *inode, struct file *file) } static const struct file_operations ct_cpu_seq_fops = { - .owner = THIS_MODULE, .open = ct_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 8bb152a7cca4..c2c1b16b7538 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -402,7 +402,6 @@ static int nflog_open(struct inode *inode, struct file *file) } static const struct file_operations nflog_file_ops = { - .owner = THIS_MODULE, .open = nflog_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 7f55af5f3d1a..d67a96a25a68 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -15,8 +15,6 @@ #include <linux/netfilter_bridge.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv6.h> #include <net/protocol.h> #include <net/netfilter/nf_queue.h> #include <net/dst.h> diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 49bd8bb16b18..92139a087260 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -317,7 +317,6 @@ static int synproxy_cpu_seq_open(struct inode *inode, struct file *file) } static const struct file_operations synproxy_cpu_seq_fops = { - .owner = THIS_MODULE, .open = synproxy_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 336b81689ac9..0791813a1e7d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -26,86 +26,19 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); - -/** - * nft_register_afinfo - register nf_tables address family info - * - * @afi: address family info to register - * - * Register the address family for use with nf_tables. Returns zero on - * success or a negative errno code otherwise. - */ -int nft_register_afinfo(struct net *net, struct nft_af_info *afi) -{ - INIT_LIST_HEAD(&afi->tables); - nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail_rcu(&afi->list, &net->nft.af_info); - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - return 0; -} -EXPORT_SYMBOL_GPL(nft_register_afinfo); - -static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi); - -/** - * nft_unregister_afinfo - unregister nf_tables address family info - * - * @afi: address family info to unregister - * - * Unregister the address family for use with nf_tables. - */ -void nft_unregister_afinfo(struct net *net, struct nft_af_info *afi) -{ - nfnl_lock(NFNL_SUBSYS_NFTABLES); - __nft_release_afinfo(net, afi); - list_del_rcu(&afi->list); - nfnl_unlock(NFNL_SUBSYS_NFTABLES); -} -EXPORT_SYMBOL_GPL(nft_unregister_afinfo); - -static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family) -{ - struct nft_af_info *afi; - - list_for_each_entry(afi, &net->nft.af_info, list) { - if (afi->family == family) - return afi; - } - return NULL; -} - -static struct nft_af_info * -nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) -{ - struct nft_af_info *afi; - - afi = nft_afinfo_lookup(net, family); - if (afi != NULL) - return afi; -#ifdef CONFIG_MODULES - if (autoload) { - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-afinfo-%u", family); - nfnl_lock(NFNL_SUBSYS_NFTABLES); - afi = nft_afinfo_lookup(net, family); - if (afi != NULL) - return ERR_PTR(-EAGAIN); - } -#endif - return ERR_PTR(-EAFNOSUPPORT); -} +static u64 table_handle; static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, - struct nft_af_info *afi, + u8 family, struct nft_table *table, struct nft_chain *chain, const struct nlattr * const *nla) { ctx->net = net; - ctx->afi = afi; + ctx->family = family; ctx->table = table; ctx->chain = chain; ctx->nla = nla; @@ -385,30 +318,61 @@ static int nft_delflowtable(struct nft_ctx *ctx, * Tables */ -static struct nft_table *nft_table_lookup(const struct nft_af_info *afi, +static struct nft_table *nft_table_lookup(const struct net *net, const struct nlattr *nla, - u8 genmask) + u8 family, u8 genmask) { struct nft_table *table; - list_for_each_entry(table, &afi->tables, list) { + list_for_each_entry(table, &net->nft.tables, list) { if (!nla_strcmp(nla, table->name) && + table->family == family && + nft_active_genmask(table, genmask)) + return table; + } + return NULL; +} + +static struct nft_table *nft_table_lookup_byhandle(const struct net *net, + const struct nlattr *nla, + u8 genmask) +{ + struct nft_table *table; + + list_for_each_entry(table, &net->nft.tables, list) { + if (be64_to_cpu(nla_get_be64(nla)) == table->handle && nft_active_genmask(table, genmask)) return table; } return NULL; } -static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi, +static struct nft_table *nf_tables_table_lookup(const struct net *net, const struct nlattr *nla, - u8 genmask) + u8 family, u8 genmask) { struct nft_table *table; if (nla == NULL) return ERR_PTR(-EINVAL); - table = nft_table_lookup(afi, nla, genmask); + table = nft_table_lookup(net, nla, family, genmask); + if (table != NULL) + return table; + + return ERR_PTR(-ENOENT); +} + +static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net, + const struct nlattr *nla, + u8 genmask) +{ + struct nft_table *table; + + if (nla == NULL) + return ERR_PTR(-EINVAL); + + table = nft_table_lookup_byhandle(net, nla, genmask); if (table != NULL) return table; @@ -423,7 +387,7 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table) static const struct nf_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; static const struct nf_chain_type * -__nf_tables_chain_type_lookup(int family, const struct nlattr *nla) +__nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) { int i; @@ -436,22 +400,20 @@ __nf_tables_chain_type_lookup(int family, const struct nlattr *nla) } static const struct nf_chain_type * -nf_tables_chain_type_lookup(const struct nft_af_info *afi, - const struct nlattr *nla, - bool autoload) +nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload) { const struct nf_chain_type *type; - type = __nf_tables_chain_type_lookup(afi->family, nla); + type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) return type; #ifdef CONFIG_MODULES if (autoload) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-chain-%u-%.*s", afi->family, + request_module("nft-chain-%u-%.*s", family, nla_len(nla), (const char *)nla_data(nla)); nfnl_lock(NFNL_SUBSYS_NFTABLES); - type = __nf_tables_chain_type_lookup(afi->family, nla); + type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) return ERR_PTR(-EAGAIN); } @@ -463,6 +425,7 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, + [NFTA_TABLE_HANDLE] = { .type = NLA_U64 }, }; static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, @@ -484,7 +447,9 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || - nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use))) + nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) || + nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle), + NFTA_TABLE_PAD)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -509,7 +474,7 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) goto err; err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->afi->family, ctx->table); + event, 0, ctx->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; @@ -526,7 +491,6 @@ static int nf_tables_dump_tables(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_af_info *afi; const struct nft_table *table; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); @@ -535,30 +499,27 @@ static int nf_tables_dump_tables(struct sk_buff *skb, rcu_read_lock(); cb->seq = net->nft.base_seq; - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (family != NFPROTO_UNSPEC && family != afi->family) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (family != NFPROTO_UNSPEC && family != table->family) continue; - list_for_each_entry_rcu(table, &afi->tables, list) { - if (idx < s_idx) - goto cont; - if (idx > s_idx) - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - if (!nft_is_active(net, table)) - continue; - if (nf_tables_fill_table_info(skb, net, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFT_MSG_NEWTABLE, - NLM_F_MULTI, - afi->family, table) < 0) - goto done; - - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (!nft_is_active(net, table)) + continue; + if (nf_tables_fill_table_info(skb, net, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWTABLE, NLM_F_MULTI, + table->family, table) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: - idx++; - } + idx++; } done: rcu_read_unlock(); @@ -573,7 +534,6 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); - const struct nft_af_info *afi; const struct nft_table *table; struct sk_buff *skb2; int family = nfmsg->nfgen_family; @@ -586,11 +546,8 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -611,10 +568,7 @@ err: return err; } -static void _nf_tables_table_disable(struct net *net, - const struct nft_af_info *afi, - struct nft_table *table, - u32 cnt) +static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt) { struct nft_chain *chain; u32 i = 0; @@ -632,9 +586,7 @@ static void _nf_tables_table_disable(struct net *net, } } -static int nf_tables_table_enable(struct net *net, - const struct nft_af_info *afi, - struct nft_table *table) +static int nf_tables_table_enable(struct net *net, struct nft_table *table) { struct nft_chain *chain; int err, i = 0; @@ -654,15 +606,13 @@ static int nf_tables_table_enable(struct net *net, return 0; err: if (i) - _nf_tables_table_disable(net, afi, table, i); + nft_table_disable(net, table, i); return err; } -static void nf_tables_table_disable(struct net *net, - const struct nft_af_info *afi, - struct nft_table *table) +static void nf_tables_table_disable(struct net *net, struct nft_table *table) { - _nf_tables_table_disable(net, afi, table, 0); + nft_table_disable(net, table, 0); } static int nf_tables_updtable(struct nft_ctx *ctx) @@ -691,7 +641,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx) nft_trans_table_enable(trans) = false; } else if (!(flags & NFT_TABLE_F_DORMANT) && ctx->table->flags & NFT_TABLE_F_DORMANT) { - ret = nf_tables_table_enable(ctx->net, ctx->afi, ctx->table); + ret = nf_tables_table_enable(ctx->net, ctx->table); if (ret >= 0) { ctx->table->flags &= ~NFT_TABLE_F_DORMANT; nft_trans_table_enable(trans) = true; @@ -716,19 +666,14 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); const struct nlattr *name; - struct nft_af_info *afi; struct nft_table *table; int family = nfmsg->nfgen_family; u32 flags = 0; struct nft_ctx ctx; int err; - afi = nf_tables_afinfo_lookup(net, family, true); - if (IS_ERR(afi)) - return PTR_ERR(afi); - name = nla[NFTA_TABLE_NAME]; - table = nf_tables_table_lookup(afi, name, genmask); + table = nf_tables_table_lookup(net, name, family, genmask); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); @@ -738,7 +683,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); return nf_tables_updtable(&ctx); } @@ -748,39 +693,35 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, return -EINVAL; } - err = -EAFNOSUPPORT; - if (!try_module_get(afi->owner)) - goto err1; - err = -ENOMEM; table = kzalloc(sizeof(*table), GFP_KERNEL); if (table == NULL) - goto err2; + goto err_kzalloc; table->name = nla_strdup(name, GFP_KERNEL); if (table->name == NULL) - goto err3; + goto err_strdup; INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); INIT_LIST_HEAD(&table->flowtables); + table->family = family; table->flags = flags; + table->handle = ++table_handle; - nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) - goto err4; + goto err_trans; - list_add_tail_rcu(&table->list, &afi->tables); + list_add_tail_rcu(&table->list, &net->nft.tables); return 0; -err4: +err_trans: kfree(table->name); -err3: +err_strdup: kfree(table); -err2: - module_put(afi->owner); -err1: +err_kzalloc: return err; } @@ -846,30 +787,28 @@ out: static int nft_flush(struct nft_ctx *ctx, int family) { - struct nft_af_info *afi; struct nft_table *table, *nt; const struct nlattr * const *nla = ctx->nla; int err = 0; - list_for_each_entry(afi, &ctx->net->nft.af_info, list) { - if (family != AF_UNSPEC && afi->family != family) + list_for_each_entry_safe(table, nt, &ctx->net->nft.tables, list) { + if (family != AF_UNSPEC && table->family != family) continue; - ctx->afi = afi; - list_for_each_entry_safe(table, nt, &afi->tables, list) { - if (!nft_is_active_next(ctx->net, table)) - continue; + ctx->family = table->family; - if (nla[NFTA_TABLE_NAME] && - nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) - continue; + if (!nft_is_active_next(ctx->net, table)) + continue; - ctx->table = table; + if (nla[NFTA_TABLE_NAME] && + nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) + continue; - err = nft_flush_table(ctx); - if (err < 0) - goto out; - } + ctx->table = table; + + err = nft_flush_table(ctx); + if (err < 0) + goto out; } out: return err; @@ -882,20 +821,23 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); - struct nft_af_info *afi; struct nft_table *table; int family = nfmsg->nfgen_family; struct nft_ctx ctx; - nft_ctx_init(&ctx, net, skb, nlh, NULL, NULL, NULL, nla); - if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL) + nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla); + if (family == AF_UNSPEC || + (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE])) return nft_flush(&ctx, family); - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); + if (nla[NFTA_TABLE_HANDLE]) + table = nf_tables_table_lookup_byhandle(net, + nla[NFTA_TABLE_HANDLE], + genmask); + else + table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], + family, genmask); - table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -903,7 +845,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, table->use > 0) return -EBUSY; - ctx.afi = afi; + ctx.family = family; ctx.table = table; return nft_flush_table(&ctx); @@ -915,7 +857,6 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) kfree(ctx->table->name); kfree(ctx->table); - module_put(ctx->afi->owner); } int nft_register_chain_type(const struct nf_chain_type *ctype) @@ -1116,7 +1057,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) goto err; err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->afi->family, ctx->table, + event, 0, ctx->family, ctx->table, ctx->chain); if (err < 0) { kfree_skb(skb); @@ -1134,7 +1075,6 @@ static int nf_tables_dump_chains(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; unsigned int idx = 0, s_idx = cb->args[0]; @@ -1144,31 +1084,30 @@ static int nf_tables_dump_chains(struct sk_buff *skb, rcu_read_lock(); cb->seq = net->nft.base_seq; - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (family != NFPROTO_UNSPEC && family != afi->family) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (family != NFPROTO_UNSPEC && family != table->family) continue; - list_for_each_entry_rcu(table, &afi->tables, list) { - list_for_each_entry_rcu(chain, &table->chains, list) { - if (idx < s_idx) - goto cont; - if (idx > s_idx) - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - if (!nft_is_active(net, chain)) - continue; - if (nf_tables_fill_chain_info(skb, net, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFT_MSG_NEWCHAIN, - NLM_F_MULTI, - afi->family, table, chain) < 0) - goto done; + list_for_each_entry_rcu(chain, &table->chains, list) { + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (!nft_is_active(net, chain)) + continue; + if (nf_tables_fill_chain_info(skb, net, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWCHAIN, + NLM_F_MULTI, + table->family, table, + chain) < 0) + goto done; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: - idx++; - } + idx++; } } done: @@ -1184,7 +1123,6 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); - const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; struct sk_buff *skb2; @@ -1198,11 +1136,8 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1310,8 +1245,8 @@ struct nft_chain_hook { static int nft_chain_parse_hook(struct net *net, const struct nlattr * const nla[], - struct nft_af_info *afi, - struct nft_chain_hook *hook, bool create) + struct nft_chain_hook *hook, u8 family, + bool create) { struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nf_chain_type *type; @@ -1328,15 +1263,12 @@ static int nft_chain_parse_hook(struct net *net, return -EINVAL; hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); - if (hook->num >= afi->nhooks) - return -EINVAL; - hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); - type = chain_type[afi->family][NFT_CHAIN_T_DEFAULT]; + type = chain_type[family][NFT_CHAIN_T_DEFAULT]; if (nla[NFTA_CHAIN_TYPE]) { - type = nf_tables_chain_type_lookup(afi, nla[NFTA_CHAIN_TYPE], - create); + type = nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE], + family, create); if (IS_ERR(type)) return PTR_ERR(type); } @@ -1353,7 +1285,7 @@ static int nft_chain_parse_hook(struct net *net, hook->type = type; hook->dev = NULL; - if (afi->flags & NFT_AF_NEEDS_DEV) { + if (family == NFPROTO_NETDEV) { char ifname[IFNAMSIZ]; if (!ha[NFTA_HOOK_DEV]) { @@ -1388,7 +1320,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; - struct nft_af_info *afi = ctx->afi; struct nft_base_chain *basechain; struct nft_stats __percpu *stats; struct net *net = ctx->net; @@ -1402,7 +1333,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_chain_hook hook; struct nf_hook_ops *ops; - err = nft_chain_parse_hook(net, nla, afi, &hook, create); + err = nft_chain_parse_hook(net, nla, &hook, family, create); if (err < 0) return err; @@ -1495,7 +1426,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (!nft_is_base_chain(chain)) return -EBUSY; - err = nft_chain_parse_hook(ctx->net, nla, ctx->afi, &hook, + err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family, create); if (err < 0) return err; @@ -1574,7 +1505,6 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, const struct nlattr * uninitialized_var(name); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; - struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; u8 policy = NF_ACCEPT; @@ -1584,11 +1514,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - afi = nf_tables_afinfo_lookup(net, family, true); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1628,7 +1555,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } } - nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain != NULL) { if (nlh->nlmsg_flags & NLM_F_EXCL) @@ -1649,24 +1576,26 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); - struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule; int family = nfmsg->nfgen_family; struct nft_ctx ctx; + u64 handle; u32 use; int err; - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + if (nla[NFTA_CHAIN_HANDLE]) { + handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); + chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); + } else { + chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + } if (IS_ERR(chain)) return PTR_ERR(chain); @@ -1674,7 +1603,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, chain->use > 0) return -EBUSY; - nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); use = chain->use; list_for_each_entry(rule, &chain->rules, list) { @@ -1839,7 +1768,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, if (err < 0) return err; - type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]); + type = nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); if (IS_ERR(type)) return PTR_ERR(type); @@ -2062,7 +1991,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, goto err; err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->afi->family, ctx->table, + event, 0, ctx->family, ctx->table, ctx->chain, rule); if (err < 0) { kfree_skb(skb); @@ -2086,7 +2015,6 @@ static int nf_tables_dump_rules(struct sk_buff *skb, { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); const struct nft_rule_dump_ctx *ctx = cb->data; - const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; const struct nft_rule *rule; @@ -2097,39 +2025,37 @@ static int nf_tables_dump_rules(struct sk_buff *skb, rcu_read_lock(); cb->seq = net->nft.base_seq; - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (family != NFPROTO_UNSPEC && family != afi->family) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (family != NFPROTO_UNSPEC && family != table->family) continue; - list_for_each_entry_rcu(table, &afi->tables, list) { - if (ctx && ctx->table && - strcmp(ctx->table, table->name) != 0) + if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0) + continue; + + list_for_each_entry_rcu(chain, &table->chains, list) { + if (ctx && ctx->chain && + strcmp(ctx->chain, chain->name) != 0) continue; - list_for_each_entry_rcu(chain, &table->chains, list) { - if (ctx && ctx->chain && - strcmp(ctx->chain, chain->name) != 0) - continue; - - list_for_each_entry_rcu(rule, &chain->rules, list) { - if (!nft_is_active(net, rule)) - goto cont; - if (idx < s_idx) - goto cont; - if (idx > s_idx) - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFT_MSG_NEWRULE, - NLM_F_MULTI | NLM_F_APPEND, - afi->family, table, chain, rule) < 0) - goto done; - - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + list_for_each_entry_rcu(rule, &chain->rules, list) { + if (!nft_is_active(net, rule)) + goto cont; + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWRULE, + NLM_F_MULTI | NLM_F_APPEND, + table->family, + table, chain, rule) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: - idx++; - } + idx++; } } } @@ -2159,7 +2085,6 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); - const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; const struct nft_rule *rule; @@ -2203,11 +2128,8 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -2264,7 +2186,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); - struct nft_af_info *afi; + int family = nfmsg->nfgen_family; struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; @@ -2280,11 +2202,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -2323,7 +2242,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(old_rule); } - nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); n = 0; size = 0; @@ -2447,18 +2366,14 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); - struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain = NULL; struct nft_rule *rule; int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -2469,7 +2384,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } - nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { @@ -2636,6 +2551,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, + [NFTA_SET_HANDLE] = { .type = NLA_U64 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -2649,26 +2565,17 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct nft_af_info *afi = NULL; + int family = nfmsg->nfgen_family; struct nft_table *table = NULL; - if (nfmsg->nfgen_family != NFPROTO_UNSPEC) { - afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - } - if (nla[NFTA_SET_TABLE] != NULL) { - if (afi == NULL) - return -EAFNOSUPPORT; - - table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], - genmask); + table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], + family, genmask); if (IS_ERR(table)) return PTR_ERR(table); } - nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); return 0; } @@ -2688,6 +2595,22 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-ENOENT); } +static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) +{ + struct nft_set *set; + + if (nla == NULL) + return ERR_PTR(-EINVAL); + + list_for_each_entry(set, &table->sets, list) { + if (be64_to_cpu(nla_get_be64(nla)) == set->handle && + nft_active_genmask(set, genmask)) + return set; + } + return ERR_PTR(-ENOENT); +} + static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, const struct nlattr *nla, u8 genmask) @@ -2795,7 +2718,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx->afi->family; + nfmsg->nfgen_family = ctx->family; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); @@ -2803,6 +2726,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) goto nla_put_failure; + if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle), + NFTA_SET_PAD)) + goto nla_put_failure; if (set->flags != 0) if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags))) goto nla_put_failure; @@ -2887,10 +2813,8 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) { const struct nft_set *set; unsigned int idx, s_idx = cb->args[0]; - struct nft_af_info *afi; struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; struct net *net = sock_net(skb->sk); - int cur_family = cb->args[3]; struct nft_ctx *ctx = cb->data, ctx_set; if (cb->args[1]) @@ -2899,51 +2823,44 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); cb->seq = net->nft.base_seq; - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (ctx->afi && ctx->afi != afi) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (ctx->family != NFPROTO_UNSPEC && + ctx->family != table->family) continue; - if (cur_family) { - if (afi->family != cur_family) + if (ctx->table && ctx->table != table) + continue; + + if (cur_table) { + if (cur_table != table) continue; - cur_family = 0; + cur_table = NULL; } - list_for_each_entry_rcu(table, &afi->tables, list) { - if (ctx->table && ctx->table != table) - continue; + idx = 0; + list_for_each_entry_rcu(set, &table->sets, list) { + if (idx < s_idx) + goto cont; + if (!nft_is_active(net, set)) + goto cont; - if (cur_table) { - if (cur_table != table) - continue; + ctx_set = *ctx; + ctx_set.table = table; + ctx_set.family = table->family; - cur_table = NULL; + if (nf_tables_fill_set(skb, &ctx_set, set, + NFT_MSG_NEWSET, + NLM_F_MULTI) < 0) { + cb->args[0] = idx; + cb->args[2] = (unsigned long) table; + goto done; } - idx = 0; - list_for_each_entry_rcu(set, &table->sets, list) { - if (idx < s_idx) - goto cont; - if (!nft_is_active(net, set)) - goto cont; - - ctx_set = *ctx; - ctx_set.table = table; - ctx_set.afi = afi; - if (nf_tables_fill_set(skb, &ctx_set, set, - NFT_MSG_NEWSET, - NLM_F_MULTI) < 0) { - cb->args[0] = idx; - cb->args[2] = (unsigned long) table; - cb->args[3] = afi->family; - goto done; - } - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: - idx++; - } - if (s_idx) - s_idx = 0; + idx++; } + if (s_idx) + s_idx = 0; } cb->args[1] = 1; done: @@ -3041,8 +2958,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); + int family = nfmsg->nfgen_family; const struct nft_set_ops *ops; - struct nft_af_info *afi; struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; @@ -3149,15 +3066,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); - nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { @@ -3223,6 +3137,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->udata = udata; set->timeout = timeout; set->gc_int = gc_int; + set->handle = nf_tables_alloc_handle(table); err = ops->init(set, &desc, nla); if (err < 0) @@ -3280,7 +3195,10 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); + if (nla[NFTA_SET_HANDLE]) + set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask); + else + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -3415,19 +3333,15 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct nft_af_info *afi; + int family = nfmsg->nfgen_family; struct nft_table *table; - afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE], - genmask); + table = nf_tables_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], + family, genmask); if (IS_ERR(table)) return PTR_ERR(table); - nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); return 0; } @@ -3532,7 +3446,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; struct net *net = sock_net(skb->sk); - struct nft_af_info *afi; struct nft_table *table; struct nft_set *set; struct nft_set_dump_args args; @@ -3544,21 +3457,19 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) int event; rcu_read_lock(); - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (afi != dump_ctx->ctx.afi) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (dump_ctx->ctx.family != NFPROTO_UNSPEC && + dump_ctx->ctx.family != table->family) continue; - list_for_each_entry_rcu(table, &afi->tables, list) { - if (table != dump_ctx->ctx.table) - continue; + if (table != dump_ctx->ctx.table) + continue; - list_for_each_entry_rcu(set, &table->sets, list) { - if (set == dump_ctx->set) { - set_found = true; - break; - } + list_for_each_entry_rcu(set, &table->sets, list) { + if (set == dump_ctx->set) { + set_found = true; + break; } - break; } break; } @@ -3578,7 +3489,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = afi->family; + nfmsg->nfgen_family = table->family; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(net->nft.base_seq & 0xffff); @@ -3641,7 +3552,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx->afi->family; + nfmsg->nfgen_family = ctx->family; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); @@ -3998,7 +3909,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, list_for_each_entry(binding, &set->bindings, list) { struct nft_ctx bind_ctx = { .net = ctx->net, - .afi = ctx->afi, + .family = ctx->family, .table = ctx->table, .chain = (struct nft_chain *)binding->chain, }; @@ -4417,6 +4328,21 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); +struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u32 objtype, u8 genmask) +{ + struct nft_object *obj; + + list_for_each_entry(obj, &table->objects, list) { + if (be64_to_cpu(nla_get_be64(nla)) == obj->handle && + objtype == obj->ops->type->type && + nft_active_genmask(obj, genmask)) + return obj; + } + return ERR_PTR(-ENOENT); +} + static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { [NFTA_OBJ_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, @@ -4424,6 +4350,7 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, + [NFTA_OBJ_HANDLE] = { .type = NLA_U64}, }; static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, @@ -4529,7 +4456,6 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, const struct nft_object_type *type; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; - struct nft_af_info *afi; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; @@ -4541,11 +4467,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_DATA]) return -EINVAL; - afi = nf_tables_afinfo_lookup(net, family, true); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -4563,7 +4486,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, return 0; } - nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); type = nft_obj_type_get(objtype); if (IS_ERR(type)) @@ -4575,6 +4498,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, goto err1; } obj->table = table; + obj->handle = nf_tables_alloc_handle(table); + obj->name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL); if (!obj->name) { err = -ENOMEM; @@ -4621,7 +4546,9 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, nla_put_string(skb, NFTA_OBJ_NAME, obj->name) || nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || - nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset)) + nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset) || + nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle), + NFTA_OBJ_PAD)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -4640,7 +4567,6 @@ struct nft_obj_filter { static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_af_info *afi; const struct nft_table *table; unsigned int idx = 0, s_idx = cb->args[0]; struct nft_obj_filter *filter = cb->data; @@ -4655,38 +4581,37 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); cb->seq = net->nft.base_seq; - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (family != NFPROTO_UNSPEC && family != afi->family) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (family != NFPROTO_UNSPEC && family != table->family) continue; - list_for_each_entry_rcu(table, &afi->tables, list) { - list_for_each_entry_rcu(obj, &table->objects, list) { - if (!nft_is_active(net, obj)) - goto cont; - if (idx < s_idx) - goto cont; - if (idx > s_idx) - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - if (filter && filter->table[0] && - strcmp(filter->table, table->name)) - goto cont; - if (filter && - filter->type != NFT_OBJECT_UNSPEC && - obj->ops->type->type != filter->type) - goto cont; + list_for_each_entry_rcu(obj, &table->objects, list) { + if (!nft_is_active(net, obj)) + goto cont; + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (filter && filter->table[0] && + strcmp(filter->table, table->name)) + goto cont; + if (filter && + filter->type != NFT_OBJECT_UNSPEC && + obj->ops->type->type != filter->type) + goto cont; - if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFT_MSG_NEWOBJ, - NLM_F_MULTI | NLM_F_APPEND, - afi->family, table, obj, reset) < 0) - goto done; + if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWOBJ, + NLM_F_MULTI | NLM_F_APPEND, + table->family, table, + obj, reset) < 0) + goto done; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: - idx++; - } + idx++; } } done: @@ -4738,7 +4663,6 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); int family = nfmsg->nfgen_family; - const struct nft_af_info *afi; const struct nft_table *table; struct nft_object *obj; struct sk_buff *skb2; @@ -4769,11 +4693,8 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_TYPE]) return -EINVAL; - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -4819,32 +4740,33 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; - struct nft_af_info *afi; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; u32 objtype; if (!nla[NFTA_OBJ_TYPE] || - !nla[NFTA_OBJ_NAME]) + (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) return -EINVAL; - afi = nf_tables_afinfo_lookup(net, family, true); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + if (nla[NFTA_OBJ_HANDLE]) + obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE], + objtype, genmask); + else + obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], + objtype, genmask); if (IS_ERR(obj)) return PTR_ERR(obj); if (obj->use > 0) return -EBUSY; - nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); return nft_delobj(&ctx, obj); } @@ -4882,7 +4804,7 @@ static void nf_tables_obj_notify(const struct nft_ctx *ctx, struct nft_object *obj, int event) { nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, - ctx->afi->family, ctx->report, GFP_KERNEL); + ctx->family, ctx->report, GFP_KERNEL); } /* @@ -4910,6 +4832,7 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_NAME] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, [NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED }, + [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, }; struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, @@ -4927,6 +4850,20 @@ struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup); +struct nft_flowtable * +nf_tables_flowtable_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) +{ + struct nft_flowtable *flowtable; + + list_for_each_entry(flowtable, &table->flowtables, list) { + if (be64_to_cpu(nla_get_be64(nla)) == flowtable->handle && + nft_active_genmask(flowtable, genmask)) + return flowtable; + } + return ERR_PTR(-ENOENT); +} + #define NFT_FLOWTABLE_DEVICE_MAX 8 static int nf_tables_parse_devices(const struct nft_ctx *ctx, @@ -4993,7 +4930,7 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, return -EINVAL; hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); - if (hooknum >= ctx->afi->nhooks) + if (hooknum != NF_NETDEV_INGRESS) return -EINVAL; priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); @@ -5009,6 +4946,8 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, goto err1; } + flowtable->hooknum = hooknum; + flowtable->priority = priority; flowtable->ops = ops; flowtable->ops_len = n; @@ -5029,33 +4968,31 @@ err1: return err; } -static const struct nf_flowtable_type * -__nft_flowtable_type_get(const struct nft_af_info *afi) +static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family) { const struct nf_flowtable_type *type; list_for_each_entry(type, &nf_tables_flowtables, list) { - if (afi->family == type->family) + if (family == type->family) return type; } return NULL; } -static const struct nf_flowtable_type * -nft_flowtable_type_get(const struct nft_af_info *afi) +static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family) { const struct nf_flowtable_type *type; - type = __nft_flowtable_type_get(afi); + type = __nft_flowtable_type_get(family); if (type != NULL && try_module_get(type->owner)) return type; #ifdef CONFIG_MODULES if (type == NULL) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nf-flowtable-%u", afi->family); + request_module("nf-flowtable-%u", family); nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (__nft_flowtable_type_get(afi)) + if (__nft_flowtable_type_get(family)) return ERR_PTR(-EAGAIN); } #endif @@ -5067,15 +5004,12 @@ void nft_flow_table_iterate(struct net *net, void *data) { struct nft_flowtable *flowtable; - const struct nft_af_info *afi; const struct nft_table *table; rcu_read_lock(); - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - list_for_each_entry_rcu(table, &afi->tables, list) { - list_for_each_entry_rcu(flowtable, &table->flowtables, list) { - iter(&flowtable->data, data); - } + list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(flowtable, &table->flowtables, list) { + iter(&flowtable->data, data); } } rcu_read_unlock(); @@ -5106,7 +5040,6 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; - struct nft_af_info *afi; struct nft_table *table; struct nft_ctx ctx; int err, i, k; @@ -5116,11 +5049,8 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, !nla[NFTA_FLOWTABLE_HOOK]) return -EINVAL; - afi = nf_tables_afinfo_lookup(net, family, true); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], + family, genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -5137,20 +5067,22 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, return 0; } - nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL); if (!flowtable) return -ENOMEM; flowtable->table = table; + flowtable->handle = nf_tables_alloc_handle(table); + flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL); if (!flowtable->name) { err = -ENOMEM; goto err1; } - type = nft_flowtable_type_get(afi); + type = nft_flowtable_type_get(family); if (IS_ERR(type)) { err = PTR_ERR(type); goto err2; @@ -5210,26 +5142,28 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; - struct nft_af_info *afi; struct nft_table *table; struct nft_ctx ctx; - afi = nf_tables_afinfo_lookup(net, family, true); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], + family, genmask); if (IS_ERR(table)) return PTR_ERR(table); - flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], - genmask); + if (nla[NFTA_FLOWTABLE_HANDLE]) + flowtable = nf_tables_flowtable_lookup_byhandle(table, + nla[NFTA_FLOWTABLE_HANDLE], + genmask); + else + flowtable = nf_tables_flowtable_lookup(table, + nla[NFTA_FLOWTABLE_NAME], + genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); if (flowtable->use > 0) return -EBUSY; - nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); return nft_delflowtable(&ctx, flowtable); } @@ -5256,7 +5190,9 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) || nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || - nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use))) + nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || + nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle), + NFTA_FLOWTABLE_PAD)) goto nla_put_failure; nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK); @@ -5298,40 +5234,37 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; - const struct nft_af_info *afi; const struct nft_table *table; rcu_read_lock(); cb->seq = net->nft.base_seq; - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (family != NFPROTO_UNSPEC && family != afi->family) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (family != NFPROTO_UNSPEC && family != table->family) continue; - list_for_each_entry_rcu(table, &afi->tables, list) { - list_for_each_entry_rcu(flowtable, &table->flowtables, list) { - if (!nft_is_active(net, flowtable)) - goto cont; - if (idx < s_idx) - goto cont; - if (idx > s_idx) - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - if (filter && filter->table[0] && - strcmp(filter->table, table->name)) - goto cont; + list_for_each_entry_rcu(flowtable, &table->flowtables, list) { + if (!nft_is_active(net, flowtable)) + goto cont; + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (filter && filter->table[0] && + strcmp(filter->table, table->name)) + goto cont; - if (nf_tables_fill_flowtable_info(skb, net, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFT_MSG_NEWFLOWTABLE, - NLM_F_MULTI | NLM_F_APPEND, - afi->family, flowtable) < 0) - goto done; + if (nf_tables_fill_flowtable_info(skb, net, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWFLOWTABLE, + NLM_F_MULTI | NLM_F_APPEND, + table->family, flowtable) < 0) + goto done; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: - idx++; - } + idx++; } } done: @@ -5384,7 +5317,6 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, u8 genmask = nft_genmask_cur(net); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; - const struct nft_af_info *afi; const struct nft_table *table; struct sk_buff *skb2; int err; @@ -5410,17 +5342,14 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, if (!nla[NFTA_FLOWTABLE_NAME]) return -EINVAL; - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], + family, genmask); if (IS_ERR(table)) return PTR_ERR(table); flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], genmask); - if (IS_ERR(table)) + if (IS_ERR(flowtable)) return PTR_ERR(flowtable); skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); @@ -5457,7 +5386,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, flowtable); + ctx->family, flowtable); if (err < 0) { kfree_skb(skb); goto err; @@ -5535,17 +5464,14 @@ static int nf_tables_flowtable_event(struct notifier_block *this, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_flowtable *flowtable; struct nft_table *table; - struct nft_af_info *afi; if (event != NETDEV_UNREGISTER) return 0; nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) { - list_for_each_entry(table, &afi->tables, list) { - list_for_each_entry(flowtable, &table->flowtables, list) { - nft_flowtable_event(event, dev, flowtable); - } + list_for_each_entry(table, &dev_net(dev)->nft.tables, list) { + list_for_each_entry(flowtable, &table->flowtables, list) { + nft_flowtable_event(event, dev, flowtable); } } nfnl_unlock(NFNL_SUBSYS_NFTABLES); @@ -5798,7 +5724,6 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_table_update(trans)) { if (!nft_trans_table_enable(trans)) { nf_tables_table_disable(net, - trans->ctx.afi, trans->ctx.table); trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; } @@ -5960,7 +5885,6 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) if (nft_trans_table_update(trans)) { if (nft_trans_table_enable(trans)) { nf_tables_table_disable(net, - trans->ctx.afi, trans->ctx.table); trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; } @@ -6563,20 +6487,6 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, } EXPORT_SYMBOL_GPL(nft_data_dump); -static int __net_init nf_tables_init_net(struct net *net) -{ - INIT_LIST_HEAD(&net->nft.af_info); - INIT_LIST_HEAD(&net->nft.commit_list); - net->nft.base_seq = 1; - return 0; -} - -static void __net_exit nf_tables_exit_net(struct net *net) -{ - WARN_ON_ONCE(!list_empty(&net->nft.af_info)); - WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); -} - int __nft_release_basechain(struct nft_ctx *ctx) { struct nft_rule *rule, *nr; @@ -6597,8 +6507,7 @@ int __nft_release_basechain(struct nft_ctx *ctx) } EXPORT_SYMBOL_GPL(__nft_release_basechain); -/* Called by nft_unregister_afinfo() from __net_exit path, nfnl_lock is held. */ -static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) +static void __nft_release_tables(struct net *net) { struct nft_flowtable *flowtable, *nf; struct nft_table *table, *nt; @@ -6608,10 +6517,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) struct nft_set *set, *ns; struct nft_ctx ctx = { .net = net, - .afi = afi, }; - list_for_each_entry_safe(table, nt, &afi->tables, list) { + list_for_each_entry_safe(table, nt, &net->nft.tables, list) { + ctx.family = table->family; + list_for_each_entry(chain, &table->chains, list) nf_tables_unregister_hook(net, table, chain); list_for_each_entry(flowtable, &table->flowtables, list) @@ -6652,6 +6562,21 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) } } +static int __net_init nf_tables_init_net(struct net *net) +{ + INIT_LIST_HEAD(&net->nft.tables); + INIT_LIST_HEAD(&net->nft.commit_list); + net->nft.base_seq = 1; + return 0; +} + +static void __net_exit nf_tables_exit_net(struct net *net) +{ + __nft_release_tables(net); + WARN_ON_ONCE(!list_empty(&net->nft.tables)); + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); +} + static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .exit = nf_tables_exit_net, @@ -6678,7 +6603,6 @@ static int __init nf_tables_module_init(void) register_netdevice_notifier(&nf_tables_flowtable_notifier); - pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n"); return register_pernet_subsys(&nf_tables_net_ops); err3: nf_tables_core_module_exit(); diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c index 58b9be7480bb..e30c7da09d0d 100644 --- a/net/netfilter/nf_tables_inet.c +++ b/net/netfilter/nf_tables_inet.c @@ -38,40 +38,6 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, return nft_do_chain(&pkt, priv); } -static struct nft_af_info nft_af_inet __read_mostly = { - .family = NFPROTO_INET, - .nhooks = NF_INET_NUMHOOKS, - .owner = THIS_MODULE, -}; - -static int __net_init nf_tables_inet_init_net(struct net *net) -{ - net->nft.inet = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.inet == NULL) - return -ENOMEM; - memcpy(net->nft.inet, &nft_af_inet, sizeof(nft_af_inet)); - - if (nft_register_afinfo(net, net->nft.inet) < 0) - goto err; - - return 0; - -err: - kfree(net->nft.inet); - return -ENOMEM; -} - -static void __net_exit nf_tables_inet_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.inet); - kfree(net->nft.inet); -} - -static struct pernet_operations nf_tables_inet_net_ops = { - .init = nf_tables_inet_init_net, - .exit = nf_tables_inet_exit_net, -}; - static const struct nf_chain_type filter_inet = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -93,22 +59,11 @@ static const struct nf_chain_type filter_inet = { static int __init nf_tables_inet_init(void) { - int ret; - - ret = nft_register_chain_type(&filter_inet); - if (ret < 0) - return ret; - - ret = register_pernet_subsys(&nf_tables_inet_net_ops); - if (ret < 0) - nft_unregister_chain_type(&filter_inet); - - return ret; + return nft_register_chain_type(&filter_inet); } static void __exit nf_tables_inet_exit(void) { - unregister_pernet_subsys(&nf_tables_inet_net_ops); nft_unregister_chain_type(&filter_inet); } @@ -117,4 +72,4 @@ module_exit(nf_tables_inet_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(1); +MODULE_ALIAS_NFT_CHAIN(1, "filter"); diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 42f6f6d42a6d..4041fafca934 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -38,41 +38,6 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb, return nft_do_chain(&pkt, priv); } -static struct nft_af_info nft_af_netdev __read_mostly = { - .family = NFPROTO_NETDEV, - .nhooks = NF_NETDEV_NUMHOOKS, - .owner = THIS_MODULE, - .flags = NFT_AF_NEEDS_DEV, -}; - -static int nf_tables_netdev_init_net(struct net *net) -{ - net->nft.netdev = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.netdev == NULL) - return -ENOMEM; - - memcpy(net->nft.netdev, &nft_af_netdev, sizeof(nft_af_netdev)); - - if (nft_register_afinfo(net, net->nft.netdev) < 0) - goto err; - - return 0; -err: - kfree(net->nft.netdev); - return -ENOMEM; -} - -static void nf_tables_netdev_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.netdev); - kfree(net->nft.netdev); -} - -static struct pernet_operations nf_tables_netdev_net_ops = { - .init = nf_tables_netdev_init_net, - .exit = nf_tables_netdev_exit_net, -}; - static const struct nf_chain_type nft_filter_chain_netdev = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -109,7 +74,6 @@ static int nf_tables_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain, *nr; struct nft_ctx ctx = { @@ -121,20 +85,18 @@ static int nf_tables_netdev_event(struct notifier_block *this, return NOTIFY_DONE; nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) { - ctx.afi = afi; - if (afi->family != NFPROTO_NETDEV) + list_for_each_entry(table, &ctx.net->nft.tables, list) { + if (table->family != NFPROTO_NETDEV) continue; - list_for_each_entry(table, &afi->tables, list) { - ctx.table = table; - list_for_each_entry_safe(chain, nr, &table->chains, list) { - if (!nft_is_base_chain(chain)) - continue; + ctx.family = table->family; + ctx.table = table; + list_for_each_entry_safe(chain, nr, &table->chains, list) { + if (!nft_is_base_chain(chain)) + continue; - ctx.chain = chain; - nft_netdev_event(event, dev, &ctx); - } + ctx.chain = chain; + nft_netdev_event(event, dev, &ctx); } } nfnl_unlock(NFNL_SUBSYS_NFTABLES); @@ -154,27 +116,21 @@ static int __init nf_tables_netdev_init(void) if (ret) return ret; - ret = register_pernet_subsys(&nf_tables_netdev_net_ops); - if (ret) - goto err1; - ret = register_netdevice_notifier(&nf_tables_netdev_notifier); if (ret) - goto err2; + goto err_register_netdevice_notifier; return 0; -err2: - unregister_pernet_subsys(&nf_tables_netdev_net_ops); -err1: +err_register_netdevice_notifier: nft_unregister_chain_type(&nft_filter_chain_netdev); + return ret; } static void __exit nf_tables_netdev_exit(void) { unregister_netdevice_notifier(&nf_tables_netdev_notifier); - unregister_pernet_subsys(&nf_tables_netdev_net_ops); nft_unregister_chain_type(&nft_filter_chain_netdev); } @@ -183,4 +139,4 @@ module_exit(nf_tables_netdev_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); -MODULE_ALIAS_NFT_FAMILY(5); /* NFPROTO_NETDEV */ +MODULE_ALIAS_NFT_CHAIN(5, "filter"); /* NFPROTO_NETDEV */ diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 733d3e4a30d8..03ead8a9e90c 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -37,8 +37,6 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); rcu_dereference_protected(table[(id)].subsys, \ lockdep_nfnl_is_held((id))) -static char __initdata nfversion[] = "0.30"; - static struct { struct mutex mutex; const struct nfnetlink_subsystem __rcu *subsys; @@ -580,13 +578,11 @@ static int __init nfnetlink_init(void) for (i=0; i<NFNL_SUBSYS_COUNT; i++) mutex_init(&table[i].mutex); - pr_info("Netfilter messages via NETLINK v%s.\n", nfversion); return register_pernet_subsys(&nfnetlink_net_ops); } static void __exit nfnetlink_exit(void) { - pr_info("Removing netfilter NETLINK layer.\n"); unregister_pernet_subsys(&nfnetlink_net_ops); } module_init(nfnetlink_init); diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index c45e6d4358ab..88d427f9f9e6 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -527,7 +527,6 @@ static int __init nfnl_acct_init(void) goto err_out; } - pr_info("nfnl_acct: registering with nfnetlink.\n"); ret = nfnetlink_subsys_register(&nfnl_acct_subsys); if (ret < 0) { pr_err("nfnl_acct_init: cannot register with nfnetlink.\n"); @@ -543,7 +542,6 @@ err_out: static void __exit nfnl_acct_exit(void) { - pr_info("nfnl_acct: unregistering from nfnetlink.\n"); nfnetlink_subsys_unregister(&nfnl_acct_subsys); unregister_pernet_subsys(&nfnl_acct_ops); } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 32b1c0b44e79..95b04702a655 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -615,8 +615,6 @@ err_out: static void __exit cttimeout_exit(void) { - pr_info("cttimeout: unregistering from nfnetlink.\n"); - nfnetlink_subsys_unregister(&cttimeout_subsys); unregister_pernet_subsys(&cttimeout_ops); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index e955bec0acc6..7b46aa4c478d 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -1054,7 +1054,6 @@ static int nful_open(struct inode *inode, struct file *file) } static const struct file_operations nful_file_ops = { - .owner = THIS_MODULE, .open = nful_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 2db35f2d553d..8bba23160a68 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1477,7 +1477,6 @@ static int nfqnl_open(struct inode *inode, struct file *file) } static const struct file_operations nfqnl_file_ops = { - .owner = THIS_MODULE, .open = nfqnl_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index dcff0dc8d28b..8e23726b9081 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -144,7 +144,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, { par->net = ctx->net; par->table = ctx->table->name; - switch (ctx->afi->family) { + switch (ctx->family) { case AF_INET: entry->e4.ip.proto = proto; entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; @@ -175,7 +175,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, } else { par->hook_mask = 0; } - par->family = ctx->afi->family; + par->family = ctx->family; par->nft_compat = true; } @@ -267,7 +267,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) par.net = ctx->net; par.target = target; par.targinfo = info; - par.family = ctx->afi->family; + par.family = ctx->family; if (par.target->destroy != NULL) par.target->destroy(&par); @@ -358,7 +358,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, { par->net = ctx->net; par->table = ctx->table->name; - switch (ctx->afi->family) { + switch (ctx->family) { case AF_INET: entry->e4.ip.proto = proto; entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; @@ -389,7 +389,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, } else { par->hook_mask = 0; } - par->family = ctx->afi->family; + par->family = ctx->family; par->nft_compat = true; } @@ -446,7 +446,7 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) par.net = ctx->net; par.match = match; par.matchinfo = info; - par.family = ctx->afi->family; + par.family = ctx->family; if (par.match->destroy != NULL) par.match->destroy(&par); @@ -648,7 +648,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, mt_name = nla_data(tb[NFTA_MATCH_NAME]); rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV])); - family = ctx->afi->family; + family = ctx->family; /* Re-use the existing match if it's already loaded. */ list_for_each_entry(nft_match, &nft_match_list, head) { @@ -733,7 +733,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, tg_name = nla_data(tb[NFTA_TARGET_NAME]); rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV])); - family = ctx->afi->family; + family = ctx->family; /* Re-use the existing target if it's already loaded. */ list_for_each_entry(nft_target, &nft_target_list, head) { @@ -812,8 +812,6 @@ static int __init nft_compat_module_init(void) goto err_target; } - pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n"); - return ret; err_target: diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 2647b895f4b0..6ab274b14484 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -405,7 +405,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; - switch (ctx->afi->family) { + switch (ctx->family) { case NFPROTO_IPV4: len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip); @@ -456,7 +456,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nf_ct_netns_get(ctx->net, ctx->afi->family); + err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) return err; @@ -550,7 +550,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, if (err < 0) goto err1; - err = nf_ct_netns_get(ctx->net, ctx->afi->family); + err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) goto err1; @@ -564,7 +564,7 @@ err1: static void nft_ct_get_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { - nf_ct_netns_put(ctx->net, ctx->afi->family); + nf_ct_netns_put(ctx->net, ctx->family); } static void nft_ct_set_destroy(const struct nft_ctx *ctx, @@ -573,7 +573,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv = nft_expr_priv(expr); __nft_ct_set_destroy(ctx, priv); - nf_ct_netns_put(ctx->net, ctx->afi->family); + nf_ct_netns_put(ctx->net, ctx->family); } static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -734,7 +734,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, struct nft_ct_helper_obj *priv = nft_obj_data(obj); struct nf_conntrack_helper *help4, *help6; char name[NF_CT_HELPER_NAME_LEN]; - int family = ctx->afi->family; + int family = ctx->family; if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO]) return -EINVAL; @@ -753,14 +753,14 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, switch (family) { case NFPROTO_IPV4: - if (ctx->afi->family == NFPROTO_IPV6) + if (ctx->family == NFPROTO_IPV6) return -EINVAL; help4 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; case NFPROTO_IPV6: - if (ctx->afi->family == NFPROTO_IPV4) + if (ctx->family == NFPROTO_IPV4) return -EINVAL; help6 = nf_conntrack_helper_try_module_get(name, family, diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index ec0fd78231d8..fc83e29d6634 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -164,7 +164,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, } priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]); - err = nft_validate_register_load(priv->sreg_key, set->klen);; + err = nft_validate_register_load(priv->sreg_key, set->klen); if (err < 0) return err; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index dd38785dfed9..4503b8dcf9c0 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -151,7 +151,7 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, priv->flowtable = flowtable; flowtable->use++; - return nf_ct_netns_get(ctx->net, ctx->afi->family); + return nf_ct_netns_get(ctx->net, ctx->family); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, @@ -160,7 +160,7 @@ static void nft_flow_offload_destroy(const struct nft_ctx *ctx, struct nft_flow_offload *priv = nft_expr_priv(expr); priv->flowtable->use--; - nf_ct_netns_put(ctx->net, ctx->afi->family); + nf_ct_netns_put(ctx->net, ctx->family); } static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 6f6e64423643..a27be36dc0af 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -112,7 +112,7 @@ static int nft_log_init(const struct nft_ctx *ctx, break; } - err = nf_logger_find_get(ctx->afi->family, li->type); + err = nf_logger_find_get(ctx->family, li->type); if (err < 0) goto err1; @@ -133,7 +133,7 @@ static void nft_log_destroy(const struct nft_ctx *ctx, if (priv->prefix != nft_log_null_prefix) kfree(priv->prefix); - nf_logger_put(ctx->afi->family, li->type); + nf_logger_put(ctx->family, li->type); } static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 6ac03d4266c9..9d8655bc1bea 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -73,7 +73,7 @@ int nft_masq_init(const struct nft_ctx *ctx, } } - return nf_ct_netns_get(ctx->net, ctx->afi->family); + return nf_ct_netns_get(ctx->net, ctx->family); } EXPORT_SYMBOL_GPL(nft_masq_init); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 1a91e676f13e..8fb91940e2e7 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -339,7 +339,7 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, if (priv->key != NFT_META_SECPATH) return 0; - switch (ctx->afi->family) { + switch (ctx->family) { case NFPROTO_NETDEV: hooks = 1 << NF_NETDEV_INGRESS; break; @@ -370,7 +370,7 @@ int nft_meta_set_validate(const struct nft_ctx *ctx, if (priv->key != NFT_META_PKTTYPE) return 0; - switch (ctx->afi->family) { + switch (ctx->family) { case NFPROTO_BRIDGE: hooks = 1 << NF_BR_PRE_ROUTING; break; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index ed548d06b6dd..1f36954c2ba9 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -142,7 +142,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); - if (family != ctx->afi->family) + if (family != ctx->family) return -EOPNOTSUPP; switch (family) { diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 1e66538bf0ff..c64cbe78dee7 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -75,7 +75,7 @@ int nft_redir_init(const struct nft_ctx *ctx, return -EINVAL; } - return nf_ct_netns_get(ctx->net, ctx->afi->family); + return nf_ct_netns_get(ctx->net, ctx->family); } EXPORT_SYMBOL_GPL(nft_redir_init); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 10c19a3f4cbd..0b56bf05c169 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1082,10 +1082,10 @@ struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af, { struct xt_table *t = xt_find_table_lock(net, af, name); -#ifdef CONFIG_MODULE +#ifdef CONFIG_MODULES if (IS_ERR(t)) { int err = request_module("%stable_%s", xt_prefix[af], name); - if (err) + if (err < 0) return ERR_PTR(err); t = xt_find_table_lock(net, af, name); } @@ -1362,7 +1362,6 @@ static int xt_table_open(struct inode *inode, struct file *file) } static const struct file_operations xt_table_ops = { - .owner = THIS_MODULE, .open = xt_table_open, .read = seq_read, .llseek = seq_lseek, @@ -1498,7 +1497,6 @@ static int xt_match_open(struct inode *inode, struct file *file) } static const struct file_operations xt_match_ops = { - .owner = THIS_MODULE, .open = xt_match_open, .read = seq_read, .llseek = seq_lseek, @@ -1551,7 +1549,6 @@ static int xt_target_open(struct inode *inode, struct file *file) } static const struct file_operations xt_target_ops = { - .owner = THIS_MODULE, .open = xt_target_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 5da8746f7b88..ca6847403ca2 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -353,7 +353,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, static bool select_all(const struct xt_hashlimit_htable *ht, const struct dsthash_ent *he) { - return 1; + return true; } static bool select_gc(const struct xt_hashlimit_htable *ht, @@ -1266,7 +1266,6 @@ static int dl_proc_open(struct inode *inode, struct file *file) } static const struct file_operations dl_file_ops_v2 = { - .owner = THIS_MODULE, .open = dl_proc_open_v2, .read = seq_read, .llseek = seq_lseek, @@ -1274,7 +1273,6 @@ static const struct file_operations dl_file_ops_v2 = { }; static const struct file_operations dl_file_ops_v1 = { - .owner = THIS_MODULE, .open = dl_proc_open_v1, .read = seq_read, .llseek = seq_lseek, @@ -1282,7 +1280,6 @@ static const struct file_operations dl_file_ops_v1 = { }; static const struct file_operations dl_file_ops = { - .owner = THIS_MODULE, .open = dl_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c index 000e70377f85..7ca64a50db04 100644 --- a/net/netfilter/xt_ipcomp.c +++ b/net/netfilter/xt_ipcomp.c @@ -58,7 +58,7 @@ static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par) */ pr_debug("Dropping evil IPComp tinygram.\n"); par->hotdrop = true; - return 0; + return false; } return spi_match(compinfo->spis[0], compinfo->spis[1], diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 54cbf5b9864c..2ad445c1d27c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2424,6 +2424,7 @@ int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, while (skb->len >= nlmsg_total_size(0)) { int msglen; + memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; @@ -2438,7 +2439,6 @@ int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; - memset(&extack, 0, sizeof(extack)); err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 86d6e9d2cf00..f5d293416f46 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -122,7 +122,8 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) } static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, - u32 prio, struct tcf_chain *chain) + u32 prio, struct tcf_chain *chain, + struct netlink_ext_ack *extack) { struct tcf_proto *tp; int err; @@ -148,6 +149,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, module_put(tp->ops->owner); err = -EAGAIN; } else { + NL_SET_ERR_MSG(extack, "TC classifier not found"); err = -ENOENT; } goto errout; @@ -935,7 +937,8 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, - u32 parent, void *fh, bool unicast, bool *last) + u32 parent, void *fh, bool unicast, bool *last, + struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -947,11 +950,12 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) { + NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; } - err = tp->ops->delete(tp, fh, last); + err = tp->ops->delete(tp, fh, last, extack); if (err) { kfree_skb(skb); return err; @@ -960,8 +964,11 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (unicast) return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); - return rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); + err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); + if (err < 0) + NL_SET_ERR_MSG(extack, "Failed to send filter delete notification"); + return err; } static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, @@ -1021,8 +1028,10 @@ replay: if (prio == 0) { switch (n->nlmsg_type) { case RTM_DELTFILTER: - if (protocol || t->tcm_handle || tca[TCA_KIND]) + if (protocol || t->tcm_handle || tca[TCA_KIND]) { + NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set"); return -ENOENT; + } break; case RTM_NEWTFILTER: /* If no priority is provided by the user, @@ -1035,6 +1044,7 @@ replay: } /* fall-through */ default: + NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero"); return -ENOENT; } } @@ -1063,23 +1073,31 @@ replay: parent = q->handle; } else { q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent)); - if (!q) + if (!q) { + NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); return -EINVAL; + } } /* Is it classful? */ cops = q->ops->cl_ops; - if (!cops) + if (!cops) { + NL_SET_ERR_MSG(extack, "Qdisc not classful"); return -EINVAL; + } - if (!cops->tcf_block) + if (!cops->tcf_block) { + NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); return -EOPNOTSUPP; + } /* Do we search for filter, attached to class? */ if (TC_H_MIN(parent)) { cl = cops->find(q, parent); - if (cl == 0) + if (cl == 0) { + NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); return -ENOENT; + } } /* And the last stroke */ @@ -1097,12 +1115,14 @@ replay: chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; if (chain_index > TC_ACT_EXT_VAL_MASK) { + NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; goto errout; } chain = tcf_chain_get(block, chain_index, n->nlmsg_type == RTM_NEWTFILTER); if (!chain) { + NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); err = n->nlmsg_type == RTM_NEWTFILTER ? -ENOMEM : -EINVAL; goto errout; } @@ -1118,6 +1138,7 @@ replay: tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, prio_allocate); if (IS_ERR(tp)) { + NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); err = PTR_ERR(tp); goto errout; } @@ -1126,12 +1147,14 @@ replay: /* Proto-tcf does not exist, create new one */ if (tca[TCA_KIND] == NULL || !protocol) { + NL_SET_ERR_MSG(extack, "Filter kind and protocol must be specified"); err = -EINVAL; goto errout; } if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags & NLM_F_CREATE)) { + NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); err = -ENOENT; goto errout; } @@ -1140,13 +1163,14 @@ replay: prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info)); tp = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, prio, chain); + protocol, prio, chain, extack); if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout; } tp_created = 1; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { + NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); err = -EINVAL; goto errout; } @@ -1165,6 +1189,7 @@ replay: if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags & NLM_F_CREATE)) { + NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); err = -ENOENT; goto errout; } @@ -1176,13 +1201,15 @@ replay: if (n->nlmsg_flags & NLM_F_EXCL) { if (tp_created) tcf_proto_destroy(tp); + NL_SET_ERR_MSG(extack, "Filter already exists"); err = -EEXIST; goto errout; } break; case RTM_DELTFILTER: err = tfilter_del_notify(net, skb, n, tp, block, - q, parent, fh, false, &last); + q, parent, fh, false, &last, + extack); if (err) goto errout; if (last) { @@ -1193,15 +1220,19 @@ replay: case RTM_GETTFILTER: err = tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, true); + if (err < 0) + NL_SET_ERR_MSG(extack, "Failed to send filter notify message"); goto errout; default: + NL_SET_ERR_MSG(extack, "Invalid netlink message type"); err = -EINVAL; goto errout; } } err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, - n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE); + n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE, + extack); if (err == 0) { if (tp_created) tcf_chain_tp_insert(chain, &chain_info, tp); @@ -1392,7 +1423,8 @@ void tcf_exts_destroy(struct tcf_exts *exts) EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, - struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) + struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr, + struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT { @@ -1425,8 +1457,10 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, } #else if ((exts->action && tb[exts->action]) || - (exts->police && tb[exts->police])) + (exts->police && tb[exts->police])) { + NL_SET_ERR_MSG(extack, "Classifier actions are not supported per compile options (CONFIG_NET_CLS_ACT)"); return -EOPNOTSUPP; + } #endif return 0; diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 5f169ded347e..6088be65d167 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -130,7 +130,8 @@ static void basic_destroy(struct tcf_proto *tp) kfree_rcu(head, rcu); } -static int basic_delete(struct tcf_proto *tp, void *arg, bool *last) +static int basic_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f = arg; @@ -152,11 +153,12 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { static int basic_set_parms(struct net *net, struct tcf_proto *tp, struct basic_filter *f, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr) + struct nlattr *est, bool ovr, + struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack); if (err < 0) return err; @@ -175,7 +177,8 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, static int basic_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr) + struct nlattr **tca, void **arg, bool ovr, + struct netlink_ext_ack *extack) { int err; struct basic_head *head = rtnl_dereference(tp->root); @@ -221,7 +224,8 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, fnew->handle = idr_index; } - err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr); + err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr, + extack); if (err < 0) { if (!fold) idr_remove_ext(&head->handle_idr, fnew->handle); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index cf72aefcf98d..988ad45d78b8 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -186,10 +186,17 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, return 0; } +static u32 cls_bpf_flags(u32 flags) +{ + return flags & CLS_BPF_SUPPORTED_GEN_FLAGS; +} + static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct cls_bpf_prog *oldprog) { - if (prog && oldprog && prog->gen_flags != oldprog->gen_flags) + if (prog && oldprog && + cls_bpf_flags(prog->gen_flags) != + cls_bpf_flags(oldprog->gen_flags)) return -EINVAL; if (prog && tc_skip_hw(prog->gen_flags)) @@ -295,7 +302,8 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) __cls_bpf_delete_prog(prog); } -static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last) +static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); @@ -403,7 +411,8 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, struct cls_bpf_prog *prog, unsigned long base, - struct nlattr **tb, struct nlattr *est, bool ovr) + struct nlattr **tb, struct nlattr *est, bool ovr, + struct netlink_ext_ack *extack) { bool is_bpf, is_ebpf, have_exts = false; u32 gen_flags = 0; @@ -414,7 +423,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) return -EINVAL; - ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr); + ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, extack); if (ret < 0) return ret; @@ -452,7 +461,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr) + void **arg, bool ovr, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *oldprog = *arg; @@ -500,7 +509,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, prog->handle = handle; } - ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr); + ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr, + extack); if (ret < 0) goto errout_idr; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 309d5899265f..1b54fbfca414 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -91,7 +91,8 @@ static void cls_cgroup_destroy_rcu(struct rcu_head *root) static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr) + void **arg, bool ovr, + struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_CGROUP_MAX + 1]; struct cls_cgroup_head *head = rtnl_dereference(tp->root); @@ -121,7 +122,8 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr, + extack); if (err < 0) goto errout; @@ -154,7 +156,8 @@ static void cls_cgroup_destroy(struct tcf_proto *tp) } } -static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last) +static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 28cd6fb52c16..64c24b488058 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -401,7 +401,7 @@ static void flow_destroy_filter(struct rcu_head *head) static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr) + void **arg, bool ovr, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *fold, *fnew; @@ -454,7 +454,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto err2; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr, + extack); if (err < 0) goto err2; @@ -574,7 +575,8 @@ err1: return err; } -static int flow_delete(struct tcf_proto *tp, void *arg, bool *last) +static int flow_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f = arg; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index f61df19b1026..c6ac4a612c4a 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -526,13 +526,14 @@ static void fl_set_key_ip(struct nlattr **tb, } static int fl_set_key(struct net *net, struct nlattr **tb, - struct fl_flow_key *key, struct fl_flow_key *mask) + struct fl_flow_key *key, struct fl_flow_key *mask, + struct netlink_ext_ack *extack) { __be16 ethertype; int ret = 0; #ifdef CONFIG_NET_CLS_IND if (tb[TCA_FLOWER_INDEV]) { - int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); + int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack); if (err < 0) return err; key->indev_ifindex = err; @@ -827,11 +828,12 @@ static int fl_check_assign_mask(struct cls_fl_head *head, static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr) + struct nlattr *est, bool ovr, + struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack); if (err < 0) return err; @@ -840,7 +842,7 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, tcf_bind_filter(tp, &f->res, base); } - err = fl_set_key(net, tb, &f->key, &mask->key); + err = fl_set_key(net, tb, &f->key, &mask->key, extack); if (err) return err; @@ -853,7 +855,7 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, static int fl_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr) + void **arg, bool ovr, struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *fold = *arg; @@ -916,7 +918,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } } - err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); + err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr, + extack); if (err) goto errout_idr; @@ -983,7 +986,8 @@ errout_tb: return err; } -static int fl_delete(struct tcf_proto *tp, void *arg, bool *last) +static int fl_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = arg; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 20f0de1a960a..94d159a8869a 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -172,7 +172,8 @@ static void fw_destroy(struct tcf_proto *tp) kfree_rcu(head, rcu); } -static int fw_delete(struct tcf_proto *tp, void *arg, bool *last) +static int fw_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = arg; @@ -218,13 +219,15 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { static int fw_set_parms(struct net *net, struct tcf_proto *tp, struct fw_filter *f, struct nlattr **tb, - struct nlattr **tca, unsigned long base, bool ovr) + struct nlattr **tca, unsigned long base, bool ovr, + struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); u32 mask; int err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr, + extack); if (err < 0) return err; @@ -236,7 +239,7 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, #ifdef CONFIG_NET_CLS_IND if (tb[TCA_FW_INDEV]) { int ret; - ret = tcf_change_indev(net, tb[TCA_FW_INDEV]); + ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack); if (ret < 0) return ret; f->ifindex = ret; @@ -257,7 +260,7 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, - bool ovr) + bool ovr, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = *arg; @@ -296,7 +299,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, return err; } - err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr); + err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr, extack); if (err < 0) { tcf_exts_destroy(&fnew->exts); kfree(fnew); @@ -345,7 +348,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, f->id = handle; f->tp = tp; - err = fw_set_parms(net, tp, f, tb, tca, base, ovr); + err = fw_set_parms(net, tp, f, tb, tca, base, ovr, extack); if (err < 0) goto errout; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index d0e57c86636f..f67d3d7fcf40 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -142,11 +142,12 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { static int mall_set_parms(struct net *net, struct tcf_proto *tp, struct cls_mall_head *head, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr) + struct nlattr *est, bool ovr, + struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr); + err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, extack); if (err < 0) return err; @@ -160,7 +161,7 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp, static int mall_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr) + void **arg, bool ovr, struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct nlattr *tb[TCA_MATCHALL_MAX + 1]; @@ -198,12 +199,13 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, new->handle = handle; new->flags = flags; - err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr); + err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr, + extack); if (err) goto err_set_parms; if (!tc_skip_hw(new->flags)) { - err = mall_replace_hw_filter(tp, new, (unsigned long) new); + err = mall_replace_hw_filter(tp, new, (unsigned long)new); if (err) goto err_replace_hw_filter; } @@ -223,7 +225,8 @@ err_exts_init: return err; } -static int mall_delete(struct tcf_proto *tp, void *arg, bool *last) +static int mall_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index a1f2b1b7c014..55467c30d524 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -316,7 +316,8 @@ static void route4_destroy(struct tcf_proto *tp) kfree_rcu(head, rcu); } -static int route4_delete(struct tcf_proto *tp, void *arg, bool *last) +static int route4_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter *f = arg; @@ -389,7 +390,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct route4_filter *f, u32 handle, struct route4_head *head, struct nlattr **tb, struct nlattr *est, int new, - bool ovr) + bool ovr, struct netlink_ext_ack *extack) { u32 id = 0, to = 0, nhandle = 0x8000; struct route4_filter *fp; @@ -397,7 +398,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, struct route4_bucket *b; int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack); if (err < 0) return err; @@ -471,7 +472,8 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, static int route4_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr) + struct nlattr **tca, void **arg, bool ovr, + struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter __rcu **fp; @@ -515,7 +517,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, } err = route4_set_parms(net, tp, base, f, handle, head, tb, - tca[TCA_RATE], new, ovr); + tca[TCA_RATE], new, ovr, extack); if (err < 0) goto errout; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index cf325625c99d..5cc0df690cff 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -350,7 +350,8 @@ static void rsvp_destroy(struct tcf_proto *tp) kfree_rcu(data, rcu); } -static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last) +static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct rsvp_head *head = rtnl_dereference(tp->root); struct rsvp_filter *nfp, *f = arg; @@ -486,7 +487,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr) + void **arg, bool ovr, struct netlink_ext_ack *extack) { struct rsvp_head *data = rtnl_dereference(tp->root); struct rsvp_filter *f, *nfp; @@ -511,7 +512,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, err = tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, extack); if (err < 0) goto errout2; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 67467ae24c97..01a163e0b6aa 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -193,7 +193,8 @@ static void tcindex_destroy_fexts(struct rcu_head *head) tcf_queue_work(&f->work); } -static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last) +static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = arg; @@ -246,7 +247,7 @@ static int tcindex_destroy_element(struct tcf_proto *tp, { bool last; - return tcindex_delete(tp, arg, &last); + return tcindex_delete(tp, arg, &last, NULL); } static void __tcindex_destroy(struct rcu_head *head) @@ -322,7 +323,7 @@ static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, struct tcindex_filter_result *r, struct nlattr **tb, - struct nlattr *est, bool ovr) + struct nlattr *est, bool ovr, struct netlink_ext_ack *extack) { struct tcindex_filter_result new_filter_result, *old_r = r; struct tcindex_filter_result cr; @@ -334,7 +335,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr, extack); if (err < 0) goto errout; @@ -520,7 +521,8 @@ errout: static int tcindex_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr) + struct nlattr **tca, void **arg, bool ovr, + struct netlink_ext_ack *extack) { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; @@ -540,7 +542,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, return err; return tcindex_set_parms(net, tp, base, handle, p, r, tb, - tca[TCA_RATE], ovr); + tca[TCA_RATE], ovr, extack); } static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 020d328d0afd..57113e936155 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -672,7 +672,8 @@ static void u32_destroy(struct tcf_proto *tp) tp->data = NULL; } -static int u32_delete(struct tcf_proto *tp, void *arg, bool *last) +static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct tc_u_hnode *ht = arg; struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); @@ -688,13 +689,16 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last) goto out; } - if (root_ht == ht) + if (root_ht == ht) { + NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node"); return -EINVAL; + } if (ht->refcnt == 1) { ht->refcnt--; u32_destroy_hnode(tp, ht); } else { + NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter"); return -EBUSY; } @@ -765,11 +769,12 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { static int u32_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tc_u_hnode *ht, struct tc_u_knode *n, struct nlattr **tb, - struct nlattr *est, bool ovr) + struct nlattr *est, bool ovr, + struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr); + err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, extack); if (err < 0) return err; @@ -777,14 +782,18 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, u32 handle = nla_get_u32(tb[TCA_U32_LINK]); struct tc_u_hnode *ht_down = NULL, *ht_old; - if (TC_U32_KEY(handle)) + if (TC_U32_KEY(handle)) { + NL_SET_ERR_MSG_MOD(extack, "u32 Link handle must be a hash table"); return -EINVAL; + } if (handle) { ht_down = u32_lookup_ht(ht->tp_c, handle); - if (ht_down == NULL) + if (!ht_down) { + NL_SET_ERR_MSG_MOD(extack, "Link hash table not found"); return -EINVAL; + } ht_down->refcnt++; } @@ -802,7 +811,7 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, #ifdef CONFIG_NET_CLS_IND if (tb[TCA_U32_INDEV]) { int ret; - ret = tcf_change_indev(net, tb[TCA_U32_INDEV]); + ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack); if (ret < 0) return -EINVAL; n->ifindex = ret; @@ -893,7 +902,8 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr) + struct nlattr **tca, void **arg, bool ovr, + struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; @@ -907,28 +917,40 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, size_t size; #endif - if (opt == NULL) - return handle ? -EINVAL : 0; + if (!opt) { + if (handle) { + NL_SET_ERR_MSG_MOD(extack, "Filter handle requires options"); + return -EINVAL; + } else { + return 0; + } + } - err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, NULL); + err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, extack); if (err < 0) return err; if (tb[TCA_U32_FLAGS]) { flags = nla_get_u32(tb[TCA_U32_FLAGS]); - if (!tc_flags_valid(flags)) + if (!tc_flags_valid(flags)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid filter flags"); return -EINVAL; + } } n = *arg; if (n) { struct tc_u_knode *new; - if (TC_U32_KEY(n->handle) == 0) + if (TC_U32_KEY(n->handle) == 0) { + NL_SET_ERR_MSG_MOD(extack, "Key node id cannot be zero"); return -EINVAL; + } - if (n->flags != flags) + if (n->flags != flags) { + NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags"); return -EINVAL; + } new = u32_init_knode(tp, n); if (!new) @@ -936,7 +958,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, err = u32_set_parms(net, tp, base, rtnl_dereference(n->ht_up), new, tb, - tca[TCA_RATE], ovr); + tca[TCA_RATE], ovr, extack); if (err) { u32_destroy_key(tp, new, false); @@ -962,10 +984,14 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (tb[TCA_U32_DIVISOR]) { unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); - if (--divisor > 0x100) + if (--divisor > 0x100) { + NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets"); return -EINVAL; - if (TC_U32_KEY(handle)) + } + if (TC_U32_KEY(handle)) { + NL_SET_ERR_MSG_MOD(extack, "Divisor can only be used on a hash table"); return -EINVAL; + } ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); if (ht == NULL) return -ENOBUFS; @@ -1011,20 +1037,26 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, htid = ht->handle; } else { ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); - if (ht == NULL) + if (!ht) { + NL_SET_ERR_MSG_MOD(extack, "Specified hash table not found"); return -EINVAL; + } } } else { ht = rtnl_dereference(tp->root); htid = ht->handle; } - if (ht->divisor < TC_U32_HASH(htid)) + if (ht->divisor < TC_U32_HASH(htid)) { + NL_SET_ERR_MSG_MOD(extack, "Specified hash table buckets exceed configured value"); return -EINVAL; + } if (handle) { - if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) + if (TC_U32_HTID(handle) && TC_U32_HTID(handle ^ htid)) { + NL_SET_ERR_MSG_MOD(extack, "Handle specified hash table address mismatch"); return -EINVAL; + } handle = htid | TC_U32_NODE(handle); err = idr_alloc_ext(&ht->handle_idr, NULL, NULL, handle, handle + 1, @@ -1035,6 +1067,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, handle = gen_new_kid(ht, htid); if (tb[TCA_U32_SEL] == NULL) { + NL_SET_ERR_MSG_MOD(extack, "Selector not specified"); err = -EINVAL; goto erridr; } @@ -1083,7 +1116,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } #endif - err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr); + err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr, + extack); if (err == 0) { struct tc_u_knode __rcu **ins; struct tc_u_knode *pins; diff --git a/net/socket.c b/net/socket.c index fbfae1ed3ff5..1536515b6437 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2613,15 +2613,6 @@ out_fs: core_initcall(sock_init); /* early initcall */ -static int __init jit_init(void) -{ -#ifdef CONFIG_BPF_JIT_ALWAYS_ON - bpf_jit_enable = 1; -#endif - return 0; -} -pure_initcall(jit_init); - #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index e07ee3ae0023..736719c8314e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -367,8 +367,10 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, crypto_info = &ctx->crypto_send; /* Currently we don't support set crypto info more than one time */ - if (TLS_CRYPTO_INFO_READY(crypto_info)) + if (TLS_CRYPTO_INFO_READY(crypto_info)) { + rc = -EBUSY; goto out; + } rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); if (rc) { @@ -386,7 +388,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, case TLS_CIPHER_AES_GCM_128: { if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) { rc = -EINVAL; - goto out; + goto err_crypto_info; } rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), optlen - sizeof(*crypto_info)); @@ -398,7 +400,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, } default: rc = -EINVAL; - goto out; + goto err_crypto_info; } /* currently SW is default, we will have ethtool in future */ @@ -454,6 +456,15 @@ static int tls_init(struct sock *sk) struct tls_context *ctx; int rc = 0; + /* The TLS ulp is currently supported only for TCP sockets + * in ESTABLISHED state. + * Supporting sockets in LISTEN state will require us + * to modify the accept implementation to clone rather then + * share the ulp context. + */ + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTSUPP; + /* allocate tls context */ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9773571b6a34..61f394d369bf 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -681,18 +681,17 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) } default: rc = -EINVAL; - goto out; + goto free_priv; } ctx->prepend_size = TLS_HEADER_SIZE + nonce_size; ctx->tag_size = tag_size; ctx->overhead_size = ctx->prepend_size + ctx->tag_size; ctx->iv_size = iv_size; - ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - GFP_KERNEL); + ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, GFP_KERNEL); if (!ctx->iv) { rc = -ENOMEM; - goto out; + goto free_priv; } memcpy(ctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); memcpy(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); @@ -740,7 +739,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tag_size); if (!rc) - goto out; + return 0; free_aead: crypto_free_aead(sw_ctx->aead_send); @@ -751,6 +750,9 @@ free_rec_seq: free_iv: kfree(ctx->iv); ctx->iv = NULL; +free_priv: + kfree(ctx->priv_ctx); + ctx->priv_ctx = NULL; out: return rc; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b48eb6d104c9..ab0c687d0c44 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9835,7 +9835,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, */ if (!wdev->cqm_config->last_rssi_event_value && wdev->current_bss && rdev->ops->get_station) { - struct station_info sinfo; + struct station_info sinfo = {}; u8 *mac_addr; mac_addr = wdev->current_bss->pub.bssid; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 7ca04a7de85a..05186a47878f 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1254,8 +1254,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - /* we are under RTNL - globally locked - so can use a static struct */ - static struct station_info sinfo; + struct station_info sinfo = {}; u8 addr[ETH_ALEN]; int err; diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c index 12e1024069c2..0c12048ac79f 100644 --- a/samples/bpf/xdp2skb_meta_kern.c +++ b/samples/bpf/xdp2skb_meta_kern.c @@ -35,15 +35,17 @@ int _xdp_mark(struct xdp_md *ctx) void *data, *data_end; int ret; - /* Reserve space in-front data pointer for our meta info. + /* Reserve space in-front of data pointer for our meta info. * (Notice drivers not supporting data_meta will fail here!) */ ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)); if (ret < 0) return XDP_ABORTED; - /* For some unknown reason, these ctx pointers must be read - * after bpf_xdp_adjust_meta, else verifier will reject prog. + /* Notice: Kernel-side verifier requires that loading of + * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(), + * as pkt-data pointers are invalidated. Helpers that require + * this are determined/marked by bpf_helper_changes_pkt_data() */ data = (void *)(unsigned long)ctx->data; diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index c969141bfa8b..211db8ded0de 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -1,6 +1,7 @@ -/* XDP monitor tool, based on tracepoints +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc. * - * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * XDP monitor tool, based on tracepoints */ #include <uapi/linux/bpf.h> #include "bpf_helpers.h" @@ -118,3 +119,92 @@ int trace_xdp_exception(struct xdp_exception_ctx *ctx) return 0; } + +/* Common stats data record shared with _user.c */ +struct datarec { + u64 processed; + u64 dropped; + u64 info; +}; +#define MAX_CPUS 64 + +struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = MAX_CPUS, +}; + +struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_enqueue_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int to_cpu; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_enqueue") +int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) +{ + u32 to_cpu = ctx->to_cpu; + struct datarec *rec; + + if (to_cpu >= MAX_CPUS) + return 1; + + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Record bulk events, then userspace can calc average bulk size */ + if (ctx->processed > 0) + rec->info += 1; + + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_kthread_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int sched; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_kthread") +int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Count times kthread yielded CPU via schedule call */ + if (ctx->sched) + rec->info++; + + return 0; +} diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index eaba165b3549..eec14520d513 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -1,4 +1,5 @@ -/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */ static const char *__doc__= "XDP monitor tool, based on tracepoints\n" @@ -40,6 +41,9 @@ static const struct option long_options[] = { {0, 0, NULL, 0 } }; +/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ +#define EXIT_FAIL_MEM 5 + static void usage(char *argv[]) { int i; @@ -108,23 +112,93 @@ static const char *action2str(int action) return NULL; } +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 info; +}; +#define MAX_CPUS 64 + +/* Userspace structs for collection of stats from maps */ struct record { - __u64 counter; __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct u64rec { + __u64 processed; +}; +struct record_u64 { + /* record for _kern side __u64 values */ + __u64 timestamp; + struct u64rec total; + struct u64rec *cpu; }; struct stats_record { - struct record xdp_redir[REDIR_RES_MAX]; - struct record xdp_exception[XDP_ACTION_MAX]; + struct record_u64 xdp_redirect[REDIR_RES_MAX]; + struct record_u64 xdp_exception[XDP_ACTION_MAX]; + struct record xdp_cpumap_kthread; + struct record xdp_cpumap_enqueue[MAX_CPUS]; }; -static void stats_print_headers(bool err_only) +static bool map_collect_record(int fd, __u32 key, struct record *rec) { - if (err_only) - printf("\n%s\n", __doc_err_only__); + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + __u64 sum_processed = 0; + __u64 sum_dropped = 0; + __u64 sum_info = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); - printf("%-14s %-11s %-10s %-18s %-9s\n", - "ACTION", "result", "pps ", "pps-human-readable", "measure-period"); + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].dropped = values[i].dropped; + sum_dropped += values[i].dropped; + rec->cpu[i].info = values[i].info; + sum_info += values[i].info; + } + rec->total.processed = sum_processed; + rec->total.dropped = sum_dropped; + rec->total.info = sum_info; + return true; +} + +static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct u64rec values[nr_cpus]; + __u64 sum_total = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_total += values[i].processed; + } + rec->total.processed = sum_total; + return true; } static double calc_period(struct record *r, struct record *p) @@ -139,77 +213,203 @@ static double calc_period(struct record *r, struct record *p) return period_; } -static double calc_pps(struct record *r, struct record *p, double period) +static double calc_period_u64(struct record_u64 *r, struct record_u64 *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static double calc_pps(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->processed - p->processed; + pps = packets / period; + } + return pps; +} + +static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->processed - p->processed; + pps = packets / period; + } + return pps; +} + +static double calc_drop(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->dropped - p->dropped; + pps = packets / period; + } + return pps; +} + +static double calc_info(struct datarec *r, struct datarec *p, double period) { __u64 packets = 0; double pps = 0; if (period > 0) { - packets = r->counter - p->counter; + packets = r->info - p->info; pps = packets / period; } return pps; } -static void stats_print(struct stats_record *rec, - struct stats_record *prev, +static void stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, bool err_only) { - double period = 0, pps = 0; - struct record *r, *p; - int i = 0; + unsigned int nr_cpus = bpf_num_possible_cpus(); + int rec_i = 0, i, to_cpu; + double t = 0, pps = 0; - char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n"; + /* Header */ + printf("%-15s %-7s %-12s %-12s %-9s\n", + "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info"); /* tracepoint: xdp:xdp_redirect_* */ if (err_only) - i = REDIR_ERROR; - - for (; i < REDIR_RES_MAX; i++) { - r = &rec->xdp_redir[i]; - p = &prev->xdp_redir[i]; - - if (p->timestamp) { - period = calc_period(r, p); - pps = calc_pps(r, p, period); + rec_i = REDIR_ERROR; + + for (; rec_i < REDIR_RES_MAX; rec_i++) { + struct record_u64 *rec, *prev; + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; + + rec = &stats_rec->xdp_redirect[rec_i]; + prev = &stats_prev->xdp_redirect[rec_i]; + t = calc_period_u64(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct u64rec *r = &rec->cpu[i]; + struct u64rec *p = &prev->cpu[i]; + + pps = calc_pps_u64(r, p, t); + if (pps > 0) + printf(fmt1, "XDP_REDIRECT", i, + rec_i ? 0.0: pps, rec_i ? pps : 0.0, + err2str(rec_i)); } - printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period); + pps = calc_pps_u64(&rec->total, &prev->total, t); + printf(fmt2, "XDP_REDIRECT", "total", + rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i)); } /* tracepoint: xdp:xdp_exception */ - for (i = 0; i < XDP_ACTION_MAX; i++) { - r = &rec->xdp_exception[i]; - p = &prev->xdp_exception[i]; - if (p->timestamp) { - period = calc_period(r, p); - pps = calc_pps(r, p, period); + for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { + struct record_u64 *rec, *prev; + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; + + rec = &stats_rec->xdp_exception[rec_i]; + prev = &stats_prev->xdp_exception[rec_i]; + t = calc_period_u64(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct u64rec *r = &rec->cpu[i]; + struct u64rec *p = &prev->cpu[i]; + + pps = calc_pps_u64(r, p, t); + if (pps > 0) + printf(fmt1, "Exception", i, + 0.0, pps, err2str(rec_i)); } + pps = calc_pps_u64(&rec->total, &prev->total, t); if (pps > 0) - printf(fmt, action2str(i), "Exception", - pps, pps, period); + printf(fmt2, "Exception", "total", + 0.0, pps, action2str(rec_i)); } - printf("\n"); -} -static __u64 get_key32_value64_percpu(int fd, __u32 key) -{ - /* For percpu maps, userspace gets a value per possible CPU */ - unsigned int nr_cpus = bpf_num_possible_cpus(); - __u64 values[nr_cpus]; - __u64 sum = 0; - int i; - - if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { - fprintf(stderr, - "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); - return 0; + /* cpumap enqueue stats */ + for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { + char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; + char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; + struct record *rec, *prev; + char *info_str = ""; + double drop, info; + + rec = &stats_rec->xdp_cpumap_enqueue[to_cpu]; + prev = &stats_prev->xdp_cpumap_enqueue[to_cpu]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + if (info > 0) { + info_str = "bulk-average"; + info = pps / info; /* calc average bulk size */ + } + if (pps > 0) + printf(fmt1, "cpumap-enqueue", + i, to_cpu, pps, drop, info, info_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + if (pps > 0) { + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + if (info > 0) { + info_str = "bulk-average"; + info = pps / info; /* calc average bulk size */ + } + printf(fmt2, "cpumap-enqueue", + "sum", to_cpu, pps, drop, info, info_str); + } } - /* Sum values from each CPU */ - for (i = 0; i < nr_cpus; i++) { - sum += values[i]; + /* cpumap kthread stats */ + { + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n"; + struct record *rec, *prev; + double drop, info; + char *i_str = ""; + + rec = &stats_rec->xdp_cpumap_kthread; + prev = &stats_prev->xdp_cpumap_kthread; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + if (info > 0) + i_str = "sched"; + if (pps > 0) + printf(fmt1, "cpumap-kthread", + i, pps, drop, info, i_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + if (info > 0) + i_str = "sched-sum"; + printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str); } - return sum; + + printf("\n"); } static bool stats_collect(struct stats_record *rec) @@ -222,25 +422,109 @@ static bool stats_collect(struct stats_record *rec) */ fd = map_data[0].fd; /* map0: redirect_err_cnt */ - for (i = 0; i < REDIR_RES_MAX; i++) { - rec->xdp_redir[i].timestamp = gettime(); - rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i); - } + for (i = 0; i < REDIR_RES_MAX; i++) + map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); fd = map_data[1].fd; /* map1: exception_cnt */ for (i = 0; i < XDP_ACTION_MAX; i++) { - rec->xdp_exception[i].timestamp = gettime(); - rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i); + map_collect_record_u64(fd, i, &rec->xdp_exception[i]); } + fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */ + for (i = 0; i < MAX_CPUS; i++) + map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); + + fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */ + map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); + return true; } +static void *alloc_rec_per_cpu(int record_size) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + void *array; + size_t size; + + size = record_size * nr_cpus; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + struct stats_record *rec; + int rec_sz; + int i; + + /* Alloc main stats_record structure */ + rec = malloc(sizeof(*rec)); + memset(rec, 0, sizeof(*rec)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + + /* Alloc stats stored per CPU for each record */ + rec_sz = sizeof(struct u64rec); + for (i = 0; i < REDIR_RES_MAX; i++) + rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz); + + for (i = 0; i < XDP_ACTION_MAX; i++) + rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz); + + rec_sz = sizeof(struct datarec); + rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz); + + for (i = 0; i < MAX_CPUS; i++) + rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz); + + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + int i; + + for (i = 0; i < REDIR_RES_MAX; i++) + free(r->xdp_redirect[i].cpu); + + for (i = 0; i < XDP_ACTION_MAX; i++) + free(r->xdp_exception[i].cpu); + + free(r->xdp_cpumap_kthread.cpu); + + for (i = 0; i < MAX_CPUS; i++) + free(r->xdp_cpumap_enqueue[i].cpu); + + free(r); +} + +/* Pointer swap trick */ +static inline void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + static void stats_poll(int interval, bool err_only) { - struct stats_record rec, prev; + struct stats_record *rec, *prev; - memset(&rec, 0, sizeof(rec)); + rec = alloc_stats_record(); + prev = alloc_stats_record(); + stats_collect(rec); + + if (err_only) + printf("\n%s\n", __doc_err_only__); /* Trick to pretty printf with thousands separators use %' */ setlocale(LC_NUMERIC, "en_US"); @@ -258,13 +542,15 @@ static void stats_poll(int interval, bool err_only) fflush(stdout); while (1) { - memcpy(&prev, &rec, sizeof(rec)); - stats_collect(&rec); - stats_print_headers(err_only); - stats_print(&rec, &prev, err_only); + swap(&prev, &rec); + stats_collect(rec); + stats_print(rec, prev, err_only); fflush(stdout); sleep(interval); } + + free_stats_record(rec); + free_stats_record(prev); } static void print_bpf_prog_info(void) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index cb8997ed0149..47cddf32aeba 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -265,12 +265,18 @@ else objtool_args += $(call cc-ifversion, -lt, 0405, --no-unreachable) endif +ifdef CONFIG_MODVERSIONS +objtool_o = $(@D)/.tmp_$(@F) +else +objtool_o = $(@) +endif + # 'OBJECT_FILES_NON_STANDARD := y': skip objtool checking for a directory # 'OBJECT_FILES_NON_STANDARD_foo.o := 'y': skip objtool checking for a file # 'OBJECT_FILES_NON_STANDARD_foo.o := 'n': override directory skip for a file cmd_objtool = $(if $(patsubst y%,, \ $(OBJECT_FILES_NON_STANDARD_$(basetarget).o)$(OBJECT_FILES_NON_STANDARD)n), \ - $(__objtool_obj) $(objtool_args) "$(@)";) + $(__objtool_obj) $(objtool_args) "$(objtool_o)";) objtool_obj = $(if $(patsubst y%,, \ $(OBJECT_FILES_NON_STANDARD_$(basetarget).o)$(OBJECT_FILES_NON_STANDARD)n), \ $(__objtool_obj)) @@ -286,16 +292,16 @@ objtool_dep = $(objtool_obj) \ define rule_cc_o_c $(call echo-cmd,checksrc) $(cmd_checksrc) \ $(call cmd_and_fixdep,cc_o_c) \ - $(cmd_modversions_c) \ $(cmd_checkdoc) \ $(call echo-cmd,objtool) $(cmd_objtool) \ + $(cmd_modversions_c) \ $(call echo-cmd,record_mcount) $(cmd_record_mcount) endef define rule_as_o_S $(call cmd_and_fixdep,as_o_S) \ - $(cmd_modversions_S) \ - $(call echo-cmd,objtool) $(cmd_objtool) + $(call echo-cmd,objtool) $(cmd_objtool) \ + $(cmd_modversions_S) endef # List module undefined symbols (or empty line if not enabled) diff --git a/scripts/decodecode b/scripts/decodecode index 438120da1361..5ea071099330 100755 --- a/scripts/decodecode +++ b/scripts/decodecode @@ -59,6 +59,14 @@ disas() { ${CROSS_COMPILE}strip $1.o fi + if [ "$ARCH" = "arm64" ]; then + if [ $width -eq 4 ]; then + type=inst + fi + + ${CROSS_COMPILE}strip $1.o + fi + ${CROSS_COMPILE}objdump $OBJDUMPFLAGS -S $1.o | \ grep -v "/tmp\|Disassembly\|\.text\|^$" > $1.dis 2>&1 } diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py index 1bf949c43b76..f6ab3ccf698f 100644 --- a/scripts/gdb/linux/tasks.py +++ b/scripts/gdb/linux/tasks.py @@ -96,6 +96,8 @@ def get_thread_info(task): thread_info_addr = task.address + ia64_task_size thread_info = thread_info_addr.cast(thread_info_ptr_type) else: + if task.type.fields()[0].type == thread_info_type.get_type(): + return task['thread_info'] thread_info = task['stack'].cast(thread_info_ptr_type) return thread_info.dereference() diff --git a/tools/bpf/bpf_jit_disasm.c b/tools/bpf/bpf_jit_disasm.c index 30044bc4f389..58c2bab4ef6e 100644 --- a/tools/bpf/bpf_jit_disasm.c +++ b/tools/bpf/bpf_jit_disasm.c @@ -172,7 +172,8 @@ static uint8_t *get_last_jit_image(char *haystack, size_t hlen, { char *ptr, *pptr, *tmp; off_t off = 0; - int ret, flen, proglen, pass, ulen = 0; + unsigned int proglen; + int ret, flen, pass, ulen = 0; regmatch_t pmatch[1]; unsigned long base; regex_t regex; @@ -199,7 +200,7 @@ static uint8_t *get_last_jit_image(char *haystack, size_t hlen, } ptr = haystack + off - (pmatch[0].rm_eo - pmatch[0].rm_so); - ret = sscanf(ptr, "flen=%d proglen=%d pass=%d image=%lx", + ret = sscanf(ptr, "flen=%d proglen=%u pass=%d image=%lx", &flen, &proglen, &pass, &base); if (ret != 4) { regfree(®ex); @@ -239,7 +240,7 @@ static uint8_t *get_last_jit_image(char *haystack, size_t hlen, } assert(ulen == proglen); - printf("%d bytes emitted from JIT compiler (pass:%d, flen:%d)\n", + printf("%u bytes emitted from JIT compiler (pass:%d, flen:%d)\n", proglen, pass, flen); printf("%lx + <x>:\n", base); diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 6601c95a9258..0b482c0070e0 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -34,6 +34,7 @@ /* Author: Jakub Kicinski <kubakici@wp.pl> */ #include <errno.h> +#include <fcntl.h> #include <fts.h> #include <libgen.h> #include <mntent.h> @@ -433,6 +434,77 @@ ifindex_to_name_ns(__u32 ifindex, __u32 ns_dev, __u32 ns_ino, char *buf) return if_indextoname(ifindex, buf); } +static int read_sysfs_hex_int(char *path) +{ + char vendor_id_buf[8]; + int len; + int fd; + + fd = open(path, O_RDONLY); + if (fd < 0) { + p_err("Can't open %s: %s", path, strerror(errno)); + return -1; + } + + len = read(fd, vendor_id_buf, sizeof(vendor_id_buf)); + close(fd); + if (len < 0) { + p_err("Can't read %s: %s", path, strerror(errno)); + return -1; + } + if (len >= (int)sizeof(vendor_id_buf)) { + p_err("Value in %s too long", path); + return -1; + } + + vendor_id_buf[len] = 0; + + return strtol(vendor_id_buf, NULL, 0); +} + +static int read_sysfs_netdev_hex_int(char *devname, const char *entry_name) +{ + char full_path[64]; + + snprintf(full_path, sizeof(full_path), "/sys/class/net/%s/device/%s", + devname, entry_name); + + return read_sysfs_hex_int(full_path); +} + +const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino) +{ + char devname[IF_NAMESIZE]; + int vendor_id; + int device_id; + + if (!ifindex_to_name_ns(ifindex, ns_dev, ns_ino, devname)) { + p_err("Can't get net device name for ifindex %d: %s", ifindex, + strerror(errno)); + return NULL; + } + + vendor_id = read_sysfs_netdev_hex_int(devname, "vendor"); + if (vendor_id < 0) { + p_err("Can't get device vendor id for %s", devname); + return NULL; + } + + switch (vendor_id) { + case 0x19ee: + device_id = read_sysfs_netdev_hex_int(devname, "device"); + if (device_id != 0x4000 && + device_id != 0x6000 && + device_id != 0x6003) + p_info("Unknown NFP device ID, assuming it is NFP-6xxx arch"); + return "NFP-6xxx"; + default: + p_err("Can't get bfd arch name for device vendor id 0x%04x", + vendor_id); + return NULL; + } +} + void print_dev_plain(__u32 ifindex, __u64 ns_dev, __u64 ns_inode) { char name[IF_NAMESIZE]; diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c index 57d32e8a1391..87439320ef70 100644 --- a/tools/bpf/bpftool/jit_disasm.c +++ b/tools/bpf/bpftool/jit_disasm.c @@ -76,7 +76,8 @@ static int fprintf_json(void *out, const char *fmt, ...) return 0; } -void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes) +void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, + const char *arch) { disassembler_ftype disassemble; struct disassemble_info info; @@ -100,6 +101,19 @@ void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes) else init_disassemble_info(&info, stdout, (fprintf_ftype) fprintf); + + /* Update architecture info for offload. */ + if (arch) { + const bfd_arch_info_type *inf = bfd_scan_arch(arch); + + if (inf) { + bfdf->arch_info = inf; + } else { + p_err("No libfd support for %s", arch); + return; + } + } + info.arch = bfd_get_arch(bfdf); info.mach = bfd_get_mach(bfdf); info.buffer = image; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 65b526fe6e7e..b8e9584d6246 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -121,7 +121,10 @@ int do_cgroup(int argc, char **arg); int prog_parse_fd(int *argc, char ***argv); -void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes); +void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, + const char *arch); void print_hex_data_json(uint8_t *data, size_t len); +const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino); + #endif diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 8d7db9d6b9cd..f95fa67bb498 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -66,6 +66,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash_of_maps", [BPF_MAP_TYPE_DEVMAP] = "devmap", [BPF_MAP_TYPE_SOCKMAP] = "sockmap", + [BPF_MAP_TYPE_CPUMAP] = "cpumap", }; static unsigned int get_possible_cpus(void) @@ -428,6 +429,9 @@ static int show_map_close_json(int fd, struct bpf_map_info *info) jsonw_name(json_wtr, "flags"); jsonw_printf(json_wtr, "%#x", info->map_flags); + + print_dev_json(info->ifindex, info->netns_dev, info->netns_ino); + jsonw_uint_field(json_wtr, "bytes_key", info->key_size); jsonw_uint_field(json_wtr, "bytes_value", info->value_size); jsonw_uint_field(json_wtr, "max_entries", info->max_entries); @@ -469,7 +473,9 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info) if (*info->name) printf("name %s ", info->name); - printf("flags 0x%x\n", info->map_flags); + printf("flags 0x%x", info->map_flags); + print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino); + printf("\n"); printf("\tkey %uB value %uB max_entries %u", info->key_size, info->value_size, info->max_entries); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 099e21cf1b5c..e8e2baaf93c2 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -776,7 +776,17 @@ static int do_dump(int argc, char **argv) } } else { if (member_len == &info.jited_prog_len) { - disasm_print_insn(buf, *member_len, opcodes); + const char *name = NULL; + + if (info.ifindex) { + name = ifindex_to_bfd_name_ns(info.ifindex, + info.netns_dev, + info.netns_ino); + if (!name) + goto err_free; + } + + disasm_print_insn(buf, *member_len, opcodes, name); } else { kernel_syms_load(&dd); if (json_output) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 69f96af4a569..af1f49ad8b88 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -900,6 +900,9 @@ struct xdp_md { __u32 data; __u32 data_end; __u32 data_meta; + /* Below access go through struct xdp_rxq_info */ + __u32 ingress_ifindex; /* rxq->dev->ifindex */ + __u32 rx_queue_index; /* rxq->queue_index */ }; enum sk_action { @@ -935,6 +938,9 @@ struct bpf_map_info { __u32 max_entries; __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops @@ -956,6 +962,12 @@ struct bpf_sock_ops { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + __u32 is_fullsock; /* Some TCP fields are only valid if + * there is a full socket. If not, the + * fields read as zero. + */ + __u32 snd_cwnd; + __u32 srtt_us; /* Averaged RTT << 3 in usecs */ }; /* List of known BPF sock_ops operators. @@ -1010,7 +1022,8 @@ struct bpf_perf_event_value { #define BPF_DEVCG_DEV_CHAR (1ULL << 1) struct bpf_cgroup_dev_ctx { - __u32 access_type; /* (access << 16) | type */ + /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ + __u32 access_type; __u32 major; __u32 minor; }; diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 24460155c82c..c1c338661699 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -26,6 +26,7 @@ #include <stdlib.h> #include <string.h> #include <unistd.h> +#include <errno.h> #include "elf.h" #include "warn.h" @@ -358,7 +359,8 @@ struct elf *elf_open(const char *name, int flags) elf->fd = open(name, flags); if (elf->fd == -1) { - perror("open"); + fprintf(stderr, "objtool: Can't open '%s': %s\n", + name, strerror(errno)); goto err; } diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 541d9d7fad5a..1e09d77f1948 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -3,3 +3,10 @@ test_maps test_lru_map test_lpm_map test_tag +FEATURE-DUMP.libbpf +fixdep +test_align +test_dev_cgroup +test_progs +test_verifier_log +feature diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index a8aa7e251c8e..3a44b655d852 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -19,7 +19,8 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \ - test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o + test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \ + sample_map_ret0.o TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \ test_offload.py diff --git a/tools/testing/selftests/bpf/sample_map_ret0.c b/tools/testing/selftests/bpf/sample_map_ret0.c new file mode 100644 index 000000000000..0756303676ac --- /dev/null +++ b/tools/testing/selftests/bpf/sample_map_ret0.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ +#include <linux/bpf.h> +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") htab = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(__u32), + .value_size = sizeof(long), + .max_entries = 2, +}; + +struct bpf_map_def SEC("maps") array = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(long), + .max_entries = 2, +}; + +/* Sample program which should always load for testing control paths. */ +SEC(".text") int func() +{ + __u64 key64 = 0; + __u32 key = 0; + long *value; + + value = bpf_map_lookup_elem(&htab, &key); + if (!value) + return 1; + value = bpf_map_lookup_elem(&array, &key64); + if (!value) + return 1; + + return 0; +} diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index f61480641b6e..081510853c6d 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -521,6 +521,126 @@ static void test_lpm_delete(void) close(map_fd); } +static void test_lpm_get_next_key(void) +{ + struct bpf_lpm_trie_key *key_p, *next_key_p; + size_t key_size; + __u32 value = 0; + int map_fd; + + key_size = sizeof(*key_p) + sizeof(__u32); + key_p = alloca(key_size); + next_key_p = alloca(key_size); + + map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, sizeof(value), + 100, BPF_F_NO_PREALLOC); + assert(map_fd >= 0); + + /* empty tree. get_next_key should return ENOENT */ + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == -1 && + errno == ENOENT); + + /* get and verify the first key, get the second one should fail. */ + key_p->prefixlen = 16; + inet_pton(AF_INET, "192.168.0.0", key_p->data); + assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0); + + memset(key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0); + assert(key_p->prefixlen == 16 && key_p->data[0] == 192 && + key_p->data[1] == 168); + + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 && + errno == ENOENT); + + /* no exact matching key should get the first one in post order. */ + key_p->prefixlen = 8; + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0); + assert(key_p->prefixlen == 16 && key_p->data[0] == 192 && + key_p->data[1] == 168); + + /* add one more element (total two) */ + key_p->prefixlen = 24; + inet_pton(AF_INET, "192.168.0.0", key_p->data); + assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0); + + memset(key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0); + assert(key_p->prefixlen == 24 && key_p->data[0] == 192 && + key_p->data[1] == 168 && key_p->data[2] == 0); + + memset(next_key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 && + errno == ENOENT); + + /* Add one more element (total three) */ + key_p->prefixlen = 24; + inet_pton(AF_INET, "192.168.128.0", key_p->data); + assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0); + + memset(key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0); + assert(key_p->prefixlen == 24 && key_p->data[0] == 192 && + key_p->data[1] == 168 && key_p->data[2] == 0); + + memset(next_key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168 && next_key_p->data[2] == 128); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 && + errno == ENOENT); + + /* Add one more element (total four) */ + key_p->prefixlen = 24; + inet_pton(AF_INET, "192.168.1.0", key_p->data); + assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0); + + memset(key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0); + assert(key_p->prefixlen == 24 && key_p->data[0] == 192 && + key_p->data[1] == 168 && key_p->data[2] == 0); + + memset(next_key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168 && next_key_p->data[2] == 1); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168 && next_key_p->data[2] == 128); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 && + errno == ENOENT); + + /* no exact matching key should return the first one in post order */ + key_p->prefixlen = 22; + inet_pton(AF_INET, "192.168.1.0", key_p->data); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168 && next_key_p->data[2] == 0); + + close(map_fd); +} + int main(void) { struct rlimit limit = { RLIM_INFINITY, RLIM_INFINITY }; @@ -545,6 +665,8 @@ int main(void) test_lpm_delete(); + test_lpm_get_next_key(); + printf("test_lpm: OK\n"); return 0; } diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index e3c750f17cb8..833b9c1ec450 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -20,6 +20,7 @@ import os import pprint import random import string +import struct import subprocess import time @@ -156,6 +157,14 @@ def bpftool_prog_list(expected=None, ns=""): (len(progs), expected)) return progs +def bpftool_map_list(expected=None, ns=""): + _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) + if expected is not None: + if len(maps) != expected: + fail(True, "%d BPF maps loaded, expected %d" % + (len(maps), expected)) + return maps + def bpftool_prog_list_wait(expected=0, n_retry=20): for i in range(n_retry): nprogs = len(bpftool_prog_list()) @@ -164,6 +173,14 @@ def bpftool_prog_list_wait(expected=0, n_retry=20): time.sleep(0.05) raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs)) +def bpftool_map_list_wait(expected=0, n_retry=20): + for i in range(n_retry): + nmaps = len(bpftool_map_list()) + if nmaps == expected: + return + time.sleep(0.05) + raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) + def ip(args, force=False, JSON=True, ns="", fail=True): if force: args = "-force " + args @@ -193,6 +210,26 @@ def mknetns(n_retry=10): return name return None +def int2str(fmt, val): + ret = [] + for b in struct.pack(fmt, val): + ret.append(int(b)) + return " ".join(map(lambda x: str(x), ret)) + +def str2int(strtab): + inttab = [] + for i in strtab: + inttab.append(int(i, 16)) + ba = bytearray(inttab) + if len(strtab) == 4: + fmt = "I" + elif len(strtab) == 8: + fmt = "Q" + else: + raise Exception("String array of len %d can't be unpacked to an int" % + (len(strtab))) + return struct.unpack(fmt, ba)[0] + class DebugfsDir: """ Class for accessing DebugFS directories as a dictionary. @@ -311,13 +348,13 @@ class NetdevSim: return ip("link set dev %s mtu %d" % (self.dev["ifname"], mtu), fail=fail) - def set_xdp(self, bpf, mode, force=False, fail=True): + def set_xdp(self, bpf, mode, force=False, JSON=True, fail=True): return ip("link set dev %s xdp%s %s" % (self.dev["ifname"], mode, bpf), - force=force, fail=fail) + force=force, JSON=JSON, fail=fail) - def unset_xdp(self, mode, force=False, fail=True): + def unset_xdp(self, mode, force=False, JSON=True, fail=True): return ip("link set dev %s xdp%s off" % (self.dev["ifname"], mode), - force=force, fail=fail) + force=force, JSON=JSON, fail=fail) def ip_link_show(self, xdp): _, link = ip("link show dev %s" % (self['ifname'])) @@ -390,12 +427,16 @@ class NetdevSim: ################################################################################ def clean_up(): + global files, netns, devs + for dev in devs: dev.remove() for f in files: cmd("rm -f %s" % (f)) for ns in netns: cmd("ip netns delete %s" % (ns)) + files = [] + netns = [] def pin_prog(file_name, idx=0): progs = bpftool_prog_list(expected=(idx + 1)) @@ -405,16 +446,31 @@ def pin_prog(file_name, idx=0): return file_name, bpf_pinned(file_name) -def check_dev_info(other_ns, ns, pin_file=None, removed=False): - if removed: - bpftool_prog_list(expected=0) - ret, err = bpftool("prog show pin %s" % (pin_file), fail=False) - fail(ret == 0, "Showing prog with removed device did not fail") - fail(err["error"].find("No such device") == -1, - "Showing prog with removed device expected ENODEV, error is %s" % - (err["error"])) - return - progs = bpftool_prog_list(expected=int(not removed), ns=ns) +def pin_map(file_name, idx=0, expected=1): + maps = bpftool_map_list(expected=expected) + m = maps[idx] + bpftool("map pin id %d %s" % (m["id"], file_name)) + files.append(file_name) + + return file_name, bpf_pinned(file_name) + +def check_dev_info_removed(prog_file=None, map_file=None): + bpftool_prog_list(expected=0) + ret, err = bpftool("prog show pin %s" % (prog_file), fail=False) + fail(ret == 0, "Showing prog with removed device did not fail") + fail(err["error"].find("No such device") == -1, + "Showing prog with removed device expected ENODEV, error is %s" % + (err["error"])) + + bpftool_map_list(expected=0) + ret, err = bpftool("map show pin %s" % (map_file), fail=False) + fail(ret == 0, "Showing map with removed device did not fail") + fail(err["error"].find("No such device") == -1, + "Showing map with removed device expected ENODEV, error is %s" % + (err["error"])) + +def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False): + progs = bpftool_prog_list(expected=1, ns=ns) prog = progs[0] fail("dev" not in prog.keys(), "Device parameters not reported") @@ -423,16 +479,17 @@ def check_dev_info(other_ns, ns, pin_file=None, removed=False): fail("ns_dev" not in dev.keys(), "Device parameters not reported") fail("ns_inode" not in dev.keys(), "Device parameters not reported") - if not removed and not other_ns: + if not other_ns: fail("ifname" not in dev.keys(), "Ifname not reported") fail(dev["ifname"] != sim["ifname"], "Ifname incorrect %s vs %s" % (dev["ifname"], sim["ifname"])) else: fail("ifname" in dev.keys(), "Ifname is reported for other ns") - if removed: - fail(dev["ifindex"] != 0, "Device perameters not zero on removed") - fail(dev["ns_dev"] != 0, "Device perameters not zero on removed") - fail(dev["ns_inode"] != 0, "Device perameters not zero on removed") + + maps = bpftool_map_list(expected=2, ns=ns) + for m in maps: + fail("dev" not in m.keys(), "Device parameters not reported") + fail(dev != m["dev"], "Map's device different than program's") # Parse command line parser = argparse.ArgumentParser() @@ -464,7 +521,7 @@ if out.find("/sys/kernel/debug type debugfs") == -1: cmd("mount -t debugfs none /sys/kernel/debug") # Check samples are compiled -samples = ["sample_ret0.o"] +samples = ["sample_ret0.o", "sample_map_ret0.o"] for s in samples: ret, out = cmd("ls %s/%s" % (bpf_test_dir, s), fail=False) skip(ret != 0, "sample %s/%s not found, please compile it" % @@ -739,8 +796,9 @@ try: bpftool_prog_list_wait(expected=0) sim = NetdevSim() - sim.set_ethtool_tc_offloads(True) - sim.set_xdp(obj, "offload") + map_obj = bpf_obj("sample_map_ret0.o") + start_test("Test loading program with maps...") + sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON start_test("Test bpftool bound info reporting (own ns)...") check_dev_info(False, "") @@ -757,11 +815,111 @@ try: sim.set_ns("") check_dev_info(False, "") - pin_file, _ = pin_prog("/sys/fs/bpf/tmp") + prog_file, _ = pin_prog("/sys/fs/bpf/tmp_prog") + map_file, _ = pin_map("/sys/fs/bpf/tmp_map", idx=1, expected=2) sim.remove() start_test("Test bpftool bound info reporting (removed dev)...") - check_dev_info(True, "", pin_file=pin_file, removed=True) + check_dev_info_removed(prog_file=prog_file, map_file=map_file) + + # Remove all pinned files and reinstantiate the netdev + clean_up() + bpftool_prog_list_wait(expected=0) + + sim = NetdevSim() + + start_test("Test map update (no flags)...") + sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON + maps = bpftool_map_list(expected=2) + array = maps[0] if maps[0]["type"] == "array" else maps[1] + htab = maps[0] if maps[0]["type"] == "hash" else maps[1] + for m in maps: + for i in range(2): + bpftool("map update id %d key %s value %s" % + (m["id"], int2str("I", i), int2str("Q", i * 3))) + + for m in maps: + ret, _ = bpftool("map update id %d key %s value %s" % + (m["id"], int2str("I", 3), int2str("Q", 3 * 3)), + fail=False) + fail(ret == 0, "added too many entries") + + start_test("Test map update (exists)...") + for m in maps: + for i in range(2): + bpftool("map update id %d key %s value %s exist" % + (m["id"], int2str("I", i), int2str("Q", i * 3))) + + for m in maps: + ret, err = bpftool("map update id %d key %s value %s exist" % + (m["id"], int2str("I", 3), int2str("Q", 3 * 3)), + fail=False) + fail(ret == 0, "updated non-existing key") + fail(err["error"].find("No such file or directory") == -1, + "expected ENOENT, error is '%s'" % (err["error"])) + + start_test("Test map update (noexist)...") + for m in maps: + for i in range(2): + ret, err = bpftool("map update id %d key %s value %s noexist" % + (m["id"], int2str("I", i), int2str("Q", i * 3)), + fail=False) + fail(ret == 0, "updated existing key") + fail(err["error"].find("File exists") == -1, + "expected EEXIST, error is '%s'" % (err["error"])) + + start_test("Test map dump...") + for m in maps: + _, entries = bpftool("map dump id %d" % (m["id"])) + for i in range(2): + key = str2int(entries[i]["key"]) + fail(key != i, "expected key %d, got %d" % (key, i)) + val = str2int(entries[i]["value"]) + fail(val != i * 3, "expected value %d, got %d" % (val, i * 3)) + + start_test("Test map getnext...") + for m in maps: + _, entry = bpftool("map getnext id %d" % (m["id"])) + key = str2int(entry["next_key"]) + fail(key != 0, "next key %d, expected %d" % (key, 0)) + _, entry = bpftool("map getnext id %d key %s" % + (m["id"], int2str("I", 0))) + key = str2int(entry["next_key"]) + fail(key != 1, "next key %d, expected %d" % (key, 1)) + ret, err = bpftool("map getnext id %d key %s" % + (m["id"], int2str("I", 1)), fail=False) + fail(ret == 0, "got next key past the end of map") + fail(err["error"].find("No such file or directory") == -1, + "expected ENOENT, error is '%s'" % (err["error"])) + + start_test("Test map delete (htab)...") + for i in range(2): + bpftool("map delete id %d key %s" % (htab["id"], int2str("I", i))) + + start_test("Test map delete (array)...") + for i in range(2): + ret, err = bpftool("map delete id %d key %s" % + (htab["id"], int2str("I", i)), fail=False) + fail(ret == 0, "removed entry from an array") + fail(err["error"].find("No such file or directory") == -1, + "expected ENOENT, error is '%s'" % (err["error"])) + + start_test("Test map remove...") + sim.unset_xdp("offload") + bpftool_map_list_wait(expected=0) + sim.remove() + + sim = NetdevSim() + sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON + sim.remove() + bpftool_map_list_wait(expected=0) + + start_test("Test map creation fail path...") + sim = NetdevSim() + sim.dfs["bpf_map_accept"] = "N" + ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False) + fail(ret == 0, + "netdevsim didn't refuse to create a map with offload disabled") print("%s: OK" % (os.path.basename(__file__))) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 960179882a1c..fb82d29ee863 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -29,6 +29,7 @@ #include <linux/filter.h> #include <linux/bpf_perf_event.h> #include <linux/bpf.h> +#include <linux/if_ether.h> #include <bpf/bpf.h> @@ -49,6 +50,8 @@ #define MAX_INSNS 512 #define MAX_FIXUPS 8 #define MAX_NR_MAPS 4 +#define POINTER_VALUE 0xcafe4all +#define TEST_DATA_LEN 64 #define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0) #define F_LOAD_WITH_STRICT_ALIGNMENT (1 << 1) @@ -62,6 +65,7 @@ struct bpf_test { int fixup_map_in_map[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; + uint32_t retval; enum { UNDEF, ACCEPT, @@ -95,6 +99,94 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = -3, + }, + { + "DIV32 by 0, zero check 1", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV32 by 0, zero check 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV64 by 0, zero check", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_ALU64_REG(BPF_DIV, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "MOD32 by 0, zero check 1", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "MOD32 by 0, zero check 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "MOD64 by 0, zero check", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_ALU64_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "empty prog", + .insns = { + }, + .errstr = "last insn is not an exit or jmp", + .result = REJECT, + }, + { + "only exit insn", + .insns = { + BPF_EXIT_INSN(), + }, + .errstr = "R0 !read_ok", + .result = REJECT, }, { "unreachable", @@ -210,6 +302,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = 1, }, { "test8 ld_imm64", @@ -517,6 +610,7 @@ static struct bpf_test tests[] = { .errstr_unpriv = "R0 leaks addr", .result = ACCEPT, .result_unpriv = REJECT, + .retval = POINTER_VALUE, }, { "check valid spill/fill, skb mark", @@ -803,6 +897,7 @@ static struct bpf_test tests[] = { .errstr_unpriv = "R1 pointer comparison", .result_unpriv = REJECT, .result = ACCEPT, + .retval = -ENOENT, }, { "jump test 4", @@ -1823,6 +1918,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = 0xfaceb00c, }, { "PTR_TO_STACK store/load - bad alignment on off", @@ -1881,6 +1977,7 @@ static struct bpf_test tests[] = { .result = ACCEPT, .result_unpriv = REJECT, .errstr_unpriv = "R0 leaks addr", + .retval = POINTER_VALUE, }, { "unpriv: add const to pointer", @@ -2054,6 +2151,7 @@ static struct bpf_test tests[] = { BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_hash_recalc), + BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -2594,6 +2692,29 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { + "context stores via ST", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_1, offsetof(struct __sk_buff, mark), 0), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_ST stores into R1 context is not allowed", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "context stores via XADD", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_1, + BPF_REG_0, offsetof(struct __sk_buff, mark), 0), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_XADD stores into R1 context is not allowed", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { "direct packet access: test1", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, @@ -2818,6 +2939,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = 1, }, { "direct packet access: test12 (and, good access)", @@ -2842,6 +2964,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = 1, }, { "direct packet access: test13 (branches, good access)", @@ -2872,6 +2995,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = 1, }, { "direct packet access: test14 (pkt_ptr += 0, CONST_IMM, good access)", @@ -2895,6 +3019,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = 1, }, { "direct packet access: test15 (spill with xadd)", @@ -3181,6 +3306,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = 1, }, { "direct packet access: test28 (marking on <=, bad access)", @@ -4313,7 +4439,8 @@ static struct bpf_test tests[] = { .fixup_map1 = { 2 }, .errstr_unpriv = "R2 leaks addr into mem", .result_unpriv = REJECT, - .result = ACCEPT, + .result = REJECT, + .errstr = "BPF_XADD stores into R1 context is not allowed", }, { "leak pointer into ctx 2", @@ -4327,7 +4454,8 @@ static struct bpf_test tests[] = { }, .errstr_unpriv = "R10 leaks addr into mem", .result_unpriv = REJECT, - .result = ACCEPT, + .result = REJECT, + .errstr = "BPF_XADD stores into R1 context is not allowed", }, { "leak pointer into ctx 3", @@ -5798,6 +5926,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = 0 /* csum_diff of 64-byte packet */, }, { "helper access to variable memory: size = 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)", @@ -6166,6 +6295,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, + .retval = 42 /* ultimate return value */, }, { "ld_ind: check calling conv, r1", @@ -6237,6 +6367,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = 1, }, { "check bpf_perf_event_data->sample_period byte load permitted", @@ -6708,7 +6839,7 @@ static struct bpf_test tests[] = { BPF_JMP_IMM(BPF_JA, 0, 0, -7), }, .fixup_map1 = { 4 }, - .errstr = "unbounded min value", + .errstr = "R0 invalid mem access 'inv'", .result = REJECT, }, { @@ -7224,6 +7355,7 @@ static struct bpf_test tests[] = { }, .fixup_map1 = { 3 }, .result = ACCEPT, + .retval = POINTER_VALUE, .result_unpriv = REJECT, .errstr_unpriv = "R0 leaks addr as return value" }, @@ -7244,6 +7376,7 @@ static struct bpf_test tests[] = { }, .fixup_map1 = { 3 }, .result = ACCEPT, + .retval = POINTER_VALUE, .result_unpriv = REJECT, .errstr_unpriv = "R0 leaks addr as return value" }, @@ -7685,6 +7818,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = TEST_DATA_LEN, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { @@ -8610,6 +8744,127 @@ static struct bpf_test tests[] = { .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { + "check deducing bounds from const, 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 1, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 1, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 1, 1), + BPF_EXIT_INSN(), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check deducing bounds from const, 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check deducing bounds from const, 5", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 6", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 7", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, ~0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "dereference of modified ctx ptr", + }, + { + "check deducing bounds from const, 8", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, ~0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "dereference of modified ctx ptr", + }, + { + "check deducing bounds from const, 9", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 10", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 0), + /* Marks reg as unknown. */ + BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "math between ctx pointer and register with unbounded min value is not allowed", + }, + { "bpf_exit with invalid return code. test1", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), @@ -8705,6 +8960,7 @@ static struct bpf_test tests[] = { .errstr_unpriv = "function calls to other bpf functions are allowed for root only", .result_unpriv = REJECT, .result = ACCEPT, + .retval = 1, }, { "calls: overlapping caller/callee", @@ -8900,6 +9156,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_ACT, .result = ACCEPT, + .retval = TEST_DATA_LEN, }, { "calls: callee using args1", @@ -8912,6 +9169,7 @@ static struct bpf_test tests[] = { .errstr_unpriv = "allowed for root only", .result_unpriv = REJECT, .result = ACCEPT, + .retval = POINTER_VALUE, }, { "calls: callee using wrong args2", @@ -8942,6 +9200,7 @@ static struct bpf_test tests[] = { .errstr_unpriv = "allowed for root only", .result_unpriv = REJECT, .result = ACCEPT, + .retval = TEST_DATA_LEN + TEST_DATA_LEN - ETH_HLEN - ETH_HLEN, }, { "calls: callee changing pkt pointers", @@ -8990,6 +9249,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, + .retval = TEST_DATA_LEN + TEST_DATA_LEN, }, { "calls: calls with stack arith", @@ -9008,6 +9268,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, + .retval = 42, }, { "calls: calls with misaligned stack access", @@ -9041,6 +9302,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, + .retval = 43, }, { "calls: calls control flow, jump test 2", @@ -9533,6 +9795,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_XDP, .result = ACCEPT, + .retval = 42, }, { "calls: write into callee stack frame", @@ -10144,6 +10407,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = POINTER_VALUE, }, { "calls: pkt_ptr spill into caller stack 2", @@ -10209,6 +10473,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, + .retval = 1, }, { "calls: pkt_ptr spill into caller stack 4", @@ -10242,6 +10507,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, + .retval = 1, }, { "calls: pkt_ptr spill into caller stack 5", @@ -10650,10 +10916,12 @@ static void do_test_single(struct bpf_test *test, bool unpriv, int fd_prog, expected_ret, reject_from_alignment; struct bpf_insn *prog = test->insns; int prog_len = probe_filter_length(prog); + char data_in[TEST_DATA_LEN] = {}; int prog_type = test->prog_type; int map_fds[MAX_NR_MAPS]; const char *expected_err; - int i; + uint32_t retval; + int i, err; for (i = 0; i < MAX_NR_MAPS; i++) map_fds[i] = -1; @@ -10696,6 +10964,19 @@ static void do_test_single(struct bpf_test *test, bool unpriv, } } + if (fd_prog >= 0) { + err = bpf_prog_test_run(fd_prog, 1, data_in, sizeof(data_in), + NULL, NULL, &retval, NULL); + if (err && errno != 524/*ENOTSUPP*/ && errno != EPERM) { + printf("Unexpected bpf_prog_test_run error\n"); + goto fail_log; + } + if (!err && retval != test->retval && + test->retval != POINTER_VALUE) { + printf("FAIL retval %d != %d\n", retval, test->retval); + goto fail_log; + } + } (*passes)++; printf("OK%s\n", reject_from_alignment ? " (NOTE: reject due to unknown alignment)" : ""); |