diff options
108 files changed, 2963 insertions, 1018 deletions
diff --git a/Documentation/networking/nf_conntrack-sysctl.txt b/Documentation/networking/nf_conntrack-sysctl.txt index 497d668288f95..433b6724797ad 100644 --- a/Documentation/networking/nf_conntrack-sysctl.txt +++ b/Documentation/networking/nf_conntrack-sysctl.txt @@ -96,17 +96,6 @@ nf_conntrack_max - INTEGER Size of connection tracking table. Default value is nf_conntrack_buckets value * 4. -nf_conntrack_default_on - BOOLEAN - 0 - don't register conntrack in new net namespaces - 1 - register conntrack in new net namespaces (default) - - This controls wheter newly created network namespaces have connection - tracking enabled by default. It will be enabled automatically - regardless of this setting if the new net namespace requires - connection tracking, e.g. when NAT rules are created. - This setting is only visible in initial user namespace, it has no - effect on existing namespaces. - nf_conntrack_tcp_be_liberal - BOOLEAN 0 - disabled (default) not 0 - enabled diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index d5624894152e1..dd0ee2691c863 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -7465,6 +7465,34 @@ static bool mvpp2_port_has_tx_irqs(struct mvpp2 *priv, return true; } +static void mvpp2_port_copy_mac_addr(struct net_device *dev, struct mvpp2 *priv, + struct device_node *port_node, + char **mac_from) +{ + struct mvpp2_port *port = netdev_priv(dev); + char hw_mac_addr[ETH_ALEN] = {0}; + const char *dt_mac_addr; + + dt_mac_addr = of_get_mac_address(port_node); + if (dt_mac_addr && is_valid_ether_addr(dt_mac_addr)) { + *mac_from = "device tree"; + ether_addr_copy(dev->dev_addr, dt_mac_addr); + return; + } + + if (priv->hw_version == MVPP21) { + mvpp21_get_mac_address(port, hw_mac_addr); + if (is_valid_ether_addr(hw_mac_addr)) { + *mac_from = "hardware"; + ether_addr_copy(dev->dev_addr, hw_mac_addr); + return; + } + } + + *mac_from = "random"; + eth_hw_addr_random(dev); 
+} + /* Ports initialization */ static int mvpp2_port_probe(struct platform_device *pdev, struct device_node *port_node, @@ -7476,9 +7504,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, struct mvpp2_port_pcpu *port_pcpu; struct net_device *dev; struct resource *res; - const char *dt_mac_addr; - const char *mac_from; - char hw_mac_addr[ETH_ALEN] = {0}; + char *mac_from = ""; unsigned int ntxqs, nrxqs; bool has_tx_irqs; u32 id; @@ -7587,21 +7613,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, goto err_free_irq; } - dt_mac_addr = of_get_mac_address(port_node); - if (dt_mac_addr && is_valid_ether_addr(dt_mac_addr)) { - mac_from = "device tree"; - ether_addr_copy(dev->dev_addr, dt_mac_addr); - } else { - if (priv->hw_version == MVPP21) - mvpp21_get_mac_address(port, hw_mac_addr); - if (is_valid_ether_addr(hw_mac_addr)) { - mac_from = "hardware"; - ether_addr_copy(dev->dev_addr, hw_mac_addr); - } else { - mac_from = "random"; - eth_hw_addr_random(dev); - } - } + mvpp2_port_copy_mac_addr(dev, priv, port_node, &mac_from); port->tx_ring_size = MVPP2_MAX_TXD; port->rx_ring_size = MVPP2_MAX_RXD; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index c631d157b97dc..b3e50f4892c57 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -977,7 +977,8 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) if (dev->caps.num_ports > MLX4_MAX_PORTS) { mlx4_err(dev, "HCA has %d ports, but we only support %d, aborting\n", dev->caps.num_ports, MLX4_MAX_PORTS); - return -ENODEV; + err = -ENODEV; + goto free_mem; } mlx4_replace_zero_macs(dev); diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 28c441c0d31f3..e40a6d1d0966a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -844,8 +844,9 @@ int mlx4_init_qp_table(struct mlx4_dev *dev) /* In mfunc, calculate proxy and 
tunnel qp offsets for the PF here, * since the PF does not call mlx4_slave_caps */ - dev->caps.spec_qps = kcalloc(dev->caps.num_ports, sizeof(dev->caps.spec_qps), GFP_KERNEL); - + dev->caps.spec_qps = kcalloc(dev->caps.num_ports, + sizeof(*dev->caps.spec_qps), + GFP_KERNEL); if (!dev->caps.spec_qps) { err = -ENOMEM; goto err_mem; diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile index f9956f3bc45c4..891ff418bb5ed 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/Makefile +++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile @@ -16,8 +16,8 @@ mlxsw_spectrum-objs := spectrum.o spectrum_buffers.o \ spectrum_switchdev.o spectrum_router.o \ spectrum_kvdl.o spectrum_acl_tcam.o \ spectrum_acl.o spectrum_flower.o \ - spectrum_cnt.o \ - spectrum_fid.o + spectrum_cnt.o spectrum_fid.o \ + spectrum_ipip.o mlxsw_spectrum-$(CONFIG_MLXSW_SPECTRUM_DCB) += spectrum_dcb.o mlxsw_spectrum-$(CONFIG_NET_DEVLINK) += spectrum_dpipe.o obj-$(CONFIG_MLXSW_MINIMAL) += mlxsw_minimal.o diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 11e290c34aafd..cc27c5de5a1dd 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -5,6 +5,7 @@ * Copyright (c) 2015 Elad Raz <eladr@mellanox.com> * Copyright (c) 2015-2017 Jiri Pirko <jiri@mellanox.com> * Copyright (c) 2016 Yotam Gigi <yotamg@mellanox.com> + * Copyright (c) 2017 Petr Machata <petrm@mellanox.com> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -3998,6 +3999,8 @@ enum mlxsw_reg_ritr_if_type { MLXSW_REG_RITR_FID_IF, /* Sub-port interface. */ MLXSW_REG_RITR_SP_IF, + /* Loopback Interface. 
*/ + MLXSW_REG_RITR_LOOPBACK_IF, }; /* reg_ritr_type @@ -4129,6 +4132,67 @@ MLXSW_ITEM32(reg, ritr, sp_if_system_port, 0x08, 0, 16); */ MLXSW_ITEM32(reg, ritr, sp_if_vid, 0x18, 0, 12); +/* Loopback Interface */ + +enum mlxsw_reg_ritr_loopback_protocol { + /* IPinIP IPv4 underlay Unicast */ + MLXSW_REG_RITR_LOOPBACK_PROTOCOL_IPIP_IPV4, + /* IPinIP IPv6 underlay Unicast */ + MLXSW_REG_RITR_LOOPBACK_PROTOCOL_IPIP_IPV6, +}; + +/* reg_ritr_loopback_protocol + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, loopback_protocol, 0x08, 28, 4); + +enum mlxsw_reg_ritr_loopback_ipip_type { + /* Tunnel is IPinIP. */ + MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_IP, + /* Tunnel is GRE, no key. */ + MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_GRE_IN_IP, + /* Tunnel is GRE, with a key. */ + MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_GRE_KEY_IN_IP, +}; + +/* reg_ritr_loopback_ipip_type + * Encapsulation type. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, loopback_ipip_type, 0x10, 24, 4); + +enum mlxsw_reg_ritr_loopback_ipip_options { + /* The key is defined by gre_key. */ + MLXSW_REG_RITR_LOOPBACK_IPIP_OPTIONS_GRE_KEY_PRESET, +}; + +/* reg_ritr_loopback_ipip_options + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, loopback_ipip_options, 0x10, 20, 4); + +/* reg_ritr_loopback_ipip_uvr + * Underlay Virtual Router ID. + * Range is 0..cap_max_virtual_routers-1. + * Reserved for Spectrum-2. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, loopback_ipip_uvr, 0x10, 0, 16); + +/* reg_ritr_loopback_ipip_usip* + * Encapsulation Underlay source IP. + * Access: RW + */ +MLXSW_ITEM_BUF(reg, ritr, loopback_ipip_usip6, 0x18, 16); +MLXSW_ITEM32(reg, ritr, loopback_ipip_usip4, 0x24, 0, 32); + +/* reg_ritr_loopback_ipip_gre_key + * GRE Key. + * Reserved when ipip_type is not IP_IN_GRE_KEY_IN_IP. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, loopback_ipip_gre_key, 0x28, 0, 32); + /* Shared between ingress/egress */ enum mlxsw_reg_ritr_counter_set_type { /* No Count. 
*/ @@ -4199,8 +4263,7 @@ static inline void mlxsw_reg_ritr_sp_if_pack(char *payload, bool lag, static inline void mlxsw_reg_ritr_pack(char *payload, bool enable, enum mlxsw_reg_ritr_if_type type, - u16 rif, u16 vr_id, u16 mtu, - const char *mac) + u16 rif, u16 vr_id, u16 mtu) { bool op = enable ? MLXSW_REG_RITR_RIF_CREATE : MLXSW_REG_RITR_RIF_DEL; @@ -4216,9 +4279,38 @@ static inline void mlxsw_reg_ritr_pack(char *payload, bool enable, mlxsw_reg_ritr_lb_en_set(payload, 1); mlxsw_reg_ritr_virtual_router_set(payload, vr_id); mlxsw_reg_ritr_mtu_set(payload, mtu); +} + +static inline void mlxsw_reg_ritr_mac_pack(char *payload, const char *mac) +{ mlxsw_reg_ritr_if_mac_memcpy_to(payload, mac); } +static inline void +mlxsw_reg_ritr_loopback_ipip_common_pack(char *payload, + enum mlxsw_reg_ritr_loopback_ipip_type ipip_type, + enum mlxsw_reg_ritr_loopback_ipip_options options, + u16 uvr_id, u32 gre_key) +{ + mlxsw_reg_ritr_loopback_ipip_type_set(payload, ipip_type); + mlxsw_reg_ritr_loopback_ipip_options_set(payload, options); + mlxsw_reg_ritr_loopback_ipip_uvr_set(payload, uvr_id); + mlxsw_reg_ritr_loopback_ipip_gre_key_set(payload, gre_key); +} + +static inline void +mlxsw_reg_ritr_loopback_ipip4_pack(char *payload, + enum mlxsw_reg_ritr_loopback_ipip_type ipip_type, + enum mlxsw_reg_ritr_loopback_ipip_options options, + u16 uvr_id, u32 usip, u32 gre_key) +{ + mlxsw_reg_ritr_loopback_protocol_set(payload, + MLXSW_REG_RITR_LOOPBACK_PROTOCOL_IPIP_IPV4); + mlxsw_reg_ritr_loopback_ipip_common_pack(payload, ipip_type, options, + uvr_id, gre_key); + mlxsw_reg_ritr_loopback_ipip_usip4_set(payload, usip); +} + /* RATR - Router Adjacency Table Register * -------------------------------------- * The RATR register is used to configure the Router Adjacency (next-hop) @@ -4274,6 +4366,38 @@ MLXSW_ITEM32(reg, ratr, v, 0x00, 24, 1); */ MLXSW_ITEM32(reg, ratr, a, 0x00, 16, 1); +enum mlxsw_reg_ratr_type { + /* Ethernet */ + MLXSW_REG_RATR_TYPE_ETHERNET, + /* IPoIB Unicast without GRH. 
+ * Reserved for Spectrum. + */ + MLXSW_REG_RATR_TYPE_IPOIB_UC, + /* IPoIB Unicast with GRH. Supported only in table 0 (Ethernet unicast + * adjacency). + * Reserved for Spectrum. + */ + MLXSW_REG_RATR_TYPE_IPOIB_UC_W_GRH, + /* IPoIB Multicast. + * Reserved for Spectrum. + */ + MLXSW_REG_RATR_TYPE_IPOIB_MC, + /* MPLS. + * Reserved for SwitchX/-2. + */ + MLXSW_REG_RATR_TYPE_MPLS, + /* IPinIP Encap. + * Reserved for SwitchX/-2. + */ + MLXSW_REG_RATR_TYPE_IPIP, +}; + +/* reg_ratr_type + * Adjacency entry type. + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, type, 0x04, 28, 4); + /* reg_ratr_adjacency_index_low * Bits 15:0 of index into the adjacency table. * For SwitchX and SwitchX-2, the adjacency table is linear and @@ -4303,17 +4427,17 @@ enum mlxsw_reg_ratr_trap_action { */ MLXSW_ITEM32(reg, ratr, trap_action, 0x0C, 28, 4); -enum mlxsw_reg_ratr_trap_id { - MLXSW_REG_RATR_TRAP_ID_RTR_EGRESS0 = 0, - MLXSW_REG_RATR_TRAP_ID_RTR_EGRESS1 = 1, -}; - /* reg_ratr_adjacency_index_high * Bits 23:16 of the adjacency_index. * Access: Index */ MLXSW_ITEM32(reg, ratr, adjacency_index_high, 0x0C, 16, 8); +enum mlxsw_reg_ratr_trap_id { + MLXSW_REG_RATR_TRAP_ID_RTR_EGRESS0, + MLXSW_REG_RATR_TRAP_ID_RTR_EGRESS1, +}; + /* reg_ratr_trap_id * Trap ID to be reported to CPU. * Trap-ID is RTR_EGRESS0 or RTR_EGRESS1. @@ -4328,14 +4452,44 @@ MLXSW_ITEM32(reg, ratr, trap_id, 0x0C, 0, 8); */ MLXSW_ITEM_BUF(reg, ratr, eth_destination_mac, 0x12, 6); +enum mlxsw_reg_ratr_ipip_type { + /* IPv4, address set by mlxsw_reg_ratr_ipip_ipv4_udip. */ + MLXSW_REG_RATR_IPIP_TYPE_IPV4, + /* IPv6, address set by mlxsw_reg_ratr_ipip_ipv6_ptr. */ + MLXSW_REG_RATR_IPIP_TYPE_IPV6, +}; + +/* reg_ratr_ipip_type + * Underlay destination ip type. + * Note: the type field must match the protocol of the router interface. + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, ipip_type, 0x10, 16, 4); + +/* reg_ratr_ipip_ipv4_udip + * Underlay ipv4 dip. + * Reserved when ipip_type is IPv6. 
+ * Access: RW + */ +MLXSW_ITEM32(reg, ratr, ipip_ipv4_udip, 0x18, 0, 32); + +/* reg_ratr_ipip_ipv6_ptr + * Pointer to IPv6 underlay destination ip address. + * For Spectrum: Pointer to KVD linear space. + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, ipip_ipv6_ptr, 0x1C, 0, 24); + static inline void mlxsw_reg_ratr_pack(char *payload, enum mlxsw_reg_ratr_op op, bool valid, + enum mlxsw_reg_ratr_type type, u32 adjacency_index, u16 egress_rif) { MLXSW_REG_ZERO(ratr, payload); mlxsw_reg_ratr_op_set(payload, op); mlxsw_reg_ratr_v_set(payload, valid); + mlxsw_reg_ratr_type_set(payload, type); mlxsw_reg_ratr_adjacency_index_low_set(payload, adjacency_index); mlxsw_reg_ratr_adjacency_index_high_set(payload, adjacency_index >> 16); mlxsw_reg_ratr_egress_router_interface_set(payload, egress_rif); @@ -4347,6 +4501,12 @@ static inline void mlxsw_reg_ratr_eth_entry_pack(char *payload, mlxsw_reg_ratr_eth_destination_mac_memcpy_to(payload, dest_mac); } +static inline void mlxsw_reg_ratr_ipip4_entry_pack(char *payload, u32 ipv4_udip) +{ + mlxsw_reg_ratr_ipip_type_set(payload, MLXSW_REG_RATR_IPIP_TYPE_IPV4); + mlxsw_reg_ratr_ipip_ipv4_udip_set(payload, ipv4_udip); +} + /* RICNT - Router Interface Counter Register * ----------------------------------------- * The RICNT register retrieves per port performance counters @@ -4900,6 +5060,15 @@ mlxsw_reg_ralue_act_ip2me_pack(char *payload) MLXSW_REG_RALUE_ACTION_TYPE_IP2ME); } +static inline void +mlxsw_reg_ralue_act_ip2me_tun_pack(char *payload, u32 tunnel_ptr) +{ + mlxsw_reg_ralue_action_type_set(payload, + MLXSW_REG_RALUE_ACTION_TYPE_IP2ME); + mlxsw_reg_ralue_ip2me_v_set(payload, 1); + mlxsw_reg_ralue_ip2me_tunnel_ptr_set(payload, tunnel_ptr); +} + /* RAUHT - Router Algorithmic LPM Unicast Host Table Register * ---------------------------------------------------------- * The RAUHT register is used to configure and query the Unicast Host table in @@ -5300,6 +5469,133 @@ static inline void mlxsw_reg_rauhtd_ent_ipv6_unpack(char *payload, 
mlxsw_reg_rauhtd_ipv6_ent_dip_memcpy_from(payload, rec_index, p_dip); } +/* RTDP - Routing Tunnel Decap Properties Register + * ----------------------------------------------- + * The RTDP register is used for configuring the tunnel decap properties of NVE + * and IPinIP. + */ +#define MLXSW_REG_RTDP_ID 0x8020 +#define MLXSW_REG_RTDP_LEN 0x44 + +MLXSW_REG_DEFINE(rtdp, MLXSW_REG_RTDP_ID, MLXSW_REG_RTDP_LEN); + +enum mlxsw_reg_rtdp_type { + MLXSW_REG_RTDP_TYPE_NVE, + MLXSW_REG_RTDP_TYPE_IPIP, +}; + +/* reg_rtdp_type + * Type of the RTDP entry as per enum mlxsw_reg_rtdp_type. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, type, 0x00, 28, 4); + +/* reg_rtdp_tunnel_index + * Index to the Decap entry. + * For Spectrum, Index to KVD Linear. + * Access: Index + */ +MLXSW_ITEM32(reg, rtdp, tunnel_index, 0x00, 0, 24); + +/* IPinIP */ + +/* reg_rtdp_ipip_irif + * Ingress Router Interface for the overlay router + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_irif, 0x04, 16, 16); + +enum mlxsw_reg_rtdp_ipip_sip_check { + /* No sip checks. */ + MLXSW_REG_RTDP_IPIP_SIP_CHECK_NO, + /* Filter packet if underlay is not IPv4 or if underlay SIP does not + * equal ipv4_usip. + */ + MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV4, + /* Filter packet if underlay is not IPv6 or if underlay SIP does not + * equal ipv6_usip. + */ + MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV6 = 3, +}; + +/* reg_rtdp_ipip_sip_check + * SIP check to perform. If decapsulation failed due to these configurations + * then trap_id is IPIP_DECAP_ERROR. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_sip_check, 0x04, 0, 3); + +/* If set, allow decapsulation of IPinIP (without GRE). */ +#define MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_IPIP BIT(0) +/* If set, allow decapsulation of IPinGREinIP without a key. */ +#define MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE BIT(1) +/* If set, allow decapsulation of IPinGREinIP with a key. 
*/ +#define MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE_KEY BIT(2) + +/* reg_rtdp_ipip_type_check + * Flags as per MLXSW_REG_RTDP_IPIP_TYPE_CHECK_*. If decapsulation failed due to + * these configurations then trap_id is IPIP_DECAP_ERROR. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_type_check, 0x08, 24, 3); + +/* reg_rtdp_ipip_gre_key_check + * Whether GRE key should be checked. When check is enabled: + * - A packet received as IPinIP (without GRE) will always pass. + * - A packet received as IPinGREinIP without a key will not pass the check. + * - A packet received as IPinGREinIP with a key will pass the check only if the + * key in the packet is equal to expected_gre_key. + * If decapsulation failed due to GRE key then trap_id is IPIP_DECAP_ERROR. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_gre_key_check, 0x08, 23, 1); + +/* reg_rtdp_ipip_ipv4_usip + * Underlay IPv4 address for ipv4 source address check. + * Reserved when sip_check is not '1'. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_ipv4_usip, 0x0C, 0, 32); + +/* reg_rtdp_ipip_ipv6_usip_ptr + * This field is valid when sip_check is "sipv6 check explicitly". This is a + * pointer to the IPv6 DIP which is configured by RIPS. For Spectrum, the index + * is to the KVD linear. + * Reserved when sip_check is not MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV6. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_ipv6_usip_ptr, 0x10, 0, 24); + +/* reg_rtdp_ipip_expected_gre_key + * GRE key for checking. + * Reserved when gre_key_check is '0'. 
+ * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_expected_gre_key, 0x14, 0, 32); + +static inline void mlxsw_reg_rtdp_pack(char *payload, + enum mlxsw_reg_rtdp_type type, + u32 tunnel_index) +{ + MLXSW_REG_ZERO(rtdp, payload); + mlxsw_reg_rtdp_type_set(payload, type); + mlxsw_reg_rtdp_tunnel_index_set(payload, tunnel_index); +} + +static inline void +mlxsw_reg_rtdp_ipip4_pack(char *payload, u16 irif, + enum mlxsw_reg_rtdp_ipip_sip_check sip_check, + unsigned int type_check, bool gre_key_check, + u32 ipv4_usip, u32 expected_gre_key) +{ + mlxsw_reg_rtdp_ipip_irif_set(payload, irif); + mlxsw_reg_rtdp_ipip_sip_check_set(payload, sip_check); + mlxsw_reg_rtdp_ipip_type_check_set(payload, type_check); + mlxsw_reg_rtdp_ipip_gre_key_check_set(payload, gre_key_check); + mlxsw_reg_rtdp_ipip_ipv4_usip_set(payload, ipv4_usip); + mlxsw_reg_rtdp_ipip_expected_gre_key_set(payload, expected_gre_key); +} + /* MFCR - Management Fan Control Register * -------------------------------------- * This register controls the settings of the Fan Speed PWM mechanism. 
@@ -6561,6 +6857,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = { MLXSW_REG(rgcr), MLXSW_REG(ritr), MLXSW_REG(ratr), + MLXSW_REG(rtdp), MLXSW_REG(ricnt), MLXSW_REG(ralta), MLXSW_REG(ralst), diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 992cbfa1f2bcd..ed7cd6c48019a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3400,6 +3400,7 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { MLXSW_SP_RXL_MARK(HOST_MISS_IPV6, TRAP_TO_CPU, HOST_MISS, false), MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV4, TRAP_TO_CPU, ROUTER_EXP, false), MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV6, TRAP_TO_CPU, ROUTER_EXP, false), + MLXSW_SP_RXL_MARK(IPIP_DECAP_ERROR, TRAP_TO_CPU, ROUTER_EXP, false), /* PKT Sample trap */ MLXSW_RXL(mlxsw_sp_rx_listener_sample_func, PKT_SAMPLE, MIRROR_TO_CPU, false, SP_IP2ME, DISCARD), diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index f8c7f7e930c58..84ce83acdc199 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -77,6 +77,7 @@ enum mlxsw_sp_rif_type { MLXSW_SP_RIF_TYPE_SUBPORT, MLXSW_SP_RIF_TYPE_VLAN, MLXSW_SP_RIF_TYPE_FID, + MLXSW_SP_RIF_TYPE_IPIP_LB, /* IP-in-IP loopback. */ MLXSW_SP_RIF_TYPE_MAX, }; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c new file mode 100644 index 0000000000000..702fe945227c2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c @@ -0,0 +1,214 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2017 Petr Machata <petrm@mellanox.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <net/ip_tunnels.h> + +#include "spectrum_ipip.h" + +static bool +mlxsw_sp_ipip_netdev_has_ikey(const struct net_device *ol_dev) +{ + struct ip_tunnel *tun = netdev_priv(ol_dev); + + return !!(tun->parms.i_flags & TUNNEL_KEY); +} + +static bool +mlxsw_sp_ipip_netdev_has_okey(const struct net_device *ol_dev) +{ + struct ip_tunnel *tun = netdev_priv(ol_dev); + + return !!(tun->parms.o_flags & TUNNEL_KEY); +} + +static u32 mlxsw_sp_ipip_netdev_ikey(const struct net_device *ol_dev) +{ + struct ip_tunnel *tun = netdev_priv(ol_dev); + + return mlxsw_sp_ipip_netdev_has_ikey(ol_dev) ? + be32_to_cpu(tun->parms.i_key) : 0; +} + +static u32 mlxsw_sp_ipip_netdev_okey(const struct net_device *ol_dev) +{ + struct ip_tunnel *tun = netdev_priv(ol_dev); + + return mlxsw_sp_ipip_netdev_has_okey(ol_dev) ? + be32_to_cpu(tun->parms.o_key) : 0; +} + +static int +mlxsw_sp_ipip_nexthop_update_gre4(struct mlxsw_sp *mlxsw_sp, u32 adj_index, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + u16 rif_index = mlxsw_sp_ipip_lb_rif_index(ipip_entry->ol_lb); + __be32 daddr4 = mlxsw_sp_ipip_netdev_daddr4(ipip_entry->ol_dev); + char ratr_pl[MLXSW_REG_RATR_LEN]; + + mlxsw_reg_ratr_pack(ratr_pl, MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY, + true, MLXSW_REG_RATR_TYPE_IPIP, + adj_index, rif_index); + mlxsw_reg_ratr_ipip4_entry_pack(ratr_pl, be32_to_cpu(daddr4)); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ratr), ratr_pl); +} + +static int +mlxsw_sp_ipip_fib_entry_op_gre4_rtdp(struct mlxsw_sp *mlxsw_sp, + u32 tunnel_index, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + bool has_ikey = mlxsw_sp_ipip_netdev_has_ikey(ipip_entry->ol_dev); + u16 rif_index = mlxsw_sp_ipip_lb_rif_index(ipip_entry->ol_lb); + u32 ikey = mlxsw_sp_ipip_netdev_ikey(ipip_entry->ol_dev); + char rtdp_pl[MLXSW_REG_RTDP_LEN]; + unsigned int type_check; + u32 daddr4; + + mlxsw_reg_rtdp_pack(rtdp_pl, MLXSW_REG_RTDP_TYPE_IPIP, tunnel_index); + + type_check = has_ikey ? 
+ MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE_KEY : + MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE; + + /* Linux demuxes tunnels based on packet SIP (which must match tunnel + * remote IP). Thus configure decap so that it filters out packets that + * are not IPv4 or have the wrong SIP. IPIP_DECAP_ERROR trap is + * generated for packets that fail this criterion. Linux then handles + * such packets in slow path and generates ICMP destination unreachable. + */ + daddr4 = be32_to_cpu(mlxsw_sp_ipip_netdev_daddr4(ipip_entry->ol_dev)); + mlxsw_reg_rtdp_ipip4_pack(rtdp_pl, rif_index, + MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV4, + type_check, has_ikey, daddr4, ikey); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtdp), rtdp_pl); +} + +static int +mlxsw_sp_ipip_fib_entry_op_gre4_ralue(struct mlxsw_sp *mlxsw_sp, + u32 dip, u8 prefix_len, u16 ul_vr_id, + enum mlxsw_reg_ralue_op op, + u32 tunnel_index) +{ + char ralue_pl[MLXSW_REG_RALUE_LEN]; + + mlxsw_reg_ralue_pack4(ralue_pl, MLXSW_REG_RALXX_PROTOCOL_IPV4, op, + ul_vr_id, prefix_len, dip); + mlxsw_reg_ralue_act_ip2me_tun_pack(ralue_pl, tunnel_index); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl); +} + +static int mlxsw_sp_ipip_fib_entry_op_gre4(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + enum mlxsw_reg_ralue_op op, + u32 tunnel_index) +{ + u16 ul_vr_id = mlxsw_sp_ipip_lb_ul_vr_id(ipip_entry->ol_lb); + __be32 dip; + int err; + + err = mlxsw_sp_ipip_fib_entry_op_gre4_rtdp(mlxsw_sp, tunnel_index, + ipip_entry); + if (err) + return err; + + dip = mlxsw_sp_ipip_netdev_saddr(MLXSW_SP_L3_PROTO_IPV4, + ipip_entry->ol_dev).addr4; + return mlxsw_sp_ipip_fib_entry_op_gre4_ralue(mlxsw_sp, be32_to_cpu(dip), + 32, ul_vr_id, op, + tunnel_index); +} + +static bool mlxsw_sp_ipip_tunnel_complete(enum mlxsw_sp_l3proto proto, + const struct net_device *ol_dev) +{ + union mlxsw_sp_l3addr saddr = mlxsw_sp_ipip_netdev_saddr(proto, ol_dev); + union mlxsw_sp_l3addr daddr = 
mlxsw_sp_ipip_netdev_daddr(proto, ol_dev); + union mlxsw_sp_l3addr naddr = {0}; + + /* Tunnels with unset local or remote address are valid in Linux and + * used for lightweight tunnels (LWT) and Non-Broadcast Multi-Access + * (NBMA) tunnels. In principle these can be offloaded, but the driver + * currently doesn't support this. So punt. + */ + return memcmp(&saddr, &naddr, sizeof(naddr)) && + memcmp(&daddr, &naddr, sizeof(naddr)); +} + +static bool mlxsw_sp_ipip_can_offload_gre4(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *ol_dev, + enum mlxsw_sp_l3proto ol_proto) +{ + struct ip_tunnel *tunnel = netdev_priv(ol_dev); + __be16 okflags = TUNNEL_KEY; /* We can't offload any other features. */ + bool inherit_ttl = tunnel->parms.iph.ttl == 0; + bool inherit_tos = tunnel->parms.iph.tos & 0x1; + + return (tunnel->parms.i_flags & ~okflags) == 0 && + (tunnel->parms.o_flags & ~okflags) == 0 && + inherit_ttl && inherit_tos && + mlxsw_sp_ipip_tunnel_complete(MLXSW_SP_L3_PROTO_IPV4, ol_dev); +} + +static struct mlxsw_sp_rif_ipip_lb_config +mlxsw_sp_ipip_ol_loopback_config_gre4(struct mlxsw_sp *mlxsw_sp, + const struct net_device *ol_dev) +{ + enum mlxsw_reg_ritr_loopback_ipip_type lb_ipipt; + + lb_ipipt = mlxsw_sp_ipip_netdev_has_okey(ol_dev) ? 
+ MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_GRE_KEY_IN_IP : + MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_GRE_IN_IP; + return (struct mlxsw_sp_rif_ipip_lb_config){ + .lb_ipipt = lb_ipipt, + .okey = mlxsw_sp_ipip_netdev_okey(ol_dev), + .ul_protocol = MLXSW_SP_L3_PROTO_IPV4, + .saddr = mlxsw_sp_ipip_netdev_saddr(MLXSW_SP_L3_PROTO_IPV4, + ol_dev), + }; +} + +static const struct mlxsw_sp_ipip_ops mlxsw_sp_ipip_gre4_ops = { + .dev_type = ARPHRD_IPGRE, + .ul_proto = MLXSW_SP_L3_PROTO_IPV4, + .nexthop_update = mlxsw_sp_ipip_nexthop_update_gre4, + .fib_entry_op = mlxsw_sp_ipip_fib_entry_op_gre4, + .can_offload = mlxsw_sp_ipip_can_offload_gre4, + .ol_loopback_config = mlxsw_sp_ipip_ol_loopback_config_gre4, +}; + +const struct mlxsw_sp_ipip_ops *mlxsw_sp_ipip_ops_arr[] = { + [MLXSW_SP_IPIP_TYPE_GRE4] = &mlxsw_sp_ipip_gre4_ops, +}; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h new file mode 100644 index 0000000000000..1c2db831d83b1 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h @@ -0,0 +1,79 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Petr Machata <petrm@mellanox.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_IPIP_H_ +#define _MLXSW_IPIP_H_ + +#include "spectrum_router.h" +#include <net/ip_fib.h> + +enum mlxsw_sp_ipip_type { + MLXSW_SP_IPIP_TYPE_GRE4, + MLXSW_SP_IPIP_TYPE_MAX, +}; + +struct mlxsw_sp_ipip_entry { + enum mlxsw_sp_ipip_type ipipt; + struct net_device *ol_dev; /* Overlay. */ + struct mlxsw_sp_rif_ipip_lb *ol_lb; + unsigned int ref_count; /* Number of next hops using the tunnel. */ + struct mlxsw_sp_fib_entry *decap_fib_entry; + struct list_head ipip_list_node; +}; + +struct mlxsw_sp_ipip_ops { + int dev_type; + enum mlxsw_sp_l3proto ul_proto; /* Underlay. */ + + int (*nexthop_update)(struct mlxsw_sp *mlxsw_sp, u32 adj_index, + struct mlxsw_sp_ipip_entry *ipip_entry); + + bool (*can_offload)(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *ol_dev, + enum mlxsw_sp_l3proto ol_proto); + + /* Return a configuration for creating an overlay loopback RIF. 
*/ + struct mlxsw_sp_rif_ipip_lb_config + (*ol_loopback_config)(struct mlxsw_sp *mlxsw_sp, + const struct net_device *ol_dev); + + int (*fib_entry_op)(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + enum mlxsw_reg_ralue_op op, + u32 tunnel_index); +}; + +extern const struct mlxsw_sp_ipip_ops *mlxsw_sp_ipip_ops_arr[]; + +#endif /* _MLXSW_IPIP_H_*/ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index de15eac50866e..f0fb898533fbd 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -1,9 +1,10 @@ /* * drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c - * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> * Copyright (c) 2016 Ido Schimmel <idosch@mellanox.com> * Copyright (c) 2016 Yotam Gigi <yotamg@mellanox.com> + * Copyright (c) 2017 Petr Machata <petrm@mellanox.com> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,6 +52,7 @@ #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/fib_rules.h> +#include <net/ip_tunnels.h> #include <net/l3mdev.h> #include <net/addrconf.h> #include <net/ndisc.h> @@ -62,6 +64,7 @@ #include "reg.h" #include "spectrum_cnt.h" #include "spectrum_dpipe.h" +#include "spectrum_ipip.h" #include "spectrum_router.h" struct mlxsw_sp_vr; @@ -86,9 +89,11 @@ struct mlxsw_sp_router { struct delayed_work nexthop_probe_dw; #define MLXSW_SP_UNRESOLVED_NH_PROBE_INTERVAL 5000 /* ms */ struct list_head nexthop_neighs_list; + struct list_head ipip_list; bool aborted; struct notifier_block fib_nb; const struct mlxsw_sp_rif_ops **rif_ops_arr; + const struct mlxsw_sp_ipip_ops **ipip_ops_arr; }; struct mlxsw_sp_rif { @@ -129,6 +134,17 @@ 
struct mlxsw_sp_rif_subport { bool lag; }; +struct mlxsw_sp_rif_ipip_lb { + struct mlxsw_sp_rif common; + struct mlxsw_sp_rif_ipip_lb_config lb_config; + u16 ul_vr_id; /* Reserved for Spectrum-2. */ +}; + +struct mlxsw_sp_rif_params_ipip_lb { + struct mlxsw_sp_rif_params common; + struct mlxsw_sp_rif_ipip_lb_config lb_config; +}; + struct mlxsw_sp_rif_ops { enum mlxsw_sp_rif_type type; size_t rif_size; @@ -365,6 +381,14 @@ enum mlxsw_sp_fib_entry_type { MLXSW_SP_FIB_ENTRY_TYPE_REMOTE, MLXSW_SP_FIB_ENTRY_TYPE_LOCAL, MLXSW_SP_FIB_ENTRY_TYPE_TRAP, + + /* This is a special case of local delivery, where a packet should be + * decapsulated on reception. Note that there is no corresponding ENCAP, + * because that's a type of next hop, not of FIB entry. (There can be + * several next hops in a REMOTE entry, and some of them may be + * encapsulating entries.) + */ + MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP, }; struct mlxsw_sp_nexthop_group; @@ -378,12 +402,18 @@ struct mlxsw_sp_fib_node { struct mlxsw_sp_fib_key key; }; +struct mlxsw_sp_fib_entry_decap { + struct mlxsw_sp_ipip_entry *ipip_entry; + u32 tunnel_index; +}; + struct mlxsw_sp_fib_entry { struct list_head list; struct mlxsw_sp_fib_node *fib_node; enum mlxsw_sp_fib_entry_type type; struct list_head nexthop_group_node; struct mlxsw_sp_nexthop_group *nh_group; + struct mlxsw_sp_fib_entry_decap decap; /* Valid for decap entries. 
*/ }; struct mlxsw_sp_fib4_entry { @@ -405,11 +435,6 @@ struct mlxsw_sp_rt6 { struct rt6_info *rt; }; -enum mlxsw_sp_l3proto { - MLXSW_SP_L3_PROTO_IPV4, - MLXSW_SP_L3_PROTO_IPV6, -}; - struct mlxsw_sp_lpm_tree { u8 id; /* tree ID */ unsigned int ref_count; @@ -886,6 +911,374 @@ static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp) kfree(mlxsw_sp->router->vrs); } +static struct net_device * +__mlxsw_sp_ipip_netdev_ul_dev_get(const struct net_device *ol_dev) +{ + struct ip_tunnel *tun = netdev_priv(ol_dev); + struct net *net = dev_net(ol_dev); + + return __dev_get_by_index(net, tun->parms.link); +} + +static u32 mlxsw_sp_ipip_dev_ul_tb_id(const struct net_device *ol_dev) +{ + struct net_device *d = __mlxsw_sp_ipip_netdev_ul_dev_get(ol_dev); + + if (d) + return l3mdev_fib_table(d) ? : RT_TABLE_MAIN; + else + return l3mdev_fib_table(ol_dev) ? : RT_TABLE_MAIN; +} + +static struct mlxsw_sp_rif * +mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_rif_params *params); + +static struct mlxsw_sp_rif_ipip_lb * +mlxsw_sp_ipip_ol_ipip_lb_create(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_ipip_type ipipt, + struct net_device *ol_dev) +{ + struct mlxsw_sp_rif_params_ipip_lb lb_params; + const struct mlxsw_sp_ipip_ops *ipip_ops; + struct mlxsw_sp_rif *rif; + + ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipipt]; + lb_params = (struct mlxsw_sp_rif_params_ipip_lb) { + .common.dev = ol_dev, + .common.lag = false, + .lb_config = ipip_ops->ol_loopback_config(mlxsw_sp, ol_dev), + }; + + rif = mlxsw_sp_rif_create(mlxsw_sp, &lb_params.common); + if (IS_ERR(rif)) + return ERR_CAST(rif); + return container_of(rif, struct mlxsw_sp_rif_ipip_lb, common); +} + +static struct mlxsw_sp_ipip_entry * +mlxsw_sp_ipip_entry_alloc(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_ipip_type ipipt, + struct net_device *ol_dev) +{ + struct mlxsw_sp_ipip_entry *ipip_entry; + struct mlxsw_sp_ipip_entry *ret = NULL; + + ipip_entry = kzalloc(sizeof(*ipip_entry), GFP_KERNEL); + if (!ipip_entry) + 
return ERR_PTR(-ENOMEM); + + ipip_entry->ol_lb = mlxsw_sp_ipip_ol_ipip_lb_create(mlxsw_sp, ipipt, + ol_dev); + if (IS_ERR(ipip_entry->ol_lb)) { + ret = ERR_CAST(ipip_entry->ol_lb); + goto err_ol_ipip_lb_create; + } + + ipip_entry->ipipt = ipipt; + ipip_entry->ol_dev = ol_dev; + + return ipip_entry; + +err_ol_ipip_lb_create: + kfree(ipip_entry); + return ret; +} + +static void +mlxsw_sp_ipip_entry_destroy(struct mlxsw_sp_ipip_entry *ipip_entry) +{ + WARN_ON(ipip_entry->ref_count > 0); + mlxsw_sp_rif_destroy(&ipip_entry->ol_lb->common); + kfree(ipip_entry); +} + +static __be32 +mlxsw_sp_ipip_netdev_saddr4(const struct net_device *ol_dev) +{ + struct ip_tunnel *tun = netdev_priv(ol_dev); + + return tun->parms.iph.saddr; +} + +union mlxsw_sp_l3addr +mlxsw_sp_ipip_netdev_saddr(enum mlxsw_sp_l3proto proto, + const struct net_device *ol_dev) +{ + switch (proto) { + case MLXSW_SP_L3_PROTO_IPV4: + return (union mlxsw_sp_l3addr) { + .addr4 = mlxsw_sp_ipip_netdev_saddr4(ol_dev), + }; + case MLXSW_SP_L3_PROTO_IPV6: + break; + }; + + WARN_ON(1); + return (union mlxsw_sp_l3addr) { + .addr4 = 0, + }; +} + +__be32 mlxsw_sp_ipip_netdev_daddr4(const struct net_device *ol_dev) +{ + struct ip_tunnel *tun = netdev_priv(ol_dev); + + return tun->parms.iph.daddr; +} + +union mlxsw_sp_l3addr +mlxsw_sp_ipip_netdev_daddr(enum mlxsw_sp_l3proto proto, + const struct net_device *ol_dev) +{ + switch (proto) { + case MLXSW_SP_L3_PROTO_IPV4: + return (union mlxsw_sp_l3addr) { + .addr4 = mlxsw_sp_ipip_netdev_daddr4(ol_dev), + }; + case MLXSW_SP_L3_PROTO_IPV6: + break; + }; + + WARN_ON(1); + return (union mlxsw_sp_l3addr) { + .addr4 = 0, + }; +} + +static bool mlxsw_sp_l3addr_eq(const union mlxsw_sp_l3addr *addr1, + const union mlxsw_sp_l3addr *addr2) +{ + return !memcmp(addr1, addr2, sizeof(*addr1)); +} + +static bool +mlxsw_sp_ipip_entry_saddr_matches(struct mlxsw_sp *mlxsw_sp, + const enum mlxsw_sp_l3proto ul_proto, + union mlxsw_sp_l3addr saddr, + u32 ul_tb_id, + struct mlxsw_sp_ipip_entry 
*ipip_entry) +{ + u32 tun_ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(ipip_entry->ol_dev); + enum mlxsw_sp_ipip_type ipipt = ipip_entry->ipipt; + union mlxsw_sp_l3addr tun_saddr; + + if (mlxsw_sp->router->ipip_ops_arr[ipipt]->ul_proto != ul_proto) + return false; + + tun_saddr = mlxsw_sp_ipip_netdev_saddr(ul_proto, ipip_entry->ol_dev); + return tun_ul_tb_id == ul_tb_id && + mlxsw_sp_l3addr_eq(&tun_saddr, &saddr); +} + +static int +mlxsw_sp_fib_entry_decap_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + u32 tunnel_index; + int err; + + err = mlxsw_sp_kvdl_alloc(mlxsw_sp, 1, &tunnel_index); + if (err) + return err; + + ipip_entry->decap_fib_entry = fib_entry; + fib_entry->decap.ipip_entry = ipip_entry; + fib_entry->decap.tunnel_index = tunnel_index; + return 0; +} + +static void mlxsw_sp_fib_entry_decap_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) +{ + /* Unlink this node from the IPIP entry that it's the decap entry of. 
*/ + fib_entry->decap.ipip_entry->decap_fib_entry = NULL; + fib_entry->decap.ipip_entry = NULL; + mlxsw_sp_kvdl_free(mlxsw_sp, fib_entry->decap.tunnel_index); +} + +static struct mlxsw_sp_fib_node * +mlxsw_sp_fib_node_lookup(struct mlxsw_sp_fib *fib, const void *addr, + size_t addr_len, unsigned char prefix_len); +static int mlxsw_sp_fib_entry_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry); + +static void +mlxsw_sp_ipip_entry_demote_decap(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + struct mlxsw_sp_fib_entry *fib_entry = ipip_entry->decap_fib_entry; + + mlxsw_sp_fib_entry_decap_fini(mlxsw_sp, fib_entry); + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP; + + mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry); +} + +static void +mlxsw_sp_ipip_entry_promote_decap(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + struct mlxsw_sp_fib_entry *decap_fib_entry) +{ + if (mlxsw_sp_fib_entry_decap_init(mlxsw_sp, decap_fib_entry, + ipip_entry)) + return; + decap_fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP; + + if (mlxsw_sp_fib_entry_update(mlxsw_sp, decap_fib_entry)) + mlxsw_sp_ipip_entry_demote_decap(mlxsw_sp, ipip_entry); +} + +/* Given an IPIP entry, find the corresponding decap route. 
*/ +static struct mlxsw_sp_fib_entry * +mlxsw_sp_ipip_entry_find_decap(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + static struct mlxsw_sp_fib_node *fib_node; + const struct mlxsw_sp_ipip_ops *ipip_ops; + struct mlxsw_sp_fib_entry *fib_entry; + unsigned char saddr_prefix_len; + union mlxsw_sp_l3addr saddr; + struct mlxsw_sp_fib *ul_fib; + struct mlxsw_sp_vr *ul_vr; + const void *saddrp; + size_t saddr_len; + u32 ul_tb_id; + u32 saddr4; + + ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt]; + + ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(ipip_entry->ol_dev); + ul_vr = mlxsw_sp_vr_find(mlxsw_sp, ul_tb_id); + if (!ul_vr) + return NULL; + + ul_fib = mlxsw_sp_vr_fib(ul_vr, ipip_ops->ul_proto); + saddr = mlxsw_sp_ipip_netdev_saddr(ipip_ops->ul_proto, + ipip_entry->ol_dev); + + switch (ipip_ops->ul_proto) { + case MLXSW_SP_L3_PROTO_IPV4: + saddr4 = be32_to_cpu(saddr.addr4); + saddrp = &saddr4; + saddr_len = 4; + saddr_prefix_len = 32; + break; + case MLXSW_SP_L3_PROTO_IPV6: + WARN_ON(1); + return NULL; + } + + fib_node = mlxsw_sp_fib_node_lookup(ul_fib, saddrp, saddr_len, + saddr_prefix_len); + if (!fib_node || list_empty(&fib_node->entry_list)) + return NULL; + + fib_entry = list_first_entry(&fib_node->entry_list, + struct mlxsw_sp_fib_entry, list); + if (fib_entry->type != MLXSW_SP_FIB_ENTRY_TYPE_TRAP) + return NULL; + + return fib_entry; +} + +static struct mlxsw_sp_ipip_entry * +mlxsw_sp_ipip_entry_get(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_ipip_type ipipt, + struct net_device *ol_dev) +{ + u32 ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(ol_dev); + struct mlxsw_sp_router *router = mlxsw_sp->router; + struct mlxsw_sp_fib_entry *decap_fib_entry; + struct mlxsw_sp_ipip_entry *ipip_entry; + enum mlxsw_sp_l3proto ul_proto; + union mlxsw_sp_l3addr saddr; + + list_for_each_entry(ipip_entry, &mlxsw_sp->router->ipip_list, + ipip_list_node) { + if (ipip_entry->ol_dev == ol_dev) + goto inc_ref_count; + + /* The configuration where several tunnels 
have the same local + * address in the same underlay table needs special treatment in + * the HW. That is currently not implemented in the driver. + */ + ul_proto = router->ipip_ops_arr[ipip_entry->ipipt]->ul_proto; + saddr = mlxsw_sp_ipip_netdev_saddr(ul_proto, ol_dev); + if (mlxsw_sp_ipip_entry_saddr_matches(mlxsw_sp, ul_proto, saddr, + ul_tb_id, ipip_entry)) + return ERR_PTR(-EEXIST); + } + + ipip_entry = mlxsw_sp_ipip_entry_alloc(mlxsw_sp, ipipt, ol_dev); + if (IS_ERR(ipip_entry)) + return ipip_entry; + + decap_fib_entry = mlxsw_sp_ipip_entry_find_decap(mlxsw_sp, ipip_entry); + if (decap_fib_entry) + mlxsw_sp_ipip_entry_promote_decap(mlxsw_sp, ipip_entry, + decap_fib_entry); + + list_add_tail(&ipip_entry->ipip_list_node, + &mlxsw_sp->router->ipip_list); + +inc_ref_count: + ++ipip_entry->ref_count; + return ipip_entry; +} + +static void +mlxsw_sp_ipip_entry_put(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + if (--ipip_entry->ref_count == 0) { + list_del(&ipip_entry->ipip_list_node); + if (ipip_entry->decap_fib_entry) + mlxsw_sp_ipip_entry_demote_decap(mlxsw_sp, ipip_entry); + mlxsw_sp_ipip_entry_destroy(ipip_entry); + } +} + +static bool +mlxsw_sp_ipip_entry_matches_decap(struct mlxsw_sp *mlxsw_sp, + const struct net_device *ul_dev, + enum mlxsw_sp_l3proto ul_proto, + union mlxsw_sp_l3addr ul_dip, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + u32 ul_tb_id = l3mdev_fib_table(ul_dev) ? : RT_TABLE_MAIN; + enum mlxsw_sp_ipip_type ipipt = ipip_entry->ipipt; + struct net_device *ipip_ul_dev; + + if (mlxsw_sp->router->ipip_ops_arr[ipipt]->ul_proto != ul_proto) + return false; + + ipip_ul_dev = __mlxsw_sp_ipip_netdev_ul_dev_get(ipip_entry->ol_dev); + return mlxsw_sp_ipip_entry_saddr_matches(mlxsw_sp, ul_proto, ul_dip, + ul_tb_id, ipip_entry) && + (!ipip_ul_dev || ipip_ul_dev == ul_dev); +} + +/* Given decap parameters, find the corresponding IPIP entry. 
*/ +static struct mlxsw_sp_ipip_entry * +mlxsw_sp_ipip_entry_find_by_decap(struct mlxsw_sp *mlxsw_sp, + const struct net_device *ul_dev, + enum mlxsw_sp_l3proto ul_proto, + union mlxsw_sp_l3addr ul_dip) +{ + struct mlxsw_sp_ipip_entry *ipip_entry; + + list_for_each_entry(ipip_entry, &mlxsw_sp->router->ipip_list, + ipip_list_node) + if (mlxsw_sp_ipip_entry_matches_decap(mlxsw_sp, ul_dev, + ul_proto, ul_dip, + ipip_entry)) + return ipip_entry; + + return NULL; +} + struct mlxsw_sp_neigh_key { struct neighbour *n; }; @@ -1623,6 +2016,11 @@ static void mlxsw_sp_neigh_rif_gone_sync(struct mlxsw_sp *mlxsw_sp, } } +enum mlxsw_sp_nexthop_type { + MLXSW_SP_NEXTHOP_TYPE_ETH, + MLXSW_SP_NEXTHOP_TYPE_IPIP, +}; + struct mlxsw_sp_nexthop_key { struct fib_nh *fib_nh; }; @@ -1647,7 +2045,11 @@ struct mlxsw_sp_nexthop { update:1; /* set indicates that MAC of this neigh should be * updated in HW */ - struct mlxsw_sp_neigh_entry *neigh_entry; + enum mlxsw_sp_nexthop_type type; + union { + struct mlxsw_sp_neigh_entry *neigh_entry; + struct mlxsw_sp_ipip_entry *ipip_entry; + }; }; struct mlxsw_sp_nexthop_group { @@ -1928,15 +2330,26 @@ static int mlxsw_sp_nexthop_mac_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index, char ratr_pl[MLXSW_REG_RATR_LEN]; mlxsw_reg_ratr_pack(ratr_pl, MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY, - true, adj_index, neigh_entry->rif); + true, MLXSW_REG_RATR_TYPE_ETHERNET, + adj_index, neigh_entry->rif); mlxsw_reg_ratr_eth_entry_pack(ratr_pl, neigh_entry->ha); return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ratr), ratr_pl); } +static int mlxsw_sp_nexthop_ipip_update(struct mlxsw_sp *mlxsw_sp, + u32 adj_index, + struct mlxsw_sp_nexthop *nh) +{ + const struct mlxsw_sp_ipip_ops *ipip_ops; + + ipip_ops = mlxsw_sp->router->ipip_ops_arr[nh->ipip_entry->ipipt]; + return ipip_ops->nexthop_update(mlxsw_sp, adj_index, nh->ipip_entry); +} + static int -mlxsw_sp_nexthop_group_mac_update(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_nexthop_group *nh_grp, - bool reallocate) 
+mlxsw_sp_nexthop_group_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp, + bool reallocate) { u32 adj_index = nh_grp->adj_index; /* base */ struct mlxsw_sp_nexthop *nh; @@ -1952,8 +2365,16 @@ mlxsw_sp_nexthop_group_mac_update(struct mlxsw_sp *mlxsw_sp, } if (nh->update || reallocate) { - err = mlxsw_sp_nexthop_mac_update(mlxsw_sp, - adj_index, nh); + switch (nh->type) { + case MLXSW_SP_NEXTHOP_TYPE_ETH: + err = mlxsw_sp_nexthop_mac_update + (mlxsw_sp, adj_index, nh); + break; + case MLXSW_SP_NEXTHOP_TYPE_IPIP: + err = mlxsw_sp_nexthop_ipip_update + (mlxsw_sp, adj_index, nh); + break; + } if (err) return err; nh->update = 0; @@ -1964,9 +2385,6 @@ mlxsw_sp_nexthop_group_mac_update(struct mlxsw_sp *mlxsw_sp, return 0; } -static int mlxsw_sp_fib_entry_update(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_fib_entry *fib_entry); - static bool mlxsw_sp_fib_node_entry_is_first(const struct mlxsw_sp_fib_node *fib_node, const struct mlxsw_sp_fib_entry *fib_entry); @@ -2041,8 +2459,7 @@ mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp, /* Nothing was added or removed, so no need to reallocate. Just * update MAC on existing adjacency indexes. 
*/ - err = mlxsw_sp_nexthop_group_mac_update(mlxsw_sp, nh_grp, - false); + err = mlxsw_sp_nexthop_group_update(mlxsw_sp, nh_grp, false); if (err) { dev_warn(mlxsw_sp->bus_info->dev, "Failed to update neigh MAC in adjacency table.\n"); goto set_trap; @@ -2069,7 +2486,7 @@ mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp, nh_grp->adj_index_valid = 1; nh_grp->adj_index = adj_index; nh_grp->ecmp_size = ecmp_size; - err = mlxsw_sp_nexthop_group_mac_update(mlxsw_sp, nh_grp, true); + err = mlxsw_sp_nexthop_group_update(mlxsw_sp, nh_grp, true); if (err) { dev_warn(mlxsw_sp->bus_info->dev, "Failed to update neigh MAC in adjacency table.\n"); goto set_trap; @@ -2238,6 +2655,119 @@ static void mlxsw_sp_nexthop_neigh_fini(struct mlxsw_sp *mlxsw_sp, neigh_release(n); } +static bool mlxsw_sp_netdev_ipip_type(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *dev, + enum mlxsw_sp_ipip_type *p_type) +{ + struct mlxsw_sp_router *router = mlxsw_sp->router; + const struct mlxsw_sp_ipip_ops *ipip_ops; + enum mlxsw_sp_ipip_type ipipt; + + for (ipipt = 0; ipipt < MLXSW_SP_IPIP_TYPE_MAX; ++ipipt) { + ipip_ops = router->ipip_ops_arr[ipipt]; + if (dev->type == ipip_ops->dev_type) { + if (p_type) + *p_type = ipipt; + return true; + } + } + return false; +} + +static int mlxsw_sp_nexthop_ipip_init(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_ipip_type ipipt, + struct mlxsw_sp_nexthop *nh, + struct net_device *ol_dev) +{ + if (!nh->nh_grp->gateway || nh->ipip_entry) + return 0; + + nh->ipip_entry = mlxsw_sp_ipip_entry_get(mlxsw_sp, ipipt, ol_dev); + if (IS_ERR(nh->ipip_entry)) + return PTR_ERR(nh->ipip_entry); + + __mlxsw_sp_nexthop_neigh_update(nh, false); + return 0; +} + +static void mlxsw_sp_nexthop_ipip_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + struct mlxsw_sp_ipip_entry *ipip_entry = nh->ipip_entry; + + if (!ipip_entry) + return; + + __mlxsw_sp_nexthop_neigh_update(nh, true); + mlxsw_sp_ipip_entry_put(mlxsw_sp, ipip_entry); + nh->ipip_entry = 
NULL; +} + +static bool mlxsw_sp_nexthop4_ipip_type(const struct mlxsw_sp *mlxsw_sp, + const struct fib_nh *fib_nh, + enum mlxsw_sp_ipip_type *p_ipipt) +{ + struct net_device *dev = fib_nh->nh_dev; + + return dev && + fib_nh->nh_parent->fib_type == RTN_UNICAST && + mlxsw_sp_netdev_ipip_type(mlxsw_sp, dev, p_ipipt); +} + +static void mlxsw_sp_nexthop_type_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + switch (nh->type) { + case MLXSW_SP_NEXTHOP_TYPE_ETH: + mlxsw_sp_nexthop_neigh_fini(mlxsw_sp, nh); + mlxsw_sp_nexthop_rif_fini(nh); + break; + case MLXSW_SP_NEXTHOP_TYPE_IPIP: + mlxsw_sp_nexthop_ipip_fini(mlxsw_sp, nh); + break; + } +} + +static int mlxsw_sp_nexthop4_type_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh, + struct fib_nh *fib_nh) +{ + struct mlxsw_sp_router *router = mlxsw_sp->router; + struct net_device *dev = fib_nh->nh_dev; + enum mlxsw_sp_ipip_type ipipt; + struct mlxsw_sp_rif *rif; + int err; + + if (mlxsw_sp_nexthop4_ipip_type(mlxsw_sp, fib_nh, &ipipt) && + router->ipip_ops_arr[ipipt]->can_offload(mlxsw_sp, dev, + MLXSW_SP_L3_PROTO_IPV4)) { + nh->type = MLXSW_SP_NEXTHOP_TYPE_IPIP; + return mlxsw_sp_nexthop_ipip_init(mlxsw_sp, ipipt, nh, dev); + } + + nh->type = MLXSW_SP_NEXTHOP_TYPE_ETH; + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); + if (!rif) + return 0; + + mlxsw_sp_nexthop_rif_init(nh, rif); + err = mlxsw_sp_nexthop_neigh_init(mlxsw_sp, nh); + if (err) + goto err_neigh_init; + + return 0; + +err_neigh_init: + mlxsw_sp_nexthop_rif_fini(nh); + return err; +} + +static void mlxsw_sp_nexthop4_type_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + mlxsw_sp_nexthop_type_fini(mlxsw_sp, nh); +} + static int mlxsw_sp_nexthop4_init(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop_group *nh_grp, struct mlxsw_sp_nexthop *nh, @@ -2245,7 +2775,6 @@ static int mlxsw_sp_nexthop4_init(struct mlxsw_sp *mlxsw_sp, { struct net_device *dev = fib_nh->nh_dev; struct in_device *in_dev; - struct mlxsw_sp_rif 
*rif; int err; nh->nh_grp = nh_grp; @@ -2263,19 +2792,13 @@ static int mlxsw_sp_nexthop4_init(struct mlxsw_sp *mlxsw_sp, fib_nh->nh_flags & RTNH_F_LINKDOWN) return 0; - rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); - if (!rif) - return 0; - mlxsw_sp_nexthop_rif_init(nh, rif); - - err = mlxsw_sp_nexthop_neigh_init(mlxsw_sp, nh); + err = mlxsw_sp_nexthop4_type_init(mlxsw_sp, nh, fib_nh); if (err) goto err_nexthop_neigh_init; return 0; err_nexthop_neigh_init: - mlxsw_sp_nexthop_rif_fini(nh); mlxsw_sp_nexthop_remove(mlxsw_sp, nh); return err; } @@ -2283,8 +2806,7 @@ err_nexthop_neigh_init: static void mlxsw_sp_nexthop4_fini(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop *nh) { - mlxsw_sp_nexthop_neigh_fini(mlxsw_sp, nh); - mlxsw_sp_nexthop_rif_fini(nh); + mlxsw_sp_nexthop4_type_fini(mlxsw_sp, nh); mlxsw_sp_nexthop_remove(mlxsw_sp, nh); } @@ -2293,7 +2815,6 @@ static void mlxsw_sp_nexthop4_event(struct mlxsw_sp *mlxsw_sp, { struct mlxsw_sp_nexthop_key key; struct mlxsw_sp_nexthop *nh; - struct mlxsw_sp_rif *rif; if (mlxsw_sp->router->aborted) return; @@ -2303,18 +2824,12 @@ static void mlxsw_sp_nexthop4_event(struct mlxsw_sp *mlxsw_sp, if (WARN_ON_ONCE(!nh)) return; - rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, fib_nh->nh_dev); - if (!rif) - return; - switch (event) { case FIB_EVENT_NH_ADD: - mlxsw_sp_nexthop_rif_init(nh, rif); - mlxsw_sp_nexthop_neigh_init(mlxsw_sp, nh); + mlxsw_sp_nexthop4_type_init(mlxsw_sp, nh, fib_nh); break; case FIB_EVENT_NH_DEL: - mlxsw_sp_nexthop_neigh_fini(mlxsw_sp, nh); - mlxsw_sp_nexthop_rif_fini(nh); + mlxsw_sp_nexthop4_type_fini(mlxsw_sp, nh); break; } @@ -2327,12 +2842,18 @@ static void mlxsw_sp_nexthop_rif_gone_sync(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop *nh, *tmp; list_for_each_entry_safe(nh, tmp, &rif->nexthop_list, rif_list_node) { - mlxsw_sp_nexthop_neigh_fini(mlxsw_sp, nh); - mlxsw_sp_nexthop_rif_fini(nh); + mlxsw_sp_nexthop_type_fini(mlxsw_sp, nh); mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh->nh_grp); } } +static bool 
mlxsw_sp_fi_is_gateway(const struct mlxsw_sp *mlxsw_sp, + const struct fib_info *fi) +{ + return fi->fib_nh->nh_scope == RT_SCOPE_LINK || + mlxsw_sp_nexthop4_ipip_type(mlxsw_sp, fi->fib_nh, NULL); +} + static struct mlxsw_sp_nexthop_group * mlxsw_sp_nexthop4_group_create(struct mlxsw_sp *mlxsw_sp, struct fib_info *fi) { @@ -2352,7 +2873,7 @@ mlxsw_sp_nexthop4_group_create(struct mlxsw_sp *mlxsw_sp, struct fib_info *fi) INIT_LIST_HEAD(&nh_grp->fib_list); nh_grp->neigh_tbl = &arp_tbl; - nh_grp->gateway = fi->fib_nh->nh_scope == RT_SCOPE_LINK; + nh_grp->gateway = mlxsw_sp_fi_is_gateway(mlxsw_sp, fi); nh_grp->count = fi->fib_nhs; fib_info_hold(fi); for (i = 0; i < nh_grp->count; i++) { @@ -2454,6 +2975,8 @@ mlxsw_sp_fib_entry_should_offload(const struct mlxsw_sp_fib_entry *fib_entry) return !!nh_group->adj_index_valid; case MLXSW_SP_FIB_ENTRY_TYPE_LOCAL: return !!nh_group->nh_rif; + case MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP: + return true; default: return false; } @@ -2485,7 +3008,8 @@ mlxsw_sp_fib4_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; int i; - if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL) { + if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL || + fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP) { nh_grp->nexthops->key.fib_nh->nh_flags |= RTNH_F_OFFLOAD; return; } @@ -2690,6 +3214,22 @@ static int mlxsw_sp_fib_entry_op_trap(struct mlxsw_sp *mlxsw_sp, return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl); } +static int +mlxsw_sp_fib_entry_op_ipip_decap(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + enum mlxsw_reg_ralue_op op) +{ + struct mlxsw_sp_ipip_entry *ipip_entry = fib_entry->decap.ipip_entry; + const struct mlxsw_sp_ipip_ops *ipip_ops; + + if (WARN_ON(!ipip_entry)) + return -EINVAL; + + ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt]; + return ipip_ops->fib_entry_op(mlxsw_sp, ipip_entry, op, + 
fib_entry->decap.tunnel_index); +} + static int __mlxsw_sp_fib_entry_op(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib_entry *fib_entry, enum mlxsw_reg_ralue_op op) @@ -2701,6 +3241,9 @@ static int __mlxsw_sp_fib_entry_op(struct mlxsw_sp *mlxsw_sp, return mlxsw_sp_fib_entry_op_local(mlxsw_sp, fib_entry, op); case MLXSW_SP_FIB_ENTRY_TYPE_TRAP: return mlxsw_sp_fib_entry_op_trap(mlxsw_sp, fib_entry, op); + case MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP: + return mlxsw_sp_fib_entry_op_ipip_decap(mlxsw_sp, + fib_entry, op); } return -EINVAL; } @@ -2735,11 +3278,23 @@ mlxsw_sp_fib4_entry_type_set(struct mlxsw_sp *mlxsw_sp, const struct fib_entry_notifier_info *fen_info, struct mlxsw_sp_fib_entry *fib_entry) { + union mlxsw_sp_l3addr dip = { .addr4 = htonl(fen_info->dst) }; + struct net_device *dev = fen_info->fi->fib_dev; + struct mlxsw_sp_ipip_entry *ipip_entry; struct fib_info *fi = fen_info->fi; switch (fen_info->type) { - case RTN_BROADCAST: /* fall through */ case RTN_LOCAL: + ipip_entry = mlxsw_sp_ipip_entry_find_by_decap(mlxsw_sp, dev, + MLXSW_SP_L3_PROTO_IPV4, dip); + if (ipip_entry) { + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP; + return mlxsw_sp_fib_entry_decap_init(mlxsw_sp, + fib_entry, + ipip_entry); + } + /* fall through */ + case RTN_BROADCAST: fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP; return 0; case RTN_UNREACHABLE: /* fall through */ @@ -2752,10 +3307,10 @@ mlxsw_sp_fib4_entry_type_set(struct mlxsw_sp *mlxsw_sp, fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL; return 0; case RTN_UNICAST: - if (fi->fib_nh->nh_scope != RT_SCOPE_LINK) - fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL; - else + if (mlxsw_sp_fi_is_gateway(mlxsw_sp, fi)) fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_REMOTE; + else + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL; return 0; default: return -EINVAL; @@ -2806,10 +3361,6 @@ static void mlxsw_sp_fib4_entry_destroy(struct mlxsw_sp *mlxsw_sp, kfree(fib4_entry); } -static struct mlxsw_sp_fib_node * 
-mlxsw_sp_fib_node_lookup(struct mlxsw_sp_fib *fib, const void *addr, - size_t addr_len, unsigned char prefix_len); - static struct mlxsw_sp_fib4_entry * mlxsw_sp_fib4_entry_lookup(struct mlxsw_sp *mlxsw_sp, const struct fib_entry_notifier_info *fen_info) @@ -3232,6 +3783,9 @@ mlxsw_sp_fib4_node_entry_unlink(struct mlxsw_sp *mlxsw_sp, { mlxsw_sp_fib_node_entry_del(mlxsw_sp, &fib4_entry->common); mlxsw_sp_fib4_node_list_remove(fib4_entry); + + if (fib4_entry->common.type == MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP) + mlxsw_sp_fib_entry_decap_fini(mlxsw_sp, &fib4_entry->common); } static void mlxsw_sp_fib4_entry_replace(struct mlxsw_sp *mlxsw_sp, @@ -3432,22 +3986,33 @@ mlxsw_sp_fib6_entry_rt_find(const struct mlxsw_sp_fib6_entry *fib6_entry, return NULL; } -static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_nexthop_group *nh_grp, - struct mlxsw_sp_nexthop *nh, - const struct rt6_info *rt) +static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp, + const struct rt6_info *rt, + enum mlxsw_sp_ipip_type *ret) { + return rt->dst.dev && + mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->dst.dev, ret); +} + +static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp, + struct mlxsw_sp_nexthop *nh, + const struct rt6_info *rt) +{ + struct mlxsw_sp_router *router = mlxsw_sp->router; struct net_device *dev = rt->dst.dev; + enum mlxsw_sp_ipip_type ipipt; struct mlxsw_sp_rif *rif; int err; - nh->nh_grp = nh_grp; - memcpy(&nh->gw_addr, &rt->rt6i_gateway, sizeof(nh->gw_addr)); - - if (!dev) - return 0; - nh->ifindex = dev->ifindex; + if (mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, &ipipt) && + router->ipip_ops_arr[ipipt]->can_offload(mlxsw_sp, dev, + MLXSW_SP_L3_PROTO_IPV6)) { + nh->type = MLXSW_SP_NEXTHOP_TYPE_IPIP; + return mlxsw_sp_nexthop_ipip_init(mlxsw_sp, ipipt, nh, dev); + } + nh->type = MLXSW_SP_NEXTHOP_TYPE_ETH; rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); if (!rif) return 0; @@ -3464,11 
+4029,40 @@ err_nexthop_neigh_init: return err; } +static void mlxsw_sp_nexthop6_type_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + mlxsw_sp_nexthop_type_fini(mlxsw_sp, nh); +} + +static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp, + struct mlxsw_sp_nexthop *nh, + const struct rt6_info *rt) +{ + struct net_device *dev = rt->dst.dev; + + nh->nh_grp = nh_grp; + memcpy(&nh->gw_addr, &rt->rt6i_gateway, sizeof(nh->gw_addr)); + + if (!dev) + return 0; + nh->ifindex = dev->ifindex; + + return mlxsw_sp_nexthop6_type_init(mlxsw_sp, nh_grp, nh, rt); +} + static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop *nh) { - mlxsw_sp_nexthop_neigh_fini(mlxsw_sp, nh); - mlxsw_sp_nexthop_rif_fini(nh); + mlxsw_sp_nexthop6_type_fini(mlxsw_sp, nh); +} + +static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp, + const struct rt6_info *rt) +{ + return rt->rt6i_flags & RTF_GATEWAY || + mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL); } static struct mlxsw_sp_nexthop_group * @@ -3493,7 +4087,7 @@ mlxsw_sp_nexthop6_group_create(struct mlxsw_sp *mlxsw_sp, #endif mlxsw_sp_rt6 = list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6, list); - nh_grp->gateway = !!(mlxsw_sp_rt6->rt->rt6i_flags & RTF_GATEWAY); + nh_grp->gateway = mlxsw_sp_rt6_is_gateway(mlxsw_sp, mlxsw_sp_rt6->rt); nh_grp->count = fib6_entry->nrt6; for (i = 0; i < nh_grp->count; i++) { struct rt6_info *rt = mlxsw_sp_rt6->rt; @@ -3650,7 +4244,8 @@ mlxsw_sp_fib6_entry_nexthop_del(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_rt6_destroy(mlxsw_sp_rt6); } -static void mlxsw_sp_fib6_entry_type_set(struct mlxsw_sp_fib_entry *fib_entry, +static void mlxsw_sp_fib6_entry_type_set(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, const struct rt6_info *rt) { /* Packets hitting RTF_REJECT routes need to be discarded by the @@ -3663,7 +4258,7 @@ static void mlxsw_sp_fib6_entry_type_set(struct mlxsw_sp_fib_entry 
*fib_entry, fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP; else if (rt->rt6i_flags & RTF_REJECT) fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL; - else if (rt->rt6i_flags & RTF_GATEWAY) + else if (mlxsw_sp_rt6_is_gateway(mlxsw_sp, rt)) fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_REMOTE; else fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL; @@ -3703,7 +4298,7 @@ mlxsw_sp_fib6_entry_create(struct mlxsw_sp *mlxsw_sp, goto err_rt6_create; } - mlxsw_sp_fib6_entry_type_set(fib_entry, mlxsw_sp_rt6->rt); + mlxsw_sp_fib6_entry_type_set(mlxsw_sp, fib_entry, mlxsw_sp_rt6->rt); INIT_LIST_HEAD(&fib6_entry->rt6_list); list_add_tail(&mlxsw_sp_rt6->list, &fib6_entry->rt6_list); @@ -4376,7 +4971,10 @@ mlxsw_sp_dev_rif_type(const struct mlxsw_sp *mlxsw_sp, { enum mlxsw_sp_fid_type type; - /* RIF type is derived from the type of the underlying FID */ + if (mlxsw_sp_netdev_ipip_type(mlxsw_sp, dev, NULL)) + return MLXSW_SP_RIF_TYPE_IPIP_LB; + + /* Otherwise RIF type is derived from the type of the underlying FID. 
*/ if (is_vlan_dev(dev) && netif_is_bridge_master(vlan_dev_real_dev(dev))) type = MLXSW_SP_FID_TYPE_8021Q; else if (netif_is_bridge_master(dev) && br_vlan_enabled(dev)) @@ -4435,6 +5033,16 @@ u16 mlxsw_sp_rif_index(const struct mlxsw_sp_rif *rif) return rif->rif_index; } +u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *lb_rif) +{ + return lb_rif->common.rif_index; +} + +u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *lb_rif) +{ + return lb_rif->ul_vr_id; +} + int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif) { return rif->dev->ifindex; @@ -4446,9 +5054,9 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp, { u32 tb_id = l3mdev_fib_table(params->dev); const struct mlxsw_sp_rif_ops *ops; + struct mlxsw_sp_fid *fid = NULL; enum mlxsw_sp_rif_type type; struct mlxsw_sp_rif *rif; - struct mlxsw_sp_fid *fid; struct mlxsw_sp_vr *vr; u16 rif_index; int err; @@ -4472,12 +5080,14 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp, rif->mlxsw_sp = mlxsw_sp; rif->ops = ops; - fid = ops->fid_get(rif); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - goto err_fid_get; + if (ops->fid_get) { + fid = ops->fid_get(rif); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + goto err_fid_get; + } + rif->fid = fid; } - rif->fid = fid; if (ops->setup) ops->setup(rif, params); @@ -4486,22 +5096,15 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp, if (err) goto err_configure; - err = mlxsw_sp_rif_fdb_op(mlxsw_sp, params->dev->dev_addr, - mlxsw_sp_fid_index(fid), true); - if (err) - goto err_rif_fdb_op; - mlxsw_sp_rif_counters_alloc(rif); - mlxsw_sp_fid_rif_set(fid, rif); mlxsw_sp->router->rifs[rif_index] = rif; vr->rif_count++; return rif; -err_rif_fdb_op: - ops->deconfigure(rif); err_configure: - mlxsw_sp_fid_put(fid); + if (fid) + mlxsw_sp_fid_put(fid); err_fid_get: kfree(rif); err_rif_alloc: @@ -4522,12 +5125,11 @@ void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif) vr->rif_count--; mlxsw_sp->router->rifs[rif->rif_index] = NULL; - mlxsw_sp_fid_rif_set(fid, 
NULL); mlxsw_sp_rif_counters_free(rif); - mlxsw_sp_rif_fdb_op(mlxsw_sp, rif->dev->dev_addr, - mlxsw_sp_fid_index(fid), false); ops->deconfigure(rif); - mlxsw_sp_fid_put(fid); + if (fid) + /* Loopback RIFs are not associated with a FID. */ + mlxsw_sp_fid_put(fid); kfree(rif); mlxsw_sp_vr_put(vr); } @@ -4955,8 +5557,8 @@ static int mlxsw_sp_rif_subport_op(struct mlxsw_sp_rif *rif, bool enable) rif_subport = mlxsw_sp_rif_subport_rif(rif); mlxsw_reg_ritr_pack(ritr_pl, enable, MLXSW_REG_RITR_SP_IF, - rif->rif_index, rif->vr_id, rif->dev->mtu, - rif->dev->dev_addr); + rif->rif_index, rif->vr_id, rif->dev->mtu); + mlxsw_reg_ritr_mac_pack(ritr_pl, rif->dev->dev_addr); mlxsw_reg_ritr_sp_if_pack(ritr_pl, rif_subport->lag, rif_subport->lag ? rif_subport->lag_id : rif_subport->system_port, @@ -4967,11 +5569,32 @@ static int mlxsw_sp_rif_subport_op(struct mlxsw_sp_rif *rif, bool enable) static int mlxsw_sp_rif_subport_configure(struct mlxsw_sp_rif *rif) { - return mlxsw_sp_rif_subport_op(rif, true); + int err; + + err = mlxsw_sp_rif_subport_op(rif, true); + if (err) + return err; + + err = mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr, + mlxsw_sp_fid_index(rif->fid), true); + if (err) + goto err_rif_fdb_op; + + mlxsw_sp_fid_rif_set(rif->fid, rif); + return 0; + +err_rif_fdb_op: + mlxsw_sp_rif_subport_op(rif, false); + return err; } static void mlxsw_sp_rif_subport_deconfigure(struct mlxsw_sp_rif *rif) { + struct mlxsw_sp_fid *fid = rif->fid; + + mlxsw_sp_fid_rif_set(fid, NULL); + mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr, + mlxsw_sp_fid_index(fid), false); mlxsw_sp_rif_subport_op(rif, false); } @@ -4998,7 +5621,8 @@ static int mlxsw_sp_rif_vlan_fid_op(struct mlxsw_sp_rif *rif, char ritr_pl[MLXSW_REG_RITR_LEN]; mlxsw_reg_ritr_pack(ritr_pl, enable, type, rif->rif_index, rif->vr_id, - rif->dev->mtu, rif->dev->dev_addr); + rif->dev->mtu); + mlxsw_reg_ritr_mac_pack(ritr_pl, rif->dev->dev_addr); mlxsw_reg_ritr_fid_set(ritr_pl, type, vid_fid); return 
mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl); @@ -5029,8 +5653,17 @@ static int mlxsw_sp_rif_vlan_configure(struct mlxsw_sp_rif *rif) if (err) goto err_fid_bc_flood_set; + err = mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr, + mlxsw_sp_fid_index(rif->fid), true); + if (err) + goto err_rif_fdb_op; + + mlxsw_sp_fid_rif_set(rif->fid, rif); return 0; +err_rif_fdb_op: + mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC, + mlxsw_sp_router_port(mlxsw_sp), false); err_fid_bc_flood_set: mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC, mlxsw_sp_router_port(mlxsw_sp), false); @@ -5041,9 +5674,13 @@ err_fid_mc_flood_set: static void mlxsw_sp_rif_vlan_deconfigure(struct mlxsw_sp_rif *rif) { - struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; u16 vid = mlxsw_sp_fid_8021q_vid(rif->fid); + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + struct mlxsw_sp_fid *fid = rif->fid; + mlxsw_sp_fid_rif_set(fid, NULL); + mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr, + mlxsw_sp_fid_index(fid), false); mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC, mlxsw_sp_router_port(mlxsw_sp), false); mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC, @@ -5088,8 +5725,17 @@ static int mlxsw_sp_rif_fid_configure(struct mlxsw_sp_rif *rif) if (err) goto err_fid_bc_flood_set; + err = mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr, + mlxsw_sp_fid_index(rif->fid), true); + if (err) + goto err_rif_fdb_op; + + mlxsw_sp_fid_rif_set(rif->fid, rif); return 0; +err_rif_fdb_op: + mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC, + mlxsw_sp_router_port(mlxsw_sp), false); err_fid_bc_flood_set: mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC, mlxsw_sp_router_port(mlxsw_sp), false); @@ -5100,9 +5746,13 @@ err_fid_mc_flood_set: static void mlxsw_sp_rif_fid_deconfigure(struct mlxsw_sp_rif *rif) { - struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; u16 fid_index = mlxsw_sp_fid_index(rif->fid); + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + struct mlxsw_sp_fid *fid 
= rif->fid; + mlxsw_sp_fid_rif_set(fid, NULL); + mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr, + mlxsw_sp_fid_index(fid), false); mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC, mlxsw_sp_router_port(mlxsw_sp), false); mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC, @@ -5124,10 +5774,104 @@ static const struct mlxsw_sp_rif_ops mlxsw_sp_rif_fid_ops = { .fid_get = mlxsw_sp_rif_fid_fid_get, }; +static struct mlxsw_sp_rif_ipip_lb * +mlxsw_sp_rif_ipip_lb_rif(struct mlxsw_sp_rif *rif) +{ + return container_of(rif, struct mlxsw_sp_rif_ipip_lb, common); +} + +static void +mlxsw_sp_rif_ipip_lb_setup(struct mlxsw_sp_rif *rif, + const struct mlxsw_sp_rif_params *params) +{ + struct mlxsw_sp_rif_params_ipip_lb *params_lb; + struct mlxsw_sp_rif_ipip_lb *rif_lb; + + params_lb = container_of(params, struct mlxsw_sp_rif_params_ipip_lb, + common); + rif_lb = mlxsw_sp_rif_ipip_lb_rif(rif); + rif_lb->lb_config = params_lb->lb_config; +} + +static int +mlxsw_sp_rif_ipip_lb_op(struct mlxsw_sp_rif_ipip_lb *lb_rif, + struct mlxsw_sp_vr *ul_vr, bool enable) +{ + struct mlxsw_sp_rif_ipip_lb_config lb_cf = lb_rif->lb_config; + struct mlxsw_sp_rif *rif = &lb_rif->common; + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + char ritr_pl[MLXSW_REG_RITR_LEN]; + u32 saddr4; + + switch (lb_cf.ul_protocol) { + case MLXSW_SP_L3_PROTO_IPV4: + saddr4 = be32_to_cpu(lb_cf.saddr.addr4); + mlxsw_reg_ritr_pack(ritr_pl, enable, MLXSW_REG_RITR_LOOPBACK_IF, + rif->rif_index, rif->vr_id, rif->dev->mtu); + mlxsw_reg_ritr_loopback_ipip4_pack(ritr_pl, lb_cf.lb_ipipt, + MLXSW_REG_RITR_LOOPBACK_IPIP_OPTIONS_GRE_KEY_PRESET, + ul_vr->id, saddr4, lb_cf.okey); + break; + + case MLXSW_SP_L3_PROTO_IPV6: + return -EAFNOSUPPORT; + } + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl); +} + +static int +mlxsw_sp_rif_ipip_lb_configure(struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp_rif_ipip_lb *lb_rif = mlxsw_sp_rif_ipip_lb_rif(rif); + u32 ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(rif->dev); 
+ struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + struct mlxsw_sp_vr *ul_vr; + int err; + + ul_vr = mlxsw_sp_vr_get(mlxsw_sp, ul_tb_id); + if (IS_ERR(ul_vr)) + return PTR_ERR(ul_vr); + + err = mlxsw_sp_rif_ipip_lb_op(lb_rif, ul_vr, true); + if (err) + goto err_loopback_op; + + lb_rif->ul_vr_id = ul_vr->id; + ++ul_vr->rif_count; + return 0; + +err_loopback_op: + mlxsw_sp_vr_put(ul_vr); + return err; +} + +static void mlxsw_sp_rif_ipip_lb_deconfigure(struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp_rif_ipip_lb *lb_rif = mlxsw_sp_rif_ipip_lb_rif(rif); + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + struct mlxsw_sp_vr *ul_vr; + + ul_vr = &mlxsw_sp->router->vrs[lb_rif->ul_vr_id]; + mlxsw_sp_rif_ipip_lb_op(lb_rif, ul_vr, false); + + --ul_vr->rif_count; + mlxsw_sp_vr_put(ul_vr); +} + +static const struct mlxsw_sp_rif_ops mlxsw_sp_rif_ipip_lb_ops = { + .type = MLXSW_SP_RIF_TYPE_IPIP_LB, + .rif_size = sizeof(struct mlxsw_sp_rif_ipip_lb), + .setup = mlxsw_sp_rif_ipip_lb_setup, + .configure = mlxsw_sp_rif_ipip_lb_configure, + .deconfigure = mlxsw_sp_rif_ipip_lb_deconfigure, +}; + static const struct mlxsw_sp_rif_ops *mlxsw_sp_rif_ops_arr[] = { [MLXSW_SP_RIF_TYPE_SUBPORT] = &mlxsw_sp_rif_subport_ops, [MLXSW_SP_RIF_TYPE_VLAN] = &mlxsw_sp_rif_vlan_ops, [MLXSW_SP_RIF_TYPE_FID] = &mlxsw_sp_rif_fid_ops, + [MLXSW_SP_RIF_TYPE_IPIP_LB] = &mlxsw_sp_rif_ipip_lb_ops, }; static int mlxsw_sp_rifs_init(struct mlxsw_sp *mlxsw_sp) @@ -5155,6 +5899,18 @@ static void mlxsw_sp_rifs_fini(struct mlxsw_sp *mlxsw_sp) kfree(mlxsw_sp->router->rifs); } +static int mlxsw_sp_ipips_init(struct mlxsw_sp *mlxsw_sp) +{ + mlxsw_sp->router->ipip_ops_arr = mlxsw_sp_ipip_ops_arr; + INIT_LIST_HEAD(&mlxsw_sp->router->ipip_list); + return 0; +} + +static void mlxsw_sp_ipips_fini(struct mlxsw_sp *mlxsw_sp) +{ + WARN_ON(!list_empty(&mlxsw_sp->router->ipip_list)); +} + static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb) { struct mlxsw_sp_router *router; @@ -5214,6 +5970,10 @@ int 
mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) if (err) goto err_rifs_init; + err = mlxsw_sp_ipips_init(mlxsw_sp); + if (err) + goto err_ipips_init; + err = rhashtable_init(&mlxsw_sp->router->nexthop_ht, &mlxsw_sp_nexthop_ht_params); if (err) @@ -5255,6 +6015,8 @@ err_lpm_init: err_nexthop_group_ht_init: rhashtable_destroy(&mlxsw_sp->router->nexthop_ht); err_nexthop_ht_init: + mlxsw_sp_ipips_fini(mlxsw_sp); +err_ipips_init: mlxsw_sp_rifs_fini(mlxsw_sp); err_rifs_init: __mlxsw_sp_router_fini(mlxsw_sp); @@ -5271,6 +6033,7 @@ void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) mlxsw_sp_lpm_fini(mlxsw_sp); rhashtable_destroy(&mlxsw_sp->router->nexthop_group_ht); rhashtable_destroy(&mlxsw_sp->router->nexthop_ht); + mlxsw_sp_ipips_fini(mlxsw_sp); mlxsw_sp_rifs_fini(mlxsw_sp); __mlxsw_sp_router_fini(mlxsw_sp); kfree(mlxsw_sp->router); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h index 87a04afee1385..345fcc4f38e9a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h @@ -36,6 +36,25 @@ #define _MLXSW_ROUTER_H_ #include "spectrum.h" +#include "reg.h" + +enum mlxsw_sp_l3proto { + MLXSW_SP_L3_PROTO_IPV4, + MLXSW_SP_L3_PROTO_IPV6, +}; + +union mlxsw_sp_l3addr { + __be32 addr4; + struct in6_addr addr6; +}; + +struct mlxsw_sp_rif_ipip_lb; +struct mlxsw_sp_rif_ipip_lb_config { + enum mlxsw_reg_ritr_loopback_ipip_type lb_ipipt; + u32 okey; + enum mlxsw_sp_l3proto ul_protocol; /* Underlay. 
*/ + union mlxsw_sp_l3addr saddr; +}; enum mlxsw_sp_rif_counter_dir { MLXSW_SP_RIF_COUNTER_INGRESS, @@ -47,6 +66,8 @@ struct mlxsw_sp_neigh_entry; struct mlxsw_sp_rif *mlxsw_sp_rif_by_index(const struct mlxsw_sp *mlxsw_sp, u16 rif_index); u16 mlxsw_sp_rif_index(const struct mlxsw_sp_rif *rif); +u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *rif); +u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *rif); int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif); int mlxsw_sp_rif_counter_value_get(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_rif *rif, @@ -79,5 +100,12 @@ mlxsw_sp_neigh_entry_counter_update(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_neigh_entry *neigh_entry, bool adding); bool mlxsw_sp_neigh_ipv6_ignore(struct mlxsw_sp_neigh_entry *neigh_entry); +union mlxsw_sp_l3addr +mlxsw_sp_ipip_netdev_saddr(enum mlxsw_sp_l3proto proto, + const struct net_device *ol_dev); +union mlxsw_sp_l3addr +mlxsw_sp_ipip_netdev_daddr(enum mlxsw_sp_l3proto proto, + const struct net_device *ol_dev); +__be32 mlxsw_sp_ipip_netdev_daddr4(const struct net_device *ol_dev); #endif /* _MLXSW_ROUTER_H_*/ diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h index 61652396bf75d..f396a1fef6335 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/trap.h +++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h @@ -85,6 +85,7 @@ enum { MLXSW_TRAP_ID_HOST_MISS_IPV4 = 0x90, MLXSW_TRAP_ID_IPV6_MC_LINK_LOCAL_DEST = 0x91, MLXSW_TRAP_ID_HOST_MISS_IPV6 = 0x92, + MLXSW_TRAP_ID_IPIP_DECAP_ERROR = 0xB1, MLXSW_TRAP_ID_ROUTER_ALERT_IPV4 = 0xD6, MLXSW_TRAP_ID_ROUTER_ALERT_IPV6 = 0xD7, MLXSW_TRAP_ID_ACL0 = 0x1C0, diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 58a9f990b5530..c74893c1e6200 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -15,7 +15,7 @@ struct ipvlan_netns { unsigned int ipvl_nf_hook_refcnt; }; -static struct nf_hook_ops ipvl_nfops[] 
__read_mostly = { +static const struct nf_hook_ops ipvl_nfops[] = { { .hook = ipvlan_nf_input, .pf = NFPROTO_IPV4, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8aba119bb005b..adacc45abec16 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1791,7 +1791,7 @@ struct net_device { #endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS - struct nf_hook_entry __rcu *nf_hooks_ingress; + struct nf_hook_entries __rcu *nf_hooks_ingress; #endif unsigned char broadcast[MAX_ADDR_LEN]; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 22f081065d496..f84bca1703cdc 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -72,25 +72,32 @@ struct nf_hook_ops { }; struct nf_hook_entry { - struct nf_hook_entry __rcu *next; nf_hookfn *hook; void *priv; - const struct nf_hook_ops *orig_ops; }; -static inline void -nf_hook_entry_init(struct nf_hook_entry *entry, const struct nf_hook_ops *ops) -{ - entry->next = NULL; - entry->hook = ops->hook; - entry->priv = ops->priv; - entry->orig_ops = ops; -} +struct nf_hook_entries { + u16 num_hook_entries; + /* padding */ + struct nf_hook_entry hooks[]; + + /* trailer: pointers to original orig_ops of each hook. + * + * This is not part of struct nf_hook_entry since its only + * needed in slow path (hook register/unregister). + * + * const struct nf_hook_ops *orig_ops[] + */ +}; -static inline int -nf_hook_entry_priority(const struct nf_hook_entry *entry) +static inline struct nf_hook_ops **nf_hook_entries_get_hook_ops(const struct nf_hook_entries *e) { - return entry->orig_ops->priority; + unsigned int n = e->num_hook_entries; + const void *hook_end; + + hook_end = &e->hooks[n]; /* this is *past* ->hooks[]! 
*/ + + return (struct nf_hook_ops **)hook_end; } static inline int @@ -100,12 +107,6 @@ nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb, return entry->hook(entry->priv, skb, state); } -static inline const struct nf_hook_ops * -nf_hook_entry_ops(const struct nf_hook_entry *entry) -{ - return entry->orig_ops; -} - static inline void nf_hook_state_init(struct nf_hook_state *p, unsigned int hook, u_int8_t pf, @@ -168,7 +169,7 @@ extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #endif int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_entry *entry); + const struct nf_hook_entries *e, unsigned int i); /** * nf_hook - call a netfilter hook @@ -182,7 +183,7 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { - struct nf_hook_entry *hook_head; + struct nf_hook_entries *hook_head; int ret = 1; #ifdef HAVE_JUMP_LABEL @@ -200,7 +201,7 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, nf_hook_state_init(&state, hook, pf, indev, outdev, sk, net, okfn); - ret = nf_hook_slow(skb, &state, hook_head); + ret = nf_hook_slow(skb, &state, hook_head, 0); } rcu_read_unlock(); diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h index 59476061de868..8d5dae1e2ff85 100644 --- a/include/linux/netfilter_ingress.h +++ b/include/linux/netfilter_ingress.h @@ -17,7 +17,7 @@ static inline bool nf_hook_ingress_active(const struct sk_buff *skb) /* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { - struct nf_hook_entry *e = rcu_dereference(skb->dev->nf_hooks_ingress); + struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); struct nf_hook_state state; int ret; @@ -30,7 +30,7 @@ static inline int nf_hook_ingress(struct sk_buff *skb) nf_hook_state_init(&state, 
NF_NETDEV_INGRESS, NFPROTO_NETDEV, skb->dev, NULL, NULL, dev_net(skb->dev), NULL); - ret = nf_hook_slow(skb, &state, e); + ret = nf_hook_slow(skb, &state, e, 0); if (ret == 0) return -1; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 48407569585da..6e6f678aaac71 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -224,6 +224,9 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq); +/* Set all unconfirmed conntrack as dying */ +void nf_ct_unconfirmed_destroy(struct net *); + /* Iterate over all conntracks: if iter returns true, it's deleted. */ void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 2ba54feaccd8d..818def0111101 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -107,6 +107,11 @@ void nf_ct_remove_expectations(struct nf_conn *ct); void nf_ct_unexpect_related(struct nf_conntrack_expect *exp); bool nf_ct_remove_expect(struct nf_conntrack_expect *exp); +void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data); +void nf_ct_expect_iterate_net(struct net *net, + bool (*iter)(struct nf_conntrack_expect *e, void *data), + void *data, u32 portid, int report); + /* Allocate space for an expectation: this is mandatory before calling nf_ct_expect_related. You will have to call put afterwards. */ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me); diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index 6d14b36e3a490..6269deecbee77 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -20,8 +20,8 @@ struct nf_conntrack_l3proto { /* L3 Protocol Family number. 
ex) PF_INET */ u_int16_t l3proto; - /* Protocol name */ - const char *name; + /* size of tuple nlattr, fills a hole */ + u16 nla_size; /* * Try to fill in the third arg: nhoff is offset of l3 proto @@ -37,10 +37,6 @@ struct nf_conntrack_l3proto { bool (*invert_tuple)(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig); - /* Print out the per-protocol part of the tuple. */ - void (*print_tuple)(struct seq_file *s, - const struct nf_conntrack_tuple *); - /* * Called before tracking. * *dataoff: offset of protocol header (TCP, UDP,...) in skb @@ -49,23 +45,17 @@ struct nf_conntrack_l3proto { int (*get_l4proto)(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum); +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) int (*tuple_to_nlattr)(struct sk_buff *skb, const struct nf_conntrack_tuple *t); - - /* Called when netns wants to use connection tracking */ - int (*net_ns_get)(struct net *); - void (*net_ns_put)(struct net *); - - /* - * Calculate size of tuple nlattr - */ - int (*nlattr_tuple_size)(void); - int (*nlattr_to_tuple)(struct nlattr *tb[], struct nf_conntrack_tuple *t); const struct nla_policy *nla_policy; +#endif - size_t nla_size; + /* Called when netns wants to use connection tracking */ + int (*net_ns_get)(struct net *); + void (*net_ns_put)(struct net *); /* Module (if any) which this is connected to. */ struct module *me; @@ -73,26 +63,11 @@ struct nf_conntrack_l3proto { extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO]; -#ifdef CONFIG_SYSCTL -/* Protocol pernet registration. */ -int nf_ct_l3proto_pernet_register(struct net *net, - struct nf_conntrack_l3proto *proto); -#else -static inline int nf_ct_l3proto_pernet_register(struct net *n, - struct nf_conntrack_l3proto *p) -{ - return 0; -} -#endif - -void nf_ct_l3proto_pernet_unregister(struct net *net, - struct nf_conntrack_l3proto *proto); - /* Protocol global registration. 
*/ -int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto); -void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto); +int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto); +void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto); -struct nf_conntrack_l3proto *nf_ct_l3proto_find_get(u_int16_t l3proto); +const struct nf_conntrack_l3proto *nf_ct_l3proto_find_get(u_int16_t l3proto); /* Existing built-in protocols */ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_generic; diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 7032e044bbe2a..d4933d56809d9 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -61,13 +61,6 @@ struct nf_conntrack_l4proto { /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); - /* Print out the per-protocol part of the tuple. Return like seq_* */ - void (*print_tuple)(struct seq_file *s, - const struct nf_conntrack_tuple *); - - /* Print out the private part of the conntrack. */ - void (*print_conntrack)(struct seq_file *s, struct nf_conn *); - /* Return the array of timeouts for this protocol. */ unsigned int *(*get_timeouts)(struct net *net); @@ -92,15 +85,19 @@ struct nf_conntrack_l4proto { #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) struct { - size_t obj_size; int (*nlattr_to_obj)(struct nlattr *tb[], struct net *net, void *data); int (*obj_to_nlattr)(struct sk_buff *skb, const void *data); - unsigned int nlattr_max; + u16 obj_size; + u16 nlattr_max; const struct nla_policy *nla_policy; } ctnl_timeout; #endif +#ifdef CONFIG_NF_CONNTRACK_PROCFS + /* Print out the private part of the conntrack. 
*/ + void (*print_conntrack)(struct seq_file *s, struct nf_conn *); +#endif unsigned int *net_id; /* Init l4proto pernet data */ int (*init_net)(struct net *net, u_int16_t proto); @@ -108,9 +105,6 @@ struct nf_conntrack_l4proto { /* Return the per-net protocol part. */ struct nf_proto_net *(*get_net_proto)(struct net *net); - /* Protocol name */ - const char *name; - /* Module (if any) which this is connected to. */ struct module *me; }; @@ -120,28 +114,28 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; #define MAX_NF_CT_PROTO 256 -struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u_int16_t l3proto, +const struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto); -struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u_int16_t l3proto, +const struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u_int16_t l3proto, u_int8_t l4proto); -void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p); +void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p); /* Protocol pernet registration. */ int nf_ct_l4proto_pernet_register_one(struct net *net, - struct nf_conntrack_l4proto *proto); + const struct nf_conntrack_l4proto *proto); void nf_ct_l4proto_pernet_unregister_one(struct net *net, - struct nf_conntrack_l4proto *proto); + const struct nf_conntrack_l4proto *proto); int nf_ct_l4proto_pernet_register(struct net *net, - struct nf_conntrack_l4proto *proto[], + struct nf_conntrack_l4proto *const proto[], unsigned int num_proto); void nf_ct_l4proto_pernet_unregister(struct net *net, - struct nf_conntrack_l4proto *proto[], - unsigned int num_proto); + struct nf_conntrack_l4proto *const proto[], + unsigned int num_proto); /* Protocol global registration. 
*/ int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *proto); -void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *proto); +void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *proto); int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto[], unsigned int num_proto); void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto[], diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index d40b89355fdd3..483d104fa2541 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -16,7 +16,7 @@ struct ctnl_timeout { refcount_t refcnt; char name[CTNL_TIMEOUT_NAME_MAX]; __u16 l3num; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; char data[0]; }; @@ -68,7 +68,7 @@ struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct, static inline unsigned int * nf_ct_timeout_lookup(struct net *net, struct nf_conn *ct, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 4454719ff849f..39468720fc192 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -10,9 +10,9 @@ struct nf_queue_entry { struct list_head list; struct sk_buff *skb; unsigned int id; + unsigned int hook_index; /* index in hook_entries->hook[] */ struct nf_hook_state state; - struct nf_hook_entry *hook; u16 size; /* sizeof(entry) + saved route keys */ /* extra space to store route keys */ diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index bd5be0d691d51..f9795fe394f31 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -396,7 +396,7 @@ void nft_unregister_set(struct nft_set_type *type); struct nft_set { struct list_head list; 
struct list_head bindings; - char name[NFT_SET_MAXNAMELEN]; + char *name; u32 ktype; u32 dtype; u32 objtype; @@ -859,7 +859,7 @@ struct nft_chain { u16 level; u8 flags:6, genmask:2; - char name[NFT_CHAIN_MAXNAMELEN]; + char *name; }; enum nft_chain_type { @@ -957,7 +957,7 @@ struct nft_table { u32 use; u16 flags:14, genmask:2; - char name[NFT_TABLE_MAXNAMELEN]; + char *name; }; enum nft_af_flags { @@ -1016,7 +1016,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type, */ struct nft_object { struct list_head list; - char name[NFT_OBJ_MAXNAMELEN]; + char *name; struct nft_table *table; u32 genmask:2, use:30; @@ -1272,7 +1272,7 @@ struct nft_trans_set { struct nft_trans_chain { bool update; - char name[NFT_CHAIN_MAXNAMELEN]; + char *name; struct nft_stats __percpu *stats; u8 policy; }; diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 8f690effec373..424684c337719 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -49,6 +49,8 @@ struct nft_payload_set { }; extern const struct nft_expr_ops nft_payload_fast_ops; + +extern struct static_key_false nft_counters_enabled; extern struct static_key_false nft_trace_enabled; #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/include/net/netlink.h b/include/net/netlink.h index 82dd298b40c7c..e51cf5f815977 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -251,6 +251,7 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize); +char *nla_strdup(const struct nlattr *nla, gfp_t flags); int nla_memcpy(void *dest, const struct nlattr *src, int count); int nla_memcmp(const struct nlattr *nla, const void *data, size_t size); int nla_strcmp(const struct nlattr *nla, const char *str); diff --git 
a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index cea396b53a60d..72d66c8763d03 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -16,7 +16,7 @@ struct netns_nf { #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; #endif - struct nf_hook_entry __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; + struct nf_hook_entries __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) bool defrag_ipv4; #endif diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 683f6f88fcace..b49da72efa68c 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1,10 +1,11 @@ #ifndef _LINUX_NF_TABLES_H #define _LINUX_NF_TABLES_H -#define NFT_TABLE_MAXNAMELEN 32 -#define NFT_CHAIN_MAXNAMELEN 32 -#define NFT_SET_MAXNAMELEN 32 -#define NFT_OBJ_MAXNAMELEN 32 +#define NFT_NAME_MAXLEN 256 +#define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN +#define NFT_CHAIN_MAXNAMELEN NFT_NAME_MAXLEN +#define NFT_SET_MAXNAMELEN NFT_NAME_MAXLEN +#define NFT_OBJ_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_USERDATA_MAXLEN 256 /** @@ -731,7 +732,8 @@ enum nft_exthdr_op { * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32) * @NFTA_EXTHDR_LEN: extension header length (NLA_U32) * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32) - * @NFTA_EXTHDR_OP: option match type (NLA_U8) + * @NFTA_EXTHDR_OP: option match type (NLA_U32) + * @NFTA_EXTHDR_SREG: option match type (NLA_U32) */ enum nft_exthdr_attributes { NFTA_EXTHDR_UNSPEC, @@ -741,6 +743,7 @@ enum nft_exthdr_attributes { NFTA_EXTHDR_LEN, NFTA_EXTHDR_FLAGS, NFTA_EXTHDR_OP, + NFTA_EXTHDR_SREG, __NFTA_EXTHDR_MAX }; #define NFTA_EXTHDR_MAX (__NFTA_EXTHDR_MAX - 1) @@ -808,11 +811,13 @@ enum nft_meta_keys { * @NFT_RT_CLASSID: realm value of packet's route (skb->dst->tclassid) * @NFT_RT_NEXTHOP4: routing nexthop for IPv4 * @NFT_RT_NEXTHOP6: routing nexthop for IPv6 + * @NFT_RT_TCPMSS: fetch 
current path tcp mss */ enum nft_rt_keys { NFT_RT_CLASSID, NFT_RT_NEXTHOP4, NFT_RT_NEXTHOP6, + NFT_RT_TCPMSS, }; /** @@ -1221,6 +1226,8 @@ enum nft_objref_attributes { enum nft_gen_attributes { NFTA_GEN_UNSPEC, NFTA_GEN_ID, + NFTA_GEN_PROC_PID, + NFTA_GEN_PROC_NAME, __NFTA_GEN_MAX }; #define NFTA_GEN_MAX (__NFTA_GEN_MAX - 1) diff --git a/lib/nlattr.c b/lib/nlattr.c index ee79b7a3c6b0b..927c2f19f119e 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -302,6 +302,30 @@ size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize) EXPORT_SYMBOL(nla_strlcpy); /** + * nla_strdup - Copy string attribute payload into a newly allocated buffer + * @nla: attribute to copy the string from + * @flags: the type of memory to allocate (see kmalloc). + * + * Returns a pointer to the allocated buffer or NULL on error. + */ +char *nla_strdup(const struct nlattr *nla, gfp_t flags) +{ + size_t srclen = nla_len(nla); + char *src = nla_data(nla), *dst; + + if (srclen > 0 && src[srclen - 1] == '\0') + srclen--; + + dst = kmalloc(srclen + 1, flags); + if (dst != NULL) { + memcpy(dst, src, srclen); + dst[srclen] = '\0'; + } + return dst; +} +EXPORT_SYMBOL(nla_strdup); + +/** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy * @src: netlink attribute to copy from diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 2261e5194c82c..c2eea1b8737a1 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -887,7 +887,7 @@ EXPORT_SYMBOL_GPL(br_netfilter_enable); /* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because * br_dev_queue_push_xmit is called afterwards */ -static struct nf_hook_ops br_nf_ops[] __read_mostly = { +static const struct nf_hook_ops br_nf_ops[] = { { .hook = br_nf_pre_routing, .pf = NFPROTO_BRIDGE, @@ -985,22 +985,25 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { - struct 
nf_hook_entry *elem; + const struct nf_hook_entries *e; struct nf_hook_state state; + struct nf_hook_ops **ops; + unsigned int i; int ret; - for (elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); - elem && nf_hook_entry_priority(elem) <= NF_BR_PRI_BRNF; - elem = rcu_dereference(elem->next)) - ; - - if (!elem) + e = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); + if (!e) return okfn(net, sk, skb); + ops = nf_hook_entries_get_hook_ops(e); + for (i = 0; i < e->num_hook_entries && + ops[i]->priority <= NF_BR_PRI_BRNF; i++) + ; + nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); - ret = nf_hook_slow(skb, &state, elem); + ret = nf_hook_slow(skb, &state, e, i); if (ret == 1) ret = okfn(net, sk, skb); diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c index d06968bdf5ec7..2b46c50abce03 100644 --- a/net/bridge/netfilter/ebt_ip.c +++ b/net/bridge/netfilter/ebt_ip.c @@ -64,14 +64,14 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) if (NF_INVF(info, EBT_IP_DPORT, dst < info->dport[0] || dst > info->dport[1])) - return false; + return false; } if (info->bitmask & EBT_IP_SPORT) { u32 src = ntohs(pptr->src); if (NF_INVF(info, EBT_IP_SPORT, src < info->sport[0] || src > info->sport[1])) - return false; + return false; } } return true; diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c index 4617491be41e7..2a5a52a53ec43 100644 --- a/net/bridge/netfilter/ebt_ip6.c +++ b/net/bridge/netfilter/ebt_ip6.c @@ -89,7 +89,7 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) if (NF_INVF(info, EBT_IP6_SPORT, src < info->sport[0] || src > info->sport[1])) - return false; + return false; } if ((info->bitmask & EBT_IP6_ICMP6) && NF_INVF(info, EBT_IP6_ICMP6, diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index f22ef7c219137..45a00dbdbcad6 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ 
b/net/bridge/netfilter/ebtable_filter.c @@ -70,7 +70,7 @@ ebt_out_hook(void *priv, struct sk_buff *skb, return ebt_do_table(skb, state, state->net->xt.frame_filter); } -static struct nf_hook_ops ebt_ops_filter[] __read_mostly = { +static const struct nf_hook_ops ebt_ops_filter[] = { { .hook = ebt_in_hook, .pf = NFPROTO_BRIDGE, diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 30dedcb56adef..57cd5bb154e70 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -70,7 +70,7 @@ ebt_nat_out(void *priv, struct sk_buff *skb, return ebt_do_table(skb, state, state->net->xt.frame_nat); } -static struct nf_hook_ops ebt_ops_nat[] __read_mostly = { +static const struct nf_hook_ops ebt_ops_nat[] = { { .hook = ebt_nat_out, .pf = NFPROTO_BRIDGE, diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 9c6e619f452bc..54c7ef4e970e1 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1069,15 +1069,10 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, #ifdef CONFIG_AUDIT if (audit_enabled) { - struct audit_buffer *ab; - - ab = audit_log_start(current->audit_context, GFP_KERNEL, - AUDIT_NETFILTER_CFG); - if (ab) { - audit_log_format(ab, "table=%s family=%u entries=%u", - repl->name, AF_BRIDGE, repl->nentries); - audit_log_end(ab); - } + audit_log(current->audit_context, GFP_KERNEL, + AUDIT_NETFILTER_CFG, + "table=%s family=%u entries=%u", + repl->name, AF_BRIDGE, repl->nentries); } #endif return ret; diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index aa8ffecc46a43..ab395e55cd789 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -115,7 +115,7 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb) RCV_SKB_FAIL(-EINVAL); } -static struct nf_hook_ops dnrmg_ops __read_mostly = { +static const struct nf_hook_ops dnrmg_ops = { .hook = dnrmg_hook, .pf = 
NFPROTO_DECNET, .hooknum = NF_DN_ROUTE, diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 9e9d9afd18f74..e04457198f939 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1117,7 +1117,6 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, struct xt_table_info *newinfo, unsigned char *base) { struct xt_entry_target *t; - struct xt_target *target; struct arpt_entry *de; unsigned int origsize; int h; @@ -1132,7 +1131,6 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, de->target_offset = e->target_offset - (origsize - *size); t = compat_arpt_get_target(e); - target = t->u.kernel.target; xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 622ed2887cd56..ce1d97579ce8b 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -151,7 +151,7 @@ static const char *const comments[] = { [NF_IP_TRACE_COMMENT_POLICY] = "policy", }; -static struct nf_loginfo trace_loginfo = { +static const struct nf_loginfo trace_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { @@ -1356,7 +1356,6 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, struct xt_table_info *newinfo, unsigned char *base) { struct xt_entry_target *t; - struct xt_target *target; struct ipt_entry *de; unsigned int origsize; int h; @@ -1375,7 +1374,6 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, de->target_offset = e->target_offset - (origsize - *size); t = compat_ipt_get_target(e); - target = t->u.kernel.target; xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index efaa04dcc80e3..17b4ca562944c 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ 
b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -625,7 +625,7 @@ arp_mangle(void *priv, return NF_ACCEPT; } -static struct nf_hook_ops cip_arp_ops __read_mostly = { +static const struct nf_hook_ops cip_arp_ops = { .hook = arp_mangle, .pf = NFPROTO_ARP, .hooknum = NF_ARP_OUT, diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index f1528f7175a8c..811689e523c31 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -416,7 +416,7 @@ static unsigned int ipv4_synproxy_hook(void *priv, return NF_ACCEPT; } -static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { +static const struct nf_hook_ops ipv4_synproxy_ops[] = { { .hook = ipv4_synproxy_hook, .pf = NFPROTO_IPV4, diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 138a24bc76ad9..a1a07b338ccfd 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -67,7 +67,7 @@ static unsigned int iptable_nat_ipv4_local_fn(void *priv, return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain); } -static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { +static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { .hook = iptable_nat_ipv4_in, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 2e14ed11a35cf..fe374da4bc13e 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -63,13 +63,6 @@ static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -static void ipv4_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "src=%pI4 dst=%pI4 ", - &tuple->src.u3.ip, &tuple->dst.u3.ip); -} - static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -174,7 +167,7 @@ static unsigned int 
ipv4_conntrack_local(void *priv, /* Connection tracking may drop packets, but never alters them, so make it the first hook. */ -static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { +static const struct nf_hook_ops ipv4_conntrack_ops[] = { { .hook = ipv4_conntrack_in, .pf = NFPROTO_IPV4, @@ -303,11 +296,6 @@ static int ipv4_nlattr_to_tuple(struct nlattr *tb[], return 0; } - -static int ipv4_nlattr_tuple_size(void) -{ - return nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1); -} #endif static struct nf_sockopt_ops so_getorigdst = { @@ -358,16 +346,15 @@ static void ipv4_hooks_unregister(struct net *net) struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .l3proto = PF_INET, - .name = "ipv4", .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, - .print_tuple = ipv4_print_tuple, .get_l4proto = ipv4_get_l4proto, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv4_tuple_to_nlattr, - .nlattr_tuple_size = ipv4_nlattr_tuple_size, .nlattr_to_tuple = ipv4_nlattr_to_tuple, .nla_policy = ipv4_nla_policy, + .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */ + NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */ #endif .net_ns_get = ipv4_hooks_register, .net_ns_put = ipv4_hooks_unregister, @@ -398,24 +385,12 @@ static struct nf_conntrack_l4proto *builtin_l4proto4[] = { static int ipv4_net_init(struct net *net) { - int ret = 0; - - ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - if (ret < 0) - return ret; - ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: pernet registration failed\n"); - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - } - return ret; + return nf_ct_l4proto_pernet_register(net, builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); } static void ipv4_net_exit(struct net *net) { - nf_ct_l3proto_pernet_unregister(net, 
&nf_conntrack_l3proto_ipv4); nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, ARRAY_SIZE(builtin_l4proto4)); } @@ -433,6 +408,11 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) need_conntrack(); +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) != + nf_conntrack_l3proto_ipv4.nla_size)) + return -EINVAL; +#endif ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) { pr_err("Unable to register netfilter socket option\n"); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 73c591d8a9a8e..434b4e20f6db5 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -71,16 +71,6 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out the per-protocol part of the tuple. */ -static void icmp_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "type=%u code=%u id=%u ", - tuple->dst.u.icmp.type, - tuple->dst.u.icmp.code, - ntohs(tuple->src.u.icmp.id)); -} - static unsigned int *icmp_get_timeouts(struct net *net) { return &icmp_pernet(net)->timeout; @@ -362,10 +352,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_ICMP, - .name = "icmp", .pkt_to_tuple = icmp_pkt_to_tuple, .invert_tuple = icmp_invert_tuple, - .print_tuple = icmp_print_tuple, .packet = icmp_packet, .get_timeouts = icmp_get_timeouts, .new = icmp_new, diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 346bf7ccac088..37fe1616ca0bc 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -90,7 +90,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv, return NF_ACCEPT; } -static struct nf_hook_ops ipv4_defrag_ops[] = { +static const struct nf_hook_ops ipv4_defrag_ops[] = { { .hook = ipv4_conntrack_defrag, .pf = 
NFPROTO_IPV4, diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index 2f3895ddc275d..df5c2a2061a4b 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -25,7 +25,7 @@ #include <linux/netfilter/xt_LOG.h> #include <net/netfilter/nf_log.h> -static struct nf_loginfo default_loginfo = { +static const struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index c83a9963269bf..4388de0e5380c 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -24,7 +24,7 @@ #include <linux/netfilter/xt_LOG.h> #include <net/netfilter/nf_log.h> -static struct nf_loginfo default_loginfo = { +static const struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 574f7ebba0b62..ac8342dcb55eb 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -252,16 +252,16 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, if (set_h245_addr(skb, protoff, data, dataoff, taddr, &ct->tuplehash[!dir].tuple.dst.u3, htons((port & htons(1)) ? 
nated_port + 1 : - nated_port)) == 0) { - /* Save ports */ - info->rtp_port[i][dir] = rtp_port; - info->rtp_port[i][!dir] = htons(nated_port); - } else { + nated_port))) { nf_ct_unexpect_related(rtp_exp); nf_ct_unexpect_related(rtcp_exp); return -1; } + /* Save ports */ + info->rtp_port[i][dir] = rtp_port; + info->rtp_port[i][!dir] = htons(nated_port); + /* Success */ pr_debug("nf_nat_h323: expect RTP %pI4:%hu->%pI4:%hu\n", &rtp_exp->tuple.src.u3.ip, @@ -370,15 +370,15 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, /* Modify signal */ if (set_h225_addr(skb, protoff, data, dataoff, taddr, &ct->tuplehash[!dir].tuple.dst.u3, - htons(nated_port)) == 0) { - /* Save ports */ - info->sig_port[dir] = port; - info->sig_port[!dir] = htons(nated_port); - } else { + htons(nated_port))) { nf_ct_unexpect_related(exp); return -1; } + /* Save ports */ + info->sig_port[dir] = port; + info->sig_port[!dir] = htons(nated_port); + pr_debug("nf_nat_q931: expect H.245 %pI4:%hu->%pI4:%hu\n", &exp->tuple.src.u3.ip, ntohs(exp->tuple.src.u.tcp.port), @@ -462,24 +462,27 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, /* Modify signal */ if (set_h225_addr(skb, protoff, data, 0, &taddr[idx], &ct->tuplehash[!dir].tuple.dst.u3, - htons(nated_port)) == 0) { - /* Save ports */ - info->sig_port[dir] = port; - info->sig_port[!dir] = htons(nated_port); - - /* Fix for Gnomemeeting */ - if (idx > 0 && - get_h225_addr(ct, *data, &taddr[0], &addr, &port) && - (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { - set_h225_addr(skb, protoff, data, 0, &taddr[0], - &ct->tuplehash[!dir].tuple.dst.u3, - info->sig_port[!dir]); - } - } else { + htons(nated_port))) { nf_ct_unexpect_related(exp); return -1; } + /* Save ports */ + info->sig_port[dir] = port; + info->sig_port[!dir] = htons(nated_port); + + /* Fix for Gnomemeeting */ + if (idx > 0 && + get_h225_addr(ct, *data, &taddr[0], &addr, &port) && + (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { + if (set_h225_addr(skb, protoff, 
data, 0, &taddr[0], + &ct->tuplehash[!dir].tuple.dst.u3, + info->sig_port[!dir])) { + nf_ct_unexpect_related(exp); + return -1; + } + } + /* Success */ pr_debug("nf_nat_ras: expect Q.931 %pI4:%hu->%pI4:%hu\n", &exp->tuple.src.u3.ip, @@ -550,9 +553,9 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, } /* Modify signal */ - if (!set_h225_addr(skb, protoff, data, dataoff, taddr, - &ct->tuplehash[!dir].tuple.dst.u3, - htons(nated_port)) == 0) { + if (set_h225_addr(skb, protoff, data, dataoff, taddr, + &ct->tuplehash[!dir].tuple.dst.u3, + htons(nated_port))) { nf_ct_unexpect_related(exp); return -1; } diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index de3681df2ce71..e50976e3c2133 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -32,9 +32,10 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_fib *priv = nft_expr_priv(expr); + int noff = skb_network_offset(pkt->skb); u32 *dst = &regs->data[priv->dreg]; const struct net_device *dev = NULL; - const struct iphdr *iph; + struct iphdr *iph, _iph; __be32 addr; if (priv->flags & NFTA_FIB_F_IIF) @@ -42,7 +43,12 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, else if (priv->flags & NFTA_FIB_F_OIF) dev = nft_out(pkt); - iph = ip_hdr(pkt->skb); + iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); + if (!iph) { + regs->verdict.code = NFT_BREAK; + return; + } + if (priv->flags & NFTA_FIB_F_DADDR) addr = iph->daddr; else @@ -61,8 +67,9 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_fib *priv = nft_expr_priv(expr); + int noff = skb_network_offset(pkt->skb); u32 *dest = &regs->data[priv->dreg]; - const struct iphdr *iph; + struct iphdr *iph, _iph; struct fib_result res; struct flowi4 fl4 = { .flowi4_scope = RT_SCOPE_UNIVERSE, @@ -95,7 +102,12 @@
void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - iph = ip_hdr(pkt->skb); + iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); + if (!iph) { + regs->verdict.code = NFT_BREAK; + return; + } + if (ipv4_is_zeronet(iph->saddr)) { if (ipv4_is_lbcast(iph->daddr) || ipv4_is_local_multicast(iph->daddr)) { diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 77f7f8c7d93d6..5bd419c1abc8b 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -208,7 +208,7 @@ ila_nf_input(void *priv, return NF_ACCEPT; } -static struct nf_hook_ops ila_nf_hook_ops[] __read_mostly = { +static const struct nf_hook_ops ila_nf_hook_ops[] = { { .hook = ila_nf_input, .pf = NFPROTO_IPV6, diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 1f90644056ac7..9f6644958e5e3 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -176,7 +176,7 @@ static const char *const comments[] = { [NF_IP6_TRACE_COMMENT_POLICY] = "policy", }; -static struct nf_loginfo trace_loginfo = { +static const struct nf_loginfo trace_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index ce203dd729e06..a5cd43d75393d 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -438,7 +438,7 @@ static unsigned int ipv6_synproxy_hook(void *priv, return NF_ACCEPT; } -static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = { +static const struct nf_hook_ops ipv6_synproxy_ops[] = { { .hook = ipv6_synproxy_hook, .pf = NFPROTO_IPV6, diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 7d2bd940291fd..991512576c8c8 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -69,7 +69,7 @@ static unsigned int ip6table_nat_local_fn(void *priv, return nf_nat_ipv6_local_fn(priv, skb, state, ip6table_nat_do_chain); } 
-static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { +static const struct nf_hook_ops nf_nat_ipv6_ops[] = { /* Before packet filtering, change destination */ { .hook = ip6table_nat_in, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 4e34024868334..fe01dc953c56a 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -67,13 +67,6 @@ static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -static void ipv6_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "src=%pI6 dst=%pI6 ", - tuple->src.u3.ip6, tuple->dst.u3.ip6); -} - static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -191,7 +184,7 @@ static unsigned int ipv6_conntrack_local(void *priv, return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); } -static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { +static const struct nf_hook_ops ipv6_conntrack_ops[] = { { .hook = ipv6_conntrack_in, .pf = NFPROTO_IPV6, @@ -308,11 +301,6 @@ static int ipv6_nlattr_to_tuple(struct nlattr *tb[], return 0; } - -static int ipv6_nlattr_tuple_size(void) -{ - return nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1); -} #endif static int ipv6_hooks_register(struct net *net) @@ -353,16 +341,15 @@ static void ipv6_hooks_unregister(struct net *net) struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { .l3proto = PF_INET6, - .name = "ipv6", .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, - .print_tuple = ipv6_print_tuple, .get_l4proto = ipv6_get_l4proto, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv6_tuple_to_nlattr, - .nlattr_tuple_size = ipv6_nlattr_tuple_size, .nlattr_to_tuple = ipv6_nlattr_to_tuple, .nla_policy = ipv6_nla_policy, + .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])) + + 
NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])), #endif .net_ns_get = ipv6_hooks_register, .net_ns_put = ipv6_hooks_unregister, @@ -398,25 +385,12 @@ static struct nf_conntrack_l4proto *builtin_l4proto6[] = { static int ipv6_net_init(struct net *net) { - int ret = 0; - - ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - if (ret < 0) - return ret; - - ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6); - if (ret < 0) { - pr_err("nf_conntrack_ipv6: pernet registration failed.\n"); - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - } - return ret; + return nf_ct_l4proto_pernet_register(net, builtin_l4proto6, + ARRAY_SIZE(builtin_l4proto6)); } static void ipv6_net_exit(struct net *net) { - nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6); nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6, ARRAY_SIZE(builtin_l4proto6)); } @@ -434,6 +408,12 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) need_conntrack(); +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + if (WARN_ON(nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1) != + nf_conntrack_l3proto_ipv6.nla_size)) + return -EINVAL; +#endif + ret = nf_register_sockopt(&so_getorigdst6); if (ret < 0) { pr_err("Unable to register netfilter socket option\n"); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index d5f028e33f658..43544b975eaee 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -84,16 +84,6 @@ static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out the per-protocol part of the tuple. 
*/ -static void icmpv6_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "type=%u code=%u id=%u ", - tuple->dst.u.icmp.type, - tuple->dst.u.icmp.code, - ntohs(tuple->src.u.icmp.id)); -} - static unsigned int *icmpv6_get_timeouts(struct net *net) { return &icmpv6_pernet(net)->timeout; @@ -131,11 +121,6 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, pr_debug("icmpv6: can't create new conn with type %u\n", type + 128); nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); - if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6)) - nf_log_packet(nf_ct_net(ct), PF_INET6, 0, skb, NULL, - NULL, NULL, - "nf_ct_icmpv6: invalid new with type %d ", - type + 128); return false; } return true; @@ -367,10 +352,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_ICMPV6, - .name = "icmpv6", .pkt_to_tuple = icmpv6_pkt_to_tuple, .invert_tuple = icmpv6_invert_tuple, - .print_tuple = icmpv6_print_tuple, .packet = icmpv6_packet, .get_timeouts = icmpv6_get_timeouts, .new = icmpv6_new, diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index ada60d1a991b7..b326da59257f6 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -74,7 +74,7 @@ static unsigned int ipv6_defrag(void *priv, return err == 0 ? 
NF_ACCEPT : NF_DROP; } -static struct nf_hook_ops ipv6_defrag_ops[] = { +static const struct nf_hook_ops ipv6_defrag_ops[] = { { .hook = ipv6_defrag, .pf = NFPROTO_IPV6, diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index 97c724224da7b..b397a8fe88b93 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -25,7 +25,7 @@ #include <linux/netfilter/xt_LOG.h> #include <net/netfilter/nf_log.h> -static struct nf_loginfo default_loginfo = { +static const struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 43f91d9b086c7..54b5899543ef5 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -25,9 +25,9 @@ static int get_ifindex(const struct net_device *dev) static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, const struct nft_pktinfo *pkt, - const struct net_device *dev) + const struct net_device *dev, + struct ipv6hdr *iph) { - const struct ipv6hdr *iph = ipv6_hdr(pkt->skb); int lookup_flags = 0; if (priv->flags & NFTA_FIB_F_DADDR) { @@ -55,7 +55,8 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, } static u32 __nft_fib6_eval_type(const struct nft_fib *priv, - const struct nft_pktinfo *pkt) + const struct nft_pktinfo *pkt, + struct ipv6hdr *iph) { const struct net_device *dev = NULL; const struct nf_ipv6_ops *v6ops; @@ -77,7 +78,7 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, else if (priv->flags & NFTA_FIB_F_OIF) dev = nft_out(pkt); - nft_fib6_flowi_init(&fl6, priv, pkt, dev); + nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); v6ops = nf_get_ipv6_ops(); if (dev && v6ops && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) @@ -131,9 +132,17 @@ void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_fib *priv = 
nft_expr_priv(expr); + int noff = skb_network_offset(pkt->skb); u32 *dest = &regs->data[priv->dreg]; + struct ipv6hdr *iph, _iph; - *dest = __nft_fib6_eval_type(priv, pkt); + iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); + if (!iph) { + regs->verdict.code = NFT_BREAK; + return; + } + + *dest = __nft_fib6_eval_type(priv, pkt, iph); } EXPORT_SYMBOL_GPL(nft_fib6_eval_type); @@ -141,8 +150,10 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_fib *priv = nft_expr_priv(expr); + int noff = skb_network_offset(pkt->skb); const struct net_device *oif = NULL; u32 *dest = &regs->data[priv->dreg]; + struct ipv6hdr *iph, _iph; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, @@ -155,7 +166,13 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, else if (priv->flags & NFTA_FIB_F_OIF) oif = nft_out(pkt); - lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif); + iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); + if (!iph) { + regs->verdict.code = NFT_BREAK; + return; + } + + lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 9b28864cc36a9..e4a13cc8a2e76 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -636,6 +636,15 @@ config NFT_FWD_NETDEV help This option enables packet forwarding for the "netdev" family. +config NFT_FIB_NETDEV + depends on NFT_FIB_IPV4 + depends on NFT_FIB_IPV6 + tristate "Netfilter nf_tables netdev fib lookups support" + help + This option allows using the FIB expression from the netdev table. + The lookup will be delegated to the IPv4 or IPv6 FIB depending + on the protocol of the packet.
+ endif # NF_TABLES_NETDEV endif # NF_TABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 9133809193014..d3891c93edd6e 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_NFT_REDIR) += nft_redir.o obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_FIB) += nft_fib.o obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o +obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 974cf2a3795aa..04fe25abc5f66 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -21,7 +21,7 @@ #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/mutex.h> -#include <linux/slab.h> +#include <linux/mm.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -62,10 +62,182 @@ EXPORT_SYMBOL(nf_hooks_needed); #endif static DEFINE_MUTEX(nf_hook_mutex); + +/* max hooks per family/hooknum */ +#define MAX_HOOK_COUNT 1024 + #define nf_entry_dereference(e) \ rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex)) -static struct nf_hook_entry __rcu **nf_hook_entry_head(struct net *net, const struct nf_hook_ops *reg) +static struct nf_hook_entries *allocate_hook_entries_size(u16 num) +{ + struct nf_hook_entries *e; + size_t alloc = sizeof(*e) + + sizeof(struct nf_hook_entry) * num + + sizeof(struct nf_hook_ops *) * num; + + if (num == 0) + return NULL; + + e = kvzalloc(alloc, GFP_KERNEL); + if (e) + e->num_hook_entries = num; + return e; +} + +static unsigned int accept_all(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return NF_ACCEPT; /* ACCEPT makes nf_hook_slow call next hook */ +} + +static const struct nf_hook_ops dummy_ops = { + .hook = accept_all, + .priority = INT_MIN, +}; + +static struct nf_hook_entries * +nf_hook_entries_grow(const struct nf_hook_entries *old, + const struct nf_hook_ops *reg) +{ + unsigned int i, 
alloc_entries, nhooks, old_entries; + struct nf_hook_ops **orig_ops = NULL; + struct nf_hook_ops **new_ops; + struct nf_hook_entries *new; + bool inserted = false; + + alloc_entries = 1; + old_entries = old ? old->num_hook_entries : 0; + + if (old) { + orig_ops = nf_hook_entries_get_hook_ops(old); + + for (i = 0; i < old_entries; i++) { + if (orig_ops[i] != &dummy_ops) + alloc_entries++; + } + } + + if (alloc_entries > MAX_HOOK_COUNT) + return ERR_PTR(-E2BIG); + + new = allocate_hook_entries_size(alloc_entries); + if (!new) + return ERR_PTR(-ENOMEM); + + new_ops = nf_hook_entries_get_hook_ops(new); + + i = 0; + nhooks = 0; + while (i < old_entries) { + if (orig_ops[i] == &dummy_ops) { + ++i; + continue; + } + if (inserted || reg->priority > orig_ops[i]->priority) { + new_ops[nhooks] = (void *)orig_ops[i]; + new->hooks[nhooks] = old->hooks[i]; + i++; + } else { + new_ops[nhooks] = (void *)reg; + new->hooks[nhooks].hook = reg->hook; + new->hooks[nhooks].priv = reg->priv; + inserted = true; + } + nhooks++; + } + + if (!inserted) { + new_ops[nhooks] = (void *)reg; + new->hooks[nhooks].hook = reg->hook; + new->hooks[nhooks].priv = reg->priv; + } + + return new; +} + +static void hooks_validate(const struct nf_hook_entries *hooks) +{ +#ifdef CONFIG_DEBUG_KERNEL + struct nf_hook_ops **orig_ops; + int prio = INT_MIN; + size_t i = 0; + + orig_ops = nf_hook_entries_get_hook_ops(hooks); + + for (i = 0; i < hooks->num_hook_entries; i++) { + if (orig_ops[i] == &dummy_ops) + continue; + + WARN_ON(orig_ops[i]->priority < prio); + + if (orig_ops[i]->priority > prio) + prio = orig_ops[i]->priority; + } +#endif +} + +/* + * __nf_hook_entries_try_shrink - try to shrink hook array + * + * @pp -- location of hook blob + * + * Hook unregistration must always succeed, so to-be-removed hooks + * are replaced by a dummy one that will just move to next hook. 
+ * + * This counts the current dummy hooks, attempts to allocate new blob, + * copies the live hooks, then replaces and discards old one. + * + * return values: + * + * Returns address to free, or NULL. + */ +static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp) +{ + struct nf_hook_entries *old, *new = NULL; + unsigned int i, j, skip = 0, hook_entries; + struct nf_hook_ops **orig_ops; + struct nf_hook_ops **new_ops; + + old = nf_entry_dereference(*pp); + if (WARN_ON_ONCE(!old)) + return NULL; + + orig_ops = nf_hook_entries_get_hook_ops(old); + for (i = 0; i < old->num_hook_entries; i++) { + if (orig_ops[i] == &dummy_ops) + skip++; + } + + /* if skip == hook_entries all hooks have been removed */ + hook_entries = old->num_hook_entries; + if (skip == hook_entries) + goto out_assign; + + if (WARN_ON(skip == 0)) + return NULL; + + hook_entries -= skip; + new = allocate_hook_entries_size(hook_entries); + if (!new) + return NULL; + + new_ops = nf_hook_entries_get_hook_ops(new); + for (i = 0, j = 0; i < old->num_hook_entries; i++) { + if (orig_ops[i] == &dummy_ops) + continue; + new->hooks[j] = old->hooks[i]; + new_ops[j] = (void *)orig_ops[i]; + j++; + } + hooks_validate(new); +out_assign: + rcu_assign_pointer(*pp, new); + return old; +} + +static struct nf_hook_entries __rcu **nf_hook_entry_head(struct net *net, const struct nf_hook_ops *reg) { if (reg->pf != NFPROTO_NETDEV) return net->nf.hooks[reg->pf]+reg->hooknum; @@ -76,13 +248,14 @@ static struct nf_hook_entry __rcu **nf_hook_entry_head(struct net *net, const st return &reg->dev->nf_hooks_ingress; } #endif + WARN_ON_ONCE(1); return NULL; } int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct nf_hook_entry __rcu **pp; - struct nf_hook_entry *entry, *p; + struct nf_hook_entries *p, *new_hooks; + struct nf_hook_entries __rcu **pp; if (reg->pf == NFPROTO_NETDEV) { #ifndef CONFIG_NETFILTER_INGRESS @@ -98,23 +271,19 @@ int nf_register_net_hook(struct net *net, const
struct nf_hook_ops *reg) if (!pp) return -EINVAL; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - nf_hook_entry_init(entry, reg); - mutex_lock(&nf_hook_mutex); - /* Find the spot in the list */ - for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) { - if (reg->priority < nf_hook_entry_priority(p)) - break; - } - rcu_assign_pointer(entry->next, p); - rcu_assign_pointer(*pp, entry); + p = nf_entry_dereference(*pp); + new_hooks = nf_hook_entries_grow(p, reg); + + if (!IS_ERR(new_hooks)) + rcu_assign_pointer(*pp, new_hooks); mutex_unlock(&nf_hook_mutex); + if (IS_ERR(new_hooks)) + return PTR_ERR(new_hooks); + + hooks_validate(new_hooks); #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_inc_ingress_queue(); @@ -122,48 +291,74 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) #ifdef HAVE_JUMP_LABEL static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif + synchronize_net(); + BUG_ON(p == new_hooks); + kvfree(p); return 0; } EXPORT_SYMBOL(nf_register_net_hook); -static struct nf_hook_entry * -__nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) +/* + * __nf_unregister_net_hook - remove a hook from blob + * + * @oldp: current address of hook blob + * @unreg: hook to unregister + * + * This cannot fail, hook unregistration must always succeed. + * Therefore replace the to-be-removed hook with a dummy hook. 
+ */ +static void __nf_unregister_net_hook(struct nf_hook_entries *old, + const struct nf_hook_ops *unreg) { - struct nf_hook_entry __rcu **pp; - struct nf_hook_entry *p; - - pp = nf_hook_entry_head(net, reg); - if (WARN_ON_ONCE(!pp)) - return NULL; + struct nf_hook_ops **orig_ops; + bool found = false; + unsigned int i; - mutex_lock(&nf_hook_mutex); - for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) { - if (nf_hook_entry_ops(p) == reg) { - rcu_assign_pointer(*pp, p->next); - break; - } - } - mutex_unlock(&nf_hook_mutex); - if (!p) { - WARN(1, "nf_unregister_net_hook: hook not found!\n"); - return NULL; + orig_ops = nf_hook_entries_get_hook_ops(old); + for (i = 0; i < old->num_hook_entries; i++) { + if (orig_ops[i] != unreg) + continue; + WRITE_ONCE(old->hooks[i].hook, accept_all); + WRITE_ONCE(orig_ops[i], &dummy_ops); + found = true; + break; } + + if (found) { #ifdef CONFIG_NETFILTER_INGRESS - if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) - net_dec_ingress_queue(); + if (unreg->pf == NFPROTO_NETDEV && unreg->hooknum == NF_NETDEV_INGRESS) + net_dec_ingress_queue(); #endif #ifdef HAVE_JUMP_LABEL - static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_dec(&nf_hooks_needed[unreg->pf][unreg->hooknum]); #endif - - return p; + } else { + WARN_ONCE(1, "hook not found, pf %d num %d", unreg->pf, unreg->hooknum); + } } void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct nf_hook_entry *p = __nf_unregister_net_hook(net, reg); + struct nf_hook_entries __rcu **pp; + struct nf_hook_entries *p; unsigned int nfq; + pp = nf_hook_entry_head(net, reg); + if (!pp) + return; + + mutex_lock(&nf_hook_mutex); + + p = nf_entry_dereference(*pp); + if (WARN_ON_ONCE(!p)) { + mutex_unlock(&nf_hook_mutex); + return; + } + + __nf_unregister_net_hook(p, reg); + + p = __nf_hook_entries_try_shrink(pp); + mutex_unlock(&nf_hook_mutex); if (!p) return; @@ -173,7 +368,7 @@ void 
nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) nfq = nf_queue_nf_hook_drop(net); if (nfq) synchronize_net(); - kfree(p); + kvfree(p); } EXPORT_SYMBOL(nf_unregister_net_hook); @@ -200,26 +395,59 @@ EXPORT_SYMBOL(nf_register_net_hooks); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int hookcount) { - struct nf_hook_entry *to_free[16]; - unsigned int i, n, nfq; + struct nf_hook_entries *to_free[16], *p; + struct nf_hook_entries __rcu **pp; + unsigned int i, j, n; + + mutex_lock(&nf_hook_mutex); + for (i = 0; i < hookcount; i++) { + pp = nf_hook_entry_head(net, &reg[i]); + if (!pp) + continue; + + p = nf_entry_dereference(*pp); + if (WARN_ON_ONCE(!p)) + continue; + __nf_unregister_net_hook(p, &reg[i]); + } + mutex_unlock(&nf_hook_mutex); do { n = min_t(unsigned int, hookcount, ARRAY_SIZE(to_free)); - for (i = 0; i < n; i++) - to_free[i] = __nf_unregister_net_hook(net, &reg[i]); + mutex_lock(&nf_hook_mutex); - synchronize_net(); + for (i = 0, j = 0; i < hookcount && j < n; i++) { + pp = nf_hook_entry_head(net, &reg[i]); + if (!pp) + continue; + + p = nf_entry_dereference(*pp); + if (!p) + continue; + + to_free[j] = __nf_hook_entries_try_shrink(pp); + if (to_free[j]) + ++j; + } + + mutex_unlock(&nf_hook_mutex); + + if (j) { + unsigned int nfq; - /* need 2nd synchronize_net() if nfqueue is used, skb - * can get reinjected right before nf_queue_hook_drop() - */ - nfq = nf_queue_nf_hook_drop(net); - if (nfq) synchronize_net(); - for (i = 0; i < n; i++) - kfree(to_free[i]); + /* need 2nd synchronize_net() if nfqueue is used, skb + * can get reinjected right before nf_queue_hook_drop() + */ + nfq = nf_queue_nf_hook_drop(net); + if (nfq) + synchronize_net(); + + for (i = 0; i < j; i++) + kvfree(to_free[i]); + } reg += n; hookcount -= n; @@ -230,16 +458,15 @@ EXPORT_SYMBOL(nf_unregister_net_hooks); /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock.
*/ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_entry *entry) + const struct nf_hook_entries *e, unsigned int s) { unsigned int verdict; int ret; - do { - verdict = nf_hook_entry_hookfn(entry, skb, state); + for (; s < e->num_hook_entries; s++) { + verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state); switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: - entry = rcu_dereference(entry->next); break; case NF_DROP: kfree_skb(skb); @@ -248,8 +475,8 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, ret = -EPERM; return ret; case NF_QUEUE: - ret = nf_queue(skb, state, &entry, verdict); - if (ret == 1 && entry) + ret = nf_queue(skb, state, e, s, verdict); + if (ret == 1) continue; return ret; default: @@ -258,7 +485,7 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, */ return 0; } - } while (entry); + } return 1; } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index e31956b58abaf..5cb7cac9177d8 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -125,14 +125,12 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); - rcu_read_lock(); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); s->cnt.inpkts++; s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); - rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); u64_stats_update_begin(&s->syncp); @@ -159,14 +157,12 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); - rcu_read_lock(); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); s->cnt.outpkts++; s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); - rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); u64_stats_update_begin(&s->syncp); 
@@ -1222,7 +1218,6 @@ static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, if (!pptr) return NULL; - rcu_read_lock(); dest = ip_vs_find_real_service(ipvs, af, iph->protocol, &iph->saddr, pptr[0]); if (dest) { @@ -1237,7 +1232,6 @@ static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, pptr[0], pptr[1]); } } - rcu_read_unlock(); return cp; } @@ -1689,11 +1683,9 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, if (dest) { struct ip_vs_dest_dst *dest_dst; - rcu_read_lock(); dest_dst = rcu_dereference(dest->dest_dst); if (dest_dst) mtu = dst_mtu(dest_dst->dst_cache); - rcu_read_unlock(); } if (mtu > 68 + sizeof(struct iphdr)) mtu -= sizeof(struct iphdr); @@ -2109,7 +2101,7 @@ ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, #endif -static struct nf_hook_ops ip_vs_ops[] __read_mostly = { +static const struct nf_hook_ops ip_vs_ops[] = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply4, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 1fa3c2307b6ea..4f940d7eb2f7e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -550,18 +550,15 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); - rcu_read_lock(); hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->protocol == protocol || dest->vfwmark)) { /* HIT */ - rcu_read_unlock(); return true; } } - rcu_read_unlock(); return false; } diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index fb780be76d15a..3e17d32b629d1 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -269,13 +269,11 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * hopefully it will succeed on the 
retransmitted * packet. */ - rcu_read_lock(); mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, iph->ihl * 4, start - data, end - start, buf, buf_len); - rcu_read_unlock(); if (mangled) { ip_vs_nfct_expect_related(skb, ct, n_cp, IPPROTO_TCP, 0, 0); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 3ffad4adaddf9..e1efa446b305e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -38,7 +38,6 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, return 0; } - rcu_read_lock(); if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->daddr, ports[1]); @@ -53,7 +52,6 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, * It seems that we are very loaded. * We have to drop this packet :( */ - rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -67,11 +65,9 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; - rcu_read_unlock(); return 0; } } - rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -526,12 +522,10 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = sctp_app_hashkey(cp->vport); - rcu_read_lock(); list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -544,11 +538,10 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); - goto out; + break; } } - rcu_read_unlock(); -out: + return result; } diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 12dc8d5bc37d7..121a321b91bea 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ 
b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -63,7 +63,6 @@ tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, } /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ - rcu_read_lock(); if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, @@ -80,7 +79,6 @@ tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, * It seems that we are very loaded. * We have to drop this packet :( */ - rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -95,11 +93,9 @@ tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; - rcu_read_unlock(); return 0; } } - rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -661,12 +657,10 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); - rcu_read_lock(); list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -680,12 +674,10 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); - goto out; + break; } } - rcu_read_unlock(); - out: return result; } diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index e494e9a88c7fb..30e11cd6aa8a9 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -53,7 +53,6 @@ udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, return 0; } - rcu_read_lock(); if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->daddr, ports[1]); @@ -69,7 +68,6 @@ udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, * It seems that we are very loaded. 
* We have to drop this packet :( */ - rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -84,11 +82,9 @@ udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; - rcu_read_unlock(); return 0; } } - rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -410,12 +406,10 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = udp_app_hashkey(cp->vport); - rcu_read_lock(); list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -429,12 +423,10 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); - goto out; + break; } } - rcu_read_unlock(); - out: return result; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 2eab1e0400f48..90d396814798e 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -678,7 +678,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rcu_read_lock(); if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) goto tx_error; @@ -689,14 +688,12 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); - rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; tx_error: kfree_skb(skb); - rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -710,7 +707,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rcu_read_lock(); if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL, &iph->daddr, NULL, ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) @@ -720,14 +716,12 @@ ip_vs_bypass_xmit_v6(struct sk_buff 
*skb, struct ip_vs_conn *cp, skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); - rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; tx_error: kfree_skb(skb); - rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -746,7 +740,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rcu_read_lock(); /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { __be16 _pt, *p; @@ -815,14 +808,12 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); - rcu_read_unlock(); LeaveFunction(10); return rc; tx_error: kfree_skb(skb); - rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -837,7 +828,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rcu_read_lock(); /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { __be16 _pt, *p; @@ -906,7 +896,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); - rcu_read_unlock(); LeaveFunction(10); return rc; @@ -914,7 +903,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, tx_error: LeaveFunction(10); kfree_skb(skb); - rcu_read_unlock(); return NF_STOLEN; } #endif @@ -1035,7 +1023,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rcu_read_lock(); local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -1043,10 +1030,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh); if (local < 0) goto tx_error; - if (local) { - rcu_read_unlock(); + if (local) return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); - } rt = skb_rtable(skb); tdev = rt->dst.dev; @@ -1095,7 +1080,6 @@ 
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_local_out(net, skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); - rcu_read_unlock(); LeaveFunction(10); @@ -1104,7 +1088,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, tx_error: if (!IS_ERR(skb)) kfree_skb(skb); - rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -1127,7 +1110,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rcu_read_lock(); local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, &cp->daddr.in6, &saddr, ipvsh, 1, @@ -1136,10 +1118,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_TUNNEL); if (local < 0) goto tx_error; - if (local) { - rcu_read_unlock(); + if (local) return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); - } rt = (struct rt6_info *) skb_dst(skb); tdev = rt->dst.dev; @@ -1185,7 +1165,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ip6_local_out(cp->ipvs->net, skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); - rcu_read_unlock(); LeaveFunction(10); @@ -1194,7 +1173,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, tx_error: if (!IS_ERR(skb)) kfree_skb(skb); - rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -1213,17 +1191,14 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rcu_read_lock(); local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh); if (local < 0) goto tx_error; - if (local) { - rcu_read_unlock(); + if (local) return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); - } ip_send_check(ip_hdr(skb)); @@ -1231,14 +1206,12 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); - rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; tx_error: kfree_skb(skb); - rcu_read_unlock(); 
LeaveFunction(10); return NF_STOLEN; } @@ -1252,7 +1225,6 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rcu_read_lock(); local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, &cp->daddr.in6, NULL, ipvsh, 0, @@ -1261,23 +1233,19 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_KNOWN_NH); if (local < 0) goto tx_error; - if (local) { - rcu_read_unlock(); + if (local) return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); - } /* Another hack: avoid icmp_send in ip_fragment */ skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); - rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; tx_error: kfree_skb(skb); - rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -1322,7 +1290,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rt_mode = (hooknum != NF_INET_FORWARD) ? IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; - rcu_read_lock(); local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, NULL, iph); if (local < 0) @@ -1368,12 +1335,10 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); - rcu_read_unlock(); goto out; tx_error: kfree_skb(skb); - rcu_read_unlock(); rc = NF_STOLEN; out: LeaveFunction(10); @@ -1414,7 +1379,6 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, rt_mode = (hooknum != NF_INET_FORWARD) ? 
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; - rcu_read_lock(); local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); if (local < 0) @@ -1460,12 +1424,10 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); - rcu_read_unlock(); goto out; tx_error: kfree_skb(skb); - rcu_read_unlock(); rc = NF_STOLEN; out: LeaveFunction(10); diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 4e99cca61612f..ecc3ab7846339 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -40,7 +40,6 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) goto out; - rcu_read_lock(); in_dev = __in_dev_get_rcu(rt->dst.dev); if (in_dev != NULL) { for_primary_ifa(in_dev) { @@ -50,7 +49,6 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, } } endfor_ifa(in_dev); } - rcu_read_unlock(); if (mask == 0) goto out; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9979f46c81dce..c23df7c9cd598 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -56,6 +56,8 @@ #include <net/netfilter/nf_nat_helper.h> #include <net/netns/hash.h> +#include "nf_internals.h" + #define NF_CONNTRACK_VERSION "0.5.0" int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, @@ -248,8 +250,8 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct net *net, struct nf_conntrack_tuple *tuple) { - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; unsigned int protoff; u_int8_t protonum; int ret; @@ -398,7 +400,7 @@ static void destroy_conntrack(struct nf_conntrack *nfct) { 
struct nf_conn *ct = (struct nf_conn *)nfct; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; pr_debug("destroy_conntrack(%p)\n", ct); NF_CT_ASSERT(atomic_read(&nfct->use) == 0); @@ -407,13 +409,10 @@ destroy_conntrack(struct nf_conntrack *nfct) nf_ct_tmpl_free(ct); return; } - rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto->destroy) l4proto->destroy(ct); - rcu_read_unlock(); - local_bh_disable(); /* Expectations will have been removed in clean_from_lists, * except TFTP can create an expectation on the first packet, @@ -695,7 +694,7 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto->allow_clash && @@ -1084,7 +1083,7 @@ static void gc_worker(struct work_struct *work) static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) { - INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); + INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker); gc_work->next_gc_run = HZ; gc_work->exiting = false; } @@ -1177,8 +1176,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free); static noinline struct nf_conntrack_tuple_hash * init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, - struct nf_conntrack_l3proto *l3proto, - struct nf_conntrack_l4proto *l4proto, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_l4proto *l4proto, struct sk_buff *skb, unsigned int dataoff, u32 hash) { @@ -1289,8 +1288,8 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, - struct nf_conntrack_l3proto *l3proto, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l3proto *l3proto, + 
const struct nf_conntrack_l4proto *l4proto) { const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple tuple; @@ -1345,10 +1344,10 @@ unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) { + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; struct nf_conn *ct, *tmpl; enum ip_conntrack_info ctinfo; - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts; unsigned int dataoff; u_int8_t protonum; @@ -1689,6 +1688,18 @@ __nf_ct_unconfirmed_destroy(struct net *net) } } +void nf_ct_unconfirmed_destroy(struct net *net) +{ + might_sleep(); + + if (atomic_read(&net->ct.count) > 0) { + __nf_ct_unconfirmed_destroy(net); + nf_queue_nf_hook_drop(net); + synchronize_net(); + } +} +EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy); + void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) @@ -1700,14 +1711,10 @@ void nf_ct_iterate_cleanup_net(struct net *net, if (atomic_read(&net->ct.count) == 0) return; - __nf_ct_unconfirmed_destroy(net); - d.iter = iter; d.data = data; d.net = net; - synchronize_net(); - nf_ct_iterate_cleanup(iter_net_only, &d, portid, report); } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); @@ -1733,6 +1740,7 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) if (atomic_read(&net->ct.count) == 0) continue; __nf_ct_unconfirmed_destroy(net); + nf_queue_nf_hook_drop(net); } rtnl_unlock(); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 899c2c36da136..dad2c0c22ad58 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -368,12 +368,6 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) /* two references : one for hash insert, one for the timer */ refcount_add(2, &exp->use); - hlist_add_head_rcu(&exp->lnode, 
&master_help->expectations); - master_help->expecting[exp->class]++; - - hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); - net->ct.expect_count++; - setup_timer(&exp->timeout, nf_ct_expectation_timed_out, (unsigned long)exp); helper = rcu_dereference_protected(master_help->helper, @@ -384,6 +378,12 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) } add_timer(&exp->timeout); + hlist_add_head_rcu(&exp->lnode, &master_help->expectations); + master_help->expecting[exp->class]++; + + hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); + net->ct.expect_count++; + NF_CT_STAT_INC(net, expect_create); } @@ -474,6 +474,60 @@ out: } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); +void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), + void *data) +{ + struct nf_conntrack_expect *exp; + const struct hlist_node *next; + unsigned int i; + + spin_lock_bh(&nf_conntrack_expect_lock); + + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, next, + &nf_ct_expect_hash[i], + hnode) { + if (iter(exp, data) && del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + } + } + } + + spin_unlock_bh(&nf_conntrack_expect_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_destroy); + +void nf_ct_expect_iterate_net(struct net *net, + bool (*iter)(struct nf_conntrack_expect *e, void *data), + void *data, + u32 portid, int report) +{ + struct nf_conntrack_expect *exp; + const struct hlist_node *next; + unsigned int i; + + spin_lock_bh(&nf_conntrack_expect_lock); + + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, next, + &nf_ct_expect_hash[i], + hnode) { + + if (!net_eq(nf_ct_exp_net(exp), net)) + continue; + + if (iter(exp, data) && del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, portid, report); + nf_ct_expect_put(exp); + } + } + } + + spin_unlock_bh(&nf_conntrack_expect_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_net); + #ifdef 
CONFIG_NF_CONNTRACK_PROCFS struct ct_expect_iter_state { struct seq_net_private p; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 9129bb3b51535..551a1eddf0fab 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -437,12 +437,22 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); -void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data) { - struct nf_conntrack_expect *exp; - const struct hlist_node *next; - unsigned int i; + struct nf_conn_help *help = nfct_help(exp->master); + const struct nf_conntrack_helper *me = data; + const struct nf_conntrack_helper *this; + + if (exp->helper == me) + return true; + this = rcu_dereference_protected(help->helper, + lockdep_is_held(&nf_conntrack_expect_lock)); + return this == me; +} + +void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +{ mutex_lock(&nf_ct_helper_mutex); hlist_del_rcu(&me->hnode); nf_ct_helper_count--; @@ -453,21 +463,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) */ synchronize_rcu(); - /* Get rid of expectations */ - spin_lock_bh(&nf_conntrack_expect_lock); - for (i = 0; i < nf_ct_expect_hsize; i++) { - hlist_for_each_entry_safe(exp, next, - &nf_ct_expect_hash[i], hnode) { - struct nf_conn_help *help = nfct_help(exp->master); - if ((rcu_dereference_protected( - help->helper, - lockdep_is_held(&nf_conntrack_expect_lock) - ) == me || exp->helper == me)) - nf_ct_remove_expect(exp); - } - } - spin_unlock_bh(&nf_conntrack_expect_lock); - + nf_ct_expect_iterate_destroy(expect_iter_me, NULL); nf_ct_iterate_destroy(unhelp, me); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c index cf9ace70becec..397e6911214f8 100644 --- a/net/netfilter/nf_conntrack_l3proto_generic.c +++ 
b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -49,11 +49,6 @@ static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -static void generic_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ -} - static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -64,10 +59,8 @@ static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { .l3proto = PF_UNSPEC, - .name = "unknown", .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, - .print_tuple = generic_print_tuple, .get_l4proto = generic_get_l4proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 7999e70c3bfbe..de4053d84364b 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -61,8 +61,8 @@ MODULE_LICENSE("GPL"); static char __initdata version[] = "0.93"; static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l4proto *l4proto) { int ret = 0; struct nlattr *nest_parms; @@ -86,7 +86,7 @@ nla_put_failure: static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, - struct nf_conntrack_l3proto *l3proto) + const struct nf_conntrack_l3proto *l3proto) { int ret = 0; struct nlattr *nest_parms; @@ -109,9 +109,9 @@ nla_put_failure: static int ctnetlink_dump_tuples(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; int ret; - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; rcu_read_lock(); l3proto = 
__nf_ct_l3proto_find(tuple->src.l3num); @@ -163,7 +163,7 @@ nla_put_failure: static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) { - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; int ret; @@ -535,17 +535,16 @@ nla_put_failure: static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) { - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; - size_t len = 0; + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; + size_t len; - rcu_read_lock(); l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); - len += l3proto->nla_size; + len = l3proto->nla_size; + len *= 3u; /* ORIG, REPLY, MASTER */ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); len += l4proto->nla_size; - rcu_read_unlock(); return len; } @@ -664,7 +663,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; - rcu_read_lock(); zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); @@ -736,8 +734,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) && ctnetlink_dump_mark(skb, ct) < 0) goto nla_put_failure; #endif - rcu_read_unlock(); - nlmsg_end(skb, nlh); err = nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); @@ -747,7 +743,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) return 0; nla_put_failure: - rcu_read_unlock(); nlmsg_cancel(skb, nlh); nlmsg_failure: kfree_skb(skb); @@ -941,8 +936,8 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { static int ctnetlink_parse_tuple_proto(struct nlattr *attr, struct nf_conntrack_tuple *tuple) { + const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTO_MAX+1]; - struct nf_conntrack_l4proto *l4proto; int ret = 0; ret = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy, @@ 
-1585,8 +1580,8 @@ static int ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]) { const struct nlattr *attr = cda[CTA_PROTOINFO]; + const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTOINFO_MAX+1]; - struct nf_conntrack_l4proto *l4proto; int err = 0; err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy, @@ -2213,7 +2208,6 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) const struct nf_conntrack_zone *zone; struct nlattr *nest_parms; - rcu_read_lock(); zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); @@ -2272,11 +2266,9 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) #endif if (ctnetlink_dump_labels(skb, ct) < 0) goto nla_put_failure; - rcu_read_unlock(); return 0; nla_put_failure: - rcu_read_unlock(); return -ENOSPC; } @@ -2483,11 +2475,11 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple_mask *mask) { - int ret; - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple m; struct nlattr *nest_parms; + int ret; memset(&m, 0xFF, sizeof(m)); memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3)); @@ -2661,17 +2653,14 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; - rcu_read_lock(); if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; - rcu_read_unlock(); nlmsg_end(skb, nlh); nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); return 0; nla_put_failure: - rcu_read_unlock(); nlmsg_cancel(skb, nlh); nlmsg_failure: kfree_skb(skb); @@ -2910,6 +2899,21 @@ out: return err == -EAGAIN ? 
-ENOBUFS : err; } +static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data) +{ + const struct nf_conn_help *m_help; + const char *name = data; + + m_help = nfct_help(exp->master); + + return strcmp(m_help->helper->name, name) == 0; +} + +static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data) +{ + return true; +} + static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[], @@ -2918,10 +2922,8 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct hlist_node *next; u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; - unsigned int i; int err; if (cda[CTA_EXPECT_TUPLE]) { @@ -2961,49 +2963,15 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, nf_ct_expect_put(exp); } else if (cda[CTA_EXPECT_HELP_NAME]) { char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]); - struct nf_conn_help *m_help; - - /* delete all expectations for this helper */ - spin_lock_bh(&nf_conntrack_expect_lock); - for (i = 0; i < nf_ct_expect_hsize; i++) { - hlist_for_each_entry_safe(exp, next, - &nf_ct_expect_hash[i], - hnode) { - - if (!net_eq(nf_ct_exp_net(exp), net)) - continue; - m_help = nfct_help(exp->master); - if (!strcmp(m_help->helper->name, name) && - del_timer(&exp->timeout)) { - nf_ct_unlink_expect_report(exp, - NETLINK_CB(skb).portid, - nlmsg_report(nlh)); - nf_ct_expect_put(exp); - } - } - } - spin_unlock_bh(&nf_conntrack_expect_lock); + nf_ct_expect_iterate_net(net, expect_iter_name, name, + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); } else { /* This basically means we have to flush everything*/ - spin_lock_bh(&nf_conntrack_expect_lock); - for (i = 0; i < nf_ct_expect_hsize; i++) { - hlist_for_each_entry_safe(exp, next, - &nf_ct_expect_hash[i], - hnode) { - - if (!net_eq(nf_ct_exp_net(exp), net)) 
- continue; - - if (del_timer(&exp->timeout)) { - nf_ct_unlink_expect_report(exp, - NETLINK_CB(skb).portid, - nlmsg_report(nlh)); - nf_ct_expect_put(exp); - } - } - } - spin_unlock_bh(&nf_conntrack_expect_lock); + nf_ct_expect_iterate_net(net, expect_iter_all, NULL, + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); } return 0; diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 6959e93063d4c..11562f2a08bb0 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -113,7 +113,6 @@ static void pptp_expectfn(struct nf_conn *ct, /* Can you see how rusty this code is, compared with the pre-2.6.11 * one? That's what happened to my shiny newnat of 2002 ;( -HW */ - rcu_read_lock(); nf_nat_pptp_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn); if (nf_nat_pptp_expectfn && ct->master->status & IPS_NAT_MASK) nf_nat_pptp_expectfn(ct, exp); @@ -136,7 +135,6 @@ static void pptp_expectfn(struct nf_conn *ct, pr_debug("not found\n"); } } - rcu_read_unlock(); } static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct, diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 1dcad229c3cc7..b3e489c859ec4 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -65,7 +65,7 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header, } #endif -struct nf_conntrack_l4proto * +const struct nf_conntrack_l4proto * __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) { if (unlikely(l3proto >= NFPROTO_NUMPROTO || nf_ct_protos[l3proto] == NULL)) @@ -77,7 +77,7 @@ EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); /* this is guaranteed to always return a valid protocol helper, since * it falls back to generic_protocol */ -struct nf_conntrack_l3proto * +const struct nf_conntrack_l3proto * nf_ct_l3proto_find_get(u_int16_t l3proto) { struct nf_conntrack_l3proto *p; @@ -95,8 +95,8 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get); int 
nf_ct_l3proto_try_module_get(unsigned short l3proto) { + const struct nf_conntrack_l3proto *p; int ret; - struct nf_conntrack_l3proto *p; retry: p = nf_ct_l3proto_find_get(l3proto); if (p == &nf_conntrack_l3proto_generic) { @@ -173,10 +173,10 @@ void nf_ct_netns_put(struct net *net, u8 nfproto) } EXPORT_SYMBOL_GPL(nf_ct_netns_put); -struct nf_conntrack_l4proto * +const struct nf_conntrack_l4proto * nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) { - struct nf_conntrack_l4proto *p; + const struct nf_conntrack_l4proto *p; rcu_read_lock(); p = __nf_ct_l4proto_find(l3num, l4num); @@ -188,7 +188,7 @@ nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) } EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get); -void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p) +void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p) { module_put(p->me); } @@ -196,28 +196,28 @@ EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); static int kill_l3proto(struct nf_conn *i, void *data) { - return nf_ct_l3num(i) == ((struct nf_conntrack_l3proto *)data)->l3proto; + return nf_ct_l3num(i) == ((const struct nf_conntrack_l3proto *)data)->l3proto; } static int kill_l4proto(struct nf_conn *i, void *data) { - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; l4proto = data; return nf_ct_protonum(i) == l4proto->l4proto && nf_ct_l3num(i) == l4proto->l3proto; } -int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto) +int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto) { int ret = 0; struct nf_conntrack_l3proto *old; if (proto->l3proto >= NFPROTO_NUMPROTO) return -EBUSY; - - if (proto->tuple_to_nlattr && !proto->nlattr_tuple_size) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + if (proto->tuple_to_nlattr && proto->nla_size == 0) return -EINVAL; - +#endif mutex_lock(&nf_ct_proto_mutex); old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], lockdep_is_held(&nf_ct_proto_mutex)); @@ -226,9 +226,6 @@ int nf_ct_l3proto_register(struct nf_conntrack_l3proto 
*proto) goto out_unlock; } - if (proto->nlattr_tuple_size) - proto->nla_size = 3 * proto->nlattr_tuple_size(); - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); out_unlock: @@ -238,21 +235,7 @@ out_unlock: } EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); -#ifdef CONFIG_SYSCTL -extern unsigned int nf_conntrack_default_on; - -int nf_ct_l3proto_pernet_register(struct net *net, - struct nf_conntrack_l3proto *proto) -{ - if (nf_conntrack_default_on == 0) - return 0; - - return proto->net_ns_get ? proto->net_ns_get(net) : 0; -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register); -#endif - -void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) +void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto) { BUG_ON(proto->l3proto >= NFPROTO_NUMPROTO); @@ -266,27 +249,12 @@ void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) synchronize_rcu(); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_destroy(kill_l3proto, proto); + nf_ct_iterate_destroy(kill_l3proto, (void*)proto); } EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); -void nf_ct_l3proto_pernet_unregister(struct net *net, - struct nf_conntrack_l3proto *proto) -{ - /* - * nf_conntrack_default_on *might* have registered hooks. - * ->net_ns_put must cope with more puts() than get(), i.e. - * if nf_conntrack_default_on was 0 at time of - * nf_ct_l3proto_pernet_register invocation this net_ns_put() - * should be a noop. 
- */ - if (proto->net_ns_put) - proto->net_ns_put(net); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister); - static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto) { if (l4proto->get_net_proto) { /* statically built-in protocols use static per-net */ @@ -301,7 +269,7 @@ static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, static int nf_ct_l4proto_register_sysctl(struct net *net, struct nf_proto_net *pn, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto) { int err = 0; @@ -324,8 +292,8 @@ int nf_ct_l4proto_register_sysctl(struct net *net, static void nf_ct_l4proto_unregister_sysctl(struct net *net, - struct nf_proto_net *pn, - struct nf_conntrack_l4proto *l4proto) + struct nf_proto_net *pn, + const struct nf_conntrack_l4proto *l4proto) { #ifdef CONFIG_SYSCTL if (pn->ctl_table_header != NULL) @@ -395,7 +363,7 @@ out_unlock: EXPORT_SYMBOL_GPL(nf_ct_l4proto_register_one); int nf_ct_l4proto_pernet_register_one(struct net *net, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto) { int ret = 0; struct nf_proto_net *pn = NULL; @@ -420,7 +388,7 @@ out: } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one); -static void __nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto) +static void __nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto) { BUG_ON(l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos)); @@ -433,7 +401,7 @@ static void __nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto) &nf_conntrack_l4proto_generic); } -void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto) +void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto) { mutex_lock(&nf_ct_proto_mutex); __nf_ct_l4proto_unregister_one(l4proto); @@ -444,7 +412,7 @@ void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto) 
EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one); void nf_ct_l4proto_pernet_unregister_one(struct net *net, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto) { struct nf_proto_net *pn = nf_ct_l4proto_net(net, l4proto); @@ -469,8 +437,8 @@ int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto[], } if (i != num_proto) { ver = l4proto[i]->l3proto == PF_INET6 ? 6 : 4; - pr_err("nf_conntrack_ipv%d: can't register %s%d proto.\n", - ver, l4proto[i]->name, ver); + pr_err("nf_conntrack_ipv%d: can't register l4 %d proto.\n", + ver, l4proto[i]->l4proto); nf_ct_l4proto_unregister(l4proto, i); } return ret; @@ -478,7 +446,7 @@ int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto[], EXPORT_SYMBOL_GPL(nf_ct_l4proto_register); int nf_ct_l4proto_pernet_register(struct net *net, - struct nf_conntrack_l4proto *l4proto[], + struct nf_conntrack_l4proto *const l4proto[], unsigned int num_proto) { int ret = -EINVAL; @@ -490,8 +458,8 @@ int nf_ct_l4proto_pernet_register(struct net *net, break; } if (i != num_proto) { - pr_err("nf_conntrack_%s%d: pernet registration failed\n", - l4proto[i]->name, + pr_err("nf_conntrack_proto_%d %d: pernet registration failed\n", + l4proto[i]->l4proto, l4proto[i]->l3proto == PF_INET6 ? 
6 : 4); nf_ct_l4proto_pernet_unregister(net, l4proto, i); } @@ -514,8 +482,8 @@ void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto[], EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister); void nf_ct_l4proto_pernet_unregister(struct net *net, - struct nf_conntrack_l4proto *l4proto[], - unsigned int num_proto) + struct nf_conntrack_l4proto *const l4proto[], + unsigned int num_proto) { while (num_proto-- != 0) nf_ct_l4proto_pernet_unregister_one(net, l4proto[num_proto]); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 4707d997558af..188347571fc78 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -623,18 +623,12 @@ static bool dccp_can_early_drop(const struct nf_conn *ct) return false; } -static void dccp_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.dccp.port), - ntohs(tuple->dst.u.dccp.port)); -} - +#ifdef CONFIG_NF_CONNTRACK_PROCFS static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]); } +#endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, @@ -880,7 +874,6 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net) struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = { .l3proto = AF_INET, .l4proto = IPPROTO_DCCP, - .name = "dccp", .pkt_to_tuple = dccp_pkt_to_tuple, .invert_tuple = dccp_invert_tuple, .new = dccp_new, @@ -888,8 +881,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = { .get_timeouts = dccp_get_timeouts, .error = dccp_error, .can_early_drop = dccp_can_early_drop, - .print_tuple = dccp_print_tuple, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = dccp_print_conntrack, +#endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = dccp_to_nlattr, .nlattr_size = 
dccp_nlattr_size, @@ -916,7 +910,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4); struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = { .l3proto = AF_INET6, .l4proto = IPPROTO_DCCP, - .name = "dccp", .pkt_to_tuple = dccp_pkt_to_tuple, .invert_tuple = dccp_invert_tuple, .new = dccp_new, @@ -924,8 +917,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = { .get_timeouts = dccp_get_timeouts, .error = dccp_error, .can_early_drop = dccp_can_early_drop, - .print_tuple = dccp_print_tuple, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = dccp_print_conntrack, +#endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = dccp_to_nlattr, .nlattr_size = dccp_nlattr_size, diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index d5868bad33a7e..2993995b690db 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -17,22 +17,10 @@ static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ; static bool nf_generic_should_process(u8 proto) { switch (proto) { -#ifdef CONFIG_NF_CT_PROTO_SCTP_MODULE - case IPPROTO_SCTP: - return false; -#endif -#ifdef CONFIG_NF_CT_PROTO_DCCP_MODULE - case IPPROTO_DCCP: - return false; -#endif #ifdef CONFIG_NF_CT_PROTO_GRE_MODULE case IPPROTO_GRE: return false; #endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE_MODULE - case IPPROTO_UDPLITE: - return false; -#endif default: return true; } @@ -62,12 +50,6 @@ static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out the per-protocol part of the tuple. 
*/ -static void generic_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ -} - static unsigned int *generic_get_timeouts(struct net *net) { return &(generic_pernet(net)->timeout); @@ -187,10 +169,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = { .l3proto = PF_UNSPEC, .l4proto = 255, - .name = "unknown", .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, - .print_tuple = generic_print_tuple, .packet = generic_packet, .get_timeouts = generic_get_timeouts, .new = generic_new, diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 87bb40a3feb58..c0e3a23ac23a4 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -224,15 +224,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, return true; } -/* print gre part of tuple */ -static void gre_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "srckey=0x%x dstkey=0x%x ", - ntohs(tuple->src.u.gre.key), - ntohs(tuple->dst.u.gre.key)); -} - +#ifdef CONFIG_NF_CONNTRACK_PROCFS /* print private data for conntrack */ static void gre_print_conntrack(struct seq_file *s, struct nf_conn *ct) { @@ -240,6 +232,7 @@ static void gre_print_conntrack(struct seq_file *s, struct nf_conn *ct) (ct->proto.gre.timeout / HZ), (ct->proto.gre.stream_timeout / HZ)); } +#endif static unsigned int *gre_get_timeouts(struct net *net) { @@ -364,11 +357,11 @@ static int gre_init_net(struct net *net, u_int16_t proto) static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { .l3proto = AF_INET, .l4proto = IPPROTO_GRE, - .name = "gre", .pkt_to_tuple = gre_pkt_to_tuple, .invert_tuple = gre_invert_tuple, - .print_tuple = gre_print_tuple, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, +#endif .get_timeouts = gre_get_timeouts, .packet = gre_packet, .new = gre_new, diff --git 
a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 6eef29d2eec40..890b5c73368db 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -174,20 +174,13 @@ static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out the per-protocol part of the tuple. */ -static void sctp_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.sctp.port), - ntohs(tuple->dst.u.sctp.port)); -} - +#ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]); } +#endif #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ @@ -791,11 +784,11 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net) struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_SCTP, - .name = "sctp", .pkt_to_tuple = sctp_pkt_to_tuple, .invert_tuple = sctp_invert_tuple, - .print_tuple = sctp_print_tuple, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, +#endif .packet = sctp_packet, .get_timeouts = sctp_get_timeouts, .new = sctp_new, @@ -828,11 +821,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4); struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_SCTP, - .name = "sctp", .pkt_to_tuple = sctp_pkt_to_tuple, .invert_tuple = sctp_invert_tuple, - .print_tuple = sctp_print_tuple, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, +#endif .packet = sctp_packet, .get_timeouts = sctp_get_timeouts, .new = sctp_new, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c 
b/net/netfilter/nf_conntrack_proto_tcp.c index 9758a7dfd83ef..33c52d9ab2f52 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -301,20 +301,13 @@ static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out the per-protocol part of the tuple. */ -static void tcp_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.tcp.port), - ntohs(tuple->dst.u.tcp.port)); -} - +#ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); } +#endif static unsigned int get_conntrack_index(const struct tcphdr *tcph) { @@ -1556,11 +1549,11 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_TCP, - .name = "tcp", .pkt_to_tuple = tcp_pkt_to_tuple, .invert_tuple = tcp_invert_tuple, - .print_tuple = tcp_print_tuple, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, +#endif .packet = tcp_packet, .get_timeouts = tcp_get_timeouts, .new = tcp_new, @@ -1594,11 +1587,11 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_TCP, - .name = "tcp", .pkt_to_tuple = tcp_pkt_to_tuple, .invert_tuple = tcp_invert_tuple, - .print_tuple = tcp_print_tuple, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, +#endif .packet = tcp_packet, .get_timeouts = tcp_get_timeouts, .new = tcp_new, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index f6ebce6178ca6..dcf3030d22263 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -63,15 +63,6 @@ static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out 
the per-protocol part of the tuple. */ -static void udp_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.udp.port), - ntohs(tuple->dst.u.udp.port)); -} - static unsigned int *udp_get_timeouts(struct net *net) { return udp_pernet(net)->timeouts; @@ -313,11 +304,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_UDP, - .name = "udp", .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, - .print_tuple = udp_print_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -347,11 +336,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_UDPLITE, - .name = "udplite", .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, - .print_tuple = udp_print_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -381,11 +368,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_UDP, - .name = "udp", .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, - .print_tuple = udp_print_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -415,11 +400,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_UDPLITE, - .name = "udplite", .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, - .print_tuple = udp_print_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index d38af4274335b..4dbb5bad4363b 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -884,7 +884,6 @@ static int 
set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, tuple.dst.u3 = *daddr; tuple.dst.u.udp.port = port; - rcu_read_lock(); do { exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple); @@ -918,10 +917,8 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, goto err1; } - if (skip_expect) { - rcu_read_unlock(); + if (skip_expect) return NF_ACCEPT; - } rtp_exp = nf_ct_expect_alloc(ct); if (rtp_exp == NULL) @@ -952,7 +949,6 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, err2: nf_ct_expect_put(rtp_exp); err1: - rcu_read_unlock(); return ret; } diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index ccb5cb9043e0e..9eb85858d764a 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -41,8 +41,62 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { - l3proto->print_tuple(s, tuple); - l4proto->print_tuple(s, tuple); + switch (l3proto->l3proto) { + case NFPROTO_IPV4: + seq_printf(s, "src=%pI4 dst=%pI4 ", + &tuple->src.u3.ip, &tuple->dst.u3.ip); + break; + case NFPROTO_IPV6: + seq_printf(s, "src=%pI6 dst=%pI6 ", + tuple->src.u3.ip6, tuple->dst.u3.ip6); + break; + default: + break; + } + + switch (l4proto->l4proto) { + case IPPROTO_ICMP: + seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); + break; + case IPPROTO_TCP: + seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.tcp.port), + ntohs(tuple->dst.u.tcp.port)); + break; + case IPPROTO_UDPLITE: /* fallthrough */ + case IPPROTO_UDP: + seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); + + break; + case IPPROTO_DCCP: + seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.dccp.port), + ntohs(tuple->dst.u.dccp.port)); + 
break; + case IPPROTO_SCTP: + seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.sctp.port), + ntohs(tuple->dst.u.sctp.port)); + break; + case IPPROTO_ICMPV6: + seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); + break; + case IPPROTO_GRE: + seq_printf(s, "srckey=0x%x dstkey=0x%x ", + ntohs(tuple->src.u.gre.key), + ntohs(tuple->dst.u.gre.key)); + break; + default: + break; + } } EXPORT_SYMBOL_GPL(print_tuple); @@ -198,6 +252,31 @@ ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) } #endif +static const char* l3proto_name(u16 proto) +{ + switch (proto) { + case AF_INET: return "ipv4"; + case AF_INET6: return "ipv6"; + } + + return "unknown"; +} + +static const char* l4proto_name(u16 proto) +{ + switch (proto) { + case IPPROTO_ICMP: return "icmp"; + case IPPROTO_TCP: return "tcp"; + case IPPROTO_UDP: return "udp"; + case IPPROTO_DCCP: return "dccp"; + case IPPROTO_GRE: return "gre"; + case IPPROTO_SCTP: return "sctp"; + case IPPROTO_UDPLITE: return "udplite"; + } + + return "unknown"; +} + /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { @@ -231,8 +310,8 @@ static int ct_seq_show(struct seq_file *s, void *v) ret = -ENOSPC; seq_printf(s, "%-8s %u %-8s %u %ld ", - l3proto->name, nf_ct_l3num(ct), - l4proto->name, nf_ct_protonum(ct), + l3proto_name(l3proto->l3proto), nf_ct_l3num(ct), + l4proto_name(l4proto->l4proto), nf_ct_protonum(ct), nf_ct_expires(ct) / HZ); if (l4proto->print_conntrack) @@ -452,9 +531,6 @@ static int log_invalid_proto_max __read_mostly = 255; /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; -extern unsigned int nf_conntrack_default_on; -unsigned int nf_conntrack_default_on __read_mostly = 1; - static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -520,13 +596,6 @@ static struct ctl_table 
nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "nf_conntrack_default_on", - .data = &nf_conntrack_default_on, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { } }; diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index bfa742da83aff..49f87ec093a39 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -5,17 +5,11 @@ #include <linux/skbuff.h> #include <linux/netdevice.h> -#ifdef CONFIG_NETFILTER_DEBUG -#define NFDEBUG(format, args...) printk(KERN_DEBUG format , ## args) -#else -#define NFDEBUG(format, args...) -#endif - /* nf_queue.c */ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_entry **entryp, unsigned int verdict); + const struct nf_hook_entries *entries, unsigned int index, + unsigned int verdict); unsigned int nf_queue_nf_hook_drop(struct net *net); -int __init netfilter_queue_init(void); /* nf_log.c */ int __init netfilter_log_init(void); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 043850c9d154d..f7e21953b1deb 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -109,9 +109,11 @@ unsigned int nf_queue_nf_hook_drop(struct net *net) return count; } +EXPORT_SYMBOL_GPL(nf_queue_nf_hook_drop); static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, - struct nf_hook_entry *hook_entry, unsigned int queuenum) + const struct nf_hook_entries *entries, + unsigned int index, unsigned int queuenum) { int status = -ENOENT; struct nf_queue_entry *entry = NULL; @@ -139,7 +141,7 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, *entry = (struct nf_queue_entry) { .skb = skb, .state = *state, - .hook = hook_entry, + .hook_index = index, .size = sizeof(*entry) + afinfo->route_key_size, }; @@ -162,18 +164,16 @@ err: /* Packets leaving via this function must come back through nf_reinject(). 
*/ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_entry **entryp, unsigned int verdict) + const struct nf_hook_entries *entries, unsigned int index, + unsigned int verdict) { - struct nf_hook_entry *entry = *entryp; int ret; - ret = __nf_queue(skb, state, entry, verdict >> NF_VERDICT_QBITS); + ret = __nf_queue(skb, state, entries, index, verdict >> NF_VERDICT_QBITS); if (ret < 0) { if (ret == -ESRCH && - (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) { - *entryp = rcu_dereference(entry->next); + (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) return 1; - } kfree_skb(skb); } @@ -182,33 +182,56 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, static unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_entry **entryp) + const struct nf_hook_entries *hooks, + unsigned int *index) { - unsigned int verdict; + const struct nf_hook_entry *hook; + unsigned int verdict, i = *index; - do { + while (i < hooks->num_hook_entries) { + hook = &hooks->hooks[i]; repeat: - verdict = nf_hook_entry_hookfn((*entryp), skb, state); + verdict = nf_hook_entry_hookfn(hook, skb, state); if (verdict != NF_ACCEPT) { if (verdict != NF_REPEAT) return verdict; goto repeat; } - *entryp = rcu_dereference((*entryp)->next); - } while (*entryp); + i++; + } + *index = i; return NF_ACCEPT; } +/* Caller must hold rcu read-side lock */ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { - struct nf_hook_entry *hook_entry = entry->hook; + const struct nf_hook_entry *hook_entry; + const struct nf_hook_entries *hooks; struct sk_buff *skb = entry->skb; const struct nf_afinfo *afinfo; + const struct net *net; + unsigned int i; int err; + u8 pf; + + net = entry->state.net; + pf = entry->state.pf; + + hooks = rcu_dereference(net->nf.hooks[pf][entry->state.hook]); nf_queue_entry_release_refs(entry); + i = entry->hook_index; + if (WARN_ON_ONCE(i >= hooks->num_hook_entries)) { + kfree_skb(skb); + kfree(entry); + return; + 
} + + hook_entry = &hooks->hooks[i]; + /* Continue traversal iff userspace said ok... */ if (verdict == NF_REPEAT) verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); @@ -220,27 +243,22 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) } if (verdict == NF_ACCEPT) { - hook_entry = rcu_dereference(hook_entry->next); - if (hook_entry) next_hook: - verdict = nf_iterate(skb, &entry->state, &hook_entry); + ++i; + verdict = nf_iterate(skb, &entry->state, hooks, &i); } switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_STOP: -okfn: local_bh_disable(); entry->state.okfn(entry->state.net, entry->state.sk, skb); local_bh_enable(); break; case NF_QUEUE: - err = nf_queue(skb, &entry->state, &hook_entry, verdict); - if (err == 1) { - if (hook_entry) - goto next_hook; - goto okfn; - } + err = nf_queue(skb, &entry->state, hooks, i, verdict); + if (err == 1) + goto next_hook; break; case NF_STOLEN: break; diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c index c68c1e58b3628..d2a9e6b5d01f7 100644 --- a/net/netfilter/nf_sockopt.c +++ b/net/netfilter/nf_sockopt.c @@ -33,7 +33,7 @@ int nf_register_sockopt(struct nf_sockopt_ops *reg) reg->set_optmin, reg->set_optmax) || overlap(ops->get_optmin, ops->get_optmax, reg->get_optmin, reg->get_optmax))) { - NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", + pr_debug("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", ops->set_optmin, ops->set_optmax, ops->get_optmin, ops->get_optmax, reg->set_optmin, reg->set_optmax, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7843efa33c598..149785ff1c7b6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -726,7 +726,10 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (table == NULL) goto err2; - nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN); + table->name = nla_strdup(name, GFP_KERNEL); + if (table->name == NULL) + goto err3; + 
INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); @@ -735,10 +738,12 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) - goto err3; + goto err4; list_add_tail_rcu(&table->list, &afi->tables); return 0; +err4: + kfree(table->name); err3: kfree(table); err2: @@ -865,6 +870,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) { BUG_ON(ctx->table->use > 0); + kfree(ctx->table->name); kfree(ctx->table); module_put(ctx->afi->owner); } @@ -1240,10 +1246,14 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) module_put(basechain->type->owner); free_percpu(basechain->stats); + if (basechain->stats) + static_branch_dec(&nft_counters_enabled); if (basechain->ops[0].dev != NULL) dev_put(basechain->ops[0].dev); + kfree(chain->name); kfree(basechain); } else { + kfree(chain->name); kfree(chain); } } @@ -1468,8 +1478,13 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, nft_trans_chain_policy(trans) = -1; if (nla[NFTA_CHAIN_HANDLE] && name) { - nla_strlcpy(nft_trans_chain_name(trans), name, - NFT_CHAIN_MAXNAMELEN); + nft_trans_chain_name(trans) = + nla_strdup(name, GFP_KERNEL); + if (!nft_trans_chain_name(trans)) { + kfree(trans); + free_percpu(stats); + return -ENOMEM; + } } list_add_tail(&trans->list, &net->nft.commit_list); return 0; @@ -1504,14 +1519,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return PTR_ERR(stats); } basechain->stats = stats; - } else { - stats = netdev_alloc_pcpu_stats(struct nft_stats); - if (stats == NULL) { - nft_chain_release_hook(&hook); - kfree(basechain); - return -ENOMEM; - } - rcu_assign_pointer(basechain->stats, stats); + static_branch_inc(&nft_counters_enabled); } hookfn = hook.type->hooks[hook.num]; @@ -1543,7 +1551,11 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, 
INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; - nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); + chain->name = nla_strdup(name, GFP_KERNEL); + if (!chain->name) { + err = -ENOMEM; + goto err1; + } err = nf_tables_register_hooks(net, table, chain, afi->nops); if (err < 0) @@ -1977,8 +1989,8 @@ err: } struct nft_rule_dump_ctx { - char table[NFT_TABLE_MAXNAMELEN]; - char chain[NFT_CHAIN_MAXNAMELEN]; + char *table; + char *chain; }; static int nf_tables_dump_rules(struct sk_buff *skb, @@ -2002,7 +2014,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, continue; list_for_each_entry_rcu(table, &afi->tables, list) { - if (ctx && ctx->table[0] && + if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0) continue; @@ -2042,7 +2054,13 @@ done: static int nf_tables_dump_rules_done(struct netlink_callback *cb) { - kfree(cb->data); + struct nft_rule_dump_ctx *ctx = cb->data; + + if (ctx) { + kfree(ctx->table); + kfree(ctx->chain); + kfree(ctx); + } return 0; } @@ -2074,12 +2092,23 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, if (!ctx) return -ENOMEM; - if (nla[NFTA_RULE_TABLE]) - nla_strlcpy(ctx->table, nla[NFTA_RULE_TABLE], - sizeof(ctx->table)); - if (nla[NFTA_RULE_CHAIN]) - nla_strlcpy(ctx->chain, nla[NFTA_RULE_CHAIN], - sizeof(ctx->chain)); + if (nla[NFTA_RULE_TABLE]) { + ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], + GFP_KERNEL); + if (!ctx->table) { + kfree(ctx); + return -ENOMEM; + } + } + if (nla[NFTA_RULE_CHAIN]) { + ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], + GFP_KERNEL); + if (!ctx->chain) { + kfree(ctx->table); + kfree(ctx); + return -ENOMEM; + } + } c.data = ctx; } @@ -2621,7 +2650,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, unsigned long *inuse; unsigned int n = 0, min = 0; - p = strnchr(name, NFT_SET_MAXNAMELEN, '%'); + p = strchr(name, '%'); if (p != NULL) { if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; @@ -2652,7 
+2681,10 @@ cont: free_page((unsigned long)inuse); } - snprintf(set->name, sizeof(set->name), name, min + n); + set->name = kasprintf(GFP_KERNEL, name, min + n); + if (!set->name) + return -ENOMEM; + list_for_each_entry(i, &ctx->table->sets, list) { if (!nft_is_active_next(ctx->net, i)) continue; @@ -2929,7 +2961,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; - char name[NFT_SET_MAXNAMELEN]; + char *name; unsigned int size; bool create; u64 timeout; @@ -3075,8 +3107,14 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, goto err1; } - nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(set->name)); + name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL); + if (!name) { + err = -ENOMEM; + goto err2; + } + err = nf_tables_set_alloc_name(&ctx, set, name); + kfree(name); if (err < 0) goto err2; @@ -3126,6 +3164,7 @@ static void nft_set_destroy(struct nft_set *set) { set->ops->destroy(set); module_put(set->ops->type->owner); + kfree(set->name); kvfree(set); } @@ -4363,15 +4402,21 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, goto err1; } obj->table = table; - nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN); + obj->name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL); + if (!obj->name) { + err = -ENOMEM; + goto err2; + } err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj); if (err < 0) - goto err2; + goto err3; list_add_tail_rcu(&obj->list, &table->objects); table->use++; return 0; +err3: + kfree(obj->name); err2: if (obj->type->destroy) obj->type->destroy(obj); @@ -4415,7 +4460,7 @@ nla_put_failure: } struct nft_obj_filter { - char table[NFT_OBJ_MAXNAMELEN]; + char *table; u32 type; }; @@ -4480,7 +4525,10 @@ done: static int nf_tables_dump_obj_done(struct netlink_callback *cb) { - kfree(cb->data); + struct nft_obj_filter *filter = cb->data; + + kfree(filter->table); + kfree(filter); return 0; } @@ -4494,9 +4542,13 @@ nft_obj_filter_alloc(const 
struct nlattr * const nla[]) if (!filter) return ERR_PTR(-ENOMEM); - if (nla[NFTA_OBJ_TABLE]) - nla_strlcpy(filter->table, nla[NFTA_OBJ_TABLE], - NFT_TABLE_MAXNAMELEN); + if (nla[NFTA_OBJ_TABLE]) { + filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_KERNEL); + if (!filter->table) { + kfree(filter); + return ERR_PTR(-ENOMEM); + } + } if (nla[NFTA_OBJ_TYPE]) filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); @@ -4580,6 +4632,7 @@ static void nft_obj_destroy(struct nft_object *obj) obj->type->destroy(obj); module_put(obj->type->owner); + kfree(obj->name); kfree(obj); } @@ -4662,6 +4715,7 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; + char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); @@ -4673,7 +4727,9 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq))) + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) || + nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || + nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -4842,7 +4898,7 @@ static void nft_chain_commit_update(struct nft_trans *trans) { struct nft_base_chain *basechain; - if (nft_trans_chain_name(trans)[0]) + if (nft_trans_chain_name(trans)) strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans)); if (!nft_is_base_chain(trans->ctx.chain)) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 65dbeadcb1188..dfd0bf3810d2e 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -29,7 +29,7 @@ static const char *const comments[__NFT_TRACETYPE_MAX] = { [NFT_TRACETYPE_RULE] = "rule", }; -static struct 
nf_loginfo trace_loginfo = { +static const struct nf_loginfo trace_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { @@ -114,6 +114,22 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, return true; } +DEFINE_STATIC_KEY_FALSE(nft_counters_enabled); + +static noinline void nft_update_chain_stats(const struct nft_chain *chain, + const struct nft_pktinfo *pkt) +{ + struct nft_stats *stats; + + local_bh_disable(); + stats = this_cpu_ptr(rcu_dereference(nft_base_chain(chain)->stats)); + u64_stats_update_begin(&stats->syncp); + stats->pkts++; + stats->bytes += pkt->skb->len; + u64_stats_update_end(&stats->syncp); + local_bh_enable(); +} + struct nft_jumpstack { const struct nft_chain *chain; const struct nft_rule *rule; @@ -130,7 +146,6 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) struct nft_regs regs; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; - struct nft_stats *stats; int rulenum; unsigned int gencursor = nft_genmask_cur(net); struct nft_traceinfo info; @@ -220,13 +235,8 @@ next_rule: nft_trace_packet(&info, basechain, NULL, -1, NFT_TRACETYPE_POLICY); - rcu_read_lock_bh(); - stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats)); - u64_stats_update_begin(&stats->syncp); - stats->pkts++; - stats->bytes += pkt->skb->len; - u64_stats_update_end(&stats->syncp); - rcu_read_unlock_bh(); + if (static_branch_unlikely(&nft_counters_enabled)) + nft_update_chain_stats(basechain, pkt); return nft_base_chain(basechain)->policy; } diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index e1b15e7a5793f..e1dc527a493b8 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -162,6 +162,27 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb, NFTA_TRACE_PAD); } +static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info) +{ + switch (info->type) { + case NFT_TRACETYPE_RETURN: + case NFT_TRACETYPE_RULE: + break; + default: + return 
false; + } + + switch (info->verdict->code) { + case NFT_JUMP: + case NFT_GOTO: + break; + default: + return false; + } + + return true; +} + void nft_trace_notify(struct nft_traceinfo *info) { const struct nft_pktinfo *pkt = info->pkt; @@ -175,13 +196,12 @@ void nft_trace_notify(struct nft_traceinfo *info) return; size = nlmsg_total_size(sizeof(struct nfgenmsg)) + - nla_total_size(NFT_TABLE_MAXNAMELEN) + - nla_total_size(NFT_CHAIN_MAXNAMELEN) + + nla_total_size(strlen(info->chain->table->name)) + + nla_total_size(strlen(info->chain->name)) + nla_total_size_64bit(sizeof(__be64)) + /* rule handle */ nla_total_size(sizeof(__be32)) + /* trace type */ nla_total_size(0) + /* VERDICT, nested */ nla_total_size(sizeof(u32)) + /* verdict code */ - nla_total_size(NFT_CHAIN_MAXNAMELEN) + /* jump target */ nla_total_size(sizeof(u32)) + /* id */ nla_total_size(NFT_TRACETYPE_LL_HSIZE) + nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) + @@ -194,6 +214,9 @@ void nft_trace_notify(struct nft_traceinfo *info) nla_total_size(sizeof(u32)) + /* nfproto */ nla_total_size(sizeof(u32)); /* policy */ + if (nft_trace_have_verdict_chain(info)) + size += nla_total_size(strlen(info->verdict->chain->name)); /* jump target */ + skb = nlmsg_new(size, GFP_ATOMIC); if (!skb) return; @@ -217,14 +240,11 @@ void nft_trace_notify(struct nft_traceinfo *info) if (trace_fill_id(skb, pkt->skb)) goto nla_put_failure; - if (info->chain) { - if (nla_put_string(skb, NFTA_TRACE_CHAIN, - info->chain->name)) - goto nla_put_failure; - if (nla_put_string(skb, NFTA_TRACE_TABLE, - info->chain->table->name)) - goto nla_put_failure; - } + if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name)) + goto nla_put_failure; + + if (nla_put_string(skb, NFTA_TRACE_TABLE, info->chain->table->name)) + goto nla_put_failure; if (nf_trace_fill_rule_info(skb, info)) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 400e9ae971533..32b1c0b44e791 100644 --- 
a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -47,7 +47,8 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { }; static int -ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto, +ctnl_timeout_parse_policy(void *timeouts, + const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { int ret = 0; @@ -74,7 +75,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, { __u16 l3num; __u8 l4num; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; struct ctnl_timeout *timeout, *matching = NULL; char *name; int ret; @@ -158,7 +159,7 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; - struct nf_conntrack_l4proto *l4proto = timeout->l4proto; + const struct nf_conntrack_l4proto *l4proto = timeout->l4proto; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); @@ -363,10 +364,10 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, const struct nlattr * const cda[], struct netlink_ext_ack *extack) { + const struct nf_conntrack_l4proto *l4proto; + unsigned int *timeouts; __u16 l3num; __u8 l4num; - struct nf_conntrack_l4proto *l4proto; - unsigned int *timeouts; int ret; if (!cda[CTA_TIMEOUT_L3PROTO] || @@ -401,7 +402,7 @@ err: static int cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -453,11 +454,11 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, const struct nlattr * const cda[], struct netlink_ext_ack *extack) { - __u16 l3num; - __u8 l4num; - struct nf_conntrack_l4proto *l4proto; + 
const struct nf_conntrack_l4proto *l4proto; struct sk_buff *skb2; int ret, err; + __u16 l3num; + __u8 l4num; if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) return -EINVAL; @@ -505,7 +506,6 @@ ctnl_timeout_find_get(struct net *net, const char *name) { struct ctnl_timeout *timeout, *matching = NULL; - rcu_read_lock(); list_for_each_entry_rcu(timeout, &net->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; @@ -521,7 +521,6 @@ ctnl_timeout_find_get(struct net *net, const char *name) break; } err: - rcu_read_unlock(); return matching; } @@ -572,6 +571,7 @@ static void __net_exit cttimeout_net_exit(struct net *net) { struct ctnl_timeout *cur, *tmp; + nf_ct_unconfirmed_destroy(net); ctnl_untimeout(net, NULL); list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index c684ba95dbb49..cad6498f10b03 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -606,7 +606,7 @@ nla_put_failure: return -1; } -static struct nf_loginfo default_loginfo = { +static const struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_ULOG, .u = { .ulog = { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 16fa04086880c..c9796629858f7 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -41,6 +41,10 @@ #include "../bridge/br_private.h" #endif +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack.h> +#endif + #define NFQNL_QMAX_DEFAULT 1024 /* We're using struct nlattr which has 16bit nla_len. 
Note that nla_len @@ -612,6 +616,18 @@ nlmsg_failure: return NULL; } +static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + static const unsigned long flags = IPS_CONFIRMED | IPS_DYING; + const struct nf_conn *ct = (void *)skb_nfct(entry->skb); + + if (ct && ((ct->status & flags) == IPS_DYING)) + return true; +#endif + return false; +} + static int __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry) @@ -628,6 +644,9 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, } spin_lock_bh(&queue->lock); + if (nf_ct_drop_unconfirmed(entry)) + goto err_out_free_nskb; + if (queue->queue_total >= queue->queue_maxlen) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; @@ -928,7 +947,6 @@ static unsigned int nfqnl_nf_hook_drop(struct net *net) unsigned int instances = 0; int i; - rcu_read_lock(); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; @@ -938,7 +956,6 @@ static unsigned int nfqnl_nf_hook_drop(struct net *net) instances++; } } - rcu_read_unlock(); return instances; } diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 1ec49fe5845f1..a0a93d987a3bd 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -8,6 +8,7 @@ * Development of this code funded by Astaro AG (http://www.astaro.com/) */ +#include <asm/unaligned.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> @@ -23,6 +24,7 @@ struct nft_exthdr { u8 len; u8 op; enum nft_registers dreg:8; + enum nft_registers sreg:8; u8 flags; }; @@ -61,6 +63,26 @@ err: regs->verdict.code = NFT_BREAK; } +static void * +nft_tcp_header_pointer(const struct nft_pktinfo *pkt, + unsigned int len, void *buffer, unsigned int *tcphdr_len) +{ + struct tcphdr *tcph; + + if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP) + return NULL; + + tcph = 
skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buffer); + if (!tcph) + return NULL; + + *tcphdr_len = __tcp_hdrlen(tcph); + if (*tcphdr_len < sizeof(*tcph) || *tcphdr_len > len) + return NULL; + + return skb_header_pointer(pkt->skb, pkt->xt.thoff, *tcphdr_len, buffer); +} + static void nft_exthdr_tcp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -72,18 +94,7 @@ static void nft_exthdr_tcp_eval(const struct nft_expr *expr, struct tcphdr *tcph; u8 *opt; - if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP) - goto err; - - tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buff); - if (!tcph) - goto err; - - tcphdr_len = __tcp_hdrlen(tcph); - if (tcphdr_len < sizeof(*tcph)) - goto err; - - tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, tcphdr_len, buff); + tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); if (!tcph) goto err; @@ -115,6 +126,88 @@ err: regs->verdict.code = NFT_BREAK; } +static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; + struct nft_exthdr *priv = nft_expr_priv(expr); + unsigned int i, optl, tcphdr_len, offset; + struct tcphdr *tcph; + u8 *opt; + u32 src; + + tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); + if (!tcph) + return; + + opt = (u8 *)tcph; + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { + union { + u8 octet; + __be16 v16; + __be32 v32; + } old, new; + + optl = optlen(opt, i); + + if (priv->type != opt[i]) + continue; + + if (i + optl > tcphdr_len || priv->len + priv->offset > optl) + return; + + if (!skb_make_writable(pkt->skb, pkt->xt.thoff + i + priv->len)) + return; + + tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, + &tcphdr_len); + if (!tcph) + return; + + src = regs->data[priv->sreg]; + offset = i + priv->offset; + + switch (priv->len) { + case 2: + old.v16 = 
get_unaligned((u16 *)(opt + offset)); + new.v16 = src; + + switch (priv->type) { + case TCPOPT_MSS: + /* increase can cause connection to stall */ + if (ntohs(old.v16) <= ntohs(new.v16)) + return; + break; + } + + if (old.v16 == new.v16) + return; + + put_unaligned(new.v16, (u16*)(opt + offset)); + inet_proto_csum_replace2(&tcph->check, pkt->skb, + old.v16, new.v16, false); + break; + case 4: + new.v32 = src; + old.v32 = get_unaligned((u32 *)(opt + offset)); + + if (old.v32 == new.v32) + return; + + put_unaligned(new.v32, (u32*)(opt + offset)); + inet_proto_csum_replace4(&tcph->check, pkt->skb, + old.v32, new.v32, false); + break; + default: + WARN_ON_ONCE(1); + break; + } + + return; + } +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, @@ -171,12 +264,57 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, priv->len); } -static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { - const struct nft_exthdr *priv = nft_expr_priv(expr); + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6; + int err; - if (nft_dump_register(skb, NFTA_EXTHDR_DREG, priv->dreg)) - goto nla_put_failure; + if (!tb[NFTA_EXTHDR_SREG] || + !tb[NFTA_EXTHDR_TYPE] || + !tb[NFTA_EXTHDR_OFFSET] || + !tb[NFTA_EXTHDR_LEN]) + return -EINVAL; + + if (tb[NFTA_EXTHDR_DREG] || tb[NFTA_EXTHDR_FLAGS]) + return -EINVAL; + + err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); + if (err < 0) + return err; + + err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len); + if (err < 0) + return err; + + if (offset < 2) + return -EOPNOTSUPP; + + switch (len) { + case 2: break; + case 4: break; + default: + return -EOPNOTSUPP; + } + + err = 
nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op); + if (err < 0) + return err; + + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); + priv->offset = offset; + priv->len = len; + priv->sreg = nft_parse_register(tb[NFTA_EXTHDR_SREG]); + priv->flags = flags; + priv->op = op; + + return nft_validate_register_load(priv->sreg, priv->len); +} + +static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) +{ if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_OFFSET, htonl(priv->offset))) @@ -193,6 +331,26 @@ nla_put_failure: return -1; } +static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_exthdr *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_EXTHDR_DREG, priv->dreg)) + return -1; + + return nft_exthdr_dump_common(skb, priv); +} + +static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_exthdr *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_EXTHDR_SREG, priv->sreg)) + return -1; + + return nft_exthdr_dump_common(skb, priv); +} + static struct nft_expr_type nft_exthdr_type; static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .type = &nft_exthdr_type, @@ -210,6 +368,14 @@ static const struct nft_expr_ops nft_exthdr_tcp_ops = { .dump = nft_exthdr_dump, }; +static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_tcp_set_eval, + .init = nft_exthdr_tcp_set_init, + .dump = nft_exthdr_dump_set, +}; + static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -219,12 +385,21 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (!tb[NFTA_EXTHDR_OP]) return &nft_exthdr_ipv6_ops; - op = ntohl(nla_get_u32(tb[NFTA_EXTHDR_OP])); + if (tb[NFTA_EXTHDR_SREG] && tb[NFTA_EXTHDR_DREG]) + return 
ERR_PTR(-EOPNOTSUPP); + + op = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OP])); switch (op) { case NFT_EXTHDR_OP_TCPOPT: - return &nft_exthdr_tcp_ops; + if (tb[NFTA_EXTHDR_SREG]) + return &nft_exthdr_tcp_set_ops; + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_tcp_ops; + break; case NFT_EXTHDR_OP_IPV6: - return &nft_exthdr_ipv6_ops; + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_ipv6_ops; + break; } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c new file mode 100644 index 0000000000000..3997ee36cfbd5 --- /dev/null +++ b/net/netfilter/nft_fib_netdev.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2017 Pablo M. Bermudo Garay <pablombg@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This code is based on net/netfilter/nft_fib_inet.c, written by + * Florian Westphal <fw@strlen.de>. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +#include <net/netfilter/nft_fib.h> + +static void nft_fib_netdev_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + + switch (ntohs(pkt->skb->protocol)) { + case ETH_P_IP: + switch (priv->result) { + case NFT_FIB_RESULT_OIF: + case NFT_FIB_RESULT_OIFNAME: + return nft_fib4_eval(expr, regs, pkt); + case NFT_FIB_RESULT_ADDRTYPE: + return nft_fib4_eval_type(expr, regs, pkt); + } + break; + case ETH_P_IPV6: + switch (priv->result) { + case NFT_FIB_RESULT_OIF: + case NFT_FIB_RESULT_OIFNAME: + return nft_fib6_eval(expr, regs, pkt); + case NFT_FIB_RESULT_ADDRTYPE: + return nft_fib6_eval_type(expr, regs, pkt); + } + break; + } + + 
regs->verdict.code = NFT_BREAK; +} + +static struct nft_expr_type nft_fib_netdev_type; +static const struct nft_expr_ops nft_fib_netdev_ops = { + .type = &nft_fib_netdev_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)), + .eval = nft_fib_netdev_eval, + .init = nft_fib_init, + .dump = nft_fib_dump, + .validate = nft_fib_validate, +}; + +static struct nft_expr_type nft_fib_netdev_type __read_mostly = { + .family = NFPROTO_NETDEV, + .name = "fib", + .ops = &nft_fib_netdev_ops, + .policy = nft_fib_policy, + .maxattr = NFTA_FIB_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_fib_netdev_module_init(void) +{ + return nft_register_expr(&nft_fib_netdev_type); +} + +static void __exit nft_fib_netdev_module_exit(void) +{ + nft_unregister_expr(&nft_fib_netdev_type); +} + +module_init(nft_fib_netdev_module_init); +module_exit(nft_fib_netdev_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo M. Bermudo Garay <pablombg@gmail.com>"); +MODULE_ALIAS_NFT_AF_EXPR(5, "fib"); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 7d699bbd45b0e..e110b0ebbf58b 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -184,7 +184,7 @@ static bool nft_payload_udp_checksum(struct sk_buff *skb, unsigned int thoff) if (!uh) return false; - return uh->check; + return (__force bool)uh->check; } static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index c7383d8f88d0b..a6b7d05aeacf4 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -23,6 +23,43 @@ struct nft_rt { enum nft_registers dreg:8; }; +static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skbdst) +{ + u32 minlen = sizeof(struct ipv6hdr), mtu = dst_mtu(skbdst); + const struct sk_buff *skb = pkt->skb; + const struct nf_afinfo *ai; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = 
ip_hdr(skb)->saddr; + minlen = sizeof(struct iphdr) + sizeof(struct tcphdr); + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = ipv6_hdr(skb)->saddr; + minlen = sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + break; + } + + ai = nf_get_afinfo(nft_pf(pkt)); + if (ai) { + struct dst_entry *dst = NULL; + + ai->route(nft_net(pkt), &dst, &fl, false); + if (dst) { + mtu = min(mtu, dst_mtu(dst)); + dst_release(dst); + } + } + + if (mtu <= minlen || mtu > 0xffff) + return TCP_MSS_DEFAULT; + + return mtu - minlen; +} + static void nft_rt_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -46,8 +83,8 @@ static void nft_rt_get_eval(const struct nft_expr *expr, if (nft_pf(pkt) != NFPROTO_IPV4) goto err; - *dest = rt_nexthop((const struct rtable *)dst, - ip_hdr(skb)->daddr); + *dest = (__force u32)rt_nexthop((const struct rtable *)dst, + ip_hdr(skb)->daddr); break; case NFT_RT_NEXTHOP6: if (nft_pf(pkt) != NFPROTO_IPV6) @@ -57,6 +94,9 @@ static void nft_rt_get_eval(const struct nft_expr *expr, &ipv6_hdr(skb)->daddr), sizeof(struct in6_addr)); break; + case NFT_RT_TCPMSS: + nft_reg_store16(dest, get_tcpmss(pkt, dst)); + break; default: WARN_ON(1); goto err; @@ -67,7 +107,7 @@ err: regs->verdict.code = NFT_BREAK; } -const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = { +static const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = { [NFTA_RT_DREG] = { .type = NLA_U32 }, [NFTA_RT_KEY] = { .type = NLA_U32 }, }; @@ -94,6 +134,9 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, case NFT_RT_NEXTHOP6: len = sizeof(struct in6_addr); break; + case NFT_RT_TCPMSS: + len = sizeof(u16); + break; default: return -EOPNOTSUPP; } @@ -118,6 +161,29 @@ nla_put_failure: return -1; } +static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_rt *priv = nft_expr_priv(expr); + unsigned int hooks; + + switch (priv->key) { + case NFT_RT_NEXTHOP4: + case NFT_RT_NEXTHOP6: 
+ case NFT_RT_CLASSID: + return 0; + case NFT_RT_TCPMSS: + hooks = (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING); + break; + default: + return -EINVAL; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} + static struct nft_expr_type nft_rt_type; static const struct nft_expr_ops nft_rt_get_ops = { .type = &nft_rt_type, @@ -125,6 +191,7 @@ static const struct nft_expr_ops nft_rt_get_ops = { .eval = nft_rt_get_eval, .init = nft_rt_get_init, .dump = nft_rt_get_dump, + .validate = nft_rt_validate, }; static struct nft_expr_type nft_rt_type __read_mostly = { diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index bce5382f1d49d..d83a4ec5900d4 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -19,8 +19,9 @@ #include <net/netfilter/nf_tables.h> struct nft_rbtree { - rwlock_t lock; struct rb_root root; + rwlock_t lock; + seqcount_t count; }; struct nft_rbtree_elem { @@ -40,8 +41,9 @@ static bool nft_rbtree_equal(const struct nft_set *set, const void *this, return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; } -static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext, + unsigned int seq) { struct nft_rbtree *priv = nft_set_priv(set); const struct nft_rbtree_elem *rbe, *interval = NULL; @@ -50,15 +52,17 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const void *this; int d; - read_lock_bh(&priv->lock); - parent = priv->root.rb_node; + parent = rcu_dereference_raw(priv->root.rb_node); while (parent != NULL) { + if (read_seqcount_retry(&priv->count, seq)) + return false; + rbe = rb_entry(parent, struct nft_rbtree_elem, node); this = nft_set_ext_key(&rbe->ext); d = memcmp(this, key, set->klen); if (d < 0) { - parent 
= parent->rb_left; + parent = rcu_dereference_raw(parent->rb_left); if (interval && nft_rbtree_equal(set, this, interval) && nft_rbtree_interval_end(this) && @@ -66,15 +70,14 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, continue; interval = rbe; } else if (d > 0) - parent = parent->rb_right; + parent = rcu_dereference_raw(parent->rb_right); else { if (!nft_set_elem_active(&rbe->ext, genmask)) { - parent = parent->rb_left; + parent = rcu_dereference_raw(parent->rb_left); continue; } if (nft_rbtree_interval_end(rbe)) goto out; - read_unlock_bh(&priv->lock); *ext = &rbe->ext; return true; @@ -84,15 +87,32 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && !nft_rbtree_interval_end(interval)) { - read_unlock_bh(&priv->lock); *ext = &interval->ext; return true; } out: - read_unlock_bh(&priv->lock); return false; } +static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + struct nft_rbtree *priv = nft_set_priv(set); + unsigned int seq = read_seqcount_begin(&priv->count); + bool ret; + + ret = __nft_rbtree_lookup(net, set, key, ext, seq); + if (ret || !read_seqcount_retry(&priv->count, seq)) + return ret; + + read_lock_bh(&priv->lock); + seq = read_seqcount_begin(&priv->count); + ret = __nft_rbtree_lookup(net, set, key, ext, seq); + read_unlock_bh(&priv->lock); + + return ret; +} + static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_set_ext **ext) @@ -130,7 +150,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, } } } - rb_link_node(&new->node, parent, p); + rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; } @@ -144,7 +164,9 @@ static int nft_rbtree_insert(const struct net *net, 
const struct nft_set *set, int err; write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); err = __nft_rbtree_insert(net, set, rbe, ext); + write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); return err; @@ -158,7 +180,9 @@ static void nft_rbtree_remove(const struct net *net, struct nft_rbtree_elem *rbe = elem->priv; write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); rb_erase(&rbe->node, &priv->root); + write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); } @@ -264,6 +288,7 @@ static int nft_rbtree_init(const struct nft_set *set, struct nft_rbtree *priv = nft_set_priv(set); rwlock_init(&priv->lock); + seqcount_init(&priv->count); priv->root = RB_ROOT; return 0; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index e1648238a9c99..c83a3b5e1c6c2 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1192,16 +1192,10 @@ xt_replace_table(struct xt_table *table, #ifdef CONFIG_AUDIT if (audit_enabled) { - struct audit_buffer *ab; - - ab = audit_log_start(current->audit_context, GFP_KERNEL, - AUDIT_NETFILTER_CFG); - if (ab) { - audit_log_format(ab, "table=%s family=%u entries=%u", - table->name, table->af, - private->number); - audit_log_end(ab); - } + audit_log(current->audit_context, GFP_KERNEL, + AUDIT_NETFILTER_CFG, + "table=%s family=%u entries=%u", + table->name, table->af, private->number); } #endif diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 623ef37de886f..5a152e2acfd58 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -121,9 +121,9 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT typeof(nf_ct_timeout_find_get_hook) timeout_find_get; + const struct nf_conntrack_l4proto *l4proto; struct ctnl_timeout *timeout; struct nf_conn_timeout *timeout_ext; - struct nf_conntrack_l4proto *l4proto; int ret = 0; u8 proto; diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 
c64aca611ac5c..9dae4d665965e 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -62,11 +62,9 @@ static u_int32_t tcpmss_reverse_mtu(struct net *net, memset(fl6, 0, sizeof(*fl6)); fl6->daddr = ipv6_hdr(skb)->saddr; } - rcu_read_lock(); ai = nf_get_afinfo(family); if (ai != NULL) ai->route(net, (struct dst_entry **)&rt, &fl, false); - rcu_read_unlock(); if (rt != NULL) { mtu = dst_mtu(&rt->dst); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index ade4c10c28c6d..17d7705e3bd41 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -70,13 +70,11 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) return user_laddr; laddr = 0; - rcu_read_lock(); indev = __in_dev_get_rcu(skb->dev); for_primary_ifa(indev) { laddr = ifa->ifa_local; break; } endfor_ifa(indev); - rcu_read_unlock(); return laddr ? laddr : daddr; } @@ -391,7 +389,6 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, return user_laddr; laddr = NULL; - rcu_read_lock(); indev = __in6_dev_get(skb->dev); if (indev) { read_lock_bh(&indev->lock); @@ -404,7 +401,6 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, } read_unlock_bh(&indev->lock); } - rcu_read_unlock(); return laddr ? 
laddr : daddr; } diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index e329dabde35f5..3b2be2ae69875 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -47,8 +47,6 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, if (dev) flow.flowi6_oif = dev->ifindex; - rcu_read_lock(); - afinfo = nf_get_afinfo(NFPROTO_IPV6); if (afinfo != NULL) { const struct nf_ipv6_ops *v6ops; @@ -63,7 +61,6 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, } else { route_err = 1; } - rcu_read_unlock(); if (route_err) return XT_ADDRTYPE_UNREACHABLE; diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index b8fd4ab762edb..ffa8eec980e9e 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -58,8 +58,7 @@ struct xt_connlimit_rb { static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp; struct xt_connlimit_data { - struct rb_root climit_root4[CONNLIMIT_SLOTS]; - struct rb_root climit_root6[CONNLIMIT_SLOTS]; + struct rb_root climit_root[CONNLIMIT_SLOTS]; }; static u_int32_t connlimit_rnd __read_mostly; @@ -144,7 +143,6 @@ static unsigned int check_hlist(struct net *net, unsigned int length = 0; *addit = true; - rcu_read_lock(); /* check the saved connections */ hlist_for_each_entry_safe(conn, n, head, node) { @@ -179,8 +177,6 @@ static unsigned int check_hlist(struct net *net, length++; } - rcu_read_unlock(); - return length; } @@ -297,13 +293,11 @@ static int count_them(struct net *net, int count; u32 hash; - if (family == NFPROTO_IPV6) { + if (family == NFPROTO_IPV6) hash = connlimit_iphash6(addr, mask); - root = &data->climit_root6[hash]; - } else { + else hash = connlimit_iphash(addr->ip & mask->ip); - root = &data->climit_root4[hash]; - } + root = &data->climit_root[hash]; spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); @@ -382,10 +376,8 @@ static int connlimit_mt_check(const struct xt_mtchk_param 
*par) return -ENOMEM; } - for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) - info->data->climit_root4[i] = RB_ROOT; - for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) - info->data->climit_root6[i] = RB_ROOT; + for (i = 0; i < ARRAY_SIZE(info->data->climit_root); ++i) + info->data->climit_root[i] = RB_ROOT; return 0; } @@ -416,10 +408,8 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) nf_ct_netns_put(par->net, par->family); - for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) - destroy_tree(&info->data->climit_root4[i]); - for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) - destroy_tree(&info->data->climit_root6[i]); + for (i = 0; i < ARRAY_SIZE(info->data->climit_root); ++i) + destroy_tree(&info->data->climit_root[i]); kfree(info->data); } diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 762e1874f28b7..ffdb611e54a26 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -659,12 +659,12 @@ hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) goto hotdrop; - rcu_read_lock_bh(); + local_bh_disable(); dh = dsthash_find(hinfo, &dst); if (dh == NULL) { dh = dsthash_alloc_init(hinfo, &dst, &race); if (dh == NULL) { - rcu_read_unlock_bh(); + local_bh_enable(); goto hotdrop; } else if (race) { /* Already got an entry, update expiration timeout */ @@ -689,12 +689,12 @@ hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, /* below the limit */ dh->rateinfo.credit -= cost; spin_unlock(&dh->lock); - rcu_read_unlock_bh(); + local_bh_enable(); return !(cfg->mode & XT_HASHLIMIT_INVERT); } spin_unlock(&dh->lock); - rcu_read_unlock_bh(); + local_bh_enable(); /* default match is underlimit - so over the limit, we need to invert */ return cfg->mode & XT_HASHLIMIT_INVERT; diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 71cfa9551d083..36e14b1f061dd 
100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -226,7 +226,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) sizeof(struct tcphdr), optsize, opts); } - rcu_read_lock(); list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) { int foptsize, optnum; @@ -340,7 +339,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) info->loglevel == XT_OSF_LOGLEVEL_FIRST) break; } - rcu_read_unlock(); if (!fcount && (info->flags & XT_OSF_LOG)) nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p), diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 30d632509f829..d558e882ca0c5 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -579,8 +579,8 @@ static struct nf_conn * ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, u8 l3num, struct sk_buff *skb, bool natted) { - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 33fd061305c40..2f2e1338cd3d7 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6530,7 +6530,7 @@ security_initcall(selinux_init); #if defined(CONFIG_NETFILTER) -static struct nf_hook_ops selinux_nf_ops[] = { +static const struct nf_hook_ops selinux_nf_ops[] = { { .hook = selinux_ipv4_postroute, .pf = NFPROTO_IPV4, diff --git a/security/smack/smack_netfilter.c b/security/smack/smack_netfilter.c index cdeb0f3243dd6..e36d17835d4ff 100644 --- a/security/smack/smack_netfilter.c +++ b/security/smack/smack_netfilter.c @@ -58,7 +58,7 @@ static unsigned int smack_ipv4_output(void *priv, return NF_ACCEPT; } -static struct nf_hook_ops smack_nf_ops[] = { +static const struct nf_hook_ops smack_nf_ops[] = { { .hook = 
smack_ipv4_output, .pf = NFPROTO_IPV4, |