summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/device_drivers/stmicro/stmmac.rst7
-rw-r--r--drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c2
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c305
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h1
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c123
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c22
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c79
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h5
-rw-r--r--drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_tc.c10
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/resources.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum.c10
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum.h7
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c243
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c8
-rw-r--r--drivers/net/ethernet/mscc/ocelot_flower.c4
-rw-r--r--drivers/net/ethernet/netronome/nfp/flower/action.c4
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_devlink.c9
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_ethtool.c13
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_lif.c4
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_main.c3
-rw-r--r--drivers/net/ethernet/qlogic/qede/qede_filter.c2
-rw-r--r--drivers/net/ethernet/sfc/ethtool.c6
-rw-r--r--drivers/net/ethernet/sfc/falcon/ethtool.c6
-rw-r--r--drivers/net/ethernet/socionext/netsec.c2
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/common.h13
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c99
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwxlgmac2.h22
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/hwif.c45
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/hwif.h1
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_main.c58
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c4
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c2
-rw-r--r--drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c17
-rw-r--r--drivers/net/ethernet/tehuti/tehuti.c2
-rw-r--r--drivers/net/ethernet/ti/cpsw.c1
-rw-r--r--drivers/net/ethernet/ti/cpsw_new.c1
-rw-r--r--drivers/net/ethernet/ti/davinci_emac.c1
-rw-r--r--drivers/net/ethernet/xilinx/ll_temac_main.c21
-rw-r--r--drivers/net/ethernet/xilinx/xilinx_axienet_main.c22
-rw-r--r--drivers/net/phy/mdio-xpcs.c98
-rw-r--r--drivers/net/phy/mdio_bus.c68
-rw-r--r--drivers/net/phy/mscc/mscc_main.c18
-rw-r--r--drivers/net/phy/phy-core.c31
-rw-r--r--drivers/net/phy/phy.c26
-rw-r--r--drivers/net/phy/phylink.c236
-rw-r--r--drivers/net/phy/realtek.c60
-rw-r--r--drivers/s390/net/qeth_core.h7
-rw-r--r--drivers/s390/net/qeth_core_main.c131
-rw-r--r--drivers/s390/net/qeth_core_sys.c2
-rw-r--r--drivers/s390/net/qeth_ethtool.c43
-rw-r--r--drivers/s390/net/qeth_l2_main.c32
-rw-r--r--drivers/s390/net/qeth_l3_main.c29
-rw-r--r--include/linux/ethtool.h2
-rw-r--r--include/linux/mdio.h4
-rw-r--r--include/linux/netfilter/ipset/ip_set.h2
-rw-r--r--include/linux/netfilter/x_tables.h8
-rw-r--r--include/linux/netfilter_arp/arp_tables.h2
-rw-r--r--include/linux/netfilter_bridge/ebtables.h2
-rw-r--r--include/linux/netfilter_ipv4/ip_tables.h2
-rw-r--r--include/linux/netfilter_ipv6/ip6_tables.h2
-rw-r--r--include/linux/phy.h22
-rw-r--r--include/linux/phylink.h8
-rw-r--r--include/net/flow_offload.h49
-rw-r--r--include/net/netfilter/nf_conntrack_extend.h2
-rw-r--r--include/net/netfilter/nf_conntrack_timeout.h2
-rw-r--r--include/net/netfilter/nf_flow_table.h6
-rw-r--r--include/net/netfilter/nf_tables.h34
-rw-r--r--include/net/netfilter/nf_tables_core.h13
-rw-r--r--include/net/pkt_sched.h10
-rw-r--r--include/uapi/linux/if_bridge.h2
-rw-r--r--include/uapi/linux/mii.h5
-rw-r--r--include/uapi/linux/netfilter/nf_tables.h10
-rw-r--r--include/uapi/linux/netfilter/xt_IDLETIMER.h12
-rw-r--r--include/uapi/linux/netfilter_bridge/ebt_among.h2
-rw-r--r--include/uapi/linux/pkt_sched.h2
-rw-r--r--net/bridge/br_netlink_tunnel.c12
-rw-r--r--net/bridge/br_private.h4
-rw-r--r--net/bridge/br_private_tunnel.h17
-rw-r--r--net/bridge/br_vlan.c3
-rw-r--r--net/bridge/br_vlan_options.c74
-rw-r--r--net/bridge/br_vlan_tunnel.c5
-rw-r--r--net/bridge/netfilter/ebtables.c2
-rw-r--r--net/core/dev.c4
-rw-r--r--net/dsa/slave.c4
-rw-r--r--net/ethtool/common.c11
-rw-r--r--net/ethtool/ioctl.c3
-rw-r--r--net/ipv4/netfilter/arp_tables.c4
-rw-r--r--net/ipv4/netfilter/ip_tables.c4
-rw-r--r--net/ipv4/tcp_bic.c11
-rw-r--r--net/ipv4/tcp_scalable.c17
-rw-r--r--net/ipv4/tcp_veno.c47
-rw-r--r--net/ipv4/tcp_yeah.c41
-rw-r--r--net/ipv6/netfilter/ip6_tables.c4
-rw-r--r--net/mptcp/protocol.c9
-rw-r--r--net/mptcp/subflow.c2
-rw-r--r--net/netfilter/Kconfig8
-rw-r--r--net/netfilter/Makefile13
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c2
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c2
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c2
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h4
-rw-r--r--net/netfilter/nf_conntrack_core.c1
-rw-r--r--net/netfilter/nf_conntrack_standalone.c19
-rw-r--r--net/netfilter/nf_flow_table_offload.c251
-rw-r--r--net/netfilter/nf_tables_api.c113
-rw-r--r--net/netfilter/nf_tables_set_core.c31
-rw-r--r--net/netfilter/nfnetlink_acct.c2
-rw-r--r--net/netfilter/nft_bitwise.c14
-rw-r--r--net/netfilter/nft_dynset.c23
-rw-r--r--net/netfilter/nft_lookup.c1
-rw-r--r--net/netfilter/nft_set_bitmap.c3
-rw-r--r--net/netfilter/nft_set_hash.c9
-rw-r--r--net/netfilter/nft_set_pipapo.c637
-rw-r--r--net/netfilter/nft_set_pipapo.h280
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.c1223
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.h14
-rw-r--r--net/netfilter/nft_set_rbtree.c3
-rw-r--r--net/netfilter/nft_tunnel.c110
-rw-r--r--net/netfilter/xt_IDLETIMER.c248
-rw-r--r--net/netfilter/xt_SECMARK.c2
-rw-r--r--net/netfilter/xt_hashlimit.c2
-rw-r--r--net/netfilter/xt_recent.c4
-rw-r--r--net/sched/cls_api.c8
-rw-r--r--net/sched/sch_api.c21
-rw-r--r--net/sched/sch_fq.c21
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh4
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh130
-rw-r--r--tools/testing/selftests/net/forwarding/lib.sh9
130 files changed, 4581 insertions, 1087 deletions
diff --git a/Documentation/networking/device_drivers/stmicro/stmmac.rst b/Documentation/networking/device_drivers/stmicro/stmmac.rst
index c34bab3d2df0..5d46e5036129 100644
--- a/Documentation/networking/device_drivers/stmicro/stmmac.rst
+++ b/Documentation/networking/device_drivers/stmicro/stmmac.rst
@@ -32,7 +32,8 @@ is also supported.
DesignWare(R) Cores Ethernet MAC 10/100/1000 Universal version 3.70a
(and older) and DesignWare(R) Cores Ethernet Quality-of-Service version 4.0
(and upper) have been used for developing this driver as well as
-DesignWare(R) Cores XGMAC - 10G Ethernet MAC.
+DesignWare(R) Cores XGMAC - 10G Ethernet MAC and DesignWare(R) Cores
+Enterprise MAC - 100G Ethernet MAC.
This driver supports both the platform bus and PCI.
@@ -48,6 +49,8 @@ Cores Ethernet Controllers and corresponding minimum and maximum versions:
+-------------------------------+--------------+--------------+--------------+
| XGMAC - 10G Ethernet MAC | 2.10a | N/A | XGMAC2+ |
+-------------------------------+--------------+--------------+--------------+
+| XLGMAC - 100G Ethernet MAC | 2.00a | N/A | XLGMAC2+ |
++-------------------------------+--------------+--------------+--------------+
For questions related to hardware requirements, refer to the documentation
supplied with your Ethernet adapter. All hardware requirements listed apply
@@ -57,7 +60,7 @@ Feature List
============
The following features are available in this driver:
- - GMII/MII/RGMII/SGMII/RMII/XGMII Interface
+ - GMII/MII/RGMII/SGMII/RMII/XGMII/XLGMII Interface
- Half-Duplex / Full-Duplex Operation
- Energy Efficient Ethernet (EEE)
- IEEE 802.3x PAUSE Packets (Flow Control)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index 523bf4be43cc..b19be7549aad 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -300,7 +300,7 @@ static int bnxt_tc_parse_actions(struct bnxt *bp,
return -EINVAL;
}
- if (!flow_action_basic_hw_stats_types_check(flow_action, extack))
+ if (!flow_action_basic_hw_stats_check(flow_action, extack))
return -EOPNOTSUPP;
flow_action_for_each(i, act, flow_action) {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index 2a2938bbb93a..e8852dfcc1f1 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -438,13 +438,118 @@ int cxgb4_get_filter_counters(struct net_device *dev, unsigned int fidx,
return get_filter_count(adapter, fidx, hitcnt, bytecnt, hash);
}
-int cxgb4_get_free_ftid(struct net_device *dev, int family)
+static bool cxgb4_filter_prio_in_range(struct tid_info *t, u32 idx, u8 nslots,
+ u32 prio)
+{
+ struct filter_entry *prev_tab, *next_tab, *prev_fe, *next_fe;
+ u32 prev_ftid, next_ftid;
+
+ /* Only insert the rule if both of the following conditions
+ * are met:
+ * 1. The immediate previous rule has priority <= @prio.
+ * 2. The immediate next rule has priority >= @prio.
+ */
+
+ /* High Priority (HPFILTER) region always has higher priority
+ * than normal FILTER region. So, all rules in HPFILTER region
+ * must have prio value <= rules in normal FILTER region.
+ */
+ if (idx < t->nhpftids) {
+ /* Don't insert if there's a rule already present at @idx
+ * in HPFILTER region.
+ */
+ if (test_bit(idx, t->hpftid_bmap))
+ return false;
+
+ next_tab = t->hpftid_tab;
+ next_ftid = find_next_bit(t->hpftid_bmap, t->nhpftids, idx);
+ if (next_ftid >= t->nhpftids) {
+ /* No next entry found in HPFILTER region.
+ * See if there's any next entry in normal
+ * FILTER region.
+ */
+ next_ftid = find_first_bit(t->ftid_bmap, t->nftids);
+ if (next_ftid >= t->nftids)
+ next_ftid = idx;
+ else
+ next_tab = t->ftid_tab;
+ }
+
+ /* Search for the closest previous filter entry in HPFILTER
+ * region. No need to search in normal FILTER region because
+ * there can never be any entry in normal FILTER region whose
+ * prio value is < last entry in HPFILTER region.
+ */
+ prev_ftid = find_last_bit(t->hpftid_bmap, idx);
+ if (prev_ftid >= idx)
+ prev_ftid = idx;
+
+ prev_tab = t->hpftid_tab;
+ } else {
+ idx -= t->nhpftids;
+
+ /* Don't insert if there's a rule already present at @idx
+ * in normal FILTER region.
+ */
+ if (test_bit(idx, t->ftid_bmap))
+ return false;
+
+ prev_tab = t->ftid_tab;
+ prev_ftid = find_last_bit(t->ftid_bmap, idx);
+ if (prev_ftid >= idx) {
+ /* No previous entry found in normal FILTER
+ * region. See if there's any previous entry
+ * in HPFILTER region.
+ */
+ prev_ftid = find_last_bit(t->hpftid_bmap, t->nhpftids);
+ if (prev_ftid >= t->nhpftids)
+ prev_ftid = idx;
+ else
+ prev_tab = t->hpftid_tab;
+ }
+
+ /* Search for the closest next filter entry in normal
+ * FILTER region. No need to search in HPFILTER region
+ * because there can never be any entry in HPFILTER
+ * region whose prio value is > first entry in normal
+ * FILTER region.
+ */
+ next_ftid = find_next_bit(t->ftid_bmap, t->nftids, idx);
+ if (next_ftid >= t->nftids)
+ next_ftid = idx;
+
+ next_tab = t->ftid_tab;
+ }
+
+ next_fe = &next_tab[next_ftid];
+
+ /* See if the filter entry belongs to an IPv6 rule, which
+ * occupy 4 slots on T5 and 2 slots on T6. Adjust the
+ * reference to the previously inserted filter entry
+ * accordingly.
+ */
+ prev_fe = &prev_tab[prev_ftid & ~(nslots - 1)];
+ if (!prev_fe->fs.type)
+ prev_fe = &prev_tab[prev_ftid];
+
+ if ((prev_fe->valid && prev_fe->fs.tc_prio > prio) ||
+ (next_fe->valid && next_fe->fs.tc_prio < prio))
+ return false;
+
+ return true;
+}
+
+int cxgb4_get_free_ftid(struct net_device *dev, u8 family, bool hash_en,
+ u32 tc_prio)
{
struct adapter *adap = netdev2adap(dev);
struct tid_info *t = &adap->tids;
+ struct filter_entry *tab, *f;
+ u32 bmap_ftid, max_ftid;
+ unsigned long *bmap;
bool found = false;
- u8 i, n, cnt;
- int ftid;
+ u8 i, cnt, n;
+ int ftid = 0;
/* IPv4 occupy 1 slot. IPv6 occupy 2 slots on T6 and 4 slots
* on T5.
@@ -456,34 +561,129 @@ int cxgb4_get_free_ftid(struct net_device *dev, int family)
n += 2;
}
- if (n > t->nftids)
- return -ENOMEM;
-
- /* Find free filter slots from the end of TCAM. Appropriate
- * checks must be done by caller later to ensure the prio
- * passed by TC doesn't conflict with prio saved by existing
- * rules in the TCAM.
+ /* There are 3 filter regions available in hardware in
+ * following order of priority:
+ *
+ * 1. High Priority (HPFILTER) region (Highest Priority).
+ * 2. HASH region.
+ * 3. Normal FILTER region (Lowest Priority).
+ *
+ * Entries in HPFILTER and normal FILTER region have index
+ * 0 as the highest priority and the rules will be scanned
+ * in ascending order until either a rule hits or end of
+ * the region is reached.
+ *
+ * All HASH region entries have same priority. The set of
+ * fields to match in headers are pre-determined. The same
+ * set of header match fields must be compulsorily specified
+ * in all the rules wanting to get inserted in HASH region.
+ * Hence, HASH region is an exact-match region. A HASH is
+ * generated for a rule based on the values in the
+ * pre-determined set of header match fields. The generated
+ * HASH serves as an index into the HASH region. There can
+ * never be 2 rules having the same HASH. Hardware will
+ * compute a HASH for every incoming packet based on the
+ * values in the pre-determined set of header match fields
+ * and uses it as an index to check if there's a rule
+ * inserted in the HASH region at the specified index. If
+ * there's a rule inserted, then it's considered as a filter
+ * hit. Otherwise, it's a filter miss and normal FILTER region
+ * is scanned afterwards.
*/
+
spin_lock_bh(&t->ftid_lock);
- ftid = t->nftids - 1;
- while (ftid >= n - 1) {
+
+ ftid = (tc_prio <= t->nhpftids) ? 0 : t->nhpftids;
+ max_ftid = t->nftids + t->nhpftids;
+ while (ftid < max_ftid) {
+ if (ftid < t->nhpftids) {
+ /* If the new rule wants to get inserted into
+ * HPFILTER region, but its prio is greater
+ * than the rule with the highest prio in HASH
+ * region, then reject the rule.
+ */
+ if (t->tc_hash_tids_max_prio &&
+ tc_prio > t->tc_hash_tids_max_prio)
+ break;
+
+ /* If there's not enough slots available
+ * in HPFILTER region, then move on to
+ * normal FILTER region immediately.
+ */
+ if (ftid + n > t->nhpftids) {
+ ftid = t->nhpftids;
+ continue;
+ }
+
+ bmap = t->hpftid_bmap;
+ bmap_ftid = ftid;
+ tab = t->hpftid_tab;
+ } else if (hash_en) {
+ /* Ensure priority is >= last rule in HPFILTER
+ * region.
+ */
+ ftid = find_last_bit(t->hpftid_bmap, t->nhpftids);
+ if (ftid < t->nhpftids) {
+ f = &t->hpftid_tab[ftid];
+ if (f->valid && tc_prio < f->fs.tc_prio)
+ break;
+ }
+
+ /* Ensure priority is <= first rule in normal
+ * FILTER region.
+ */
+ ftid = find_first_bit(t->ftid_bmap, t->nftids);
+ if (ftid < t->nftids) {
+ f = &t->ftid_tab[ftid];
+ if (f->valid && tc_prio > f->fs.tc_prio)
+ break;
+ }
+
+ found = true;
+ ftid = t->nhpftids;
+ goto out_unlock;
+ } else {
+ /* If the new rule wants to get inserted into
+ * normal FILTER region, but its prio is less
+ * than the rule with the highest prio in HASH
+ * region, then reject the rule.
+ */
+ if (t->tc_hash_tids_max_prio &&
+ tc_prio < t->tc_hash_tids_max_prio)
+ break;
+
+ if (ftid + n > max_ftid)
+ break;
+
+ bmap = t->ftid_bmap;
+ bmap_ftid = ftid - t->nhpftids;
+ tab = t->ftid_tab;
+ }
+
cnt = 0;
for (i = 0; i < n; i++) {
- if (test_bit(ftid - i, t->ftid_bmap))
+ if (test_bit(bmap_ftid + i, bmap))
break;
cnt++;
}
+
if (cnt == n) {
- ftid &= ~(n - 1);
- found = true;
- break;
+ /* Ensure the new rule's prio doesn't conflict
+ * with existing rules.
+ */
+ if (cxgb4_filter_prio_in_range(t, ftid, n,
+ tc_prio)) {
+ ftid &= ~(n - 1);
+ found = true;
+ break;
+ }
}
- ftid -= n;
+ ftid += n;
}
- spin_unlock_bh(&t->ftid_lock);
- ftid += t->nhpftids;
+out_unlock:
+ spin_unlock_bh(&t->ftid_lock);
return found ? ftid : -ENOMEM;
}
@@ -555,73 +755,6 @@ static void cxgb4_clear_hpftid(struct tid_info *t, int fidx, int family)
spin_unlock_bh(&t->ftid_lock);
}
-bool cxgb4_filter_prio_in_range(struct net_device *dev, u32 idx, u32 prio)
-{
- struct filter_entry *prev_fe, *next_fe, *tab;
- struct adapter *adap = netdev2adap(dev);
- u32 prev_ftid, next_ftid, max_tid;
- struct tid_info *t = &adap->tids;
- unsigned long *bmap;
- bool valid = true;
-
- if (idx < t->nhpftids) {
- bmap = t->hpftid_bmap;
- tab = t->hpftid_tab;
- max_tid = t->nhpftids;
- } else {
- idx -= t->nhpftids;
- bmap = t->ftid_bmap;
- tab = t->ftid_tab;
- max_tid = t->nftids;
- }
-
- /* Only insert the rule if both of the following conditions
- * are met:
- * 1. The immediate previous rule has priority <= @prio.
- * 2. The immediate next rule has priority >= @prio.
- */
- spin_lock_bh(&t->ftid_lock);
-
- /* Don't insert if there's a rule already present at @idx. */
- if (test_bit(idx, bmap)) {
- valid = false;
- goto out_unlock;
- }
-
- next_ftid = find_next_bit(bmap, max_tid, idx);
- if (next_ftid >= max_tid)
- next_ftid = idx;
-
- next_fe = &tab[next_ftid];
-
- prev_ftid = find_last_bit(bmap, idx);
- if (prev_ftid >= idx)
- prev_ftid = idx;
-
- /* See if the filter entry belongs to an IPv6 rule, which
- * occupy 4 slots on T5 and 2 slots on T6. Adjust the
- * reference to the previously inserted filter entry
- * accordingly.
- */
- if (CHELSIO_CHIP_VERSION(adap->params.chip) < CHELSIO_T6) {
- prev_fe = &tab[prev_ftid & ~0x3];
- if (!prev_fe->fs.type)
- prev_fe = &tab[prev_ftid];
- } else {
- prev_fe = &tab[prev_ftid & ~0x1];
- if (!prev_fe->fs.type)
- prev_fe = &tab[prev_ftid];
- }
-
- if ((prev_fe->valid && prio < prev_fe->fs.tc_prio) ||
- (next_fe->valid && prio > next_fe->fs.tc_prio))
- valid = false;
-
-out_unlock:
- spin_unlock_bh(&t->ftid_lock);
- return valid;
-}
-
/* Delete the filter at a specified index. */
static int del_filter_wr(struct adapter *adapter, int fidx)
{
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h
index b3e4a645043d..b0751c0611ec 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h
@@ -53,5 +53,4 @@ void clear_all_filters(struct adapter *adapter);
void init_hash_filter(struct adapter *adap);
bool is_filter_exact_match(struct adapter *adap,
struct ch_filter_specification *fs);
-bool cxgb4_filter_prio_in_range(struct net_device *dev, u32 idx, u32 prio);
#endif /* __CXGB4_FILTER_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index cc46277e98de..aec9b90313e7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -553,7 +553,7 @@ int cxgb4_validate_flow_actions(struct net_device *dev,
bool act_vlan = false;
int i;
- if (!flow_action_basic_hw_stats_types_check(actions, extack))
+ if (!flow_action_basic_hw_stats_check(actions, extack))
return -EOPNOTSUPP;
flow_action_for_each(i, act, actions) {
@@ -635,6 +635,64 @@ int cxgb4_validate_flow_actions(struct net_device *dev,
return 0;
}
+static void cxgb4_tc_flower_hash_prio_add(struct adapter *adap, u32 tc_prio)
+{
+ spin_lock_bh(&adap->tids.ftid_lock);
+ if (adap->tids.tc_hash_tids_max_prio < tc_prio)
+ adap->tids.tc_hash_tids_max_prio = tc_prio;
+ spin_unlock_bh(&adap->tids.ftid_lock);
+}
+
+static void cxgb4_tc_flower_hash_prio_del(struct adapter *adap, u32 tc_prio)
+{
+ struct tid_info *t = &adap->tids;
+ struct ch_tc_flower_entry *fe;
+ struct rhashtable_iter iter;
+ u32 found = 0;
+
+ spin_lock_bh(&t->ftid_lock);
+ /* Bail if the current rule is not the one with the max
+ * prio.
+ */
+ if (t->tc_hash_tids_max_prio != tc_prio)
+ goto out_unlock;
+
+ /* Search for the next rule having the same or next lower
+ * max prio.
+ */
+ rhashtable_walk_enter(&adap->flower_tbl, &iter);
+ do {
+ rhashtable_walk_start(&iter);
+
+ fe = rhashtable_walk_next(&iter);
+ while (!IS_ERR_OR_NULL(fe)) {
+ if (fe->fs.hash &&
+ fe->fs.tc_prio <= t->tc_hash_tids_max_prio) {
+ t->tc_hash_tids_max_prio = fe->fs.tc_prio;
+ found++;
+
+ /* Bail if we found another rule
+ * having the same prio as the
+ * current max one.
+ */
+ if (fe->fs.tc_prio == tc_prio)
+ break;
+ }
+
+ fe = rhashtable_walk_next(&iter);
+ }
+
+ rhashtable_walk_stop(&iter);
+ } while (fe == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&iter);
+
+ if (!found)
+ t->tc_hash_tids_max_prio = 0;
+
+out_unlock:
+ spin_unlock_bh(&t->ftid_lock);
+}
+
int cxgb4_tc_flower_replace(struct net_device *dev,
struct flow_cls_offload *cls)
{
@@ -644,6 +702,7 @@ int cxgb4_tc_flower_replace(struct net_device *dev,
struct ch_tc_flower_entry *ch_flower;
struct ch_filter_specification *fs;
struct filter_ctx ctx;
+ u8 inet_family;
int fidx, ret;
if (cxgb4_validate_flow_actions(dev, &rule->action, extack))
@@ -664,39 +723,32 @@ int cxgb4_tc_flower_replace(struct net_device *dev,
cxgb4_process_flow_actions(dev, &rule->action, fs);
fs->hash = is_filter_exact_match(adap, fs);
- if (fs->hash) {
- fidx = 0;
- } else {
- u8 inet_family;
+ inet_family = fs->type ? PF_INET6 : PF_INET;
- inet_family = fs->type ? PF_INET6 : PF_INET;
-
- /* Note that TC uses prio 0 to indicate stack to
- * generate automatic prio and hence doesn't pass prio
- * 0 to driver. However, the hardware TCAM index
- * starts from 0. Hence, the -1 here.
- */
- if (cls->common.prio <= (adap->tids.nftids +
- adap->tids.nhpftids)) {
- fidx = cls->common.prio - 1;
- if (fidx < adap->tids.nhpftids)
- fs->prio = 1;
- } else {
- fidx = cxgb4_get_free_ftid(dev, inet_family);
- }
+ /* Get a free filter entry TID, where we can insert this new
+ * rule. Only insert rule if its prio doesn't conflict with
+ * existing rules.
+ */
+ fidx = cxgb4_get_free_ftid(dev, inet_family, fs->hash,
+ cls->common.prio);
+ if (fidx < 0) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "No free LETCAM index available");
+ ret = -ENOMEM;
+ goto free_entry;
+ }
- /* Only insert FLOWER rule if its priority doesn't
- * conflict with existing rules in the LETCAM.
- */
- if (fidx < 0 ||
- !cxgb4_filter_prio_in_range(dev, fidx, cls->common.prio)) {
- NL_SET_ERR_MSG_MOD(extack,
- "No free LETCAM index available");
- ret = -ENOMEM;
- goto free_entry;
- }
+ if (fidx < adap->tids.nhpftids) {
+ fs->prio = 1;
+ fs->hash = 0;
}
+ /* If the rule can be inserted into HASH region, then ignore
+ * the index to normal FILTER region.
+ */
+ if (fs->hash)
+ fidx = 0;
+
fs->tc_prio = cls->common.prio;
fs->tc_cookie = cls->cookie;
@@ -727,6 +779,9 @@ int cxgb4_tc_flower_replace(struct net_device *dev,
if (ret)
goto del_filter;
+ if (fs->hash)
+ cxgb4_tc_flower_hash_prio_add(adap, cls->common.prio);
+
return 0;
del_filter:
@@ -742,12 +797,17 @@ int cxgb4_tc_flower_destroy(struct net_device *dev,
{
struct adapter *adap = netdev2adap(dev);
struct ch_tc_flower_entry *ch_flower;
+ u32 tc_prio;
+ bool hash;
int ret;
ch_flower = ch_flower_lookup(adap, cls->cookie);
if (!ch_flower)
return -ENOENT;
+ hash = ch_flower->fs.hash;
+ tc_prio = ch_flower->fs.tc_prio;
+
ret = cxgb4_del_filter(dev, ch_flower->filter_id, &ch_flower->fs);
if (ret)
goto err;
@@ -760,6 +820,9 @@ int cxgb4_tc_flower_destroy(struct net_device *dev,
}
kfree_rcu(ch_flower, rcu);
+ if (hash)
+ cxgb4_tc_flower_hash_prio_del(adap, tc_prio);
+
err:
return ret;
}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c
index d80dee4d316d..8a5ae8bc9b7d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c
@@ -198,22 +198,14 @@ static int cxgb4_matchall_alloc_filter(struct net_device *dev,
struct ch_filter_specification *fs;
int ret, fidx;
- /* Note that TC uses prio 0 to indicate stack to generate
- * automatic prio and hence doesn't pass prio 0 to driver.
- * However, the hardware TCAM index starts from 0. Hence, the
- * -1 here. 1 slot is enough to create a wildcard matchall
- * VIID rule.
+ /* Get a free filter entry TID, where we can insert this new
+ * rule. Only insert rule if its prio doesn't conflict with
+ * existing rules.
+ *
+ * 1 slot is enough to create a wildcard matchall VIID rule.
*/
- if (cls->common.prio <= (adap->tids.nftids + adap->tids.nhpftids))
- fidx = cls->common.prio - 1;
- else
- fidx = cxgb4_get_free_ftid(dev, PF_INET);
-
- /* Only insert MATCHALL rule if its priority doesn't conflict
- * with existing rules in the LETCAM.
- */
- if (fidx < 0 ||
- !cxgb4_filter_prio_in_range(dev, fidx, cls->common.prio)) {
+ fidx = cxgb4_get_free_ftid(dev, PF_INET, false, cls->common.prio);
+ if (fidx < 0) {
NL_SET_ERR_MSG_MOD(extack,
"No free LETCAM index available");
return -ENOMEM;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
index 269b8d9e25e0..3f3c11e54d97 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
@@ -155,9 +155,10 @@ int cxgb4_config_knode(struct net_device *dev, struct tc_cls_u32_offload *cls)
struct ch_filter_specification fs;
struct cxgb4_tc_u32_table *t;
struct cxgb4_link *link;
- unsigned int filter_id;
u32 uhtid, link_uhtid;
bool is_ipv6 = false;
+ u8 inet_family;
+ int filter_id;
int ret;
if (!can_tc_u32_offload(dev))
@@ -166,18 +167,15 @@ int cxgb4_config_knode(struct net_device *dev, struct tc_cls_u32_offload *cls)
if (protocol != htons(ETH_P_IP) && protocol != htons(ETH_P_IPV6))
return -EOPNOTSUPP;
- /* Note that TC uses prio 0 to indicate stack to generate
- * automatic prio and hence doesn't pass prio 0 to driver.
- * However, the hardware TCAM index starts from 0. Hence, the
- * -1 here.
- */
- filter_id = TC_U32_NODE(cls->knode.handle) - 1;
+ inet_family = (protocol == htons(ETH_P_IPV6)) ? PF_INET6 : PF_INET;
- /* Only insert U32 rule if its priority doesn't conflict with
- * existing rules in the LETCAM.
+ /* Get a free filter entry TID, where we can insert this new
+ * rule. Only insert rule if its prio doesn't conflict with
+ * existing rules.
*/
- if (filter_id >= adapter->tids.nftids + adapter->tids.nhpftids ||
- !cxgb4_filter_prio_in_range(dev, filter_id, cls->common.prio)) {
+ filter_id = cxgb4_get_free_ftid(dev, inet_family, false,
+ TC_U32_NODE(cls->knode.handle));
+ if (filter_id < 0) {
NL_SET_ERR_MSG_MOD(extack,
"No free LETCAM index available");
return -ENOMEM;
@@ -358,23 +356,65 @@ int cxgb4_delete_knode(struct net_device *dev, struct tc_cls_u32_offload *cls)
struct cxgb4_link *link = NULL;
struct cxgb4_tc_u32_table *t;
struct filter_entry *f;
+ bool found = false;
u32 handle, uhtid;
+ u8 nslots;
int ret;
if (!can_tc_u32_offload(dev))
return -EOPNOTSUPP;
/* Fetch the location to delete the filter. */
- filter_id = TC_U32_NODE(cls->knode.handle) - 1;
- if (filter_id >= adapter->tids.nftids + adapter->tids.nhpftids)
- return -ERANGE;
+ max_tids = adapter->tids.nhpftids + adapter->tids.nftids;
+
+ spin_lock_bh(&adapter->tids.ftid_lock);
+ filter_id = 0;
+ while (filter_id < max_tids) {
+ if (filter_id < adapter->tids.nhpftids) {
+ i = filter_id;
+ f = &adapter->tids.hpftid_tab[i];
+ if (f->valid && f->fs.tc_cookie == cls->knode.handle) {
+ found = true;
+ break;
+ }
- if (filter_id < adapter->tids.nhpftids)
- f = &adapter->tids.hpftid_tab[filter_id];
- else
- f = &adapter->tids.ftid_tab[filter_id - adapter->tids.nhpftids];
+ i = find_next_bit(adapter->tids.hpftid_bmap,
+ adapter->tids.nhpftids, i + 1);
+ if (i >= adapter->tids.nhpftids) {
+ filter_id = adapter->tids.nhpftids;
+ continue;
+ }
+
+ filter_id = i;
+ } else {
+ i = filter_id - adapter->tids.nhpftids;
+ f = &adapter->tids.ftid_tab[i];
+ if (f->valid && f->fs.tc_cookie == cls->knode.handle) {
+ found = true;
+ break;
+ }
+
+ i = find_next_bit(adapter->tids.ftid_bmap,
+ adapter->tids.nftids, i + 1);
+ if (i >= adapter->tids.nftids)
+ break;
+
+ filter_id = i + adapter->tids.nhpftids;
+ }
+
+ nslots = 0;
+ if (f->fs.type) {
+ nslots++;
+ if (CHELSIO_CHIP_VERSION(adapter->params.chip) <
+ CHELSIO_T6)
+ nslots += 2;
+ }
+
+ filter_id += nslots;
+ }
+ spin_unlock_bh(&adapter->tids.ftid_lock);
- if (cls->knode.handle != f->fs.tc_cookie)
+ if (!found)
return -ERANGE;
t = adapter->tc_u32;
@@ -407,7 +447,6 @@ int cxgb4_delete_knode(struct net_device *dev, struct tc_cls_u32_offload *cls)
/* If a link is being deleted, then delete all filters
* associated with the link.
*/
- max_tids = adapter->tids.nftids;
for (i = 0; i < t->size; i++) {
link = &t->table[i];
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
index 03b9bdc812cc..be831317520a 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -149,6 +149,8 @@ struct tid_info {
atomic_t conns_in_use;
/* lock for setting/clearing filter bitmap */
spinlock_t ftid_lock;
+
+ unsigned int tc_hash_tids_max_prio;
};
static inline void *lookup_tid(const struct tid_info *t, unsigned int tid)
@@ -263,7 +265,8 @@ struct filter_ctx {
struct ch_filter_specification;
-int cxgb4_get_free_ftid(struct net_device *dev, int family);
+int cxgb4_get_free_ftid(struct net_device *dev, u8 family, bool hash_en,
+ u32 tc_prio);
int __cxgb4_set_filter(struct net_device *dev, int filter_id,
struct ch_filter_specification *fs,
struct filter_ctx *ctx);
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c
index 0a0c6ec2336c..8972cdd559e8 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c
@@ -1082,7 +1082,7 @@ static int mvpp2_port_c2_tcam_rule_add(struct mvpp2_port *port,
u8 qh, ql, pmap;
int index, ctx;
- if (!flow_action_basic_hw_stats_types_check(&rule->flow->action, NULL))
+ if (!flow_action_basic_hw_stats_check(&rule->flow->action, NULL))
return -EOPNOTSUPP;
memset(&c2, 0, sizeof(c2));
@@ -1308,7 +1308,7 @@ static int mvpp2_cls_rfs_parse_rule(struct mvpp2_rfs_rule *rule)
struct flow_rule *flow = rule->flow;
struct flow_action_entry *act;
- if (!flow_action_basic_hw_stats_types_check(&rule->flow->action, NULL))
+ if (!flow_action_basic_hw_stats_check(&rule->flow->action, NULL))
return -EOPNOTSUPP;
act = &flow->action.entries[0];
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index ebf60ff30295..901f88a886c8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -3167,8 +3167,8 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv,
if (!flow_action_has_entries(flow_action))
return -EINVAL;
- if (!flow_action_hw_stats_types_check(flow_action, extack,
- FLOW_ACTION_HW_STATS_TYPE_DELAYED_BIT))
+ if (!flow_action_hw_stats_check(flow_action, extack,
+ FLOW_ACTION_HW_STATS_DELAYED_BIT))
return -EOPNOTSUPP;
attr->flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
@@ -3702,8 +3702,8 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv,
if (!flow_action_has_entries(flow_action))
return -EINVAL;
- if (!flow_action_hw_stats_types_check(flow_action, extack,
- FLOW_ACTION_HW_STATS_TYPE_DELAYED_BIT))
+ if (!flow_action_hw_stats_check(flow_action, extack,
+ FLOW_ACTION_HW_STATS_DELAYED_BIT))
return -EOPNOTSUPP;
flow_action_for_each(i, act, flow_action) {
@@ -4524,7 +4524,7 @@ static int scan_tc_matchall_fdb_actions(struct mlx5e_priv *priv,
return -EOPNOTSUPP;
}
- if (!flow_action_basic_hw_stats_types_check(flow_action, extack))
+ if (!flow_action_basic_hw_stats_check(flow_action, extack))
return -EOPNOTSUPP;
flow_action_for_each(i, act, flow_action) {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/resources.h b/drivers/net/ethernet/mellanox/mlxsw/resources.h
index 6534184cb942..d62496ef299c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/resources.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/resources.h
@@ -18,6 +18,7 @@ enum mlxsw_res_id {
MLXSW_RES_ID_CQE_V1,
MLXSW_RES_ID_CQE_V2,
MLXSW_RES_ID_COUNTER_POOL_SIZE,
+ MLXSW_RES_ID_COUNTER_BANK_SIZE,
MLXSW_RES_ID_MAX_SPAN,
MLXSW_RES_ID_COUNTER_SIZE_PACKETS_BYTES,
MLXSW_RES_ID_COUNTER_SIZE_ROUTER_BASIC,
@@ -75,6 +76,7 @@ static u16 mlxsw_res_ids[] = {
[MLXSW_RES_ID_CQE_V1] = 0x2211,
[MLXSW_RES_ID_CQE_V2] = 0x2212,
[MLXSW_RES_ID_COUNTER_POOL_SIZE] = 0x2410,
+ [MLXSW_RES_ID_COUNTER_BANK_SIZE] = 0x2411,
[MLXSW_RES_ID_MAX_SPAN] = 0x2420,
[MLXSW_RES_ID_COUNTER_SIZE_PACKETS_BYTES] = 0x2443,
[MLXSW_RES_ID_COUNTER_SIZE_ROUTER_BASIC] = 0x2449,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 51709012593e..35d3a68ef4fd 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -5421,8 +5421,13 @@ static int mlxsw_sp1_resources_register(struct mlxsw_core *mlxsw_core)
if (err)
goto err_resources_span_register;
+ err = mlxsw_sp_counter_resources_register(mlxsw_core);
+ if (err)
+ goto err_resources_counter_register;
+
return 0;
+err_resources_counter_register:
err_resources_span_register:
devlink_resources_unregister(priv_to_devlink(mlxsw_core), NULL);
return err;
@@ -5440,8 +5445,13 @@ static int mlxsw_sp2_resources_register(struct mlxsw_core *mlxsw_core)
if (err)
goto err_resources_span_register;
+ err = mlxsw_sp_counter_resources_register(mlxsw_core);
+ if (err)
+ goto err_resources_counter_register;
+
return 0;
+err_resources_counter_register:
err_resources_span_register:
devlink_resources_unregister(priv_to_devlink(mlxsw_core), NULL);
return err;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 81801c6fb941..57d8c95e4f9f 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -46,6 +46,10 @@
#define MLXSW_SP_RESOURCE_NAME_SPAN "span_agents"
+#define MLXSW_SP_RESOURCE_NAME_COUNTERS "counters"
+#define MLXSW_SP_RESOURCE_NAME_COUNTERS_FLOW "flow"
+#define MLXSW_SP_RESOURCE_NAME_COUNTERS_RIF "rif"
+
enum mlxsw_sp_resource_id {
MLXSW_SP_RESOURCE_KVD = 1,
MLXSW_SP_RESOURCE_KVD_LINEAR,
@@ -55,6 +59,9 @@ enum mlxsw_sp_resource_id {
MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS,
MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS,
MLXSW_SP_RESOURCE_SPAN,
+ MLXSW_SP_RESOURCE_COUNTERS,
+ MLXSW_SP_RESOURCE_COUNTERS_FLOW,
+ MLXSW_SP_RESOURCE_COUNTERS_RIF,
};
struct mlxsw_sp_port;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c
index 6a02ef9ec00e..0268f0a6662a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c
@@ -7,91 +7,143 @@
#include "spectrum_cnt.h"
-#define MLXSW_SP_COUNTER_POOL_BANK_SIZE 4096
-
struct mlxsw_sp_counter_sub_pool {
+ u64 size;
unsigned int base_index;
- unsigned int size;
+ enum mlxsw_res_id entry_size_res_id;
+ const char *resource_name; /* devlink resource name */
+ u64 resource_id; /* devlink resource id */
unsigned int entry_size;
unsigned int bank_count;
+ atomic_t active_entries_count;
};
struct mlxsw_sp_counter_pool {
- unsigned int pool_size;
+ u64 pool_size;
unsigned long *usage; /* Usage bitmap */
spinlock_t counter_pool_lock; /* Protects counter pool allocations */
- struct mlxsw_sp_counter_sub_pool *sub_pools;
+ atomic_t active_entries_count;
+ unsigned int sub_pools_count;
+ struct mlxsw_sp_counter_sub_pool sub_pools[];
};
-static struct mlxsw_sp_counter_sub_pool mlxsw_sp_counter_sub_pools[] = {
+static const struct mlxsw_sp_counter_sub_pool mlxsw_sp_counter_sub_pools[] = {
[MLXSW_SP_COUNTER_SUB_POOL_FLOW] = {
+ .entry_size_res_id = MLXSW_RES_ID_COUNTER_SIZE_PACKETS_BYTES,
+ .resource_name = MLXSW_SP_RESOURCE_NAME_COUNTERS_FLOW,
+ .resource_id = MLXSW_SP_RESOURCE_COUNTERS_FLOW,
.bank_count = 6,
},
[MLXSW_SP_COUNTER_SUB_POOL_RIF] = {
+ .entry_size_res_id = MLXSW_RES_ID_COUNTER_SIZE_ROUTER_BASIC,
+ .resource_name = MLXSW_SP_RESOURCE_NAME_COUNTERS_RIF,
+ .resource_id = MLXSW_SP_RESOURCE_COUNTERS_RIF,
.bank_count = 2,
}
};
-static int mlxsw_sp_counter_pool_validate(struct mlxsw_sp *mlxsw_sp)
+static u64 mlxsw_sp_counter_sub_pool_occ_get(void *priv)
+{
+ const struct mlxsw_sp_counter_sub_pool *sub_pool = priv;
+
+ return atomic_read(&sub_pool->active_entries_count);
+}
+
+static int mlxsw_sp_counter_sub_pools_init(struct mlxsw_sp *mlxsw_sp)
{
- unsigned int total_bank_config = 0;
- unsigned int pool_size;
+ struct mlxsw_sp_counter_pool *pool = mlxsw_sp->counter_pool;
+ struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+ struct mlxsw_sp_counter_sub_pool *sub_pool;
+ unsigned int base_index = 0;
+ enum mlxsw_res_id res_id;
+ int err;
int i;
- pool_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, COUNTER_POOL_SIZE);
- /* Check config is valid, no bank over subscription */
- for (i = 0; i < ARRAY_SIZE(mlxsw_sp_counter_sub_pools); i++)
- total_bank_config += mlxsw_sp_counter_sub_pools[i].bank_count;
- if (total_bank_config > pool_size / MLXSW_SP_COUNTER_POOL_BANK_SIZE + 1)
- return -EINVAL;
+ for (i = 0; i < pool->sub_pools_count; i++) {
+ sub_pool = &pool->sub_pools[i];
+ res_id = sub_pool->entry_size_res_id;
+
+ if (!mlxsw_core_res_valid(mlxsw_sp->core, res_id))
+ return -EIO;
+ sub_pool->entry_size = mlxsw_core_res_get(mlxsw_sp->core,
+ res_id);
+ err = devlink_resource_size_get(devlink,
+ sub_pool->resource_id,
+ &sub_pool->size);
+ if (err)
+ goto err_resource_size_get;
+
+ devlink_resource_occ_get_register(devlink,
+ sub_pool->resource_id,
+ mlxsw_sp_counter_sub_pool_occ_get,
+ sub_pool);
+
+ sub_pool->base_index = base_index;
+ base_index += sub_pool->size;
+ atomic_set(&sub_pool->active_entries_count, 0);
+ }
return 0;
+
+err_resource_size_get:
+ for (i--; i >= 0; i--) {
+ sub_pool = &pool->sub_pools[i];
+
+ devlink_resource_occ_get_unregister(devlink,
+ sub_pool->resource_id);
+ }
+ return err;
}
-static int mlxsw_sp_counter_sub_pools_prepare(struct mlxsw_sp *mlxsw_sp)
+static void mlxsw_sp_counter_sub_pools_fini(struct mlxsw_sp *mlxsw_sp)
{
+ struct mlxsw_sp_counter_pool *pool = mlxsw_sp->counter_pool;
+ struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
struct mlxsw_sp_counter_sub_pool *sub_pool;
+ int i;
- /* Prepare generic flow pool*/
- sub_pool = &mlxsw_sp_counter_sub_pools[MLXSW_SP_COUNTER_SUB_POOL_FLOW];
- if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, COUNTER_SIZE_PACKETS_BYTES))
- return -EIO;
- sub_pool->entry_size = MLXSW_CORE_RES_GET(mlxsw_sp->core,
- COUNTER_SIZE_PACKETS_BYTES);
- /* Prepare erif pool*/
- sub_pool = &mlxsw_sp_counter_sub_pools[MLXSW_SP_COUNTER_SUB_POOL_RIF];
- if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, COUNTER_SIZE_ROUTER_BASIC))
- return -EIO;
- sub_pool->entry_size = MLXSW_CORE_RES_GET(mlxsw_sp->core,
- COUNTER_SIZE_ROUTER_BASIC);
- return 0;
+ for (i = 0; i < pool->sub_pools_count; i++) {
+ sub_pool = &pool->sub_pools[i];
+
+ WARN_ON(atomic_read(&sub_pool->active_entries_count));
+ devlink_resource_occ_get_unregister(devlink,
+ sub_pool->resource_id);
+ }
+}
+
+static u64 mlxsw_sp_counter_pool_occ_get(void *priv)
+{
+ const struct mlxsw_sp_counter_pool *pool = priv;
+
+ return atomic_read(&pool->active_entries_count);
}
int mlxsw_sp_counter_pool_init(struct mlxsw_sp *mlxsw_sp)
{
+ unsigned int sub_pools_count = ARRAY_SIZE(mlxsw_sp_counter_sub_pools);
+ struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
struct mlxsw_sp_counter_sub_pool *sub_pool;
struct mlxsw_sp_counter_pool *pool;
- unsigned int base_index;
unsigned int map_size;
- int i;
int err;
- if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, COUNTER_POOL_SIZE))
- return -EIO;
-
- err = mlxsw_sp_counter_pool_validate(mlxsw_sp);
- if (err)
- return err;
-
- err = mlxsw_sp_counter_sub_pools_prepare(mlxsw_sp);
- if (err)
- return err;
-
- pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ pool = kzalloc(struct_size(pool, sub_pools, sub_pools_count),
+ GFP_KERNEL);
if (!pool)
return -ENOMEM;
+ mlxsw_sp->counter_pool = pool;
+ memcpy(pool->sub_pools, mlxsw_sp_counter_sub_pools,
+ sub_pools_count * sizeof(*sub_pool));
+ pool->sub_pools_count = sub_pools_count;
spin_lock_init(&pool->counter_pool_lock);
+ atomic_set(&pool->active_entries_count, 0);
+
+ err = devlink_resource_size_get(devlink, MLXSW_SP_RESOURCE_COUNTERS,
+ &pool->pool_size);
+ if (err)
+ goto err_pool_resource_size_get;
+ devlink_resource_occ_get_register(devlink, MLXSW_SP_RESOURCE_COUNTERS,
+ mlxsw_sp_counter_pool_occ_get, pool);
- pool->pool_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, COUNTER_POOL_SIZE);
map_size = BITS_TO_LONGS(pool->pool_size) * sizeof(unsigned long);
pool->usage = kzalloc(map_size, GFP_KERNEL);
@@ -100,26 +152,18 @@ int mlxsw_sp_counter_pool_init(struct mlxsw_sp *mlxsw_sp)
goto err_usage_alloc;
}
- pool->sub_pools = mlxsw_sp_counter_sub_pools;
- /* Allocation is based on bank count which should be
- * specified for each sub pool statically.
- */
- base_index = 0;
- for (i = 0; i < ARRAY_SIZE(mlxsw_sp_counter_sub_pools); i++) {
- sub_pool = &pool->sub_pools[i];
- sub_pool->size = sub_pool->bank_count *
- MLXSW_SP_COUNTER_POOL_BANK_SIZE;
- sub_pool->base_index = base_index;
- base_index += sub_pool->size;
- /* The last bank can't be fully used */
- if (sub_pool->base_index + sub_pool->size > pool->pool_size)
- sub_pool->size = pool->pool_size - sub_pool->base_index;
- }
+ err = mlxsw_sp_counter_sub_pools_init(mlxsw_sp);
+ if (err)
+ goto err_sub_pools_init;
- mlxsw_sp->counter_pool = pool;
return 0;
+err_sub_pools_init:
+ kfree(pool->usage);
err_usage_alloc:
+ devlink_resource_occ_get_unregister(devlink,
+ MLXSW_SP_RESOURCE_COUNTERS);
+err_pool_resource_size_get:
kfree(pool);
return err;
}
@@ -127,10 +171,15 @@ err_usage_alloc:
void mlxsw_sp_counter_pool_fini(struct mlxsw_sp *mlxsw_sp)
{
struct mlxsw_sp_counter_pool *pool = mlxsw_sp->counter_pool;
+ struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+ mlxsw_sp_counter_sub_pools_fini(mlxsw_sp);
WARN_ON(find_first_bit(pool->usage, pool->pool_size) !=
pool->pool_size);
+ WARN_ON(atomic_read(&pool->active_entries_count));
kfree(pool->usage);
+ devlink_resource_occ_get_unregister(devlink,
+ MLXSW_SP_RESOURCE_COUNTERS);
kfree(pool);
}
@@ -144,7 +193,7 @@ int mlxsw_sp_counter_alloc(struct mlxsw_sp *mlxsw_sp,
unsigned int stop_index;
int i, err;
- sub_pool = &mlxsw_sp_counter_sub_pools[sub_pool_id];
+ sub_pool = &pool->sub_pools[sub_pool_id];
stop_index = sub_pool->base_index + sub_pool->size;
entry_index = sub_pool->base_index;
@@ -166,6 +215,8 @@ int mlxsw_sp_counter_alloc(struct mlxsw_sp *mlxsw_sp,
spin_unlock(&pool->counter_pool_lock);
*p_counter_index = entry_index;
+ atomic_add(sub_pool->entry_size, &sub_pool->active_entries_count);
+ atomic_add(sub_pool->entry_size, &pool->active_entries_count);
return 0;
err_alloc:
@@ -183,9 +234,77 @@ void mlxsw_sp_counter_free(struct mlxsw_sp *mlxsw_sp,
if (WARN_ON(counter_index >= pool->pool_size))
return;
- sub_pool = &mlxsw_sp_counter_sub_pools[sub_pool_id];
+ sub_pool = &pool->sub_pools[sub_pool_id];
spin_lock(&pool->counter_pool_lock);
for (i = 0; i < sub_pool->entry_size; i++)
__clear_bit(counter_index + i, pool->usage);
spin_unlock(&pool->counter_pool_lock);
+ atomic_sub(sub_pool->entry_size, &sub_pool->active_entries_count);
+ atomic_sub(sub_pool->entry_size, &pool->active_entries_count);
+}
+
+int mlxsw_sp_counter_resources_register(struct mlxsw_core *mlxsw_core)
+{
+ static struct devlink_resource_size_params size_params;
+ struct devlink *devlink = priv_to_devlink(mlxsw_core);
+ const struct mlxsw_sp_counter_sub_pool *sub_pool;
+ unsigned int total_bank_config;
+ u64 sub_pool_size;
+ u64 base_index;
+ u64 pool_size;
+ u64 bank_size;
+ int err;
+ int i;
+
+ if (!MLXSW_CORE_RES_VALID(mlxsw_core, COUNTER_POOL_SIZE) ||
+ !MLXSW_CORE_RES_VALID(mlxsw_core, COUNTER_BANK_SIZE))
+ return -EIO;
+
+ pool_size = MLXSW_CORE_RES_GET(mlxsw_core, COUNTER_POOL_SIZE);
+ bank_size = MLXSW_CORE_RES_GET(mlxsw_core, COUNTER_BANK_SIZE);
+
+ devlink_resource_size_params_init(&size_params, pool_size,
+ pool_size, bank_size,
+ DEVLINK_RESOURCE_UNIT_ENTRY);
+ err = devlink_resource_register(devlink,
+ MLXSW_SP_RESOURCE_NAME_COUNTERS,
+ pool_size,
+ MLXSW_SP_RESOURCE_COUNTERS,
+ DEVLINK_RESOURCE_ID_PARENT_TOP,
+ &size_params);
+ if (err)
+ return err;
+
+ /* Allocation is based on bank count which should be
+ * specified for each sub pool statically.
+ */
+ total_bank_config = 0;
+ base_index = 0;
+ for (i = 0; i < ARRAY_SIZE(mlxsw_sp_counter_sub_pools); i++) {
+ sub_pool = &mlxsw_sp_counter_sub_pools[i];
+ sub_pool_size = sub_pool->bank_count * bank_size;
+ /* The last bank can't be fully used */
+ if (base_index + sub_pool_size > pool_size)
+ sub_pool_size = pool_size - base_index;
+ base_index += sub_pool_size;
+
+ devlink_resource_size_params_init(&size_params, sub_pool_size,
+ sub_pool_size, bank_size,
+ DEVLINK_RESOURCE_UNIT_ENTRY);
+ err = devlink_resource_register(devlink,
+ sub_pool->resource_name,
+ sub_pool_size,
+ sub_pool->resource_id,
+ MLXSW_SP_RESOURCE_COUNTERS,
+ &size_params);
+ if (err)
+ return err;
+ total_bank_config += sub_pool->bank_count;
+ }
+
+ /* Check config is valid, no bank over subscription */
+ if (WARN_ON(total_bank_config > pool_size / bank_size + 1))
+ return -EINVAL;
+
+ return 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h
index 81465e267b10..a68d931090dd 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h
@@ -4,6 +4,7 @@
#ifndef _MLXSW_SPECTRUM_CNT_H
#define _MLXSW_SPECTRUM_CNT_H
+#include "core.h"
#include "spectrum.h"
enum mlxsw_sp_counter_sub_pool_id {
@@ -19,5 +20,6 @@ void mlxsw_sp_counter_free(struct mlxsw_sp *mlxsw_sp,
unsigned int counter_index);
int mlxsw_sp_counter_pool_init(struct mlxsw_sp *mlxsw_sp);
void mlxsw_sp_counter_pool_fini(struct mlxsw_sp *mlxsw_sp);
+int mlxsw_sp_counter_resources_register(struct mlxsw_core *mlxsw_core);
#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
index 88aa554415df..21c4b10d106c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
@@ -26,17 +26,17 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp,
if (!flow_action_has_entries(flow_action))
return 0;
- if (!flow_action_mixed_hw_stats_types_check(flow_action, extack))
+ if (!flow_action_mixed_hw_stats_check(flow_action, extack))
return -EOPNOTSUPP;
act = flow_action_first_entry_get(flow_action);
- if (act->hw_stats_type == FLOW_ACTION_HW_STATS_TYPE_ANY ||
- act->hw_stats_type == FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE) {
+ if (act->hw_stats_type == FLOW_ACTION_HW_STATS_ANY ||
+ act->hw_stats_type == FLOW_ACTION_HW_STATS_IMMEDIATE) {
/* Count action is inserted first */
err = mlxsw_sp_acl_rulei_act_count(mlxsw_sp, rulei, extack);
if (err)
return err;
- } else if (act->hw_stats_type != FLOW_ACTION_HW_STATS_TYPE_DISABLED) {
+ } else if (act->hw_stats_type != FLOW_ACTION_HW_STATS_DISABLED) {
NL_SET_ERR_MSG_MOD(extack, "Unsupported action HW stats type");
return -EOPNOTSUPP;
}
diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c
index 6d84173373c7..873a9944fbfb 100644
--- a/drivers/net/ethernet/mscc/ocelot_flower.c
+++ b/drivers/net/ethernet/mscc/ocelot_flower.c
@@ -17,8 +17,8 @@ static int ocelot_flower_parse_action(struct flow_cls_offload *f,
if (!flow_offload_has_one_action(&f->rule->action))
return -EOPNOTSUPP;
- if (!flow_action_basic_hw_stats_types_check(&f->rule->action,
- f->common.extack))
+ if (!flow_action_basic_hw_stats_check(&f->rule->action,
+ f->common.extack))
return -EOPNOTSUPP;
flow_action_for_each(i, a, &f->rule->action) {
diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 4aa7346cb040..1c76e1592ca2 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -1207,8 +1207,8 @@ int nfp_flower_compile_action(struct nfp_app *app,
bool pkt_host = false;
u32 csum_updated = 0;
- if (!flow_action_basic_hw_stats_types_check(&flow->rule->action,
- extack))
+ if (!flow_action_hw_stats_check(&flow->rule->action, extack,
+ FLOW_ACTION_HW_STATS_DELAYED_BIT))
return -EOPNOTSUPP;
memset(nfp_flow->action_data, 0, NFP_FL_MAX_A_SIZ);
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c
index ed14164468a1..273c889faaad 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c
@@ -77,12 +77,16 @@ int ionic_devlink_register(struct ionic *ionic)
return err;
}
+ /* don't register the mgmt_nic as a port */
+ if (ionic->is_mgmt_nic)
+ return 0;
+
devlink_port_attrs_set(&ionic->dl_port, DEVLINK_PORT_FLAVOUR_PHYSICAL,
0, false, 0, NULL, 0);
err = devlink_port_register(dl, &ionic->dl_port, 0);
if (err)
dev_err(ionic->dev, "devlink_port_register failed: %d\n", err);
- else if (!ionic->is_mgmt_nic)
+ else
devlink_port_type_eth_set(&ionic->dl_port,
ionic->master_lif->netdev);
@@ -93,6 +97,7 @@ void ionic_devlink_unregister(struct ionic *ionic)
{
struct devlink *dl = priv_to_devlink(ionic);
- devlink_port_unregister(&ionic->dl_port);
+ if (ionic->dl_port.registered)
+ devlink_port_unregister(&ionic->dl_port);
devlink_unregister(dl);
}
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
index a233716eac29..6996229facfd 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
@@ -3,6 +3,7 @@
#include <linux/module.h>
#include <linux/netdevice.h>
+#include <linux/sfp.h>
#include "ionic.h"
#include "ionic_bus.h"
@@ -677,23 +678,27 @@ static int ionic_get_module_info(struct net_device *netdev,
struct ionic_lif *lif = netdev_priv(netdev);
struct ionic_dev *idev = &lif->ionic->idev;
struct ionic_xcvr_status *xcvr;
+ struct sfp_eeprom_base *sfp;
xcvr = &idev->port_info->status.xcvr;
+ sfp = (struct sfp_eeprom_base *) xcvr->sprom;
/* report the module data type and length */
- switch (xcvr->sprom[0]) {
- case 0x03: /* SFP */
+ switch (sfp->phys_id) {
+ case SFF8024_ID_SFP:
modinfo->type = ETH_MODULE_SFF_8079;
modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN;
break;
- case 0x0D: /* QSFP */
- case 0x11: /* QSFP28 */
+ case SFF8024_ID_QSFP_8436_8636:
+ case SFF8024_ID_QSFP28_8636:
modinfo->type = ETH_MODULE_SFF_8436;
modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
break;
default:
netdev_info(netdev, "unknown xcvr type 0x%02x\n",
xcvr->sprom[0]);
+ modinfo->type = 0;
+ modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN;
break;
}
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c
index b903016193df..12e3823b0bc1 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c
@@ -2066,9 +2066,11 @@ static void ionic_lif_deinit(struct ionic_lif *lif)
clear_bit(IONIC_LIF_F_INITED, lif->state);
ionic_rx_filters_deinit(lif);
- ionic_lif_rss_deinit(lif);
+ if (lif->netdev->features & NETIF_F_RXHASH)
+ ionic_lif_rss_deinit(lif);
napi_disable(&lif->adminqcq->napi);
+ netif_napi_del(&lif->adminqcq->napi);
ionic_lif_qcq_deinit(lif, lif->notifyqcq);
ionic_lif_qcq_deinit(lif, lif->adminqcq);
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c
index e4a76e66f542..c5e3d7639f7e 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_main.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c
@@ -58,6 +58,8 @@ static const char *ionic_error_to_str(enum ionic_status_code code)
return "IONIC_RC_BAD_ADDR";
case IONIC_RC_DEV_CMD:
return "IONIC_RC_DEV_CMD";
+ case IONIC_RC_ENOSUPP:
+ return "IONIC_RC_ENOSUPP";
case IONIC_RC_ERROR:
return "IONIC_RC_ERROR";
case IONIC_RC_ERDMA:
@@ -76,6 +78,7 @@ static int ionic_error_to_errno(enum ionic_status_code code)
case IONIC_RC_EQTYPE:
case IONIC_RC_EQID:
case IONIC_RC_EINVAL:
+ case IONIC_RC_ENOSUPP:
return -EINVAL;
case IONIC_RC_EPERM:
return -EPERM;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 6505f7e2d1db..fe72bb6c9455 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -1757,7 +1757,7 @@ static int qede_parse_actions(struct qede_dev *edev,
return -EINVAL;
}
- if (!flow_action_basic_hw_stats_types_check(flow_action, extack))
+ if (!flow_action_basic_hw_stats_check(flow_action, extack))
return -EOPNOTSUPP;
flow_action_for_each(i, act, flow_action) {
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index 9a637cd67f43..04e88d05e8ff 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -232,9 +232,6 @@ static int efx_ethtool_set_coalesce(struct net_device *net_dev,
bool adaptive, rx_may_override_tx;
int rc;
- if (coalesce->use_adaptive_tx_coalesce)
- return -EINVAL;
-
efx_get_irq_moderation(efx, &tx_usecs, &rx_usecs, &adaptive);
if (coalesce->rx_coalesce_usecs != rx_usecs)
@@ -1138,6 +1135,9 @@ static int efx_ethtool_set_fecparam(struct net_device *net_dev,
}
const struct ethtool_ops efx_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_USECS |
+ ETHTOOL_COALESCE_USECS_IRQ |
+ ETHTOOL_COALESCE_USE_ADAPTIVE_RX,
.get_drvinfo = efx_ethtool_get_drvinfo,
.get_regs_len = efx_ethtool_get_regs_len,
.get_regs = efx_ethtool_get_regs,
diff --git a/drivers/net/ethernet/sfc/falcon/ethtool.c b/drivers/net/ethernet/sfc/falcon/ethtool.c
index 08bd6a321918..db90d94e24c9 100644
--- a/drivers/net/ethernet/sfc/falcon/ethtool.c
+++ b/drivers/net/ethernet/sfc/falcon/ethtool.c
@@ -603,9 +603,6 @@ static int ef4_ethtool_set_coalesce(struct net_device *net_dev,
bool adaptive, rx_may_override_tx;
int rc;
- if (coalesce->use_adaptive_tx_coalesce)
- return -EINVAL;
-
ef4_get_irq_moderation(efx, &tx_usecs, &rx_usecs, &adaptive);
if (coalesce->rx_coalesce_usecs != rx_usecs)
@@ -1311,6 +1308,9 @@ static int ef4_ethtool_get_module_info(struct net_device *net_dev,
}
const struct ethtool_ops ef4_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_USECS |
+ ETHTOOL_COALESCE_USECS_IRQ |
+ ETHTOOL_COALESCE_USE_ADAPTIVE_RX,
.get_drvinfo = ef4_ethtool_get_drvinfo,
.get_regs_len = ef4_ethtool_get_regs_len,
.get_regs = ef4_ethtool_get_regs,
diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index 58b9b7ce7195..a5a0fb60193a 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -589,6 +589,8 @@ static void netsec_et_set_msglevel(struct net_device *dev, u32 datum)
}
static const struct ethtool_ops netsec_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_USECS |
+ ETHTOOL_COALESCE_MAX_FRAMES,
.get_drvinfo = netsec_et_get_drvinfo,
.get_link_ksettings = phy_ethtool_get_link_ksettings,
.set_link_ksettings = phy_ethtool_set_link_ksettings,
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 9bdbf589d93f..386663208c23 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -34,6 +34,11 @@
#define DWMAC_CORE_5_00 0x50
#define DWMAC_CORE_5_10 0x51
#define DWXGMAC_CORE_2_10 0x21
+#define DWXLGMAC_CORE_2_00 0x20
+
+/* Device ID */
+#define DWXGMAC_ID 0x76
+#define DWXLGMAC_ID 0x27
#define STMMAC_CHAN0 0 /* Always supported and default for all chips */
@@ -426,6 +431,12 @@ struct mac_link {
u32 speed5000;
u32 speed10000;
} xgmii;
+ struct {
+ u32 speed25000;
+ u32 speed40000;
+ u32 speed50000;
+ u32 speed100000;
+ } xlgmii;
};
struct mii_regs {
@@ -459,6 +470,7 @@ struct mac_device_info {
unsigned int pcs;
unsigned int pmt;
unsigned int ps;
+ unsigned int xlgmac;
};
struct stmmac_rx_routing {
@@ -470,6 +482,7 @@ int dwmac100_setup(struct stmmac_priv *priv);
int dwmac1000_setup(struct stmmac_priv *priv);
int dwmac4_setup(struct stmmac_priv *priv);
int dwxgmac2_setup(struct stmmac_priv *priv);
+int dwxlgmac2_setup(struct stmmac_priv *priv);
void stmmac_set_mac_addr(void __iomem *ioaddr, u8 addr[6],
unsigned int high, unsigned int low);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
index 67b754a56288..0e4575f7bedb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
@@ -9,6 +9,7 @@
#include <linux/iopoll.h>
#include "stmmac.h"
#include "stmmac_ptp.h"
+#include "dwxlgmac2.h"
#include "dwxgmac2.h"
static void dwxgmac2_core_init(struct mac_device_info *hw,
@@ -1485,6 +1486,67 @@ const struct stmmac_ops dwxgmac210_ops = {
.fpe_configure = dwxgmac3_fpe_configure,
};
+static void dwxlgmac2_rx_queue_enable(struct mac_device_info *hw, u8 mode,
+ u32 queue)
+{
+ void __iomem *ioaddr = hw->pcsr;
+ u32 value;
+
+ value = readl(ioaddr + XLGMAC_RXQ_ENABLE_CTRL0) & ~XGMAC_RXQEN(queue);
+ if (mode == MTL_QUEUE_AVB)
+ value |= 0x1 << XGMAC_RXQEN_SHIFT(queue);
+ else if (mode == MTL_QUEUE_DCB)
+ value |= 0x2 << XGMAC_RXQEN_SHIFT(queue);
+ writel(value, ioaddr + XLGMAC_RXQ_ENABLE_CTRL0);
+}
+
+const struct stmmac_ops dwxlgmac2_ops = {
+ .core_init = dwxgmac2_core_init,
+ .set_mac = dwxgmac2_set_mac,
+ .rx_ipc = dwxgmac2_rx_ipc,
+ .rx_queue_enable = dwxlgmac2_rx_queue_enable,
+ .rx_queue_prio = dwxgmac2_rx_queue_prio,
+ .tx_queue_prio = dwxgmac2_tx_queue_prio,
+ .rx_queue_routing = NULL,
+ .prog_mtl_rx_algorithms = dwxgmac2_prog_mtl_rx_algorithms,
+ .prog_mtl_tx_algorithms = dwxgmac2_prog_mtl_tx_algorithms,
+ .set_mtl_tx_queue_weight = dwxgmac2_set_mtl_tx_queue_weight,
+ .map_mtl_to_dma = dwxgmac2_map_mtl_to_dma,
+ .config_cbs = dwxgmac2_config_cbs,
+ .dump_regs = dwxgmac2_dump_regs,
+ .host_irq_status = dwxgmac2_host_irq_status,
+ .host_mtl_irq_status = dwxgmac2_host_mtl_irq_status,
+ .flow_ctrl = dwxgmac2_flow_ctrl,
+ .pmt = dwxgmac2_pmt,
+ .set_umac_addr = dwxgmac2_set_umac_addr,
+ .get_umac_addr = dwxgmac2_get_umac_addr,
+ .set_eee_mode = dwxgmac2_set_eee_mode,
+ .reset_eee_mode = dwxgmac2_reset_eee_mode,
+ .set_eee_timer = dwxgmac2_set_eee_timer,
+ .set_eee_pls = dwxgmac2_set_eee_pls,
+ .pcs_ctrl_ane = NULL,
+ .pcs_rane = NULL,
+ .pcs_get_adv_lp = NULL,
+ .debug = NULL,
+ .set_filter = dwxgmac2_set_filter,
+ .safety_feat_config = dwxgmac3_safety_feat_config,
+ .safety_feat_irq_status = dwxgmac3_safety_feat_irq_status,
+ .safety_feat_dump = dwxgmac3_safety_feat_dump,
+ .set_mac_loopback = dwxgmac2_set_mac_loopback,
+ .rss_configure = dwxgmac2_rss_configure,
+ .update_vlan_hash = dwxgmac2_update_vlan_hash,
+ .rxp_config = dwxgmac3_rxp_config,
+ .get_mac_tx_timestamp = dwxgmac2_get_mac_tx_timestamp,
+ .flex_pps_config = dwxgmac2_flex_pps_config,
+ .sarc_configure = dwxgmac2_sarc_configure,
+ .enable_vlan = dwxgmac2_enable_vlan,
+ .config_l3_filter = dwxgmac2_config_l3_filter,
+ .config_l4_filter = dwxgmac2_config_l4_filter,
+ .set_arp_offload = dwxgmac2_set_arp_offload,
+ .est_configure = dwxgmac3_est_configure,
+ .fpe_configure = dwxgmac3_fpe_configure,
+};
+
int dwxgmac2_setup(struct stmmac_priv *priv)
{
struct mac_device_info *mac = priv->hw;
@@ -1521,3 +1583,40 @@ int dwxgmac2_setup(struct stmmac_priv *priv)
return 0;
}
+
+int dwxlgmac2_setup(struct stmmac_priv *priv)
+{
+ struct mac_device_info *mac = priv->hw;
+
+ dev_info(priv->device, "\tXLGMAC\n");
+
+ priv->dev->priv_flags |= IFF_UNICAST_FLT;
+ mac->pcsr = priv->ioaddr;
+ mac->multicast_filter_bins = priv->plat->multicast_filter_bins;
+ mac->unicast_filter_entries = priv->plat->unicast_filter_entries;
+ mac->mcast_bits_log2 = 0;
+
+ if (mac->multicast_filter_bins)
+ mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins);
+
+ mac->link.duplex = 0;
+ mac->link.speed1000 = XLGMAC_CONFIG_SS_1000;
+ mac->link.speed2500 = XLGMAC_CONFIG_SS_2500;
+ mac->link.xgmii.speed10000 = XLGMAC_CONFIG_SS_10G;
+ mac->link.xlgmii.speed25000 = XLGMAC_CONFIG_SS_25G;
+ mac->link.xlgmii.speed40000 = XLGMAC_CONFIG_SS_40G;
+ mac->link.xlgmii.speed50000 = XLGMAC_CONFIG_SS_50G;
+ mac->link.xlgmii.speed100000 = XLGMAC_CONFIG_SS_100G;
+ mac->link.speed_mask = XLGMAC_CONFIG_SS;
+
+ mac->mii.addr = XGMAC_MDIO_ADDR;
+ mac->mii.data = XGMAC_MDIO_DATA;
+ mac->mii.addr_shift = 16;
+ mac->mii.addr_mask = GENMASK(20, 16);
+ mac->mii.reg_shift = 0;
+ mac->mii.reg_mask = GENMASK(15, 0);
+ mac->mii.clk_csr_shift = 19;
+ mac->mii.clk_csr_mask = GENMASK(21, 19);
+
+ return 0;
+}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxlgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxlgmac2.h
new file mode 100644
index 000000000000..726090d49221
--- /dev/null
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxlgmac2.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2020 Synopsys, Inc. and/or its affiliates.
+ * Synopsys DesignWare XLGMAC definitions.
+ */
+
+#ifndef __STMMAC_DWXLGMAC2_H__
+#define __STMMAC_DWXLGMAC2_H__
+
+/* MAC Registers */
+#define XLGMAC_CONFIG_SS GENMASK(30, 28)
+#define XLGMAC_CONFIG_SS_SHIFT 28
+#define XLGMAC_CONFIG_SS_40G (0x0 << XLGMAC_CONFIG_SS_SHIFT)
+#define XLGMAC_CONFIG_SS_25G (0x1 << XLGMAC_CONFIG_SS_SHIFT)
+#define XLGMAC_CONFIG_SS_50G (0x2 << XLGMAC_CONFIG_SS_SHIFT)
+#define XLGMAC_CONFIG_SS_100G (0x3 << XLGMAC_CONFIG_SS_SHIFT)
+#define XLGMAC_CONFIG_SS_10G (0x4 << XLGMAC_CONFIG_SS_SHIFT)
+#define XLGMAC_CONFIG_SS_2500 (0x6 << XLGMAC_CONFIG_SS_SHIFT)
+#define XLGMAC_CONFIG_SS_1000 (0x7 << XLGMAC_CONFIG_SS_SHIFT)
+#define XLGMAC_RXQ_ENABLE_CTRL0 0x00000140
+
+#endif /* __STMMAC_DWXLGMAC2_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
index 3af2e5015245..bb7114f970f8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -23,6 +23,18 @@ static u32 stmmac_get_id(struct stmmac_priv *priv, u32 id_reg)
return reg & GENMASK(7, 0);
}
+static u32 stmmac_get_dev_id(struct stmmac_priv *priv, u32 id_reg)
+{
+ u32 reg = readl(priv->ioaddr + id_reg);
+
+ if (!reg) {
+ dev_info(priv->device, "Version ID not available\n");
+ return 0x0;
+ }
+
+ return (reg & GENMASK(15, 8)) >> 8;
+}
+
static void stmmac_dwmac_mode_quirk(struct stmmac_priv *priv)
{
struct mac_device_info *mac = priv->hw;
@@ -69,11 +81,18 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
return 0;
}
+static int stmmac_dwxlgmac_quirks(struct stmmac_priv *priv)
+{
+ priv->hw->xlgmac = true;
+ return 0;
+}
+
static const struct stmmac_hwif_entry {
bool gmac;
bool gmac4;
bool xgmac;
u32 min_id;
+ u32 dev_id;
const struct stmmac_regs_off regs;
const void *desc;
const void *dma;
@@ -199,6 +218,7 @@ static const struct stmmac_hwif_entry {
.gmac4 = false,
.xgmac = true,
.min_id = DWXGMAC_CORE_2_10,
+ .dev_id = DWXGMAC_ID,
.regs = {
.ptp_off = PTP_XGMAC_OFFSET,
.mmc_off = MMC_XGMAC_OFFSET,
@@ -212,6 +232,25 @@ static const struct stmmac_hwif_entry {
.mmc = &dwxgmac_mmc_ops,
.setup = dwxgmac2_setup,
.quirks = NULL,
+ }, {
+ .gmac = false,
+ .gmac4 = false,
+ .xgmac = true,
+ .min_id = DWXLGMAC_CORE_2_00,
+ .dev_id = DWXLGMAC_ID,
+ .regs = {
+ .ptp_off = PTP_XGMAC_OFFSET,
+ .mmc_off = MMC_XGMAC_OFFSET,
+ },
+ .desc = &dwxgmac210_desc_ops,
+ .dma = &dwxgmac210_dma_ops,
+ .mac = &dwxlgmac2_ops,
+ .hwtimestamp = &stmmac_ptp,
+ .mode = NULL,
+ .tc = &dwmac510_tc_ops,
+ .mmc = &dwxgmac_mmc_ops,
+ .setup = dwxlgmac2_setup,
+ .quirks = stmmac_dwxlgmac_quirks,
},
};
@@ -223,13 +262,15 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
const struct stmmac_hwif_entry *entry;
struct mac_device_info *mac;
bool needs_setup = true;
+ u32 id, dev_id = 0;
int i, ret;
- u32 id;
if (needs_gmac) {
id = stmmac_get_id(priv, GMAC_VERSION);
} else if (needs_gmac4 || needs_xgmac) {
id = stmmac_get_id(priv, GMAC4_VERSION);
+ if (needs_xgmac)
+ dev_id = stmmac_get_dev_id(priv, GMAC4_VERSION);
} else {
id = 0;
}
@@ -267,6 +308,8 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
/* Use synopsys_id var because some setups can override this */
if (priv->synopsys_id < entry->min_id)
continue;
+ if (needs_xgmac && (dev_id ^ entry->dev_id))
+ continue;
/* Only use generic HW helpers if needed */
mac->desc = mac->desc ? : entry->desc;
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index c71dd99c8abf..fc350149ba34 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -605,6 +605,7 @@ extern const struct stmmac_dma_ops dwmac410_dma_ops;
extern const struct stmmac_ops dwmac510_ops;
extern const struct stmmac_tc_ops dwmac510_tc_ops;
extern const struct stmmac_ops dwxgmac210_ops;
+extern const struct stmmac_ops dwxlgmac2_ops;
extern const struct stmmac_dma_ops dwxgmac210_dma_ops;
extern const struct stmmac_desc_ops dwxgmac210_desc_ops;
extern const struct stmmac_mmc_ops dwmac_mmc_ops;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index f26699d9a050..0e8c80f23557 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -849,6 +849,38 @@ static void stmmac_validate(struct phylink_config *config,
phylink_set(mac_supported, 10000baseKX4_Full);
phylink_set(mac_supported, 10000baseKR_Full);
}
+ if (!max_speed || (max_speed >= 25000)) {
+ phylink_set(mac_supported, 25000baseCR_Full);
+ phylink_set(mac_supported, 25000baseKR_Full);
+ phylink_set(mac_supported, 25000baseSR_Full);
+ }
+ if (!max_speed || (max_speed >= 40000)) {
+ phylink_set(mac_supported, 40000baseKR4_Full);
+ phylink_set(mac_supported, 40000baseCR4_Full);
+ phylink_set(mac_supported, 40000baseSR4_Full);
+ phylink_set(mac_supported, 40000baseLR4_Full);
+ }
+ if (!max_speed || (max_speed >= 50000)) {
+ phylink_set(mac_supported, 50000baseCR2_Full);
+ phylink_set(mac_supported, 50000baseKR2_Full);
+ phylink_set(mac_supported, 50000baseSR2_Full);
+ phylink_set(mac_supported, 50000baseKR_Full);
+ phylink_set(mac_supported, 50000baseSR_Full);
+ phylink_set(mac_supported, 50000baseCR_Full);
+ phylink_set(mac_supported, 50000baseLR_ER_FR_Full);
+ phylink_set(mac_supported, 50000baseDR_Full);
+ }
+ if (!max_speed || (max_speed >= 100000)) {
+ phylink_set(mac_supported, 100000baseKR4_Full);
+ phylink_set(mac_supported, 100000baseSR4_Full);
+ phylink_set(mac_supported, 100000baseCR4_Full);
+ phylink_set(mac_supported, 100000baseLR4_ER4_Full);
+ phylink_set(mac_supported, 100000baseKR2_Full);
+ phylink_set(mac_supported, 100000baseSR2_Full);
+ phylink_set(mac_supported, 100000baseCR2_Full);
+ phylink_set(mac_supported, 100000baseLR2_ER2_FR2_Full);
+ phylink_set(mac_supported, 100000baseDR2_Full);
+ }
}
/* Half-Duplex can only work with single queue */
@@ -929,6 +961,32 @@ static void stmmac_mac_link_up(struct phylink_config *config,
default:
return;
}
+ } else if (interface == PHY_INTERFACE_MODE_XLGMII) {
+ switch (speed) {
+ case SPEED_100000:
+ ctrl |= priv->hw->link.xlgmii.speed100000;
+ break;
+ case SPEED_50000:
+ ctrl |= priv->hw->link.xlgmii.speed50000;
+ break;
+ case SPEED_40000:
+ ctrl |= priv->hw->link.xlgmii.speed40000;
+ break;
+ case SPEED_25000:
+ ctrl |= priv->hw->link.xlgmii.speed25000;
+ break;
+ case SPEED_10000:
+ ctrl |= priv->hw->link.xgmii.speed10000;
+ break;
+ case SPEED_2500:
+ ctrl |= priv->hw->link.speed2500;
+ break;
+ case SPEED_1000:
+ ctrl |= priv->hw->link.speed1000;
+ break;
+ default:
+ return;
+ }
} else {
switch (speed) {
case SPEED_2500:
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
index 07dbe4f5456e..63d6c85a59e3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
@@ -1387,7 +1387,7 @@ static int __stmmac_test_l3filt(struct stmmac_priv *priv, u32 dst, u32 src,
cls->rule = rule;
rule->action.entries[0].id = FLOW_ACTION_DROP;
- rule->action.entries[0].hw_stats_type = FLOW_ACTION_HW_STATS_TYPE_ANY;
+ rule->action.entries[0].hw_stats_type = FLOW_ACTION_HW_STATS_ANY;
rule->action.num_entries = 1;
attr.dst = priv->dev->dev_addr;
@@ -1516,7 +1516,7 @@ static int __stmmac_test_l4filt(struct stmmac_priv *priv, u32 dst, u32 src,
cls->rule = rule;
rule->action.entries[0].id = FLOW_ACTION_DROP;
- rule->action.entries[0].hw_stats_type = FLOW_ACTION_HW_STATS_TYPE_ANY;
+ rule->action.entries[0].hw_stats_type = FLOW_ACTION_HW_STATS_ANY;
rule->action.num_entries = 1;
attr.dst = priv->dev->dev_addr;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
index a0e6118444b0..3d747846f482 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -376,7 +376,7 @@ static int tc_parse_flow_actions(struct stmmac_priv *priv,
if (!flow_action_has_entries(action))
return -EINVAL;
- if (!flow_action_basic_hw_stats_types_check(action, extack))
+ if (!flow_action_basic_hw_stats_check(action, extack))
return -EOPNOTSUPP;
flow_action_for_each(i, act, action) {
diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c
index fde722136869..bc198eadfcab 100644
--- a/drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c
+++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c
@@ -151,7 +151,6 @@ static int xlgmac_ethtool_get_coalesce(struct net_device *netdev,
{
struct xlgmac_pdata *pdata = netdev_priv(netdev);
- memset(ec, 0, sizeof(struct ethtool_coalesce));
ec->rx_coalesce_usecs = pdata->rx_usecs;
ec->rx_max_coalesced_frames = pdata->rx_frames;
ec->tx_max_coalesced_frames = pdata->tx_frames;
@@ -167,20 +166,6 @@ static int xlgmac_ethtool_set_coalesce(struct net_device *netdev,
unsigned int rx_frames, rx_riwt, rx_usecs;
unsigned int tx_frames;
- /* Check for not supported parameters */
- if ((ec->rx_coalesce_usecs_irq) || (ec->rx_max_coalesced_frames_irq) ||
- (ec->tx_coalesce_usecs) || (ec->tx_coalesce_usecs_high) ||
- (ec->tx_max_coalesced_frames_irq) || (ec->tx_coalesce_usecs_irq) ||
- (ec->stats_block_coalesce_usecs) || (ec->pkt_rate_low) ||
- (ec->use_adaptive_rx_coalesce) || (ec->use_adaptive_tx_coalesce) ||
- (ec->rx_max_coalesced_frames_low) || (ec->rx_coalesce_usecs_low) ||
- (ec->tx_coalesce_usecs_low) || (ec->tx_max_coalesced_frames_low) ||
- (ec->pkt_rate_high) || (ec->rx_coalesce_usecs_high) ||
- (ec->rx_max_coalesced_frames_high) ||
- (ec->tx_max_coalesced_frames_high) ||
- (ec->rate_sample_interval))
- return -EOPNOTSUPP;
-
rx_usecs = ec->rx_coalesce_usecs;
rx_riwt = hw_ops->usec_to_riwt(pdata, rx_usecs);
rx_frames = ec->rx_max_coalesced_frames;
@@ -257,6 +242,8 @@ static void xlgmac_ethtool_get_ethtool_stats(struct net_device *netdev,
}
static const struct ethtool_ops xlgmac_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS |
+ ETHTOOL_COALESCE_MAX_FRAMES,
.get_drvinfo = xlgmac_ethtool_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_msglevel = xlgmac_ethtool_get_msglevel,
diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c
index 0f8a924fc60c..40a2ce0ca808 100644
--- a/drivers/net/ethernet/tehuti/tehuti.c
+++ b/drivers/net/ethernet/tehuti/tehuti.c
@@ -2373,6 +2373,8 @@ static void bdx_get_ethtool_stats(struct net_device *netdev,
static void bdx_set_ethtool_ops(struct net_device *netdev)
{
static const struct ethtool_ops bdx_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_USECS |
+ ETHTOOL_COALESCE_MAX_FRAMES,
.get_drvinfo = bdx_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_coalesce = bdx_get_coalesce,
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 6ae4a72e6f43..c2c5bf87da01 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1211,6 +1211,7 @@ static int cpsw_set_channels(struct net_device *ndev,
}
static const struct ethtool_ops cpsw_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS,
.get_drvinfo = cpsw_get_drvinfo,
.get_msglevel = cpsw_get_msglevel,
.set_msglevel = cpsw_set_msglevel,
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 71215db7934b..9209e613257d 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -1175,6 +1175,7 @@ static int cpsw_set_channels(struct net_device *ndev,
}
static const struct ethtool_ops cpsw_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS,
.get_drvinfo = cpsw_get_drvinfo,
.get_msglevel = cpsw_get_msglevel,
.set_msglevel = cpsw_set_msglevel,
diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index 75d4e16c692b..de282531f68b 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -481,6 +481,7 @@ static int emac_set_coalesce(struct net_device *ndev,
* Ethtool support for EMAC adapter
*/
static const struct ethtool_ops ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS,
.get_drvinfo = emac_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_coalesce = emac_get_coalesce,
diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index dc022cd5bc42..3e313e71ae36 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -1314,25 +1314,6 @@ static int ll_temac_ethtools_set_coalesce(struct net_device *ndev,
return -EFAULT;
}
- if (ec->rx_coalesce_usecs_irq ||
- ec->rx_max_coalesced_frames_irq ||
- ec->tx_coalesce_usecs_irq ||
- ec->tx_max_coalesced_frames_irq ||
- ec->stats_block_coalesce_usecs ||
- ec->use_adaptive_rx_coalesce ||
- ec->use_adaptive_tx_coalesce ||
- ec->pkt_rate_low ||
- ec->rx_coalesce_usecs_low ||
- ec->rx_max_coalesced_frames_low ||
- ec->tx_coalesce_usecs_low ||
- ec->tx_max_coalesced_frames_low ||
- ec->pkt_rate_high ||
- ec->rx_coalesce_usecs_high ||
- ec->rx_max_coalesced_frames_high ||
- ec->tx_coalesce_usecs_high ||
- ec->tx_max_coalesced_frames_high ||
- ec->rate_sample_interval)
- return -EOPNOTSUPP;
if (ec->rx_max_coalesced_frames)
lp->coalesce_count_rx = ec->rx_max_coalesced_frames;
if (ec->tx_max_coalesced_frames)
@@ -1351,6 +1332,8 @@ static int ll_temac_ethtools_set_coalesce(struct net_device *ndev,
}
static const struct ethtool_ops temac_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_USECS |
+ ETHTOOL_COALESCE_MAX_FRAMES,
.nway_reset = phy_ethtool_nway_reset,
.get_link = ethtool_op_get_link,
.get_ts_info = ethtool_op_get_ts_info,
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index c2f4c5ca2e80..e2f3e2b0cec7 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -1309,27 +1309,6 @@ static int axienet_ethtools_set_coalesce(struct net_device *ndev,
return -EFAULT;
}
- if ((ecoalesce->rx_coalesce_usecs) ||
- (ecoalesce->rx_coalesce_usecs_irq) ||
- (ecoalesce->rx_max_coalesced_frames_irq) ||
- (ecoalesce->tx_coalesce_usecs) ||
- (ecoalesce->tx_coalesce_usecs_irq) ||
- (ecoalesce->tx_max_coalesced_frames_irq) ||
- (ecoalesce->stats_block_coalesce_usecs) ||
- (ecoalesce->use_adaptive_rx_coalesce) ||
- (ecoalesce->use_adaptive_tx_coalesce) ||
- (ecoalesce->pkt_rate_low) ||
- (ecoalesce->rx_coalesce_usecs_low) ||
- (ecoalesce->rx_max_coalesced_frames_low) ||
- (ecoalesce->tx_coalesce_usecs_low) ||
- (ecoalesce->tx_max_coalesced_frames_low) ||
- (ecoalesce->pkt_rate_high) ||
- (ecoalesce->rx_coalesce_usecs_high) ||
- (ecoalesce->rx_max_coalesced_frames_high) ||
- (ecoalesce->tx_coalesce_usecs_high) ||
- (ecoalesce->tx_max_coalesced_frames_high) ||
- (ecoalesce->rate_sample_interval))
- return -EOPNOTSUPP;
if (ecoalesce->rx_max_coalesced_frames)
lp->coalesce_count_rx = ecoalesce->rx_max_coalesced_frames;
if (ecoalesce->tx_max_coalesced_frames)
@@ -1357,6 +1336,7 @@ axienet_ethtools_set_link_ksettings(struct net_device *ndev,
}
static const struct ethtool_ops axienet_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES,
.get_drvinfo = axienet_ethtools_get_drvinfo,
.get_regs_len = axienet_ethtools_get_regs_len,
.get_regs = axienet_ethtools_get_regs,
diff --git a/drivers/net/phy/mdio-xpcs.c b/drivers/net/phy/mdio-xpcs.c
index 973f588146f7..2f4cdf807160 100644
--- a/drivers/net/phy/mdio-xpcs.c
+++ b/drivers/net/phy/mdio-xpcs.c
@@ -14,6 +14,7 @@
#define SYNOPSYS_XPCS_USXGMII_ID 0x7996ced0
#define SYNOPSYS_XPCS_10GKR_ID 0x7996ced0
+#define SYNOPSYS_XPCS_XLGMII_ID 0x7996ced0
#define SYNOPSYS_XPCS_MASK 0xffffffff
/* Vendor regs access */
@@ -74,6 +75,36 @@ static const int xpcs_10gkr_features[] = {
__ETHTOOL_LINK_MODE_MASK_NBITS,
};
+static const int xpcs_xlgmii_features[] = {
+ ETHTOOL_LINK_MODE_Pause_BIT,
+ ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+ ETHTOOL_LINK_MODE_25000baseCR_Full_BIT,
+ ETHTOOL_LINK_MODE_25000baseKR_Full_BIT,
+ ETHTOOL_LINK_MODE_25000baseSR_Full_BIT,
+ ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT,
+ ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT,
+ ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT,
+ ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT,
+ ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT,
+ ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT,
+ ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT,
+ ETHTOOL_LINK_MODE_50000baseKR_Full_BIT,
+ ETHTOOL_LINK_MODE_50000baseSR_Full_BIT,
+ ETHTOOL_LINK_MODE_50000baseCR_Full_BIT,
+ ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT,
+ ETHTOOL_LINK_MODE_50000baseDR_Full_BIT,
+ ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT,
+ ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT,
+ ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT,
+ ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT,
+ ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT,
+ ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT,
+ ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT,
+ ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT,
+ ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+};
+
static const phy_interface_t xpcs_usxgmii_interfaces[] = {
PHY_INTERFACE_MODE_USXGMII,
PHY_INTERFACE_MODE_MAX,
@@ -84,6 +115,11 @@ static const phy_interface_t xpcs_10gkr_interfaces[] = {
PHY_INTERFACE_MODE_MAX,
};
+static const phy_interface_t xpcs_xlgmii_interfaces[] = {
+ PHY_INTERFACE_MODE_XLGMII,
+ PHY_INTERFACE_MODE_MAX,
+};
+
static struct xpcs_id {
u32 id;
u32 mask;
@@ -100,6 +136,11 @@ static struct xpcs_id {
.mask = SYNOPSYS_XPCS_MASK,
.supported = xpcs_10gkr_features,
.interface = xpcs_10gkr_interfaces,
+ }, {
+ .id = SYNOPSYS_XPCS_XLGMII_ID,
+ .mask = SYNOPSYS_XPCS_MASK,
+ .supported = xpcs_xlgmii_features,
+ .interface = xpcs_xlgmii_interfaces,
},
};
@@ -458,6 +499,60 @@ static void xpcs_resolve_lpa(struct mdio_xpcs_args *xpcs,
state->duplex = DUPLEX_FULL;
}
+static int xpcs_get_max_xlgmii_speed(struct mdio_xpcs_args *xpcs,
+ struct phylink_link_state *state)
+{
+ unsigned long *adv = state->advertising;
+ int speed = SPEED_UNKNOWN;
+ int bit;
+
+ for_each_set_bit(bit, adv, __ETHTOOL_LINK_MODE_MASK_NBITS) {
+ int new_speed = SPEED_UNKNOWN;
+
+ switch (bit) {
+ case ETHTOOL_LINK_MODE_25000baseCR_Full_BIT:
+ case ETHTOOL_LINK_MODE_25000baseKR_Full_BIT:
+ case ETHTOOL_LINK_MODE_25000baseSR_Full_BIT:
+ new_speed = SPEED_25000;
+ break;
+ case ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT:
+ case ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT:
+ case ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT:
+ case ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT:
+ new_speed = SPEED_40000;
+ break;
+ case ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT:
+ case ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT:
+ case ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT:
+ case ETHTOOL_LINK_MODE_50000baseKR_Full_BIT:
+ case ETHTOOL_LINK_MODE_50000baseSR_Full_BIT:
+ case ETHTOOL_LINK_MODE_50000baseCR_Full_BIT:
+ case ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT:
+ case ETHTOOL_LINK_MODE_50000baseDR_Full_BIT:
+ new_speed = SPEED_50000;
+ break;
+ case ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT:
+ case ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT:
+ case ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT:
+ case ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT:
+ case ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT:
+ case ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT:
+ case ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT:
+ case ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT:
+ case ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT:
+ new_speed = SPEED_100000;
+ break;
+ default:
+ continue;
+ }
+
+ if (new_speed > speed)
+ speed = new_speed;
+ }
+
+ return speed;
+}
+
static void xpcs_resolve_pma(struct mdio_xpcs_args *xpcs,
struct phylink_link_state *state)
{
@@ -468,6 +563,9 @@ static void xpcs_resolve_pma(struct mdio_xpcs_args *xpcs,
case PHY_INTERFACE_MODE_10GKR:
state->speed = SPEED_10000;
break;
+ case PHY_INTERFACE_MODE_XLGMII:
+ state->speed = xpcs_get_max_xlgmii_speed(xpcs, state);
+ break;
default:
state->speed = SPEED_UNKNOWN;
break;
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 3ab9ca7614d1..522760c8bca6 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -825,6 +825,38 @@ int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val)
EXPORT_SYMBOL(__mdiobus_write);
/**
+ * __mdiobus_modify_changed - Unlocked version of the mdiobus_modify function
+ * @bus: the mii_bus struct
+ * @addr: the phy address
+ * @regnum: register number to modify
+ * @mask: bit mask of bits to clear
+ * @set: bit mask of bits to set
+ *
+ * Read, modify, and if any change, write the register value back to the
+ * device. Any error returns a negative number.
+ *
+ * NOTE: MUST NOT be called from interrupt context.
+ */
+int __mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum,
+ u16 mask, u16 set)
+{
+ int new, ret;
+
+ ret = __mdiobus_read(bus, addr, regnum);
+ if (ret < 0)
+ return ret;
+
+ new = (ret & ~mask) | set;
+ if (new == ret)
+ return 0;
+
+ ret = __mdiobus_write(bus, addr, regnum, new);
+
+ return ret < 0 ? ret : 1;
+}
+EXPORT_SYMBOL_GPL(__mdiobus_modify_changed);
+
+/**
* mdiobus_read_nested - Nested version of the mdiobus_read function
* @bus: the mii_bus struct
* @addr: the phy address
@@ -841,7 +873,8 @@ int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum)
{
int retval;
- BUG_ON(in_interrupt());
+ if (WARN_ON_ONCE(in_interrupt()))
+ return -EINVAL;
mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED);
retval = __mdiobus_read(bus, addr, regnum);
@@ -865,7 +898,8 @@ int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum)
{
int retval;
- BUG_ON(in_interrupt());
+ if (WARN_ON_ONCE(in_interrupt()))
+ return -EINVAL;
mutex_lock(&bus->mdio_lock);
retval = __mdiobus_read(bus, addr, regnum);
@@ -893,7 +927,8 @@ int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val)
{
int err;
- BUG_ON(in_interrupt());
+ if (WARN_ON_ONCE(in_interrupt()))
+ return -EINVAL;
mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED);
err = __mdiobus_write(bus, addr, regnum, val);
@@ -918,7 +953,8 @@ int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val)
{
int err;
- BUG_ON(in_interrupt());
+ if (WARN_ON_ONCE(in_interrupt()))
+ return -EINVAL;
mutex_lock(&bus->mdio_lock);
err = __mdiobus_write(bus, addr, regnum, val);
@@ -929,6 +965,30 @@ int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val)
EXPORT_SYMBOL(mdiobus_write);
/**
+ * mdiobus_modify - Convenience function for modifying a given mdio device
+ * register
+ * @bus: the mii_bus struct
+ * @addr: the phy address
+ * @regnum: register number to write
+ * @mask: bit mask of bits to clear
+ * @set: bit mask of bits to set
+ */
+int mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask, u16 set)
+{
+ int err;
+
+ if (WARN_ON_ONCE(in_interrupt()))
+ return -EINVAL;
+
+ mutex_lock(&bus->mdio_lock);
+ err = __mdiobus_modify_changed(bus, addr, regnum, mask, set);
+ mutex_unlock(&bus->mdio_lock);
+
+ return err < 0 ? err : 0;
+}
+EXPORT_SYMBOL_GPL(mdiobus_modify);
+
+/**
* mdio_bus_match - determine if given MDIO driver supports the given
* MDIO device
* @dev: target MDIO device
diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c
index cb4d65f81095..2f6229a70ec1 100644
--- a/drivers/net/phy/mscc/mscc_main.c
+++ b/drivers/net/phy/mscc/mscc_main.c
@@ -1429,11 +1429,21 @@ err:
return ret;
}
-static int vsc8584_handle_interrupt(struct phy_device *phydev)
+static irqreturn_t vsc8584_handle_interrupt(struct phy_device *phydev)
{
- vsc8584_handle_macsec_interrupt(phydev);
- phy_mac_interrupt(phydev);
- return 0;
+ int irq_status;
+
+ irq_status = phy_read(phydev, MII_VSC85XX_INT_STATUS);
+ if (irq_status < 0 || !(irq_status & MII_VSC85XX_INT_MASK_MASK))
+ return IRQ_NONE;
+
+ if (irq_status & MII_VSC85XX_INT_MASK_EXT)
+ vsc8584_handle_macsec_interrupt(phydev);
+
+ if (irq_status & MII_VSC85XX_INT_MASK_LINK_CHG)
+ phy_mac_interrupt(phydev);
+
+ return IRQ_HANDLED;
}
static int vsc85xx_config_init(struct phy_device *phydev)
diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c
index e083e7a76ada..94cd85b1e49b 100644
--- a/drivers/net/phy/phy-core.c
+++ b/drivers/net/phy/phy-core.c
@@ -489,37 +489,6 @@ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val)
EXPORT_SYMBOL(phy_write_mmd);
/**
- * __phy_modify_changed() - Convenience function for modifying a PHY register
- * @phydev: a pointer to a &struct phy_device
- * @regnum: register number
- * @mask: bit mask of bits to clear
- * @set: bit mask of bits to set
- *
- * Unlocked helper function which allows a PHY register to be modified as
- * new register value = (old register value & ~mask) | set
- *
- * Returns negative errno, 0 if there was no change, and 1 in case of change
- */
-int __phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask,
- u16 set)
-{
- int new, ret;
-
- ret = __phy_read(phydev, regnum);
- if (ret < 0)
- return ret;
-
- new = (ret & ~mask) | set;
- if (new == ret)
- return 0;
-
- ret = __phy_write(phydev, regnum, new);
-
- return ret < 0 ? ret : 1;
-}
-EXPORT_SYMBOL_GPL(__phy_modify_changed);
-
-/**
* phy_modify_changed - Function for modifying a PHY register
* @phydev: the phy_device struct
* @regnum: register number to modify
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 355bfdef48d2..d71212a418f3 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -715,26 +715,24 @@ static int phy_disable_interrupts(struct phy_device *phydev)
static irqreturn_t phy_interrupt(int irq, void *phy_dat)
{
struct phy_device *phydev = phy_dat;
+ struct phy_driver *drv = phydev->drv;
- if (phydev->drv->did_interrupt && !phydev->drv->did_interrupt(phydev))
+ if (drv->handle_interrupt)
+ return drv->handle_interrupt(phydev);
+
+ if (drv->did_interrupt && !drv->did_interrupt(phydev))
return IRQ_NONE;
- if (phydev->drv->handle_interrupt) {
- if (phydev->drv->handle_interrupt(phydev))
- goto phy_err;
- } else {
- /* reschedule state queue work to run as soon as possible */
- phy_trigger_machine(phydev);
- }
+ /* reschedule state queue work to run as soon as possible */
+ phy_trigger_machine(phydev);
/* did_interrupt() may have cleared the interrupt already */
- if (!phydev->drv->did_interrupt && phy_clear_interrupt(phydev))
- goto phy_err;
- return IRQ_HANDLED;
+ if (!drv->did_interrupt && phy_clear_interrupt(phydev)) {
+ phy_error(phydev);
+ return IRQ_NONE;
+ }
-phy_err:
- phy_error(phydev);
- return IRQ_NONE;
+ return IRQ_HANDLED;
}
/**
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 60f32b354013..fed0c5907c6a 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -2068,4 +2068,240 @@ void phylink_helper_basex_speed(struct phylink_link_state *state)
}
EXPORT_SYMBOL_GPL(phylink_helper_basex_speed);
+static void phylink_decode_c37_word(struct phylink_link_state *state,
+ uint16_t config_reg, int speed)
+{
+ bool tx_pause, rx_pause;
+ int fd_bit;
+
+ if (speed == SPEED_2500)
+ fd_bit = ETHTOOL_LINK_MODE_2500baseX_Full_BIT;
+ else
+ fd_bit = ETHTOOL_LINK_MODE_1000baseX_Full_BIT;
+
+ mii_lpa_mod_linkmode_x(state->lp_advertising, config_reg, fd_bit);
+
+ if (linkmode_test_bit(fd_bit, state->advertising) &&
+ linkmode_test_bit(fd_bit, state->lp_advertising)) {
+ state->speed = speed;
+ state->duplex = DUPLEX_FULL;
+ } else {
+ /* negotiation failure */
+ state->link = false;
+ }
+
+ linkmode_resolve_pause(state->advertising, state->lp_advertising,
+ &tx_pause, &rx_pause);
+
+ if (tx_pause)
+ state->pause |= MLO_PAUSE_TX;
+ if (rx_pause)
+ state->pause |= MLO_PAUSE_RX;
+}
+
+static void phylink_decode_sgmii_word(struct phylink_link_state *state,
+ uint16_t config_reg)
+{
+ if (!(config_reg & LPA_SGMII_LINK)) {
+ state->link = false;
+ return;
+ }
+
+ switch (config_reg & LPA_SGMII_SPD_MASK) {
+ case LPA_SGMII_10:
+ state->speed = SPEED_10;
+ break;
+ case LPA_SGMII_100:
+ state->speed = SPEED_100;
+ break;
+ case LPA_SGMII_1000:
+ state->speed = SPEED_1000;
+ break;
+ default:
+ state->link = false;
+ return;
+ }
+ if (config_reg & LPA_SGMII_FULL_DUPLEX)
+ state->duplex = DUPLEX_FULL;
+ else
+ state->duplex = DUPLEX_HALF;
+}
+
+/**
+ * phylink_mii_c22_pcs_get_state() - read the MAC PCS state
+ * @pcs: a pointer to a &struct mdio_device.
+ * @state: a pointer to a &struct phylink_link_state.
+ *
+ * Helper for MAC PCS supporting the 802.3 clause 22 register set for
+ * clause 37 negotiation and/or SGMII control.
+ *
+ * Read the MAC PCS state from the MII device configured in @config and
+ * parse the Clause 37 or Cisco SGMII link partner negotiation word into
+ * the phylink @state structure. This is suitable to be directly plugged
+ * into the mac_pcs_get_state() member of the struct phylink_mac_ops
+ * structure.
+ */
+void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs,
+ struct phylink_link_state *state)
+{
+ struct mii_bus *bus = pcs->bus;
+ int addr = pcs->addr;
+ int bmsr, lpa;
+
+ bmsr = mdiobus_read(bus, addr, MII_BMSR);
+ lpa = mdiobus_read(bus, addr, MII_LPA);
+ if (bmsr < 0 || lpa < 0) {
+ state->link = false;
+ return;
+ }
+
+ state->link = !!(bmsr & BMSR_LSTATUS);
+ state->an_complete = !!(bmsr & BMSR_ANEGCOMPLETE);
+ if (!state->link)
+ return;
+
+ switch (state->interface) {
+ case PHY_INTERFACE_MODE_1000BASEX:
+ phylink_decode_c37_word(state, lpa, SPEED_1000);
+ break;
+
+ case PHY_INTERFACE_MODE_2500BASEX:
+ phylink_decode_c37_word(state, lpa, SPEED_2500);
+ break;
+
+ case PHY_INTERFACE_MODE_SGMII:
+ phylink_decode_sgmii_word(state, lpa);
+ break;
+
+ default:
+ state->link = false;
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_get_state);
+
+/**
+ * phylink_mii_c22_pcs_set_advertisement() - configure the clause 37 PCS
+ * advertisement
+ * @pcs: a pointer to a &struct mdio_device.
+ * @state: a pointer to the state being configured.
+ *
+ * Helper for MAC PCS supporting the 802.3 clause 22 register set for
+ * clause 37 negotiation and/or SGMII control.
+ *
+ * Configure the clause 37 PCS advertisement as specified by @state. This
+ * does not trigger a renegotiation; phylink will do that via the
+ * mac_an_restart() method of the struct phylink_mac_ops structure.
+ *
+ * Returns negative error code on failure to configure the advertisement,
+ * zero if no change has been made, or one if the advertisement has changed.
+ */
+int phylink_mii_c22_pcs_set_advertisement(struct mdio_device *pcs,
+ const struct phylink_link_state *state)
+{
+ struct mii_bus *bus = pcs->bus;
+ int addr = pcs->addr;
+ int val, ret;
+ u16 adv;
+
+ switch (state->interface) {
+ case PHY_INTERFACE_MODE_1000BASEX:
+ case PHY_INTERFACE_MODE_2500BASEX:
+ adv = ADVERTISE_1000XFULL;
+ if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT,
+ state->advertising))
+ adv |= ADVERTISE_1000XPAUSE;
+ if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+ state->advertising))
+ adv |= ADVERTISE_1000XPSE_ASYM;
+
+ val = mdiobus_read(bus, addr, MII_ADVERTISE);
+ if (val < 0)
+ return val;
+
+ if (val == adv)
+ return 0;
+
+ ret = mdiobus_write(bus, addr, MII_ADVERTISE, adv);
+ if (ret < 0)
+ return ret;
+
+ return 1;
+
+ case PHY_INTERFACE_MODE_SGMII:
+ val = mdiobus_read(bus, addr, MII_ADVERTISE);
+ if (val < 0)
+ return val;
+
+ if (val == 0x0001)
+ return 0;
+
+ ret = mdiobus_write(bus, addr, MII_ADVERTISE, 0x0001);
+ if (ret < 0)
+ return ret;
+
+ return 1;
+
+ default:
+ /* Nothing to do for other modes */
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_set_advertisement);
+
+/**
+ * phylink_mii_c22_pcs_an_restart() - restart 802.3z autonegotiation
+ * @pcs: a pointer to a &struct mdio_device.
+ *
+ * Helper for MAC PCS supporting the 802.3 clause 22 register set for
+ * clause 37 negotiation.
+ *
+ * Restart the clause 37 negotiation with the link partner. This is
+ * suitable to be directly plugged into the mac_pcs_get_state() member
+ * of the struct phylink_mac_ops structure.
+ */
+void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs)
+{
+ struct mii_bus *bus = pcs->bus;
+ int val, addr = pcs->addr;
+
+ val = mdiobus_read(bus, addr, MII_BMCR);
+ if (val >= 0) {
+ val |= BMCR_ANRESTART;
+
+ mdiobus_write(bus, addr, MII_BMCR, val);
+ }
+}
+EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_an_restart);
+
+#define C45_ADDR(d,a) (MII_ADDR_C45 | (d) << 16 | (a))
+void phylink_mii_c45_pcs_get_state(struct mdio_device *pcs,
+ struct phylink_link_state *state)
+{
+ struct mii_bus *bus = pcs->bus;
+ int addr = pcs->addr;
+ int stat;
+
+ stat = mdiobus_read(bus, addr, C45_ADDR(MDIO_MMD_PCS, MDIO_STAT1));
+ if (stat < 0) {
+ state->link = false;
+ return;
+ }
+
+ state->link = !!(stat & MDIO_STAT1_LSTATUS);
+ if (!state->link)
+ return;
+
+ switch (state->interface) {
+ case PHY_INTERFACE_MODE_10GBASER:
+ state->speed = SPEED_10000;
+ state->duplex = DUPLEX_FULL;
+ break;
+
+ default:
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(phylink_mii_c45_pcs_get_state);
+
MODULE_LICENSE("GPL v2");
diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c
index f5fa2fff3ddc..2d99e9de6ee1 100644
--- a/drivers/net/phy/realtek.c
+++ b/drivers/net/phy/realtek.c
@@ -49,6 +49,8 @@
#define RTL_LPADV_5000FULL BIT(6)
#define RTL_LPADV_2500FULL BIT(5)
+#define RTLGEN_SPEED_MASK 0x0630
+
#define RTL_GENERIC_PHYID 0x001cc800
MODULE_DESCRIPTION("Realtek PHY driver");
@@ -309,6 +311,55 @@ static int rtl8366rb_config_init(struct phy_device *phydev)
return ret;
}
+/* get actual speed to cover the downshift case */
+static int rtlgen_get_speed(struct phy_device *phydev)
+{
+ int val;
+
+ if (!phydev->link)
+ return 0;
+
+ val = phy_read_paged(phydev, 0xa43, 0x12);
+ if (val < 0)
+ return val;
+
+ switch (val & RTLGEN_SPEED_MASK) {
+ case 0x0000:
+ phydev->speed = SPEED_10;
+ break;
+ case 0x0010:
+ phydev->speed = SPEED_100;
+ break;
+ case 0x0020:
+ phydev->speed = SPEED_1000;
+ break;
+ case 0x0200:
+ phydev->speed = SPEED_10000;
+ break;
+ case 0x0210:
+ phydev->speed = SPEED_2500;
+ break;
+ case 0x0220:
+ phydev->speed = SPEED_5000;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int rtlgen_read_status(struct phy_device *phydev)
+{
+ int ret;
+
+ ret = genphy_read_status(phydev);
+ if (ret < 0)
+ return ret;
+
+ return rtlgen_get_speed(phydev);
+}
+
static int rtlgen_read_mmd(struct phy_device *phydev, int devnum, u16 regnum)
{
int ret;
@@ -429,6 +480,8 @@ static int rtl8125_config_aneg(struct phy_device *phydev)
static int rtl8125_read_status(struct phy_device *phydev)
{
+ int ret;
+
if (phydev->autoneg == AUTONEG_ENABLE) {
int lpadv = phy_read_paged(phydev, 0xa5d, 0x13);
@@ -443,7 +496,11 @@ static int rtl8125_read_status(struct phy_device *phydev)
phydev->lp_advertising, lpadv & RTL_LPADV_2500FULL);
}
- return genphy_read_status(phydev);
+ ret = genphy_read_status(phydev);
+ if (ret < 0)
+ return ret;
+
+ return rtlgen_get_speed(phydev);
}
static bool rtlgen_supports_2_5gbps(struct phy_device *phydev)
@@ -550,6 +607,7 @@ static struct phy_driver realtek_drvs[] = {
}, {
.name = "Generic FE-GE Realtek PHY",
.match_phy_device = rtlgen_match_phy_device,
+ .read_status = rtlgen_read_status,
.suspend = genphy_suspend,
.resume = genphy_resume,
.read_page = rtl821x_read_page,
diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 962be94ed3ca..6eb431c194bd 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -847,11 +847,6 @@ struct qeth_trap_id {
/*some helper functions*/
#define QETH_CARD_IFNAME(card) (((card)->dev)? (card)->dev->name : "")
-static inline bool qeth_netdev_is_registered(struct net_device *dev)
-{
- return dev->netdev_ops != NULL;
-}
-
static inline u16 qeth_iqd_translate_txq(struct net_device *dev, u16 txq)
{
if (txq == QETH_IQD_MCAST_TXQ)
@@ -1053,6 +1048,7 @@ int qeth_configure_cq(struct qeth_card *, enum qeth_cq);
int qeth_hw_trap(struct qeth_card *, enum qeth_diags_trap_action);
void qeth_trace_features(struct qeth_card *);
int qeth_setassparms_cb(struct qeth_card *, struct qeth_reply *, unsigned long);
+int qeth_setup_netdev(struct qeth_card *card);
int qeth_set_features(struct net_device *, netdev_features_t);
void qeth_enable_hw_features(struct net_device *dev);
netdev_features_t qeth_fix_features(struct net_device *, netdev_features_t);
@@ -1060,6 +1056,7 @@ netdev_features_t qeth_features_check(struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features);
void qeth_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats);
+int qeth_set_real_num_tx_queues(struct qeth_card *card, unsigned int count);
u16 qeth_iqd_select_queue(struct net_device *dev, struct sk_buff *skb,
u8 cast_type, struct net_device *sb_dev);
int qeth_open(struct net_device *dev);
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 6caa78d51bd1..bd3adbb6ad50 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -244,7 +244,7 @@ static struct qeth_buffer_pool_entry *qeth_alloc_pool_entry(unsigned int pages)
return NULL;
for (i = 0; i < pages; i++) {
- entry->elements[i] = alloc_page(GFP_KERNEL);
+ entry->elements[i] = __dev_alloc_page(GFP_KERNEL);
if (!entry->elements[i]) {
qeth_free_pool_entry(entry);
@@ -538,9 +538,10 @@ static void qeth_qdio_handle_aob(struct qeth_card *card,
for (i = 0;
i < aob->sb_count && i < QETH_MAX_BUFFER_ELEMENTS(card);
i++) {
- if (aob->sba[i] && buffer->is_header[i])
- kmem_cache_free(qeth_core_header_cache,
- (void *) aob->sba[i]);
+ void *data = phys_to_virt(aob->sba[i]);
+
+ if (data && buffer->is_header[i])
+ kmem_cache_free(qeth_core_header_cache, data);
}
atomic_set(&buffer->state, QETH_QDIO_BUF_HANDLED_DELAYED);
@@ -1244,9 +1245,12 @@ EXPORT_SYMBOL_GPL(qeth_drain_output_queues);
static int qeth_osa_set_output_queues(struct qeth_card *card, bool single)
{
- unsigned int count = single ? 1 : card->dev->num_tx_queues;
+ unsigned int max = single ? 1 : card->dev->num_tx_queues;
+ unsigned int count;
int rc;
+ count = IS_VM_NIC(card) ? min(max, card->dev->real_num_tx_queues) : max;
+
rtnl_lock();
rc = netif_set_real_num_tx_queues(card->dev, count);
rtnl_unlock();
@@ -1254,16 +1258,16 @@ static int qeth_osa_set_output_queues(struct qeth_card *card, bool single)
if (rc)
return rc;
- if (card->qdio.no_out_queues == count)
+ if (card->qdio.no_out_queues == max)
return 0;
if (atomic_read(&card->qdio.state) != QETH_QDIO_UNINITIALIZED)
qeth_free_qdio_queues(card);
- if (count == 1)
+ if (max == 1 && card->qdio.do_prio_queueing != QETH_PRIOQ_DEFAULT)
dev_info(&card->gdev->dev, "Priority Queueing not supported\n");
- card->qdio.no_out_queues = count;
+ card->qdio.no_out_queues = max;
return 0;
}
@@ -2654,7 +2658,7 @@ static struct qeth_buffer_pool_entry *qeth_find_free_buffer_pool_entry(
struct qeth_buffer_pool_entry, list);
for (i = 0; i < QETH_MAX_BUFFER_ELEMENTS(card); ++i) {
if (page_count(entry->elements[i]) > 1) {
- struct page *page = alloc_page(GFP_ATOMIC);
+ struct page *page = dev_alloc_page();
if (!page)
return NULL;
@@ -3352,6 +3356,7 @@ static void qeth_flush_buffers(struct qeth_qdio_out_q *queue, int index,
for (i = index; i < index + count; ++i) {
unsigned int bidx = QDIO_BUFNR(i);
+ struct sk_buff *skb;
buf = queue->bufs[bidx];
buf->buffer->element[buf->next_element_to_fill - 1].eflags |=
@@ -3360,8 +3365,11 @@ static void qeth_flush_buffers(struct qeth_qdio_out_q *queue, int index,
if (queue->bufstates)
queue->bufstates[bidx].user = buf;
- if (IS_IQD(queue->card))
+ if (IS_IQD(card)) {
+ skb_queue_walk(&buf->skb_list, skb)
+ skb_tx_timestamp(skb);
continue;
+ }
if (!queue->do_pack) {
if ((atomic_read(&queue->used_buffers) >=
@@ -3705,6 +3713,7 @@ static int qeth_add_hw_header(struct qeth_qdio_out_q *queue,
unsigned int hdr_len, unsigned int proto_len,
unsigned int *elements)
{
+ gfp_t gfp = GFP_ATOMIC | (skb_pfmemalloc(skb) ? __GFP_MEMALLOC : 0);
const unsigned int contiguous = proto_len ? proto_len : 1;
const unsigned int max_elements = queue->max_elements;
unsigned int __elements;
@@ -3760,10 +3769,11 @@ check_layout:
*hdr = skb_push(skb, hdr_len);
return hdr_len;
}
- /* fall back */
+
+ /* Fall back to cache element with known-good alignment: */
if (hdr_len + proto_len > QETH_HDR_CACHE_OBJ_SIZE)
return -E2BIG;
- *hdr = kmem_cache_alloc(qeth_core_header_cache, GFP_ATOMIC);
+ *hdr = kmem_cache_alloc(qeth_core_header_cache, gfp);
if (!*hdr)
return -ENOMEM;
/* Copy protocol headers behind HW header: */
@@ -5985,22 +5995,8 @@ static struct net_device *qeth_alloc_netdev(struct qeth_card *card)
SET_NETDEV_DEV(dev, &card->gdev->dev);
netif_carrier_off(dev);
- if (IS_OSN(card)) {
- dev->ethtool_ops = &qeth_osn_ethtool_ops;
- } else {
- dev->ethtool_ops = &qeth_ethtool_ops;
- dev->priv_flags &= ~IFF_TX_SKB_SHARING;
- dev->hw_features |= NETIF_F_SG;
- dev->vlan_features |= NETIF_F_SG;
- if (IS_IQD(card)) {
- dev->features |= NETIF_F_SG;
- if (netif_set_real_num_tx_queues(dev,
- QETH_IQD_MIN_TXQ)) {
- free_netdev(dev);
- return NULL;
- }
- }
- }
+ dev->ethtool_ops = IS_OSN(card) ? &qeth_osn_ethtool_ops :
+ &qeth_ethtool_ops;
return dev;
}
@@ -6016,6 +6012,28 @@ struct net_device *qeth_clone_netdev(struct net_device *orig)
return clone;
}
+int qeth_setup_netdev(struct qeth_card *card)
+{
+ struct net_device *dev = card->dev;
+ unsigned int num_tx_queues;
+
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->hw_features |= NETIF_F_SG;
+ dev->vlan_features |= NETIF_F_SG;
+
+ if (IS_IQD(card)) {
+ dev->features |= NETIF_F_SG;
+ num_tx_queues = QETH_IQD_MIN_TXQ;
+ } else if (IS_VM_NIC(card)) {
+ num_tx_queues = 1;
+ } else {
+ num_tx_queues = dev->real_num_tx_queues;
+ }
+
+ return qeth_set_real_num_tx_queues(card, num_tx_queues);
+}
+EXPORT_SYMBOL_GPL(qeth_setup_netdev);
+
static int qeth_core_probe_device(struct ccwgroup_device *gdev)
{
struct qeth_card *card;
@@ -6055,12 +6073,13 @@ static int qeth_core_probe_device(struct ccwgroup_device *gdev)
goto err_card;
}
+ qeth_determine_capabilities(card);
+ qeth_set_blkt_defaults(card);
+
card->qdio.no_out_queues = card->dev->num_tx_queues;
rc = qeth_update_from_chp_desc(card);
if (rc)
goto err_chp_desc;
- qeth_determine_capabilities(card);
- qeth_set_blkt_defaults(card);
enforced_disc = qeth_enforce_discipline(card);
switch (enforced_disc) {
@@ -6245,9 +6264,6 @@ int qeth_do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
struct mii_ioctl_data *mii_data;
int rc = 0;
- if (!card)
- return -ENODEV;
-
switch (cmd) {
case SIOC_QETH_ADP_SET_SNMP_CONTROL:
rc = qeth_snmp_command(card, rq->ifr_ifru.ifru_data);
@@ -6627,12 +6643,59 @@ void qeth_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
}
EXPORT_SYMBOL_GPL(qeth_get_stats64);
+#define TC_IQD_UCAST 0
+static void qeth_iqd_set_prio_tc_map(struct net_device *dev,
+ unsigned int ucast_txqs)
+{
+ unsigned int prio;
+
+ /* IQD requires mcast traffic to be placed on a dedicated queue, and
+ * qeth_iqd_select_queue() deals with this.
+ * For unicast traffic, we defer the queue selection to the stack.
+ * By installing a trivial prio map that spans over only the unicast
+ * queues, we can encourage the stack to spread the ucast traffic evenly
+ * without selecting the mcast queue.
+ */
+
+ /* One traffic class, spanning over all active ucast queues: */
+ netdev_set_num_tc(dev, 1);
+ netdev_set_tc_queue(dev, TC_IQD_UCAST, ucast_txqs,
+ QETH_IQD_MIN_UCAST_TXQ);
+
+ /* Map all priorities to this traffic class: */
+ for (prio = 0; prio <= TC_BITMASK; prio++)
+ netdev_set_prio_tc_map(dev, prio, TC_IQD_UCAST);
+}
+
+int qeth_set_real_num_tx_queues(struct qeth_card *card, unsigned int count)
+{
+ struct net_device *dev = card->dev;
+ int rc;
+
+ /* Per netif_setup_tc(), adjust the mapping first: */
+ if (IS_IQD(card))
+ qeth_iqd_set_prio_tc_map(dev, count - 1);
+
+ rc = netif_set_real_num_tx_queues(dev, count);
+
+ if (rc && IS_IQD(card))
+ qeth_iqd_set_prio_tc_map(dev, dev->real_num_tx_queues - 1);
+
+ return rc;
+}
+
u16 qeth_iqd_select_queue(struct net_device *dev, struct sk_buff *skb,
u8 cast_type, struct net_device *sb_dev)
{
+ u16 txq;
+
if (cast_type != RTN_UNICAST)
return QETH_IQD_MCAST_TXQ;
- return QETH_IQD_MIN_UCAST_TXQ;
+ if (dev->real_num_tx_queues == QETH_IQD_MIN_TXQ)
+ return QETH_IQD_MIN_UCAST_TXQ;
+
+ txq = netdev_pick_tx(dev, skb, sb_dev);
+ return (txq == QETH_IQD_MCAST_TXQ) ? QETH_IQD_MIN_UCAST_TXQ : txq;
}
EXPORT_SYMBOL_GPL(qeth_iqd_select_queue);
diff --git a/drivers/s390/net/qeth_core_sys.c b/drivers/s390/net/qeth_core_sys.c
index 78cae61bc924..533a7f26dbe1 100644
--- a/drivers/s390/net/qeth_core_sys.c
+++ b/drivers/s390/net/qeth_core_sys.c
@@ -176,7 +176,7 @@ static ssize_t qeth_dev_prioqing_store(struct device *dev,
struct qeth_card *card = dev_get_drvdata(dev);
int rc = 0;
- if (IS_IQD(card))
+ if (IS_IQD(card) || IS_VM_NIC(card))
return -EOPNOTSUPP;
mutex_lock(&card->conf_mutex);
diff --git a/drivers/s390/net/qeth_ethtool.c b/drivers/s390/net/qeth_ethtool.c
index 9052c72d5b8f..31e019085fc3 100644
--- a/drivers/s390/net/qeth_ethtool.c
+++ b/drivers/s390/net/qeth_ethtool.c
@@ -153,7 +153,6 @@ static void qeth_get_drvinfo(struct net_device *dev,
strlcpy(info->driver, IS_LAYER2(card) ? "qeth_l2" : "qeth_l3",
sizeof(info->driver));
- strlcpy(info->version, "1.0", sizeof(info->version));
strlcpy(info->fw_version, card->info.mcl_level,
sizeof(info->fw_version));
snprintf(info->bus_info, sizeof(info->bus_info), "%s/%s/%s",
@@ -175,6 +174,46 @@ static void qeth_get_channels(struct net_device *dev,
channels->combined_count = 0;
}
+static int qeth_set_channels(struct net_device *dev,
+ struct ethtool_channels *channels)
+{
+ struct qeth_card *card = dev->ml_priv;
+
+ if (channels->rx_count == 0 || channels->tx_count == 0)
+ return -EINVAL;
+ if (channels->tx_count > card->qdio.no_out_queues)
+ return -EINVAL;
+
+ if (IS_IQD(card)) {
+ if (channels->tx_count < QETH_IQD_MIN_TXQ)
+ return -EINVAL;
+
+ /* Reject downgrade while running. It could push displaced
+ * ucast flows onto txq0, which is reserved for mcast.
+ */
+ if (netif_running(dev) &&
+ channels->tx_count < dev->real_num_tx_queues)
+ return -EPERM;
+ } else {
+ /* OSA still uses the legacy prio-queue mechanism: */
+ if (!IS_VM_NIC(card))
+ return -EOPNOTSUPP;
+ }
+
+ return qeth_set_real_num_tx_queues(card, channels->tx_count);
+}
+
+static int qeth_get_ts_info(struct net_device *dev,
+ struct ethtool_ts_info *info)
+{
+ struct qeth_card *card = dev->ml_priv;
+
+ if (!IS_IQD(card))
+ return -EOPNOTSUPP;
+
+ return ethtool_op_get_ts_info(dev, info);
+}
+
static int qeth_get_tunable(struct net_device *dev,
const struct ethtool_tunable *tuna, void *data)
{
@@ -410,6 +449,8 @@ const struct ethtool_ops qeth_ethtool_ops = {
.get_sset_count = qeth_get_sset_count,
.get_drvinfo = qeth_get_drvinfo,
.get_channels = qeth_get_channels,
+ .set_channels = qeth_set_channels,
+ .get_ts_info = qeth_get_ts_info,
.get_tunable = qeth_get_tunable,
.set_tunable = qeth_set_tunable,
.get_link_ksettings = qeth_get_link_ksettings,
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 4c8e93132e08..73cb363b1fab 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -499,6 +499,7 @@ static void qeth_l2_rx_mode_work(struct work_struct *work)
static int qeth_l2_xmit_osn(struct qeth_card *card, struct sk_buff *skb,
struct qeth_qdio_out_q *queue)
{
+ gfp_t gfp = GFP_ATOMIC | (skb_pfmemalloc(skb) ? __GFP_MEMALLOC : 0);
struct qeth_hdr *hdr = (struct qeth_hdr *)skb->data;
addr_t end = (addr_t)(skb->data + sizeof(*hdr));
addr_t start = (addr_t)skb->data;
@@ -511,7 +512,7 @@ static int qeth_l2_xmit_osn(struct qeth_card *card, struct sk_buff *skb,
if (qeth_get_elements_for_range(start, end) > 1) {
/* Misaligned HW header, move it to its own buffer element. */
- hdr = kmem_cache_alloc(qeth_core_header_cache, GFP_ATOMIC);
+ hdr = kmem_cache_alloc(qeth_core_header_cache, gfp);
if (!hdr)
return -ENOMEM;
hd_len = sizeof(*hdr);
@@ -570,7 +571,9 @@ static u16 qeth_l2_select_queue(struct net_device *dev, struct sk_buff *skb,
return qeth_iqd_select_queue(dev, skb,
qeth_get_ether_cast_type(skb),
sb_dev);
- return qeth_get_priority_queue(card, skb);
+
+ return IS_VM_NIC(card) ? netdev_pick_tx(dev, skb, sb_dev) :
+ qeth_get_priority_queue(card, skb);
}
static const struct device_type qeth_l2_devtype = {
@@ -610,7 +613,7 @@ static void qeth_l2_remove_device(struct ccwgroup_device *cgdev)
qeth_set_offline(card, false);
cancel_work_sync(&card->close_dev_work);
- if (qeth_netdev_is_registered(card->dev))
+ if (card->dev->reg_state == NETREG_REGISTERED)
unregister_netdev(card->dev);
}
@@ -648,7 +651,7 @@ static const struct net_device_ops qeth_osn_netdev_ops = {
.ndo_tx_timeout = qeth_tx_timeout,
};
-static int qeth_l2_setup_netdev(struct qeth_card *card, bool carrier_ok)
+static int qeth_l2_setup_netdev(struct qeth_card *card)
{
int rc;
@@ -658,6 +661,10 @@ static int qeth_l2_setup_netdev(struct qeth_card *card, bool carrier_ok)
goto add_napi;
}
+ rc = qeth_setup_netdev(card);
+ if (rc)
+ return rc;
+
card->dev->needed_headroom = sizeof(struct qeth_hdr);
card->dev->netdev_ops = &qeth_l2_netdev_ops;
card->dev->priv_flags |= IFF_UNICAST_FLT;
@@ -704,13 +711,7 @@ static int qeth_l2_setup_netdev(struct qeth_card *card, bool carrier_ok)
add_napi:
netif_napi_add(card->dev, &card->napi, qeth_poll, QETH_NAPI_WEIGHT);
- rc = register_netdev(card->dev);
- if (!rc && carrier_ok)
- netif_carrier_on(card->dev);
-
- if (rc)
- card->dev->netdev_ops = NULL;
- return rc;
+ return register_netdev(card->dev);
}
static void qeth_l2_trace_features(struct qeth_card *card)
@@ -783,10 +784,13 @@ static int qeth_l2_set_online(struct qeth_card *card)
qeth_set_allowed_threads(card, 0xffffffff, 0);
- if (!qeth_netdev_is_registered(dev)) {
- rc = qeth_l2_setup_netdev(card, carrier_ok);
+ if (dev->reg_state != NETREG_REGISTERED) {
+ rc = qeth_l2_setup_netdev(card);
if (rc)
goto out_remove;
+
+ if (carrier_ok)
+ netif_carrier_on(dev);
} else {
rtnl_lock();
if (carrier_ok)
@@ -1512,8 +1516,6 @@ int qeth_bridgeport_an_set(struct qeth_card *card, int enable)
struct ccw_device *ddev;
struct subchannel_id schid;
- if (!card)
- return -EINVAL;
if (!card->options.sbp.supported_funcs)
return -EOPNOTSUPP;
ddev = CARD_DDEV(card);
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 8a803d6c9357..83ae75cf1389 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -1880,7 +1880,8 @@ static u16 qeth_l3_osa_select_queue(struct net_device *dev, struct sk_buff *skb,
{
struct qeth_card *card = dev->ml_priv;
- return qeth_get_priority_queue(card, skb);
+ return IS_VM_NIC(card) ? netdev_pick_tx(dev, skb, sb_dev) :
+ qeth_get_priority_queue(card, skb);
}
static const struct net_device_ops qeth_l3_netdev_ops = {
@@ -1917,11 +1918,15 @@ static const struct net_device_ops qeth_l3_osa_netdev_ops = {
.ndo_neigh_setup = qeth_l3_neigh_setup,
};
-static int qeth_l3_setup_netdev(struct qeth_card *card, bool carrier_ok)
+static int qeth_l3_setup_netdev(struct qeth_card *card)
{
unsigned int headroom;
int rc;
+ rc = qeth_setup_netdev(card);
+ if (rc)
+ return rc;
+
if (IS_OSD(card) || IS_OSX(card)) {
if ((card->info.link_type == QETH_LINK_TYPE_LANE_TR) ||
(card->info.link_type == QETH_LINK_TYPE_HSTR)) {
@@ -1967,7 +1972,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card, bool carrier_ok)
rc = qeth_l3_iqd_read_initial_mac(card);
if (rc)
- goto out;
+ return rc;
} else
return -ENODEV;
@@ -1982,14 +1987,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card, bool carrier_ok)
PAGE_SIZE * (QETH_MAX_BUFFER_ELEMENTS(card) - 1));
netif_napi_add(card->dev, &card->napi, qeth_poll, QETH_NAPI_WEIGHT);
- rc = register_netdev(card->dev);
- if (!rc && carrier_ok)
- netif_carrier_on(card->dev);
-
-out:
- if (rc)
- card->dev->netdev_ops = NULL;
- return rc;
+ return register_netdev(card->dev);
}
static const struct device_type qeth_l3_devtype = {
@@ -2036,7 +2034,7 @@ static void qeth_l3_remove_device(struct ccwgroup_device *cgdev)
qeth_set_offline(card, false);
cancel_work_sync(&card->close_dev_work);
- if (qeth_netdev_is_registered(card->dev))
+ if (card->dev->reg_state == NETREG_REGISTERED)
unregister_netdev(card->dev);
flush_workqueue(card->cmd_wq);
@@ -2083,10 +2081,13 @@ static int qeth_l3_set_online(struct qeth_card *card)
qeth_set_allowed_threads(card, 0xffffffff, 0);
qeth_l3_recover_ip(card);
- if (!qeth_netdev_is_registered(dev)) {
- rc = qeth_l3_setup_netdev(card, carrier_ok);
+ if (dev->reg_state != NETREG_REGISTERED) {
+ rc = qeth_l3_setup_netdev(card);
if (rc)
goto out_remove;
+
+ if (carrier_ok)
+ netif_carrier_on(dev);
} else {
rtnl_lock();
if (carrier_ok)
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index be355f37337d..c1d379bf6ee1 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -458,6 +458,8 @@ struct ethtool_ops {
struct ethtool_stats *, u64 *);
};
+int ethtool_check_ops(const struct ethtool_ops *ops);
+
struct ethtool_rx_flow_rule {
struct flow_rule *rule;
unsigned long priv[0];
diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index a7604248777b..917e4bb2ed71 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -316,11 +316,15 @@ static inline void mii_10gbt_stat_mod_linkmode_lpa_t(unsigned long *advertising,
int __mdiobus_read(struct mii_bus *bus, int addr, u32 regnum);
int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val);
+int __mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum,
+ u16 mask, u16 set);
int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum);
int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum);
int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val);
int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val);
+int mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask,
+ u16 set);
int mdiobus_register_device(struct mdio_device *mdiodev);
int mdiobus_unregister_device(struct mdio_device *mdiodev);
diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 5448c8b443db..ab192720e2d6 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -98,7 +98,7 @@ struct ip_set_counter {
struct ip_set_comment_rcu {
struct rcu_head rcu;
- char str[0];
+ char str[];
};
struct ip_set_comment {
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 1b261c51b3a3..5da88451853b 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -264,7 +264,7 @@ struct xt_table_info {
unsigned int stacksize;
void ***jumpstack;
- unsigned char entries[0] __aligned(8);
+ unsigned char entries[] __aligned(8);
};
int xt_register_target(struct xt_target *target);
@@ -464,7 +464,7 @@ struct compat_xt_entry_match {
} kernel;
u_int16_t match_size;
} u;
- unsigned char data[0];
+ unsigned char data[];
};
struct compat_xt_entry_target {
@@ -480,7 +480,7 @@ struct compat_xt_entry_target {
} kernel;
u_int16_t target_size;
} u;
- unsigned char data[0];
+ unsigned char data[];
};
/* FIXME: this works only on 32 bit tasks
@@ -494,7 +494,7 @@ struct compat_xt_counters {
struct compat_xt_counters_info {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t num_counters;
- struct compat_xt_counters counters[0];
+ struct compat_xt_counters counters[];
};
struct _compat_xt_align {
diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index e98028f00e47..7d3537c40ec9 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -67,7 +67,7 @@ struct compat_arpt_entry {
__u16 next_offset;
compat_uint_t comefrom;
struct compat_xt_counters counters;
- unsigned char elems[0];
+ unsigned char elems[];
};
static inline struct xt_entry_target *
diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index 162f59d0d17a..2f5c4e6ecd8a 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -85,7 +85,7 @@ struct ebt_table_info {
/* room to maintain the stack used for jumping from and into udc */
struct ebt_chainstack **chainstack;
char *entries;
- struct ebt_counter counters[0] ____cacheline_aligned;
+ struct ebt_counter counters[] ____cacheline_aligned;
};
struct ebt_table {
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index e9e1ed74cdf1..b394bd4f68a3 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -76,7 +76,7 @@ struct compat_ipt_entry {
__u16 next_offset;
compat_uint_t comefrom;
struct compat_xt_counters counters;
- unsigned char elems[0];
+ unsigned char elems[];
};
/* Helper functions */
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index 78ab959c4575..8225f7821a29 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -43,7 +43,7 @@ struct compat_ip6t_entry {
__u16 next_offset;
compat_uint_t comefrom;
struct compat_xt_counters counters;
- unsigned char elems[0];
+ unsigned char elems[];
};
static inline struct xt_entry_target *
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 6b872aed8ba6..36d9dea04016 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -23,6 +23,7 @@
#include <linux/workqueue.h>
#include <linux/mod_devicetable.h>
#include <linux/u64_stats_sync.h>
+#include <linux/irqreturn.h>
#include <linux/atomic.h>
@@ -568,7 +569,7 @@ struct phy_driver {
int (*did_interrupt)(struct phy_device *phydev);
/* Override default interrupt handling */
- int (*handle_interrupt)(struct phy_device *phydev);
+ irqreturn_t (*handle_interrupt)(struct phy_device *phydev);
/* Clears up any memory if needed */
void (*remove)(struct phy_device *phydev);
@@ -754,6 +755,25 @@ static inline int __phy_write(struct phy_device *phydev, u32 regnum, u16 val)
}
/**
+ * __phy_modify_changed() - Convenience function for modifying a PHY register
+ * @phydev: a pointer to a &struct phy_device
+ * @regnum: register number
+ * @mask: bit mask of bits to clear
+ * @set: bit mask of bits to set
+ *
+ * Unlocked helper function which allows a PHY register to be modified as
+ * new register value = (old register value & ~mask) | set
+ *
+ * Returns negative errno, 0 if there was no change, and 1 in case of change
+ */
+static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum,
+ u16 mask, u16 set)
+{
+ return __mdiobus_modify_changed(phydev->mdio.bus, phydev->mdio.addr,
+ regnum, mask, set);
+}
+
+/**
* phy_read_mmd - Convenience function for reading a register
* from an MMD on a given PHY.
* @phydev: The phy_device struct
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 2180eb1aa254..8fa6df3b881b 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -317,4 +317,12 @@ int phylink_mii_ioctl(struct phylink *, struct ifreq *, int);
void phylink_set_port_modes(unsigned long *bits);
void phylink_helper_basex_speed(struct phylink_link_state *state);
+void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs,
+ struct phylink_link_state *state);
+int phylink_mii_c22_pcs_set_advertisement(struct mdio_device *pcs,
+ const struct phylink_link_state *state);
+void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs);
+
+void phylink_mii_c45_pcs_get_state(struct mdio_device *pcs,
+ struct phylink_link_state *state);
#endif
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index efd8d47f6997..1e30b0d44b61 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -163,19 +163,17 @@ enum flow_action_mangle_base {
};
enum flow_action_hw_stats_type_bit {
- FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE_BIT,
- FLOW_ACTION_HW_STATS_TYPE_DELAYED_BIT,
+ FLOW_ACTION_HW_STATS_IMMEDIATE_BIT,
+ FLOW_ACTION_HW_STATS_DELAYED_BIT,
};
enum flow_action_hw_stats_type {
- FLOW_ACTION_HW_STATS_TYPE_DISABLED = 0,
- FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE =
- BIT(FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE_BIT),
- FLOW_ACTION_HW_STATS_TYPE_DELAYED =
- BIT(FLOW_ACTION_HW_STATS_TYPE_DELAYED_BIT),
- FLOW_ACTION_HW_STATS_TYPE_ANY =
- FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE |
- FLOW_ACTION_HW_STATS_TYPE_DELAYED,
+ FLOW_ACTION_HW_STATS_DISABLED = 0,
+ FLOW_ACTION_HW_STATS_IMMEDIATE =
+ BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT),
+ FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT),
+ FLOW_ACTION_HW_STATS_ANY = FLOW_ACTION_HW_STATS_IMMEDIATE |
+ FLOW_ACTION_HW_STATS_DELAYED,
};
typedef void (*action_destr)(void *priv);
@@ -285,8 +283,8 @@ static inline bool flow_offload_has_one_action(const struct flow_action *action)
__act = &(__actions)->entries[++__i])
static inline bool
-flow_action_mixed_hw_stats_types_check(const struct flow_action *action,
- struct netlink_ext_ack *extack)
+flow_action_mixed_hw_stats_check(const struct flow_action *action,
+ struct netlink_ext_ack *extack)
{
const struct flow_action_entry *action_entry;
u8 uninitialized_var(last_hw_stats_type);
@@ -313,20 +311,20 @@ flow_action_first_entry_get(const struct flow_action *action)
}
static inline bool
-__flow_action_hw_stats_types_check(const struct flow_action *action,
- struct netlink_ext_ack *extack,
- bool check_allow_bit,
- enum flow_action_hw_stats_type_bit allow_bit)
+__flow_action_hw_stats_check(const struct flow_action *action,
+ struct netlink_ext_ack *extack,
+ bool check_allow_bit,
+ enum flow_action_hw_stats_type_bit allow_bit)
{
const struct flow_action_entry *action_entry;
if (!flow_action_has_entries(action))
return true;
- if (!flow_action_mixed_hw_stats_types_check(action, extack))
+ if (!flow_action_mixed_hw_stats_check(action, extack))
return false;
action_entry = flow_action_first_entry_get(action);
if (!check_allow_bit &&
- action_entry->hw_stats_type != FLOW_ACTION_HW_STATS_TYPE_ANY) {
+ action_entry->hw_stats_type != FLOW_ACTION_HW_STATS_ANY) {
NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\"");
return false;
} else if (check_allow_bit &&
@@ -338,19 +336,18 @@ __flow_action_hw_stats_types_check(const struct flow_action *action,
}
static inline bool
-flow_action_hw_stats_types_check(const struct flow_action *action,
- struct netlink_ext_ack *extack,
- enum flow_action_hw_stats_type_bit allow_bit)
+flow_action_hw_stats_check(const struct flow_action *action,
+ struct netlink_ext_ack *extack,
+ enum flow_action_hw_stats_type_bit allow_bit)
{
- return __flow_action_hw_stats_types_check(action, extack,
- true, allow_bit);
+ return __flow_action_hw_stats_check(action, extack, true, allow_bit);
}
static inline bool
-flow_action_basic_hw_stats_types_check(const struct flow_action *action,
- struct netlink_ext_ack *extack)
+flow_action_basic_hw_stats_check(const struct flow_action *action,
+ struct netlink_ext_ack *extack)
{
- return __flow_action_hw_stats_types_check(action, extack, false, 0);
+ return __flow_action_hw_stats_check(action, extack, false, 0);
}
struct flow_rule {
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 5ae5295aa46d..e1e588387103 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -45,7 +45,7 @@ enum nf_ct_ext_id {
struct nf_ct_ext {
u8 offset[NF_CT_EXT_NUM];
u8 len;
- char data[0];
+ char data[];
};
static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id)
diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h
index 6dd72396f534..659b0ea25b4d 100644
--- a/include/net/netfilter/nf_conntrack_timeout.h
+++ b/include/net/netfilter/nf_conntrack_timeout.h
@@ -14,7 +14,7 @@
struct nf_ct_timeout {
__u16 l3num;
const struct nf_conntrack_l4proto *l4proto;
- char data[0];
+ char data[];
};
struct ctnl_timeout {
diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 6890f1ca3e31..f523ea87b6ae 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -19,11 +19,17 @@ enum flow_offload_tuple_dir;
struct nf_flow_key {
struct flow_dissector_key_meta meta;
struct flow_dissector_key_control control;
+ struct flow_dissector_key_control enc_control;
struct flow_dissector_key_basic basic;
union {
struct flow_dissector_key_ipv4_addrs ipv4;
struct flow_dissector_key_ipv6_addrs ipv6;
};
+ struct flow_dissector_key_keyid enc_key_id;
+ union {
+ struct flow_dissector_key_ipv4_addrs enc_ipv4;
+ struct flow_dissector_key_ipv6_addrs enc_ipv6;
+ };
struct flow_dissector_key_tcp tcp;
struct flow_dissector_key_ports tp;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 4170c033d461..5d80e09f8148 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -224,7 +224,7 @@ int nft_validate_register_store(const struct nft_ctx *ctx,
*/
struct nft_userdata {
u8 len;
- unsigned char data[0];
+ unsigned char data[];
};
/**
@@ -385,21 +385,14 @@ struct nft_set_ops {
* struct nft_set_type - nf_tables set type
*
* @ops: set ops for this type
- * @list: used internally
- * @owner: module reference
* @features: features supported by the implementation
*/
struct nft_set_type {
const struct nft_set_ops ops;
- struct list_head list;
- struct module *owner;
u32 features;
};
#define to_set_type(o) container_of(o, struct nft_set_type, ops)
-int nft_register_set(struct nft_set_type *type);
-void nft_unregister_set(struct nft_set_type *type);
-
/**
* struct nft_set - nf_tables set instance
*
@@ -572,7 +565,7 @@ struct nft_set_ext_tmpl {
struct nft_set_ext {
u8 genmask;
u8 offset[NFT_SET_EXT_NUM];
- char data[0];
+ char data[];
};
static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl)
@@ -673,6 +666,10 @@ static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext)
return nft_set_ext(ext, NFT_SET_EXT_OBJREF);
}
+struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nlattr *attr);
+
void *nft_set_elem_init(const struct nft_set *set,
const struct nft_set_ext_tmpl *tmpl,
const u32 *key, const u32 *key_end, const u32 *data,
@@ -849,8 +846,6 @@ static inline void *nft_expr_priv(const struct nft_expr *expr)
return (void *)expr->data;
}
-struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
- const struct nlattr *nla);
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr);
int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
const struct nft_expr *expr);
@@ -895,6 +890,18 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule)
return (void *)&rule->data[rule->dlen];
}
+static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_expr *expr;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) {
+ expr = nft_set_ext_expr(ext);
+ expr->ops->eval(expr, regs, pkt);
+ }
+}
+
/*
* The last pointer isn't really necessary, but the compiler isn't able to
* determine that the result of nft_expr_last() is always the same since it
@@ -1253,9 +1260,6 @@ void nft_trace_notify(struct nft_traceinfo *info);
#define MODULE_ALIAS_NFT_EXPR(name) \
MODULE_ALIAS("nft-expr-" name)
-#define MODULE_ALIAS_NFT_SET() \
- MODULE_ALIAS("nft-set")
-
#define MODULE_ALIAS_NFT_OBJ(type) \
MODULE_ALIAS("nft-obj-" __stringify(type))
@@ -1385,7 +1389,7 @@ struct nft_trans {
int msg_type;
bool put_net;
struct nft_ctx ctx;
- char data[0];
+ char data[];
};
struct nft_trans_rule {
diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 29e7e1021267..78516de14d31 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -69,12 +69,13 @@ extern const struct nft_expr_ops nft_payload_fast_ops;
extern struct static_key_false nft_counters_enabled;
extern struct static_key_false nft_trace_enabled;
-extern struct nft_set_type nft_set_rhash_type;
-extern struct nft_set_type nft_set_hash_type;
-extern struct nft_set_type nft_set_hash_fast_type;
-extern struct nft_set_type nft_set_rbtree_type;
-extern struct nft_set_type nft_set_bitmap_type;
-extern struct nft_set_type nft_set_pipapo_type;
+extern const struct nft_set_type nft_set_rhash_type;
+extern const struct nft_set_type nft_set_hash_type;
+extern const struct nft_set_type nft_set_hash_fast_type;
+extern const struct nft_set_type nft_set_rbtree_type;
+extern const struct nft_set_type nft_set_bitmap_type;
+extern const struct nft_set_type nft_set_pipapo_type;
+extern const struct nft_set_type nft_set_pipapo_avx2_type;
struct nft_expr;
struct nft_regs;
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 20d2c6419612..9092e697059e 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -75,7 +75,15 @@ struct qdisc_watchdog {
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
clockid_t clockid);
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc);
-void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires);
+
+void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
+ u64 delta_ns);
+
+static inline void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd,
+ u64 expires)
+{
+ return qdisc_watchdog_schedule_range_ns(wd, expires, 0ULL);
+}
static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd,
psched_time_t expires)
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 42f7ca38ad80..54010b49c093 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -131,6 +131,7 @@ enum {
#define BRIDGE_VLAN_INFO_RANGE_END (1<<4) /* VLAN is end of vlan range */
#define BRIDGE_VLAN_INFO_BRENTRY (1<<5) /* Global bridge VLAN entry */
#define BRIDGE_VLAN_INFO_ONLY_OPTS (1<<6) /* Skip create/delete/flags */
+#define BRIDGE_VLAN_INFO_REMOVE_TUN (1<<7) /* Remove tunnel mapping */
struct bridge_vlan_info {
__u16 flags;
@@ -192,6 +193,7 @@ enum {
BRIDGE_VLANDB_ENTRY_INFO,
BRIDGE_VLANDB_ENTRY_RANGE,
BRIDGE_VLANDB_ENTRY_STATE,
+ BRIDGE_VLANDB_ENTRY_TUNNEL_ID,
__BRIDGE_VLANDB_ENTRY_MAX,
};
#define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1)
diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h
index 0b9c3beda345..90f9b4e1ba27 100644
--- a/include/uapi/linux/mii.h
+++ b/include/uapi/linux/mii.h
@@ -134,11 +134,16 @@
/* MAC and PHY tx_config_Reg[15:0] for SGMII in-band auto-negotiation.*/
#define ADVERTISE_SGMII 0x0001 /* MAC can do SGMII */
#define LPA_SGMII 0x0001 /* PHY can do SGMII */
+#define LPA_SGMII_SPD_MASK 0x0c00 /* SGMII speed mask */
+#define LPA_SGMII_FULL_DUPLEX 0x1000 /* SGMII full duplex */
#define LPA_SGMII_DPX_SPD_MASK 0x1C00 /* SGMII duplex and speed bits */
+#define LPA_SGMII_10 0x0000 /* 10Mbps */
#define LPA_SGMII_10HALF 0x0000 /* Can do 10mbps half-duplex */
#define LPA_SGMII_10FULL 0x1000 /* Can do 10mbps full-duplex */
+#define LPA_SGMII_100 0x0400 /* 100Mbps */
#define LPA_SGMII_100HALF 0x0400 /* Can do 100mbps half-duplex */
#define LPA_SGMII_100FULL 0x1400 /* Can do 100mbps full-duplex */
+#define LPA_SGMII_1000 0x0800 /* 1000Mbps */
#define LPA_SGMII_1000HALF 0x0800 /* Can do 1000mbps half-duplex */
#define LPA_SGMII_1000FULL 0x1800 /* Can do 1000mbps full-duplex */
#define LPA_SGMII_LINK 0x8000 /* PHY link with copper-side partner */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 065218a20bb7..9c3d2d04d6a1 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1770,6 +1770,7 @@ enum nft_tunnel_opts_attributes {
NFTA_TUNNEL_KEY_OPTS_UNSPEC,
NFTA_TUNNEL_KEY_OPTS_VXLAN,
NFTA_TUNNEL_KEY_OPTS_ERSPAN,
+ NFTA_TUNNEL_KEY_OPTS_GENEVE,
__NFTA_TUNNEL_KEY_OPTS_MAX
};
#define NFTA_TUNNEL_KEY_OPTS_MAX (__NFTA_TUNNEL_KEY_OPTS_MAX - 1)
@@ -1791,6 +1792,15 @@ enum nft_tunnel_opts_erspan_attributes {
};
#define NFTA_TUNNEL_KEY_ERSPAN_MAX (__NFTA_TUNNEL_KEY_ERSPAN_MAX - 1)
+enum nft_tunnel_opts_geneve_attributes {
+ NFTA_TUNNEL_KEY_GENEVE_UNSPEC,
+ NFTA_TUNNEL_KEY_GENEVE_CLASS,
+ NFTA_TUNNEL_KEY_GENEVE_TYPE,
+ NFTA_TUNNEL_KEY_GENEVE_DATA,
+ __NFTA_TUNNEL_KEY_GENEVE_MAX
+};
+#define NFTA_TUNNEL_KEY_GENEVE_MAX (__NFTA_TUNNEL_KEY_GENEVE_MAX - 1)
+
enum nft_tunnel_flags {
NFT_TUNNEL_F_ZERO_CSUM_TX = (1 << 0),
NFT_TUNNEL_F_DONT_FRAGMENT = (1 << 1),
diff --git a/include/uapi/linux/netfilter/xt_IDLETIMER.h b/include/uapi/linux/netfilter/xt_IDLETIMER.h
index 3c586a19baea..434e6506abaa 100644
--- a/include/uapi/linux/netfilter/xt_IDLETIMER.h
+++ b/include/uapi/linux/netfilter/xt_IDLETIMER.h
@@ -1,4 +1,3 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* linux/include/linux/netfilter/xt_IDLETIMER.h
*
@@ -33,6 +32,7 @@
#include <linux/types.h>
#define MAX_IDLETIMER_LABEL_SIZE 28
+#define XT_IDLETIMER_ALARM 0x01
struct idletimer_tg_info {
__u32 timeout;
@@ -43,4 +43,14 @@ struct idletimer_tg_info {
struct idletimer_tg *timer __attribute__((aligned(8)));
};
+struct idletimer_tg_info_v1 {
+ __u32 timeout;
+
+ char label[MAX_IDLETIMER_LABEL_SIZE];
+
+ __u8 timer_type;
+
+ /* for kernel module internal use only */
+ struct idletimer_tg *timer __attribute__((aligned(8)));
+};
#endif
diff --git a/include/uapi/linux/netfilter_bridge/ebt_among.h b/include/uapi/linux/netfilter_bridge/ebt_among.h
index 9acf757bc1f7..73b26a280c4f 100644
--- a/include/uapi/linux/netfilter_bridge/ebt_among.h
+++ b/include/uapi/linux/netfilter_bridge/ebt_among.h
@@ -40,7 +40,7 @@ struct ebt_mac_wormhash_tuple {
struct ebt_mac_wormhash {
int table[257];
int poolsize;
- struct ebt_mac_wormhash_tuple pool[0];
+ struct ebt_mac_wormhash_tuple pool[];
};
#define ebt_mac_wormhash_size(x) ((x) ? sizeof(struct ebt_mac_wormhash) \
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index ea39287d59c8..7307a29a103e 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -911,6 +911,8 @@ enum {
TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */
+ TCA_FQ_TIMER_SLACK, /* timer slack */
+
__TCA_FQ_MAX
};
diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
index afee292fb004..162998e2f039 100644
--- a/net/bridge/br_netlink_tunnel.c
+++ b/net/bridge/br_netlink_tunnel.c
@@ -26,8 +26,8 @@ static size_t __get_vlan_tinfo_size(void)
nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */
}
-static bool vlan_tunid_inrange(struct net_bridge_vlan *v_curr,
- struct net_bridge_vlan *v_last)
+bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *v_last)
{
__be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id);
__be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id);
@@ -193,8 +193,8 @@ static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX +
[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 },
};
-static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd,
- u16 vid, u32 tun_id, bool *changed)
+int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd,
+ u16 vid, u32 tun_id, bool *changed)
{
int err = 0;
@@ -250,8 +250,8 @@ int br_parse_vlan_tunnel_info(struct nlattr *attr,
return 0;
}
-int br_process_vlan_tunnel_info(struct net_bridge *br,
- struct net_bridge_port *p, int cmd,
+int br_process_vlan_tunnel_info(const struct net_bridge *br,
+ const struct net_bridge_port *p, int cmd,
struct vtunnel_info *tinfo_curr,
struct vtunnel_info *tinfo_last,
bool *changed)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 5153ffe79a01..1f97703a52ff 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1199,8 +1199,8 @@ static inline void br_vlan_notify(const struct net_bridge *br,
/* br_vlan_options.c */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-bool br_vlan_opts_eq(const struct net_bridge_vlan *v1,
- const struct net_bridge_vlan *v2);
+bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end);
bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v);
size_t br_vlan_opts_nl_size(void);
int br_vlan_process_options(const struct net_bridge *br,
diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h
index 2bdef2ea3420..c54cc26211d7 100644
--- a/net/bridge/br_private_tunnel.h
+++ b/net/bridge/br_private_tunnel.h
@@ -18,8 +18,8 @@ struct vtunnel_info {
/* br_netlink_tunnel.c */
int br_parse_vlan_tunnel_info(struct nlattr *attr,
struct vtunnel_info *tinfo);
-int br_process_vlan_tunnel_info(struct net_bridge *br,
- struct net_bridge_port *p,
+int br_process_vlan_tunnel_info(const struct net_bridge *br,
+ const struct net_bridge_port *p,
int cmd,
struct vtunnel_info *tinfo_curr,
struct vtunnel_info *tinfo_last,
@@ -32,8 +32,9 @@ int br_fill_vlan_tunnel_info(struct sk_buff *skb,
/* br_vlan_tunnel.c */
int vlan_tunnel_init(struct net_bridge_vlan_group *vg);
void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg);
-int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid);
-int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id);
+int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid);
+int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid,
+ u32 tun_id);
void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port);
void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
struct net_bridge_vlan *vlan);
@@ -42,19 +43,23 @@ int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
struct net_bridge_vlan_group *vg);
int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
struct net_bridge_vlan *vlan);
+bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *v_last);
+int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd,
+ u16 vid, u32 tun_id, bool *changed);
#else
static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
{
return 0;
}
-static inline int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port,
+static inline int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port,
u16 vid)
{
return 0;
}
-static inline int nbp_vlan_tunnel_info_add(struct net_bridge_port *port,
+static inline int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port,
u16 vid, u32 tun_id)
{
return 0;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 6b5deca08b89..24f524536be4 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1694,7 +1694,7 @@ bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
{
return v_curr->vid - range_end->vid == 1 &&
range_end->flags == v_curr->flags &&
- br_vlan_opts_eq(v_curr, range_end);
+ br_vlan_opts_eq_range(v_curr, range_end);
}
static int br_vlan_dump_dev(const struct net_device *dev,
@@ -1839,6 +1839,7 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] =
.len = sizeof(struct bridge_vlan_info) },
[BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 },
[BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_ENTRY_TUNNEL_ID] = { .type = NLA_U32 },
};
static int br_vlan_rtm_process_one(struct net_device *dev,
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index cd2eb194eb98..138e180cf4d8 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -4,25 +4,48 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
+#include <net/ip_tunnels.h>
#include "br_private.h"
+#include "br_private_tunnel.h"
-/* check if the options between two vlans are equal */
-bool br_vlan_opts_eq(const struct net_bridge_vlan *v1,
- const struct net_bridge_vlan *v2)
+static bool __vlan_tun_put(struct sk_buff *skb, const struct net_bridge_vlan *v)
{
- return v1->state == v2->state;
+ __be32 tid = tunnel_id_to_key32(v->tinfo.tunnel_id);
+
+ if (!v->tinfo.tunnel_dst)
+ return true;
+
+ return !nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_TUNNEL_ID,
+ be32_to_cpu(tid));
+}
+
+static bool __vlan_tun_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end)
+{
+ return (!v_curr->tinfo.tunnel_dst && !range_end->tinfo.tunnel_dst) ||
+ vlan_tunid_inrange(v_curr, range_end);
+}
+
+/* check if the options' state of v_curr allow it to enter the range */
+bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end)
+{
+ return v_curr->state == range_end->state &&
+ __vlan_tun_can_enter_range(v_curr, range_end);
}
bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v)
{
return !nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE,
- br_vlan_get_state(v));
+ br_vlan_get_state(v)) &&
+ __vlan_tun_put(skb, v);
}
size_t br_vlan_opts_nl_size(void)
{
- return nla_total_size(sizeof(u8)); /* BRIDGE_VLANDB_ENTRY_STATE */
+ return nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_STATE */
+ + nla_total_size(sizeof(u32)); /* BRIDGE_VLANDB_ENTRY_TUNNEL_ID */
}
static int br_vlan_modify_state(struct net_bridge_vlan_group *vg,
@@ -62,6 +85,40 @@ static int br_vlan_modify_state(struct net_bridge_vlan_group *vg,
return 0;
}
+static int br_vlan_modify_tunnel(const struct net_bridge_port *p,
+ struct net_bridge_vlan *v,
+ struct nlattr **tb,
+ bool *changed,
+ struct netlink_ext_ack *extack)
+{
+ struct bridge_vlan_info *vinfo;
+ int cmdmap;
+ u32 tun_id;
+
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't modify tunnel mapping of non-port vlans");
+ return -EINVAL;
+ }
+ if (!(p->flags & BR_VLAN_TUNNEL)) {
+ NL_SET_ERR_MSG_MOD(extack, "Port doesn't have tunnel flag set");
+ return -EINVAL;
+ }
+
+ /* vlan info attribute is guaranteed by br_vlan_rtm_process_one */
+ vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]);
+ cmdmap = vinfo->flags & BRIDGE_VLAN_INFO_REMOVE_TUN ? RTM_DELLINK :
+ RTM_SETLINK;
+ /* when working on vlan ranges this represents the starting tunnel id */
+ tun_id = nla_get_u32(tb[BRIDGE_VLANDB_ENTRY_TUNNEL_ID]);
+ /* tunnel ids are mapped to each vlan in increasing order,
+ * the starting vlan is in BRIDGE_VLANDB_ENTRY_INFO and v is the
+ * current vlan, so we compute: tun_id + v - vinfo->vid
+ */
+ tun_id += v->vid - vinfo->vid;
+
+ return br_vlan_tunnel_info(p, cmdmap, v->vid, tun_id, changed);
+}
+
static int br_vlan_process_one_opts(const struct net_bridge *br,
const struct net_bridge_port *p,
struct net_bridge_vlan_group *vg,
@@ -80,6 +137,11 @@ static int br_vlan_process_one_opts(const struct net_bridge *br,
if (err)
return err;
}
+ if (tb[BRIDGE_VLANDB_ENTRY_TUNNEL_ID]) {
+ err = br_vlan_modify_tunnel(p, v, tb, changed, extack);
+ if (err)
+ return err;
+ }
return 0;
}
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
index d13d2080f527..169e005fbda2 100644
--- a/net/bridge/br_vlan_tunnel.c
+++ b/net/bridge/br_vlan_tunnel.c
@@ -89,7 +89,8 @@ out:
/* Must be protected by RTNL.
* Must be called with vid in range from 1 to 4094 inclusive.
*/
-int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id)
+int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid,
+ u32 tun_id)
{
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *vlan;
@@ -107,7 +108,7 @@ int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id)
/* Must be protected by RTNL.
* Must be called with vid in range from 1 to 4094 inclusive.
*/
-int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid)
+int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid)
{
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *v;
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index e1256e03a9a8..78db58c7aec2 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1561,7 +1561,7 @@ struct compat_ebt_entry_mwt {
compat_uptr_t ptr;
} u;
compat_uint_t match_size;
- compat_uint_t data[0] __attribute__ ((aligned (__alignof__(struct compat_ebt_replace))));
+ compat_uint_t data[] __aligned(__alignof__(struct compat_ebt_replace));
};
/* account for possible padding between match_size and ->data */
diff --git a/net/core/dev.c b/net/core/dev.c
index d84541c24446..021e18251465 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9283,6 +9283,10 @@ int register_netdevice(struct net_device *dev)
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
+ ret = ethtool_check_ops(dev->ethtool_ops);
+ if (ret)
+ return ret;
+
spin_lock_init(&dev->addr_list_lock);
lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index c5beb3031a72..5f782fa3029f 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -861,8 +861,8 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev,
if (!flow_offload_has_one_action(&cls->rule->action))
return err;
- if (!flow_action_basic_hw_stats_types_check(&cls->rule->action,
- cls->common.extack))
+ if (!flow_action_basic_hw_stats_check(&cls->rule->action,
+ cls->common.extack))
return err;
act = &cls->rule->action.entries[0];
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 0b22741b2f8f..dab047eec943 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -289,3 +289,14 @@ out:
kfree(indir);
return ret;
}
+
+int ethtool_check_ops(const struct ethtool_ops *ops)
+{
+ if (WARN_ON(ops->set_coalesce && !ops->supported_coalesce_params))
+ return -EINVAL;
+ /* NOTE: sufficiently insane drivers may swap ethtool_ops at runtime,
+ * the fact that ops are checked at registration time does not
+ * mean the ops attached to a netdev later on are sane.
+ */
+ return 0;
+}
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 258840b19fb5..3852a58d7f95 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1519,9 +1519,6 @@ ethtool_set_coalesce_supported(struct net_device *dev,
u32 supported_params = dev->ethtool_ops->supported_coalesce_params;
u32 nonzero_params = 0;
- if (!supported_params)
- return true;
-
if (coalesce->rx_coalesce_usecs)
nonzero_params |= ETHTOOL_COALESCE_RX_USECS;
if (coalesce->rx_max_coalesced_frames)
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index f1f78a742b36..b167f4a5b684 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1057,7 +1057,7 @@ struct compat_arpt_replace {
u32 underflow[NF_ARP_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters;
- struct compat_arpt_entry entries[0];
+ struct compat_arpt_entry entries[];
};
static inline void compat_release_entry(struct compat_arpt_entry *e)
@@ -1383,7 +1383,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
struct compat_arpt_get_entries {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_arpt_entry entrytable[0];
+ struct compat_arpt_entry entrytable[];
};
static int compat_get_entries(struct net *net,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 10b91ebdf213..c2670eaa74e6 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1211,7 +1211,7 @@ struct compat_ipt_replace {
u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters; /* struct xt_counters * */
- struct compat_ipt_entry entries[0];
+ struct compat_ipt_entry entries[];
};
static int
@@ -1562,7 +1562,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
struct compat_ipt_get_entries {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_ipt_entry entrytable[0];
+ struct compat_ipt_entry entrytable[];
};
static int
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 645cc3009e64..f5f588b1f6e9 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -145,12 +145,13 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tcp_in_slow_start(tp))
- tcp_slow_start(tp, acked);
- else {
- bictcp_update(ca, tp->snd_cwnd);
- tcp_cong_avoid_ai(tp, ca->cnt, 1);
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
}
+ bictcp_update(ca, tp->snd_cwnd);
+ tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
/*
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 471571e1ab26..6cebf412d590 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -10,10 +10,9 @@
#include <net/tcp.h>
/* These factors derived from the recommended values in the aer:
- * .01 and and 7/8. We use 50 instead of 100 to account for
- * delayed ack.
+ * .01 and and 7/8.
*/
-#define TCP_SCALABLE_AI_CNT 50U
+#define TCP_SCALABLE_AI_CNT 100U
#define TCP_SCALABLE_MD_SCALE 3
static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
@@ -23,11 +22,13 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tcp_in_slow_start(tp))
- tcp_slow_start(tp, acked);
- else
- tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
- 1);
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+ tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
+ acked);
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 3b36bb1a0dda..50a9a6e2c4cd 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -153,31 +153,34 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
if (tcp_in_slow_start(tp)) {
- /* Slow start. */
- tcp_slow_start(tp, acked);
+ /* Slow start. */
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ goto done;
+ }
+
+ /* Congestion avoidance. */
+ if (veno->diff < beta) {
+ /* In the "non-congestive state", increase cwnd
+ * every rtt.
+ */
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
} else {
- /* Congestion avoidance. */
- if (veno->diff < beta) {
- /* In the "non-congestive state", increase cwnd
- * every rtt.
- */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
- } else {
- /* In the "congestive state", increase cwnd
- * every other rtt.
- */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (veno->inc &&
- tp->snd_cwnd < tp->snd_cwnd_clamp) {
- tp->snd_cwnd++;
- veno->inc = 0;
- } else
- veno->inc = 1;
- tp->snd_cwnd_cnt = 0;
+ /* In the "congestive state", increase cwnd
+ * every other rtt.
+ */
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ if (veno->inc &&
+ tp->snd_cwnd < tp->snd_cwnd_clamp) {
+ tp->snd_cwnd++;
+ veno->inc = 0;
} else
- tp->snd_cwnd_cnt++;
- }
+ veno->inc = 1;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt += acked;
}
+done:
if (tp->snd_cwnd < 2)
tp->snd_cwnd = 2;
else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index e00570dd0a69..3bb448761ca3 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -36,8 +36,6 @@ struct yeah {
u32 reno_count;
u32 fast_count;
-
- u32 pkts_acked;
};
static void tcp_yeah_init(struct sock *sk)
@@ -57,18 +55,6 @@ static void tcp_yeah_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void tcp_yeah_pkts_acked(struct sock *sk,
- const struct ack_sample *sample)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct yeah *yeah = inet_csk_ca(sk);
-
- if (icsk->icsk_ca_state == TCP_CA_Open)
- yeah->pkts_acked = sample->pkts_acked;
-
- tcp_vegas_pkts_acked(sk, sample);
-}
-
static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -77,24 +63,19 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tcp_in_slow_start(tp))
- tcp_slow_start(tp, acked);
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ goto do_vegas;
+ }
- else if (!yeah->doing_reno_now) {
+ if (!yeah->doing_reno_now) {
/* Scalable */
-
- tp->snd_cwnd_cnt += yeah->pkts_acked;
- if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
-
- yeah->pkts_acked = 1;
-
+ tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
+ acked);
} else {
/* Reno */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
}
/* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
@@ -118,7 +99,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* of bytes we send in an RTT is often less than our cwnd will allow.
* So we keep track of our cwnd separately, in v_beg_snd_cwnd.
*/
-
+do_vegas:
if (after(ack, yeah->vegas.beg_snd_nxt)) {
/* We do the Vegas calculations only if we got enough RTT
* samples that we can be reasonably sure that we got
@@ -232,7 +213,7 @@ static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
.get_info = tcp_vegas_get_info,
- .pkts_acked = tcp_yeah_pkts_acked,
+ .pkts_acked = tcp_vegas_pkts_acked,
.owner = THIS_MODULE,
.name = "yeah",
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index c973ace208c5..e27393498ecb 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1227,7 +1227,7 @@ struct compat_ip6t_replace {
u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters; /* struct xt_counters * */
- struct compat_ip6t_entry entries[0];
+ struct compat_ip6t_entry entries[];
};
static int
@@ -1571,7 +1571,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
struct compat_ip6t_get_entries {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_ip6t_entry entrytable[0];
+ struct compat_ip6t_entry entrytable[];
};
static int
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 04c3caed92df..e959104832ef 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -861,6 +861,9 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req)
ack_seq++;
msk->ack_seq = ack_seq;
}
+
+ /* will be fully established after successful MPC subflow creation */
+ inet_sk_state_store(nsk, TCP_SYN_RECV);
bh_unlock_sock(nsk);
/* keep a single reference */
@@ -916,10 +919,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
mptcp_copy_inaddrs(newsk, ssk);
list_add(&subflow->node, &msk->conn_list);
- /* will be fully established at mptcp_stream_accept()
- * completion.
- */
- inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV);
bh_unlock_sock(new_mptcp_sock);
local_bh_enable();
}
@@ -1256,8 +1255,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (!ssk->sk_socket)
mptcp_sock_graft(ssk, newsock);
}
-
- inet_sk_state_store(newsock->sk, TCP_ESTABLISHED);
}
sock_put(ssock->sk);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 8434c7f5f712..052d72a1d3a2 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -234,6 +234,8 @@ create_child:
/* new mpc subflow takes ownership of the newly
* created mptcp socket
*/
+ inet_sk_state_store((struct sock *)new_msk,
+ TCP_ESTABLISHED);
ctx->conn = new_msk;
new_msk = NULL;
}
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 91efae88e8c2..468fea1aebba 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -455,14 +455,6 @@ config NF_TABLES
To compile it as a module, choose M here.
if NF_TABLES
-
-config NF_TABLES_SET
- tristate "Netfilter nf_tables set infrastructure"
- help
- This option enables the nf_tables set infrastructure that allows to
- look up for elements in a set and to build one-way mappings between
- matchings and actions.
-
config NF_TABLES_INET
depends on IPV6
select NF_TABLES_IPV4
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 3f572e5a975e..292e71dc7ba4 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -78,14 +78,17 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \
- nft_chain_route.o nf_tables_offload.o
+ nft_chain_route.o nf_tables_offload.o \
+ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
+ nft_set_pipapo.o
-nf_tables_set-objs := nf_tables_set_core.o \
- nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
- nft_set_pipapo.o
+ifdef CONFIG_X86_64
+ifneq (,$(findstring -DCONFIG_AS_AVX2=1,$(KBUILD_CFLAGS)))
+nf_tables-objs += nft_set_pipapo_avx2.o
+endif
+endif
obj-$(CONFIG_NF_TABLES) += nf_tables.o
-obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o
obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 0a2196f59106..486959f70cf3 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -46,7 +46,7 @@ struct bitmap_ip {
u8 netmask; /* subnet netmask */
struct timer_list gc; /* garbage collection */
struct ip_set *set; /* attached to this ip_set */
- unsigned char extensions[0] /* data extensions */
+ unsigned char extensions[] /* data extensions */
__aligned(__alignof__(u64));
};
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 739e343efaf6..2310a316e0af 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -49,7 +49,7 @@ struct bitmap_ipmac {
size_t memsize; /* members size */
struct timer_list gc; /* garbage collector */
struct ip_set *set; /* attached to this ip_set */
- unsigned char extensions[0] /* MAC + data extensions */
+ unsigned char extensions[] /* MAC + data extensions */
__aligned(__alignof__(u64));
};
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index b49978dd810d..e56ced66f202 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -37,7 +37,7 @@ struct bitmap_port {
size_t memsize; /* members size */
struct timer_list gc; /* garbage collection */
struct ip_set *set; /* attached to this ip_set */
- unsigned char extensions[0] /* data extensions */
+ unsigned char extensions[] /* data extensions */
__aligned(__alignof__(u64));
};
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index e52d7b7597a0..1ee43752d6d3 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -76,7 +76,7 @@ struct hbucket {
DECLARE_BITMAP(used, AHASH_MAX_TUNED);
u8 size; /* size of the array */
u8 pos; /* position of the first free entry */
- unsigned char value[0] /* the array of the values */
+ unsigned char value[] /* the array of the values */
__aligned(__alignof__(u64));
};
@@ -109,7 +109,7 @@ struct htable {
u8 htable_bits; /* size of hash table == 2^htable_bits */
u32 maxelem; /* Maxelem per region */
struct ip_set_region *hregion; /* Region locks and ext sizes */
- struct hbucket __rcu *bucket[0]; /* hashtable buckets */
+ struct hbucket __rcu *bucket[]; /* hashtable buckets */
};
#define hbucket(h, i) ((h)->bucket[i])
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 1927fc296f95..a18f8fe728e3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2633,7 +2633,6 @@ void nf_conntrack_init_end(void)
*/
#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
#define DYING_NULLS_VAL ((1<<30)+1)
-#define TEMPLATE_NULLS_VAL ((1<<30)+2)
int nf_conntrack_init_net(struct net *net)
{
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 4912069627b6..9b57330c81f8 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -1054,21 +1054,18 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
nf_conntrack_standalone_init_dccp_sysctl(net, table);
nf_conntrack_standalone_init_gre_sysctl(net, table);
- /* Don't export sysctls to unprivileged users */
+ /* Don't allow unprivileged users to alter certain sysctls */
if (net->user_ns != &init_user_ns) {
- table[NF_SYSCTL_CT_MAX].procname = NULL;
- table[NF_SYSCTL_CT_ACCT].procname = NULL;
- table[NF_SYSCTL_CT_HELPER].procname = NULL;
-#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
- table[NF_SYSCTL_CT_TIMESTAMP].procname = NULL;
-#endif
+ table[NF_SYSCTL_CT_MAX].mode = 0444;
+ table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
+ table[NF_SYSCTL_CT_HELPER].mode = 0444;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- table[NF_SYSCTL_CT_EVENTS].procname = NULL;
+ table[NF_SYSCTL_CT_EVENTS].mode = 0444;
#endif
- }
-
- if (!net_eq(&init_net, net))
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
+ } else if (!net_eq(&init_net, net)) {
+ table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
+ }
net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
if (!net->ct.sysctl_header)
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index 42b73a084a63..ad549317af30 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -7,6 +7,7 @@
#include <linux/tc_act/tc_csum.h>
#include <net/flow_offload.h>
#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
@@ -27,11 +28,61 @@ struct flow_offload_work {
(__match)->dissector.offset[__type] = \
offsetof(struct nf_flow_key, __field)
+static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
+ struct ip_tunnel_info *tun_info)
+{
+ struct nf_flow_key *mask = &match->mask;
+ struct nf_flow_key *key = &match->key;
+ unsigned int enc_keys;
+
+ if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX))
+ return;
+
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
+ key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id);
+ mask->enc_key_id.keyid = 0xffffffff;
+ enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) |
+ BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL);
+
+ if (ip_tunnel_info_af(tun_info) == AF_INET) {
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
+ enc_ipv4);
+ key->enc_ipv4.src = tun_info->key.u.ipv4.dst;
+ key->enc_ipv4.dst = tun_info->key.u.ipv4.src;
+ if (key->enc_ipv4.src)
+ mask->enc_ipv4.src = 0xffffffff;
+ if (key->enc_ipv4.dst)
+ mask->enc_ipv4.dst = 0xffffffff;
+ enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS);
+ key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ } else {
+ memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst,
+ sizeof(struct in6_addr));
+ memcpy(&key->enc_ipv6.dst, &tun_info->key.u.ipv6.src,
+ sizeof(struct in6_addr));
+ if (memcmp(&key->enc_ipv6.src, &in6addr_any,
+ sizeof(struct in6_addr)))
+ memset(&key->enc_ipv6.src, 0xff,
+ sizeof(struct in6_addr));
+ if (memcmp(&key->enc_ipv6.dst, &in6addr_any,
+ sizeof(struct in6_addr)))
+ memset(&key->enc_ipv6.dst, 0xff,
+ sizeof(struct in6_addr));
+ enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS);
+ key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ }
+
+ match->dissector.used_keys |= enc_keys;
+}
+
static int nf_flow_rule_match(struct nf_flow_match *match,
- const struct flow_offload_tuple *tuple)
+ const struct flow_offload_tuple *tuple,
+ struct dst_entry *other_dst)
{
struct nf_flow_key *mask = &match->mask;
struct nf_flow_key *key = &match->key;
+ struct ip_tunnel_info *tun_info;
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta);
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control);
@@ -41,6 +92,11 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp);
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp);
+ if (other_dst->lwtstate) {
+ tun_info = lwt_tun_info(other_dst->lwtstate);
+ nf_flow_rule_lwt_match(match, tun_info);
+ }
+
key->meta.ingress_ifindex = tuple->iifidx;
mask->meta.ingress_ifindex = 0xffffffff;
@@ -419,10 +475,52 @@ static void flow_offload_redirect(const struct flow_offload *flow,
dev_hold(rt->dst.dev);
}
+static void flow_offload_encap_tunnel(const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry;
+ struct dst_entry *dst;
+
+ dst = flow->tuplehash[dir].tuple.dst_cache;
+ if (dst->lwtstate) {
+ struct ip_tunnel_info *tun_info;
+
+ tun_info = lwt_tun_info(dst->lwtstate);
+ if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) {
+ entry = flow_action_entry_next(flow_rule);
+ entry->id = FLOW_ACTION_TUNNEL_ENCAP;
+ entry->tunnel = tun_info;
+ }
+ }
+}
+
+static void flow_offload_decap_tunnel(const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry;
+ struct dst_entry *dst;
+
+ dst = flow->tuplehash[!dir].tuple.dst_cache;
+ if (dst->lwtstate) {
+ struct ip_tunnel_info *tun_info;
+
+ tun_info = lwt_tun_info(dst->lwtstate);
+ if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) {
+ entry = flow_action_entry_next(flow_rule);
+ entry->id = FLOW_ACTION_TUNNEL_DECAP;
+ }
+ }
+}
+
int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
+ flow_offload_decap_tunnel(flow, dir, flow_rule);
+ flow_offload_encap_tunnel(flow, dir, flow_rule);
+
if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
return -1;
@@ -449,6 +547,9 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
+ flow_offload_decap_tunnel(flow, dir, flow_rule);
+ flow_offload_encap_tunnel(flow, dir, flow_rule);
+
if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
return -1;
@@ -479,6 +580,7 @@ nf_flow_offload_rule_alloc(struct net *net,
const struct flow_offload *flow = offload->flow;
const struct flow_offload_tuple *tuple;
struct nf_flow_rule *flow_rule;
+ struct dst_entry *other_dst;
int err = -ENOMEM;
flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL);
@@ -494,7 +596,8 @@ nf_flow_offload_rule_alloc(struct net *net,
flow_rule->rule->match.key = &flow_rule->match.key;
tuple = &flow->tuplehash[dir].tuple;
- err = nf_flow_rule_match(&flow_rule->match, tuple);
+ other_dst = flow->tuplehash[!dir].tuple.dst_cache;
+ err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst);
if (err < 0)
goto err_flow_match;
@@ -574,6 +677,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
struct nf_flow_rule *flow_rule,
enum flow_offload_tuple_dir dir,
int priority, int cmd,
+ struct flow_stats *stats,
struct list_head *block_cb_list)
{
struct flow_cls_offload cls_flow = {};
@@ -598,6 +702,9 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
}
mutex_unlock(&flowtable->flow_block_lock);
+ if (cmd == FLOW_CLS_STATS)
+ memcpy(stats, &cls_flow.stats, sizeof(*stats));
+
return i;
}
@@ -607,7 +714,7 @@ static int flow_offload_tuple_add(struct flow_offload_work *offload,
{
return nf_flow_offload_tuple(offload->flowtable, offload->flow,
flow_rule, dir, offload->priority,
- FLOW_CLS_REPLACE,
+ FLOW_CLS_REPLACE, NULL,
&offload->flowtable->flow_block.cb_list);
}
@@ -615,7 +722,7 @@ static void flow_offload_tuple_del(struct flow_offload_work *offload,
enum flow_offload_tuple_dir dir)
{
nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
- offload->priority, FLOW_CLS_DESTROY,
+ offload->priority, FLOW_CLS_DESTROY, NULL,
&offload->flowtable->flow_block.cb_list);
}
@@ -661,21 +768,9 @@ static void flow_offload_tuple_stats(struct flow_offload_work *offload,
enum flow_offload_tuple_dir dir,
struct flow_stats *stats)
{
- struct nf_flowtable *flowtable = offload->flowtable;
- struct flow_cls_offload cls_flow = {};
- struct flow_block_cb *block_cb;
- struct netlink_ext_ack extack;
- __be16 proto = ETH_P_ALL;
-
- nf_flow_offload_init(&cls_flow, proto, offload->priority,
- FLOW_CLS_STATS,
- &offload->flow->tuplehash[dir].tuple, &extack);
-
- mutex_lock(&flowtable->flow_block_lock);
- list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list)
- block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv);
- mutex_unlock(&flowtable->flow_block_lock);
- memcpy(stats, &cls_flow.stats, sizeof(*stats));
+ nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
+ offload->priority, FLOW_CLS_STATS, stats,
+ &offload->flowtable->flow_block.cb_list);
}
static void flow_offload_work_stats(struct flow_offload_work *offload)
@@ -820,25 +915,47 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
return err;
}
-static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
- struct nf_flowtable *flowtable,
- struct net_device *dev,
- enum flow_block_command cmd,
- struct netlink_ext_ack *extack)
+static void nf_flow_table_block_offload_init(struct flow_block_offload *bo,
+ struct net *net,
+ enum flow_block_command cmd,
+ struct nf_flowtable *flowtable,
+ struct netlink_ext_ack *extack)
{
- int err;
-
- if (!dev->netdev_ops->ndo_setup_tc)
- return -EOPNOTSUPP;
-
memset(bo, 0, sizeof(*bo));
- bo->net = dev_net(dev);
+ bo->net = net;
bo->block = &flowtable->flow_block;
bo->command = cmd;
bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
bo->extack = extack;
INIT_LIST_HEAD(&bo->cb_list);
+}
+static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo,
+ struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd,
+ struct netlink_ext_ack *extack)
+{
+ nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable,
+ extack);
+ flow_indr_block_call(dev, bo, cmd);
+
+ if (list_empty(&bo->cb_list))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
+ struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable,
+ extack);
err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo);
if (err < 0)
return err;
@@ -857,7 +974,12 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
if (!nf_flowtable_hw_offload(flowtable))
return 0;
- err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack);
+ if (dev->netdev_ops->ndo_setup_tc)
+ err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd,
+ &extack);
+ else
+ err = nf_flow_table_indr_offload_cmd(&bo, flowtable, dev, cmd,
+ &extack);
if (err < 0)
return err;
@@ -865,10 +987,75 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
}
EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup);
+static void nf_flow_table_indr_block_ing_cmd(struct net_device *dev,
+ struct nf_flowtable *flowtable,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv,
+ enum flow_block_command cmd)
+{
+ struct netlink_ext_ack extack = {};
+ struct flow_block_offload bo;
+
+ if (!flowtable)
+ return;
+
+ nf_flow_table_block_offload_init(&bo, dev_net(dev), cmd, flowtable,
+ &extack);
+
+ cb(dev, cb_priv, TC_SETUP_FT, &bo);
+
+ nf_flow_table_block_setup(flowtable, &bo, cmd);
+}
+
+static void nf_flow_table_indr_block_cb_cmd(struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv,
+ enum flow_block_command cmd)
+{
+ if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD))
+ return;
+
+ nf_flow_table_indr_block_ing_cmd(dev, flowtable, cb, cb_priv, cmd);
+}
+
+static void nf_flow_table_indr_block_cb(struct net_device *dev,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv,
+ enum flow_block_command cmd)
+{
+ struct net *net = dev_net(dev);
+ struct nft_flowtable *nft_ft;
+ struct nft_table *table;
+ struct nft_hook *hook;
+
+ mutex_lock(&net->nft.commit_mutex);
+ list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(nft_ft, &table->flowtables, list) {
+ list_for_each_entry(hook, &nft_ft->hook_list, list) {
+ if (hook->ops.dev != dev)
+ continue;
+
+ nf_flow_table_indr_block_cb_cmd(&nft_ft->data,
+ dev, cb,
+ cb_priv, cmd);
+ }
+ }
+ }
+ mutex_unlock(&net->nft.commit_mutex);
+}
+
+static struct flow_indr_block_entry block_ing_entry = {
+ .cb = nf_flow_table_indr_block_cb,
+ .list = LIST_HEAD_INIT(block_ing_entry.list),
+};
+
int nf_flow_table_offload_init(void)
{
INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler);
+ flow_indr_add_block_cb(&block_ing_entry);
+
return 0;
}
@@ -877,6 +1064,8 @@ void nf_flow_table_offload_exit(void)
struct flow_offload_work *offload, *next;
LIST_HEAD(offload_pending_list);
+ flow_indr_del_block_cb(&block_ing_entry);
+
cancel_work_sync(&nf_flow_offload_work);
list_for_each_entry_safe(offload, next, &offload_pending_list, list) {
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 38c680f28f15..f92fb6003745 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2523,8 +2523,8 @@ static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
module_put(type->owner);
}
-struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
- const struct nlattr *nla)
+static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
+ const struct nlattr *nla)
{
struct nft_expr_info info;
struct nft_expr *expr;
@@ -3266,25 +3266,17 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
/*
* Sets
*/
-
-static LIST_HEAD(nf_tables_set_types);
-
-int nft_register_set(struct nft_set_type *type)
-{
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_add_tail_rcu(&type->list, &nf_tables_set_types);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- return 0;
-}
-EXPORT_SYMBOL_GPL(nft_register_set);
-
-void nft_unregister_set(struct nft_set_type *type)
-{
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_del_rcu(&type->list);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-}
-EXPORT_SYMBOL_GPL(nft_unregister_set);
+static const struct nft_set_type *nft_set_types[] = {
+ &nft_set_hash_fast_type,
+ &nft_set_hash_type,
+ &nft_set_rhash_type,
+ &nft_set_bitmap_type,
+ &nft_set_rbtree_type,
+#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2)
+ &nft_set_pipapo_avx2_type,
+#endif
+ &nft_set_pipapo_type,
+};
#define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \
NFT_SET_TIMEOUT | NFT_SET_OBJECT | \
@@ -3310,15 +3302,11 @@ nft_select_set_ops(const struct nft_ctx *ctx,
struct nft_set_estimate est, best;
const struct nft_set_type *type;
u32 flags = 0;
+ int i;
lockdep_assert_held(&ctx->net->nft.commit_mutex);
lockdep_nfnl_nft_mutex_not_held();
-#ifdef CONFIG_MODULES
- if (list_empty(&nf_tables_set_types)) {
- if (nft_request_module(ctx->net, "nft-set") == -EAGAIN)
- return ERR_PTR(-EAGAIN);
- }
-#endif
+
if (nla[NFTA_SET_FLAGS] != NULL)
flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
@@ -3327,7 +3315,8 @@ nft_select_set_ops(const struct nft_ctx *ctx,
best.lookup = ~0;
best.space = ~0;
- list_for_each_entry(type, &nf_tables_set_types, list) {
+ for (i = 0; i < ARRAY_SIZE(nft_set_types); i++) {
+ type = nft_set_types[i];
ops = &type->ops;
if (!nft_set_ops_candidate(type, flags))
@@ -3358,11 +3347,6 @@ nft_select_set_ops(const struct nft_ctx *ctx,
break;
}
- if (!try_module_get(type->owner))
- continue;
- if (bops != NULL)
- module_put(to_set_type(bops)->owner);
-
bops = ops;
best = est;
}
@@ -4061,10 +4045,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
size = ops->privsize(nla, &desc);
set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL);
- if (!set) {
- err = -ENOMEM;
- goto err1;
- }
+ if (!set)
+ return -ENOMEM;
name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL);
if (!name) {
@@ -4123,8 +4105,6 @@ err3:
kfree(set->name);
err2:
kvfree(set);
-err1:
- module_put(to_set_type(ops)->owner);
return err;
}
@@ -4134,7 +4114,6 @@ static void nft_set_destroy(struct nft_set *set)
return;
set->ops->destroy(set);
- module_put(to_set_type(set->ops)->owner);
kfree(set->name);
kvfree(set);
}
@@ -4312,7 +4291,6 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
.align = __alignof__(u32),
},
};
-EXPORT_SYMBOL_GPL(nft_set_ext_types);
/*
* Set elements
@@ -4801,6 +4779,36 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
return trans;
}
+struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nlattr *attr)
+{
+ struct nft_expr *expr;
+ int err;
+
+ expr = nft_expr_init(ctx, attr);
+ if (IS_ERR(expr))
+ return expr;
+
+ err = -EOPNOTSUPP;
+ if (!(expr->ops->type->flags & NFT_EXPR_STATEFUL))
+ goto err_set_elem_expr;
+
+ if (expr->ops->type->flags & NFT_EXPR_GC) {
+ if (set->flags & NFT_SET_TIMEOUT)
+ goto err_set_elem_expr;
+ if (!set->ops->gc_init)
+ goto err_set_elem_expr;
+ set->ops->gc_init(set);
+ }
+
+ return expr;
+
+err_set_elem_expr:
+ nft_expr_destroy(ctx, expr);
+ return ERR_PTR(err);
+}
+
void *nft_set_elem_init(const struct nft_set *set,
const struct nft_set_ext_tmpl *tmpl,
const u32 *key, const u32 *key_end,
@@ -4883,6 +4891,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_elem elem;
struct nft_set_binding *binding;
struct nft_object *obj = NULL;
+ struct nft_expr *expr = NULL;
struct nft_userdata *udata;
struct nft_data_desc desc;
struct nft_data data;
@@ -4950,10 +4959,17 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return err;
}
+ if (nla[NFTA_SET_ELEM_EXPR] != NULL) {
+ expr = nft_set_elem_expr_alloc(ctx, set,
+ nla[NFTA_SET_ELEM_EXPR]);
+ if (IS_ERR(expr))
+ return PTR_ERR(expr);
+ }
+
err = nft_setelem_parse_key(ctx, set, &elem.key.val,
nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
- return err;
+ goto err_set_elem_expr;
nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
@@ -4972,6 +4988,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
}
+ if (expr)
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPR,
+ expr->ops->size);
+
if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
if (!(set->flags & NFT_SET_OBJECT)) {
err = -EINVAL;
@@ -5056,6 +5076,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
*nft_set_ext_obj(ext) = obj;
obj->use++;
}
+ if (expr) {
+ memcpy(nft_set_ext_expr(ext), expr, expr->ops->size);
+ kfree(expr);
+ }
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
if (trans == NULL)
@@ -5111,6 +5135,9 @@ err_parse_key_end:
nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
err_parse_key:
nft_data_release(&elem.key.val, NFT_DATA_VALUE);
+err_set_elem_expr:
+ if (expr != NULL)
+ nft_expr_destroy(ctx, expr);
return err;
}
@@ -5365,7 +5392,6 @@ void nft_set_gc_batch_release(struct rcu_head *rcu)
nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true);
kfree(gcb);
}
-EXPORT_SYMBOL_GPL(nft_set_gc_batch_release);
struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
gfp_t gfp)
@@ -5378,7 +5404,6 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
gcb->head.set = set;
return gcb;
}
-EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
/*
* Stateful objects
diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c
deleted file mode 100644
index 586b621007eb..000000000000
--- a/net/netfilter/nf_tables_set_core.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/module.h>
-#include <net/netfilter/nf_tables_core.h>
-
-static int __init nf_tables_set_module_init(void)
-{
- nft_register_set(&nft_set_hash_fast_type);
- nft_register_set(&nft_set_hash_type);
- nft_register_set(&nft_set_rhash_type);
- nft_register_set(&nft_set_bitmap_type);
- nft_register_set(&nft_set_rbtree_type);
- nft_register_set(&nft_set_pipapo_type);
-
- return 0;
-}
-
-static void __exit nf_tables_set_module_exit(void)
-{
- nft_unregister_set(&nft_set_pipapo_type);
- nft_unregister_set(&nft_set_rbtree_type);
- nft_unregister_set(&nft_set_bitmap_type);
- nft_unregister_set(&nft_set_rhash_type);
- nft_unregister_set(&nft_set_hash_type);
- nft_unregister_set(&nft_set_hash_fast_type);
-}
-
-module_init(nf_tables_set_module_init);
-module_exit(nf_tables_set_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 2481470dec36..5827117f2635 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -33,7 +33,7 @@ struct nf_acct {
refcount_t refcnt;
char name[NFACCT_NAME_MAX];
struct rcu_head rcu_head;
- char data[0];
+ char data[];
};
struct nfacct_filter {
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 0ed2281f03be..bc37d6c59db4 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -93,7 +93,7 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
static int nft_bitwise_init_bool(struct nft_bitwise *priv,
const struct nlattr *const tb[])
{
- struct nft_data_desc d1, d2;
+ struct nft_data_desc mask, xor;
int err;
if (tb[NFTA_BITWISE_DATA])
@@ -103,29 +103,29 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv,
!tb[NFTA_BITWISE_XOR])
return -EINVAL;
- err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1,
+ err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &mask,
tb[NFTA_BITWISE_MASK]);
if (err < 0)
return err;
- if (d1.type != NFT_DATA_VALUE || d1.len != priv->len) {
+ if (mask.type != NFT_DATA_VALUE || mask.len != priv->len) {
err = -EINVAL;
goto err1;
}
- err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2,
+ err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &xor,
tb[NFTA_BITWISE_XOR]);
if (err < 0)
goto err1;
- if (d2.type != NFT_DATA_VALUE || d2.len != priv->len) {
+ if (xor.type != NFT_DATA_VALUE || xor.len != priv->len) {
err = -EINVAL;
goto err2;
}
return 0;
err2:
- nft_data_release(&priv->xor, d2.type);
+ nft_data_release(&priv->xor, xor.type);
err1:
- nft_data_release(&priv->mask, d1.type);
+ nft_data_release(&priv->mask, mask.type);
return err;
}
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 683785225a3e..46ab28ec4b53 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -81,7 +81,6 @@ void nft_dynset_eval(const struct nft_expr *expr,
const struct nft_dynset *priv = nft_expr_priv(expr);
struct nft_set *set = priv->set;
const struct nft_set_ext *ext;
- const struct nft_expr *sexpr;
u64 timeout;
if (priv->op == NFT_DYNSET_OP_DELETE) {
@@ -91,18 +90,13 @@ void nft_dynset_eval(const struct nft_expr *expr,
if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
expr, regs, &ext)) {
- sexpr = NULL;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- sexpr = nft_set_ext_expr(ext);
-
if (priv->op == NFT_DYNSET_OP_UPDATE &&
nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
timeout = priv->timeout ? : set->timeout;
*nft_set_ext_expiration(ext) = get_jiffies_64() + timeout;
}
- if (sexpr != NULL)
- sexpr->ops->eval(sexpr, regs, pkt);
+ nft_set_elem_update_expr(ext, regs, pkt);
if (priv->invert)
regs->verdict.code = NFT_BREAK;
@@ -206,21 +200,10 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (!(set->flags & NFT_SET_EVAL))
return -EINVAL;
- priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]);
+ priv->expr = nft_set_elem_expr_alloc(ctx, set,
+ tb[NFTA_DYNSET_EXPR]);
if (IS_ERR(priv->expr))
return PTR_ERR(priv->expr);
-
- err = -EOPNOTSUPP;
- if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL))
- goto err1;
-
- if (priv->expr->ops->type->flags & NFT_EXPR_GC) {
- if (set->flags & NFT_SET_TIMEOUT)
- goto err1;
- if (!set->ops->gc_init)
- goto err1;
- set->ops->gc_init(set);
- }
}
nft_set_ext_prepare(&priv->tmpl);
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 660bad688e2b..1e70359d633c 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -43,6 +43,7 @@ void nft_lookup_eval(const struct nft_expr *expr,
nft_data_copy(&regs->data[priv->dreg],
nft_set_ext_data(ext), set->dlen);
+ nft_set_elem_update_expr(ext, regs, pkt);
}
static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 87e8d9ba0c9b..1cb2e67e6e03 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -293,8 +293,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-struct nft_set_type nft_set_bitmap_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_bitmap_type = {
.ops = {
.privsize = nft_bitmap_privsize,
.elemsize = offsetof(struct nft_bitmap_elem, ext),
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index d350a7cd3af0..4d3f147e8d8d 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -662,8 +662,7 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features
return true;
}
-struct nft_set_type nft_set_rhash_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_rhash_type = {
.features = NFT_SET_MAP | NFT_SET_OBJECT |
NFT_SET_TIMEOUT | NFT_SET_EVAL,
.ops = {
@@ -686,8 +685,7 @@ struct nft_set_type nft_set_rhash_type __read_mostly = {
},
};
-struct nft_set_type nft_set_hash_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_hash_type = {
.features = NFT_SET_MAP | NFT_SET_OBJECT,
.ops = {
.privsize = nft_hash_privsize,
@@ -706,8 +704,7 @@ struct nft_set_type nft_set_hash_type __read_mostly = {
},
};
-struct nft_set_type nft_set_hash_fast_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_hash_fast_type = {
.features = NFT_SET_MAP | NFT_SET_OBJECT,
.ops = {
.privsize = nft_hash_privsize,
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 4fc0c924ed5d..c1afb6c94edc 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -330,144 +330,22 @@
#include <linux/kernel.h>
#include <linux/init.h>
-#include <linux/log2.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <uapi/linux/netfilter/nf_tables.h>
-#include <net/ipv6.h> /* For the maximum length of a field */
#include <linux/bitmap.h>
#include <linux/bitops.h>
-/* Count of concatenated fields depends on count of 32-bit nftables registers */
-#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT
-
-/* Largest supported field size */
-#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr))
-#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE)
-
-/* Number of bits to be grouped together in lookup table buckets, arbitrary */
-#define NFT_PIPAPO_GROUP_BITS 4
-#define NFT_PIPAPO_GROUPS_PER_BYTE (BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS)
-
-/* Fields are padded to 32 bits in input registers */
-#define NFT_PIPAPO_GROUPS_PADDED_SIZE(x) \
- (round_up((x) / NFT_PIPAPO_GROUPS_PER_BYTE, sizeof(u32)))
-#define NFT_PIPAPO_GROUPS_PADDING(x) \
- (NFT_PIPAPO_GROUPS_PADDED_SIZE((x)) - (x) / NFT_PIPAPO_GROUPS_PER_BYTE)
-
-/* Number of buckets, given by 2 ^ n, with n grouped bits */
-#define NFT_PIPAPO_BUCKETS (1 << NFT_PIPAPO_GROUP_BITS)
-
-/* Each n-bit range maps to up to n * 2 rules */
-#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2))
-
-/* Use the rest of mapping table buckets for rule indices, but it makes no sense
- * to exceed 32 bits
- */
-#if BITS_PER_LONG == 64
-#define NFT_PIPAPO_MAP_TOBITS 32
-#else
-#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS)
-#endif
-
-/* ...which gives us the highest allowed index for a rule */
-#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \
- - (1UL << NFT_PIPAPO_MAP_NBITS))
-
-#define nft_pipapo_for_each_field(field, index, match) \
- for ((field) = (match)->f, (index) = 0; \
- (index) < (match)->field_count; \
- (index)++, (field)++)
-
-/**
- * union nft_pipapo_map_bucket - Bucket of mapping table
- * @to: First rule number (in next field) this rule maps to
- * @n: Number of rules (in next field) this rule maps to
- * @e: If there's no next field, pointer to element this rule maps to
- */
-union nft_pipapo_map_bucket {
- struct {
-#if BITS_PER_LONG == 64
- static_assert(NFT_PIPAPO_MAP_TOBITS <= 32);
- u32 to;
-
- static_assert(NFT_PIPAPO_MAP_NBITS <= 32);
- u32 n;
-#else
- unsigned long to:NFT_PIPAPO_MAP_TOBITS;
- unsigned long n:NFT_PIPAPO_MAP_NBITS;
-#endif
- };
- struct nft_pipapo_elem *e;
-};
-
-/**
- * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
- * @groups: Amount of 4-bit groups
- * @rules: Number of inserted rules
- * @bsize: Size of each bucket in lookup table, in longs
- * @lt: Lookup table: 'groups' rows of NFT_PIPAPO_BUCKETS buckets
- * @mt: Mapping table: one bucket per rule
- */
-struct nft_pipapo_field {
- int groups;
- unsigned long rules;
- size_t bsize;
- unsigned long *lt;
- union nft_pipapo_map_bucket *mt;
-};
-
-/**
- * struct nft_pipapo_match - Data used for lookup and matching
- * @field_count Amount of fields in set
- * @scratch: Preallocated per-CPU maps for partial matching results
- * @bsize_max: Maximum lookup table bucket size of all fields, in longs
- * @rcu Matching data is swapped on commits
- * @f: Fields, with lookup and mapping tables
- */
-struct nft_pipapo_match {
- int field_count;
- unsigned long * __percpu *scratch;
- size_t bsize_max;
- struct rcu_head rcu;
- struct nft_pipapo_field f[0];
-};
+#include "nft_set_pipapo_avx2.h"
+#include "nft_set_pipapo.h"
/* Current working bitmap index, toggled between field matches */
static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index);
/**
- * struct nft_pipapo - Representation of a set
- * @match: Currently in-use matching data
- * @clone: Copy where pending insertions and deletions are kept
- * @groups: Total amount of 4-bit groups for fields in this set
- * @width: Total bytes to be matched for one packet, including padding
- * @dirty: Working copy has pending insertions or deletions
- * @last_gc: Timestamp of last garbage collection run, jiffies
- */
-struct nft_pipapo {
- struct nft_pipapo_match __rcu *match;
- struct nft_pipapo_match *clone;
- int groups;
- int width;
- bool dirty;
- unsigned long last_gc;
-};
-
-struct nft_pipapo_elem;
-
-/**
- * struct nft_pipapo_elem - API-facing representation of single set element
- * @ext: nftables API extensions
- */
-struct nft_pipapo_elem {
- struct nft_set_ext ext;
-};
-
-/**
* pipapo_refill() - For each set bit, set bits from selected mapping table item
* @map: Bitmap to be scanned for set bits
* @len: Length of bitmap in longs
@@ -484,9 +362,8 @@ struct nft_pipapo_elem {
*
* Return: -1 on no match, bit position on 'match_only', 0 otherwise.
*/
-static int pipapo_refill(unsigned long *map, int len, int rules,
- unsigned long *dst, union nft_pipapo_map_bucket *mt,
- bool match_only)
+int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
+ union nft_pipapo_map_bucket *mt, bool match_only)
{
unsigned long bitset;
int k, ret = -1;
@@ -559,26 +436,18 @@ static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1;
- unsigned long *lt = f->lt;
- int b, group;
+ int b;
- /* For each 4-bit group: select lookup table bucket depending on
+ /* For each bit group: select lookup table bucket depending on
* packet bytes value, then AND bucket value
*/
- for (group = 0; group < f->groups; group += 2) {
- u8 v;
-
- v = *rp >> 4;
- __bitmap_and(res_map, res_map, lt + v * f->bsize,
- f->bsize * BITS_PER_LONG);
- lt += f->bsize * NFT_PIPAPO_BUCKETS;
-
- v = *rp & 0x0f;
- rp++;
- __bitmap_and(res_map, res_map, lt + v * f->bsize,
- f->bsize * BITS_PER_LONG);
- lt += f->bsize * NFT_PIPAPO_BUCKETS;
- }
+ if (likely(f->bb == 8))
+ pipapo_and_field_buckets_8bit(f, res_map, rp);
+ else
+ pipapo_and_field_buckets_4bit(f, res_map, rp);
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+ rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
/* Now populate the bitmap for the next field, unless this is
* the last field, in which case return the matched 'ext'
@@ -621,7 +490,7 @@ next_match:
map_index = !map_index;
swap(res_map, fill_map);
- rp += NFT_PIPAPO_GROUPS_PADDING(f->groups);
+ rp += NFT_PIPAPO_GROUPS_PADDING(f);
}
out:
@@ -669,26 +538,19 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net,
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1;
- unsigned long *lt = f->lt;
- int b, group;
+ int b;
- /* For each 4-bit group: select lookup table bucket depending on
+ /* For each bit group: select lookup table bucket depending on
* packet bytes value, then AND bucket value
*/
- for (group = 0; group < f->groups; group++) {
- u8 v;
-
- if (group % 2) {
- v = *data & 0x0f;
- data++;
- } else {
- v = *data >> 4;
- }
- __bitmap_and(res_map, res_map, lt + v * f->bsize,
- f->bsize * BITS_PER_LONG);
+ if (f->bb == 8)
+ pipapo_and_field_buckets_8bit(f, res_map, data);
+ else if (f->bb == 4)
+ pipapo_and_field_buckets_4bit(f, res_map, data);
+ else
+ BUG();
- lt += f->bsize * NFT_PIPAPO_BUCKETS;
- }
+ data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
/* Now populate the bitmap for the next field, unless this is
* the last field, in which case return the matched 'ext'
@@ -713,7 +575,7 @@ next_match:
goto out;
}
- data += NFT_PIPAPO_GROUPS_PADDING(f->groups);
+ data += NFT_PIPAPO_GROUPS_PADDING(f);
/* Swap bitmap indices: fill_map will be the initial bitmap for
* the next field (i.e. the new res_map), and res_map is
@@ -736,8 +598,8 @@ out:
* @elem: nftables API element representation containing key data
* @flags: Unused
*/
-void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
return pipapo_get(net, set, (const u8 *)elem->key.val.data,
nft_genmask_cur(net));
@@ -763,6 +625,10 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
int group, bucket;
new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG);
+#ifdef NFT_PIPAPO_ALIGN
+ new_bucket_size = roundup(new_bucket_size,
+ NFT_PIPAPO_ALIGN / sizeof(*new_lt));
+#endif
if (new_bucket_size == f->bsize)
goto mt;
@@ -772,15 +638,18 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
else
copy = new_bucket_size;
- new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS * new_bucket_size *
- sizeof(*new_lt), GFP_KERNEL);
+ new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) *
+ new_bucket_size * sizeof(*new_lt) +
+ NFT_PIPAPO_ALIGN_HEADROOM,
+ GFP_KERNEL);
if (!new_lt)
return -ENOMEM;
- new_p = new_lt;
- old_p = old_lt;
+ new_p = NFT_PIPAPO_LT_ALIGN(new_lt);
+ old_p = NFT_PIPAPO_LT_ALIGN(old_lt);
+
for (group = 0; group < f->groups; group++) {
- for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS; bucket++) {
+ for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS(f->bb); bucket++) {
memcpy(new_p, old_p, copy * sizeof(*new_p));
new_p += copy;
old_p += copy;
@@ -807,7 +676,7 @@ mt:
if (new_lt) {
f->bsize = new_bucket_size;
- f->lt = new_lt;
+ NFT_PIPAPO_LT_ASSIGN(f, new_lt);
kvfree(old_lt);
}
@@ -829,13 +698,196 @@ static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group,
{
unsigned long *pos;
- pos = f->lt + f->bsize * NFT_PIPAPO_BUCKETS * group;
+ pos = NFT_PIPAPO_LT_ALIGN(f->lt);
+ pos += f->bsize * NFT_PIPAPO_BUCKETS(f->bb) * group;
pos += f->bsize * v;
__set_bit(rule, pos);
}
/**
+ * pipapo_lt_4b_to_8b() - Switch lookup table group width from 4 bits to 8 bits
+ * @old_groups: Number of current groups
+ * @bsize: Size of one bucket, in longs
+ * @old_lt: Pointer to the current lookup table
+ * @new_lt: Pointer to the new, pre-allocated lookup table
+ *
+ * Each bucket with index b in the new lookup table, belonging to group g, is
+ * filled with the bit intersection between:
+ * - bucket with index given by the upper 4 bits of b, from group g, and
+ * - bucket with index given by the lower 4 bits of b, from group g + 1
+ *
+ * That is, given buckets from the new lookup table N(x, y) and the old lookup
+ * table O(x, y), with x bucket index, and y group index:
+ *
+ * N(b, g) := O(b / 16, g) & O(b % 16, g + 1)
+ *
+ * This ensures equivalence of the matching results on lookup. Two examples in
+ * pictures:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ... 254 255
+ * 0 ^
+ * 1 | ^
+ * ... ( & ) |
+ * / \ |
+ * / \ .-( & )-.
+ * / bucket \ | |
+ * group 0 / 1 2 3 \ 4 5 6 7 8 9 10 11 12 13 |14 15 |
+ * 0 / \ | |
+ * 1 \ | |
+ * 2 | --'
+ * 3 '-
+ * ...
+ */
+static void pipapo_lt_4b_to_8b(int old_groups, int bsize,
+ unsigned long *old_lt, unsigned long *new_lt)
+{
+ int g, b, i;
+
+ for (g = 0; g < old_groups / 2; g++) {
+ int src_g0 = g * 2, src_g1 = g * 2 + 1;
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(8); b++) {
+ int src_b0 = b / NFT_PIPAPO_BUCKETS(4);
+ int src_b1 = b % NFT_PIPAPO_BUCKETS(4);
+ int src_i0 = src_g0 * NFT_PIPAPO_BUCKETS(4) + src_b0;
+ int src_i1 = src_g1 * NFT_PIPAPO_BUCKETS(4) + src_b1;
+
+ for (i = 0; i < bsize; i++) {
+ *new_lt = old_lt[src_i0 * bsize + i] &
+ old_lt[src_i1 * bsize + i];
+ new_lt++;
+ }
+ }
+ }
+}
+
+/**
+ * pipapo_lt_8b_to_4b() - Switch lookup table group width from 8 bits to 4 bits
+ * @old_groups: Number of current groups
+ * @bsize: Size of one bucket, in longs
+ * @old_lt: Pointer to the current lookup table
+ * @new_lt: Pointer to the new, pre-allocated lookup table
+ *
+ * Each bucket with index b in the new lookup table, belonging to group g, is
+ * filled with the bit union of:
+ * - all the buckets with index such that the upper four bits of the lower byte
+ * equal b, from group g, with g odd
+ * - all the buckets with index such that the lower four bits equal b, from
+ * group g, with g even
+ *
+ * That is, given buckets from the new lookup table N(x, y) and the old lookup
+ * table O(x, y), with x bucket index, and y group index:
+ *
+ * - with g odd: N(b, g) := U(O(x, g) for each x : x = (b & 0xf0) >> 4)
+ * - with g even: N(b, g) := U(O(x, g) for each x : x = b & 0x0f)
+ *
+ * where U() denotes the arbitrary union operation (binary OR of n terms). This
+ * ensures equivalence of the matching results on lookup.
+ */
+static void pipapo_lt_8b_to_4b(int old_groups, int bsize,
+ unsigned long *old_lt, unsigned long *new_lt)
+{
+ int g, b, bsrc, i;
+
+ memset(new_lt, 0, old_groups * 2 * NFT_PIPAPO_BUCKETS(4) * bsize *
+ sizeof(unsigned long));
+
+ for (g = 0; g < old_groups * 2; g += 2) {
+ int src_g = g / 2;
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) {
+ for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g;
+ bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1);
+ bsrc++) {
+ if (((bsrc & 0xf0) >> 4) != b)
+ continue;
+
+ for (i = 0; i < bsize; i++)
+ new_lt[i] |= old_lt[bsrc * bsize + i];
+ }
+
+ new_lt += bsize;
+ }
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) {
+ for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g;
+ bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1);
+ bsrc++) {
+ if ((bsrc & 0x0f) != b)
+ continue;
+
+ for (i = 0; i < bsize; i++)
+ new_lt[i] |= old_lt[bsrc * bsize + i];
+ }
+
+ new_lt += bsize;
+ }
+ }
+}
+
+/**
+ * pipapo_lt_bits_adjust() - Adjust group size for lookup table if needed
+ * @f: Field containing lookup table
+ */
+static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
+{
+ unsigned long *new_lt;
+ int groups, bb;
+ size_t lt_size;
+
+ lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize *
+ sizeof(*f->lt);
+
+ if (f->bb == NFT_PIPAPO_GROUP_BITS_SMALL_SET &&
+ lt_size > NFT_PIPAPO_LT_SIZE_HIGH) {
+ groups = f->groups * 2;
+ bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET;
+
+ lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize *
+ sizeof(*f->lt);
+ } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET &&
+ lt_size < NFT_PIPAPO_LT_SIZE_LOW) {
+ groups = f->groups / 2;
+ bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET;
+
+ lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize *
+ sizeof(*f->lt);
+
+ /* Don't increase group width if the resulting lookup table size
+ * would exceed the upper size threshold for a "small" set.
+ */
+ if (lt_size > NFT_PIPAPO_LT_SIZE_HIGH)
+ return;
+ } else {
+ return;
+ }
+
+ new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL);
+ if (!new_lt)
+ return;
+
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+ if (f->bb == 4 && bb == 8) {
+ pipapo_lt_4b_to_8b(f->groups, f->bsize,
+ NFT_PIPAPO_LT_ALIGN(f->lt),
+ NFT_PIPAPO_LT_ALIGN(new_lt));
+ } else if (f->bb == 8 && bb == 4) {
+ pipapo_lt_8b_to_4b(f->groups, f->bsize,
+ NFT_PIPAPO_LT_ALIGN(f->lt),
+ NFT_PIPAPO_LT_ALIGN(new_lt));
+ } else {
+ BUG();
+ }
+
+ f->groups = groups;
+ f->bb = bb;
+ kvfree(f->lt);
+ NFT_PIPAPO_LT_ASSIGN(f, new_lt);
+}
+
+/**
* pipapo_insert() - Insert new rule in field given input key and mask length
* @f: Field containing lookup table
* @k: Input key for classification, without nftables padding
@@ -849,7 +901,7 @@ static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group,
static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
int mask_bits)
{
- int rule = f->rules++, group, ret;
+ int rule = f->rules++, group, ret, bit_offset = 0;
ret = pipapo_resize(f, f->rules - 1, f->rules);
if (ret)
@@ -859,28 +911,33 @@ static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
int i, v;
u8 mask;
- if (group % 2)
- v = k[group / 2] & 0x0f;
- else
- v = k[group / 2] >> 4;
+ v = k[group / (BITS_PER_BYTE / f->bb)];
+ v &= GENMASK(BITS_PER_BYTE - bit_offset - 1, 0);
+ v >>= (BITS_PER_BYTE - bit_offset) - f->bb;
+
+ bit_offset += f->bb;
+ bit_offset %= BITS_PER_BYTE;
- if (mask_bits >= (group + 1) * 4) {
+ if (mask_bits >= (group + 1) * f->bb) {
/* Not masked */
pipapo_bucket_set(f, rule, group, v);
- } else if (mask_bits <= group * 4) {
+ } else if (mask_bits <= group * f->bb) {
/* Completely masked */
- for (i = 0; i < NFT_PIPAPO_BUCKETS; i++)
+ for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++)
pipapo_bucket_set(f, rule, group, i);
} else {
/* The mask limit falls on this group */
- mask = 0x0f >> (mask_bits - group * 4);
- for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) {
+ mask = GENMASK(f->bb - 1, 0);
+ mask >>= mask_bits - group * f->bb;
+ for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) {
if ((i & ~mask) == (v & ~mask))
pipapo_bucket_set(f, rule, group, i);
}
}
}
+ pipapo_lt_bits_adjust(f);
+
return 1;
}
@@ -1053,8 +1110,12 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
for_each_possible_cpu(i) {
unsigned long *scratch;
+#ifdef NFT_PIPAPO_ALIGN
+ unsigned long *scratch_aligned;
+#endif
- scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2,
+ scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2 +
+ NFT_PIPAPO_ALIGN_HEADROOM,
GFP_KERNEL, cpu_to_node(i));
if (!scratch) {
/* On failure, there's no need to undo previous
@@ -1070,6 +1131,11 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
kfree(*per_cpu_ptr(clone->scratch, i));
*per_cpu_ptr(clone->scratch, i) = scratch;
+
+#ifdef NFT_PIPAPO_ALIGN
+ scratch_aligned = NFT_PIPAPO_LT_ALIGN(scratch);
+ *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned;
+#endif
}
return 0;
@@ -1123,11 +1189,11 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
return -ENOSPC;
if (memcmp(start_p, end_p,
- f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) > 0)
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) > 0)
return -EINVAL;
- start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
- end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
/* Insert */
@@ -1141,22 +1207,19 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
rulemap[i].to = f->rules;
ret = memcmp(start, end,
- f->groups / NFT_PIPAPO_GROUPS_PER_BYTE);
- if (!ret) {
- ret = pipapo_insert(f, start,
- f->groups * NFT_PIPAPO_GROUP_BITS);
- } else {
- ret = pipapo_expand(f, start, end,
- f->groups * NFT_PIPAPO_GROUP_BITS);
- }
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f));
+ if (!ret)
+ ret = pipapo_insert(f, start, f->groups * f->bb);
+ else
+ ret = pipapo_expand(f, start, end, f->groups * f->bb);
if (f->bsize > bsize_max)
bsize_max = f->bsize;
rulemap[i].n = ret;
- start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
- end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) {
@@ -1200,23 +1263,35 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
if (!new->scratch)
goto out_scratch;
+#ifdef NFT_PIPAPO_ALIGN
+ new->scratch_aligned = alloc_percpu(*new->scratch_aligned);
+ if (!new->scratch_aligned)
+ goto out_scratch;
+#endif
+
rcu_head_init(&new->rcu);
src = old->f;
dst = new->f;
for (i = 0; i < old->field_count; i++) {
+ unsigned long *new_lt;
+
memcpy(dst, src, offsetof(struct nft_pipapo_field, lt));
- dst->lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS *
- src->bsize * sizeof(*dst->lt),
- GFP_KERNEL);
- if (!dst->lt)
+ new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) *
+ src->bsize * sizeof(*dst->lt) +
+ NFT_PIPAPO_ALIGN_HEADROOM,
+ GFP_KERNEL);
+ if (!new_lt)
goto out_lt;
- memcpy(dst->lt, src->lt,
+ NFT_PIPAPO_LT_ASSIGN(dst, new_lt);
+
+ memcpy(NFT_PIPAPO_LT_ALIGN(new_lt),
+ NFT_PIPAPO_LT_ALIGN(src->lt),
src->bsize * sizeof(*dst->lt) *
- src->groups * NFT_PIPAPO_BUCKETS);
+ src->groups * NFT_PIPAPO_BUCKETS(src->bb));
dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL);
if (!dst->mt)
@@ -1237,8 +1312,11 @@ out_lt:
kvfree(dst->lt);
dst--;
}
- free_percpu(new->scratch);
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(new->scratch_aligned);
+#endif
out_scratch:
+ free_percpu(new->scratch);
kfree(new);
return ERR_PTR(-ENOMEM);
@@ -1394,9 +1472,10 @@ static void pipapo_drop(struct nft_pipapo_match *m,
unsigned long *pos;
int b;
- pos = f->lt + g * NFT_PIPAPO_BUCKETS * f->bsize;
+ pos = NFT_PIPAPO_LT_ALIGN(f->lt) + g *
+ NFT_PIPAPO_BUCKETS(f->bb) * f->bsize;
- for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) {
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) {
bitmap_cut(pos, pos, rulemap[i].to,
rulemap[i].n,
f->bsize * BITS_PER_LONG);
@@ -1414,6 +1493,8 @@ static void pipapo_drop(struct nft_pipapo_match *m,
;
}
f->rules -= rulemap[i].n;
+
+ pipapo_lt_bits_adjust(f);
}
}
@@ -1498,6 +1579,9 @@ static void pipapo_reclaim_match(struct rcu_head *rcu)
for_each_possible_cpu(i)
kfree(*per_cpu_ptr(m->scratch, i));
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(m->scratch_aligned);
+#endif
free_percpu(m->scratch);
pipapo_free_fields(m);
@@ -1690,30 +1774,33 @@ static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set,
static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule,
int rule_count, u8 *left, u8 *right)
{
+ int g, mask_len = 0, bit_offset = 0;
u8 *l = left, *r = right;
- int g, mask_len = 0;
for (g = 0; g < f->groups; g++) {
int b, x0, x1;
x0 = -1;
x1 = -1;
- for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) {
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) {
unsigned long *pos;
- pos = f->lt + (g * NFT_PIPAPO_BUCKETS + b) * f->bsize;
+ pos = NFT_PIPAPO_LT_ALIGN(f->lt) +
+ (g * NFT_PIPAPO_BUCKETS(f->bb) + b) * f->bsize;
if (test_bit(first_rule, pos) && x0 == -1)
x0 = b;
if (test_bit(first_rule + rule_count - 1, pos))
x1 = b;
}
- if (g % 2) {
- *(l++) |= x0 & 0x0f;
- *(r++) |= x1 & 0x0f;
- } else {
- *l |= x0 << 4;
- *r |= x1 << 4;
+ *l |= x0 << (BITS_PER_BYTE - f->bb - bit_offset);
+ *r |= x1 << (BITS_PER_BYTE - f->bb - bit_offset);
+
+ bit_offset += f->bb;
+ if (bit_offset >= BITS_PER_BYTE) {
+ bit_offset %= BITS_PER_BYTE;
+ l++;
+ r++;
}
if (x1 - x0 == 0)
@@ -1748,8 +1835,9 @@ static bool pipapo_match_field(struct nft_pipapo_field *f,
pipapo_get_boundaries(f, first_rule, rule_count, left, right);
- return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) &&
- !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE);
+ return !memcmp(start, left,
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) &&
+ !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f));
}
/**
@@ -1801,8 +1889,8 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
rules_fx = f->mt[start].n;
start = f->mt[start].to;
- match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
- match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
if (i == m->field_count) {
@@ -1885,56 +1973,24 @@ static u64 nft_pipapo_privsize(const struct nlattr * const nla[],
}
/**
- * nft_pipapo_estimate() - Estimate set size, space and lookup complexity
- * @desc: Set description, element count and field description used here
+ * nft_pipapo_estimate() - Set size, space and lookup complexity
+ * @desc: Set description, element count and field description used
* @features: Flags: NFT_SET_INTERVAL needs to be there
* @est: Storage for estimation data
*
- * The size for this set type can vary dramatically, as it depends on the number
- * of rules (composing netmasks) the entries expand to. We compute the worst
- * case here.
- *
- * In general, for a non-ranged entry or a single composing netmask, we need
- * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that
- * is, each input bit needs four bits of matching data), plus a bucket in the
- * mapping table for each field.
- *
- * Return: true only for compatible range concatenations
+ * Return: true if set description is compatible, false otherwise
*/
static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est)
{
- unsigned long entry_size;
- int i;
-
- if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1)
+ if (!(features & NFT_SET_INTERVAL) ||
+ desc->field_count < NFT_PIPAPO_MIN_FIELDS)
return false;
- for (i = 0, entry_size = 0; i < desc->field_count; i++) {
- unsigned long rules;
-
- if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES)
- return false;
-
- /* Worst-case ranges for each concatenated field: each n-bit
- * field can expand to up to n * 2 rules in each bucket, and
- * each rule also needs a mapping bucket.
- */
- rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2;
- entry_size += rules * NFT_PIPAPO_BUCKETS / BITS_PER_BYTE;
- entry_size += rules * sizeof(union nft_pipapo_map_bucket);
- }
-
- /* Rules in lookup and mapping tables are needed for each entry */
- est->size = desc->size * entry_size;
- if (est->size && div_u64(est->size, desc->size) != entry_size)
+ est->size = pipapo_estimate_size(desc);
+ if (!est->size)
return false;
- est->size += sizeof(struct nft_pipapo) +
- sizeof(struct nft_pipapo_match) * 2;
-
- est->size += sizeof(struct nft_pipapo_field) * desc->field_count;
-
est->lookup = NFT_SET_CLASS_O_LOG_N;
est->space = NFT_SET_CLASS_O_N;
@@ -1961,38 +2017,52 @@ static int nft_pipapo_init(const struct nft_set *set,
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m;
struct nft_pipapo_field *f;
- int err, i;
+ int err, i, field_count;
+
+ field_count = desc->field_count ? : 1;
- if (desc->field_count > NFT_PIPAPO_MAX_FIELDS)
+ if (field_count > NFT_PIPAPO_MAX_FIELDS)
return -EINVAL;
- m = kmalloc(sizeof(*priv->match) + sizeof(*f) * desc->field_count,
+ m = kmalloc(sizeof(*priv->match) + sizeof(*f) * field_count,
GFP_KERNEL);
if (!m)
return -ENOMEM;
- m->field_count = desc->field_count;
+ m->field_count = field_count;
m->bsize_max = 0;
m->scratch = alloc_percpu(unsigned long *);
if (!m->scratch) {
err = -ENOMEM;
- goto out_free;
+ goto out_scratch;
}
for_each_possible_cpu(i)
*per_cpu_ptr(m->scratch, i) = NULL;
+#ifdef NFT_PIPAPO_ALIGN
+ m->scratch_aligned = alloc_percpu(unsigned long *);
+ if (!m->scratch_aligned) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ for_each_possible_cpu(i)
+ *per_cpu_ptr(m->scratch_aligned, i) = NULL;
+#endif
+
rcu_head_init(&m->rcu);
nft_pipapo_for_each_field(f, i, m) {
- f->groups = desc->field_len[i] * NFT_PIPAPO_GROUPS_PER_BYTE;
- priv->groups += f->groups;
+ int len = desc->field_len[i] ? : set->klen;
- priv->width += round_up(desc->field_len[i], sizeof(u32));
+ f->bb = NFT_PIPAPO_GROUP_BITS_INIT;
+ f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f);
+
+ priv->width += round_up(len, sizeof(u32));
f->bsize = 0;
f->rules = 0;
- f->lt = NULL;
+ NFT_PIPAPO_LT_ASSIGN(f, NULL);
f->mt = NULL;
}
@@ -2010,7 +2080,11 @@ static int nft_pipapo_init(const struct nft_set *set,
return 0;
out_free:
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(m->scratch_aligned);
+#endif
free_percpu(m->scratch);
+out_scratch:
kfree(m);
return err;
@@ -2045,16 +2119,21 @@ static void nft_pipapo_destroy(const struct nft_set *set)
nft_set_elem_destroy(set, e, true);
}
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(m->scratch_aligned);
+#endif
for_each_possible_cpu(cpu)
kfree(*per_cpu_ptr(m->scratch, cpu));
free_percpu(m->scratch);
-
pipapo_free_fields(m);
kfree(m);
priv->match = NULL;
}
if (priv->clone) {
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(priv->clone->scratch_aligned);
+#endif
for_each_possible_cpu(cpu)
kfree(*per_cpu_ptr(priv->clone->scratch, cpu));
free_percpu(priv->clone->scratch);
@@ -2081,8 +2160,7 @@ static void nft_pipapo_gc_init(const struct nft_set *set)
priv->last_gc = jiffies;
}
-struct nft_set_type nft_set_pipapo_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_pipapo_type = {
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
NFT_SET_TIMEOUT,
.ops = {
@@ -2102,3 +2180,26 @@ struct nft_set_type nft_set_pipapo_type __read_mostly = {
.elemsize = offsetof(struct nft_pipapo_elem, ext),
},
};
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2)
+const struct nft_set_type nft_set_pipapo_avx2_type = {
+ .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
+ NFT_SET_TIMEOUT,
+ .ops = {
+ .lookup = nft_pipapo_avx2_lookup,
+ .insert = nft_pipapo_insert,
+ .activate = nft_pipapo_activate,
+ .deactivate = nft_pipapo_deactivate,
+ .flush = nft_pipapo_flush,
+ .remove = nft_pipapo_remove,
+ .walk = nft_pipapo_walk,
+ .get = nft_pipapo_get,
+ .privsize = nft_pipapo_privsize,
+ .estimate = nft_pipapo_avx2_estimate,
+ .init = nft_pipapo_init,
+ .destroy = nft_pipapo_destroy,
+ .gc_init = nft_pipapo_gc_init,
+ .elemsize = offsetof(struct nft_pipapo_elem, ext),
+ },
+};
+#endif
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
new file mode 100644
index 000000000000..25a75591583e
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo.h
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#ifndef _NFT_SET_PIPAPO_H
+
+#include <linux/log2.h>
+#include <net/ipv6.h> /* For the maximum length of a field */
+
+/* Count of concatenated fields depends on count of 32-bit nftables registers */
+#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT
+
+/* Restrict usage to multiple fields, make sure rbtree is used otherwise */
+#define NFT_PIPAPO_MIN_FIELDS 2
+
+/* Largest supported field size */
+#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr))
+#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE)
+
+/* Bits to be grouped together in table buckets depending on set size */
+#define NFT_PIPAPO_GROUP_BITS_INIT NFT_PIPAPO_GROUP_BITS_SMALL_SET
+#define NFT_PIPAPO_GROUP_BITS_SMALL_SET 8
+#define NFT_PIPAPO_GROUP_BITS_LARGE_SET 4
+#define NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4 \
+ BUILD_BUG_ON((NFT_PIPAPO_GROUP_BITS_SMALL_SET != 8) || \
+ (NFT_PIPAPO_GROUP_BITS_LARGE_SET != 4))
+#define NFT_PIPAPO_GROUPS_PER_BYTE(f) (BITS_PER_BYTE / (f)->bb)
+
+/* If a lookup table gets bigger than NFT_PIPAPO_LT_SIZE_HIGH, switch to the
+ * small group width, and switch to the big group width if the table gets
+ * smaller than NFT_PIPAPO_LT_SIZE_LOW.
+ *
+ * Picking 2MiB as threshold (for a single table) avoids as much as possible
+ * crossing page boundaries on most architectures (x86-64 and MIPS huge pages,
+ * ARMv7 supersections, POWER "large" pages, SPARC Level 1 regions, etc.), which
+ * keeps performance nice in case kvmalloc() gives us non-contiguous areas.
+ */
+#define NFT_PIPAPO_LT_SIZE_THRESHOLD (1 << 21)
+#define NFT_PIPAPO_LT_SIZE_HYSTERESIS (1 << 16)
+#define NFT_PIPAPO_LT_SIZE_HIGH NFT_PIPAPO_LT_SIZE_THRESHOLD
+#define NFT_PIPAPO_LT_SIZE_LOW NFT_PIPAPO_LT_SIZE_THRESHOLD - \
+ NFT_PIPAPO_LT_SIZE_HYSTERESIS
+
+/* Fields are padded to 32 bits in input registers */
+#define NFT_PIPAPO_GROUPS_PADDED_SIZE(f) \
+ (round_up((f)->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f), sizeof(u32)))
+#define NFT_PIPAPO_GROUPS_PADDING(f) \
+ (NFT_PIPAPO_GROUPS_PADDED_SIZE(f) - (f)->groups / \
+ NFT_PIPAPO_GROUPS_PER_BYTE(f))
+
+/* Number of buckets given by 2 ^ n, with n bucket bits */
+#define NFT_PIPAPO_BUCKETS(bb) (1 << (bb))
+
+/* Each n-bit range maps to up to n * 2 rules */
+#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2))
+
+/* Use the rest of mapping table buckets for rule indices, but it makes no sense
+ * to exceed 32 bits
+ */
+#if BITS_PER_LONG == 64
+#define NFT_PIPAPO_MAP_TOBITS 32
+#else
+#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS)
+#endif
+
+/* ...which gives us the highest allowed index for a rule */
+#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \
+ - (1UL << NFT_PIPAPO_MAP_NBITS))
+
+/* Definitions for vectorised implementations */
+#ifdef NFT_PIPAPO_ALIGN
+#define NFT_PIPAPO_ALIGN_HEADROOM \
+ (NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN)
+#define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN))
+#define NFT_PIPAPO_LT_ASSIGN(field, x) \
+ do { \
+ (field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x); \
+ (field)->lt = (x); \
+ } while (0)
+#else
+#define NFT_PIPAPO_ALIGN_HEADROOM 0
+#define NFT_PIPAPO_LT_ALIGN(lt) (lt)
+#define NFT_PIPAPO_LT_ASSIGN(field, x) ((field)->lt = (x))
+#endif /* NFT_PIPAPO_ALIGN */
+
+#define nft_pipapo_for_each_field(field, index, match) \
+ for ((field) = (match)->f, (index) = 0; \
+ (index) < (match)->field_count; \
+ (index)++, (field)++)
+
+/**
+ * union nft_pipapo_map_bucket - Bucket of mapping table
+ * @to: First rule number (in next field) this rule maps to
+ * @n: Number of rules (in next field) this rule maps to
+ * @e: If there's no next field, pointer to element this rule maps to
+ */
+union nft_pipapo_map_bucket {
+ struct {
+#if BITS_PER_LONG == 64
+ static_assert(NFT_PIPAPO_MAP_TOBITS <= 32);
+ u32 to;
+
+ static_assert(NFT_PIPAPO_MAP_NBITS <= 32);
+ u32 n;
+#else
+ unsigned long to:NFT_PIPAPO_MAP_TOBITS;
+ unsigned long n:NFT_PIPAPO_MAP_NBITS;
+#endif
+ };
+ struct nft_pipapo_elem *e;
+};
+
+/**
+ * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
+ * @groups: Amount of bit groups
+ * @rules: Number of inserted rules
+ * @bsize: Size of each bucket in lookup table, in longs
+ * @bb: Number of bits grouped together in lookup table buckets
+ * @lt: Lookup table: 'groups' rows of buckets
+ * @lt_aligned: Version of @lt aligned to NFT_PIPAPO_ALIGN bytes
+ * @mt: Mapping table: one bucket per rule
+ */
+struct nft_pipapo_field {
+ int groups;
+ unsigned long rules;
+ size_t bsize;
+ int bb;
+#ifdef NFT_PIPAPO_ALIGN
+ unsigned long *lt_aligned;
+#endif
+ unsigned long *lt;
+ union nft_pipapo_map_bucket *mt;
+};
+
+/**
+ * struct nft_pipapo_match - Data used for lookup and matching
+ * @field_count Amount of fields in set
+ * @scratch: Preallocated per-CPU maps for partial matching results
+ * @scratch_aligned: Version of @scratch aligned to NFT_PIPAPO_ALIGN bytes
+ * @bsize_max: Maximum lookup table bucket size of all fields, in longs
+ * @rcu Matching data is swapped on commits
+ * @f: Fields, with lookup and mapping tables
+ */
+struct nft_pipapo_match {
+ int field_count;
+#ifdef NFT_PIPAPO_ALIGN
+ unsigned long * __percpu *scratch_aligned;
+#endif
+ unsigned long * __percpu *scratch;
+ size_t bsize_max;
+ struct rcu_head rcu;
+ struct nft_pipapo_field f[];
+};
+
+/**
+ * struct nft_pipapo - Representation of a set
+ * @match: Currently in-use matching data
+ * @clone: Copy where pending insertions and deletions are kept
+ * @width: Total bytes to be matched for one packet, including padding
+ * @dirty: Working copy has pending insertions or deletions
+ * @last_gc: Timestamp of last garbage collection run, jiffies
+ */
+struct nft_pipapo {
+ struct nft_pipapo_match __rcu *match;
+ struct nft_pipapo_match *clone;
+ int width;
+ bool dirty;
+ unsigned long last_gc;
+};
+
+struct nft_pipapo_elem;
+
+/**
+ * struct nft_pipapo_elem - API-facing representation of single set element
+ * @ext: nftables API extensions
+ */
+struct nft_pipapo_elem {
+ struct nft_set_ext ext;
+};
+
+int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
+ union nft_pipapo_map_bucket *mt, bool match_only);
+
+/**
+ * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets
+ * @f: Field including lookup table
+ * @dst: Area to store result
+ * @data: Input data selecting table buckets
+ */
+static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f,
+ unsigned long *dst,
+ const u8 *data)
+{
+ unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt);
+ int group;
+
+ for (group = 0; group < f->groups; group += BITS_PER_BYTE / 4, data++) {
+ u8 v;
+
+ v = *data >> 4;
+ __bitmap_and(dst, dst, lt + v * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS(4);
+
+ v = *data & 0x0f;
+ __bitmap_and(dst, dst, lt + v * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS(4);
+ }
+}
+
+/**
+ * pipapo_and_field_buckets_8bit() - Intersect 8-bit buckets
+ * @f: Field including lookup table
+ * @dst: Area to store result
+ * @data: Input data selecting table buckets
+ */
+static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f,
+ unsigned long *dst,
+ const u8 *data)
+{
+ unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt);
+ int group;
+
+ for (group = 0; group < f->groups; group++, data++) {
+ __bitmap_and(dst, dst, lt + *data * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS(8);
+ }
+}
+
+/**
+ * pipapo_estimate_size() - Estimate worst-case for set size
+ * @desc: Set description, element count and field description used here
+ *
+ * The size for this set type can vary dramatically, as it depends on the number
+ * of rules (composing netmasks) the entries expand to. We compute the worst
+ * case here.
+ *
+ * In general, for a non-ranged entry or a single composing netmask, we need
+ * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that
+ * is, each input bit needs four bits of matching data), plus a bucket in the
+ * mapping table for each field.
+ *
+ * Return: worst-case set size in bytes, 0 on any overflow
+ */
+static u64 pipapo_estimate_size(const struct nft_set_desc *desc)
+{
+ unsigned long entry_size;
+ u64 size;
+ int i;
+
+ for (i = 0, entry_size = 0; i < desc->field_count; i++) {
+ unsigned long rules;
+
+ if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES)
+ return 0;
+
+ /* Worst-case ranges for each concatenated field: each n-bit
+ * field can expand to up to n * 2 rules in each bucket, and
+ * each rule also needs a mapping bucket.
+ */
+ rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2;
+ entry_size += rules *
+ NFT_PIPAPO_BUCKETS(NFT_PIPAPO_GROUP_BITS_INIT) /
+ BITS_PER_BYTE;
+ entry_size += rules * sizeof(union nft_pipapo_map_bucket);
+ }
+
+ /* Rules in lookup and mapping tables are needed for each entry */
+ size = desc->size * entry_size;
+ if (size && div_u64(size, desc->size) != entry_size)
+ return 0;
+
+ size += sizeof(struct nft_pipapo) + sizeof(struct nft_pipapo_match) * 2;
+
+ size += sizeof(struct nft_pipapo_field) * desc->field_count;
+
+ return size;
+}
+
+#endif /* _NFT_SET_PIPAPO_H */
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
new file mode 100644
index 000000000000..d65ae0e23028
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -0,0 +1,1223 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
+ *
+ * Copyright (c) 2019-2020 Red Hat GmbH
+ *
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <uapi/linux/netfilter/nf_tables.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include <linux/compiler.h>
+#include <asm/fpu/api.h>
+
+#include "nft_set_pipapo_avx2.h"
+#include "nft_set_pipapo.h"
+
+#define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG)
+
+/* Load from memory into YMM register with non-temporal hint ("stream load"),
+ * that is, don't fetch lines from memory into the cache. This avoids pushing
+ * precious packet data out of the cache hierarchy, and is appropriate when:
+ *
+ * - loading buckets from lookup tables, as they are not going to be used
+ * again before packets are entirely classified
+ *
+ * - loading the result bitmap from the previous field, as it's never used
+ * again
+ */
+#define NFT_PIPAPO_AVX2_LOAD(reg, loc) \
+ asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))
+
+/* Stream a single lookup table bucket into YMM register given lookup table,
+ * group index, value of packet bits, bucket size.
+ */
+#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \
+ NFT_PIPAPO_AVX2_LOAD(reg, \
+ lt[((group) * NFT_PIPAPO_BUCKETS(4) + \
+ (v)) * (bsize)])
+#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \
+ NFT_PIPAPO_AVX2_LOAD(reg, \
+ lt[((group) * NFT_PIPAPO_BUCKETS(8) + \
+ (v)) * (bsize)])
+
+/* Bitwise AND: the staple operation of this algorithm */
+#define NFT_PIPAPO_AVX2_AND(dst, a, b) \
+ asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)
+
+/* Jump to label if @reg is zero */
+#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \
+ asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \
+ "je %l[" #label "]" : : : : label)
+
+/* Store 256 bits from YMM register into memory. Contrary to bucket load
+ * operation, we don't bypass the cache here, as stored matching results
+ * are always used shortly after.
+ */
+#define NFT_PIPAPO_AVX2_STORE(loc, reg) \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))
+
+/* Zero out a complete YMM register, @reg */
+#define NFT_PIPAPO_AVX2_ZERO(reg) \
+ asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
+
+/* Current working bitmap index, toggled between field matches */
+static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index);
+
+/**
+ * nft_pipapo_avx2_prepare() - Prepare before main algorithm body
+ *
+ * This zeroes out ymm15, which is later used whenever we need to clear a
+ * memory location, by storing its content into memory.
+ */
+static void nft_pipapo_avx2_prepare(void)
+{
+ NFT_PIPAPO_AVX2_ZERO(15);
+}
+
+/**
+ * nft_pipapo_avx2_fill() - Fill a bitmap region with ones
+ * @data: Base memory area
+ * @start: First bit to set
+ * @len: Count of bits to fill
+ *
+ * This is nothing else than a version of bitmap_set(), as used e.g. by
+ * pipapo_refill(), tailored for the microarchitectures using it and better
+ * suited for the specific usage: it's very likely that we'll set a small number
+ * of bits, not crossing a word boundary, and correct branch prediction is
+ * critical here.
+ *
+ * This function doesn't actually use any AVX2 instruction.
+ */
+static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
+{
+ int offset = start % BITS_PER_LONG;
+ unsigned long mask;
+
+ data += start / BITS_PER_LONG;
+
+ if (likely(len == 1)) {
+ *data |= BIT(offset);
+ return;
+ }
+
+ if (likely(len < BITS_PER_LONG || offset)) {
+ if (likely(len + offset <= BITS_PER_LONG)) {
+ *data |= GENMASK(len - 1 + offset, offset);
+ return;
+ }
+
+ *data |= ~0UL << offset;
+ len -= BITS_PER_LONG - offset;
+ data++;
+
+ if (len <= BITS_PER_LONG) {
+ mask = ~0UL >> (BITS_PER_LONG - len);
+ *data |= mask;
+ return;
+ }
+ }
+
+ memset(data, 0xff, len / BITS_PER_BYTE);
+ data += len / BITS_PER_LONG;
+
+ len %= BITS_PER_LONG;
+ if (len)
+ *data |= ~0UL >> (BITS_PER_LONG - len);
+}
+
+/**
+ * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits
+ * @offset: Start from given bitmap (equivalent to bucket) offset, in longs
+ * @map: Bitmap to be scanned for set bits
+ * @dst: Destination bitmap
+ * @mt: Mapping table containing bit set specifiers
+ * @len: Length of bitmap in longs
+ * @last: Return index of first set bit, if this is the last field
+ *
+ * This is an alternative implementation of pipapo_refill() suitable for usage
+ * with AVX2 lookup routines: we know there are four words to be scanned, at
+ * a given offset inside the map, for each matching iteration.
+ *
+ * This function doesn't actually use any AVX2 instruction.
+ *
+ * Return: first set bit index if @last, index of first filled word otherwise.
+ */
+static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
+ unsigned long *dst,
+ union nft_pipapo_map_bucket *mt, bool last)
+{
+ int ret = -1;
+
+#define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \
+ do { \
+ while (map[(x)]) { \
+ int r = __builtin_ctzl(map[(x)]); \
+ int i = (offset + (x)) * BITS_PER_LONG + r; \
+ \
+ if (last) \
+ return i; \
+ \
+ nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \
+ \
+ if (ret == -1) \
+ ret = mt[i].to; \
+ \
+ map[(x)] &= ~(1UL << r); \
+ } \
+ } while (0)
+
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3);
+#undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * Load buckets from lookup table corresponding to the values of each 4-bit
+ * group of packet bytes, and perform a bitwise intersection between them. If
+ * this is the first field in the set, simply AND the buckets together
+ * (equivalent to using an all-ones starting bitmap), use the provided starting
+ * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next
+ * working bitmap, @fill.
+ *
+ * This is used for 8-bit fields (i.e. protocol numbers).
+ *
+ * Out-of-order (and superscalar) execution is vital here, so it's critical to
+ * avoid false data dependencies. CPU and compiler could (mostly) take care of
+ * this on their own, but the operation ordering is explicitly given here with
+ * a likely execution order in mind, to highlight possible stalls. That's why
+ * a number of logically distinct operations (i.e. loading buckets, intersecting
+ * buckets) are interleaved.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing);
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ NFT_PIPAPO_AVX2_AND(4, 2, 3);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 16-bit fields (i.e. ports).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(7, 6, 7);
+ }
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 7);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 32-bit fields (i.e. IPv4 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ };
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 2, 3);
+ NFT_PIPAPO_AVX2_AND(9, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 6, 7);
+ NFT_PIPAPO_AVX2_AND(12, 8, 9);
+ NFT_PIPAPO_AVX2_AND(13, 10, 11);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(1, 12, 13);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(10, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(12, 6, 7);
+ NFT_PIPAPO_AVX2_AND(13, 8, 9);
+ NFT_PIPAPO_AVX2_AND(14, 10, 11);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(1, 12, 13);
+ NFT_PIPAPO_AVX2_AND(1, 1, 14);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 1);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
+ };
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 1, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 8, pg[8], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 7, 8);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 9, pg[9], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 9, 10);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 10, pg[10], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+
+ /* Stalls */
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ NFT_PIPAPO_AVX2_AND(8, 6, 7);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 8);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 128-bit fields (i.e. IPv6 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
+ pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf,
+ pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf,
+ pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf,
+ pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf,
+ pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf,
+ };
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 1, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(10, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(12, 7, 8);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt, 8, pg[8], bsize);
+ NFT_PIPAPO_AVX2_AND(14, 9, 10);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 9, pg[9], bsize);
+ NFT_PIPAPO_AVX2_AND(1, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 10, pg[10], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 12, pg[12], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 13, pg[13], bsize);
+ NFT_PIPAPO_AVX2_AND(7, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 14, pg[14], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 6, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 8, 9);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 18, pg[18], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 10, 11);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 19, pg[19], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 12, 13);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 20, pg[20], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 14, 0);
+ NFT_PIPAPO_AVX2_AND(7, 1, 2);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 21, pg[21], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 3, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 7, 8);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 25, pg[25], bsize);
+ NFT_PIPAPO_AVX2_AND(1, 9, 10);
+ NFT_PIPAPO_AVX2_AND(2, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 26, pg[26], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 27, pg[27], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 28, pg[28], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 29, pg[29], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 6, 7);
+ NFT_PIPAPO_AVX2_AND(1, 8, 9);
+ NFT_PIPAPO_AVX2_AND(2, 10, 11);
+ NFT_PIPAPO_AVX2_AND(3, 12, 0);
+
+ /* Stalls */
+ NFT_PIPAPO_AVX2_AND(4, 1, 2);
+ NFT_PIPAPO_AVX2_AND(5, 3, 4);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 5);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 8-bit fields (i.e. protocol numbers).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_AND(2, 0, 1);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 2);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 16-bit fields (i.e. ports).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(4, 3, 2);
+ }
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 32-bit fields (i.e. IPv4 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ NFT_PIPAPO_AVX2_AND(0, 6, 7);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 0);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 6, pkt[5], bsize);
+ NFT_PIPAPO_AVX2_AND(7, 2, 3);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_AND(1, 6, 7);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 4, pkt[4], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 5, pkt[5], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 6, 7);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ NFT_PIPAPO_AVX2_AND(4, 2, 3);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 128-bit fields (i.e. IPv6 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 4, pkt[4], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 1, 2);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 5, pkt[5], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 3, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 6, pkt[6], bsize);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 7, pkt[7], bsize);
+ NFT_PIPAPO_AVX2_AND(3, 5, 6);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 6, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 6, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_AND(6, 4, 5);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 6);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * This function should never be called, but is provided for the case the field
+ * size doesn't match any of the known data types. Matching rate is
+ * substantially lower than AVX2 routines.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ unsigned long *lt = f->lt, bsize = f->bsize;
+ int i, ret = -1, b;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first)
+ memset(map, 0xff, bsize * sizeof(*map));
+
+ for (i = offset; i < bsize; i++) {
+ if (f->bb == 8)
+ pipapo_and_field_buckets_8bit(f, map, pkt);
+ else
+ pipapo_and_field_buckets_4bit(f, map, pkt);
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+ b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last);
+
+ if (last)
+ return b;
+
+ if (ret == -1)
+ ret = b / XSAVE_YMM_SIZE;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity
+ * @desc: Set description, element count and field description used
+ * @features: Flags: NFT_SET_INTERVAL needs to be there
+ * @est: Storage for estimation data
+ *
+ * Return: true if set is compatible and AVX2 available, false otherwise.
+ */
+bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ if (!(features & NFT_SET_INTERVAL) ||
+ desc->field_count < NFT_PIPAPO_MIN_FIELDS)
+ return false;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
+ return false;
+
+ est->size = pipapo_estimate_size(desc);
+ if (!est->size)
+ return false;
+
+ est->lookup = NFT_SET_CLASS_O_LOG_N;
+
+ est->space = NFT_SET_CLASS_O_N;
+
+ return true;
+}
+
+/**
+ * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ * @ext: nftables API extension pointer, filled with matching reference
+ *
+ * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
+ *
+ * This implementation exploits the repetitive characteristic of the algorithm
+ * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
+ *
+ * Return: true on match, false otherwise.
+ */
+bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ unsigned long *res, *fill, *scratch;
+ u8 genmask = nft_genmask_cur(net);
+ const u8 *rp = (const u8 *)key;
+ struct nft_pipapo_match *m;
+ struct nft_pipapo_field *f;
+ bool map_index;
+ int i, ret = 0;
+
+ m = rcu_dereference(priv->match);
+
+ /* This also protects access to all data related to scratch maps */
+ kernel_fpu_begin();
+
+ scratch = *raw_cpu_ptr(m->scratch_aligned);
+ if (unlikely(!scratch)) {
+ kernel_fpu_end();
+ return false;
+ }
+ map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index);
+
+ res = scratch + (map_index ? m->bsize_max : 0);
+ fill = scratch + (map_index ? 0 : m->bsize_max);
+
+ /* Starting map doesn't need to be set for this implementation */
+
+ nft_pipapo_avx2_prepare();
+
+next_match:
+ nft_pipapo_for_each_field(f, i, m) {
+ bool last = i == m->field_count - 1, first = !i;
+
+#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \
+ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \
+ ret, rp, \
+ first, last))
+
+ if (likely(f->bb == 8)) {
+ if (f->groups == 1) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
+ } else if (f->groups == 2) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
+ } else if (f->groups == 4) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
+ } else if (f->groups == 6) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
+ } else if (f->groups == 16) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
+ } else {
+ ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
+ ret, rp,
+ first, last);
+ }
+ } else {
+ if (f->groups == 2) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
+ } else if (f->groups == 4) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
+ } else if (f->groups == 8) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
+ } else if (f->groups == 12) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
+ } else if (f->groups == 32) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
+ } else {
+ ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
+ ret, rp,
+ first, last);
+ }
+ }
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+#undef NFT_SET_PIPAPO_AVX2_LOOKUP
+
+ if (ret < 0)
+ goto out;
+
+ if (last) {
+ *ext = &f->mt[ret].e->ext;
+ if (unlikely(nft_set_elem_expired(*ext) ||
+ !nft_set_elem_active(*ext, genmask))) {
+ ret = 0;
+ goto next_match;
+ }
+
+ goto out;
+ }
+
+ swap(res, fill);
+ rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ }
+
+out:
+ if (i % 2)
+ raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index);
+ kernel_fpu_end();
+
+ return ret >= 0;
+}
diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h
new file mode 100644
index 000000000000..396caf7bfca8
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo_avx2.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _NFT_SET_PIPAPO_AVX2_H
+
+#ifdef CONFIG_AS_AVX2
+#include <asm/fpu/xstate.h>
+#define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE)
+
+bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext);
+bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est);
+#endif /* CONFIG_AS_AVX2 */
+
+#endif /* _NFT_SET_PIPAPO_AVX2_H */
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 5000b938ab1e..172ef8189f99 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -481,8 +481,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-struct nft_set_type nft_set_rbtree_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_rbtree_type = {
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
.ops = {
.privsize = nft_rbtree_privsize,
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 764e88682a81..30be5787fbde 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -11,6 +11,7 @@
#include <net/ip_tunnels.h>
#include <net/vxlan.h>
#include <net/erspan.h>
+#include <net/geneve.h>
struct nft_tunnel {
enum nft_tunnel_keys key:8;
@@ -144,6 +145,7 @@ struct nft_tunnel_opts {
union {
struct vxlan_metadata vxlan;
struct erspan_metadata erspan;
+ u8 data[IP_TUNNEL_OPTS_MAX];
} u;
u32 len;
__be16 flags;
@@ -301,9 +303,53 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
return 0;
}
+static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 },
+ [NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 },
+ [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 },
+};
+
+static int nft_tunnel_obj_geneve_init(const struct nlattr *attr,
+ struct nft_tunnel_opts *opts)
+{
+ struct geneve_opt *opt = (struct geneve_opt *)opts->u.data + opts->len;
+ struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1];
+ int err, data_len;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_GENEVE_MAX, attr,
+ nft_tunnel_opts_geneve_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_TUNNEL_KEY_GENEVE_CLASS] ||
+ !tb[NFTA_TUNNEL_KEY_GENEVE_TYPE] ||
+ !tb[NFTA_TUNNEL_KEY_GENEVE_DATA])
+ return -EINVAL;
+
+ attr = tb[NFTA_TUNNEL_KEY_GENEVE_DATA];
+ data_len = nla_len(attr);
+ if (data_len % 4)
+ return -EINVAL;
+
+ opts->len += sizeof(*opt) + data_len;
+ if (opts->len > IP_TUNNEL_OPTS_MAX)
+ return -EINVAL;
+
+ memcpy(opt->opt_data, nla_data(attr), data_len);
+ opt->length = data_len / 4;
+ opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]);
+ opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]);
+ opts->flags = TUNNEL_GENEVE_OPT;
+
+ return 0;
+}
+
static const struct nla_policy nft_tunnel_opts_policy[NFTA_TUNNEL_KEY_OPTS_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_OPTS_UNSPEC] = {
+ .strict_start_type = NFTA_TUNNEL_KEY_OPTS_GENEVE },
[NFTA_TUNNEL_KEY_OPTS_VXLAN] = { .type = NLA_NESTED, },
[NFTA_TUNNEL_KEY_OPTS_ERSPAN] = { .type = NLA_NESTED, },
+ [NFTA_TUNNEL_KEY_OPTS_GENEVE] = { .type = NLA_NESTED, },
};
static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
@@ -311,22 +357,43 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
struct ip_tunnel_info *info,
struct nft_tunnel_opts *opts)
{
- struct nlattr *tb[NFTA_TUNNEL_KEY_OPTS_MAX + 1];
- int err;
+ int err, rem, type = 0;
+ struct nlattr *nla;
- err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr,
- nft_tunnel_opts_policy, NULL);
+ err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX,
+ nft_tunnel_opts_policy, NULL);
if (err < 0)
return err;
- if (tb[NFTA_TUNNEL_KEY_OPTS_VXLAN]) {
- err = nft_tunnel_obj_vxlan_init(tb[NFTA_TUNNEL_KEY_OPTS_VXLAN],
- opts);
- } else if (tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN]) {
- err = nft_tunnel_obj_erspan_init(tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN],
- opts);
- } else {
- return -EOPNOTSUPP;
+ nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
+ switch (nla_type(nla)) {
+ case NFTA_TUNNEL_KEY_OPTS_VXLAN:
+ if (type)
+ return -EINVAL;
+ err = nft_tunnel_obj_vxlan_init(nla, opts);
+ if (err)
+ return err;
+ type = TUNNEL_VXLAN_OPT;
+ break;
+ case NFTA_TUNNEL_KEY_OPTS_ERSPAN:
+ if (type)
+ return -EINVAL;
+ err = nft_tunnel_obj_erspan_init(nla, opts);
+ if (err)
+ return err;
+ type = TUNNEL_ERSPAN_OPT;
+ break;
+ case NFTA_TUNNEL_KEY_OPTS_GENEVE:
+ if (type && type != TUNNEL_GENEVE_OPT)
+ return -EINVAL;
+ err = nft_tunnel_obj_geneve_init(nla, opts);
+ if (err)
+ return err;
+ type = TUNNEL_GENEVE_OPT;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
}
return err;
@@ -518,6 +585,25 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
break;
}
nla_nest_end(skb, inner);
+ } else if (opts->flags & TUNNEL_GENEVE_OPT) {
+ struct geneve_opt *opt;
+ int offset = 0;
+
+ inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
+ if (!inner)
+ goto failure;
+ while (opts->len > offset) {
+ opt = (struct geneve_opt *)opts->u.data + offset;
+ if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS,
+ opt->opt_class) ||
+ nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE,
+ opt->type) ||
+ nla_put(skb, NFTA_TUNNEL_KEY_GENEVE_DATA,
+ opt->length * 4, opt->opt_data))
+ goto inner_failure;
+ offset += sizeof(*opt) + opt->length * 4;
+ }
+ nla_nest_end(skb, inner);
}
nla_nest_end(skb, nest);
return 0;
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index f56d3ed93e56..75bd0e5dd312 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/timer.h>
+#include <linux/alarmtimer.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/netfilter.h>
@@ -30,6 +31,7 @@
struct idletimer_tg {
struct list_head entry;
+ struct alarm alarm;
struct timer_list timer;
struct work_struct work;
@@ -37,6 +39,7 @@ struct idletimer_tg {
struct device_attribute attr;
unsigned int refcnt;
+ u8 timer_type;
};
static LIST_HEAD(idletimer_tg_list);
@@ -62,20 +65,29 @@ static ssize_t idletimer_tg_show(struct device *dev,
{
struct idletimer_tg *timer;
unsigned long expires = 0;
+ struct timespec64 ktimespec = {};
+ long time_diff = 0;
mutex_lock(&list_mutex);
timer = __idletimer_tg_find_by_label(attr->attr.name);
- if (timer)
- expires = timer->timer.expires;
+ if (timer) {
+ if (timer->timer_type & XT_IDLETIMER_ALARM) {
+ ktime_t expires_alarm = alarm_expires_remaining(&timer->alarm);
+ ktimespec = ktime_to_timespec64(expires_alarm);
+ time_diff = ktimespec.tv_sec;
+ } else {
+ expires = timer->timer.expires;
+ time_diff = jiffies_to_msecs(expires - jiffies) / 1000;
+ }
+ }
mutex_unlock(&list_mutex);
- if (time_after(expires, jiffies))
- return sprintf(buf, "%u\n",
- jiffies_to_msecs(expires - jiffies) / 1000);
+ if (time_after(expires, jiffies) || ktimespec.tv_sec > 0)
+ return snprintf(buf, PAGE_SIZE, "%ld\n", time_diff);
- return sprintf(buf, "0\n");
+ return snprintf(buf, PAGE_SIZE, "0\n");
}
static void idletimer_tg_work(struct work_struct *work)
@@ -95,6 +107,16 @@ static void idletimer_tg_expired(struct timer_list *t)
schedule_work(&timer->work);
}
+static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm,
+ ktime_t now)
+{
+ struct idletimer_tg *timer = alarm->data;
+
+ pr_debug("alarm %s expired\n", timer->attr.attr.name);
+ schedule_work(&timer->work);
+ return ALARMTIMER_NORESTART;
+}
+
static int idletimer_check_sysfs_name(const char *name, unsigned int size)
{
int ret;
@@ -160,6 +182,68 @@ out:
return ret;
}
+static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info)
+{
+ int ret;
+
+ info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+ if (!info->timer) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = idletimer_check_sysfs_name(info->label, sizeof(info->label));
+ if (ret < 0)
+ goto out_free_timer;
+
+ sysfs_attr_init(&info->timer->attr.attr);
+ info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
+ if (!info->timer->attr.attr.name) {
+ ret = -ENOMEM;
+ goto out_free_timer;
+ }
+ info->timer->attr.attr.mode = 0444;
+ info->timer->attr.show = idletimer_tg_show;
+
+ ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ if (ret < 0) {
+ pr_debug("couldn't add file to sysfs");
+ goto out_free_attr;
+ }
+
+ /* notify userspace */
+ kobject_uevent(idletimer_tg_kobj,KOBJ_ADD);
+
+ list_add(&info->timer->entry, &idletimer_tg_list);
+ pr_debug("timer type value is %u", info->timer_type);
+ info->timer->timer_type = info->timer_type;
+ info->timer->refcnt = 1;
+
+ INIT_WORK(&info->timer->work, idletimer_tg_work);
+
+ if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+ ktime_t tout;
+ alarm_init(&info->timer->alarm, ALARM_BOOTTIME,
+ idletimer_tg_alarmproc);
+ info->timer->alarm.data = info->timer;
+ tout = ktime_set(info->timeout, 0);
+ alarm_start_relative(&info->timer->alarm, tout);
+ } else {
+ timer_setup(&info->timer->timer, idletimer_tg_expired, 0);
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ }
+
+ return 0;
+
+out_free_attr:
+ kfree(info->timer->attr.attr.name);
+out_free_timer:
+ kfree(info->timer);
+out:
+ return ret;
+}
+
/*
* The actual xt_tables plugin.
*/
@@ -177,13 +261,30 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
return XT_CONTINUE;
}
-static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+/*
+ * The actual xt_tables plugin.
+ */
+static unsigned int idletimer_tg_target_v1(struct sk_buff *skb,
+ const struct xt_action_param *par)
{
- struct idletimer_tg_info *info = par->targinfo;
- int ret;
+ const struct idletimer_tg_info_v1 *info = par->targinfo;
- pr_debug("checkentry targinfo%s\n", info->label);
+ pr_debug("resetting timer %s, timeout period %u\n",
+ info->label, info->timeout);
+
+ if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+ ktime_t tout = ktime_set(info->timeout, 0);
+ alarm_start_relative(&info->timer->alarm, tout);
+ } else {
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ }
+ return XT_CONTINUE;
+}
+
+static int idletimer_tg_helper(struct idletimer_tg_info *info)
+{
if (info->timeout == 0) {
pr_debug("timeout value is zero\n");
return -EINVAL;
@@ -198,7 +299,23 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
pr_debug("label is empty or not nul-terminated\n");
return -EINVAL;
}
+ return 0;
+}
+
+static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+{
+ struct idletimer_tg_info *info = par->targinfo;
+ int ret;
+
+ pr_debug("checkentry targinfo%s\n", info->label);
+
+ ret = idletimer_tg_helper(info);
+ if(ret < 0)
+ {
+ pr_debug("checkentry helper return invalid\n");
+ return -EINVAL;
+ }
mutex_lock(&list_mutex);
info->timer = __idletimer_tg_find_by_label(info->label);
@@ -222,6 +339,65 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
return 0;
}
+static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par)
+{
+ struct idletimer_tg_info_v1 *info = par->targinfo;
+ int ret;
+
+ pr_debug("checkentry targinfo%s\n", info->label);
+
+ ret = idletimer_tg_helper((struct idletimer_tg_info *)info);
+ if(ret < 0)
+ {
+ pr_debug("checkentry helper return invalid\n");
+ return -EINVAL;
+ }
+
+ if (info->timer_type > XT_IDLETIMER_ALARM) {
+ pr_debug("invalid value for timer type\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&list_mutex);
+
+ info->timer = __idletimer_tg_find_by_label(info->label);
+ if (info->timer) {
+ if (info->timer->timer_type != info->timer_type) {
+ pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n");
+ mutex_unlock(&list_mutex);
+ return -EINVAL;
+ }
+
+ info->timer->refcnt++;
+ if (info->timer_type & XT_IDLETIMER_ALARM) {
+ /* calculate remaining expiry time */
+ ktime_t tout = alarm_expires_remaining(&info->timer->alarm);
+ struct timespec64 ktimespec = ktime_to_timespec64(tout);
+
+ if (ktimespec.tv_sec > 0) {
+ pr_debug("time_expiry_remaining %lld\n",
+ ktimespec.tv_sec);
+ alarm_start_relative(&info->timer->alarm, tout);
+ }
+ } else {
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ }
+ pr_debug("increased refcnt of timer %s to %u\n",
+ info->label, info->timer->refcnt);
+ } else {
+ ret = idletimer_tg_create_v1(info);
+ if (ret < 0) {
+ pr_debug("failed to create timer\n");
+ mutex_unlock(&list_mutex);
+ return ret;
+ }
+ }
+
+ mutex_unlock(&list_mutex);
+ return 0;
+}
+
static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
{
const struct idletimer_tg_info *info = par->targinfo;
@@ -247,7 +423,38 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
mutex_unlock(&list_mutex);
}
-static struct xt_target idletimer_tg __read_mostly = {
+static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par)
+{
+ const struct idletimer_tg_info_v1 *info = par->targinfo;
+
+ pr_debug("destroy targinfo %s\n", info->label);
+
+ mutex_lock(&list_mutex);
+
+ if (--info->timer->refcnt == 0) {
+ pr_debug("deleting timer %s\n", info->label);
+
+ list_del(&info->timer->entry);
+ if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+ alarm_cancel(&info->timer->alarm);
+ } else {
+ del_timer_sync(&info->timer->timer);
+ }
+ cancel_work_sync(&info->timer->work);
+ sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ kfree(info->timer->attr.attr.name);
+ kfree(info->timer);
+ } else {
+ pr_debug("decreased refcnt of timer %s to %u\n",
+ info->label, info->timer->refcnt);
+ }
+
+ mutex_unlock(&list_mutex);
+}
+
+
+static struct xt_target idletimer_tg[] __read_mostly = {
+ {
.name = "IDLETIMER",
.family = NFPROTO_UNSPEC,
.target = idletimer_tg_target,
@@ -256,6 +463,20 @@ static struct xt_target idletimer_tg __read_mostly = {
.checkentry = idletimer_tg_checkentry,
.destroy = idletimer_tg_destroy,
.me = THIS_MODULE,
+ },
+ {
+ .name = "IDLETIMER",
+ .family = NFPROTO_UNSPEC,
+ .revision = 1,
+ .target = idletimer_tg_target_v1,
+ .targetsize = sizeof(struct idletimer_tg_info_v1),
+ .usersize = offsetof(struct idletimer_tg_info_v1, timer),
+ .checkentry = idletimer_tg_checkentry_v1,
+ .destroy = idletimer_tg_destroy_v1,
+ .me = THIS_MODULE,
+ },
+
+
};
static struct class *idletimer_tg_class;
@@ -283,7 +504,8 @@ static int __init idletimer_tg_init(void)
idletimer_tg_kobj = &idletimer_tg_device->kobj;
- err = xt_register_target(&idletimer_tg);
+ err = xt_register_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg));
+
if (err < 0) {
pr_debug("couldn't register xt target\n");
goto out_dev;
@@ -300,7 +522,7 @@ out:
static void __exit idletimer_tg_exit(void)
{
- xt_unregister_target(&idletimer_tg);
+ xt_unregister_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg));
device_destroy(idletimer_tg_class, MKDEV(0, 0));
class_destroy(idletimer_tg_class);
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 2317721f3ecb..75625d13e976 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -21,8 +21,6 @@ MODULE_DESCRIPTION("Xtables: packet security mark modification");
MODULE_ALIAS("ipt_SECMARK");
MODULE_ALIAS("ip6t_SECMARK");
-#define PFX "SECMARK: "
-
static u8 mode;
static unsigned int
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 8c835ad63729..9c5cfd74a0ee 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -132,7 +132,7 @@ struct xt_hashlimit_htable {
const char *name;
struct net *net;
- struct hlist_head hash[0]; /* hashtable itself */
+ struct hlist_head hash[]; /* hashtable itself */
};
static int
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 225a7ab6d79a..19bef176145e 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -71,7 +71,7 @@ struct recent_entry {
u_int8_t ttl;
u_int8_t index;
u_int16_t nstamps;
- unsigned long stamps[0];
+ unsigned long stamps[];
};
struct recent_table {
@@ -82,7 +82,7 @@ struct recent_table {
unsigned int entries;
u8 nstamps_max_mask;
struct list_head lru_list;
- struct list_head iphash[0];
+ struct list_head iphash[];
};
struct recent_net {
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 363264ca2e09..eefacb3176e3 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3528,9 +3528,9 @@ int tc_setup_flow_action(struct flow_action *flow_action,
struct tc_action *act;
int i, j, k, err = 0;
- BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_ANY != FLOW_ACTION_HW_STATS_TYPE_ANY);
- BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_IMMEDIATE != FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE);
- BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_DELAYED != FLOW_ACTION_HW_STATS_TYPE_DELAYED);
+ BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_ANY != FLOW_ACTION_HW_STATS_ANY);
+ BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_IMMEDIATE != FLOW_ACTION_HW_STATS_IMMEDIATE);
+ BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_DELAYED != FLOW_ACTION_HW_STATS_DELAYED);
if (!exts)
return 0;
@@ -3613,8 +3613,8 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->mangle.mask = tcf_pedit_mask(act, k);
entry->mangle.val = tcf_pedit_val(act, k);
entry->mangle.offset = tcf_pedit_offset(act, k);
- entry = &flow_action->entries[++j];
entry->hw_stats_type = act->hw_stats_type;
+ entry = &flow_action->entries[++j];
}
} else if (is_tcf_csum(act)) {
entry->id = FLOW_ACTION_CSUM;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 50794125bf02..0d99df1e764d 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -618,21 +618,28 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
}
EXPORT_SYMBOL(qdisc_watchdog_init);
-void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
+void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
+ u64 delta_ns)
{
if (test_bit(__QDISC_STATE_DEACTIVATED,
&qdisc_root_sleeping(wd->qdisc)->state))
return;
- if (wd->last_expires == expires)
- return;
+ if (hrtimer_is_queued(&wd->timer)) {
+ /* If timer is already set in [expires, expires + delta_ns],
+ * do not reprogram it.
+ */
+ if (wd->last_expires - expires <= delta_ns)
+ return;
+ }
wd->last_expires = expires;
- hrtimer_start(&wd->timer,
- ns_to_ktime(expires),
- HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_range_ns(&wd->timer,
+ ns_to_ktime(expires),
+ delta_ns,
+ HRTIMER_MODE_ABS_PINNED);
}
-EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
+EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 371ad84def3b..4c060134c736 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -121,6 +121,8 @@ struct fq_sched_data {
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
u64 stat_allocation_errors;
+
+ u32 timer_slack; /* hrtimer slack in ns */
struct qdisc_watchdog watchdog;
};
@@ -504,8 +506,9 @@ begin:
head = &q->old_flows;
if (!head->first) {
if (q->time_next_delayed_flow != ~0ULL)
- qdisc_watchdog_schedule_ns(&q->watchdog,
- q->time_next_delayed_flow);
+ qdisc_watchdog_schedule_range_ns(&q->watchdog,
+ q->time_next_delayed_flow,
+ q->timer_slack);
return NULL;
}
}
@@ -735,6 +738,8 @@ static int fq_resize(struct Qdisc *sch, u32 log)
}
static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+ [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK },
+
[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
@@ -747,6 +752,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 },
[TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
[TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 },
+ [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
@@ -833,6 +839,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
q->ce_threshold = (u64)NSEC_PER_USEC *
nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
+ if (tb[TCA_FQ_TIMER_SLACK])
+ q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);
+
if (!err) {
sch_tree_unlock(sch);
err = fq_resize(sch, fq_log);
@@ -884,6 +893,8 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
q->orphan_mask = 1024 - 1;
q->low_rate_threshold = 550000 / 8;
+ q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
+
/* Default ce_threshold of 4294 seconds */
q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
@@ -924,7 +935,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
q->low_rate_threshold) ||
nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
- nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+ nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
+ nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -947,7 +959,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.flows_plimit = q->stat_flows_plimit;
st.pkts_too_long = q->stat_pkts_too_long;
st.allocation_errors = q->stat_allocation_errors;
- st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
+ st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
+ ktime_get_ns();
st.flows = q->flows;
st.inactive_flows = q->inactive_flows;
st.throttled_flows = q->throttled_flows;
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh
index a0795227216e..efd798a85931 100644
--- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh
@@ -8,9 +8,9 @@ tc_flower_get_target()
# The driver associates a counter with each tc filter, which means the
# number of supported filters is bounded by the number of available
# counters.
- # Currently, the driver supports 12K (12,288) flow counters and six of
+ # Currently, the driver supports 30K (30,720) flow counters and six of
# these are used for multicast routing.
- local target=12282
+ local target=30714
if ((! should_fail)); then
echo $target
diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh
new file mode 100755
index 000000000000..20ed98fe5a60
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ default_hw_stats_test
+ immediate_hw_stats_test
+ delayed_hw_stats_test
+ disabled_hw_stats_test
+"
+NUM_NETIFS=2
+
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24
+}
+
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/24
+}
+
+switch_create()
+{
+ simple_if_init $swp1 192.0.2.2/24
+ tc qdisc add dev $swp1 clsact
+}
+
+switch_destroy()
+{
+ tc qdisc del dev $swp1 clsact
+ simple_if_fini $swp1 192.0.2.2/24
+}
+
+hw_stats_test()
+{
+ RET=0
+
+ local name=$1
+ local action_hw_stats=$2
+ local occ_delta=$3
+ local expected_packet_count=$4
+
+ local orig_occ=$(devlink_resource_get "counters" "flow" | jq '.["occ"]')
+
+ tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \
+ skip_sw dst_ip 192.0.2.2 action drop $action_hw_stats
+ check_err $? "Failed to add rule with $name hw_stats"
+
+ local new_occ=$(devlink_resource_get "counters" "flow" | jq '.["occ"]')
+ local expected_occ=$((orig_occ + occ_delta))
+ [ "$new_occ" == "$expected_occ" ]
+ check_err $? "Expected occupancy of $expected_occ, got $new_occ"
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $swp1mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $swp1 ingress" 101 $expected_packet_count
+ check_err $? "Did not match incoming packet"
+
+ tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower
+
+ log_test "$name hw_stats"
+}
+
+default_hw_stats_test()
+{
+ hw_stats_test "default" "" 2 1
+}
+
+immediate_hw_stats_test()
+{
+ hw_stats_test "immediate" "hw_stats immediate" 2 1
+}
+
+delayed_hw_stats_test()
+{
+ RET=0
+
+ tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \
+ skip_sw dst_ip 192.0.2.2 action drop hw_stats delayed
+ check_fail $? "Unexpected success in adding rule with delayed hw_stats"
+
+ log_test "delayed hw_stats"
+}
+
+disabled_hw_stats_test()
+{
+ hw_stats_test "disabled" "hw_stats disabled" 0 0
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ h1mac=$(mac_get $h1)
+ swp1mac=$(mac_get $swp1)
+
+ vrf_prepare
+
+ h1_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+check_tc_action_hw_stats_support
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index a4a7879b3bb9..977fc2b326a2 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -60,6 +60,15 @@ check_tc_chain_support()
fi
}
+check_tc_action_hw_stats_support()
+{
+ tc actions help 2>&1 | grep -q hw_stats
+ if [[ $? -ne 0 ]]; then
+ echo "SKIP: iproute2 too old; tc is missing action hw_stats support"
+ exit 1
+ fi
+}
+
if [[ "$(id -u)" -ne 0 ]]; then
echo "SKIP: need root privileges"
exit 0