130 files changed, 4581 insertions, 1087 deletions
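Many of the driver hunks below are mechanical fallout of one tree-wide rename: flow_action_basic_hw_stats_types_check() becomes flow_action_basic_hw_stats_check(), and the FLOW_ACTION_HW_STATS_TYPE_* values lose the _TYPE_ infix (bnxt, cxgb4, mlx5, mvpp2, ocelot, nfp, qede and stmmac are all touched). The validation pattern those helpers support looks roughly like the sketch below in a driver's action parser; my_drv_parse_actions() is an illustrative name, not code from any of the drivers in this diff:

#include <net/flow_offload.h>

static int my_drv_parse_actions(struct flow_action *flow_action,
                                struct netlink_ext_ack *extack)
{
        const struct flow_action_entry *act;
        int i;

        if (!flow_action_has_entries(flow_action))
                return -EINVAL;

        /* The "basic" helper rejects, with an extack message, rules
         * whose HW stats request the driver cannot honour, before any
         * action is programmed into hardware.
         */
        if (!flow_action_basic_hw_stats_check(flow_action, extack))
                return -EOPNOTSUPP;

        flow_action_for_each(i, act, flow_action) {
                switch (act->id) {
                case FLOW_ACTION_DROP:
                        /* program a drop filter here */
                        break;
                default:
                        return -EOPNOTSUPP;
                }
        }

        return 0;
}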
diff --git a/Documentation/networking/device_drivers/stmicro/stmmac.rst b/Documentation/networking/device_drivers/stmicro/stmmac.rst index c34bab3d2df0..5d46e5036129 100644 --- a/Documentation/networking/device_drivers/stmicro/stmmac.rst +++ b/Documentation/networking/device_drivers/stmicro/stmmac.rst @@ -32,7 +32,8 @@ is also supported. DesignWare(R) Cores Ethernet MAC 10/100/1000 Universal version 3.70a (and older) and DesignWare(R) Cores Ethernet Quality-of-Service version 4.0 (and upper) have been used for developing this driver as well as -DesignWare(R) Cores XGMAC - 10G Ethernet MAC. +DesignWare(R) Cores XGMAC - 10G Ethernet MAC and DesignWare(R) Cores +Enterprise MAC - 100G Ethernet MAC. This driver supports both the platform bus and PCI. @@ -48,6 +49,8 @@ Cores Ethernet Controllers and corresponding minimum and maximum versions: +-------------------------------+--------------+--------------+--------------+ | XGMAC - 10G Ethernet MAC | 2.10a | N/A | XGMAC2+ | +-------------------------------+--------------+--------------+--------------+ +| XLGMAC - 100G Ethernet MAC | 2.00a | N/A | XLGMAC2+ | ++-------------------------------+--------------+--------------+--------------+ For questions related to hardware requirements, refer to the documentation supplied with your Ethernet adapter. All hardware requirements listed apply @@ -57,7 +60,7 @@ Feature List ============ The following features are available in this driver: - - GMII/MII/RGMII/SGMII/RMII/XGMII Interface + - GMII/MII/RGMII/SGMII/RMII/XGMII/XLGMII Interface - Half-Duplex / Full-Duplex Operation - Energy Efficient Ethernet (EEE) - IEEE 802.3x PAUSE Packets (Flow Control) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index 523bf4be43cc..b19be7549aad 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -300,7 +300,7 @@ static int bnxt_tc_parse_actions(struct bnxt *bp, return -EINVAL; } - if (!flow_action_basic_hw_stats_types_check(flow_action, extack)) + if (!flow_action_basic_hw_stats_check(flow_action, extack)) return -EOPNOTSUPP; flow_action_for_each(i, act, flow_action) { diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c index 2a2938bbb93a..e8852dfcc1f1 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -438,13 +438,118 @@ int cxgb4_get_filter_counters(struct net_device *dev, unsigned int fidx, return get_filter_count(adapter, fidx, hitcnt, bytecnt, hash); } -int cxgb4_get_free_ftid(struct net_device *dev, int family) +static bool cxgb4_filter_prio_in_range(struct tid_info *t, u32 idx, u8 nslots, + u32 prio) +{ + struct filter_entry *prev_tab, *next_tab, *prev_fe, *next_fe; + u32 prev_ftid, next_ftid; + + /* Only insert the rule if both of the following conditions + * are met: + * 1. The immediate previous rule has priority <= @prio. + * 2. The immediate next rule has priority >= @prio. + */ + + /* High Priority (HPFILTER) region always has higher priority + * than normal FILTER region. So, all rules in HPFILTER region + * must have prio value <= rules in normal FILTER region. + */ + if (idx < t->nhpftids) { + /* Don't insert if there's a rule already present at @idx + * in HPFILTER region. 
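The ordering rule enforced throughout this helper is easier to see without the two-region bitmap plumbing. A toy restatement, using a plain array of slots in place of the hpftid/ftid tables (struct toy_filter and toy_prio_in_range() are invented for illustration):

#include <stdbool.h>

struct toy_filter {
        bool valid;
        unsigned int prio;
};

/* A rule with priority @prio may take the empty slot @idx only if the
 * closest valid rule before it has prio <= @prio and the closest valid
 * rule after it has prio >= @prio, i.e. insertion keeps the table
 * sorted by priority.
 */
static bool toy_prio_in_range(const struct toy_filter *tab, int nslots,
                              int idx, unsigned int prio)
{
        int prev, next;

        if (tab[idx].valid)             /* slot already occupied */
                return false;

        for (prev = idx - 1; prev >= 0 && !tab[prev].valid; prev--)
                ;
        for (next = idx + 1; next < nslots && !tab[next].valid; next++)
                ;

        if (prev >= 0 && tab[prev].prio > prio)
                return false;
        if (next < nslots && tab[next].prio < prio)
                return false;

        return true;
}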
+ */ + if (test_bit(idx, t->hpftid_bmap)) + return false; + + next_tab = t->hpftid_tab; + next_ftid = find_next_bit(t->hpftid_bmap, t->nhpftids, idx); + if (next_ftid >= t->nhpftids) { + /* No next entry found in HPFILTER region. + * See if there's any next entry in normal + * FILTER region. + */ + next_ftid = find_first_bit(t->ftid_bmap, t->nftids); + if (next_ftid >= t->nftids) + next_ftid = idx; + else + next_tab = t->ftid_tab; + } + + /* Search for the closest previous filter entry in HPFILTER + * region. No need to search in normal FILTER region because + * there can never be any entry in normal FILTER region whose + * prio value is < last entry in HPFILTER region. + */ + prev_ftid = find_last_bit(t->hpftid_bmap, idx); + if (prev_ftid >= idx) + prev_ftid = idx; + + prev_tab = t->hpftid_tab; + } else { + idx -= t->nhpftids; + + /* Don't insert if there's a rule already present at @idx + * in normal FILTER region. + */ + if (test_bit(idx, t->ftid_bmap)) + return false; + + prev_tab = t->ftid_tab; + prev_ftid = find_last_bit(t->ftid_bmap, idx); + if (prev_ftid >= idx) { + /* No previous entry found in normal FILTER + * region. See if there's any previous entry + * in HPFILTER region. + */ + prev_ftid = find_last_bit(t->hpftid_bmap, t->nhpftids); + if (prev_ftid >= t->nhpftids) + prev_ftid = idx; + else + prev_tab = t->hpftid_tab; + } + + /* Search for the closest next filter entry in normal + * FILTER region. No need to search in HPFILTER region + * because there can never be any entry in HPFILTER + * region whose prio value is > first entry in normal + * FILTER region. + */ + next_ftid = find_next_bit(t->ftid_bmap, t->nftids, idx); + if (next_ftid >= t->nftids) + next_ftid = idx; + + next_tab = t->ftid_tab; + } + + next_fe = &next_tab[next_ftid]; + + /* See if the filter entry belongs to an IPv6 rule, which + * occupy 4 slots on T5 and 2 slots on T6. Adjust the + * reference to the previously inserted filter entry + * accordingly. + */ + prev_fe = &prev_tab[prev_ftid & ~(nslots - 1)]; + if (!prev_fe->fs.type) + prev_fe = &prev_tab[prev_ftid]; + + if ((prev_fe->valid && prev_fe->fs.tc_prio > prio) || + (next_fe->valid && next_fe->fs.tc_prio < prio)) + return false; + + return true; +} + +int cxgb4_get_free_ftid(struct net_device *dev, u8 family, bool hash_en, + u32 tc_prio) { struct adapter *adap = netdev2adap(dev); struct tid_info *t = &adap->tids; + struct filter_entry *tab, *f; + u32 bmap_ftid, max_ftid; + unsigned long *bmap; bool found = false; - u8 i, n, cnt; - int ftid; + u8 i, cnt, n; + int ftid = 0; /* IPv4 occupy 1 slot. IPv6 occupy 2 slots on T6 and 4 slots * on T5. @@ -456,34 +561,129 @@ int cxgb4_get_free_ftid(struct net_device *dev, int family) n += 2; } - if (n > t->nftids) - return -ENOMEM; - - /* Find free filter slots from the end of TCAM. Appropriate - * checks must be done by caller later to ensure the prio - * passed by TC doesn't conflict with prio saved by existing - * rules in the TCAM. + /* There are 3 filter regions available in hardware in + * following order of priority: + * + * 1. High Priority (HPFILTER) region (Highest Priority). + * 2. HASH region. + * 3. Normal FILTER region (Lowest Priority). + * + * Entries in HPFILTER and normal FILTER region have index + * 0 as the highest priority and the rules will be scanned + * in ascending order until either a rule hits or end of + * the region is reached. + * + * All HASH region entries have same priority. The set of + * fields to match in headers are pre-determined. 
The same + * set of header match fields must be compulsorily specified + * in all the rules wanting to get inserted in HASH region. + * Hence, HASH region is an exact-match region. A HASH is + * generated for a rule based on the values in the + * pre-determined set of header match fields. The generated + * HASH serves as an index into the HASH region. There can + * never be 2 rules having the same HASH. Hardware will + * compute a HASH for every incoming packet based on the + * values in the pre-determined set of header match fields + * and uses it as an index to check if there's a rule + * inserted in the HASH region at the specified index. If + * there's a rule inserted, then it's considered as a filter + * hit. Otherwise, it's a filter miss and normal FILTER region + * is scanned afterwards. */ + spin_lock_bh(&t->ftid_lock); - ftid = t->nftids - 1; - while (ftid >= n - 1) { + + ftid = (tc_prio <= t->nhpftids) ? 0 : t->nhpftids; + max_ftid = t->nftids + t->nhpftids; + while (ftid < max_ftid) { + if (ftid < t->nhpftids) { + /* If the new rule wants to get inserted into + * HPFILTER region, but its prio is greater + * than the rule with the highest prio in HASH + * region, then reject the rule. + */ + if (t->tc_hash_tids_max_prio && + tc_prio > t->tc_hash_tids_max_prio) + break; + + /* If there's not enough slots available + * in HPFILTER region, then move on to + * normal FILTER region immediately. + */ + if (ftid + n > t->nhpftids) { + ftid = t->nhpftids; + continue; + } + + bmap = t->hpftid_bmap; + bmap_ftid = ftid; + tab = t->hpftid_tab; + } else if (hash_en) { + /* Ensure priority is >= last rule in HPFILTER + * region. + */ + ftid = find_last_bit(t->hpftid_bmap, t->nhpftids); + if (ftid < t->nhpftids) { + f = &t->hpftid_tab[ftid]; + if (f->valid && tc_prio < f->fs.tc_prio) + break; + } + + /* Ensure priority is <= first rule in normal + * FILTER region. + */ + ftid = find_first_bit(t->ftid_bmap, t->nftids); + if (ftid < t->nftids) { + f = &t->ftid_tab[ftid]; + if (f->valid && tc_prio > f->fs.tc_prio) + break; + } + + found = true; + ftid = t->nhpftids; + goto out_unlock; + } else { + /* If the new rule wants to get inserted into + * normal FILTER region, but its prio is less + * than the rule with the highest prio in HASH + * region, then reject the rule. + */ + if (t->tc_hash_tids_max_prio && + tc_prio < t->tc_hash_tids_max_prio) + break; + + if (ftid + n > max_ftid) + break; + + bmap = t->ftid_bmap; + bmap_ftid = ftid - t->nhpftids; + tab = t->ftid_tab; + } + cnt = 0; for (i = 0; i < n; i++) { - if (test_bit(ftid - i, t->ftid_bmap)) + if (test_bit(bmap_ftid + i, bmap)) break; cnt++; } + if (cnt == n) { - ftid &= ~(n - 1); - found = true; - break; + /* Ensure the new rule's prio doesn't conflict + * with existing rules. + */ + if (cxgb4_filter_prio_in_range(t, ftid, n, + tc_prio)) { + ftid &= ~(n - 1); + found = true; + break; + } } - ftid -= n; + ftid += n; } - spin_unlock_bh(&t->ftid_lock); - ftid += t->nhpftids; +out_unlock: + spin_unlock_bh(&t->ftid_lock); return found ? 
ftid : -ENOMEM; } @@ -555,73 +755,6 @@ static void cxgb4_clear_hpftid(struct tid_info *t, int fidx, int family) spin_unlock_bh(&t->ftid_lock); } -bool cxgb4_filter_prio_in_range(struct net_device *dev, u32 idx, u32 prio) -{ - struct filter_entry *prev_fe, *next_fe, *tab; - struct adapter *adap = netdev2adap(dev); - u32 prev_ftid, next_ftid, max_tid; - struct tid_info *t = &adap->tids; - unsigned long *bmap; - bool valid = true; - - if (idx < t->nhpftids) { - bmap = t->hpftid_bmap; - tab = t->hpftid_tab; - max_tid = t->nhpftids; - } else { - idx -= t->nhpftids; - bmap = t->ftid_bmap; - tab = t->ftid_tab; - max_tid = t->nftids; - } - - /* Only insert the rule if both of the following conditions - * are met: - * 1. The immediate previous rule has priority <= @prio. - * 2. The immediate next rule has priority >= @prio. - */ - spin_lock_bh(&t->ftid_lock); - - /* Don't insert if there's a rule already present at @idx. */ - if (test_bit(idx, bmap)) { - valid = false; - goto out_unlock; - } - - next_ftid = find_next_bit(bmap, max_tid, idx); - if (next_ftid >= max_tid) - next_ftid = idx; - - next_fe = &tab[next_ftid]; - - prev_ftid = find_last_bit(bmap, idx); - if (prev_ftid >= idx) - prev_ftid = idx; - - /* See if the filter entry belongs to an IPv6 rule, which - * occupy 4 slots on T5 and 2 slots on T6. Adjust the - * reference to the previously inserted filter entry - * accordingly. - */ - if (CHELSIO_CHIP_VERSION(adap->params.chip) < CHELSIO_T6) { - prev_fe = &tab[prev_ftid & ~0x3]; - if (!prev_fe->fs.type) - prev_fe = &tab[prev_ftid]; - } else { - prev_fe = &tab[prev_ftid & ~0x1]; - if (!prev_fe->fs.type) - prev_fe = &tab[prev_ftid]; - } - - if ((prev_fe->valid && prio < prev_fe->fs.tc_prio) || - (next_fe->valid && prio > next_fe->fs.tc_prio)) - valid = false; - -out_unlock: - spin_unlock_bh(&t->ftid_lock); - return valid; -} - /* Delete the filter at a specified index. 
*/ static int del_filter_wr(struct adapter *adapter, int fidx) { diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h index b3e4a645043d..b0751c0611ec 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h @@ -53,5 +53,4 @@ void clear_all_filters(struct adapter *adapter); void init_hash_filter(struct adapter *adap); bool is_filter_exact_match(struct adapter *adap, struct ch_filter_specification *fs); -bool cxgb4_filter_prio_in_range(struct net_device *dev, u32 idx, u32 prio); #endif /* __CXGB4_FILTER_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index cc46277e98de..aec9b90313e7 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -553,7 +553,7 @@ int cxgb4_validate_flow_actions(struct net_device *dev, bool act_vlan = false; int i; - if (!flow_action_basic_hw_stats_types_check(actions, extack)) + if (!flow_action_basic_hw_stats_check(actions, extack)) return -EOPNOTSUPP; flow_action_for_each(i, act, actions) { @@ -635,6 +635,64 @@ int cxgb4_validate_flow_actions(struct net_device *dev, return 0; } +static void cxgb4_tc_flower_hash_prio_add(struct adapter *adap, u32 tc_prio) +{ + spin_lock_bh(&adap->tids.ftid_lock); + if (adap->tids.tc_hash_tids_max_prio < tc_prio) + adap->tids.tc_hash_tids_max_prio = tc_prio; + spin_unlock_bh(&adap->tids.ftid_lock); +} + +static void cxgb4_tc_flower_hash_prio_del(struct adapter *adap, u32 tc_prio) +{ + struct tid_info *t = &adap->tids; + struct ch_tc_flower_entry *fe; + struct rhashtable_iter iter; + u32 found = 0; + + spin_lock_bh(&t->ftid_lock); + /* Bail if the current rule is not the one with the max + * prio. + */ + if (t->tc_hash_tids_max_prio != tc_prio) + goto out_unlock; + + /* Search for the next rule having the same or next lower + * max prio. + */ + rhashtable_walk_enter(&adap->flower_tbl, &iter); + do { + rhashtable_walk_start(&iter); + + fe = rhashtable_walk_next(&iter); + while (!IS_ERR_OR_NULL(fe)) { + if (fe->fs.hash && + fe->fs.tc_prio <= t->tc_hash_tids_max_prio) { + t->tc_hash_tids_max_prio = fe->fs.tc_prio; + found++; + + /* Bail if we found another rule + * having the same prio as the + * current max one. + */ + if (fe->fs.tc_prio == tc_prio) + break; + } + + fe = rhashtable_walk_next(&iter); + } + + rhashtable_walk_stop(&iter); + } while (fe == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); + + if (!found) + t->tc_hash_tids_max_prio = 0; + +out_unlock: + spin_unlock_bh(&t->ftid_lock); +} + int cxgb4_tc_flower_replace(struct net_device *dev, struct flow_cls_offload *cls) { @@ -644,6 +702,7 @@ int cxgb4_tc_flower_replace(struct net_device *dev, struct ch_tc_flower_entry *ch_flower; struct ch_filter_specification *fs; struct filter_ctx ctx; + u8 inet_family; int fidx, ret; if (cxgb4_validate_flow_actions(dev, &rule->action, extack)) @@ -664,39 +723,32 @@ int cxgb4_tc_flower_replace(struct net_device *dev, cxgb4_process_flow_actions(dev, &rule->action, fs); fs->hash = is_filter_exact_match(adap, fs); - if (fs->hash) { - fidx = 0; - } else { - u8 inet_family; + inet_family = fs->type ? PF_INET6 : PF_INET; - inet_family = fs->type ? PF_INET6 : PF_INET; - - /* Note that TC uses prio 0 to indicate stack to - * generate automatic prio and hence doesn't pass prio - * 0 to driver. However, the hardware TCAM index - * starts from 0. Hence, the -1 here. 
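The hash-priority bookkeeping above (cxgb4_tc_flower_hash_prio_del()) leans on the standard rhashtable walking contract; stripped of the driver specifics it reduces to the sketch below. The -EAGAIN retry loop is mandatory because the table may be resized while it is being traversed:

#include <linux/err.h>
#include <linux/rhashtable.h>

static void toy_walk(struct rhashtable *ht)
{
        struct rhashtable_iter iter;
        void *obj;

        rhashtable_walk_enter(ht, &iter);
        do {
                rhashtable_walk_start(&iter);

                while ((obj = rhashtable_walk_next(&iter)) != NULL) {
                        if (IS_ERR(obj))
                                break;  /* -EAGAIN: resize in flight */
                        /* inspect obj here; sleeping is not allowed */
                }

                rhashtable_walk_stop(&iter);
        } while (obj == ERR_PTR(-EAGAIN));
        rhashtable_walk_exit(&iter);
}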
- */ - if (cls->common.prio <= (adap->tids.nftids + - adap->tids.nhpftids)) { - fidx = cls->common.prio - 1; - if (fidx < adap->tids.nhpftids) - fs->prio = 1; - } else { - fidx = cxgb4_get_free_ftid(dev, inet_family); - } + /* Get a free filter entry TID, where we can insert this new + * rule. Only insert rule if its prio doesn't conflict with + * existing rules. + */ + fidx = cxgb4_get_free_ftid(dev, inet_family, fs->hash, + cls->common.prio); + if (fidx < 0) { + NL_SET_ERR_MSG_MOD(extack, + "No free LETCAM index available"); + ret = -ENOMEM; + goto free_entry; + } - /* Only insert FLOWER rule if its priority doesn't - * conflict with existing rules in the LETCAM. - */ - if (fidx < 0 || - !cxgb4_filter_prio_in_range(dev, fidx, cls->common.prio)) { - NL_SET_ERR_MSG_MOD(extack, - "No free LETCAM index available"); - ret = -ENOMEM; - goto free_entry; - } + if (fidx < adap->tids.nhpftids) { + fs->prio = 1; + fs->hash = 0; } + /* If the rule can be inserted into HASH region, then ignore + * the index to normal FILTER region. + */ + if (fs->hash) + fidx = 0; + fs->tc_prio = cls->common.prio; fs->tc_cookie = cls->cookie; @@ -727,6 +779,9 @@ int cxgb4_tc_flower_replace(struct net_device *dev, if (ret) goto del_filter; + if (fs->hash) + cxgb4_tc_flower_hash_prio_add(adap, cls->common.prio); + return 0; del_filter: @@ -742,12 +797,17 @@ int cxgb4_tc_flower_destroy(struct net_device *dev, { struct adapter *adap = netdev2adap(dev); struct ch_tc_flower_entry *ch_flower; + u32 tc_prio; + bool hash; int ret; ch_flower = ch_flower_lookup(adap, cls->cookie); if (!ch_flower) return -ENOENT; + hash = ch_flower->fs.hash; + tc_prio = ch_flower->fs.tc_prio; + ret = cxgb4_del_filter(dev, ch_flower->filter_id, &ch_flower->fs); if (ret) goto err; @@ -760,6 +820,9 @@ int cxgb4_tc_flower_destroy(struct net_device *dev, } kfree_rcu(ch_flower, rcu); + if (hash) + cxgb4_tc_flower_hash_prio_del(adap, tc_prio); + err: return ret; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c index d80dee4d316d..8a5ae8bc9b7d 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c @@ -198,22 +198,14 @@ static int cxgb4_matchall_alloc_filter(struct net_device *dev, struct ch_filter_specification *fs; int ret, fidx; - /* Note that TC uses prio 0 to indicate stack to generate - * automatic prio and hence doesn't pass prio 0 to driver. - * However, the hardware TCAM index starts from 0. Hence, the - * -1 here. 1 slot is enough to create a wildcard matchall - * VIID rule. + /* Get a free filter entry TID, where we can insert this new + * rule. Only insert rule if its prio doesn't conflict with + * existing rules. + * + * 1 slot is enough to create a wildcard matchall VIID rule. */ - if (cls->common.prio <= (adap->tids.nftids + adap->tids.nhpftids)) - fidx = cls->common.prio - 1; - else - fidx = cxgb4_get_free_ftid(dev, PF_INET); - - /* Only insert MATCHALL rule if its priority doesn't conflict - * with existing rules in the LETCAM. 
- */ - if (fidx < 0 || - !cxgb4_filter_prio_in_range(dev, fidx, cls->common.prio)) { + fidx = cxgb4_get_free_ftid(dev, PF_INET, false, cls->common.prio); + if (fidx < 0) { NL_SET_ERR_MSG_MOD(extack, "No free LETCAM index available"); return -ENOMEM; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c index 269b8d9e25e0..3f3c11e54d97 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c @@ -155,9 +155,10 @@ int cxgb4_config_knode(struct net_device *dev, struct tc_cls_u32_offload *cls) struct ch_filter_specification fs; struct cxgb4_tc_u32_table *t; struct cxgb4_link *link; - unsigned int filter_id; u32 uhtid, link_uhtid; bool is_ipv6 = false; + u8 inet_family; + int filter_id; int ret; if (!can_tc_u32_offload(dev)) @@ -166,18 +167,15 @@ int cxgb4_config_knode(struct net_device *dev, struct tc_cls_u32_offload *cls) if (protocol != htons(ETH_P_IP) && protocol != htons(ETH_P_IPV6)) return -EOPNOTSUPP; - /* Note that TC uses prio 0 to indicate stack to generate - * automatic prio and hence doesn't pass prio 0 to driver. - * However, the hardware TCAM index starts from 0. Hence, the - * -1 here. - */ - filter_id = TC_U32_NODE(cls->knode.handle) - 1; + inet_family = (protocol == htons(ETH_P_IPV6)) ? PF_INET6 : PF_INET; - /* Only insert U32 rule if its priority doesn't conflict with - * existing rules in the LETCAM. + /* Get a free filter entry TID, where we can insert this new + * rule. Only insert rule if its prio doesn't conflict with + * existing rules. */ - if (filter_id >= adapter->tids.nftids + adapter->tids.nhpftids || - !cxgb4_filter_prio_in_range(dev, filter_id, cls->common.prio)) { + filter_id = cxgb4_get_free_ftid(dev, inet_family, false, + TC_U32_NODE(cls->knode.handle)); + if (filter_id < 0) { NL_SET_ERR_MSG_MOD(extack, "No free LETCAM index available"); return -ENOMEM; @@ -358,23 +356,65 @@ int cxgb4_delete_knode(struct net_device *dev, struct tc_cls_u32_offload *cls) struct cxgb4_link *link = NULL; struct cxgb4_tc_u32_table *t; struct filter_entry *f; + bool found = false; u32 handle, uhtid; + u8 nslots; int ret; if (!can_tc_u32_offload(dev)) return -EOPNOTSUPP; /* Fetch the location to delete the filter. 
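The u32 delete path below stops trusting TC_U32_NODE(handle) as a direct TCAM index and instead scans the occupied slots for the rule whose stored cookie matches. Reduced to a sketch (struct toy_entry is invented; the real code also has to skip the extra slots IPv6 rules occupy):

#include <linux/bitmap.h>
#include <linux/errno.h>
#include <linux/types.h>

struct toy_entry {
        bool valid;
        u64 tc_cookie;
};

static int toy_find_by_cookie(const unsigned long *bmap, unsigned int nbits,
                              const struct toy_entry *tab, u64 cookie)
{
        unsigned int i;

        /* walk only the slots marked used in the allocation bitmap */
        for_each_set_bit(i, bmap, nbits) {
                if (tab[i].valid && tab[i].tc_cookie == cookie)
                        return i;
        }

        return -ENOENT;
}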
*/ - filter_id = TC_U32_NODE(cls->knode.handle) - 1; - if (filter_id >= adapter->tids.nftids + adapter->tids.nhpftids) - return -ERANGE; + max_tids = adapter->tids.nhpftids + adapter->tids.nftids; + + spin_lock_bh(&adapter->tids.ftid_lock); + filter_id = 0; + while (filter_id < max_tids) { + if (filter_id < adapter->tids.nhpftids) { + i = filter_id; + f = &adapter->tids.hpftid_tab[i]; + if (f->valid && f->fs.tc_cookie == cls->knode.handle) { + found = true; + break; + } - if (filter_id < adapter->tids.nhpftids) - f = &adapter->tids.hpftid_tab[filter_id]; - else - f = &adapter->tids.ftid_tab[filter_id - adapter->tids.nhpftids]; + i = find_next_bit(adapter->tids.hpftid_bmap, + adapter->tids.nhpftids, i + 1); + if (i >= adapter->tids.nhpftids) { + filter_id = adapter->tids.nhpftids; + continue; + } + + filter_id = i; + } else { + i = filter_id - adapter->tids.nhpftids; + f = &adapter->tids.ftid_tab[i]; + if (f->valid && f->fs.tc_cookie == cls->knode.handle) { + found = true; + break; + } + + i = find_next_bit(adapter->tids.ftid_bmap, + adapter->tids.nftids, i + 1); + if (i >= adapter->tids.nftids) + break; + + filter_id = i + adapter->tids.nhpftids; + } + + nslots = 0; + if (f->fs.type) { + nslots++; + if (CHELSIO_CHIP_VERSION(adapter->params.chip) < + CHELSIO_T6) + nslots += 2; + } + + filter_id += nslots; + } + spin_unlock_bh(&adapter->tids.ftid_lock); - if (cls->knode.handle != f->fs.tc_cookie) + if (!found) return -ERANGE; t = adapter->tc_u32; @@ -407,7 +447,6 @@ int cxgb4_delete_knode(struct net_device *dev, struct tc_cls_u32_offload *cls) /* If a link is being deleted, then delete all filters * associated with the link. */ - max_tids = adapter->tids.nftids; for (i = 0; i < t->size; i++) { link = &t->table[i]; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h index 03b9bdc812cc..be831317520a 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -149,6 +149,8 @@ struct tid_info { atomic_t conns_in_use; /* lock for setting/clearing filter bitmap */ spinlock_t ftid_lock; + + unsigned int tc_hash_tids_max_prio; }; static inline void *lookup_tid(const struct tid_info *t, unsigned int tid) @@ -263,7 +265,8 @@ struct filter_ctx { struct ch_filter_specification; -int cxgb4_get_free_ftid(struct net_device *dev, int family); +int cxgb4_get_free_ftid(struct net_device *dev, u8 family, bool hash_en, + u32 tc_prio); int __cxgb4_set_filter(struct net_device *dev, int filter_id, struct ch_filter_specification *fs, struct filter_ctx *ctx); diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c index 0a0c6ec2336c..8972cdd559e8 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c @@ -1082,7 +1082,7 @@ static int mvpp2_port_c2_tcam_rule_add(struct mvpp2_port *port, u8 qh, ql, pmap; int index, ctx; - if (!flow_action_basic_hw_stats_types_check(&rule->flow->action, NULL)) + if (!flow_action_basic_hw_stats_check(&rule->flow->action, NULL)) return -EOPNOTSUPP; memset(&c2, 0, sizeof(c2)); @@ -1308,7 +1308,7 @@ static int mvpp2_cls_rfs_parse_rule(struct mvpp2_rfs_rule *rule) struct flow_rule *flow = rule->flow; struct flow_action_entry *act; - if (!flow_action_basic_hw_stats_types_check(&rule->flow->action, NULL)) + if (!flow_action_basic_hw_stats_check(&rule->flow->action, NULL)) return -EOPNOTSUPP; act = &flow->action.entries[0]; diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index ebf60ff30295..901f88a886c8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3167,8 +3167,8 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, if (!flow_action_has_entries(flow_action)) return -EINVAL; - if (!flow_action_hw_stats_types_check(flow_action, extack, - FLOW_ACTION_HW_STATS_TYPE_DELAYED_BIT)) + if (!flow_action_hw_stats_check(flow_action, extack, + FLOW_ACTION_HW_STATS_DELAYED_BIT)) return -EOPNOTSUPP; attr->flow_tag = MLX5_FS_DEFAULT_FLOW_TAG; @@ -3702,8 +3702,8 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, if (!flow_action_has_entries(flow_action)) return -EINVAL; - if (!flow_action_hw_stats_types_check(flow_action, extack, - FLOW_ACTION_HW_STATS_TYPE_DELAYED_BIT)) + if (!flow_action_hw_stats_check(flow_action, extack, + FLOW_ACTION_HW_STATS_DELAYED_BIT)) return -EOPNOTSUPP; flow_action_for_each(i, act, flow_action) { @@ -4524,7 +4524,7 @@ static int scan_tc_matchall_fdb_actions(struct mlx5e_priv *priv, return -EOPNOTSUPP; } - if (!flow_action_basic_hw_stats_types_check(flow_action, extack)) + if (!flow_action_basic_hw_stats_check(flow_action, extack)) return -EOPNOTSUPP; flow_action_for_each(i, act, flow_action) { diff --git a/drivers/net/ethernet/mellanox/mlxsw/resources.h b/drivers/net/ethernet/mellanox/mlxsw/resources.h index 6534184cb942..d62496ef299c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/resources.h +++ b/drivers/net/ethernet/mellanox/mlxsw/resources.h @@ -18,6 +18,7 @@ enum mlxsw_res_id { MLXSW_RES_ID_CQE_V1, MLXSW_RES_ID_CQE_V2, MLXSW_RES_ID_COUNTER_POOL_SIZE, + MLXSW_RES_ID_COUNTER_BANK_SIZE, MLXSW_RES_ID_MAX_SPAN, MLXSW_RES_ID_COUNTER_SIZE_PACKETS_BYTES, MLXSW_RES_ID_COUNTER_SIZE_ROUTER_BASIC, @@ -75,6 +76,7 @@ static u16 mlxsw_res_ids[] = { [MLXSW_RES_ID_CQE_V1] = 0x2211, [MLXSW_RES_ID_CQE_V2] = 0x2212, [MLXSW_RES_ID_COUNTER_POOL_SIZE] = 0x2410, + [MLXSW_RES_ID_COUNTER_BANK_SIZE] = 0x2411, [MLXSW_RES_ID_MAX_SPAN] = 0x2420, [MLXSW_RES_ID_COUNTER_SIZE_PACKETS_BYTES] = 0x2443, [MLXSW_RES_ID_COUNTER_SIZE_ROUTER_BASIC] = 0x2449, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 51709012593e..35d3a68ef4fd 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -5421,8 +5421,13 @@ static int mlxsw_sp1_resources_register(struct mlxsw_core *mlxsw_core) if (err) goto err_resources_span_register; + err = mlxsw_sp_counter_resources_register(mlxsw_core); + if (err) + goto err_resources_counter_register; + return 0; +err_resources_counter_register: err_resources_span_register: devlink_resources_unregister(priv_to_devlink(mlxsw_core), NULL); return err; @@ -5440,8 +5445,13 @@ static int mlxsw_sp2_resources_register(struct mlxsw_core *mlxsw_core) if (err) goto err_resources_span_register; + err = mlxsw_sp_counter_resources_register(mlxsw_core); + if (err) + goto err_resources_counter_register; + return 0; +err_resources_counter_register: err_resources_span_register: devlink_resources_unregister(priv_to_devlink(mlxsw_core), NULL); return err; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 81801c6fb941..57d8c95e4f9f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -46,6 +46,10 @@ #define MLXSW_SP_RESOURCE_NAME_SPAN 
"span_agents" +#define MLXSW_SP_RESOURCE_NAME_COUNTERS "counters" +#define MLXSW_SP_RESOURCE_NAME_COUNTERS_FLOW "flow" +#define MLXSW_SP_RESOURCE_NAME_COUNTERS_RIF "rif" + enum mlxsw_sp_resource_id { MLXSW_SP_RESOURCE_KVD = 1, MLXSW_SP_RESOURCE_KVD_LINEAR, @@ -55,6 +59,9 @@ enum mlxsw_sp_resource_id { MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS, MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS, MLXSW_SP_RESOURCE_SPAN, + MLXSW_SP_RESOURCE_COUNTERS, + MLXSW_SP_RESOURCE_COUNTERS_FLOW, + MLXSW_SP_RESOURCE_COUNTERS_RIF, }; struct mlxsw_sp_port; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c index 6a02ef9ec00e..0268f0a6662a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c @@ -7,91 +7,143 @@ #include "spectrum_cnt.h" -#define MLXSW_SP_COUNTER_POOL_BANK_SIZE 4096 - struct mlxsw_sp_counter_sub_pool { + u64 size; unsigned int base_index; - unsigned int size; + enum mlxsw_res_id entry_size_res_id; + const char *resource_name; /* devlink resource name */ + u64 resource_id; /* devlink resource id */ unsigned int entry_size; unsigned int bank_count; + atomic_t active_entries_count; }; struct mlxsw_sp_counter_pool { - unsigned int pool_size; + u64 pool_size; unsigned long *usage; /* Usage bitmap */ spinlock_t counter_pool_lock; /* Protects counter pool allocations */ - struct mlxsw_sp_counter_sub_pool *sub_pools; + atomic_t active_entries_count; + unsigned int sub_pools_count; + struct mlxsw_sp_counter_sub_pool sub_pools[]; }; -static struct mlxsw_sp_counter_sub_pool mlxsw_sp_counter_sub_pools[] = { +static const struct mlxsw_sp_counter_sub_pool mlxsw_sp_counter_sub_pools[] = { [MLXSW_SP_COUNTER_SUB_POOL_FLOW] = { + .entry_size_res_id = MLXSW_RES_ID_COUNTER_SIZE_PACKETS_BYTES, + .resource_name = MLXSW_SP_RESOURCE_NAME_COUNTERS_FLOW, + .resource_id = MLXSW_SP_RESOURCE_COUNTERS_FLOW, .bank_count = 6, }, [MLXSW_SP_COUNTER_SUB_POOL_RIF] = { + .entry_size_res_id = MLXSW_RES_ID_COUNTER_SIZE_ROUTER_BASIC, + .resource_name = MLXSW_SP_RESOURCE_NAME_COUNTERS_RIF, + .resource_id = MLXSW_SP_RESOURCE_COUNTERS_RIF, .bank_count = 2, } }; -static int mlxsw_sp_counter_pool_validate(struct mlxsw_sp *mlxsw_sp) +static u64 mlxsw_sp_counter_sub_pool_occ_get(void *priv) +{ + const struct mlxsw_sp_counter_sub_pool *sub_pool = priv; + + return atomic_read(&sub_pool->active_entries_count); +} + +static int mlxsw_sp_counter_sub_pools_init(struct mlxsw_sp *mlxsw_sp) { - unsigned int total_bank_config = 0; - unsigned int pool_size; + struct mlxsw_sp_counter_pool *pool = mlxsw_sp->counter_pool; + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + struct mlxsw_sp_counter_sub_pool *sub_pool; + unsigned int base_index = 0; + enum mlxsw_res_id res_id; + int err; int i; - pool_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, COUNTER_POOL_SIZE); - /* Check config is valid, no bank over subscription */ - for (i = 0; i < ARRAY_SIZE(mlxsw_sp_counter_sub_pools); i++) - total_bank_config += mlxsw_sp_counter_sub_pools[i].bank_count; - if (total_bank_config > pool_size / MLXSW_SP_COUNTER_POOL_BANK_SIZE + 1) - return -EINVAL; + for (i = 0; i < pool->sub_pools_count; i++) { + sub_pool = &pool->sub_pools[i]; + res_id = sub_pool->entry_size_res_id; + + if (!mlxsw_core_res_valid(mlxsw_sp->core, res_id)) + return -EIO; + sub_pool->entry_size = mlxsw_core_res_get(mlxsw_sp->core, + res_id); + err = devlink_resource_size_get(devlink, + sub_pool->resource_id, + &sub_pool->size); + if (err) + goto err_resource_size_get; + + 
devlink_resource_occ_get_register(devlink, + sub_pool->resource_id, + mlxsw_sp_counter_sub_pool_occ_get, + sub_pool); + + sub_pool->base_index = base_index; + base_index += sub_pool->size; + atomic_set(&sub_pool->active_entries_count, 0); + } return 0; + +err_resource_size_get: + for (i--; i >= 0; i--) { + sub_pool = &pool->sub_pools[i]; + + devlink_resource_occ_get_unregister(devlink, + sub_pool->resource_id); + } + return err; } -static int mlxsw_sp_counter_sub_pools_prepare(struct mlxsw_sp *mlxsw_sp) +static void mlxsw_sp_counter_sub_pools_fini(struct mlxsw_sp *mlxsw_sp) { + struct mlxsw_sp_counter_pool *pool = mlxsw_sp->counter_pool; + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); struct mlxsw_sp_counter_sub_pool *sub_pool; + int i; - /* Prepare generic flow pool*/ - sub_pool = &mlxsw_sp_counter_sub_pools[MLXSW_SP_COUNTER_SUB_POOL_FLOW]; - if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, COUNTER_SIZE_PACKETS_BYTES)) - return -EIO; - sub_pool->entry_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, - COUNTER_SIZE_PACKETS_BYTES); - /* Prepare erif pool*/ - sub_pool = &mlxsw_sp_counter_sub_pools[MLXSW_SP_COUNTER_SUB_POOL_RIF]; - if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, COUNTER_SIZE_ROUTER_BASIC)) - return -EIO; - sub_pool->entry_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, - COUNTER_SIZE_ROUTER_BASIC); - return 0; + for (i = 0; i < pool->sub_pools_count; i++) { + sub_pool = &pool->sub_pools[i]; + + WARN_ON(atomic_read(&sub_pool->active_entries_count)); + devlink_resource_occ_get_unregister(devlink, + sub_pool->resource_id); + } +} + +static u64 mlxsw_sp_counter_pool_occ_get(void *priv) +{ + const struct mlxsw_sp_counter_pool *pool = priv; + + return atomic_read(&pool->active_entries_count); } int mlxsw_sp_counter_pool_init(struct mlxsw_sp *mlxsw_sp) { + unsigned int sub_pools_count = ARRAY_SIZE(mlxsw_sp_counter_sub_pools); + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); struct mlxsw_sp_counter_sub_pool *sub_pool; struct mlxsw_sp_counter_pool *pool; - unsigned int base_index; unsigned int map_size; - int i; int err; - if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, COUNTER_POOL_SIZE)) - return -EIO; - - err = mlxsw_sp_counter_pool_validate(mlxsw_sp); - if (err) - return err; - - err = mlxsw_sp_counter_sub_pools_prepare(mlxsw_sp); - if (err) - return err; - - pool = kzalloc(sizeof(*pool), GFP_KERNEL); + pool = kzalloc(struct_size(pool, sub_pools, sub_pools_count), + GFP_KERNEL); if (!pool) return -ENOMEM; + mlxsw_sp->counter_pool = pool; + memcpy(pool->sub_pools, mlxsw_sp_counter_sub_pools, + sub_pools_count * sizeof(*sub_pool)); + pool->sub_pools_count = sub_pools_count; spin_lock_init(&pool->counter_pool_lock); + atomic_set(&pool->active_entries_count, 0); + + err = devlink_resource_size_get(devlink, MLXSW_SP_RESOURCE_COUNTERS, + &pool->pool_size); + if (err) + goto err_pool_resource_size_get; + devlink_resource_occ_get_register(devlink, MLXSW_SP_RESOURCE_COUNTERS, + mlxsw_sp_counter_pool_occ_get, pool); - pool->pool_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, COUNTER_POOL_SIZE); map_size = BITS_TO_LONGS(pool->pool_size) * sizeof(unsigned long); pool->usage = kzalloc(map_size, GFP_KERNEL); @@ -100,26 +152,18 @@ int mlxsw_sp_counter_pool_init(struct mlxsw_sp *mlxsw_sp) goto err_usage_alloc; } - pool->sub_pools = mlxsw_sp_counter_sub_pools; - /* Allocation is based on bank count which should be - * specified for each sub pool statically. 
- */ - base_index = 0; - for (i = 0; i < ARRAY_SIZE(mlxsw_sp_counter_sub_pools); i++) { - sub_pool = &pool->sub_pools[i]; - sub_pool->size = sub_pool->bank_count * - MLXSW_SP_COUNTER_POOL_BANK_SIZE; - sub_pool->base_index = base_index; - base_index += sub_pool->size; - /* The last bank can't be fully used */ - if (sub_pool->base_index + sub_pool->size > pool->pool_size) - sub_pool->size = pool->pool_size - sub_pool->base_index; - } + err = mlxsw_sp_counter_sub_pools_init(mlxsw_sp); + if (err) + goto err_sub_pools_init; - mlxsw_sp->counter_pool = pool; return 0; +err_sub_pools_init: + kfree(pool->usage); err_usage_alloc: + devlink_resource_occ_get_unregister(devlink, + MLXSW_SP_RESOURCE_COUNTERS); +err_pool_resource_size_get: kfree(pool); return err; } @@ -127,10 +171,15 @@ err_usage_alloc: void mlxsw_sp_counter_pool_fini(struct mlxsw_sp *mlxsw_sp) { struct mlxsw_sp_counter_pool *pool = mlxsw_sp->counter_pool; + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + mlxsw_sp_counter_sub_pools_fini(mlxsw_sp); WARN_ON(find_first_bit(pool->usage, pool->pool_size) != pool->pool_size); + WARN_ON(atomic_read(&pool->active_entries_count)); kfree(pool->usage); + devlink_resource_occ_get_unregister(devlink, + MLXSW_SP_RESOURCE_COUNTERS); kfree(pool); } @@ -144,7 +193,7 @@ int mlxsw_sp_counter_alloc(struct mlxsw_sp *mlxsw_sp, unsigned int stop_index; int i, err; - sub_pool = &mlxsw_sp_counter_sub_pools[sub_pool_id]; + sub_pool = &pool->sub_pools[sub_pool_id]; stop_index = sub_pool->base_index + sub_pool->size; entry_index = sub_pool->base_index; @@ -166,6 +215,8 @@ int mlxsw_sp_counter_alloc(struct mlxsw_sp *mlxsw_sp, spin_unlock(&pool->counter_pool_lock); *p_counter_index = entry_index; + atomic_add(sub_pool->entry_size, &sub_pool->active_entries_count); + atomic_add(sub_pool->entry_size, &pool->active_entries_count); return 0; err_alloc: @@ -183,9 +234,77 @@ void mlxsw_sp_counter_free(struct mlxsw_sp *mlxsw_sp, if (WARN_ON(counter_index >= pool->pool_size)) return; - sub_pool = &mlxsw_sp_counter_sub_pools[sub_pool_id]; + sub_pool = &pool->sub_pools[sub_pool_id]; spin_lock(&pool->counter_pool_lock); for (i = 0; i < sub_pool->entry_size; i++) __clear_bit(counter_index + i, pool->usage); spin_unlock(&pool->counter_pool_lock); + atomic_sub(sub_pool->entry_size, &sub_pool->active_entries_count); + atomic_sub(sub_pool->entry_size, &pool->active_entries_count); +} + +int mlxsw_sp_counter_resources_register(struct mlxsw_core *mlxsw_core) +{ + static struct devlink_resource_size_params size_params; + struct devlink *devlink = priv_to_devlink(mlxsw_core); + const struct mlxsw_sp_counter_sub_pool *sub_pool; + unsigned int total_bank_config; + u64 sub_pool_size; + u64 base_index; + u64 pool_size; + u64 bank_size; + int err; + int i; + + if (!MLXSW_CORE_RES_VALID(mlxsw_core, COUNTER_POOL_SIZE) || + !MLXSW_CORE_RES_VALID(mlxsw_core, COUNTER_BANK_SIZE)) + return -EIO; + + pool_size = MLXSW_CORE_RES_GET(mlxsw_core, COUNTER_POOL_SIZE); + bank_size = MLXSW_CORE_RES_GET(mlxsw_core, COUNTER_BANK_SIZE); + + devlink_resource_size_params_init(&size_params, pool_size, + pool_size, bank_size, + DEVLINK_RESOURCE_UNIT_ENTRY); + err = devlink_resource_register(devlink, + MLXSW_SP_RESOURCE_NAME_COUNTERS, + pool_size, + MLXSW_SP_RESOURCE_COUNTERS, + DEVLINK_RESOURCE_ID_PARENT_TOP, + &size_params); + if (err) + return err; + + /* Allocation is based on bank count which should be + * specified for each sub pool statically. 
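The devlink plumbing used throughout this function follows one fixed recipe: register a resource whose size cannot be changed (min == max), then attach an occupancy callback so "devlink resource show" reports live usage. A condensed sketch; MY_RESOURCE_COUNTERS and the my_* names are placeholders, while the devlink_* calls are the real API:

#include <linux/atomic.h>
#include <net/devlink.h>

#define MY_RESOURCE_COUNTERS 1 /* placeholder resource id */

static u64 my_occ_get(void *priv)
{
        atomic_t *in_use = priv;

        return atomic_read(in_use);
}

static int my_register_counters(struct devlink *devlink, u64 pool_size,
                                u64 bank_size, atomic_t *in_use)
{
        struct devlink_resource_size_params params;
        int err;

        /* fixed size (min == max), granularity of one counter bank */
        devlink_resource_size_params_init(&params, pool_size, pool_size,
                                          bank_size,
                                          DEVLINK_RESOURCE_UNIT_ENTRY);

        err = devlink_resource_register(devlink, "counters", pool_size,
                                        MY_RESOURCE_COUNTERS,
                                        DEVLINK_RESOURCE_ID_PARENT_TOP,
                                        &params);
        if (err)
                return err;

        devlink_resource_occ_get_register(devlink, MY_RESOURCE_COUNTERS,
                                          my_occ_get, in_use);
        return 0;
}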
+ */ + total_bank_config = 0; + base_index = 0; + for (i = 0; i < ARRAY_SIZE(mlxsw_sp_counter_sub_pools); i++) { + sub_pool = &mlxsw_sp_counter_sub_pools[i]; + sub_pool_size = sub_pool->bank_count * bank_size; + /* The last bank can't be fully used */ + if (base_index + sub_pool_size > pool_size) + sub_pool_size = pool_size - base_index; + base_index += sub_pool_size; + + devlink_resource_size_params_init(&size_params, sub_pool_size, + sub_pool_size, bank_size, + DEVLINK_RESOURCE_UNIT_ENTRY); + err = devlink_resource_register(devlink, + sub_pool->resource_name, + sub_pool_size, + sub_pool->resource_id, + MLXSW_SP_RESOURCE_COUNTERS, + &size_params); + if (err) + return err; + total_bank_config += sub_pool->bank_count; + } + + /* Check config is valid, no bank over subscription */ + if (WARN_ON(total_bank_config > pool_size / bank_size + 1)) + return -EINVAL; + + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h index 81465e267b10..a68d931090dd 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h @@ -4,6 +4,7 @@ #ifndef _MLXSW_SPECTRUM_CNT_H #define _MLXSW_SPECTRUM_CNT_H +#include "core.h" #include "spectrum.h" enum mlxsw_sp_counter_sub_pool_id { @@ -19,5 +20,6 @@ void mlxsw_sp_counter_free(struct mlxsw_sp *mlxsw_sp, unsigned int counter_index); int mlxsw_sp_counter_pool_init(struct mlxsw_sp *mlxsw_sp); void mlxsw_sp_counter_pool_fini(struct mlxsw_sp *mlxsw_sp); +int mlxsw_sp_counter_resources_register(struct mlxsw_core *mlxsw_core); #endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 88aa554415df..21c4b10d106c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -26,17 +26,17 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, if (!flow_action_has_entries(flow_action)) return 0; - if (!flow_action_mixed_hw_stats_types_check(flow_action, extack)) + if (!flow_action_mixed_hw_stats_check(flow_action, extack)) return -EOPNOTSUPP; act = flow_action_first_entry_get(flow_action); - if (act->hw_stats_type == FLOW_ACTION_HW_STATS_TYPE_ANY || - act->hw_stats_type == FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE) { + if (act->hw_stats_type == FLOW_ACTION_HW_STATS_ANY || + act->hw_stats_type == FLOW_ACTION_HW_STATS_IMMEDIATE) { /* Count action is inserted first */ err = mlxsw_sp_acl_rulei_act_count(mlxsw_sp, rulei, extack); if (err) return err; - } else if (act->hw_stats_type != FLOW_ACTION_HW_STATS_TYPE_DISABLED) { + } else if (act->hw_stats_type != FLOW_ACTION_HW_STATS_DISABLED) { NL_SET_ERR_MSG_MOD(extack, "Unsupported action HW stats type"); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index 6d84173373c7..873a9944fbfb 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -17,8 +17,8 @@ static int ocelot_flower_parse_action(struct flow_cls_offload *f, if (!flow_offload_has_one_action(&f->rule->action)) return -EOPNOTSUPP; - if (!flow_action_basic_hw_stats_types_check(&f->rule->action, - f->common.extack)) + if (!flow_action_basic_hw_stats_check(&f->rule->action, + f->common.extack)) return -EOPNOTSUPP; flow_action_for_each(i, a, &f->rule->action) { diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c 
index 4aa7346cb040..1c76e1592ca2 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/action.c +++ b/drivers/net/ethernet/netronome/nfp/flower/action.c @@ -1207,8 +1207,8 @@ int nfp_flower_compile_action(struct nfp_app *app, bool pkt_host = false; u32 csum_updated = 0; - if (!flow_action_basic_hw_stats_types_check(&flow->rule->action, - extack)) + if (!flow_action_hw_stats_check(&flow->rule->action, extack, + FLOW_ACTION_HW_STATS_DELAYED_BIT)) return -EOPNOTSUPP; memset(nfp_flow->action_data, 0, NFP_FL_MAX_A_SIZ); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c index ed14164468a1..273c889faaad 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c @@ -77,12 +77,16 @@ int ionic_devlink_register(struct ionic *ionic) return err; } + /* don't register the mgmt_nic as a port */ + if (ionic->is_mgmt_nic) + return 0; + devlink_port_attrs_set(&ionic->dl_port, DEVLINK_PORT_FLAVOUR_PHYSICAL, 0, false, 0, NULL, 0); err = devlink_port_register(dl, &ionic->dl_port, 0); if (err) dev_err(ionic->dev, "devlink_port_register failed: %d\n", err); - else if (!ionic->is_mgmt_nic) + else devlink_port_type_eth_set(&ionic->dl_port, ionic->master_lif->netdev); @@ -93,6 +97,7 @@ void ionic_devlink_unregister(struct ionic *ionic) { struct devlink *dl = priv_to_devlink(ionic); - devlink_port_unregister(&ionic->dl_port); + if (ionic->dl_port.registered) + devlink_port_unregister(&ionic->dl_port); devlink_unregister(dl); } diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index a233716eac29..6996229facfd 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -3,6 +3,7 @@ #include <linux/module.h> #include <linux/netdevice.h> +#include <linux/sfp.h> #include "ionic.h" #include "ionic_bus.h" @@ -677,23 +678,27 @@ static int ionic_get_module_info(struct net_device *netdev, struct ionic_lif *lif = netdev_priv(netdev); struct ionic_dev *idev = &lif->ionic->idev; struct ionic_xcvr_status *xcvr; + struct sfp_eeprom_base *sfp; xcvr = &idev->port_info->status.xcvr; + sfp = (struct sfp_eeprom_base *) xcvr->sprom; /* report the module data type and length */ - switch (xcvr->sprom[0]) { - case 0x03: /* SFP */ + switch (sfp->phys_id) { + case SFF8024_ID_SFP: modinfo->type = ETH_MODULE_SFF_8079; modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; break; - case 0x0D: /* QSFP */ - case 0x11: /* QSFP28 */ + case SFF8024_ID_QSFP_8436_8636: + case SFF8024_ID_QSFP28_8636: modinfo->type = ETH_MODULE_SFF_8436; modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; break; default: netdev_info(netdev, "unknown xcvr type 0x%02x\n", xcvr->sprom[0]); + modinfo->type = 0; + modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; break; } diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index b903016193df..12e3823b0bc1 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -2066,9 +2066,11 @@ static void ionic_lif_deinit(struct ionic_lif *lif) clear_bit(IONIC_LIF_F_INITED, lif->state); ionic_rx_filters_deinit(lif); - ionic_lif_rss_deinit(lif); + if (lif->netdev->features & NETIF_F_RXHASH) + ionic_lif_rss_deinit(lif); napi_disable(&lif->adminqcq->napi); + netif_napi_del(&lif->adminqcq->napi); ionic_lif_qcq_deinit(lif, lif->notifyqcq); ionic_lif_qcq_deinit(lif, 
lif->adminqcq); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index e4a76e66f542..c5e3d7639f7e 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -58,6 +58,8 @@ static const char *ionic_error_to_str(enum ionic_status_code code) return "IONIC_RC_BAD_ADDR"; case IONIC_RC_DEV_CMD: return "IONIC_RC_DEV_CMD"; + case IONIC_RC_ENOSUPP: + return "IONIC_RC_ENOSUPP"; case IONIC_RC_ERROR: return "IONIC_RC_ERROR"; case IONIC_RC_ERDMA: @@ -76,6 +78,7 @@ static int ionic_error_to_errno(enum ionic_status_code code) case IONIC_RC_EQTYPE: case IONIC_RC_EQID: case IONIC_RC_EINVAL: + case IONIC_RC_ENOSUPP: return -EINVAL; case IONIC_RC_EPERM: return -EPERM; diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 6505f7e2d1db..fe72bb6c9455 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1757,7 +1757,7 @@ static int qede_parse_actions(struct qede_dev *edev, return -EINVAL; } - if (!flow_action_basic_hw_stats_types_check(flow_action, extack)) + if (!flow_action_basic_hw_stats_check(flow_action, extack)) return -EOPNOTSUPP; flow_action_for_each(i, act, flow_action) { diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 9a637cd67f43..04e88d05e8ff 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -232,9 +232,6 @@ static int efx_ethtool_set_coalesce(struct net_device *net_dev, bool adaptive, rx_may_override_tx; int rc; - if (coalesce->use_adaptive_tx_coalesce) - return -EINVAL; - efx_get_irq_moderation(efx, &tx_usecs, &rx_usecs, &adaptive); if (coalesce->rx_coalesce_usecs != rx_usecs) @@ -1138,6 +1135,9 @@ static int efx_ethtool_set_fecparam(struct net_device *net_dev, } const struct ethtool_ops efx_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_USECS_IRQ | + ETHTOOL_COALESCE_USE_ADAPTIVE_RX, .get_drvinfo = efx_ethtool_get_drvinfo, .get_regs_len = efx_ethtool_get_regs_len, .get_regs = efx_ethtool_get_regs, diff --git a/drivers/net/ethernet/sfc/falcon/ethtool.c b/drivers/net/ethernet/sfc/falcon/ethtool.c index 08bd6a321918..db90d94e24c9 100644 --- a/drivers/net/ethernet/sfc/falcon/ethtool.c +++ b/drivers/net/ethernet/sfc/falcon/ethtool.c @@ -603,9 +603,6 @@ static int ef4_ethtool_set_coalesce(struct net_device *net_dev, bool adaptive, rx_may_override_tx; int rc; - if (coalesce->use_adaptive_tx_coalesce) - return -EINVAL; - ef4_get_irq_moderation(efx, &tx_usecs, &rx_usecs, &adaptive); if (coalesce->rx_coalesce_usecs != rx_usecs) @@ -1311,6 +1308,9 @@ static int ef4_ethtool_get_module_info(struct net_device *net_dev, } const struct ethtool_ops ef4_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_USECS_IRQ | + ETHTOOL_COALESCE_USE_ADAPTIVE_RX, .get_drvinfo = ef4_ethtool_get_drvinfo, .get_regs_len = ef4_ethtool_get_regs_len, .get_regs = ef4_ethtool_get_regs, diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 58b9b7ce7195..a5a0fb60193a 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -589,6 +589,8 @@ static void netsec_et_set_msglevel(struct net_device *dev, u32 datum) } static const struct ethtool_ops netsec_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + 
ETHTOOL_COALESCE_MAX_FRAMES, .get_drvinfo = netsec_et_get_drvinfo, .get_link_ksettings = phy_ethtool_get_link_ksettings, .set_link_ksettings = phy_ethtool_set_link_ksettings, diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 9bdbf589d93f..386663208c23 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -34,6 +34,11 @@ #define DWMAC_CORE_5_00 0x50 #define DWMAC_CORE_5_10 0x51 #define DWXGMAC_CORE_2_10 0x21 +#define DWXLGMAC_CORE_2_00 0x20 + +/* Device ID */ +#define DWXGMAC_ID 0x76 +#define DWXLGMAC_ID 0x27 #define STMMAC_CHAN0 0 /* Always supported and default for all chips */ @@ -426,6 +431,12 @@ struct mac_link { u32 speed5000; u32 speed10000; } xgmii; + struct { + u32 speed25000; + u32 speed40000; + u32 speed50000; + u32 speed100000; + } xlgmii; }; struct mii_regs { @@ -459,6 +470,7 @@ struct mac_device_info { unsigned int pcs; unsigned int pmt; unsigned int ps; + unsigned int xlgmac; }; struct stmmac_rx_routing { @@ -470,6 +482,7 @@ int dwmac100_setup(struct stmmac_priv *priv); int dwmac1000_setup(struct stmmac_priv *priv); int dwmac4_setup(struct stmmac_priv *priv); int dwxgmac2_setup(struct stmmac_priv *priv); +int dwxlgmac2_setup(struct stmmac_priv *priv); void stmmac_set_mac_addr(void __iomem *ioaddr, u8 addr[6], unsigned int high, unsigned int low); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index 67b754a56288..0e4575f7bedb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -9,6 +9,7 @@ #include <linux/iopoll.h> #include "stmmac.h" #include "stmmac_ptp.h" +#include "dwxlgmac2.h" #include "dwxgmac2.h" static void dwxgmac2_core_init(struct mac_device_info *hw, @@ -1485,6 +1486,67 @@ const struct stmmac_ops dwxgmac210_ops = { .fpe_configure = dwxgmac3_fpe_configure, }; +static void dwxlgmac2_rx_queue_enable(struct mac_device_info *hw, u8 mode, + u32 queue) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + value = readl(ioaddr + XLGMAC_RXQ_ENABLE_CTRL0) & ~XGMAC_RXQEN(queue); + if (mode == MTL_QUEUE_AVB) + value |= 0x1 << XGMAC_RXQEN_SHIFT(queue); + else if (mode == MTL_QUEUE_DCB) + value |= 0x2 << XGMAC_RXQEN_SHIFT(queue); + writel(value, ioaddr + XLGMAC_RXQ_ENABLE_CTRL0); +} + +const struct stmmac_ops dwxlgmac2_ops = { + .core_init = dwxgmac2_core_init, + .set_mac = dwxgmac2_set_mac, + .rx_ipc = dwxgmac2_rx_ipc, + .rx_queue_enable = dwxlgmac2_rx_queue_enable, + .rx_queue_prio = dwxgmac2_rx_queue_prio, + .tx_queue_prio = dwxgmac2_tx_queue_prio, + .rx_queue_routing = NULL, + .prog_mtl_rx_algorithms = dwxgmac2_prog_mtl_rx_algorithms, + .prog_mtl_tx_algorithms = dwxgmac2_prog_mtl_tx_algorithms, + .set_mtl_tx_queue_weight = dwxgmac2_set_mtl_tx_queue_weight, + .map_mtl_to_dma = dwxgmac2_map_mtl_to_dma, + .config_cbs = dwxgmac2_config_cbs, + .dump_regs = dwxgmac2_dump_regs, + .host_irq_status = dwxgmac2_host_irq_status, + .host_mtl_irq_status = dwxgmac2_host_mtl_irq_status, + .flow_ctrl = dwxgmac2_flow_ctrl, + .pmt = dwxgmac2_pmt, + .set_umac_addr = dwxgmac2_set_umac_addr, + .get_umac_addr = dwxgmac2_get_umac_addr, + .set_eee_mode = dwxgmac2_set_eee_mode, + .reset_eee_mode = dwxgmac2_reset_eee_mode, + .set_eee_timer = dwxgmac2_set_eee_timer, + .set_eee_pls = dwxgmac2_set_eee_pls, + .pcs_ctrl_ane = NULL, + .pcs_rane = NULL, + .pcs_get_adv_lp = NULL, + .debug = NULL, + .set_filter = dwxgmac2_set_filter, + 
.safety_feat_config = dwxgmac3_safety_feat_config, + .safety_feat_irq_status = dwxgmac3_safety_feat_irq_status, + .safety_feat_dump = dwxgmac3_safety_feat_dump, + .set_mac_loopback = dwxgmac2_set_mac_loopback, + .rss_configure = dwxgmac2_rss_configure, + .update_vlan_hash = dwxgmac2_update_vlan_hash, + .rxp_config = dwxgmac3_rxp_config, + .get_mac_tx_timestamp = dwxgmac2_get_mac_tx_timestamp, + .flex_pps_config = dwxgmac2_flex_pps_config, + .sarc_configure = dwxgmac2_sarc_configure, + .enable_vlan = dwxgmac2_enable_vlan, + .config_l3_filter = dwxgmac2_config_l3_filter, + .config_l4_filter = dwxgmac2_config_l4_filter, + .set_arp_offload = dwxgmac2_set_arp_offload, + .est_configure = dwxgmac3_est_configure, + .fpe_configure = dwxgmac3_fpe_configure, +}; + int dwxgmac2_setup(struct stmmac_priv *priv) { struct mac_device_info *mac = priv->hw; @@ -1521,3 +1583,40 @@ int dwxgmac2_setup(struct stmmac_priv *priv) return 0; } + +int dwxlgmac2_setup(struct stmmac_priv *priv) +{ + struct mac_device_info *mac = priv->hw; + + dev_info(priv->device, "\tXLGMAC\n"); + + priv->dev->priv_flags |= IFF_UNICAST_FLT; + mac->pcsr = priv->ioaddr; + mac->multicast_filter_bins = priv->plat->multicast_filter_bins; + mac->unicast_filter_entries = priv->plat->unicast_filter_entries; + mac->mcast_bits_log2 = 0; + + if (mac->multicast_filter_bins) + mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + + mac->link.duplex = 0; + mac->link.speed1000 = XLGMAC_CONFIG_SS_1000; + mac->link.speed2500 = XLGMAC_CONFIG_SS_2500; + mac->link.xgmii.speed10000 = XLGMAC_CONFIG_SS_10G; + mac->link.xlgmii.speed25000 = XLGMAC_CONFIG_SS_25G; + mac->link.xlgmii.speed40000 = XLGMAC_CONFIG_SS_40G; + mac->link.xlgmii.speed50000 = XLGMAC_CONFIG_SS_50G; + mac->link.xlgmii.speed100000 = XLGMAC_CONFIG_SS_100G; + mac->link.speed_mask = XLGMAC_CONFIG_SS; + + mac->mii.addr = XGMAC_MDIO_ADDR; + mac->mii.data = XGMAC_MDIO_DATA; + mac->mii.addr_shift = 16; + mac->mii.addr_mask = GENMASK(20, 16); + mac->mii.reg_shift = 0; + mac->mii.reg_mask = GENMASK(15, 0); + mac->mii.clk_csr_shift = 19; + mac->mii.clk_csr_mask = GENMASK(21, 19); + + return 0; +} diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxlgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxlgmac2.h new file mode 100644 index 000000000000..726090d49221 --- /dev/null +++ b/drivers/net/ethernet/stmicro/stmmac/dwxlgmac2.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020 Synopsys, Inc. and/or its affiliates. + * Synopsys DesignWare XLGMAC definitions. 
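One detail worth calling out before the register definitions: XGMAC and XLGMAC share the GMAC4_VERSION register layout, so the hwif table further down can only tell the two cores apart by the device-ID byte. A sketch of that decode, mirroring what stmmac_get_dev_id() does (the toy_* names are illustrative):

#include <linux/bits.h>
#include <linux/types.h>

/* GMAC4_VERSION: bits 7:0 carry the Synopsys release ID, bits 15:8 the
 * device ID (0x76 for XGMAC, 0x27 for XLGMAC per the new defines).
 */
static u32 toy_stmmac_dev_id(u32 version_reg)
{
        return (version_reg & GENMASK(15, 8)) >> 8;
}

static bool toy_hwif_entry_matches(u32 dev_id, u32 entry_dev_id)
{
        /* an entry is usable only on an exact device-ID match */
        return dev_id == entry_dev_id;
}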
+ */ + +#ifndef __STMMAC_DWXLGMAC2_H__ +#define __STMMAC_DWXLGMAC2_H__ + +/* MAC Registers */ +#define XLGMAC_CONFIG_SS GENMASK(30, 28) +#define XLGMAC_CONFIG_SS_SHIFT 28 +#define XLGMAC_CONFIG_SS_40G (0x0 << XLGMAC_CONFIG_SS_SHIFT) +#define XLGMAC_CONFIG_SS_25G (0x1 << XLGMAC_CONFIG_SS_SHIFT) +#define XLGMAC_CONFIG_SS_50G (0x2 << XLGMAC_CONFIG_SS_SHIFT) +#define XLGMAC_CONFIG_SS_100G (0x3 << XLGMAC_CONFIG_SS_SHIFT) +#define XLGMAC_CONFIG_SS_10G (0x4 << XLGMAC_CONFIG_SS_SHIFT) +#define XLGMAC_CONFIG_SS_2500 (0x6 << XLGMAC_CONFIG_SS_SHIFT) +#define XLGMAC_CONFIG_SS_1000 (0x7 << XLGMAC_CONFIG_SS_SHIFT) +#define XLGMAC_RXQ_ENABLE_CTRL0 0x00000140 + +#endif /* __STMMAC_DWXLGMAC2_H__ */ diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c index 3af2e5015245..bb7114f970f8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c @@ -23,6 +23,18 @@ static u32 stmmac_get_id(struct stmmac_priv *priv, u32 id_reg) return reg & GENMASK(7, 0); } +static u32 stmmac_get_dev_id(struct stmmac_priv *priv, u32 id_reg) +{ + u32 reg = readl(priv->ioaddr + id_reg); + + if (!reg) { + dev_info(priv->device, "Version ID not available\n"); + return 0x0; + } + + return (reg & GENMASK(15, 8)) >> 8; +} + static void stmmac_dwmac_mode_quirk(struct stmmac_priv *priv) { struct mac_device_info *mac = priv->hw; @@ -69,11 +81,18 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv) return 0; } +static int stmmac_dwxlgmac_quirks(struct stmmac_priv *priv) +{ + priv->hw->xlgmac = true; + return 0; +} + static const struct stmmac_hwif_entry { bool gmac; bool gmac4; bool xgmac; u32 min_id; + u32 dev_id; const struct stmmac_regs_off regs; const void *desc; const void *dma; @@ -199,6 +218,7 @@ static const struct stmmac_hwif_entry { .gmac4 = false, .xgmac = true, .min_id = DWXGMAC_CORE_2_10, + .dev_id = DWXGMAC_ID, .regs = { .ptp_off = PTP_XGMAC_OFFSET, .mmc_off = MMC_XGMAC_OFFSET, @@ -212,6 +232,25 @@ static const struct stmmac_hwif_entry { .mmc = &dwxgmac_mmc_ops, .setup = dwxgmac2_setup, .quirks = NULL, + }, { + .gmac = false, + .gmac4 = false, + .xgmac = true, + .min_id = DWXLGMAC_CORE_2_00, + .dev_id = DWXLGMAC_ID, + .regs = { + .ptp_off = PTP_XGMAC_OFFSET, + .mmc_off = MMC_XGMAC_OFFSET, + }, + .desc = &dwxgmac210_desc_ops, + .dma = &dwxgmac210_dma_ops, + .mac = &dwxlgmac2_ops, + .hwtimestamp = &stmmac_ptp, + .mode = NULL, + .tc = &dwmac510_tc_ops, + .mmc = &dwxgmac_mmc_ops, + .setup = dwxlgmac2_setup, + .quirks = stmmac_dwxlgmac_quirks, }, }; @@ -223,13 +262,15 @@ int stmmac_hwif_init(struct stmmac_priv *priv) const struct stmmac_hwif_entry *entry; struct mac_device_info *mac; bool needs_setup = true; + u32 id, dev_id = 0; int i, ret; - u32 id; if (needs_gmac) { id = stmmac_get_id(priv, GMAC_VERSION); } else if (needs_gmac4 || needs_xgmac) { id = stmmac_get_id(priv, GMAC4_VERSION); + if (needs_xgmac) + dev_id = stmmac_get_dev_id(priv, GMAC4_VERSION); } else { id = 0; } @@ -267,6 +308,8 @@ int stmmac_hwif_init(struct stmmac_priv *priv) /* Use synopsys_id var because some setups can override this */ if (priv->synopsys_id < entry->min_id) continue; + if (needs_xgmac && (dev_id ^ entry->dev_id)) + continue; /* Only use generic HW helpers if needed */ mac->desc = mac->desc ? 
: entry->desc; diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index c71dd99c8abf..fc350149ba34 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -605,6 +605,7 @@ extern const struct stmmac_dma_ops dwmac410_dma_ops; extern const struct stmmac_ops dwmac510_ops; extern const struct stmmac_tc_ops dwmac510_tc_ops; extern const struct stmmac_ops dwxgmac210_ops; +extern const struct stmmac_ops dwxlgmac2_ops; extern const struct stmmac_dma_ops dwxgmac210_dma_ops; extern const struct stmmac_desc_ops dwxgmac210_desc_ops; extern const struct stmmac_mmc_ops dwmac_mmc_ops; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f26699d9a050..0e8c80f23557 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -849,6 +849,38 @@ static void stmmac_validate(struct phylink_config *config, phylink_set(mac_supported, 10000baseKX4_Full); phylink_set(mac_supported, 10000baseKR_Full); } + if (!max_speed || (max_speed >= 25000)) { + phylink_set(mac_supported, 25000baseCR_Full); + phylink_set(mac_supported, 25000baseKR_Full); + phylink_set(mac_supported, 25000baseSR_Full); + } + if (!max_speed || (max_speed >= 40000)) { + phylink_set(mac_supported, 40000baseKR4_Full); + phylink_set(mac_supported, 40000baseCR4_Full); + phylink_set(mac_supported, 40000baseSR4_Full); + phylink_set(mac_supported, 40000baseLR4_Full); + } + if (!max_speed || (max_speed >= 50000)) { + phylink_set(mac_supported, 50000baseCR2_Full); + phylink_set(mac_supported, 50000baseKR2_Full); + phylink_set(mac_supported, 50000baseSR2_Full); + phylink_set(mac_supported, 50000baseKR_Full); + phylink_set(mac_supported, 50000baseSR_Full); + phylink_set(mac_supported, 50000baseCR_Full); + phylink_set(mac_supported, 50000baseLR_ER_FR_Full); + phylink_set(mac_supported, 50000baseDR_Full); + } + if (!max_speed || (max_speed >= 100000)) { + phylink_set(mac_supported, 100000baseKR4_Full); + phylink_set(mac_supported, 100000baseSR4_Full); + phylink_set(mac_supported, 100000baseCR4_Full); + phylink_set(mac_supported, 100000baseLR4_ER4_Full); + phylink_set(mac_supported, 100000baseKR2_Full); + phylink_set(mac_supported, 100000baseSR2_Full); + phylink_set(mac_supported, 100000baseCR2_Full); + phylink_set(mac_supported, 100000baseLR2_ER2_FR2_Full); + phylink_set(mac_supported, 100000baseDR2_Full); + } } /* Half-Duplex can only work with single queue */ @@ -929,6 +961,32 @@ static void stmmac_mac_link_up(struct phylink_config *config, default: return; } + } else if (interface == PHY_INTERFACE_MODE_XLGMII) { + switch (speed) { + case SPEED_100000: + ctrl |= priv->hw->link.xlgmii.speed100000; + break; + case SPEED_50000: + ctrl |= priv->hw->link.xlgmii.speed50000; + break; + case SPEED_40000: + ctrl |= priv->hw->link.xlgmii.speed40000; + break; + case SPEED_25000: + ctrl |= priv->hw->link.xlgmii.speed25000; + break; + case SPEED_10000: + ctrl |= priv->hw->link.xgmii.speed10000; + break; + case SPEED_2500: + ctrl |= priv->hw->link.speed2500; + break; + case SPEED_1000: + ctrl |= priv->hw->link.speed1000; + break; + default: + return; + } } else { switch (speed) { case SPEED_2500: diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c index 07dbe4f5456e..63d6c85a59e3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c +++ 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c @@ -1387,7 +1387,7 @@ static int __stmmac_test_l3filt(struct stmmac_priv *priv, u32 dst, u32 src, cls->rule = rule; rule->action.entries[0].id = FLOW_ACTION_DROP; - rule->action.entries[0].hw_stats_type = FLOW_ACTION_HW_STATS_TYPE_ANY; + rule->action.entries[0].hw_stats_type = FLOW_ACTION_HW_STATS_ANY; rule->action.num_entries = 1; attr.dst = priv->dev->dev_addr; @@ -1516,7 +1516,7 @@ static int __stmmac_test_l4filt(struct stmmac_priv *priv, u32 dst, u32 src, cls->rule = rule; rule->action.entries[0].id = FLOW_ACTION_DROP; - rule->action.entries[0].hw_stats_type = FLOW_ACTION_HW_STATS_TYPE_ANY; + rule->action.entries[0].hw_stats_type = FLOW_ACTION_HW_STATS_ANY; rule->action.num_entries = 1; attr.dst = priv->dev->dev_addr; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index a0e6118444b0..3d747846f482 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -376,7 +376,7 @@ static int tc_parse_flow_actions(struct stmmac_priv *priv, if (!flow_action_has_entries(action)) return -EINVAL; - if (!flow_action_basic_hw_stats_types_check(action, extack)) + if (!flow_action_basic_hw_stats_check(action, extack)) return -EOPNOTSUPP; flow_action_for_each(i, act, action) { diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c index fde722136869..bc198eadfcab 100644 --- a/drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c +++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-ethtool.c @@ -151,7 +151,6 @@ static int xlgmac_ethtool_get_coalesce(struct net_device *netdev, { struct xlgmac_pdata *pdata = netdev_priv(netdev); - memset(ec, 0, sizeof(struct ethtool_coalesce)); ec->rx_coalesce_usecs = pdata->rx_usecs; ec->rx_max_coalesced_frames = pdata->rx_frames; ec->tx_max_coalesced_frames = pdata->tx_frames; @@ -167,20 +166,6 @@ static int xlgmac_ethtool_set_coalesce(struct net_device *netdev, unsigned int rx_frames, rx_riwt, rx_usecs; unsigned int tx_frames; - /* Check for not supported parameters */ - if ((ec->rx_coalesce_usecs_irq) || (ec->rx_max_coalesced_frames_irq) || - (ec->tx_coalesce_usecs) || (ec->tx_coalesce_usecs_high) || - (ec->tx_max_coalesced_frames_irq) || (ec->tx_coalesce_usecs_irq) || - (ec->stats_block_coalesce_usecs) || (ec->pkt_rate_low) || - (ec->use_adaptive_rx_coalesce) || (ec->use_adaptive_tx_coalesce) || - (ec->rx_max_coalesced_frames_low) || (ec->rx_coalesce_usecs_low) || - (ec->tx_coalesce_usecs_low) || (ec->tx_max_coalesced_frames_low) || - (ec->pkt_rate_high) || (ec->rx_coalesce_usecs_high) || - (ec->rx_max_coalesced_frames_high) || - (ec->tx_max_coalesced_frames_high) || - (ec->rate_sample_interval)) - return -EOPNOTSUPP; - rx_usecs = ec->rx_coalesce_usecs; rx_riwt = hw_ops->usec_to_riwt(pdata, rx_usecs); rx_frames = ec->rx_max_coalesced_frames; @@ -257,6 +242,8 @@ static void xlgmac_ethtool_get_ethtool_stats(struct net_device *netdev, } static const struct ethtool_ops xlgmac_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS | + ETHTOOL_COALESCE_MAX_FRAMES, .get_drvinfo = xlgmac_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_msglevel = xlgmac_ethtool_get_msglevel, diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c index 0f8a924fc60c..40a2ce0ca808 100644 --- a/drivers/net/ethernet/tehuti/tehuti.c +++ b/drivers/net/ethernet/tehuti/tehuti.c @@ -2373,6 +2373,8 @@ static void 
bdx_get_ethtool_stats(struct net_device *netdev, static void bdx_set_ethtool_ops(struct net_device *netdev) { static const struct ethtool_ops bdx_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_MAX_FRAMES, .get_drvinfo = bdx_get_drvinfo, .get_link = ethtool_op_get_link, .get_coalesce = bdx_get_coalesce, diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 6ae4a72e6f43..c2c5bf87da01 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1211,6 +1211,7 @@ static int cpsw_set_channels(struct net_device *ndev, } static const struct ethtool_ops cpsw_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS, .get_drvinfo = cpsw_get_drvinfo, .get_msglevel = cpsw_get_msglevel, .set_msglevel = cpsw_set_msglevel, diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 71215db7934b..9209e613257d 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -1175,6 +1175,7 @@ static int cpsw_set_channels(struct net_device *ndev, } static const struct ethtool_ops cpsw_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS, .get_drvinfo = cpsw_get_drvinfo, .get_msglevel = cpsw_get_msglevel, .set_msglevel = cpsw_set_msglevel, diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 75d4e16c692b..de282531f68b 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -481,6 +481,7 @@ static int emac_set_coalesce(struct net_device *ndev, * Ethtool support for EMAC adapter */ static const struct ethtool_ops ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS, .get_drvinfo = emac_get_drvinfo, .get_link = ethtool_op_get_link, .get_coalesce = emac_get_coalesce, diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index dc022cd5bc42..3e313e71ae36 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -1314,25 +1314,6 @@ static int ll_temac_ethtools_set_coalesce(struct net_device *ndev, return -EFAULT; } - if (ec->rx_coalesce_usecs_irq || - ec->rx_max_coalesced_frames_irq || - ec->tx_coalesce_usecs_irq || - ec->tx_max_coalesced_frames_irq || - ec->stats_block_coalesce_usecs || - ec->use_adaptive_rx_coalesce || - ec->use_adaptive_tx_coalesce || - ec->pkt_rate_low || - ec->rx_coalesce_usecs_low || - ec->rx_max_coalesced_frames_low || - ec->tx_coalesce_usecs_low || - ec->tx_max_coalesced_frames_low || - ec->pkt_rate_high || - ec->rx_coalesce_usecs_high || - ec->rx_max_coalesced_frames_high || - ec->tx_coalesce_usecs_high || - ec->tx_max_coalesced_frames_high || - ec->rate_sample_interval) - return -EOPNOTSUPP; if (ec->rx_max_coalesced_frames) lp->coalesce_count_rx = ec->rx_max_coalesced_frames; if (ec->tx_max_coalesced_frames) @@ -1351,6 +1332,8 @@ static int ll_temac_ethtools_set_coalesce(struct net_device *ndev, } static const struct ethtool_ops temac_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_MAX_FRAMES, .nway_reset = phy_ethtool_nway_reset, .get_link = ethtool_op_get_link, .get_ts_info = ethtool_op_get_ts_info, diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index c2f4c5ca2e80..e2f3e2b0cec7 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ 
b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1309,27 +1309,6 @@ static int axienet_ethtools_set_coalesce(struct net_device *ndev, return -EFAULT; } - if ((ecoalesce->rx_coalesce_usecs) || - (ecoalesce->rx_coalesce_usecs_irq) || - (ecoalesce->rx_max_coalesced_frames_irq) || - (ecoalesce->tx_coalesce_usecs) || - (ecoalesce->tx_coalesce_usecs_irq) || - (ecoalesce->tx_max_coalesced_frames_irq) || - (ecoalesce->stats_block_coalesce_usecs) || - (ecoalesce->use_adaptive_rx_coalesce) || - (ecoalesce->use_adaptive_tx_coalesce) || - (ecoalesce->pkt_rate_low) || - (ecoalesce->rx_coalesce_usecs_low) || - (ecoalesce->rx_max_coalesced_frames_low) || - (ecoalesce->tx_coalesce_usecs_low) || - (ecoalesce->tx_max_coalesced_frames_low) || - (ecoalesce->pkt_rate_high) || - (ecoalesce->rx_coalesce_usecs_high) || - (ecoalesce->rx_max_coalesced_frames_high) || - (ecoalesce->tx_coalesce_usecs_high) || - (ecoalesce->tx_max_coalesced_frames_high) || - (ecoalesce->rate_sample_interval)) - return -EOPNOTSUPP; if (ecoalesce->rx_max_coalesced_frames) lp->coalesce_count_rx = ecoalesce->rx_max_coalesced_frames; if (ecoalesce->tx_max_coalesced_frames) @@ -1357,6 +1336,7 @@ axienet_ethtools_set_link_ksettings(struct net_device *ndev, } static const struct ethtool_ops axienet_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES, .get_drvinfo = axienet_ethtools_get_drvinfo, .get_regs_len = axienet_ethtools_get_regs_len, .get_regs = axienet_ethtools_get_regs, diff --git a/drivers/net/phy/mdio-xpcs.c b/drivers/net/phy/mdio-xpcs.c index 973f588146f7..2f4cdf807160 100644 --- a/drivers/net/phy/mdio-xpcs.c +++ b/drivers/net/phy/mdio-xpcs.c @@ -14,6 +14,7 @@ #define SYNOPSYS_XPCS_USXGMII_ID 0x7996ced0 #define SYNOPSYS_XPCS_10GKR_ID 0x7996ced0 +#define SYNOPSYS_XPCS_XLGMII_ID 0x7996ced0 #define SYNOPSYS_XPCS_MASK 0xffffffff /* Vendor regs access */ @@ -74,6 +75,36 @@ static const int xpcs_10gkr_features[] = { __ETHTOOL_LINK_MODE_MASK_NBITS, }; +static const int xpcs_xlgmii_features[] = { + ETHTOOL_LINK_MODE_Pause_BIT, + ETHTOOL_LINK_MODE_Asym_Pause_BIT, + ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, + ETHTOOL_LINK_MODE_25000baseKR_Full_BIT, + ETHTOOL_LINK_MODE_25000baseSR_Full_BIT, + ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT, + ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT, + ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT, + ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT, + ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT, + ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT, + ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT, + ETHTOOL_LINK_MODE_50000baseKR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseSR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseCR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseDR_Full_BIT, + ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, + ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT, + ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT, + ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT, + ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT, + __ETHTOOL_LINK_MODE_MASK_NBITS, +}; + static const phy_interface_t xpcs_usxgmii_interfaces[] = { PHY_INTERFACE_MODE_USXGMII, PHY_INTERFACE_MODE_MAX, @@ -84,6 +115,11 @@ static const phy_interface_t xpcs_10gkr_interfaces[] = { PHY_INTERFACE_MODE_MAX, }; +static const phy_interface_t xpcs_xlgmii_interfaces[] = { + PHY_INTERFACE_MODE_XLGMII, + PHY_INTERFACE_MODE_MAX, +}; + static struct xpcs_id { u32 id; u32 mask; @@ 
-100,6 +136,11 @@ static struct xpcs_id { .mask = SYNOPSYS_XPCS_MASK, .supported = xpcs_10gkr_features, .interface = xpcs_10gkr_interfaces, + }, { + .id = SYNOPSYS_XPCS_XLGMII_ID, + .mask = SYNOPSYS_XPCS_MASK, + .supported = xpcs_xlgmii_features, + .interface = xpcs_xlgmii_interfaces, }, }; @@ -458,6 +499,60 @@ static void xpcs_resolve_lpa(struct mdio_xpcs_args *xpcs, state->duplex = DUPLEX_FULL; } +static int xpcs_get_max_xlgmii_speed(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) +{ + unsigned long *adv = state->advertising; + int speed = SPEED_UNKNOWN; + int bit; + + for_each_set_bit(bit, adv, __ETHTOOL_LINK_MODE_MASK_NBITS) { + int new_speed = SPEED_UNKNOWN; + + switch (bit) { + case ETHTOOL_LINK_MODE_25000baseCR_Full_BIT: + case ETHTOOL_LINK_MODE_25000baseKR_Full_BIT: + case ETHTOOL_LINK_MODE_25000baseSR_Full_BIT: + new_speed = SPEED_25000; + break; + case ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT: + case ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT: + case ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT: + case ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT: + new_speed = SPEED_40000; + break; + case ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT: + case ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT: + case ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT: + case ETHTOOL_LINK_MODE_50000baseKR_Full_BIT: + case ETHTOOL_LINK_MODE_50000baseSR_Full_BIT: + case ETHTOOL_LINK_MODE_50000baseCR_Full_BIT: + case ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT: + case ETHTOOL_LINK_MODE_50000baseDR_Full_BIT: + new_speed = SPEED_50000; + break; + case ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT: + case ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT: + case ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT: + case ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT: + case ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT: + case ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT: + case ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT: + case ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT: + case ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT: + new_speed = SPEED_100000; + break; + default: + continue; + } + + if (new_speed > speed) + speed = new_speed; + } + + return speed; +} + static void xpcs_resolve_pma(struct mdio_xpcs_args *xpcs, struct phylink_link_state *state) { @@ -468,6 +563,9 @@ static void xpcs_resolve_pma(struct mdio_xpcs_args *xpcs, case PHY_INTERFACE_MODE_10GKR: state->speed = SPEED_10000; break; + case PHY_INTERFACE_MODE_XLGMII: + state->speed = xpcs_get_max_xlgmii_speed(xpcs, state); + break; default: state->speed = SPEED_UNKNOWN; break; diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 3ab9ca7614d1..522760c8bca6 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -825,6 +825,38 @@ int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val) EXPORT_SYMBOL(__mdiobus_write); /** + * __mdiobus_modify_changed - Unlocked version of the mdiobus_modify function + * @bus: the mii_bus struct + * @addr: the phy address + * @regnum: register number to modify + * @mask: bit mask of bits to clear + * @set: bit mask of bits to set + * + * Read, modify, and if any change, write the register value back to the + * device. Any error returns a negative number. + * + * NOTE: MUST NOT be called from interrupt context. 
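+ *
+ * A minimal usage sketch (the bus, address and BMCR_PDOWN choice are
+ * illustrative only; the caller must already hold bus->mdio_lock):
+ *
+ *	ret = __mdiobus_modify_changed(bus, addr, MII_BMCR, BMCR_PDOWN, 0);
+ *	if (ret < 0)
+ *		return ret;
+ *	changed = (ret == 1);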
+ */ +int __mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, + u16 mask, u16 set) +{ + int new, ret; + + ret = __mdiobus_read(bus, addr, regnum); + if (ret < 0) + return ret; + + new = (ret & ~mask) | set; + if (new == ret) + return 0; + + ret = __mdiobus_write(bus, addr, regnum, new); + + return ret < 0 ? ret : 1; +} +EXPORT_SYMBOL_GPL(__mdiobus_modify_changed); + +/** * mdiobus_read_nested - Nested version of the mdiobus_read function * @bus: the mii_bus struct * @addr: the phy address @@ -841,7 +873,8 @@ int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum) { int retval; - BUG_ON(in_interrupt()); + if (WARN_ON_ONCE(in_interrupt())) + return -EINVAL; mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED); retval = __mdiobus_read(bus, addr, regnum); @@ -865,7 +898,8 @@ int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum) { int retval; - BUG_ON(in_interrupt()); + if (WARN_ON_ONCE(in_interrupt())) + return -EINVAL; mutex_lock(&bus->mdio_lock); retval = __mdiobus_read(bus, addr, regnum); @@ -893,7 +927,8 @@ int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val) { int err; - BUG_ON(in_interrupt()); + if (WARN_ON_ONCE(in_interrupt())) + return -EINVAL; mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED); err = __mdiobus_write(bus, addr, regnum, val); @@ -918,7 +953,8 @@ int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val) { int err; - BUG_ON(in_interrupt()); + if (WARN_ON_ONCE(in_interrupt())) + return -EINVAL; mutex_lock(&bus->mdio_lock); err = __mdiobus_write(bus, addr, regnum, val); @@ -929,6 +965,30 @@ int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val) EXPORT_SYMBOL(mdiobus_write); /** + * mdiobus_modify - Convenience function for modifying a given mdio device + * register + * @bus: the mii_bus struct + * @addr: the phy address + * @regnum: register number to write + * @mask: bit mask of bits to clear + * @set: bit mask of bits to set + */ +int mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask, u16 set) +{ + int err; + + if (WARN_ON_ONCE(in_interrupt())) + return -EINVAL; + + mutex_lock(&bus->mdio_lock); + err = __mdiobus_modify_changed(bus, addr, regnum, mask, set); + mutex_unlock(&bus->mdio_lock); + + return err < 0 ? 
err : 0; +} +EXPORT_SYMBOL_GPL(mdiobus_modify); + +/** * mdio_bus_match - determine if given MDIO driver supports the given * MDIO device * @dev: target MDIO device diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index cb4d65f81095..2f6229a70ec1 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -1429,11 +1429,21 @@ err: return ret; } -static int vsc8584_handle_interrupt(struct phy_device *phydev) +static irqreturn_t vsc8584_handle_interrupt(struct phy_device *phydev) { - vsc8584_handle_macsec_interrupt(phydev); - phy_mac_interrupt(phydev); - return 0; + int irq_status; + + irq_status = phy_read(phydev, MII_VSC85XX_INT_STATUS); + if (irq_status < 0 || !(irq_status & MII_VSC85XX_INT_MASK_MASK)) + return IRQ_NONE; + + if (irq_status & MII_VSC85XX_INT_MASK_EXT) + vsc8584_handle_macsec_interrupt(phydev); + + if (irq_status & MII_VSC85XX_INT_MASK_LINK_CHG) + phy_mac_interrupt(phydev); + + return IRQ_HANDLED; } static int vsc85xx_config_init(struct phy_device *phydev) diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index e083e7a76ada..94cd85b1e49b 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -489,37 +489,6 @@ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) EXPORT_SYMBOL(phy_write_mmd); /** - * __phy_modify_changed() - Convenience function for modifying a PHY register - * @phydev: a pointer to a &struct phy_device - * @regnum: register number - * @mask: bit mask of bits to clear - * @set: bit mask of bits to set - * - * Unlocked helper function which allows a PHY register to be modified as - * new register value = (old register value & ~mask) | set - * - * Returns negative errno, 0 if there was no change, and 1 in case of change - */ -int __phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, - u16 set) -{ - int new, ret; - - ret = __phy_read(phydev, regnum); - if (ret < 0) - return ret; - - new = (ret & ~mask) | set; - if (new == ret) - return 0; - - ret = __phy_write(phydev, regnum, new); - - return ret < 0 ? 
ret : 1; -} -EXPORT_SYMBOL_GPL(__phy_modify_changed); - -/** * phy_modify_changed - Function for modifying a PHY register * @phydev: the phy_device struct * @regnum: register number to modify diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 355bfdef48d2..d71212a418f3 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -715,26 +715,24 @@ static int phy_disable_interrupts(struct phy_device *phydev) static irqreturn_t phy_interrupt(int irq, void *phy_dat) { struct phy_device *phydev = phy_dat; + struct phy_driver *drv = phydev->drv; - if (phydev->drv->did_interrupt && !phydev->drv->did_interrupt(phydev)) + if (drv->handle_interrupt) + return drv->handle_interrupt(phydev); + + if (drv->did_interrupt && !drv->did_interrupt(phydev)) return IRQ_NONE; - if (phydev->drv->handle_interrupt) { - if (phydev->drv->handle_interrupt(phydev)) - goto phy_err; - } else { - /* reschedule state queue work to run as soon as possible */ - phy_trigger_machine(phydev); - } + /* reschedule state queue work to run as soon as possible */ + phy_trigger_machine(phydev); /* did_interrupt() may have cleared the interrupt already */ - if (!phydev->drv->did_interrupt && phy_clear_interrupt(phydev)) - goto phy_err; - return IRQ_HANDLED; + if (!drv->did_interrupt && phy_clear_interrupt(phydev)) { + phy_error(phydev); + return IRQ_NONE; + } -phy_err: - phy_error(phydev); - return IRQ_NONE; + return IRQ_HANDLED; } /** diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 60f32b354013..fed0c5907c6a 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -2068,4 +2068,240 @@ void phylink_helper_basex_speed(struct phylink_link_state *state) } EXPORT_SYMBOL_GPL(phylink_helper_basex_speed); +static void phylink_decode_c37_word(struct phylink_link_state *state, + uint16_t config_reg, int speed) +{ + bool tx_pause, rx_pause; + int fd_bit; + + if (speed == SPEED_2500) + fd_bit = ETHTOOL_LINK_MODE_2500baseX_Full_BIT; + else + fd_bit = ETHTOOL_LINK_MODE_1000baseX_Full_BIT; + + mii_lpa_mod_linkmode_x(state->lp_advertising, config_reg, fd_bit); + + if (linkmode_test_bit(fd_bit, state->advertising) && + linkmode_test_bit(fd_bit, state->lp_advertising)) { + state->speed = speed; + state->duplex = DUPLEX_FULL; + } else { + /* negotiation failure */ + state->link = false; + } + + linkmode_resolve_pause(state->advertising, state->lp_advertising, + &tx_pause, &rx_pause); + + if (tx_pause) + state->pause |= MLO_PAUSE_TX; + if (rx_pause) + state->pause |= MLO_PAUSE_RX; +} + +static void phylink_decode_sgmii_word(struct phylink_link_state *state, + uint16_t config_reg) +{ + if (!(config_reg & LPA_SGMII_LINK)) { + state->link = false; + return; + } + + switch (config_reg & LPA_SGMII_SPD_MASK) { + case LPA_SGMII_10: + state->speed = SPEED_10; + break; + case LPA_SGMII_100: + state->speed = SPEED_100; + break; + case LPA_SGMII_1000: + state->speed = SPEED_1000; + break; + default: + state->link = false; + return; + } + if (config_reg & LPA_SGMII_FULL_DUPLEX) + state->duplex = DUPLEX_FULL; + else + state->duplex = DUPLEX_HALF; +} + +/** + * phylink_mii_c22_pcs_get_state() - read the MAC PCS state + * @pcs: a pointer to a &struct mdio_device. + * @state: a pointer to a &struct phylink_link_state. + * + * Helper for MAC PCS supporting the 802.3 clause 22 register set for + * clause 37 negotiation and/or SGMII control. 
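+ *
+ * A minimal wiring sketch (hypothetical "foo" driver; assumes priv->pcs
+ * holds the PCS mdio_device):
+ *
+ *	static void foo_mac_pcs_get_state(struct phylink_config *config,
+ *					  struct phylink_link_state *state)
+ *	{
+ *		struct foo_priv *priv = netdev_priv(to_net_dev(config->dev));
+ *
+ *		phylink_mii_c22_pcs_get_state(priv->pcs, state);
+ *	}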
+ * + * Read the MAC PCS state from the MII device @pcs and + * parse the Clause 37 or Cisco SGMII link partner negotiation word into + * the phylink @state structure. This is suitable to be directly plugged + * into the mac_pcs_get_state() member of the struct phylink_mac_ops + * structure. + */ +void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, + struct phylink_link_state *state) +{ + struct mii_bus *bus = pcs->bus; + int addr = pcs->addr; + int bmsr, lpa; + + bmsr = mdiobus_read(bus, addr, MII_BMSR); + lpa = mdiobus_read(bus, addr, MII_LPA); + if (bmsr < 0 || lpa < 0) { + state->link = false; + return; + } + + state->link = !!(bmsr & BMSR_LSTATUS); + state->an_complete = !!(bmsr & BMSR_ANEGCOMPLETE); + if (!state->link) + return; + + switch (state->interface) { + case PHY_INTERFACE_MODE_1000BASEX: + phylink_decode_c37_word(state, lpa, SPEED_1000); + break; + + case PHY_INTERFACE_MODE_2500BASEX: + phylink_decode_c37_word(state, lpa, SPEED_2500); + break; + + case PHY_INTERFACE_MODE_SGMII: + phylink_decode_sgmii_word(state, lpa); + break; + + default: + state->link = false; + break; + } +} +EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_get_state); + +/** + * phylink_mii_c22_pcs_set_advertisement() - configure the clause 37 PCS + * advertisement + * @pcs: a pointer to a &struct mdio_device. + * @state: a pointer to the state being configured. + * + * Helper for MAC PCS supporting the 802.3 clause 22 register set for + * clause 37 negotiation and/or SGMII control. + * + * Configure the clause 37 PCS advertisement as specified by @state. This + * does not trigger a renegotiation; phylink will do that via the + * mac_an_restart() method of the struct phylink_mac_ops structure. + * + * Returns negative error code on failure to configure the advertisement, + * zero if no change has been made, or one if the advertisement has changed. + */ +int phylink_mii_c22_pcs_set_advertisement(struct mdio_device *pcs, + const struct phylink_link_state *state) +{ + struct mii_bus *bus = pcs->bus; + int addr = pcs->addr; + int val, ret; + u16 adv; + + switch (state->interface) { + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_2500BASEX: + adv = ADVERTISE_1000XFULL; + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, + state->advertising)) + adv |= ADVERTISE_1000XPAUSE; + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, + state->advertising)) + adv |= ADVERTISE_1000XPSE_ASYM; + + val = mdiobus_read(bus, addr, MII_ADVERTISE); + if (val < 0) + return val; + + if (val == adv) + return 0; + + ret = mdiobus_write(bus, addr, MII_ADVERTISE, adv); + if (ret < 0) + return ret; + + return 1; + + case PHY_INTERFACE_MODE_SGMII: + val = mdiobus_read(bus, addr, MII_ADVERTISE); + if (val < 0) + return val; + + if (val == 0x0001) + return 0; + + ret = mdiobus_write(bus, addr, MII_ADVERTISE, 0x0001); + if (ret < 0) + return ret; + + return 1; + + default: + /* Nothing to do for other modes */ + return 0; + } +} +EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_set_advertisement); + +/** + * phylink_mii_c22_pcs_an_restart() - restart 802.3z autonegotiation + * @pcs: a pointer to a &struct mdio_device. + * + * Helper for MAC PCS supporting the 802.3 clause 22 register set for + * clause 37 negotiation. + * + * Restart the clause 37 negotiation with the link partner. This is + * suitable to be directly plugged into the mac_an_restart() member + * of the struct phylink_mac_ops structure.
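+ *
+ * A matching sketch for the restart hook (same hypothetical "foo"
+ * driver as above):
+ *
+ *	static void foo_mac_an_restart(struct phylink_config *config)
+ *	{
+ *		struct foo_priv *priv = netdev_priv(to_net_dev(config->dev));
+ *
+ *		phylink_mii_c22_pcs_an_restart(priv->pcs);
+ *	}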
+ */ +void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs) +{ + struct mii_bus *bus = pcs->bus; + int val, addr = pcs->addr; + + val = mdiobus_read(bus, addr, MII_BMCR); + if (val >= 0) { + val |= BMCR_ANRESTART; + + mdiobus_write(bus, addr, MII_BMCR, val); + } +} +EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_an_restart); + +#define C45_ADDR(d,a) (MII_ADDR_C45 | (d) << 16 | (a)) +void phylink_mii_c45_pcs_get_state(struct mdio_device *pcs, + struct phylink_link_state *state) +{ + struct mii_bus *bus = pcs->bus; + int addr = pcs->addr; + int stat; + + stat = mdiobus_read(bus, addr, C45_ADDR(MDIO_MMD_PCS, MDIO_STAT1)); + if (stat < 0) { + state->link = false; + return; + } + + state->link = !!(stat & MDIO_STAT1_LSTATUS); + if (!state->link) + return; + + switch (state->interface) { + case PHY_INTERFACE_MODE_10GBASER: + state->speed = SPEED_10000; + state->duplex = DUPLEX_FULL; + break; + + default: + break; + } +} +EXPORT_SYMBOL_GPL(phylink_mii_c45_pcs_get_state); + MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index f5fa2fff3ddc..2d99e9de6ee1 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -49,6 +49,8 @@ #define RTL_LPADV_5000FULL BIT(6) #define RTL_LPADV_2500FULL BIT(5) +#define RTLGEN_SPEED_MASK 0x0630 + #define RTL_GENERIC_PHYID 0x001cc800 MODULE_DESCRIPTION("Realtek PHY driver"); @@ -309,6 +311,55 @@ static int rtl8366rb_config_init(struct phy_device *phydev) return ret; } +/* get actual speed to cover the downshift case */ +static int rtlgen_get_speed(struct phy_device *phydev) +{ + int val; + + if (!phydev->link) + return 0; + + val = phy_read_paged(phydev, 0xa43, 0x12); + if (val < 0) + return val; + + switch (val & RTLGEN_SPEED_MASK) { + case 0x0000: + phydev->speed = SPEED_10; + break; + case 0x0010: + phydev->speed = SPEED_100; + break; + case 0x0020: + phydev->speed = SPEED_1000; + break; + case 0x0200: + phydev->speed = SPEED_10000; + break; + case 0x0210: + phydev->speed = SPEED_2500; + break; + case 0x0220: + phydev->speed = SPEED_5000; + break; + default: + break; + } + + return 0; +} + +static int rtlgen_read_status(struct phy_device *phydev) +{ + int ret; + + ret = genphy_read_status(phydev); + if (ret < 0) + return ret; + + return rtlgen_get_speed(phydev); +} + static int rtlgen_read_mmd(struct phy_device *phydev, int devnum, u16 regnum) { int ret; @@ -429,6 +480,8 @@ static int rtl8125_config_aneg(struct phy_device *phydev) static int rtl8125_read_status(struct phy_device *phydev) { + int ret; + if (phydev->autoneg == AUTONEG_ENABLE) { int lpadv = phy_read_paged(phydev, 0xa5d, 0x13); @@ -443,7 +496,11 @@ static int rtl8125_read_status(struct phy_device *phydev) phydev->lp_advertising, lpadv & RTL_LPADV_2500FULL); } - return genphy_read_status(phydev); + ret = genphy_read_status(phydev); + if (ret < 0) + return ret; + + return rtlgen_get_speed(phydev); } static bool rtlgen_supports_2_5gbps(struct phy_device *phydev) @@ -550,6 +607,7 @@ static struct phy_driver realtek_drvs[] = { }, { .name = "Generic FE-GE Realtek PHY", .match_phy_device = rtlgen_match_phy_device, + .read_status = rtlgen_read_status, .suspend = genphy_suspend, .resume = genphy_resume, .read_page = rtl821x_read_page, diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index 962be94ed3ca..6eb431c194bd 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -847,11 +847,6 @@ struct qeth_trap_id { /*some helper functions*/ #define QETH_CARD_IFNAME(card) (((card)->dev)? 
(card)->dev->name : "") -static inline bool qeth_netdev_is_registered(struct net_device *dev) -{ - return dev->netdev_ops != NULL; -} - static inline u16 qeth_iqd_translate_txq(struct net_device *dev, u16 txq) { if (txq == QETH_IQD_MCAST_TXQ) @@ -1053,6 +1048,7 @@ int qeth_configure_cq(struct qeth_card *, enum qeth_cq); int qeth_hw_trap(struct qeth_card *, enum qeth_diags_trap_action); void qeth_trace_features(struct qeth_card *); int qeth_setassparms_cb(struct qeth_card *, struct qeth_reply *, unsigned long); +int qeth_setup_netdev(struct qeth_card *card); int qeth_set_features(struct net_device *, netdev_features_t); void qeth_enable_hw_features(struct net_device *dev); netdev_features_t qeth_fix_features(struct net_device *, netdev_features_t); @@ -1060,6 +1056,7 @@ netdev_features_t qeth_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features); void qeth_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats); +int qeth_set_real_num_tx_queues(struct qeth_card *card, unsigned int count); u16 qeth_iqd_select_queue(struct net_device *dev, struct sk_buff *skb, u8 cast_type, struct net_device *sb_dev); int qeth_open(struct net_device *dev); diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 6caa78d51bd1..bd3adbb6ad50 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -244,7 +244,7 @@ static struct qeth_buffer_pool_entry *qeth_alloc_pool_entry(unsigned int pages) return NULL; for (i = 0; i < pages; i++) { - entry->elements[i] = alloc_page(GFP_KERNEL); + entry->elements[i] = __dev_alloc_page(GFP_KERNEL); if (!entry->elements[i]) { qeth_free_pool_entry(entry); @@ -538,9 +538,10 @@ static void qeth_qdio_handle_aob(struct qeth_card *card, for (i = 0; i < aob->sb_count && i < QETH_MAX_BUFFER_ELEMENTS(card); i++) { - if (aob->sba[i] && buffer->is_header[i]) - kmem_cache_free(qeth_core_header_cache, - (void *) aob->sba[i]); + void *data = phys_to_virt(aob->sba[i]); + + if (data && buffer->is_header[i]) + kmem_cache_free(qeth_core_header_cache, data); } atomic_set(&buffer->state, QETH_QDIO_BUF_HANDLED_DELAYED); @@ -1244,9 +1245,12 @@ EXPORT_SYMBOL_GPL(qeth_drain_output_queues); static int qeth_osa_set_output_queues(struct qeth_card *card, bool single) { - unsigned int count = single ? 1 : card->dev->num_tx_queues; + unsigned int max = single ? 1 : card->dev->num_tx_queues; + unsigned int count; int rc; + count = IS_VM_NIC(card) ? 
min(max, card->dev->real_num_tx_queues) : max; + rtnl_lock(); rc = netif_set_real_num_tx_queues(card->dev, count); rtnl_unlock(); @@ -1254,16 +1258,16 @@ static int qeth_osa_set_output_queues(struct qeth_card *card, bool single) if (rc) return rc; - if (card->qdio.no_out_queues == count) + if (card->qdio.no_out_queues == max) return 0; if (atomic_read(&card->qdio.state) != QETH_QDIO_UNINITIALIZED) qeth_free_qdio_queues(card); - if (count == 1) + if (max == 1 && card->qdio.do_prio_queueing != QETH_PRIOQ_DEFAULT) dev_info(&card->gdev->dev, "Priority Queueing not supported\n"); - card->qdio.no_out_queues = count; + card->qdio.no_out_queues = max; return 0; } @@ -2654,7 +2658,7 @@ static struct qeth_buffer_pool_entry *qeth_find_free_buffer_pool_entry( struct qeth_buffer_pool_entry, list); for (i = 0; i < QETH_MAX_BUFFER_ELEMENTS(card); ++i) { if (page_count(entry->elements[i]) > 1) { - struct page *page = alloc_page(GFP_ATOMIC); + struct page *page = dev_alloc_page(); if (!page) return NULL; @@ -3352,6 +3356,7 @@ static void qeth_flush_buffers(struct qeth_qdio_out_q *queue, int index, for (i = index; i < index + count; ++i) { unsigned int bidx = QDIO_BUFNR(i); + struct sk_buff *skb; buf = queue->bufs[bidx]; buf->buffer->element[buf->next_element_to_fill - 1].eflags |= @@ -3360,8 +3365,11 @@ static void qeth_flush_buffers(struct qeth_qdio_out_q *queue, int index, if (queue->bufstates) queue->bufstates[bidx].user = buf; - if (IS_IQD(queue->card)) + if (IS_IQD(card)) { + skb_queue_walk(&buf->skb_list, skb) + skb_tx_timestamp(skb); continue; + } if (!queue->do_pack) { if ((atomic_read(&queue->used_buffers) >= @@ -3705,6 +3713,7 @@ static int qeth_add_hw_header(struct qeth_qdio_out_q *queue, unsigned int hdr_len, unsigned int proto_len, unsigned int *elements) { + gfp_t gfp = GFP_ATOMIC | (skb_pfmemalloc(skb) ? __GFP_MEMALLOC : 0); const unsigned int contiguous = proto_len ? proto_len : 1; const unsigned int max_elements = queue->max_elements; unsigned int __elements; @@ -3760,10 +3769,11 @@ check_layout: *hdr = skb_push(skb, hdr_len); return hdr_len; } - /* fall back */ + + /* Fall back to cache element with known-good alignment: */ if (hdr_len + proto_len > QETH_HDR_CACHE_OBJ_SIZE) return -E2BIG; - *hdr = kmem_cache_alloc(qeth_core_header_cache, GFP_ATOMIC); + *hdr = kmem_cache_alloc(qeth_core_header_cache, gfp); if (!*hdr) return -ENOMEM; /* Copy protocol headers behind HW header: */ @@ -5985,22 +5995,8 @@ static struct net_device *qeth_alloc_netdev(struct qeth_card *card) SET_NETDEV_DEV(dev, &card->gdev->dev); netif_carrier_off(dev); - if (IS_OSN(card)) { - dev->ethtool_ops = &qeth_osn_ethtool_ops; - } else { - dev->ethtool_ops = &qeth_ethtool_ops; - dev->priv_flags &= ~IFF_TX_SKB_SHARING; - dev->hw_features |= NETIF_F_SG; - dev->vlan_features |= NETIF_F_SG; - if (IS_IQD(card)) { - dev->features |= NETIF_F_SG; - if (netif_set_real_num_tx_queues(dev, - QETH_IQD_MIN_TXQ)) { - free_netdev(dev); - return NULL; - } - } - } + dev->ethtool_ops = IS_OSN(card) ? 
&qeth_osn_ethtool_ops : + &qeth_ethtool_ops; return dev; } @@ -6016,6 +6012,28 @@ struct net_device *qeth_clone_netdev(struct net_device *orig) return clone; } +int qeth_setup_netdev(struct qeth_card *card) +{ + struct net_device *dev = card->dev; + unsigned int num_tx_queues; + + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->hw_features |= NETIF_F_SG; + dev->vlan_features |= NETIF_F_SG; + + if (IS_IQD(card)) { + dev->features |= NETIF_F_SG; + num_tx_queues = QETH_IQD_MIN_TXQ; + } else if (IS_VM_NIC(card)) { + num_tx_queues = 1; + } else { + num_tx_queues = dev->real_num_tx_queues; + } + + return qeth_set_real_num_tx_queues(card, num_tx_queues); +} +EXPORT_SYMBOL_GPL(qeth_setup_netdev); + static int qeth_core_probe_device(struct ccwgroup_device *gdev) { struct qeth_card *card; @@ -6055,12 +6073,13 @@ static int qeth_core_probe_device(struct ccwgroup_device *gdev) goto err_card; } + qeth_determine_capabilities(card); + qeth_set_blkt_defaults(card); + card->qdio.no_out_queues = card->dev->num_tx_queues; rc = qeth_update_from_chp_desc(card); if (rc) goto err_chp_desc; - qeth_determine_capabilities(card); - qeth_set_blkt_defaults(card); enforced_disc = qeth_enforce_discipline(card); switch (enforced_disc) { @@ -6245,9 +6264,6 @@ int qeth_do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) struct mii_ioctl_data *mii_data; int rc = 0; - if (!card) - return -ENODEV; - switch (cmd) { case SIOC_QETH_ADP_SET_SNMP_CONTROL: rc = qeth_snmp_command(card, rq->ifr_ifru.ifru_data); @@ -6627,12 +6643,59 @@ void qeth_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) } EXPORT_SYMBOL_GPL(qeth_get_stats64); +#define TC_IQD_UCAST 0 +static void qeth_iqd_set_prio_tc_map(struct net_device *dev, + unsigned int ucast_txqs) +{ + unsigned int prio; + + /* IQD requires mcast traffic to be placed on a dedicated queue, and + * qeth_iqd_select_queue() deals with this. + * For unicast traffic, we defer the queue selection to the stack. + * By installing a trivial prio map that spans over only the unicast + * queues, we can encourage the stack to spread the ucast traffic evenly + * without selecting the mcast queue. + */ + + /* One traffic class, spanning over all active ucast queues: */ + netdev_set_num_tc(dev, 1); + netdev_set_tc_queue(dev, TC_IQD_UCAST, ucast_txqs, + QETH_IQD_MIN_UCAST_TXQ); + + /* Map all priorities to this traffic class: */ + for (prio = 0; prio <= TC_BITMASK; prio++) + netdev_set_prio_tc_map(dev, prio, TC_IQD_UCAST); +} + +int qeth_set_real_num_tx_queues(struct qeth_card *card, unsigned int count) +{ + struct net_device *dev = card->dev; + int rc; + + /* Per netif_setup_tc(), adjust the mapping first: */ + if (IS_IQD(card)) + qeth_iqd_set_prio_tc_map(dev, count - 1); + + rc = netif_set_real_num_tx_queues(dev, count); + + if (rc && IS_IQD(card)) + qeth_iqd_set_prio_tc_map(dev, dev->real_num_tx_queues - 1); + + return rc; +} + u16 qeth_iqd_select_queue(struct net_device *dev, struct sk_buff *skb, u8 cast_type, struct net_device *sb_dev) { + u16 txq; + if (cast_type != RTN_UNICAST) return QETH_IQD_MCAST_TXQ; - return QETH_IQD_MIN_UCAST_TXQ; + if (dev->real_num_tx_queues == QETH_IQD_MIN_TXQ) + return QETH_IQD_MIN_UCAST_TXQ; + + txq = netdev_pick_tx(dev, skb, sb_dev); + return (txq == QETH_IQD_MCAST_TXQ) ? 
QETH_IQD_MIN_UCAST_TXQ : txq; } EXPORT_SYMBOL_GPL(qeth_iqd_select_queue); diff --git a/drivers/s390/net/qeth_core_sys.c b/drivers/s390/net/qeth_core_sys.c index 78cae61bc924..533a7f26dbe1 100644 --- a/drivers/s390/net/qeth_core_sys.c +++ b/drivers/s390/net/qeth_core_sys.c @@ -176,7 +176,7 @@ static ssize_t qeth_dev_prioqing_store(struct device *dev, struct qeth_card *card = dev_get_drvdata(dev); int rc = 0; - if (IS_IQD(card)) + if (IS_IQD(card) || IS_VM_NIC(card)) return -EOPNOTSUPP; mutex_lock(&card->conf_mutex); diff --git a/drivers/s390/net/qeth_ethtool.c b/drivers/s390/net/qeth_ethtool.c index 9052c72d5b8f..31e019085fc3 100644 --- a/drivers/s390/net/qeth_ethtool.c +++ b/drivers/s390/net/qeth_ethtool.c @@ -153,7 +153,6 @@ static void qeth_get_drvinfo(struct net_device *dev, strlcpy(info->driver, IS_LAYER2(card) ? "qeth_l2" : "qeth_l3", sizeof(info->driver)); - strlcpy(info->version, "1.0", sizeof(info->version)); strlcpy(info->fw_version, card->info.mcl_level, sizeof(info->fw_version)); snprintf(info->bus_info, sizeof(info->bus_info), "%s/%s/%s", @@ -175,6 +174,46 @@ static void qeth_get_channels(struct net_device *dev, channels->combined_count = 0; } +static int qeth_set_channels(struct net_device *dev, + struct ethtool_channels *channels) +{ + struct qeth_card *card = dev->ml_priv; + + if (channels->rx_count == 0 || channels->tx_count == 0) + return -EINVAL; + if (channels->tx_count > card->qdio.no_out_queues) + return -EINVAL; + + if (IS_IQD(card)) { + if (channels->tx_count < QETH_IQD_MIN_TXQ) + return -EINVAL; + + /* Reject downgrade while running. It could push displaced + * ucast flows onto txq0, which is reserved for mcast. + */ + if (netif_running(dev) && + channels->tx_count < dev->real_num_tx_queues) + return -EPERM; + } else { + /* OSA still uses the legacy prio-queue mechanism: */ + if (!IS_VM_NIC(card)) + return -EOPNOTSUPP; + } + + return qeth_set_real_num_tx_queues(card, channels->tx_count); +} + +static int qeth_get_ts_info(struct net_device *dev, + struct ethtool_ts_info *info) +{ + struct qeth_card *card = dev->ml_priv; + + if (!IS_IQD(card)) + return -EOPNOTSUPP; + + return ethtool_op_get_ts_info(dev, info); +} + static int qeth_get_tunable(struct net_device *dev, const struct ethtool_tunable *tuna, void *data) { @@ -410,6 +449,8 @@ const struct ethtool_ops qeth_ethtool_ops = { .get_sset_count = qeth_get_sset_count, .get_drvinfo = qeth_get_drvinfo, .get_channels = qeth_get_channels, + .set_channels = qeth_set_channels, + .get_ts_info = qeth_get_ts_info, .get_tunable = qeth_get_tunable, .set_tunable = qeth_set_tunable, .get_link_ksettings = qeth_get_link_ksettings, diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 4c8e93132e08..73cb363b1fab 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -499,6 +499,7 @@ static void qeth_l2_rx_mode_work(struct work_struct *work) static int qeth_l2_xmit_osn(struct qeth_card *card, struct sk_buff *skb, struct qeth_qdio_out_q *queue) { + gfp_t gfp = GFP_ATOMIC | (skb_pfmemalloc(skb) ? __GFP_MEMALLOC : 0); struct qeth_hdr *hdr = (struct qeth_hdr *)skb->data; addr_t end = (addr_t)(skb->data + sizeof(*hdr)); addr_t start = (addr_t)skb->data; @@ -511,7 +512,7 @@ static int qeth_l2_xmit_osn(struct qeth_card *card, struct sk_buff *skb, if (qeth_get_elements_for_range(start, end) > 1) { /* Misaligned HW header, move it to its own buffer element. 
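 * qeth_get_elements_for_range() returns how many buffer elements the
 * [start, end) range spans; a result greater than one means the header
 * straddles an element boundary, hence it is copied into a separately
 * allocated cache object below.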
*/ - hdr = kmem_cache_alloc(qeth_core_header_cache, GFP_ATOMIC); + hdr = kmem_cache_alloc(qeth_core_header_cache, gfp); if (!hdr) return -ENOMEM; hd_len = sizeof(*hdr); @@ -570,7 +571,9 @@ static u16 qeth_l2_select_queue(struct net_device *dev, struct sk_buff *skb, return qeth_iqd_select_queue(dev, skb, qeth_get_ether_cast_type(skb), sb_dev); - return qeth_get_priority_queue(card, skb); + + return IS_VM_NIC(card) ? netdev_pick_tx(dev, skb, sb_dev) : + qeth_get_priority_queue(card, skb); } static const struct device_type qeth_l2_devtype = { @@ -610,7 +613,7 @@ static void qeth_l2_remove_device(struct ccwgroup_device *cgdev) qeth_set_offline(card, false); cancel_work_sync(&card->close_dev_work); - if (qeth_netdev_is_registered(card->dev)) + if (card->dev->reg_state == NETREG_REGISTERED) unregister_netdev(card->dev); } @@ -648,7 +651,7 @@ static const struct net_device_ops qeth_osn_netdev_ops = { .ndo_tx_timeout = qeth_tx_timeout, }; -static int qeth_l2_setup_netdev(struct qeth_card *card, bool carrier_ok) +static int qeth_l2_setup_netdev(struct qeth_card *card) { int rc; @@ -658,6 +661,10 @@ static int qeth_l2_setup_netdev(struct qeth_card *card, bool carrier_ok) goto add_napi; } + rc = qeth_setup_netdev(card); + if (rc) + return rc; + card->dev->needed_headroom = sizeof(struct qeth_hdr); card->dev->netdev_ops = &qeth_l2_netdev_ops; card->dev->priv_flags |= IFF_UNICAST_FLT; @@ -704,13 +711,7 @@ static int qeth_l2_setup_netdev(struct qeth_card *card, bool carrier_ok) add_napi: netif_napi_add(card->dev, &card->napi, qeth_poll, QETH_NAPI_WEIGHT); - rc = register_netdev(card->dev); - if (!rc && carrier_ok) - netif_carrier_on(card->dev); - - if (rc) - card->dev->netdev_ops = NULL; - return rc; + return register_netdev(card->dev); } static void qeth_l2_trace_features(struct qeth_card *card) @@ -783,10 +784,13 @@ static int qeth_l2_set_online(struct qeth_card *card) qeth_set_allowed_threads(card, 0xffffffff, 0); - if (!qeth_netdev_is_registered(dev)) { - rc = qeth_l2_setup_netdev(card, carrier_ok); + if (dev->reg_state != NETREG_REGISTERED) { + rc = qeth_l2_setup_netdev(card); if (rc) goto out_remove; + + if (carrier_ok) + netif_carrier_on(dev); } else { rtnl_lock(); if (carrier_ok) @@ -1512,8 +1516,6 @@ int qeth_bridgeport_an_set(struct qeth_card *card, int enable) struct ccw_device *ddev; struct subchannel_id schid; - if (!card) - return -EINVAL; if (!card->options.sbp.supported_funcs) return -EOPNOTSUPP; ddev = CARD_DDEV(card); diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 8a803d6c9357..83ae75cf1389 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -1880,7 +1880,8 @@ static u16 qeth_l3_osa_select_queue(struct net_device *dev, struct sk_buff *skb, { struct qeth_card *card = dev->ml_priv; - return qeth_get_priority_queue(card, skb); + return IS_VM_NIC(card) ? 
netdev_pick_tx(dev, skb, sb_dev) : + qeth_get_priority_queue(card, skb); } static const struct net_device_ops qeth_l3_netdev_ops = { @@ -1917,11 +1918,15 @@ static const struct net_device_ops qeth_l3_osa_netdev_ops = { .ndo_neigh_setup = qeth_l3_neigh_setup, }; -static int qeth_l3_setup_netdev(struct qeth_card *card, bool carrier_ok) +static int qeth_l3_setup_netdev(struct qeth_card *card) { unsigned int headroom; int rc; + rc = qeth_setup_netdev(card); + if (rc) + return rc; + if (IS_OSD(card) || IS_OSX(card)) { if ((card->info.link_type == QETH_LINK_TYPE_LANE_TR) || (card->info.link_type == QETH_LINK_TYPE_HSTR)) { @@ -1967,7 +1972,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card, bool carrier_ok) rc = qeth_l3_iqd_read_initial_mac(card); if (rc) - goto out; + return rc; } else return -ENODEV; @@ -1982,14 +1987,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card, bool carrier_ok) PAGE_SIZE * (QETH_MAX_BUFFER_ELEMENTS(card) - 1)); netif_napi_add(card->dev, &card->napi, qeth_poll, QETH_NAPI_WEIGHT); - rc = register_netdev(card->dev); - if (!rc && carrier_ok) - netif_carrier_on(card->dev); - -out: - if (rc) - card->dev->netdev_ops = NULL; - return rc; + return register_netdev(card->dev); } static const struct device_type qeth_l3_devtype = { @@ -2036,7 +2034,7 @@ static void qeth_l3_remove_device(struct ccwgroup_device *cgdev) qeth_set_offline(card, false); cancel_work_sync(&card->close_dev_work); - if (qeth_netdev_is_registered(card->dev)) + if (card->dev->reg_state == NETREG_REGISTERED) unregister_netdev(card->dev); flush_workqueue(card->cmd_wq); @@ -2083,10 +2081,13 @@ static int qeth_l3_set_online(struct qeth_card *card) qeth_set_allowed_threads(card, 0xffffffff, 0); qeth_l3_recover_ip(card); - if (!qeth_netdev_is_registered(dev)) { - rc = qeth_l3_setup_netdev(card, carrier_ok); + if (dev->reg_state != NETREG_REGISTERED) { + rc = qeth_l3_setup_netdev(card); if (rc) goto out_remove; + + if (carrier_ok) + netif_carrier_on(dev); } else { rtnl_lock(); if (carrier_ok) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index be355f37337d..c1d379bf6ee1 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -458,6 +458,8 @@ struct ethtool_ops { struct ethtool_stats *, u64 *); }; +int ethtool_check_ops(const struct ethtool_ops *ops); + struct ethtool_rx_flow_rule { struct flow_rule *rule; unsigned long priv[0]; diff --git a/include/linux/mdio.h b/include/linux/mdio.h index a7604248777b..917e4bb2ed71 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -316,11 +316,15 @@ static inline void mii_10gbt_stat_mod_linkmode_lpa_t(unsigned long *advertising, int __mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); +int __mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, + u16 mask, u16 set); int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum); int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val); +int mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask, + u16 set); int mdiobus_register_device(struct mdio_device *mdiodev); int mdiobus_unregister_device(struct mdio_device *mdiodev); diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 5448c8b443db..ab192720e2d6 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ 
b/include/linux/netfilter/ipset/ip_set.h @@ -98,7 +98,7 @@ struct ip_set_counter { struct ip_set_comment_rcu { struct rcu_head rcu; - char str[0]; + char str[]; }; struct ip_set_comment { diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 1b261c51b3a3..5da88451853b 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -264,7 +264,7 @@ struct xt_table_info { unsigned int stacksize; void ***jumpstack; - unsigned char entries[0] __aligned(8); + unsigned char entries[] __aligned(8); }; int xt_register_target(struct xt_target *target); @@ -464,7 +464,7 @@ struct compat_xt_entry_match { } kernel; u_int16_t match_size; } u; - unsigned char data[0]; + unsigned char data[]; }; struct compat_xt_entry_target { @@ -480,7 +480,7 @@ struct compat_xt_entry_target { } kernel; u_int16_t target_size; } u; - unsigned char data[0]; + unsigned char data[]; }; /* FIXME: this works only on 32 bit tasks @@ -494,7 +494,7 @@ struct compat_xt_counters { struct compat_xt_counters_info { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t num_counters; - struct compat_xt_counters counters[0]; + struct compat_xt_counters counters[]; }; struct _compat_xt_align { diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index e98028f00e47..7d3537c40ec9 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -67,7 +67,7 @@ struct compat_arpt_entry { __u16 next_offset; compat_uint_t comefrom; struct compat_xt_counters counters; - unsigned char elems[0]; + unsigned char elems[]; }; static inline struct xt_entry_target * diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 162f59d0d17a..2f5c4e6ecd8a 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -85,7 +85,7 @@ struct ebt_table_info { /* room to maintain the stack used for jumping from and into udc */ struct ebt_chainstack **chainstack; char *entries; - struct ebt_counter counters[0] ____cacheline_aligned; + struct ebt_counter counters[] ____cacheline_aligned; }; struct ebt_table { diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index e9e1ed74cdf1..b394bd4f68a3 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -76,7 +76,7 @@ struct compat_ipt_entry { __u16 next_offset; compat_uint_t comefrom; struct compat_xt_counters counters; - unsigned char elems[0]; + unsigned char elems[]; }; /* Helper functions */ diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 78ab959c4575..8225f7821a29 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -43,7 +43,7 @@ struct compat_ip6t_entry { __u16 next_offset; compat_uint_t comefrom; struct compat_xt_counters counters; - unsigned char elems[0]; + unsigned char elems[]; }; static inline struct xt_entry_target * diff --git a/include/linux/phy.h b/include/linux/phy.h index 6b872aed8ba6..36d9dea04016 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -23,6 +23,7 @@ #include <linux/workqueue.h> #include <linux/mod_devicetable.h> #include <linux/u64_stats_sync.h> +#include <linux/irqreturn.h> #include <linux/atomic.h> @@ -568,7 +569,7 @@ struct phy_driver { int (*did_interrupt)(struct phy_device *phydev); /* Override default interrupt handling */ - int 
(*handle_interrupt)(struct phy_device *phydev); + irqreturn_t (*handle_interrupt)(struct phy_device *phydev); /* Clears up any memory if needed */ void (*remove)(struct phy_device *phydev); @@ -754,6 +755,25 @@ static inline int __phy_write(struct phy_device *phydev, u32 regnum, u16 val) } /** + * __phy_modify_changed() - Convenience function for modifying a PHY register + * @phydev: a pointer to a &struct phy_device + * @regnum: register number + * @mask: bit mask of bits to clear + * @set: bit mask of bits to set + * + * Unlocked helper function which allows a PHY register to be modified as + * new register value = (old register value & ~mask) | set + * + * Returns negative errno, 0 if there was no change, and 1 in case of change + */ +static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, + u16 mask, u16 set) +{ + return __mdiobus_modify_changed(phydev->mdio.bus, phydev->mdio.addr, + regnum, mask, set); +} + +/** * phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. * @phydev: The phy_device struct diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 2180eb1aa254..8fa6df3b881b 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -317,4 +317,12 @@ int phylink_mii_ioctl(struct phylink *, struct ifreq *, int); void phylink_set_port_modes(unsigned long *bits); void phylink_helper_basex_speed(struct phylink_link_state *state); +void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, + struct phylink_link_state *state); +int phylink_mii_c22_pcs_set_advertisement(struct mdio_device *pcs, + const struct phylink_link_state *state); +void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs); + +void phylink_mii_c45_pcs_get_state(struct mdio_device *pcs, + struct phylink_link_state *state); #endif diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index efd8d47f6997..1e30b0d44b61 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -163,19 +163,17 @@ enum flow_action_mangle_base { }; enum flow_action_hw_stats_type_bit { - FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE_BIT, - FLOW_ACTION_HW_STATS_TYPE_DELAYED_BIT, + FLOW_ACTION_HW_STATS_IMMEDIATE_BIT, + FLOW_ACTION_HW_STATS_DELAYED_BIT, }; enum flow_action_hw_stats_type { - FLOW_ACTION_HW_STATS_TYPE_DISABLED = 0, - FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE = - BIT(FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE_BIT), - FLOW_ACTION_HW_STATS_TYPE_DELAYED = - BIT(FLOW_ACTION_HW_STATS_TYPE_DELAYED_BIT), - FLOW_ACTION_HW_STATS_TYPE_ANY = - FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE | - FLOW_ACTION_HW_STATS_TYPE_DELAYED, + FLOW_ACTION_HW_STATS_DISABLED = 0, + FLOW_ACTION_HW_STATS_IMMEDIATE = + BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT), + FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT), + FLOW_ACTION_HW_STATS_ANY = FLOW_ACTION_HW_STATS_IMMEDIATE | + FLOW_ACTION_HW_STATS_DELAYED, }; typedef void (*action_destr)(void *priv); @@ -285,8 +283,8 @@ static inline bool flow_offload_has_one_action(const struct flow_action *action) __act = &(__actions)->entries[++__i]) static inline bool -flow_action_mixed_hw_stats_types_check(const struct flow_action *action, - struct netlink_ext_ack *extack) +flow_action_mixed_hw_stats_check(const struct flow_action *action, + struct netlink_ext_ack *extack) { const struct flow_action_entry *action_entry; u8 uninitialized_var(last_hw_stats_type); @@ -313,20 +311,20 @@ flow_action_first_entry_get(const struct flow_action *action) } static inline bool -__flow_action_hw_stats_types_check(const struct 
flow_action *action, - struct netlink_ext_ack *extack, - bool check_allow_bit, - enum flow_action_hw_stats_type_bit allow_bit) +__flow_action_hw_stats_check(const struct flow_action *action, + struct netlink_ext_ack *extack, + bool check_allow_bit, + enum flow_action_hw_stats_type_bit allow_bit) { const struct flow_action_entry *action_entry; if (!flow_action_has_entries(action)) return true; - if (!flow_action_mixed_hw_stats_types_check(action, extack)) + if (!flow_action_mixed_hw_stats_check(action, extack)) return false; action_entry = flow_action_first_entry_get(action); if (!check_allow_bit && - action_entry->hw_stats_type != FLOW_ACTION_HW_STATS_TYPE_ANY) { + action_entry->hw_stats_type != FLOW_ACTION_HW_STATS_ANY) { NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\""); return false; } else if (check_allow_bit && @@ -338,19 +336,18 @@ __flow_action_hw_stats_types_check(const struct flow_action *action, } static inline bool -flow_action_hw_stats_types_check(const struct flow_action *action, - struct netlink_ext_ack *extack, - enum flow_action_hw_stats_type_bit allow_bit) +flow_action_hw_stats_check(const struct flow_action *action, + struct netlink_ext_ack *extack, + enum flow_action_hw_stats_type_bit allow_bit) { - return __flow_action_hw_stats_types_check(action, extack, - true, allow_bit); + return __flow_action_hw_stats_check(action, extack, true, allow_bit); } static inline bool -flow_action_basic_hw_stats_types_check(const struct flow_action *action, - struct netlink_ext_ack *extack) +flow_action_basic_hw_stats_check(const struct flow_action *action, + struct netlink_ext_ack *extack) { - return __flow_action_hw_stats_types_check(action, extack, false, 0); + return __flow_action_hw_stats_check(action, extack, false, 0); } struct flow_rule { diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 5ae5295aa46d..e1e588387103 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -45,7 +45,7 @@ enum nf_ct_ext_id { struct nf_ct_ext { u8 offset[NF_CT_EXT_NUM]; u8 len; - char data[0]; + char data[]; }; static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id) diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 6dd72396f534..659b0ea25b4d 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -14,7 +14,7 @@ struct nf_ct_timeout { __u16 l3num; const struct nf_conntrack_l4proto *l4proto; - char data[0]; + char data[]; }; struct ctnl_timeout { diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 6890f1ca3e31..f523ea87b6ae 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -19,11 +19,17 @@ enum flow_offload_tuple_dir; struct nf_flow_key { struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; + struct flow_dissector_key_control enc_control; struct flow_dissector_key_basic basic; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; }; + struct flow_dissector_key_keyid enc_key_id; + union { + struct flow_dissector_key_ipv4_addrs enc_ipv4; + struct flow_dissector_key_ipv6_addrs enc_ipv6; + }; struct flow_dissector_key_tcp tcp; struct flow_dissector_key_ports tp; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
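 *
 * With both the size and the alignment of the struct rounded up to
 * sizeof(long), a key comparison can proceed one word at a time
 * rather than byte by byte, roughly (k1/k2 being hypothetical
 * pointers to two keys):
 *
 *	for (i = 0; i < sizeof(struct nf_flow_key) / sizeof(long); i++)
 *		if (((const unsigned long *)k1)[i] !=
 *		    ((const unsigned long *)k2)[i])
 *			return false;	/* keys differ */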
*/ diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4170c033d461..5d80e09f8148 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -224,7 +224,7 @@ int nft_validate_register_store(const struct nft_ctx *ctx, */ struct nft_userdata { u8 len; - unsigned char data[0]; + unsigned char data[]; }; /** @@ -385,21 +385,14 @@ struct nft_set_ops { * struct nft_set_type - nf_tables set type * * @ops: set ops for this type - * @list: used internally - * @owner: module reference * @features: features supported by the implementation */ struct nft_set_type { const struct nft_set_ops ops; - struct list_head list; - struct module *owner; u32 features; }; #define to_set_type(o) container_of(o, struct nft_set_type, ops) -int nft_register_set(struct nft_set_type *type); -void nft_unregister_set(struct nft_set_type *type); - /** * struct nft_set - nf_tables set instance * @@ -572,7 +565,7 @@ struct nft_set_ext_tmpl { struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; - char data[0]; + char data[]; }; static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) @@ -673,6 +666,10 @@ static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) return nft_set_ext(ext, NFT_SET_EXT_OBJREF); } +struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nlattr *attr); + void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, const u32 *data, @@ -849,8 +846,6 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) return (void *)expr->data; } -struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, - const struct nlattr *nla); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr); @@ -895,6 +890,18 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) return (void *)&rule->data[rule->dlen]; } +static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_expr *expr; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) { + expr = nft_set_ext_expr(ext); + expr->ops->eval(expr, regs, pkt); + } +} + /* * The last pointer isn't really necessary, but the compiler isn't able to * determine that the result of nft_expr_last() is always the same since it @@ -1253,9 +1260,6 @@ void nft_trace_notify(struct nft_traceinfo *info); #define MODULE_ALIAS_NFT_EXPR(name) \ MODULE_ALIAS("nft-expr-" name) -#define MODULE_ALIAS_NFT_SET() \ - MODULE_ALIAS("nft-set") - #define MODULE_ALIAS_NFT_OBJ(type) \ MODULE_ALIAS("nft-obj-" __stringify(type)) @@ -1385,7 +1389,7 @@ struct nft_trans { int msg_type; bool put_net; struct nft_ctx ctx; - char data[0]; + char data[]; }; struct nft_trans_rule { diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 29e7e1021267..78516de14d31 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -69,12 +69,13 @@ extern const struct nft_expr_ops nft_payload_fast_ops; extern struct static_key_false nft_counters_enabled; extern struct static_key_false nft_trace_enabled; -extern struct nft_set_type nft_set_rhash_type; -extern struct nft_set_type nft_set_hash_type; -extern struct nft_set_type nft_set_hash_fast_type; -extern struct nft_set_type nft_set_rbtree_type; -extern 
struct nft_set_type nft_set_bitmap_type; -extern struct nft_set_type nft_set_pipapo_type; +extern const struct nft_set_type nft_set_rhash_type; +extern const struct nft_set_type nft_set_hash_type; +extern const struct nft_set_type nft_set_hash_fast_type; +extern const struct nft_set_type nft_set_rbtree_type; +extern const struct nft_set_type nft_set_bitmap_type; +extern const struct nft_set_type nft_set_pipapo_type; +extern const struct nft_set_type nft_set_pipapo_avx2_type; struct nft_expr; struct nft_regs; diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 20d2c6419612..9092e697059e 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -75,7 +75,15 @@ struct qdisc_watchdog { void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires); + +void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, + u64 delta_ns); + +static inline void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, + u64 expires) +{ + return qdisc_watchdog_schedule_range_ns(wd, expires, 0ULL); +} static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 42f7ca38ad80..54010b49c093 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -131,6 +131,7 @@ enum { #define BRIDGE_VLAN_INFO_RANGE_END (1<<4) /* VLAN is end of vlan range */ #define BRIDGE_VLAN_INFO_BRENTRY (1<<5) /* Global bridge VLAN entry */ #define BRIDGE_VLAN_INFO_ONLY_OPTS (1<<6) /* Skip create/delete/flags */ +#define BRIDGE_VLAN_INFO_REMOVE_TUN (1<<7) /* Remove tunnel mapping */ struct bridge_vlan_info { __u16 flags; @@ -192,6 +193,7 @@ enum { BRIDGE_VLANDB_ENTRY_INFO, BRIDGE_VLANDB_ENTRY_RANGE, BRIDGE_VLANDB_ENTRY_STATE, + BRIDGE_VLANDB_ENTRY_TUNNEL_ID, __BRIDGE_VLANDB_ENTRY_MAX, }; #define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1) diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h index 0b9c3beda345..90f9b4e1ba27 100644 --- a/include/uapi/linux/mii.h +++ b/include/uapi/linux/mii.h @@ -134,11 +134,16 @@ /* MAC and PHY tx_config_Reg[15:0] for SGMII in-band auto-negotiation.*/ #define ADVERTISE_SGMII 0x0001 /* MAC can do SGMII */ #define LPA_SGMII 0x0001 /* PHY can do SGMII */ +#define LPA_SGMII_SPD_MASK 0x0c00 /* SGMII speed mask */ +#define LPA_SGMII_FULL_DUPLEX 0x1000 /* SGMII full duplex */ #define LPA_SGMII_DPX_SPD_MASK 0x1C00 /* SGMII duplex and speed bits */ +#define LPA_SGMII_10 0x0000 /* 10Mbps */ #define LPA_SGMII_10HALF 0x0000 /* Can do 10mbps half-duplex */ #define LPA_SGMII_10FULL 0x1000 /* Can do 10mbps full-duplex */ +#define LPA_SGMII_100 0x0400 /* 100Mbps */ #define LPA_SGMII_100HALF 0x0400 /* Can do 100mbps half-duplex */ #define LPA_SGMII_100FULL 0x1400 /* Can do 100mbps full-duplex */ +#define LPA_SGMII_1000 0x0800 /* 1000Mbps */ #define LPA_SGMII_1000HALF 0x0800 /* Can do 1000mbps half-duplex */ #define LPA_SGMII_1000FULL 0x1800 /* Can do 1000mbps full-duplex */ #define LPA_SGMII_LINK 0x8000 /* PHY link with copper-side partner */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 065218a20bb7..9c3d2d04d6a1 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1770,6 +1770,7 @@ enum nft_tunnel_opts_attributes { 
NFTA_TUNNEL_KEY_OPTS_UNSPEC, NFTA_TUNNEL_KEY_OPTS_VXLAN, NFTA_TUNNEL_KEY_OPTS_ERSPAN, + NFTA_TUNNEL_KEY_OPTS_GENEVE, __NFTA_TUNNEL_KEY_OPTS_MAX }; #define NFTA_TUNNEL_KEY_OPTS_MAX (__NFTA_TUNNEL_KEY_OPTS_MAX - 1) @@ -1791,6 +1792,15 @@ enum nft_tunnel_opts_erspan_attributes { }; #define NFTA_TUNNEL_KEY_ERSPAN_MAX (__NFTA_TUNNEL_KEY_ERSPAN_MAX - 1) +enum nft_tunnel_opts_geneve_attributes { + NFTA_TUNNEL_KEY_GENEVE_UNSPEC, + NFTA_TUNNEL_KEY_GENEVE_CLASS, + NFTA_TUNNEL_KEY_GENEVE_TYPE, + NFTA_TUNNEL_KEY_GENEVE_DATA, + __NFTA_TUNNEL_KEY_GENEVE_MAX +}; +#define NFTA_TUNNEL_KEY_GENEVE_MAX (__NFTA_TUNNEL_KEY_GENEVE_MAX - 1) + enum nft_tunnel_flags { NFT_TUNNEL_F_ZERO_CSUM_TX = (1 << 0), NFT_TUNNEL_F_DONT_FRAGMENT = (1 << 1), diff --git a/include/uapi/linux/netfilter/xt_IDLETIMER.h b/include/uapi/linux/netfilter/xt_IDLETIMER.h index 3c586a19baea..434e6506abaa 100644 --- a/include/uapi/linux/netfilter/xt_IDLETIMER.h +++ b/include/uapi/linux/netfilter/xt_IDLETIMER.h @@ -1,4 +1,3 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * linux/include/linux/netfilter/xt_IDLETIMER.h * @@ -33,6 +32,7 @@ #include <linux/types.h> #define MAX_IDLETIMER_LABEL_SIZE 28 +#define XT_IDLETIMER_ALARM 0x01 struct idletimer_tg_info { __u32 timeout; @@ -43,4 +43,14 @@ struct idletimer_tg_info { struct idletimer_tg *timer __attribute__((aligned(8))); }; +struct idletimer_tg_info_v1 { + __u32 timeout; + + char label[MAX_IDLETIMER_LABEL_SIZE]; + + __u8 timer_type; + + /* for kernel module internal use only */ + struct idletimer_tg *timer __attribute__((aligned(8))); +}; #endif diff --git a/include/uapi/linux/netfilter_bridge/ebt_among.h b/include/uapi/linux/netfilter_bridge/ebt_among.h index 9acf757bc1f7..73b26a280c4f 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_among.h +++ b/include/uapi/linux/netfilter_bridge/ebt_among.h @@ -40,7 +40,7 @@ struct ebt_mac_wormhash_tuple { struct ebt_mac_wormhash { int table[257]; int poolsize; - struct ebt_mac_wormhash_tuple pool[0]; + struct ebt_mac_wormhash_tuple pool[]; }; #define ebt_mac_wormhash_size(x) ((x) ? 
sizeof(struct ebt_mac_wormhash) \ diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index ea39287d59c8..7307a29a103e 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -911,6 +911,8 @@ enum { TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */ + TCA_FQ_TIMER_SLACK, /* timer slack */ + __TCA_FQ_MAX }; diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c index afee292fb004..162998e2f039 100644 --- a/net/bridge/br_netlink_tunnel.c +++ b/net/bridge/br_netlink_tunnel.c @@ -26,8 +26,8 @@ static size_t __get_vlan_tinfo_size(void) nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */ } -static bool vlan_tunid_inrange(struct net_bridge_vlan *v_curr, - struct net_bridge_vlan *v_last) +bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *v_last) { __be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id); __be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id); @@ -193,8 +193,8 @@ static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + [IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 }, }; -static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd, - u16 vid, u32 tun_id, bool *changed) +int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd, + u16 vid, u32 tun_id, bool *changed) { int err = 0; @@ -250,8 +250,8 @@ int br_parse_vlan_tunnel_info(struct nlattr *attr, return 0; } -int br_process_vlan_tunnel_info(struct net_bridge *br, - struct net_bridge_port *p, int cmd, +int br_process_vlan_tunnel_info(const struct net_bridge *br, + const struct net_bridge_port *p, int cmd, struct vtunnel_info *tinfo_curr, struct vtunnel_info *tinfo_last, bool *changed) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 5153ffe79a01..1f97703a52ff 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1199,8 +1199,8 @@ static inline void br_vlan_notify(const struct net_bridge *br, /* br_vlan_options.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING -bool br_vlan_opts_eq(const struct net_bridge_vlan *v1, - const struct net_bridge_vlan *v2); +bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end); bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v); size_t br_vlan_opts_nl_size(void); int br_vlan_process_options(const struct net_bridge *br, diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h index 2bdef2ea3420..c54cc26211d7 100644 --- a/net/bridge/br_private_tunnel.h +++ b/net/bridge/br_private_tunnel.h @@ -18,8 +18,8 @@ struct vtunnel_info { /* br_netlink_tunnel.c */ int br_parse_vlan_tunnel_info(struct nlattr *attr, struct vtunnel_info *tinfo); -int br_process_vlan_tunnel_info(struct net_bridge *br, - struct net_bridge_port *p, +int br_process_vlan_tunnel_info(const struct net_bridge *br, + const struct net_bridge_port *p, int cmd, struct vtunnel_info *tinfo_curr, struct vtunnel_info *tinfo_last, @@ -32,8 +32,9 @@ int br_fill_vlan_tunnel_info(struct sk_buff *skb, /* br_vlan_tunnel.c */ int vlan_tunnel_init(struct net_bridge_vlan_group *vg); void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg); -int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid); -int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id); +int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid); +int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 
vid, + u32 tun_id); void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port); void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan); @@ -42,19 +43,23 @@ int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan_group *vg); int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan); +bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *v_last); +int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd, + u16 vid, u32 tun_id, bool *changed); #else static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg) { return 0; } -static inline int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, +static inline int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid) { return 0; } -static inline int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, +static inline int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid, u32 tun_id) { return 0; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 6b5deca08b89..24f524536be4 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1694,7 +1694,7 @@ bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, { return v_curr->vid - range_end->vid == 1 && range_end->flags == v_curr->flags && - br_vlan_opts_eq(v_curr, range_end); + br_vlan_opts_eq_range(v_curr, range_end); } static int br_vlan_dump_dev(const struct net_device *dev, @@ -1839,6 +1839,7 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = .len = sizeof(struct bridge_vlan_info) }, [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_ENTRY_TUNNEL_ID] = { .type = NLA_U32 }, }; static int br_vlan_rtm_process_one(struct net_device *dev, diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index cd2eb194eb98..138e180cf4d8 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -4,25 +4,48 @@ #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> +#include <net/ip_tunnels.h> #include "br_private.h" +#include "br_private_tunnel.h" -/* check if the options between two vlans are equal */ -bool br_vlan_opts_eq(const struct net_bridge_vlan *v1, - const struct net_bridge_vlan *v2) +static bool __vlan_tun_put(struct sk_buff *skb, const struct net_bridge_vlan *v) { - return v1->state == v2->state; + __be32 tid = tunnel_id_to_key32(v->tinfo.tunnel_id); + + if (!v->tinfo.tunnel_dst) + return true; + + return !nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_TUNNEL_ID, + be32_to_cpu(tid)); +} + +static bool __vlan_tun_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end) +{ + return (!v_curr->tinfo.tunnel_dst && !range_end->tinfo.tunnel_dst) || + vlan_tunid_inrange(v_curr, range_end); +} + +/* check if the options' state of v_curr allow it to enter the range */ +bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end) +{ + return v_curr->state == range_end->state && + __vlan_tun_can_enter_range(v_curr, range_end); } bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) { return !nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, - br_vlan_get_state(v)); + br_vlan_get_state(v)) && + __vlan_tun_put(skb, v); } size_t br_vlan_opts_nl_size(void) { - return nla_total_size(sizeof(u8)); /* BRIDGE_VLANDB_ENTRY_STATE */ + return 
nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_STATE */ + + nla_total_size(sizeof(u32)); /* BRIDGE_VLANDB_ENTRY_TUNNEL_ID */ } static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, @@ -62,6 +85,40 @@ static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, return 0; } +static int br_vlan_modify_tunnel(const struct net_bridge_port *p, + struct net_bridge_vlan *v, + struct nlattr **tb, + bool *changed, + struct netlink_ext_ack *extack) +{ + struct bridge_vlan_info *vinfo; + int cmdmap; + u32 tun_id; + + if (!p) { + NL_SET_ERR_MSG_MOD(extack, "Can't modify tunnel mapping of non-port vlans"); + return -EINVAL; + } + if (!(p->flags & BR_VLAN_TUNNEL)) { + NL_SET_ERR_MSG_MOD(extack, "Port doesn't have tunnel flag set"); + return -EINVAL; + } + + /* vlan info attribute is guaranteed by br_vlan_rtm_process_one */ + vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); + cmdmap = vinfo->flags & BRIDGE_VLAN_INFO_REMOVE_TUN ? RTM_DELLINK : + RTM_SETLINK; + /* when working on vlan ranges this represents the starting tunnel id */ + tun_id = nla_get_u32(tb[BRIDGE_VLANDB_ENTRY_TUNNEL_ID]); + /* tunnel ids are mapped to each vlan in increasing order, + * the starting vlan is in BRIDGE_VLANDB_ENTRY_INFO and v is the + * current vlan, so we compute: tun_id + (v->vid - vinfo->vid) + */ + tun_id += v->vid - vinfo->vid; + + return br_vlan_tunnel_info(p, cmdmap, v->vid, tun_id, changed); +} + static int br_vlan_process_one_opts(const struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan_group *vg, @@ -80,6 +137,11 @@ static int br_vlan_process_one_opts(const struct net_bridge *br, if (err) return err; } + if (tb[BRIDGE_VLANDB_ENTRY_TUNNEL_ID]) { + err = br_vlan_modify_tunnel(p, v, tb, changed, extack); + if (err) + return err; + } return 0; } diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index d13d2080f527..169e005fbda2 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -89,7 +89,8 @@ out: /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ -int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id) +int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid, + u32 tun_id) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; @@ -107,7 +108,7 @@ int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id) /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. 
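 *
 * (VLAN IDs 0 and 4095 are reserved by IEEE 802.1Q, hence the
 * 1-4094 window. When br_vlan_modify_tunnel() walks a vlan range,
 * br_vlan_tunnel_info() dispatches each vlan here for RTM_DELLINK or
 * to nbp_vlan_tunnel_info_add() for RTM_SETLINK, with tunnel ids
 * following the vids in lockstep on the add side: e.g. starting vid
 * 100, range end 102 and tunnel id 1000 map 100->1000, 101->1001
 * and 102->1002.)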
*/ -int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid) +int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index e1256e03a9a8..78db58c7aec2 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1561,7 +1561,7 @@ struct compat_ebt_entry_mwt { compat_uptr_t ptr; } u; compat_uint_t match_size; - compat_uint_t data[0] __attribute__ ((aligned (__alignof__(struct compat_ebt_replace)))); + compat_uint_t data[] __aligned(__alignof__(struct compat_ebt_replace)); }; /* account for possible padding between match_size and ->data */ diff --git a/net/core/dev.c b/net/core/dev.c index d84541c24446..021e18251465 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9283,6 +9283,10 @@ int register_netdevice(struct net_device *dev) BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); + ret = ethtool_check_ops(dev->ethtool_ops); + if (ret) + return ret; + spin_lock_init(&dev->addr_list_lock); lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index c5beb3031a72..5f782fa3029f 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -861,8 +861,8 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, if (!flow_offload_has_one_action(&cls->rule->action)) return err; - if (!flow_action_basic_hw_stats_types_check(&cls->rule->action, - cls->common.extack)) + if (!flow_action_basic_hw_stats_check(&cls->rule->action, + cls->common.extack)) return err; act = &cls->rule->action.entries[0]; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 0b22741b2f8f..dab047eec943 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -289,3 +289,14 @@ out: kfree(indir); return ret; } + +int ethtool_check_ops(const struct ethtool_ops *ops) +{ + if (WARN_ON(ops->set_coalesce && !ops->supported_coalesce_params)) + return -EINVAL; + /* NOTE: sufficiently insane drivers may swap ethtool_ops at runtime, + * the fact that ops are checked at registration time does not + * mean the ops attached to a netdev later on are sane. 
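+ *
+ * What the check enforces, roughly: any driver that implements
+ * set_coalesce must also declare the parameters it honours, e.g.
+ * (hypothetical foo driver)
+ *
+ *	static const struct ethtool_ops foo_ethtool_ops = {
+ *		.supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS,
+ *		.get_coalesce	= foo_get_coalesce,
+ *		.set_coalesce	= foo_set_coalesce,
+ *	};
+ *
+ * otherwise register_netdevice() now rejects the device with -EINVAL.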
+ */ + return 0; +} diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 258840b19fb5..3852a58d7f95 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1519,9 +1519,6 @@ ethtool_set_coalesce_supported(struct net_device *dev, u32 supported_params = dev->ethtool_ops->supported_coalesce_params; u32 nonzero_params = 0; - if (!supported_params) - return true; - if (coalesce->rx_coalesce_usecs) nonzero_params |= ETHTOOL_COALESCE_RX_USECS; if (coalesce->rx_max_coalesced_frames) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index f1f78a742b36..b167f4a5b684 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1057,7 +1057,7 @@ struct compat_arpt_replace { u32 underflow[NF_ARP_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; - struct compat_arpt_entry entries[0]; + struct compat_arpt_entry entries[]; }; static inline void compat_release_entry(struct compat_arpt_entry *e) @@ -1383,7 +1383,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, struct compat_arpt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; - struct compat_arpt_entry entrytable[0]; + struct compat_arpt_entry entrytable[]; }; static int compat_get_entries(struct net *net, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 10b91ebdf213..c2670eaa74e6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1211,7 +1211,7 @@ struct compat_ipt_replace { u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct xt_counters * */ - struct compat_ipt_entry entries[0]; + struct compat_ipt_entry entries[]; }; static int @@ -1562,7 +1562,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, struct compat_ipt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; - struct compat_ipt_entry entrytable[0]; + struct compat_ipt_entry entrytable[]; }; static int diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 645cc3009e64..f5f588b1f6e9 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -145,12 +145,13 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tcp_in_slow_start(tp)) - tcp_slow_start(tp, acked); - else { - bictcp_update(ca, tp->snd_cwnd); - tcp_cong_avoid_ai(tp, ca->cnt, 1); + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + if (!acked) + return; } + bictcp_update(ca, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, ca->cnt, acked); } /* diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 471571e1ab26..6cebf412d590 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -10,10 +10,9 @@ #include <net/tcp.h> /* These factors derived from the recommended values in the aer: - * .01 and and 7/8. We use 50 instead of 100 to account for - * delayed ack. + * .01 and and 7/8. 
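+ * .01 and 7/8.
+ *
+ * With the stretch-ACK fixes below, tcp_slow_start() returns the
+ * count of acked segments it could not consume before cwnd reached
+ * ssthresh; e.g. an ACK covering 8 segments with only 3 of headroom
+ * below ssthresh leaves 5 to credit to tcp_cong_avoid_ai() instead
+ * of discarding them.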
*/ -#define TCP_SCALABLE_AI_CNT 50U +#define TCP_SCALABLE_AI_CNT 100U #define TCP_SCALABLE_MD_SCALE 3 static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) @@ -23,11 +22,13 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tcp_in_slow_start(tp)) - tcp_slow_start(tp, acked); - else - tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), - 1); + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + if (!acked) + return; + } + tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), + acked); } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 3b36bb1a0dda..50a9a6e2c4cd 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -153,31 +153,34 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; if (tcp_in_slow_start(tp)) { - /* Slow start. */ - tcp_slow_start(tp, acked); + /* Slow start. */ + acked = tcp_slow_start(tp, acked); + if (!acked) + goto done; + } + + /* Congestion avoidance. */ + if (veno->diff < beta) { + /* In the "non-congestive state", increase cwnd + * every rtt. + */ + tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); } else { - /* Congestion avoidance. */ - if (veno->diff < beta) { - /* In the "non-congestive state", increase cwnd - * every rtt. - */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); - } else { - /* In the "congestive state", increase cwnd - * every other rtt. - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (veno->inc && - tp->snd_cwnd < tp->snd_cwnd_clamp) { - tp->snd_cwnd++; - veno->inc = 0; - } else - veno->inc = 1; - tp->snd_cwnd_cnt = 0; + /* In the "congestive state", increase cwnd + * every other rtt. 
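+	 *
+	 * snd_cwnd_cnt now advances by the full acked segment count
+	 * (see below), so a stretch ACK covering N segments moves this
+	 * every-other-rtt counter by N instead of by one.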
+ */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (veno->inc && + tp->snd_cwnd < tp->snd_cwnd_clamp) { + tp->snd_cwnd++; + veno->inc = 0; } else - tp->snd_cwnd_cnt++; - } + veno->inc = 1; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt += acked; } +done: if (tp->snd_cwnd < 2) tp->snd_cwnd = 2; else if (tp->snd_cwnd > tp->snd_cwnd_clamp) diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index e00570dd0a69..3bb448761ca3 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -36,8 +36,6 @@ struct yeah { u32 reno_count; u32 fast_count; - - u32 pkts_acked; }; static void tcp_yeah_init(struct sock *sk) @@ -57,18 +55,6 @@ static void tcp_yeah_init(struct sock *sk) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void tcp_yeah_pkts_acked(struct sock *sk, - const struct ack_sample *sample) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct yeah *yeah = inet_csk_ca(sk); - - if (icsk->icsk_ca_state == TCP_CA_Open) - yeah->pkts_acked = sample->pkts_acked; - - tcp_vegas_pkts_acked(sk, sample); -} - static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -77,24 +63,19 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tcp_in_slow_start(tp)) - tcp_slow_start(tp, acked); + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + if (!acked) + goto do_vegas; + } - else if (!yeah->doing_reno_now) { + if (!yeah->doing_reno_now) { /* Scalable */ - - tp->snd_cwnd_cnt += yeah->pkts_acked; - if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } - - yeah->pkts_acked = 1; - + tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), + acked); } else { /* Reno */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); } /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. @@ -118,7 +99,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) * of bytes we send in an RTT is often less than our cwnd will allow. * So we keep track of our cwnd separately, in v_beg_snd_cwnd. 
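 *
 * (The do_vegas label below lets the slow-start path jump straight
 * to this sampling when the whole ACK was consumed growing cwnd,
 * instead of falling through the additive-increase code.)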
*/ - +do_vegas: if (after(ack, yeah->vegas.beg_snd_nxt)) { /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got @@ -232,7 +213,7 @@ static struct tcp_congestion_ops tcp_yeah __read_mostly = { .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, - .pkts_acked = tcp_yeah_pkts_acked, + .pkts_acked = tcp_vegas_pkts_acked, .owner = THIS_MODULE, .name = "yeah", diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c973ace208c5..e27393498ecb 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1227,7 +1227,7 @@ struct compat_ip6t_replace { u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct xt_counters * */ - struct compat_ip6t_entry entries[0]; + struct compat_ip6t_entry entries[]; }; static int @@ -1571,7 +1571,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, struct compat_ip6t_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; - struct compat_ip6t_entry entrytable[0]; + struct compat_ip6t_entry entrytable[]; }; static int diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 04c3caed92df..e959104832ef 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -861,6 +861,9 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req) ack_seq++; msk->ack_seq = ack_seq; } + + /* will be fully established after successful MPC subflow creation */ + inet_sk_state_store(nsk, TCP_SYN_RECV); bh_unlock_sock(nsk); /* keep a single reference */ @@ -916,10 +919,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, mptcp_copy_inaddrs(newsk, ssk); list_add(&subflow->node, &msk->conn_list); - /* will be fully established at mptcp_stream_accept() - * completion. - */ - inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV); bh_unlock_sock(new_mptcp_sock); local_bh_enable(); } @@ -1256,8 +1255,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } - - inet_sk_state_store(newsock->sk, TCP_ESTABLISHED); } sock_put(ssock->sk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 8434c7f5f712..052d72a1d3a2 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -234,6 +234,8 @@ create_child: /* new mpc subflow takes ownership of the newly * created mptcp socket */ + inet_sk_state_store((struct sock *)new_msk, + TCP_ESTABLISHED); ctx->conn = new_msk; new_msk = NULL; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 91efae88e8c2..468fea1aebba 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -455,14 +455,6 @@ config NF_TABLES To compile it as a module, choose M here. if NF_TABLES - -config NF_TABLES_SET - tristate "Netfilter nf_tables set infrastructure" - help - This option enables the nf_tables set infrastructure that allows to - look up for elements in a set and to build one-way mappings between - matchings and actions. 
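(With NF_TABLES_SET gone, the set backends are linked straight into nf_tables.o, as the Makefile hunk below shows; there is no separate nf_tables_set.ko to configure any more, and the nft-set module alias together with the nft_register_set()/nft_unregister_set() API is removed further down in nf_tables_api.c.)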
- config NF_TABLES_INET depends on IPV6 select NF_TABLES_IPV4 diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 3f572e5a975e..292e71dc7ba4 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -78,14 +78,17 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \ - nft_chain_route.o nf_tables_offload.o + nft_chain_route.o nf_tables_offload.o \ + nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ + nft_set_pipapo.o -nf_tables_set-objs := nf_tables_set_core.o \ - nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ - nft_set_pipapo.o +ifdef CONFIG_X86_64 +ifneq (,$(findstring -DCONFIG_AS_AVX2=1,$(KBUILD_CFLAGS))) +nf_tables-objs += nft_set_pipapo_avx2.o +endif +endif obj-$(CONFIG_NF_TABLES) += nf_tables.o -obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 0a2196f59106..486959f70cf3 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -46,7 +46,7 @@ struct bitmap_ip { u8 netmask; /* subnet netmask */ struct timer_list gc; /* garbage collection */ struct ip_set *set; /* attached to this ip_set */ - unsigned char extensions[0] /* data extensions */ + unsigned char extensions[] /* data extensions */ __aligned(__alignof__(u64)); }; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 739e343efaf6..2310a316e0af 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -49,7 +49,7 @@ struct bitmap_ipmac { size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ struct ip_set *set; /* attached to this ip_set */ - unsigned char extensions[0] /* MAC + data extensions */ + unsigned char extensions[] /* MAC + data extensions */ __aligned(__alignof__(u64)); }; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index b49978dd810d..e56ced66f202 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -37,7 +37,7 @@ struct bitmap_port { size_t memsize; /* members size */ struct timer_list gc; /* garbage collection */ struct ip_set *set; /* attached to this ip_set */ - unsigned char extensions[0] /* data extensions */ + unsigned char extensions[] /* data extensions */ __aligned(__alignof__(u64)); }; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index e52d7b7597a0..1ee43752d6d3 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -76,7 +76,7 @@ struct hbucket { DECLARE_BITMAP(used, AHASH_MAX_TUNED); u8 size; /* size of the array */ u8 pos; /* position of the first free entry */ - unsigned char value[0] /* the array of the values */ + unsigned char value[] /* the array of the values */ __aligned(__alignof__(u64)); }; @@ -109,7 +109,7 @@ struct htable { u8 htable_bits; /* size of hash table == 2^htable_bits */ u32 maxelem; /* Maxelem per region */ struct ip_set_region *hregion; /* Region locks and ext sizes */ - struct hbucket __rcu *bucket[0]; /* hashtable buckets */ + struct hbucket __rcu *bucket[]; /* hashtable buckets */ }; #define 
hbucket(h, i) ((h)->bucket[i]) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1927fc296f95..a18f8fe728e3 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2633,7 +2633,6 @@ void nf_conntrack_init_end(void) */ #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) #define DYING_NULLS_VAL ((1<<30)+1) -#define TEMPLATE_NULLS_VAL ((1<<30)+2) int nf_conntrack_init_net(struct net *net) { diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 4912069627b6..9b57330c81f8 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1054,21 +1054,18 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) nf_conntrack_standalone_init_dccp_sysctl(net, table); nf_conntrack_standalone_init_gre_sysctl(net, table); - /* Don't export sysctls to unprivileged users */ + /* Don't allow unprivileged users to alter certain sysctls */ if (net->user_ns != &init_user_ns) { - table[NF_SYSCTL_CT_MAX].procname = NULL; - table[NF_SYSCTL_CT_ACCT].procname = NULL; - table[NF_SYSCTL_CT_HELPER].procname = NULL; -#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP - table[NF_SYSCTL_CT_TIMESTAMP].procname = NULL; -#endif + table[NF_SYSCTL_CT_MAX].mode = 0444; + table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444; + table[NF_SYSCTL_CT_HELPER].mode = 0444; #ifdef CONFIG_NF_CONNTRACK_EVENTS - table[NF_SYSCTL_CT_EVENTS].procname = NULL; + table[NF_SYSCTL_CT_EVENTS].mode = 0444; #endif - } - - if (!net_eq(&init_net, net)) table[NF_SYSCTL_CT_BUCKETS].mode = 0444; + } else if (!net_eq(&init_net, net)) { + table[NF_SYSCTL_CT_BUCKETS].mode = 0444; + } net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.sysctl_header) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 42b73a084a63..ad549317af30 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -7,6 +7,7 @@ #include <linux/tc_act/tc_csum.h> #include <net/flow_offload.h> #include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_tuple.h> @@ -27,11 +28,61 @@ struct flow_offload_work { (__match)->dissector.offset[__type] = \ offsetof(struct nf_flow_key, __field) +static void nf_flow_rule_lwt_match(struct nf_flow_match *match, + struct ip_tunnel_info *tun_info) +{ + struct nf_flow_key *mask = &match->mask; + struct nf_flow_key *key = &match->key; + unsigned int enc_keys; + + if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)) + return; + + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); + key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id); + mask->enc_key_id.keyid = 0xffffffff; + enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL); + + if (ip_tunnel_info_af(tun_info) == AF_INET) { + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + enc_ipv4); + key->enc_ipv4.src = tun_info->key.u.ipv4.dst; + key->enc_ipv4.dst = tun_info->key.u.ipv4.src; + if (key->enc_ipv4.src) + mask->enc_ipv4.src = 0xffffffff; + if (key->enc_ipv4.dst) + mask->enc_ipv4.dst = 0xffffffff; + enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } else { + memcpy(&key->enc_ipv6.src, 
&tun_info->key.u.ipv6.dst, + sizeof(struct in6_addr)); + memcpy(&key->enc_ipv6.dst, &tun_info->key.u.ipv6.src, + sizeof(struct in6_addr)); + if (memcmp(&key->enc_ipv6.src, &in6addr_any, + sizeof(struct in6_addr))) + memset(&mask->enc_ipv6.src, 0xff, + sizeof(struct in6_addr)); + if (memcmp(&key->enc_ipv6.dst, &in6addr_any, + sizeof(struct in6_addr))) + memset(&mask->enc_ipv6.dst, 0xff, + sizeof(struct in6_addr)); + enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } + + match->dissector.used_keys |= enc_keys; +} + static int nf_flow_rule_match(struct nf_flow_match *match, - const struct flow_offload_tuple *tuple) + const struct flow_offload_tuple *tuple, + struct dst_entry *other_dst) { struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; + struct ip_tunnel_info *tun_info; NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); @@ -41,6 +92,11 @@ static int nf_flow_rule_match(struct nf_flow_match *match, NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp); + if (other_dst->lwtstate) { + tun_info = lwt_tun_info(other_dst->lwtstate); + nf_flow_rule_lwt_match(match, tun_info); + } + key->meta.ingress_ifindex = tuple->iifidx; mask->meta.ingress_ifindex = 0xffffffff; @@ -419,10 +475,52 @@ static void flow_offload_redirect(const struct flow_offload *flow, dev_hold(rt->dst.dev); } +static void flow_offload_encap_tunnel(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry; + struct dst_entry *dst; + + dst = flow->tuplehash[dir].tuple.dst_cache; + if (dst->lwtstate) { + struct ip_tunnel_info *tun_info; + + tun_info = lwt_tun_info(dst->lwtstate); + if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) { + entry = flow_action_entry_next(flow_rule); + entry->id = FLOW_ACTION_TUNNEL_ENCAP; + entry->tunnel = tun_info; + } + } +} + +static void flow_offload_decap_tunnel(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry; + struct dst_entry *dst; + + dst = flow->tuplehash[!dir].tuple.dst_cache; + if (dst->lwtstate) { + struct ip_tunnel_info *tun_info; + + tun_info = lwt_tun_info(dst->lwtstate); + if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) { + entry = flow_action_entry_next(flow_rule); + entry->id = FLOW_ACTION_TUNNEL_DECAP; + } + } +} + int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { + flow_offload_decap_tunnel(flow, dir, flow_rule); + flow_offload_encap_tunnel(flow, dir, flow_rule); + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) return -1; @@ -449,6 +547,9 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { + flow_offload_decap_tunnel(flow, dir, flow_rule); + flow_offload_encap_tunnel(flow, dir, flow_rule); + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) return -1; @@ -479,6 +580,7 @@ nf_flow_offload_rule_alloc(struct net *net, const struct flow_offload *flow = offload->flow; const struct flow_offload_tuple *tuple; struct nf_flow_rule *flow_rule; + struct dst_entry *other_dst; int err = 
-ENOMEM; flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL); @@ -494,7 +596,8 @@ nf_flow_offload_rule_alloc(struct net *net, flow_rule->rule->match.key = &flow_rule->match.key; tuple = &flow->tuplehash[dir].tuple; - err = nf_flow_rule_match(&flow_rule->match, tuple); + other_dst = flow->tuplehash[!dir].tuple.dst_cache; + err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst); if (err < 0) goto err_flow_match; @@ -574,6 +677,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, struct nf_flow_rule *flow_rule, enum flow_offload_tuple_dir dir, int priority, int cmd, + struct flow_stats *stats, struct list_head *block_cb_list) { struct flow_cls_offload cls_flow = {}; @@ -598,6 +702,9 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, } mutex_unlock(&flowtable->flow_block_lock); + if (cmd == FLOW_CLS_STATS) + memcpy(stats, &cls_flow.stats, sizeof(*stats)); + return i; } @@ -607,7 +714,7 @@ static int flow_offload_tuple_add(struct flow_offload_work *offload, { return nf_flow_offload_tuple(offload->flowtable, offload->flow, flow_rule, dir, offload->priority, - FLOW_CLS_REPLACE, + FLOW_CLS_REPLACE, NULL, &offload->flowtable->flow_block.cb_list); } @@ -615,7 +722,7 @@ static void flow_offload_tuple_del(struct flow_offload_work *offload, enum flow_offload_tuple_dir dir) { nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, - offload->priority, FLOW_CLS_DESTROY, + offload->priority, FLOW_CLS_DESTROY, NULL, &offload->flowtable->flow_block.cb_list); } @@ -661,21 +768,9 @@ static void flow_offload_tuple_stats(struct flow_offload_work *offload, enum flow_offload_tuple_dir dir, struct flow_stats *stats) { - struct nf_flowtable *flowtable = offload->flowtable; - struct flow_cls_offload cls_flow = {}; - struct flow_block_cb *block_cb; - struct netlink_ext_ack extack; - __be16 proto = ETH_P_ALL; - - nf_flow_offload_init(&cls_flow, proto, offload->priority, - FLOW_CLS_STATS, - &offload->flow->tuplehash[dir].tuple, &extack); - - mutex_lock(&flowtable->flow_block_lock); - list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) - block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); - mutex_unlock(&flowtable->flow_block_lock); - memcpy(stats, &cls_flow.stats, sizeof(*stats)); + nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, + offload->priority, FLOW_CLS_STATS, stats, + &offload->flowtable->flow_block.cb_list); } static void flow_offload_work_stats(struct flow_offload_work *offload) @@ -820,25 +915,47 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, return err; } -static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, - struct nf_flowtable *flowtable, - struct net_device *dev, - enum flow_block_command cmd, - struct netlink_ext_ack *extack) +static void nf_flow_table_block_offload_init(struct flow_block_offload *bo, + struct net *net, + enum flow_block_command cmd, + struct nf_flowtable *flowtable, + struct netlink_ext_ack *extack) { - int err; - - if (!dev->netdev_ops->ndo_setup_tc) - return -EOPNOTSUPP; - memset(bo, 0, sizeof(*bo)); - bo->net = dev_net(dev); + bo->net = net; bo->block = &flowtable->flow_block; bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; INIT_LIST_HEAD(&bo->cb_list); +} +static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, + struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd, + struct netlink_ext_ack *extack) +{ + nf_flow_table_block_offload_init(bo, 
dev_net(dev), cmd, flowtable, + extack); + flow_indr_block_call(dev, bo, cmd); + + if (list_empty(&bo->cb_list)) + return -EOPNOTSUPP; + + return 0; +} + +static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, + struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd, + struct netlink_ext_ack *extack) +{ + int err; + + nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, + extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo); if (err < 0) return err; @@ -857,7 +974,12 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, if (!nf_flowtable_hw_offload(flowtable)) return 0; - err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack); + if (dev->netdev_ops->ndo_setup_tc) + err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, + &extack); + else + err = nf_flow_table_indr_offload_cmd(&bo, flowtable, dev, cmd, + &extack); if (err < 0) return err; @@ -865,10 +987,75 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, } EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); +static void nf_flow_table_indr_block_ing_cmd(struct net_device *dev, + struct nf_flowtable *flowtable, + flow_indr_block_bind_cb_t *cb, + void *cb_priv, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; + + if (!flowtable) + return; + + nf_flow_table_block_offload_init(&bo, dev_net(dev), cmd, flowtable, + &extack); + + cb(dev, cb_priv, TC_SETUP_FT, &bo); + + nf_flow_table_block_setup(flowtable, &bo, cmd); +} + +static void nf_flow_table_indr_block_cb_cmd(struct nf_flowtable *flowtable, + struct net_device *dev, + flow_indr_block_bind_cb_t *cb, + void *cb_priv, + enum flow_block_command cmd) +{ + if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)) + return; + + nf_flow_table_indr_block_ing_cmd(dev, flowtable, cb, cb_priv, cmd); +} + +static void nf_flow_table_indr_block_cb(struct net_device *dev, + flow_indr_block_bind_cb_t *cb, + void *cb_priv, + enum flow_block_command cmd) +{ + struct net *net = dev_net(dev); + struct nft_flowtable *nft_ft; + struct nft_table *table; + struct nft_hook *hook; + + mutex_lock(&net->nft.commit_mutex); + list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(nft_ft, &table->flowtables, list) { + list_for_each_entry(hook, &nft_ft->hook_list, list) { + if (hook->ops.dev != dev) + continue; + + nf_flow_table_indr_block_cb_cmd(&nft_ft->data, + dev, cb, + cb_priv, cmd); + } + } + } + mutex_unlock(&net->nft.commit_mutex); +} + +static struct flow_indr_block_entry block_ing_entry = { + .cb = nf_flow_table_indr_block_cb, + .list = LIST_HEAD_INIT(block_ing_entry.list), +}; + int nf_flow_table_offload_init(void) { INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler); + flow_indr_add_block_cb(&block_ing_entry); + return 0; } @@ -877,6 +1064,8 @@ void nf_flow_table_offload_exit(void) struct flow_offload_work *offload, *next; LIST_HEAD(offload_pending_list); + flow_indr_del_block_cb(&block_ing_entry); + cancel_work_sync(&nf_flow_offload_work); list_for_each_entry_safe(offload, next, &offload_pending_list, list) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 38c680f28f15..f92fb6003745 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2523,8 +2523,8 @@ static void nf_tables_expr_destroy(const struct nft_ctx *ctx, module_put(type->owner); } -struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, - const struct nlattr *nla) +static struct nft_expr 
*nft_expr_init(const struct nft_ctx *ctx, + const struct nlattr *nla) { struct nft_expr_info info; struct nft_expr *expr; @@ -3266,25 +3266,17 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, /* * Sets */ - -static LIST_HEAD(nf_tables_set_types); - -int nft_register_set(struct nft_set_type *type) -{ - nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail_rcu(&type->list, &nf_tables_set_types); - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - return 0; -} -EXPORT_SYMBOL_GPL(nft_register_set); - -void nft_unregister_set(struct nft_set_type *type) -{ - nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_del_rcu(&type->list); - nfnl_unlock(NFNL_SUBSYS_NFTABLES); -} -EXPORT_SYMBOL_GPL(nft_unregister_set); +static const struct nft_set_type *nft_set_types[] = { + &nft_set_hash_fast_type, + &nft_set_hash_type, + &nft_set_rhash_type, + &nft_set_bitmap_type, + &nft_set_rbtree_type, +#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2) + &nft_set_pipapo_avx2_type, +#endif + &nft_set_pipapo_type, +}; #define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \ NFT_SET_TIMEOUT | NFT_SET_OBJECT | \ @@ -3310,15 +3302,11 @@ nft_select_set_ops(const struct nft_ctx *ctx, struct nft_set_estimate est, best; const struct nft_set_type *type; u32 flags = 0; + int i; lockdep_assert_held(&ctx->net->nft.commit_mutex); lockdep_nfnl_nft_mutex_not_held(); -#ifdef CONFIG_MODULES - if (list_empty(&nf_tables_set_types)) { - if (nft_request_module(ctx->net, "nft-set") == -EAGAIN) - return ERR_PTR(-EAGAIN); - } -#endif + if (nla[NFTA_SET_FLAGS] != NULL) flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); @@ -3327,7 +3315,8 @@ nft_select_set_ops(const struct nft_ctx *ctx, best.lookup = ~0; best.space = ~0; - list_for_each_entry(type, &nf_tables_set_types, list) { + for (i = 0; i < ARRAY_SIZE(nft_set_types); i++) { + type = nft_set_types[i]; ops = &type->ops; if (!nft_set_ops_candidate(type, flags)) @@ -3358,11 +3347,6 @@ nft_select_set_ops(const struct nft_ctx *ctx, break; } - if (!try_module_get(type->owner)) - continue; - if (bops != NULL) - module_put(to_set_type(bops)->owner); - bops = ops; best = est; } @@ -4061,10 +4045,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, size = ops->privsize(nla, &desc); set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL); - if (!set) { - err = -ENOMEM; - goto err1; - } + if (!set) + return -ENOMEM; name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL); if (!name) { @@ -4123,8 +4105,6 @@ err3: kfree(set->name); err2: kvfree(set); -err1: - module_put(to_set_type(ops)->owner); return err; } @@ -4134,7 +4114,6 @@ static void nft_set_destroy(struct nft_set *set) return; set->ops->destroy(set); - module_put(to_set_type(set->ops)->owner); kfree(set->name); kvfree(set); } @@ -4312,7 +4291,6 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .align = __alignof__(u32), }, }; -EXPORT_SYMBOL_GPL(nft_set_ext_types); /* * Set elements @@ -4801,6 +4779,36 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, return trans; } +struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nlattr *attr) +{ + struct nft_expr *expr; + int err; + + expr = nft_expr_init(ctx, attr); + if (IS_ERR(expr)) + return expr; + + err = -EOPNOTSUPP; + if (!(expr->ops->type->flags & NFT_EXPR_STATEFUL)) + goto err_set_elem_expr; + + if (expr->ops->type->flags & NFT_EXPR_GC) { + if (set->flags & NFT_SET_TIMEOUT) + goto err_set_elem_expr; + if (!set->ops->gc_init) + goto err_set_elem_expr; + set->ops->gc_init(set); + } + + return expr; + 
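+
+	/* Unwind path: a rejected expression still holds the module
+	 * reference taken by nft_expr_init(), so it must go through
+	 * nft_expr_destroy() before the error is returned.
+	 */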
+err_set_elem_expr: + nft_expr_destroy(ctx, expr); + return ERR_PTR(err); +} + void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, @@ -4883,6 +4891,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_object *obj = NULL; + struct nft_expr *expr = NULL; struct nft_userdata *udata; struct nft_data_desc desc; struct nft_data data; @@ -4950,10 +4959,17 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } + if (nla[NFTA_SET_ELEM_EXPR] != NULL) { + expr = nft_set_elem_expr_alloc(ctx, set, + nla[NFTA_SET_ELEM_EXPR]); + if (IS_ERR(expr)) + return PTR_ERR(expr); + } + err = nft_setelem_parse_key(ctx, set, &elem.key.val, nla[NFTA_SET_ELEM_KEY]); if (err < 0) - return err; + goto err_set_elem_expr; nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); @@ -4972,6 +4988,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); } + if (expr) + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPR, + expr->ops->size); + if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { if (!(set->flags & NFT_SET_OBJECT)) { err = -EINVAL; @@ -5056,6 +5076,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, *nft_set_ext_obj(ext) = obj; obj->use++; } + if (expr) { + memcpy(nft_set_ext_expr(ext), expr, expr->ops->size); + kfree(expr); + } trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) @@ -5111,6 +5135,9 @@ err_parse_key_end: nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); err_parse_key: nft_data_release(&elem.key.val, NFT_DATA_VALUE); +err_set_elem_expr: + if (expr != NULL) + nft_expr_destroy(ctx, expr); return err; } @@ -5365,7 +5392,6 @@ void nft_set_gc_batch_release(struct rcu_head *rcu) nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); kfree(gcb); } -EXPORT_SYMBOL_GPL(nft_set_gc_batch_release); struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, gfp_t gfp) @@ -5378,7 +5404,6 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, gcb->head.set = set; return gcb; } -EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc); /* * Stateful objects diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c deleted file mode 100644 index 586b621007eb..000000000000 --- a/net/netfilter/nf_tables_set_core.c +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include <linux/module.h> -#include <net/netfilter/nf_tables_core.h> - -static int __init nf_tables_set_module_init(void) -{ - nft_register_set(&nft_set_hash_fast_type); - nft_register_set(&nft_set_hash_type); - nft_register_set(&nft_set_rhash_type); - nft_register_set(&nft_set_bitmap_type); - nft_register_set(&nft_set_rbtree_type); - nft_register_set(&nft_set_pipapo_type); - - return 0; -} - -static void __exit nf_tables_set_module_exit(void) -{ - nft_unregister_set(&nft_set_pipapo_type); - nft_unregister_set(&nft_set_rbtree_type); - nft_unregister_set(&nft_set_bitmap_type); - nft_unregister_set(&nft_set_rhash_type); - nft_unregister_set(&nft_set_hash_type); - nft_unregister_set(&nft_set_hash_fast_type); -} - -module_init(nf_tables_set_module_init); -module_exit(nf_tables_set_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 2481470dec36..5827117f2635 100644 --- 
a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -33,7 +33,7 @@ struct nf_acct { refcount_t refcnt; char name[NFACCT_NAME_MAX]; struct rcu_head rcu_head; - char data[0]; + char data[]; }; struct nfacct_filter { diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 0ed2281f03be..bc37d6c59db4 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -93,7 +93,7 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { static int nft_bitwise_init_bool(struct nft_bitwise *priv, const struct nlattr *const tb[]) { - struct nft_data_desc d1, d2; + struct nft_data_desc mask, xor; int err; if (tb[NFTA_BITWISE_DATA]) @@ -103,29 +103,29 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv, !tb[NFTA_BITWISE_XOR]) return -EINVAL; - err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1, + err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &mask, tb[NFTA_BITWISE_MASK]); if (err < 0) return err; - if (d1.type != NFT_DATA_VALUE || d1.len != priv->len) { + if (mask.type != NFT_DATA_VALUE || mask.len != priv->len) { err = -EINVAL; goto err1; } - err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2, + err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &xor, tb[NFTA_BITWISE_XOR]); if (err < 0) goto err1; - if (d2.type != NFT_DATA_VALUE || d2.len != priv->len) { + if (xor.type != NFT_DATA_VALUE || xor.len != priv->len) { err = -EINVAL; goto err2; } return 0; err2: - nft_data_release(&priv->xor, d2.type); + nft_data_release(&priv->xor, xor.type); err1: - nft_data_release(&priv->mask, d1.type); + nft_data_release(&priv->mask, mask.type); return err; } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 683785225a3e..46ab28ec4b53 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -81,7 +81,6 @@ void nft_dynset_eval(const struct nft_expr *expr, const struct nft_dynset *priv = nft_expr_priv(expr); struct nft_set *set = priv->set; const struct nft_set_ext *ext; - const struct nft_expr *sexpr; u64 timeout; if (priv->op == NFT_DYNSET_OP_DELETE) { @@ -91,18 +90,13 @@ void nft_dynset_eval(const struct nft_expr *expr, if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new, expr, regs, &ext)) { - sexpr = NULL; - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) - sexpr = nft_set_ext_expr(ext); - if (priv->op == NFT_DYNSET_OP_UPDATE && nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { timeout = priv->timeout ?
: set->timeout; *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout; } - if (sexpr != NULL) - sexpr->ops->eval(sexpr, regs, pkt); + nft_set_elem_update_expr(ext, regs, pkt); if (priv->invert) regs->verdict.code = NFT_BREAK; @@ -206,21 +200,10 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_EVAL)) return -EINVAL; - priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]); + priv->expr = nft_set_elem_expr_alloc(ctx, set, + tb[NFTA_DYNSET_EXPR]); if (IS_ERR(priv->expr)) return PTR_ERR(priv->expr); - - err = -EOPNOTSUPP; - if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL)) - goto err1; - - if (priv->expr->ops->type->flags & NFT_EXPR_GC) { - if (set->flags & NFT_SET_TIMEOUT) - goto err1; - if (!set->ops->gc_init) - goto err1; - set->ops->gc_init(set); - } } nft_set_ext_prepare(&priv->tmpl); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 660bad688e2b..1e70359d633c 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -43,6 +43,7 @@ void nft_lookup_eval(const struct nft_expr *expr, nft_data_copy(®s->data[priv->dreg], nft_set_ext_data(ext), set->dlen); + nft_set_elem_update_expr(ext, regs, pkt); } static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 87e8d9ba0c9b..1cb2e67e6e03 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -293,8 +293,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, return true; } -struct nft_set_type nft_set_bitmap_type __read_mostly = { - .owner = THIS_MODULE, +const struct nft_set_type nft_set_bitmap_type = { .ops = { .privsize = nft_bitmap_privsize, .elemsize = offsetof(struct nft_bitmap_elem, ext), diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index d350a7cd3af0..4d3f147e8d8d 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -662,8 +662,7 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features return true; } -struct nft_set_type nft_set_rhash_type __read_mostly = { - .owner = THIS_MODULE, +const struct nft_set_type nft_set_rhash_type = { .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT | NFT_SET_EVAL, .ops = { @@ -686,8 +685,7 @@ struct nft_set_type nft_set_rhash_type __read_mostly = { }, }; -struct nft_set_type nft_set_hash_type __read_mostly = { - .owner = THIS_MODULE, +const struct nft_set_type nft_set_hash_type = { .features = NFT_SET_MAP | NFT_SET_OBJECT, .ops = { .privsize = nft_hash_privsize, @@ -706,8 +704,7 @@ struct nft_set_type nft_set_hash_type __read_mostly = { }, }; -struct nft_set_type nft_set_hash_fast_type __read_mostly = { - .owner = THIS_MODULE, +const struct nft_set_type nft_set_hash_fast_type = { .features = NFT_SET_MAP | NFT_SET_OBJECT, .ops = { .privsize = nft_hash_privsize, diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 4fc0c924ed5d..c1afb6c94edc 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -330,144 +330,22 @@ #include <linux/kernel.h> #include <linux/init.h> -#include <linux/log2.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <uapi/linux/netfilter/nf_tables.h> -#include <net/ipv6.h> /* For the maximum length of a field */ #include <linux/bitmap.h> #include <linux/bitops.h> -/* Count of 
concatenated fields depends on count of 32-bit nftables registers */ -#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT - -/* Largest supported field size */ -#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr)) -#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE) - -/* Number of bits to be grouped together in lookup table buckets, arbitrary */ -#define NFT_PIPAPO_GROUP_BITS 4 -#define NFT_PIPAPO_GROUPS_PER_BYTE (BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS) - -/* Fields are padded to 32 bits in input registers */ -#define NFT_PIPAPO_GROUPS_PADDED_SIZE(x) \ - (round_up((x) / NFT_PIPAPO_GROUPS_PER_BYTE, sizeof(u32))) -#define NFT_PIPAPO_GROUPS_PADDING(x) \ - (NFT_PIPAPO_GROUPS_PADDED_SIZE((x)) - (x) / NFT_PIPAPO_GROUPS_PER_BYTE) - -/* Number of buckets, given by 2 ^ n, with n grouped bits */ -#define NFT_PIPAPO_BUCKETS (1 << NFT_PIPAPO_GROUP_BITS) - -/* Each n-bit range maps to up to n * 2 rules */ -#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2)) - -/* Use the rest of mapping table buckets for rule indices, but it makes no sense - * to exceed 32 bits - */ -#if BITS_PER_LONG == 64 -#define NFT_PIPAPO_MAP_TOBITS 32 -#else -#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS) -#endif - -/* ...which gives us the highest allowed index for a rule */ -#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \ - - (1UL << NFT_PIPAPO_MAP_NBITS)) - -#define nft_pipapo_for_each_field(field, index, match) \ - for ((field) = (match)->f, (index) = 0; \ - (index) < (match)->field_count; \ - (index)++, (field)++) - -/** - * union nft_pipapo_map_bucket - Bucket of mapping table - * @to: First rule number (in next field) this rule maps to - * @n: Number of rules (in next field) this rule maps to - * @e: If there's no next field, pointer to element this rule maps to - */ -union nft_pipapo_map_bucket { - struct { -#if BITS_PER_LONG == 64 - static_assert(NFT_PIPAPO_MAP_TOBITS <= 32); - u32 to; - - static_assert(NFT_PIPAPO_MAP_NBITS <= 32); - u32 n; -#else - unsigned long to:NFT_PIPAPO_MAP_TOBITS; - unsigned long n:NFT_PIPAPO_MAP_NBITS; -#endif - }; - struct nft_pipapo_elem *e; -}; - -/** - * struct nft_pipapo_field - Lookup, mapping tables and related data for a field - * @groups: Amount of 4-bit groups - * @rules: Number of inserted rules - * @bsize: Size of each bucket in lookup table, in longs - * @lt: Lookup table: 'groups' rows of NFT_PIPAPO_BUCKETS buckets - * @mt: Mapping table: one bucket per rule - */ -struct nft_pipapo_field { - int groups; - unsigned long rules; - size_t bsize; - unsigned long *lt; - union nft_pipapo_map_bucket *mt; -}; - -/** - * struct nft_pipapo_match - Data used for lookup and matching - * @field_count Amount of fields in set - * @scratch: Preallocated per-CPU maps for partial matching results - * @bsize_max: Maximum lookup table bucket size of all fields, in longs - * @rcu Matching data is swapped on commits - * @f: Fields, with lookup and mapping tables - */ -struct nft_pipapo_match { - int field_count; - unsigned long * __percpu *scratch; - size_t bsize_max; - struct rcu_head rcu; - struct nft_pipapo_field f[0]; -}; +#include "nft_set_pipapo_avx2.h" +#include "nft_set_pipapo.h" /* Current working bitmap index, toggled between field matches */ static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); /** - * struct nft_pipapo - Representation of a set - * @match: Currently in-use matching data - * @clone: Copy where pending insertions and deletions are kept - * @groups: Total amount of 4-bit groups for fields in this set - * 
@width: Total bytes to be matched for one packet, including padding - * @dirty: Working copy has pending insertions or deletions - * @last_gc: Timestamp of last garbage collection run, jiffies - */ -struct nft_pipapo { - struct nft_pipapo_match __rcu *match; - struct nft_pipapo_match *clone; - int groups; - int width; - bool dirty; - unsigned long last_gc; -}; - -struct nft_pipapo_elem; - -/** - * struct nft_pipapo_elem - API-facing representation of single set element - * @ext: nftables API extensions - */ -struct nft_pipapo_elem { - struct nft_set_ext ext; -}; - -/** * pipapo_refill() - For each set bit, set bits from selected mapping table item * @map: Bitmap to be scanned for set bits * @len: Length of bitmap in longs @@ -484,9 +362,8 @@ struct nft_pipapo_elem { * * Return: -1 on no match, bit position on 'match_only', 0 otherwise. */ -static int pipapo_refill(unsigned long *map, int len, int rules, - unsigned long *dst, union nft_pipapo_map_bucket *mt, - bool match_only) +int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, + union nft_pipapo_map_bucket *mt, bool match_only) { unsigned long bitset; int k, ret = -1; @@ -559,26 +436,18 @@ static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; - unsigned long *lt = f->lt; - int b, group; + int b; - /* For each 4-bit group: select lookup table bucket depending on + /* For each bit group: select lookup table bucket depending on * packet bytes value, then AND bucket value */ - for (group = 0; group < f->groups; group += 2) { - u8 v; - - v = *rp >> 4; - __bitmap_and(res_map, res_map, lt + v * f->bsize, - f->bsize * BITS_PER_LONG); - lt += f->bsize * NFT_PIPAPO_BUCKETS; - - v = *rp & 0x0f; - rp++; - __bitmap_and(res_map, res_map, lt + v * f->bsize, - f->bsize * BITS_PER_LONG); - lt += f->bsize * NFT_PIPAPO_BUCKETS; - } + if (likely(f->bb == 8)) + pipapo_and_field_buckets_8bit(f, res_map, rp); + else + pipapo_and_field_buckets_4bit(f, res_map, rp); + NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; + + rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); /* Now populate the bitmap for the next field, unless this is * the last field, in which case return the matched 'ext' @@ -621,7 +490,7 @@ next_match: map_index = !map_index; swap(res_map, fill_map); - rp += NFT_PIPAPO_GROUPS_PADDING(f->groups); + rp += NFT_PIPAPO_GROUPS_PADDING(f); } out: @@ -669,26 +538,19 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; - unsigned long *lt = f->lt; - int b, group; + int b; - /* For each 4-bit group: select lookup table bucket depending on + /* For each bit group: select lookup table bucket depending on * packet bytes value, then AND bucket value */ - for (group = 0; group < f->groups; group++) { - u8 v; - - if (group % 2) { - v = *data & 0x0f; - data++; - } else { - v = *data >> 4; - } - __bitmap_and(res_map, res_map, lt + v * f->bsize, - f->bsize * BITS_PER_LONG); + if (f->bb == 8) + pipapo_and_field_buckets_8bit(f, res_map, data); + else if (f->bb == 4) + pipapo_and_field_buckets_4bit(f, res_map, data); + else + BUG(); - lt += f->bsize * NFT_PIPAPO_BUCKETS; - } + data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); /* Now populate the bitmap for the next field, unless this is * the last field, in which case return the matched 'ext' @@ -713,7 +575,7 @@ next_match: goto out; } - data += NFT_PIPAPO_GROUPS_PADDING(f->groups); + data += NFT_PIPAPO_GROUPS_PADDING(f); 
/* Swap bitmap indices: fill_map will be the initial bitmap for * the next field (i.e. the new res_map), and res_map is @@ -736,8 +598,8 @@ out: * @elem: nftables API element representation containing key data * @flags: Unused */ -void *nft_pipapo_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { return pipapo_get(net, set, (const u8 *)elem->key.val.data, nft_genmask_cur(net)); @@ -763,6 +625,10 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) int group, bucket; new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); +#ifdef NFT_PIPAPO_ALIGN + new_bucket_size = roundup(new_bucket_size, + NFT_PIPAPO_ALIGN / sizeof(*new_lt)); +#endif if (new_bucket_size == f->bsize) goto mt; @@ -772,15 +638,18 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) else copy = new_bucket_size; - new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS * new_bucket_size * - sizeof(*new_lt), GFP_KERNEL); + new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) * + new_bucket_size * sizeof(*new_lt) + + NFT_PIPAPO_ALIGN_HEADROOM, + GFP_KERNEL); if (!new_lt) return -ENOMEM; - new_p = new_lt; - old_p = old_lt; + new_p = NFT_PIPAPO_LT_ALIGN(new_lt); + old_p = NFT_PIPAPO_LT_ALIGN(old_lt); + for (group = 0; group < f->groups; group++) { - for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS; bucket++) { + for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS(f->bb); bucket++) { memcpy(new_p, old_p, copy * sizeof(*new_p)); new_p += copy; old_p += copy; @@ -807,7 +676,7 @@ mt: if (new_lt) { f->bsize = new_bucket_size; - f->lt = new_lt; + NFT_PIPAPO_LT_ASSIGN(f, new_lt); kvfree(old_lt); } @@ -829,13 +698,196 @@ static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, { unsigned long *pos; - pos = f->lt + f->bsize * NFT_PIPAPO_BUCKETS * group; + pos = NFT_PIPAPO_LT_ALIGN(f->lt); + pos += f->bsize * NFT_PIPAPO_BUCKETS(f->bb) * group; pos += f->bsize * v; __set_bit(rule, pos); } /** + * pipapo_lt_4b_to_8b() - Switch lookup table group width from 4 bits to 8 bits + * @old_groups: Number of current groups + * @bsize: Size of one bucket, in longs + * @old_lt: Pointer to the current lookup table + * @new_lt: Pointer to the new, pre-allocated lookup table + * + * Each bucket with index b in the new lookup table, belonging to group g, is + * filled with the bit intersection between: + * - bucket with index given by the upper 4 bits of b, from group g, and + * - bucket with index given by the lower 4 bits of b, from group g + 1 + * + * That is, given buckets from the new lookup table N(x, y) and the old lookup + * table O(x, y), with x bucket index, and y group index: + * + * N(b, g) := O(b / 16, g) & O(b % 16, g + 1) + * + * This ensures equivalence of the matching results on lookup. Two examples in + * pictures: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ... 254 255 + * 0 ^ + * 1 | ^ + * ... ( & ) | + * / \ | + * / \ .-( & )-. + * / bucket \ | | + * group 0 / 1 2 3 \ 4 5 6 7 8 9 10 11 12 13 |14 15 | + * 0 / \ | | + * 1 \ | | + * 2 | --' + * 3 '- + * ... 
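+ *
+ * A worked instance of the mapping above: new group g is built from old
+ * groups 2g and 2g + 1, so new bucket 42 (0x2a) of group g is the AND of
+ * old bucket 2 (0x2a >> 4) from group 2g and old bucket 10 (0x2a & 0x0f)
+ * from group 2g + 1.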
+ */ +static void pipapo_lt_4b_to_8b(int old_groups, int bsize, + unsigned long *old_lt, unsigned long *new_lt) +{ + int g, b, i; + + for (g = 0; g < old_groups / 2; g++) { + int src_g0 = g * 2, src_g1 = g * 2 + 1; + + for (b = 0; b < NFT_PIPAPO_BUCKETS(8); b++) { + int src_b0 = b / NFT_PIPAPO_BUCKETS(4); + int src_b1 = b % NFT_PIPAPO_BUCKETS(4); + int src_i0 = src_g0 * NFT_PIPAPO_BUCKETS(4) + src_b0; + int src_i1 = src_g1 * NFT_PIPAPO_BUCKETS(4) + src_b1; + + for (i = 0; i < bsize; i++) { + *new_lt = old_lt[src_i0 * bsize + i] & + old_lt[src_i1 * bsize + i]; + new_lt++; + } + } + } +} + +/** + * pipapo_lt_8b_to_4b() - Switch lookup table group width from 8 bits to 4 bits + * @old_groups: Number of current groups + * @bsize: Size of one bucket, in longs + * @old_lt: Pointer to the current lookup table + * @new_lt: Pointer to the new, pre-allocated lookup table + * + * Each bucket with index b in the new lookup table, belonging to group g, is + * filled with the bit union of: + * - all the buckets with index such that the upper four bits of the lower byte + * equal b, from group g, with g odd + * - all the buckets with index such that the lower four bits equal b, from + * group g, with g even + * + * That is, given buckets from the new lookup table N(x, y) and the old lookup + * table O(x, y), with x bucket index, and y group index: + * + * - with g odd: N(b, g) := U(O(x, g) for each x : x = (b & 0xf0) >> 4) + * - with g even: N(b, g) := U(O(x, g) for each x : x = b & 0x0f) + * + * where U() denotes the arbitrary union operation (binary OR of n terms). This + * ensures equivalence of the matching results on lookup. + */ +static void pipapo_lt_8b_to_4b(int old_groups, int bsize, + unsigned long *old_lt, unsigned long *new_lt) +{ + int g, b, bsrc, i; + + memset(new_lt, 0, old_groups * 2 * NFT_PIPAPO_BUCKETS(4) * bsize * + sizeof(unsigned long)); + + for (g = 0; g < old_groups * 2; g += 2) { + int src_g = g / 2; + + for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { + for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; + bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); + bsrc++) { + if (((bsrc & 0xf0) >> 4) != b) + continue; + + for (i = 0; i < bsize; i++) + new_lt[i] |= old_lt[bsrc * bsize + i]; + } + + new_lt += bsize; + } + + for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { + for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; + bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); + bsrc++) { + if ((bsrc & 0x0f) != b) + continue; + + for (i = 0; i < bsize; i++) + new_lt[i] |= old_lt[bsrc * bsize + i]; + } + + new_lt += bsize; + } + } +} + +/** + * pipapo_lt_bits_adjust() - Adjust group size for lookup table if needed + * @f: Field containing lookup table + */ +static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) +{ + unsigned long *new_lt; + int groups, bb; + size_t lt_size; + + lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * + sizeof(*f->lt); + + if (f->bb == NFT_PIPAPO_GROUP_BITS_SMALL_SET && + lt_size > NFT_PIPAPO_LT_SIZE_HIGH) { + groups = f->groups * 2; + bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET; + + lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * + sizeof(*f->lt); + } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET && + lt_size < NFT_PIPAPO_LT_SIZE_LOW) { + groups = f->groups / 2; + bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET; + + lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * + sizeof(*f->lt); + + /* Don't increase group width if the resulting lookup table size + * would exceed the upper size threshold for a "small" set. 
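+ * Going back to 8-bit groups halves the group count but multiplies
+ * the bucket count by 16, growing the table roughly eightfold:
+ * checking against the high threshold here avoids bouncing between
+ * the two widths.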
+ */ + if (lt_size > NFT_PIPAPO_LT_SIZE_HIGH) + return; + } else { + return; + } + + new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL); + if (!new_lt) + return; + + NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; + if (f->bb == 4 && bb == 8) { + pipapo_lt_4b_to_8b(f->groups, f->bsize, + NFT_PIPAPO_LT_ALIGN(f->lt), + NFT_PIPAPO_LT_ALIGN(new_lt)); + } else if (f->bb == 8 && bb == 4) { + pipapo_lt_8b_to_4b(f->groups, f->bsize, + NFT_PIPAPO_LT_ALIGN(f->lt), + NFT_PIPAPO_LT_ALIGN(new_lt)); + } else { + BUG(); + } + + f->groups = groups; + f->bb = bb; + kvfree(f->lt); + NFT_PIPAPO_LT_ASSIGN(f, new_lt); +} + +/** * pipapo_insert() - Insert new rule in field given input key and mask length * @f: Field containing lookup table * @k: Input key for classification, without nftables padding @@ -849,7 +901,7 @@ static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, int mask_bits) { - int rule = f->rules++, group, ret; + int rule = f->rules++, group, ret, bit_offset = 0; ret = pipapo_resize(f, f->rules - 1, f->rules); if (ret) @@ -859,28 +911,33 @@ static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, int i, v; u8 mask; - if (group % 2) - v = k[group / 2] & 0x0f; - else - v = k[group / 2] >> 4; + v = k[group / (BITS_PER_BYTE / f->bb)]; + v &= GENMASK(BITS_PER_BYTE - bit_offset - 1, 0); + v >>= (BITS_PER_BYTE - bit_offset) - f->bb; + + bit_offset += f->bb; + bit_offset %= BITS_PER_BYTE; - if (mask_bits >= (group + 1) * 4) { + if (mask_bits >= (group + 1) * f->bb) { /* Not masked */ pipapo_bucket_set(f, rule, group, v); - } else if (mask_bits <= group * 4) { + } else if (mask_bits <= group * f->bb) { /* Completely masked */ - for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) + for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) pipapo_bucket_set(f, rule, group, i); } else { /* The mask limit falls on this group */ - mask = 0x0f >> (mask_bits - group * 4); - for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) { + mask = GENMASK(f->bb - 1, 0); + mask >>= mask_bits - group * f->bb; + for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) { if ((i & ~mask) == (v & ~mask)) pipapo_bucket_set(f, rule, group, i); } } } + pipapo_lt_bits_adjust(f); + return 1; } @@ -1053,8 +1110,12 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, for_each_possible_cpu(i) { unsigned long *scratch; +#ifdef NFT_PIPAPO_ALIGN + unsigned long *scratch_aligned; +#endif - scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2, + scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2 + + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL, cpu_to_node(i)); if (!scratch) { /* On failure, there's no need to undo previous @@ -1070,6 +1131,11 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, kfree(*per_cpu_ptr(clone->scratch, i)); *per_cpu_ptr(clone->scratch, i) = scratch; + +#ifdef NFT_PIPAPO_ALIGN + scratch_aligned = NFT_PIPAPO_LT_ALIGN(scratch); + *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned; +#endif } return 0; @@ -1123,11 +1189,11 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, return -ENOSPC; if (memcmp(start_p, end_p, - f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) > 0) + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) > 0) return -EINVAL; - start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); - end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); + end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } /* Insert */ @@ -1141,22 +1207,19 @@ static int 
nft_pipapo_insert(const struct net *net, const struct nft_set *set, rulemap[i].to = f->rules; ret = memcmp(start, end, - f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); - if (!ret) { - ret = pipapo_insert(f, start, - f->groups * NFT_PIPAPO_GROUP_BITS); - } else { - ret = pipapo_expand(f, start, end, - f->groups * NFT_PIPAPO_GROUP_BITS); - } + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); + if (!ret) + ret = pipapo_insert(f, start, f->groups * f->bb); + else + ret = pipapo_expand(f, start, end, f->groups * f->bb); if (f->bsize > bsize_max) bsize_max = f->bsize; rulemap[i].n = ret; - start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); - end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); + end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { @@ -1200,23 +1263,35 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) if (!new->scratch) goto out_scratch; +#ifdef NFT_PIPAPO_ALIGN + new->scratch_aligned = alloc_percpu(*new->scratch_aligned); + if (!new->scratch_aligned) + goto out_scratch; +#endif + rcu_head_init(&new->rcu); src = old->f; dst = new->f; for (i = 0; i < old->field_count; i++) { + unsigned long *new_lt; + memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); - dst->lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS * - src->bsize * sizeof(*dst->lt), - GFP_KERNEL); - if (!dst->lt) + new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) * + src->bsize * sizeof(*dst->lt) + + NFT_PIPAPO_ALIGN_HEADROOM, + GFP_KERNEL); + if (!new_lt) goto out_lt; - memcpy(dst->lt, src->lt, + NFT_PIPAPO_LT_ASSIGN(dst, new_lt); + + memcpy(NFT_PIPAPO_LT_ALIGN(new_lt), + NFT_PIPAPO_LT_ALIGN(src->lt), src->bsize * sizeof(*dst->lt) * - src->groups * NFT_PIPAPO_BUCKETS); + src->groups * NFT_PIPAPO_BUCKETS(src->bb)); dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL); if (!dst->mt) @@ -1237,8 +1312,11 @@ out_lt: kvfree(dst->lt); dst--; } - free_percpu(new->scratch); +#ifdef NFT_PIPAPO_ALIGN + free_percpu(new->scratch_aligned); +#endif out_scratch: + free_percpu(new->scratch); kfree(new); return ERR_PTR(-ENOMEM); @@ -1394,9 +1472,10 @@ static void pipapo_drop(struct nft_pipapo_match *m, unsigned long *pos; int b; - pos = f->lt + g * NFT_PIPAPO_BUCKETS * f->bsize; + pos = NFT_PIPAPO_LT_ALIGN(f->lt) + g * + NFT_PIPAPO_BUCKETS(f->bb) * f->bsize; - for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { bitmap_cut(pos, pos, rulemap[i].to, rulemap[i].n, f->bsize * BITS_PER_LONG); @@ -1414,6 +1493,8 @@ static void pipapo_drop(struct nft_pipapo_match *m, ; } f->rules -= rulemap[i].n; + + pipapo_lt_bits_adjust(f); } } @@ -1498,6 +1579,9 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) for_each_possible_cpu(i) kfree(*per_cpu_ptr(m->scratch, i)); +#ifdef NFT_PIPAPO_ALIGN + free_percpu(m->scratch_aligned); +#endif free_percpu(m->scratch); pipapo_free_fields(m); @@ -1690,30 +1774,33 @@ static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, int rule_count, u8 *left, u8 *right) { + int g, mask_len = 0, bit_offset = 0; u8 *l = left, *r = right; - int g, mask_len = 0; for (g = 0; g < f->groups; g++) { int b, x0, x1; x0 = -1; x1 = -1; - for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { unsigned long *pos; - pos = f->lt + (g * NFT_PIPAPO_BUCKETS + b) * f->bsize; + pos = NFT_PIPAPO_LT_ALIGN(f->lt) + + (g * 
NFT_PIPAPO_BUCKETS(f->bb) + b) * f->bsize; if (test_bit(first_rule, pos) && x0 == -1) x0 = b; if (test_bit(first_rule + rule_count - 1, pos)) x1 = b; } - if (g % 2) { - *(l++) |= x0 & 0x0f; - *(r++) |= x1 & 0x0f; - } else { - *l |= x0 << 4; - *r |= x1 << 4; + *l |= x0 << (BITS_PER_BYTE - f->bb - bit_offset); + *r |= x1 << (BITS_PER_BYTE - f->bb - bit_offset); + + bit_offset += f->bb; + if (bit_offset >= BITS_PER_BYTE) { + bit_offset %= BITS_PER_BYTE; + l++; + r++; } if (x1 - x0 == 0) @@ -1748,8 +1835,9 @@ static bool pipapo_match_field(struct nft_pipapo_field *f, pipapo_get_boundaries(f, first_rule, rule_count, left, right); - return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) && - !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); + return !memcmp(start, left, + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) && + !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); } /** @@ -1801,8 +1889,8 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, rules_fx = f->mt[start].n; start = f->mt[start].to; - match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); - match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); + match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } if (i == m->field_count) { @@ -1885,56 +1973,24 @@ static u64 nft_pipapo_privsize(const struct nlattr * const nla[], } /** - * nft_pipapo_estimate() - Estimate set size, space and lookup complexity - * @desc: Set description, element count and field description used here + * nft_pipapo_estimate() - Set size, space and lookup complexity + * @desc: Set description, element count and field description used * @features: Flags: NFT_SET_INTERVAL needs to be there * @est: Storage for estimation data * - * The size for this set type can vary dramatically, as it depends on the number - * of rules (composing netmasks) the entries expand to. We compute the worst - * case here. - * - * In general, for a non-ranged entry or a single composing netmask, we need - * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that - * is, each input bit needs four bits of matching data), plus a bucket in the - * mapping table for each field. - * - * Return: true only for compatible range concatenations + * Return: true if set description is compatible, false otherwise */ static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { - unsigned long entry_size; - int i; - - if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1) + if (!(features & NFT_SET_INTERVAL) || + desc->field_count < NFT_PIPAPO_MIN_FIELDS) return false; - for (i = 0, entry_size = 0; i < desc->field_count; i++) { - unsigned long rules; - - if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES) - return false; - - /* Worst-case ranges for each concatenated field: each n-bit - * field can expand to up to n * 2 rules in each bucket, and - * each rule also needs a mapping bucket. 
- */ - rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2; - entry_size += rules * NFT_PIPAPO_BUCKETS / BITS_PER_BYTE; - entry_size += rules * sizeof(union nft_pipapo_map_bucket); - } - - /* Rules in lookup and mapping tables are needed for each entry */ - est->size = desc->size * entry_size; - if (est->size && div_u64(est->size, desc->size) != entry_size) + est->size = pipapo_estimate_size(desc); + if (!est->size) return false; - est->size += sizeof(struct nft_pipapo) + - sizeof(struct nft_pipapo_match) * 2; - - est->size += sizeof(struct nft_pipapo_field) * desc->field_count; - est->lookup = NFT_SET_CLASS_O_LOG_N; est->space = NFT_SET_CLASS_O_N; @@ -1961,38 +2017,52 @@ static int nft_pipapo_init(const struct nft_set *set, struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; struct nft_pipapo_field *f; - int err, i; + int err, i, field_count; + + field_count = desc->field_count ? : 1; - if (desc->field_count > NFT_PIPAPO_MAX_FIELDS) + if (field_count > NFT_PIPAPO_MAX_FIELDS) return -EINVAL; - m = kmalloc(sizeof(*priv->match) + sizeof(*f) * desc->field_count, + m = kmalloc(sizeof(*priv->match) + sizeof(*f) * field_count, GFP_KERNEL); if (!m) return -ENOMEM; - m->field_count = desc->field_count; + m->field_count = field_count; m->bsize_max = 0; m->scratch = alloc_percpu(unsigned long *); if (!m->scratch) { err = -ENOMEM; - goto out_free; + goto out_scratch; } for_each_possible_cpu(i) *per_cpu_ptr(m->scratch, i) = NULL; +#ifdef NFT_PIPAPO_ALIGN + m->scratch_aligned = alloc_percpu(unsigned long *); + if (!m->scratch_aligned) { + err = -ENOMEM; + goto out_free; + } + for_each_possible_cpu(i) + *per_cpu_ptr(m->scratch_aligned, i) = NULL; +#endif + rcu_head_init(&m->rcu); nft_pipapo_for_each_field(f, i, m) { - f->groups = desc->field_len[i] * NFT_PIPAPO_GROUPS_PER_BYTE; - priv->groups += f->groups; + int len = desc->field_len[i] ? 
: set->klen; - priv->width += round_up(desc->field_len[i], sizeof(u32)); + f->bb = NFT_PIPAPO_GROUP_BITS_INIT; + f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f); + + priv->width += round_up(len, sizeof(u32)); f->bsize = 0; f->rules = 0; - f->lt = NULL; + NFT_PIPAPO_LT_ASSIGN(f, NULL); f->mt = NULL; } @@ -2010,7 +2080,11 @@ static int nft_pipapo_init(const struct nft_set *set, return 0; out_free: +#ifdef NFT_PIPAPO_ALIGN + free_percpu(m->scratch_aligned); +#endif free_percpu(m->scratch); +out_scratch: kfree(m); return err; @@ -2045,16 +2119,21 @@ static void nft_pipapo_destroy(const struct nft_set *set) nft_set_elem_destroy(set, e, true); } +#ifdef NFT_PIPAPO_ALIGN + free_percpu(m->scratch_aligned); +#endif for_each_possible_cpu(cpu) kfree(*per_cpu_ptr(m->scratch, cpu)); free_percpu(m->scratch); - pipapo_free_fields(m); kfree(m); priv->match = NULL; } if (priv->clone) { +#ifdef NFT_PIPAPO_ALIGN + free_percpu(priv->clone->scratch_aligned); +#endif for_each_possible_cpu(cpu) kfree(*per_cpu_ptr(priv->clone->scratch, cpu)); free_percpu(priv->clone->scratch); @@ -2081,8 +2160,7 @@ static void nft_pipapo_gc_init(const struct nft_set *set) priv->last_gc = jiffies; } -struct nft_set_type nft_set_pipapo_type __read_mostly = { - .owner = THIS_MODULE, +const struct nft_set_type nft_set_pipapo_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { @@ -2102,3 +2180,26 @@ struct nft_set_type nft_set_pipapo_type __read_mostly = { .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; + +#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2) +const struct nft_set_type nft_set_pipapo_avx2_type = { + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | + NFT_SET_TIMEOUT, + .ops = { + .lookup = nft_pipapo_avx2_lookup, + .insert = nft_pipapo_insert, + .activate = nft_pipapo_activate, + .deactivate = nft_pipapo_deactivate, + .flush = nft_pipapo_flush, + .remove = nft_pipapo_remove, + .walk = nft_pipapo_walk, + .get = nft_pipapo_get, + .privsize = nft_pipapo_privsize, + .estimate = nft_pipapo_avx2_estimate, + .init = nft_pipapo_init, + .destroy = nft_pipapo_destroy, + .gc_init = nft_pipapo_gc_init, + .elemsize = offsetof(struct nft_pipapo_elem, ext), + }, +}; +#endif diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h new file mode 100644 index 000000000000..25a75591583e --- /dev/null +++ b/net/netfilter/nft_set_pipapo.h @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef _NFT_SET_PIPAPO_H +#define _NFT_SET_PIPAPO_H + +#include <linux/log2.h> +#include <net/ipv6.h> /* For the maximum length of a field */ + +/* Count of concatenated fields depends on count of 32-bit nftables registers */ +#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT + +/* Restrict usage to multiple fields, make sure rbtree is used otherwise */ +#define NFT_PIPAPO_MIN_FIELDS 2 + +/* Largest supported field size */ +#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr)) +#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE) + +/* Bits to be grouped together in table buckets depending on set size */ +#define NFT_PIPAPO_GROUP_BITS_INIT NFT_PIPAPO_GROUP_BITS_SMALL_SET +#define NFT_PIPAPO_GROUP_BITS_SMALL_SET 8 +#define NFT_PIPAPO_GROUP_BITS_LARGE_SET 4 +#define NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4 \ + BUILD_BUG_ON((NFT_PIPAPO_GROUP_BITS_SMALL_SET != 8) || \ + (NFT_PIPAPO_GROUP_BITS_LARGE_SET != 4)) +#define NFT_PIPAPO_GROUPS_PER_BYTE(f) (BITS_PER_BYTE / (f)->bb) + +/* If a lookup table gets bigger than NFT_PIPAPO_LT_SIZE_HIGH, switch to the + * small group width, and switch
to the big group width if the table gets + * smaller than NFT_PIPAPO_LT_SIZE_LOW. + * + * Picking 2MiB as threshold (for a single table) avoids as much as possible + * crossing page boundaries on most architectures (x86-64 and MIPS huge pages, + * ARMv7 supersections, POWER "large" pages, SPARC Level 1 regions, etc.), which + * keeps performance nice in case kvmalloc() gives us non-contiguous areas. + */ +#define NFT_PIPAPO_LT_SIZE_THRESHOLD (1 << 21) +#define NFT_PIPAPO_LT_SIZE_HYSTERESIS (1 << 16) +#define NFT_PIPAPO_LT_SIZE_HIGH NFT_PIPAPO_LT_SIZE_THRESHOLD +#define NFT_PIPAPO_LT_SIZE_LOW NFT_PIPAPO_LT_SIZE_THRESHOLD - \ + NFT_PIPAPO_LT_SIZE_HYSTERESIS + +/* Fields are padded to 32 bits in input registers */ +#define NFT_PIPAPO_GROUPS_PADDED_SIZE(f) \ + (round_up((f)->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f), sizeof(u32))) +#define NFT_PIPAPO_GROUPS_PADDING(f) \ + (NFT_PIPAPO_GROUPS_PADDED_SIZE(f) - (f)->groups / \ + NFT_PIPAPO_GROUPS_PER_BYTE(f)) + +/* Number of buckets given by 2 ^ n, with n bucket bits */ +#define NFT_PIPAPO_BUCKETS(bb) (1 << (bb)) + +/* Each n-bit range maps to up to n * 2 rules */ +#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2)) + +/* Use the rest of mapping table buckets for rule indices, but it makes no sense + * to exceed 32 bits + */ +#if BITS_PER_LONG == 64 +#define NFT_PIPAPO_MAP_TOBITS 32 +#else +#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS) +#endif + +/* ...which gives us the highest allowed index for a rule */ +#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \ + - (1UL << NFT_PIPAPO_MAP_NBITS)) + +/* Definitions for vectorised implementations */ +#ifdef NFT_PIPAPO_ALIGN +#define NFT_PIPAPO_ALIGN_HEADROOM \ + (NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN) +#define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN)) +#define NFT_PIPAPO_LT_ASSIGN(field, x) \ + do { \ + (field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x); \ + (field)->lt = (x); \ + } while (0) +#else +#define NFT_PIPAPO_ALIGN_HEADROOM 0 +#define NFT_PIPAPO_LT_ALIGN(lt) (lt) +#define NFT_PIPAPO_LT_ASSIGN(field, x) ((field)->lt = (x)) +#endif /* NFT_PIPAPO_ALIGN */ + +#define nft_pipapo_for_each_field(field, index, match) \ + for ((field) = (match)->f, (index) = 0; \ + (index) < (match)->field_count; \ + (index)++, (field)++) + +/** + * union nft_pipapo_map_bucket - Bucket of mapping table + * @to: First rule number (in next field) this rule maps to + * @n: Number of rules (in next field) this rule maps to + * @e: If there's no next field, pointer to element this rule maps to + */ +union nft_pipapo_map_bucket { + struct { +#if BITS_PER_LONG == 64 + static_assert(NFT_PIPAPO_MAP_TOBITS <= 32); + u32 to; + + static_assert(NFT_PIPAPO_MAP_NBITS <= 32); + u32 n; +#else + unsigned long to:NFT_PIPAPO_MAP_TOBITS; + unsigned long n:NFT_PIPAPO_MAP_NBITS; +#endif + }; + struct nft_pipapo_elem *e; +}; + +/** + * struct nft_pipapo_field - Lookup, mapping tables and related data for a field + * @groups: Amount of bit groups + * @rules: Number of inserted rules + * @bsize: Size of each bucket in lookup table, in longs + * @bb: Number of bits grouped together in lookup table buckets + * @lt: Lookup table: 'groups' rows of buckets + * @lt_aligned: Version of @lt aligned to NFT_PIPAPO_ALIGN bytes + * @mt: Mapping table: one bucket per rule + */ +struct nft_pipapo_field { + int groups; + unsigned long rules; + size_t bsize; + int bb; +#ifdef NFT_PIPAPO_ALIGN + unsigned long *lt_aligned; +#endif + unsigned long *lt; + union nft_pipapo_map_bucket *mt; +}; + 
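+/* Editor's illustration, not part of the original patch: with the layout
+ * above, buckets are stored group-major, NFT_PIPAPO_BUCKETS(f->bb) buckets
+ * per group, each f->bsize longs wide, one bit per rule. A hypothetical
+ * helper locating the bucket for value v of bit group g, mirroring the
+ * arithmetic in pipapo_bucket_set() and pipapo_get_boundaries(), would read:
+ *
+ *	static inline unsigned long *
+ *	pipapo_bucket_ptr(struct nft_pipapo_field *f, int g, int v)
+ *	{
+ *		unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt);
+ *
+ *		return lt + (g * NFT_PIPAPO_BUCKETS(f->bb) + v) * f->bsize;
+ *	}
+ */
+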
+/** + * struct nft_pipapo_match - Data used for lookup and matching + * @field_count: Amount of fields in set + * @scratch: Preallocated per-CPU maps for partial matching results + * @scratch_aligned: Version of @scratch aligned to NFT_PIPAPO_ALIGN bytes + * @bsize_max: Maximum lookup table bucket size of all fields, in longs + * @rcu: Matching data is swapped on commits + * @f: Fields, with lookup and mapping tables + */ +struct nft_pipapo_match { + int field_count; +#ifdef NFT_PIPAPO_ALIGN + unsigned long * __percpu *scratch_aligned; +#endif + unsigned long * __percpu *scratch; + size_t bsize_max; + struct rcu_head rcu; + struct nft_pipapo_field f[]; +}; + +/** + * struct nft_pipapo - Representation of a set + * @match: Currently in-use matching data + * @clone: Copy where pending insertions and deletions are kept + * @width: Total bytes to be matched for one packet, including padding + * @dirty: Working copy has pending insertions or deletions + * @last_gc: Timestamp of last garbage collection run, jiffies + */ +struct nft_pipapo { + struct nft_pipapo_match __rcu *match; + struct nft_pipapo_match *clone; + int width; + bool dirty; + unsigned long last_gc; +}; + +struct nft_pipapo_elem; + +/** + * struct nft_pipapo_elem - API-facing representation of single set element + * @ext: nftables API extensions + */ +struct nft_pipapo_elem { + struct nft_set_ext ext; +}; + +int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, + union nft_pipapo_map_bucket *mt, bool match_only); + +/** + * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets + * @f: Field including lookup table + * @dst: Area to store result + * @data: Input data selecting table buckets + */ +static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f, + unsigned long *dst, + const u8 *data) +{ + unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt); + int group; + + for (group = 0; group < f->groups; group += BITS_PER_BYTE / 4, data++) { + u8 v; + + v = *data >> 4; + __bitmap_and(dst, dst, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS(4); + + v = *data & 0x0f; + __bitmap_and(dst, dst, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS(4); + } +} + +/** + * pipapo_and_field_buckets_8bit() - Intersect 8-bit buckets + * @f: Field including lookup table + * @dst: Area to store result + * @data: Input data selecting table buckets + */ +static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f, + unsigned long *dst, + const u8 *data) +{ + unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt); + int group; + + for (group = 0; group < f->groups; group++, data++) { + __bitmap_and(dst, dst, lt + *data * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS(8); + } +} + +/** + * pipapo_estimate_size() - Estimate worst-case for set size + * @desc: Set description, element count and field description used here + * + * The size for this set type can vary dramatically, as it depends on the number + * of rules (composing netmasks) the entries expand to. We compute the worst + * case here. + * + * In general, for a non-ranged entry or a single composing netmask, we need + * one bit in each of the NFT_PIPAPO_BUCKETS(NFT_PIPAPO_GROUP_BITS_INIT) lookup + * table buckets (256 with the initial 8-bit grouping), plus a bucket in the + * mapping table for each field.
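+ *
+ * Example, under the initial 8-bit grouping (256 buckets per group): a
+ * 16-byte IPv6 address field expands to at most ilog2(128) * 2 = 14 rules,
+ * costing 14 * 256 / 8 = 448 bytes of lookup table plus 14 mapping table
+ * buckets per entry.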
+ * + * Return: worst-case set size in bytes, 0 on any overflow + */ +static u64 pipapo_estimate_size(const struct nft_set_desc *desc) +{ + unsigned long entry_size; + u64 size; + int i; + + for (i = 0, entry_size = 0; i < desc->field_count; i++) { + unsigned long rules; + + if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES) + return 0; + + /* Worst-case ranges for each concatenated field: each n-bit + * field can expand to up to n * 2 rules in each bucket, and + * each rule also needs a mapping bucket. + */ + rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2; + entry_size += rules * + NFT_PIPAPO_BUCKETS(NFT_PIPAPO_GROUP_BITS_INIT) / + BITS_PER_BYTE; + entry_size += rules * sizeof(union nft_pipapo_map_bucket); + } + + /* Rules in lookup and mapping tables are needed for each entry */ + size = desc->size * entry_size; + if (size && div_u64(size, desc->size) != entry_size) + return 0; + + size += sizeof(struct nft_pipapo) + sizeof(struct nft_pipapo_match) * 2; + + size += sizeof(struct nft_pipapo_field) * desc->field_count; + + return size; +} + +#endif /* _NFT_SET_PIPAPO_H */ diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c new file mode 100644 index 000000000000..d65ae0e23028 --- /dev/null +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -0,0 +1,1223 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines + * + * Copyright (c) 2019-2020 Red Hat GmbH + * + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <uapi/linux/netfilter/nf_tables.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> + +#include <linux/compiler.h> +#include <asm/fpu/api.h> + +#include "nft_set_pipapo_avx2.h" +#include "nft_set_pipapo.h" + +#define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG) + +/* Load from memory into YMM register with non-temporal hint ("stream load"), + * that is, don't fetch lines from memory into the cache. This avoids pushing + * precious packet data out of the cache hierarchy, and is appropriate when: + * + * - loading buckets from lookup tables, as they are not going to be used + * again before packets are entirely classified + * + * - loading the result bitmap from the previous field, as it's never used + * again + */ +#define NFT_PIPAPO_AVX2_LOAD(reg, loc) \ + asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc)) + +/* Stream a single lookup table bucket into YMM register given lookup table, + * group index, value of packet bits, bucket size. + */ +#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \ + NFT_PIPAPO_AVX2_LOAD(reg, \ + lt[((group) * NFT_PIPAPO_BUCKETS(4) + \ + (v)) * (bsize)]) +#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \ + NFT_PIPAPO_AVX2_LOAD(reg, \ + lt[((group) * NFT_PIPAPO_BUCKETS(8) + \ + (v)) * (bsize)]) + +/* Bitwise AND: the staple operation of this algorithm */ +#define NFT_PIPAPO_AVX2_AND(dst, a, b) \ + asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst) + +/* Jump to label if @reg is zero */ +#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \ + asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ + "je %l[" #label "]" : : : : label) + +/* Store 256 bits from YMM register into memory. 
Contrary to bucket load + * operation, we don't bypass the cache here, as stored matching results + * are always used shortly after. + */ +#define NFT_PIPAPO_AVX2_STORE(loc, reg) \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc)) + +/* Zero out a complete YMM register, @reg */ +#define NFT_PIPAPO_AVX2_ZERO(reg) \ + asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg) + +/* Current working bitmap index, toggled between field matches */ +static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index); + +/** + * nft_pipapo_avx2_prepare() - Prepare before main algorithm body + * + * This zeroes out ymm15, which is later used whenever we need to clear a + * memory location, by storing its content into memory. + */ +static void nft_pipapo_avx2_prepare(void) +{ + NFT_PIPAPO_AVX2_ZERO(15); +} + +/** + * nft_pipapo_avx2_fill() - Fill a bitmap region with ones + * @data: Base memory area + * @start: First bit to set + * @len: Count of bits to fill + * + * This is nothing else than a version of bitmap_set(), as used e.g. by + * pipapo_refill(), tailored for the microarchitectures using it and better + * suited for the specific usage: it's very likely that we'll set a small number + * of bits, not crossing a word boundary, and correct branch prediction is + * critical here. + * + * This function doesn't actually use any AVX2 instruction. + */ +static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len) +{ + int offset = start % BITS_PER_LONG; + unsigned long mask; + + data += start / BITS_PER_LONG; + + if (likely(len == 1)) { + *data |= BIT(offset); + return; + } + + if (likely(len < BITS_PER_LONG || offset)) { + if (likely(len + offset <= BITS_PER_LONG)) { + *data |= GENMASK(len - 1 + offset, offset); + return; + } + + *data |= ~0UL << offset; + len -= BITS_PER_LONG - offset; + data++; + + if (len <= BITS_PER_LONG) { + mask = ~0UL >> (BITS_PER_LONG - len); + *data |= mask; + return; + } + } + + memset(data, 0xff, len / BITS_PER_BYTE); + data += len / BITS_PER_LONG; + + len %= BITS_PER_LONG; + if (len) + *data |= ~0UL >> (BITS_PER_LONG - len); +} + +/** + * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits + * @offset: Start from given bitmap (equivalent to bucket) offset, in longs + * @map: Bitmap to be scanned for set bits + * @dst: Destination bitmap + * @mt: Mapping table containing bit set specifiers + * @len: Length of bitmap in longs + * @last: Return index of first set bit, if this is the last field + * + * This is an alternative implementation of pipapo_refill() suitable for usage + * with AVX2 lookup routines: we know there are four words to be scanned, at + * a given offset inside the map, for each matching iteration. + * + * This function doesn't actually use any AVX2 instruction. + * + * Return: first set bit index if @last, index of first filled word otherwise. 
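+ *
+ * The four words scanned per call make up one 256-bit AVX2 block:
+ * NFT_PIPAPO_LONGS_PER_M256 is four 64-bit longs, matching the stride
+ * the lookup routines below advance by.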
+ */ +static int nft_pipapo_avx2_refill(int offset, unsigned long *map, + unsigned long *dst, + union nft_pipapo_map_bucket *mt, bool last) +{ + int ret = -1; + +#define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \ + do { \ + while (map[(x)]) { \ + int r = __builtin_ctzl(map[(x)]); \ + int i = (offset + (x)) * BITS_PER_LONG + r; \ + \ + if (last) \ + return i; \ + \ + nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \ + \ + if (ret == -1) \ + ret = mt[i].to; \ + \ + map[(x)] &= ~(1UL << r); \ + } \ + } while (0) + + NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0); + NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1); + NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2); + NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3); +#undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * Load buckets from lookup table corresponding to the values of each 4-bit + * group of packet bytes, and perform a bitwise intersection between them. If + * this is the first field in the set, simply AND the buckets together + * (equivalent to using an all-ones starting bitmap), use the provided starting + * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next + * working bitmap, @fill. + * + * This is used for 8-bit fields (i.e. protocol numbers). + * + * Out-of-order (and superscalar) execution is vital here, so it's critical to + * avoid false data dependencies. CPU and compiler could (mostly) take care of + * this on their own, but the operation ordering is explicitly given here with + * a likely execution order in mind, to highlight possible stalls. That's why + * a number of logically distinct operations (i.e. loading buckets, intersecting + * buckets) are interleaved. + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
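+ *
+ * Example: for a one-byte protocol field carrying 0x11 (UDP), pg[0] = 0x1
+ * (high nibble) and pg[1] = 0x1 (low nibble) each select one bucket of
+ * f->bsize longs, and the two buckets are intersected.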
+ */ +static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf }; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + } else { + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing); + NFT_PIPAPO_AVX2_AND(3, 0, 1); + NFT_PIPAPO_AVX2_AND(4, 2, 3); + } + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 16-bit fields (i.e. ports). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
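+ *
+ * Example: a 16-bit port 0x1a2b is split into pg[] = { 0x1, 0xa, 0x2, 0xb },
+ * one 4-bit group per nibble of the two packet bytes.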
+ */ +static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf }; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + NFT_PIPAPO_AVX2_AND(5, 2, 3); + NFT_PIPAPO_AVX2_AND(7, 4, 5); + } else { + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); + + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); + + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); + NFT_PIPAPO_AVX2_AND(5, 0, 1); + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); + + NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_AND(7, 4, 5); + /* Stall */ + NFT_PIPAPO_AVX2_AND(7, 6, 7); + } + + /* Stall */ + NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 7); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 32-bit fields (i.e. IPv4 addresses). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
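+ *
+ * Example: IPv4 address 192.0.2.1 (bytes 0xc0 0x00 0x02 0x01) expands to
+ * pg[] = { 0xc, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x1 }.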
+ */ +static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, + pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, + }; + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 4, pg[4], bsize); + NFT_PIPAPO_AVX2_AND(5, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 5, pg[5], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 6, pg[6], bsize); + NFT_PIPAPO_AVX2_AND(8, 2, 3); + NFT_PIPAPO_AVX2_AND(9, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize); + NFT_PIPAPO_AVX2_AND(11, 6, 7); + NFT_PIPAPO_AVX2_AND(12, 8, 9); + NFT_PIPAPO_AVX2_AND(13, 10, 11); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(1, 12, 13); + } else { + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); + + NFT_PIPAPO_AVX2_AND(5, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); + NFT_PIPAPO_AVX2_AND(8, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize); + NFT_PIPAPO_AVX2_AND(10, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize); + NFT_PIPAPO_AVX2_AND(12, 6, 7); + NFT_PIPAPO_AVX2_AND(13, 8, 9); + NFT_PIPAPO_AVX2_AND(14, 10, 11); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(1, 12, 13); + NFT_PIPAPO_AVX2_AND(1, 1, 14); + } + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 1); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; + +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 48-bit fields (i.e. MAC addresses/EUI-48). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
+ */ +static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, + pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, + pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf, + }; + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (!first) + NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); + + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); + + if (!first) { + NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); + NFT_PIPAPO_AVX2_AND(1, 1, 0); + } + + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 4, pg[4], bsize); + NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 6, pg[6], bsize); + NFT_PIPAPO_AVX2_AND(9, 1, 4); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize); + NFT_PIPAPO_AVX2_AND(11, 5, 6); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 8, pg[8], bsize); + NFT_PIPAPO_AVX2_AND(13, 7, 8); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 9, pg[9], bsize); + + NFT_PIPAPO_AVX2_AND(0, 9, 10); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 10, pg[10], bsize); + NFT_PIPAPO_AVX2_AND(2, 11, 12); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize); + NFT_PIPAPO_AVX2_AND(4, 13, 14); + NFT_PIPAPO_AVX2_AND(5, 0, 1); + + NFT_PIPAPO_AVX2_AND(6, 2, 3); + + /* Stalls */ + NFT_PIPAPO_AVX2_AND(7, 4, 5); + NFT_PIPAPO_AVX2_AND(8, 6, 7); + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 8); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 128-bit fields (i.e. IPv6 addresses). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
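
When @first is false, the routines above AND the previous field's result (map[]) into the current one, and NFT_PIPAPO_AVX2_NOMATCH_GOTO bails out of a row as soon as that result is all-zero. The scalar equivalent of that early exit, assuming NFT_PIPAPO_LONGS_PER_M256 is 4 on 64-bit (256 / BITS_PER_LONG):

	/* True when the previous field left no candidate rules in this row,
	 * so the remaining bucket loads and ANDs can be skipped. */
	static bool pipapo_row_empty(const unsigned long *map, int i_ul)
	{
		int w;

		for (w = 0; w < 4; w++)
			if (map[i_ul + w])
				return false;

		return true;
	}
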
+ */ +static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, + pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, + pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf, + pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf, + pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf, + pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf, + pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf, + pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf, + }; + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (!first) + NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); + + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); + if (!first) { + NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); + NFT_PIPAPO_AVX2_AND(1, 1, 0); + } + + NFT_PIPAPO_AVX2_AND(5, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); + NFT_PIPAPO_AVX2_AND(8, 1, 4); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize); + NFT_PIPAPO_AVX2_AND(10, 5, 6); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize); + NFT_PIPAPO_AVX2_AND(12, 7, 8); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt, 8, pg[8], bsize); + NFT_PIPAPO_AVX2_AND(14, 9, 10); + + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 9, pg[9], bsize); + NFT_PIPAPO_AVX2_AND(1, 11, 12); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 10, pg[10], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize); + NFT_PIPAPO_AVX2_AND(4, 13, 14); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 12, pg[12], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 13, pg[13], bsize); + NFT_PIPAPO_AVX2_AND(7, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 14, pg[14], bsize); + NFT_PIPAPO_AVX2_AND(9, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize); + NFT_PIPAPO_AVX2_AND(11, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize); + NFT_PIPAPO_AVX2_AND(13, 6, 7); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize); + + NFT_PIPAPO_AVX2_AND(0, 8, 9); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 18, pg[18], bsize); + NFT_PIPAPO_AVX2_AND(2, 10, 11); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 19, pg[19], bsize); + NFT_PIPAPO_AVX2_AND(4, 12, 13); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 20, pg[20], bsize); + NFT_PIPAPO_AVX2_AND(6, 14, 0); + NFT_PIPAPO_AVX2_AND(7, 1, 2); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 21, pg[21], bsize); + NFT_PIPAPO_AVX2_AND(9, 3, 4); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize); + NFT_PIPAPO_AVX2_AND(11, 5, 6); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize); + NFT_PIPAPO_AVX2_AND(13, 7, 8); + + NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 25, pg[25], bsize); + NFT_PIPAPO_AVX2_AND(1, 9, 10); + NFT_PIPAPO_AVX2_AND(2, 11, 12); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 26, pg[26], bsize); + NFT_PIPAPO_AVX2_AND(4, 13, 14); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 27, pg[27], bsize); + NFT_PIPAPO_AVX2_AND(6, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 28, pg[28], bsize); + 
NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 29, pg[29], bsize); + NFT_PIPAPO_AVX2_AND(9, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize); + NFT_PIPAPO_AVX2_AND(11, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize); + + NFT_PIPAPO_AVX2_AND(0, 6, 7); + NFT_PIPAPO_AVX2_AND(1, 8, 9); + NFT_PIPAPO_AVX2_AND(2, 10, 11); + NFT_PIPAPO_AVX2_AND(3, 12, 0); + + /* Stalls */ + NFT_PIPAPO_AVX2_AND(4, 1, 2); + NFT_PIPAPO_AVX2_AND(5, 3, 4); + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 5); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 8-bit fields (i.e. protocol numbers). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). + */ +static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize); + } else { + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); + NFT_PIPAPO_AVX2_AND(2, 0, 1); + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); + } + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 2); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 16-bit fields (i.e. ports). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
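
With 8-bit groups there are 256 buckets per group instead of 16, so a field byte selects its bucket directly. A sketch of the addressing the BUCKET_LOAD8 macros effectively perform (helper hypothetical; group-major layout assumed, as in the scalar pipapo code):

	/* Address of the 256-bit row to load for one 8-bit group: skip
	 * `group` groups of 256 buckets, pick bucket `v`, then add the row
	 * offset (in longs) within that bucket. */
	static const unsigned long *pipapo_bucket_row_8b(const unsigned long *lt,
							 unsigned long bsize,
							 int group, u8 v,
							 unsigned long row)
	{
		return lt + (group * 256UL + v) * bsize + row;
	}
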
+ */ +static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + } else { + NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(3, 0, 1); + NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); + NFT_PIPAPO_AVX2_AND(4, 3, 2); + } + + /* Stall */ + NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 32-bit fields (i.e. IPv4 addresses). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
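
The /* Stall */ comments above flag ANDs whose inputs are themselves fresh AND results, i.e. points where the dependency chain cannot be shortened any further. This is why the sequences balance their ANDs as a tree rather than a straight chain; in miniature:

	/* Illustration only: a serial reduction of four bitmaps is three
	 * dependent operations deep, a balanced tree only two, because t0
	 * and t1 have no dependency and can issue in parallel. */
	static inline unsigned long and4_serial(unsigned long a, unsigned long b,
						unsigned long c, unsigned long d)
	{
		return ((a & b) & c) & d;
	}

	static inline unsigned long and4_tree(unsigned long a, unsigned long b,
					      unsigned long c, unsigned long d)
	{
		unsigned long t0 = a & b;
		unsigned long t1 = c & d;

		return t0 & t1;
	}
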
+ */ +static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(4, 0, 1); + NFT_PIPAPO_AVX2_AND(5, 2, 3); + NFT_PIPAPO_AVX2_AND(0, 4, 5); + } else { + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize); + + NFT_PIPAPO_AVX2_AND(5, 0, 1); + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); + NFT_PIPAPO_AVX2_AND(6, 2, 3); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(7, 4, 5); + NFT_PIPAPO_AVX2_AND(0, 6, 7); + } + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 0); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; + +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 48-bit fields (i.e. MAC addresses/EUI-48). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
+ */
+static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
+					struct nft_pipapo_field *f, int offset,
+					const u8 *pkt, bool first, bool last)
+{
+	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+	unsigned long *lt = f->lt, bsize = f->bsize;
+
+	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+		if (first) {
+			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
+			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
+			NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize);
+
+			NFT_PIPAPO_AVX2_AND(5, 0, 1);
+			NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 5, pkt[5], bsize);
+			NFT_PIPAPO_AVX2_AND(7, 2, 3);
+
+			/* Stall */
+			NFT_PIPAPO_AVX2_AND(0, 4, 5);
+			NFT_PIPAPO_AVX2_AND(1, 6, 7);
+			NFT_PIPAPO_AVX2_AND(4, 0, 1);
+		} else {
+			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
+			NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
+
+			NFT_PIPAPO_AVX2_AND(5, 0, 1);
+			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+			NFT_PIPAPO_AVX2_AND(6, 2, 3);
+			NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 4, pkt[4], bsize);
+			NFT_PIPAPO_AVX2_AND(0, 4, 5);
+			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 5, pkt[5], bsize);
+			NFT_PIPAPO_AVX2_AND(2, 6, 7);
+
+			/* Stall */
+			NFT_PIPAPO_AVX2_AND(3, 0, 1);
+			NFT_PIPAPO_AVX2_AND(4, 2, 3);
+		}
+
+		NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+		NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+		if (last)
+			return b;
+
+		if (unlikely(ret == -1))
+			ret = b / XSAVE_YMM_SIZE;
+
+		continue;
+
+nomatch:
+		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+		;
+	}
+
+	return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups
+ * @map:	Previous match result, used as initial bitmap
+ * @fill:	Destination bitmap to be filled with current match result
+ * @f:		Field, containing lookup and mapping tables
+ * @offset:	Ignore buckets before the given index, no bits are filled there
+ * @pkt:	Packet data, pointer to input nftables register
+ * @first:	If this is the first field, don't source previous result
+ * @last:	Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 128-bit fields (i.e. IPv6 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
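
On a failed row the routines store register 15 into map[i_ul] before moving on. Assuming register 15 is kept all-zero (set up once before the field loop by nft_pipapo_avx2_prepare()), this is simply clearing the row so stale bits cannot leak into the next field's AND; a scalar sketch:

	/* Nomatch path: clear this row of the scratch map. */
	static void pipapo_clear_row(unsigned long *map, int i_ul)
	{
		int w;

		for (w = 0; w < 4; w++)		/* 4 longs per 256-bit row */
			map[i_ul + w] = 0;
	}
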
+ */ +static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (!first) + NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); + + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize); + if (!first) { + NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); + NFT_PIPAPO_AVX2_AND(1, 1, 0); + } + NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize); + + NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 4, pkt[4], bsize); + NFT_PIPAPO_AVX2_AND(6, 1, 2); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 5, pkt[5], bsize); + NFT_PIPAPO_AVX2_AND(0, 3, 4); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 6, pkt[6], bsize); + + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 7, pkt[7], bsize); + NFT_PIPAPO_AVX2_AND(3, 5, 6); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize); + + NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize); + NFT_PIPAPO_AVX2_AND(0, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize); + NFT_PIPAPO_AVX2_AND(2, 6, 7); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize); + NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize); + NFT_PIPAPO_AVX2_AND(0, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize); + NFT_PIPAPO_AVX2_AND(2, 6, 7); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(5, 2, 3); + NFT_PIPAPO_AVX2_AND(6, 4, 5); + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 6); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; + +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * This function should never be called, but is provided for the case the field + * size doesn't match any of the known data types. Matching rate is + * substantially lower than AVX2 routines. + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
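
After the AND chain, the refill step turns surviving rule bits into either a final rule index (@last) or the set of candidate rules for the next field, via the field's mapping table. A heavily simplified scalar model, assuming each mapping entry takes rule r to a range [to, to + n) of next-field rules (struct and helper names hypothetical; the real helpers also report where the next field may start scanning):

	#include <linux/bitmap.h>

	struct pipapo_map_sketch {
		unsigned int to, n;
	};

	static int pipapo_refill_sketch(unsigned long *map, int words,
					unsigned long *fill,
					const struct pipapo_map_sketch *mt,
					bool last)
	{
		int k, r, ret = -1;

		for (k = 0; k < words; k++) {
			for (r = 0; r < BITS_PER_LONG; r++) {
				int rule = k * BITS_PER_LONG + r;

				if (!(map[k] & (1UL << r)))
					continue;

				if (last)
					return rule;	/* matching rule index */

				/* propagate to the mapped next-field rules */
				bitmap_set(fill, mt[rule].to, mt[rule].n);
				ret = 0;
			}
			map[k] = 0;	/* consume: leave the scratch map clean */
		}

		return ret;
	}
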
+ */ +static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + unsigned long *lt = f->lt, bsize = f->bsize; + int i, ret = -1, b; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + + if (first) + memset(map, 0xff, bsize * sizeof(*map)); + + for (i = offset; i < bsize; i++) { + if (f->bb == 8) + pipapo_and_field_buckets_8bit(f, map, pkt); + else + pipapo_and_field_buckets_4bit(f, map, pkt); + NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; + + b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last); + + if (last) + return b; + + if (ret == -1) + ret = b / XSAVE_YMM_SIZE; + } + + return ret; +} + +/** + * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity + * @desc: Set description, element count and field description used + * @features: Flags: NFT_SET_INTERVAL needs to be there + * @est: Storage for estimation data + * + * Return: true if set is compatible and AVX2 available, false otherwise. + */ +bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + if (!(features & NFT_SET_INTERVAL) || + desc->field_count < NFT_PIPAPO_MIN_FIELDS) + return false; + + if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX)) + return false; + + est->size = pipapo_estimate_size(desc); + if (!est->size) + return false; + + est->lookup = NFT_SET_CLASS_O_LOG_N; + + est->space = NFT_SET_CLASS_O_N; + + return true; +} + +/** + * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @ext: nftables API extension pointer, filled with matching reference + * + * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. + * + * This implementation exploits the repetitive characteristic of the algorithm + * to provide a fast, vectorised version using the AVX2 SIMD instruction set. + * + * Return: true on match, false otherwise. + */ +bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + struct nft_pipapo *priv = nft_set_priv(set); + unsigned long *res, *fill, *scratch; + u8 genmask = nft_genmask_cur(net); + const u8 *rp = (const u8 *)key; + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + bool map_index; + int i, ret = 0; + + m = rcu_dereference(priv->match); + + /* This also protects access to all data related to scratch maps */ + kernel_fpu_begin(); + + scratch = *raw_cpu_ptr(m->scratch_aligned); + if (unlikely(!scratch)) { + kernel_fpu_end(); + return false; + } + map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index); + + res = scratch + (map_index ? m->bsize_max : 0); + fill = scratch + (map_index ? 
0 : m->bsize_max);
+
+	/* Starting map doesn't need to be set for this implementation */
+
+	nft_pipapo_avx2_prepare();
+
+next_match:
+	nft_pipapo_for_each_field(f, i, m) {
+		bool last = i == m->field_count - 1, first = !i;
+
+#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n)				\
+		(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f,	\
+							 ret, rp,	\
+							 first, last))
+
+		if (likely(f->bb == 8)) {
+			if (f->groups == 1) {
+				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
+			} else if (f->groups == 2) {
+				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
+			} else if (f->groups == 4) {
+				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
+			} else if (f->groups == 6) {
+				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
+			} else if (f->groups == 16) {
+				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
+			} else {
+				ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
+								  ret, rp,
+								  first, last);
+			}
+		} else {
+			if (f->groups == 2) {
+				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
+			} else if (f->groups == 4) {
+				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
+			} else if (f->groups == 8) {
+				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
+			} else if (f->groups == 12) {
+				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
+			} else if (f->groups == 32) {
+				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
+			} else {
+				ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
+								  ret, rp,
+								  first, last);
+			}
+		}
+		NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+#undef NFT_SET_PIPAPO_AVX2_LOOKUP
+
+		if (ret < 0)
+			goto out;
+
+		if (last) {
+			*ext = &f->mt[ret].e->ext;
+			if (unlikely(nft_set_elem_expired(*ext) ||
+				     !nft_set_elem_active(*ext, genmask))) {
+				ret = 0;
+				goto next_match;
+			}
+
+			goto out;
+		}
+
+		swap(res, fill);
+		rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+	}
+
+out:
+	if (i % 2)
+		raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index);
+	kernel_fpu_end();
+
+	return ret >= 0;
+}
diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h
new file mode 100644
index 000000000000..396caf7bfca8
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo_avx2.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _NFT_SET_PIPAPO_AVX2_H
+#define _NFT_SET_PIPAPO_AVX2_H
+
+#ifdef CONFIG_AS_AVX2
+#include <asm/fpu/xstate.h>
+#define NFT_PIPAPO_ALIGN	(XSAVE_YMM_SIZE / BITS_PER_BYTE)
+
+bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+			    const u32 *key, const struct nft_set_ext **ext);
+bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
+			      struct nft_set_estimate *est);
+#endif /* CONFIG_AS_AVX2 */
+
+#endif /* _NFT_SET_PIPAPO_AVX2_H */
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 5000b938ab1e..172ef8189f99 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -481,8 +481,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
 	return true;
 }
 
-struct nft_set_type nft_set_rbtree_type __read_mostly = {
-	.owner		= THIS_MODULE,
+const struct nft_set_type nft_set_rbtree_type = {
 	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
 			  NFT_SET_TIMEOUT,
 	.ops		= {
 		.privsize	= nft_rbtree_privsize,
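
The pipapo lookup above double-buffers two bitmap halves inside per-CPU scratch memory: res holds the previous field's surviving rows, fill receives the current ones, and the two swap after every field. A sketch of the discipline, assuming a scratch area of 2 * bsize_max longs as allocated by the pipapo code:

	static void pipapo_scratch_roles(unsigned long *scratch,
					 unsigned long bsize_max,
					 bool *map_index, int field_count)
	{
		unsigned long *res = scratch + (*map_index ? bsize_max : 0);
		unsigned long *fill = scratch + (*map_index ? 0 : bsize_max);
		unsigned long *tmp;
		int i;

		for (i = 0; i < field_count; i++) {
			/* ... match one field: read res, write fill ... */
			tmp = res;	/* current results gate the next field */
			res = fill;
			fill = tmp;
		}

		/* An odd number of swaps leaves the halves exchanged; persist
		 * that, so the next lookup starts from the half that was just
		 * consumed (and therefore left clean). */
		if (i % 2)
			*map_index = !*map_index;
	}

This is the same reason the function above flips nft_pipapo_avx2_scratch_index only when i % 2 is set on exit.
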
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 764e88682a81..30be5787fbde 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -11,6 +11,7 @@
 #include <net/ip_tunnels.h>
 #include <net/vxlan.h>
 #include <net/erspan.h>
+#include <net/geneve.h>
 
 struct nft_tunnel {
 	enum nft_tunnel_keys	key:8;
@@ -144,6 +145,7 @@ struct nft_tunnel_opts {
 	union {
 		struct vxlan_metadata	vxlan;
 		struct erspan_metadata	erspan;
+		u8	data[IP_TUNNEL_OPTS_MAX];
 	} u;
 	u32	len;
 	__be16	flags;
@@ -301,9 +303,53 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
 	return 0;
 }
 
+static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = {
+	[NFTA_TUNNEL_KEY_GENEVE_CLASS]	= { .type = NLA_U16 },
+	[NFTA_TUNNEL_KEY_GENEVE_TYPE]	= { .type = NLA_U8 },
+	[NFTA_TUNNEL_KEY_GENEVE_DATA]	= { .type = NLA_BINARY, .len = 128 },
+};
+
+static int nft_tunnel_obj_geneve_init(const struct nlattr *attr,
+				      struct nft_tunnel_opts *opts)
+{
+	struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len);
+	struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1];
+	int err, data_len;
+
+	err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_GENEVE_MAX, attr,
+			       nft_tunnel_opts_geneve_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_TUNNEL_KEY_GENEVE_CLASS] ||
+	    !tb[NFTA_TUNNEL_KEY_GENEVE_TYPE] ||
+	    !tb[NFTA_TUNNEL_KEY_GENEVE_DATA])
+		return -EINVAL;
+
+	attr = tb[NFTA_TUNNEL_KEY_GENEVE_DATA];
+	data_len = nla_len(attr);
+	if (data_len % 4)
+		return -EINVAL;
+
+	opts->len += sizeof(*opt) + data_len;
+	if (opts->len > IP_TUNNEL_OPTS_MAX)
+		return -EINVAL;
+
+	memcpy(opt->opt_data, nla_data(attr), data_len);
+	opt->length = data_len / 4;
+	opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]);
+	opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]);
+	opts->flags = TUNNEL_GENEVE_OPT;
+
+	return 0;
+}
+
 static const struct nla_policy nft_tunnel_opts_policy[NFTA_TUNNEL_KEY_OPTS_MAX + 1] = {
+	[NFTA_TUNNEL_KEY_OPTS_UNSPEC]	= {
+		.strict_start_type = NFTA_TUNNEL_KEY_OPTS_GENEVE },
 	[NFTA_TUNNEL_KEY_OPTS_VXLAN]	= { .type = NLA_NESTED, },
 	[NFTA_TUNNEL_KEY_OPTS_ERSPAN]	= { .type = NLA_NESTED, },
+	[NFTA_TUNNEL_KEY_OPTS_GENEVE]	= { .type = NLA_NESTED, },
 };
 
 static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
@@ -311,22 +357,43 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
 				    struct ip_tunnel_info *info,
 				    struct nft_tunnel_opts *opts)
 {
-	struct nlattr *tb[NFTA_TUNNEL_KEY_OPTS_MAX + 1];
-	int err;
+	int err, rem, type = 0;
+	struct nlattr *nla;
 
-	err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr,
-					  nft_tunnel_opts_policy, NULL);
+	err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX,
+					     nft_tunnel_opts_policy, NULL);
 	if (err < 0)
 		return err;
 
-	if (tb[NFTA_TUNNEL_KEY_OPTS_VXLAN]) {
-		err = nft_tunnel_obj_vxlan_init(tb[NFTA_TUNNEL_KEY_OPTS_VXLAN],
-						opts);
-	} else if (tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN]) {
-		err = nft_tunnel_obj_erspan_init(tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN],
-						 opts);
-	} else {
-		return -EOPNOTSUPP;
+	nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
+		switch (nla_type(nla)) {
+		case NFTA_TUNNEL_KEY_OPTS_VXLAN:
+			if (type)
+				return -EINVAL;
+			err = nft_tunnel_obj_vxlan_init(nla, opts);
+			if (err)
+				return err;
+			type = TUNNEL_VXLAN_OPT;
+			break;
+		case NFTA_TUNNEL_KEY_OPTS_ERSPAN:
+			if (type)
+				return -EINVAL;
+			err = nft_tunnel_obj_erspan_init(nla, opts);
+			if (err)
+				return err;
+			type = TUNNEL_ERSPAN_OPT;
+			break;
+		case NFTA_TUNNEL_KEY_OPTS_GENEVE:
+			if (type && type != TUNNEL_GENEVE_OPT)
+				return -EINVAL;
+			err = nft_tunnel_obj_geneve_init(nla, opts);
+			if (err)
+				return err;
+			type = TUNNEL_GENEVE_OPT;
+			break;
+		default:
+			return -EOPNOTSUPP;
+		}
 	}
 
 	return err;
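
A geneve option is a TLV whose length field counts 4-byte words of data, excluding the fixed 4-byte header; that is why the parser above rejects NFTA_TUNNEL_KEY_GENEVE_DATA payloads not divisible by 4 and grows opts->len by sizeof(*opt) + data_len. The size bookkeeping in isolation (helper hypothetical):

	/* Total bytes one geneve option occupies in the packed opts area:
	 * 4-byte header plus opt->length 4-byte data words. */
	static unsigned int geneve_opt_size(const struct geneve_opt *opt)
	{
		return sizeof(*opt) + opt->length * 4;
	}

For example, an option carrying 8 bytes of data has opt->length == 2 and occupies 12 bytes of the IP_TUNNEL_OPTS_MAX budget.
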
@@ -518,6 +585,25 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
 			break;
 		}
 		nla_nest_end(skb, inner);
+	} else if (opts->flags & TUNNEL_GENEVE_OPT) {
+		struct geneve_opt *opt;
+		int offset = 0;
+
+		inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
+		if (!inner)
+			goto failure;
+		while (opts->len > offset) {
+			opt = (struct geneve_opt *)(opts->u.data + offset);
+			if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS,
+					 opt->opt_class) ||
+			    nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE,
+				       opt->type) ||
+			    nla_put(skb, NFTA_TUNNEL_KEY_GENEVE_DATA,
+				    opt->length * 4, opt->opt_data))
+				goto inner_failure;
+			offset += sizeof(*opt) + opt->length * 4;
+		}
+		nla_nest_end(skb, inner);
 	}
 	nla_nest_end(skb, nest);
 	return 0;
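
The dump path above walks the packed option area by casting at the current byte offset and advancing by each option's total size. The same walk as a standalone sketch (helper and callback hypothetical):

	/* Visit each geneve option packed into `len` bytes at `data`. */
	static void geneve_opts_walk(const u8 *data, unsigned int len,
				     void (*cb)(const struct geneve_opt *opt))
	{
		unsigned int offset = 0;

		while (offset < len) {
			const struct geneve_opt *opt =
				(const struct geneve_opt *)(data + offset);

			cb(opt);
			offset += sizeof(*opt) + opt->length * 4;
		}
	}
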
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index f56d3ed93e56..75bd0e5dd312 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -18,6 +18,7 @@
 
 #include <linux/module.h>
 #include <linux/timer.h>
+#include <linux/alarmtimer.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/netfilter.h>
@@ -30,6 +31,7 @@
 
 struct idletimer_tg {
 	struct list_head entry;
+	struct alarm alarm;
 	struct timer_list timer;
 	struct work_struct work;
 
@@ -37,6 +39,7 @@ struct idletimer_tg {
 	struct device_attribute attr;
 
 	unsigned int refcnt;
+	u8 timer_type;
 };
 
 static LIST_HEAD(idletimer_tg_list);
@@ -62,20 +65,29 @@ static ssize_t idletimer_tg_show(struct device *dev,
 {
 	struct idletimer_tg *timer;
 	unsigned long expires = 0;
+	struct timespec64 ktimespec = {};
+	long time_diff = 0;
 
 	mutex_lock(&list_mutex);
 
 	timer = __idletimer_tg_find_by_label(attr->attr.name);
-	if (timer)
-		expires = timer->timer.expires;
+	if (timer) {
+		if (timer->timer_type & XT_IDLETIMER_ALARM) {
+			ktime_t expires_alarm = alarm_expires_remaining(&timer->alarm);
+			ktimespec = ktime_to_timespec64(expires_alarm);
+			time_diff = ktimespec.tv_sec;
+		} else {
+			expires = timer->timer.expires;
+			time_diff = jiffies_to_msecs(expires - jiffies) / 1000;
+		}
+	}
 
 	mutex_unlock(&list_mutex);
 
-	if (time_after(expires, jiffies))
-		return sprintf(buf, "%u\n",
-			       jiffies_to_msecs(expires - jiffies) / 1000);
+	if (time_after(expires, jiffies) || ktimespec.tv_sec > 0)
+		return snprintf(buf, PAGE_SIZE, "%ld\n", time_diff);
 
-	return sprintf(buf, "0\n");
+	return snprintf(buf, PAGE_SIZE, "0\n");
 }
 
 static void idletimer_tg_work(struct work_struct *work)
@@ -95,6 +107,16 @@ static void idletimer_tg_expired(struct timer_list *t)
 	schedule_work(&timer->work);
 }
 
+static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm,
+						      ktime_t now)
+{
+	struct idletimer_tg *timer = alarm->data;
+
+	pr_debug("alarm %s expired\n", timer->attr.attr.name);
+	schedule_work(&timer->work);
+	return ALARMTIMER_NORESTART;
+}
+
 static int idletimer_check_sysfs_name(const char *name, unsigned int size)
 {
 	int ret;
@@ -160,6 +182,68 @@ out:
 	return ret;
 }
 
+static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info)
+{
+	int ret;
+
+	info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+	if (!info->timer) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = idletimer_check_sysfs_name(info->label, sizeof(info->label));
+	if (ret < 0)
+		goto out_free_timer;
+
+	sysfs_attr_init(&info->timer->attr.attr);
+	info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
+	if (!info->timer->attr.attr.name) {
+		ret = -ENOMEM;
+		goto out_free_timer;
+	}
+	info->timer->attr.attr.mode = 0444;
+	info->timer->attr.show = idletimer_tg_show;
+
+	ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr);
+	if (ret < 0) {
+		pr_debug("couldn't add file to sysfs\n");
+		goto out_free_attr;
+	}
+
+	/* notify userspace */
+	kobject_uevent(idletimer_tg_kobj, KOBJ_ADD);
+
+	list_add(&info->timer->entry, &idletimer_tg_list);
+	pr_debug("timer type value is %u\n", info->timer_type);
+	info->timer->timer_type = info->timer_type;
+	info->timer->refcnt = 1;
+
+	INIT_WORK(&info->timer->work, idletimer_tg_work);
+
+	if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+		ktime_t tout;
+		alarm_init(&info->timer->alarm, ALARM_BOOTTIME,
+			   idletimer_tg_alarmproc);
+		info->timer->alarm.data = info->timer;
+		tout = ktime_set(info->timeout, 0);
+		alarm_start_relative(&info->timer->alarm, tout);
+	} else {
+		timer_setup(&info->timer->timer, idletimer_tg_expired, 0);
+		mod_timer(&info->timer->timer,
+			  msecs_to_jiffies(info->timeout * 1000) + jiffies);
+	}
+
+	return 0;
+
+out_free_attr:
+	kfree(info->timer->attr.attr.name);
+out_free_timer:
+	kfree(info->timer);
+out:
+	return ret;
+}
+
 /*
  * The actual xt_tables plugin.
  */
@@ -177,13 +261,30 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
 	return XT_CONTINUE;
 }
 
-static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+/*
+ * The actual xt_tables plugin.
+ */
+static unsigned int idletimer_tg_target_v1(struct sk_buff *skb,
+					   const struct xt_action_param *par)
 {
-	struct idletimer_tg_info *info = par->targinfo;
-	int ret;
+	const struct idletimer_tg_info_v1 *info = par->targinfo;
 
-	pr_debug("checkentry targinfo%s\n", info->label);
+	pr_debug("resetting timer %s, timeout period %u\n",
+		 info->label, info->timeout);
+
+	if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+		ktime_t tout = ktime_set(info->timeout, 0);
+		alarm_start_relative(&info->timer->alarm, tout);
+	} else {
+		mod_timer(&info->timer->timer,
+			  msecs_to_jiffies(info->timeout * 1000) + jiffies);
+	}
 
+	return XT_CONTINUE;
+}
+
+static int idletimer_tg_helper(struct idletimer_tg_info *info)
+{
 	if (info->timeout == 0) {
 		pr_debug("timeout value is zero\n");
 		return -EINVAL;
@@ -198,7 +299,23 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
 		pr_debug("label is empty or not nul-terminated\n");
 		return -EINVAL;
 	}
+	return 0;
+}
+
+static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+{
+	struct idletimer_tg_info *info = par->targinfo;
+	int ret;
+
+	pr_debug("checkentry targinfo %s\n", info->label);
+
+	ret = idletimer_tg_helper(info);
+	if (ret < 0)
+	{
+		pr_debug("checkentry helper returned invalid\n");
+		return -EINVAL;
+	}
 
 	mutex_lock(&list_mutex);
 
 	info->timer = __idletimer_tg_find_by_label(info->label);
@@ -222,6 +339,65 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
 	return 0;
 }
 
+static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par)
+{
+	struct idletimer_tg_info_v1 *info = par->targinfo;
+	int ret;
+
+	pr_debug("checkentry targinfo %s\n", info->label);
+
+	ret = idletimer_tg_helper((struct idletimer_tg_info *)info);
+	if (ret < 0)
+	{
+		pr_debug("checkentry helper returned invalid\n");
+		return -EINVAL;
+	}
+
+	if (info->timer_type > XT_IDLETIMER_ALARM) {
+		pr_debug("invalid value for timer type\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&list_mutex);
+
+	info->timer = __idletimer_tg_find_by_label(info->label);
+	if (info->timer) {
+		if (info->timer->timer_type != info->timer_type) {
+			pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n");
+			mutex_unlock(&list_mutex);
+			return -EINVAL;
+		}
+
+		info->timer->refcnt++;
+		if (info->timer_type & XT_IDLETIMER_ALARM) {
+			/* calculate remaining expiry time */
+			ktime_t tout = alarm_expires_remaining(&info->timer->alarm);
+			struct timespec64 ktimespec = ktime_to_timespec64(tout);
+
+			if (ktimespec.tv_sec > 0) {
+				pr_debug("time_expiry_remaining %lld\n",
+					 ktimespec.tv_sec);
+				alarm_start_relative(&info->timer->alarm, tout);
+			}
+		} else {
mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + jiffies); + } + pr_debug("increased refcnt of timer %s to %u\n", + info->label, info->timer->refcnt); + } else { + ret = idletimer_tg_create_v1(info); + if (ret < 0) { + pr_debug("failed to create timer\n"); + mutex_unlock(&list_mutex); + return ret; + } + } + + mutex_unlock(&list_mutex); + return 0; +} + static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info *info = par->targinfo; @@ -247,7 +423,38 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) mutex_unlock(&list_mutex); } -static struct xt_target idletimer_tg __read_mostly = { +static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) +{ + const struct idletimer_tg_info_v1 *info = par->targinfo; + + pr_debug("destroy targinfo %s\n", info->label); + + mutex_lock(&list_mutex); + + if (--info->timer->refcnt == 0) { + pr_debug("deleting timer %s\n", info->label); + + list_del(&info->timer->entry); + if (info->timer->timer_type & XT_IDLETIMER_ALARM) { + alarm_cancel(&info->timer->alarm); + } else { + del_timer_sync(&info->timer->timer); + } + cancel_work_sync(&info->timer->work); + sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + kfree(info->timer->attr.attr.name); + kfree(info->timer); + } else { + pr_debug("decreased refcnt of timer %s to %u\n", + info->label, info->timer->refcnt); + } + + mutex_unlock(&list_mutex); +} + + +static struct xt_target idletimer_tg[] __read_mostly = { + { .name = "IDLETIMER", .family = NFPROTO_UNSPEC, .target = idletimer_tg_target, @@ -256,6 +463,20 @@ static struct xt_target idletimer_tg __read_mostly = { .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, + }, + { + .name = "IDLETIMER", + .family = NFPROTO_UNSPEC, + .revision = 1, + .target = idletimer_tg_target_v1, + .targetsize = sizeof(struct idletimer_tg_info_v1), + .usersize = offsetof(struct idletimer_tg_info_v1, timer), + .checkentry = idletimer_tg_checkentry_v1, + .destroy = idletimer_tg_destroy_v1, + .me = THIS_MODULE, + }, + + }; static struct class *idletimer_tg_class; @@ -283,7 +504,8 @@ static int __init idletimer_tg_init(void) idletimer_tg_kobj = &idletimer_tg_device->kobj; - err = xt_register_target(&idletimer_tg); + err = xt_register_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); + if (err < 0) { pr_debug("couldn't register xt target\n"); goto out_dev; @@ -300,7 +522,7 @@ out: static void __exit idletimer_tg_exit(void) { - xt_unregister_target(&idletimer_tg); + xt_unregister_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); device_destroy(idletimer_tg_class, MKDEV(0, 0)); class_destroy(idletimer_tg_class); diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 2317721f3ecb..75625d13e976 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -21,8 +21,6 @@ MODULE_DESCRIPTION("Xtables: packet security mark modification"); MODULE_ALIAS("ipt_SECMARK"); MODULE_ALIAS("ip6t_SECMARK"); -#define PFX "SECMARK: " - static u8 mode; static unsigned int diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 8c835ad63729..9c5cfd74a0ee 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -132,7 +132,7 @@ struct xt_hashlimit_htable { const char *name; struct net *net; - struct hlist_head hash[0]; /* hashtable itself */ + struct hlist_head hash[]; /* hashtable itself */ }; static int diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 
225a7ab6d79a..19bef176145e 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -71,7 +71,7 @@ struct recent_entry { u_int8_t ttl; u_int8_t index; u_int16_t nstamps; - unsigned long stamps[0]; + unsigned long stamps[]; }; struct recent_table { @@ -82,7 +82,7 @@ struct recent_table { unsigned int entries; u8 nstamps_max_mask; struct list_head lru_list; - struct list_head iphash[0]; + struct list_head iphash[]; }; struct recent_net { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 363264ca2e09..eefacb3176e3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3528,9 +3528,9 @@ int tc_setup_flow_action(struct flow_action *flow_action, struct tc_action *act; int i, j, k, err = 0; - BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_ANY != FLOW_ACTION_HW_STATS_TYPE_ANY); - BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_IMMEDIATE != FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE); - BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_DELAYED != FLOW_ACTION_HW_STATS_TYPE_DELAYED); + BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_ANY != FLOW_ACTION_HW_STATS_ANY); + BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_IMMEDIATE != FLOW_ACTION_HW_STATS_IMMEDIATE); + BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_DELAYED != FLOW_ACTION_HW_STATS_DELAYED); if (!exts) return 0; @@ -3613,8 +3613,8 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->mangle.mask = tcf_pedit_mask(act, k); entry->mangle.val = tcf_pedit_val(act, k); entry->mangle.offset = tcf_pedit_offset(act, k); - entry = &flow_action->entries[++j]; entry->hw_stats_type = act->hw_stats_type; + entry = &flow_action->entries[++j]; } } else if (is_tcf_csum(act)) { entry->id = FLOW_ACTION_CSUM; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 50794125bf02..0d99df1e764d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -618,21 +618,28 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) +void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, + u64 delta_ns) { if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) return; - if (wd->last_expires == expires) - return; + if (hrtimer_is_queued(&wd->timer)) { + /* If timer is already set in [expires, expires + delta_ns], + * do not reprogram it. 
+ */ + if (wd->last_expires - expires <= delta_ns) + return; + } wd->last_expires = expires; - hrtimer_start(&wd->timer, - ns_to_ktime(expires), - HRTIMER_MODE_ABS_PINNED); + hrtimer_start_range_ns(&wd->timer, + ns_to_ktime(expires), + delta_ns, + HRTIMER_MODE_ABS_PINNED); } -EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); +EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 371ad84def3b..4c060134c736 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -121,6 +121,8 @@ struct fq_sched_data { u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; + + u32 timer_slack; /* hrtimer slack in ns */ struct qdisc_watchdog watchdog; }; @@ -504,8 +506,9 @@ begin: head = &q->old_flows; if (!head->first) { if (q->time_next_delayed_flow != ~0ULL) - qdisc_watchdog_schedule_ns(&q->watchdog, - q->time_next_delayed_flow); + qdisc_watchdog_schedule_range_ns(&q->watchdog, + q->time_next_delayed_flow, + q->timer_slack); return NULL; } } @@ -735,6 +738,8 @@ static int fq_resize(struct Qdisc *sch, u32 log) } static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { + [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK }, + [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, @@ -747,6 +752,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 }, [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, + [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt, @@ -833,6 +839,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, q->ce_threshold = (u64)NSEC_PER_USEC * nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); + if (tb[TCA_FQ_TIMER_SLACK]) + q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); + if (!err) { sch_tree_unlock(sch); err = fq_resize(sch, fq_log); @@ -884,6 +893,8 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->orphan_mask = 1024 - 1; q->low_rate_threshold = 550000 / 8; + q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ + /* Default ce_threshold of 4294 seconds */ q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; @@ -924,7 +935,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || - nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack)) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -947,7 +959,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.flows_plimit = q->stat_flows_plimit; st.pkts_too_long = q->stat_pkts_too_long; st.allocation_errors = q->stat_allocation_errors; - st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns(); + st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack - + ktime_get_ns(); st.flows = q->flows; st.inactive_flows = q->inactive_flows; st.throttled_flows = q->throttled_flows; diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh index a0795227216e..efd798a85931 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh +++ 
b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh @@ -8,9 +8,9 @@ tc_flower_get_target() # The driver associates a counter with each tc filter, which means the # number of supported filters is bounded by the number of available # counters. - # Currently, the driver supports 12K (12,288) flow counters and six of + # Currently, the driver supports 30K (30,720) flow counters and six of # these are used for multicast routing. - local target=12282 + local target=30714 if ((! should_fail)); then echo $target diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh new file mode 100755 index 000000000000..20ed98fe5a60 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + default_hw_stats_test + immediate_hw_stats_test + delayed_hw_stats_test + disabled_hw_stats_test +" +NUM_NETIFS=2 + +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/24 +} + +switch_create() +{ + simple_if_init $swp1 192.0.2.2/24 + tc qdisc add dev $swp1 clsact +} + +switch_destroy() +{ + tc qdisc del dev $swp1 clsact + simple_if_fini $swp1 192.0.2.2/24 +} + +hw_stats_test() +{ + RET=0 + + local name=$1 + local action_hw_stats=$2 + local occ_delta=$3 + local expected_packet_count=$4 + + local orig_occ=$(devlink_resource_get "counters" "flow" | jq '.["occ"]') + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop $action_hw_stats + check_err $? "Failed to add rule with $name hw_stats" + + local new_occ=$(devlink_resource_get "counters" "flow" | jq '.["occ"]') + local expected_occ=$((orig_occ + occ_delta)) + [ "$new_occ" == "$expected_occ" ] + check_err $? "Expected occupancy of $expected_occ, got $new_occ" + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $swp1mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -q + + tc_check_packets "dev $swp1 ingress" 101 $expected_packet_count + check_err $? "Did not match incoming packet" + + tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + + log_test "$name hw_stats" +} + +default_hw_stats_test() +{ + hw_stats_test "default" "" 2 1 +} + +immediate_hw_stats_test() +{ + hw_stats_test "immediate" "hw_stats immediate" 2 1 +} + +delayed_hw_stats_test() +{ + RET=0 + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop hw_stats delayed + check_fail $? 
"Unexpected success in adding rule with delayed hw_stats" + + log_test "delayed hw_stats" +} + +disabled_hw_stats_test() +{ + hw_stats_test "disabled" "hw_stats disabled" 0 0 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + h1mac=$(mac_get $h1) + swp1mac=$(mac_get $swp1) + + vrf_prepare + + h1_create + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + h1_destroy + + vrf_cleanup +} + +check_tc_action_hw_stats_support + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index a4a7879b3bb9..977fc2b326a2 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -60,6 +60,15 @@ check_tc_chain_support() fi } +check_tc_action_hw_stats_support() +{ + tc actions help 2>&1 | grep -q hw_stats + if [[ $? -ne 0 ]]; then + echo "SKIP: iproute2 too old; tc is missing action hw_stats support" + exit 1 + fi +} + if [[ "$(id -u)" -ne 0 ]]; then echo "SKIP: need root privileges" exit 0 |