summaryrefslogtreecommitdiff
path: root/net/core/dev.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-04-26 16:07:23 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-04-26 16:07:23 -0700
commit6e98b09da931a00bf4e0477d0fa52748bf28fcce (patch)
tree9c658ed95add5693f42f29f63df80a2ede3f6ec2 /net/core/dev.c
parentb68ee1c6131c540a62ecd443be89c406401df091 (diff)
parent9b78d919632b7149d311aaad5a977e4b48b10321 (diff)
Merge tag 'net-next-6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni: "Core: - Introduce a config option to tweak MAX_SKB_FRAGS. Increasing the default value allows for better BIG TCP performances - Reduce compound page head access for zero-copy data transfers - RPS/RFS improvements, avoiding unneeded NET_RX_SOFTIRQ when possible - Threaded NAPI improvements, adding defer skb free support and unneeded softirq avoidance - Address dst_entry reference count scalability issues, via false sharing avoidance and optimize refcount tracking - Add lockless accesses annotation to sk_err[_soft] - Optimize again the skb struct layout - Extends the skb drop reasons to make it usable by multiple subsystems - Better const qualifier awareness for socket casts BPF: - Add skb and XDP typed dynptrs which allow BPF programs for more ergonomic and less brittle iteration through data and variable-sized accesses - Add a new BPF netfilter program type and minimal support to hook BPF programs to netfilter hooks such as prerouting or forward - Add more precise memory usage reporting for all BPF map types - Adds support for using {FOU,GUE} encap with an ipip device operating in collect_md mode and add a set of BPF kfuncs for controlling encap params - Allow BPF programs to detect at load time whether a particular kfunc exists or not, and also add support for this in light skeleton - Bigger batch of BPF verifier improvements to prepare for upcoming BPF open-coded iterators allowing for less restrictive looping capabilities - Rework RCU enforcement in the verifier, add kptr_rcu and enforce BPF programs to NULL-check before passing such pointers into kfunc - Add support for kptrs in percpu hashmaps, percpu LRU hashmaps and in local storage maps - Enable RCU semantics for task BPF kptrs and allow referenced kptr tasks to be stored in BPF maps - Add support for refcounted local kptrs to the verifier for allowing shared ownership, useful for adding a node to both the BPF list and rbtree - Add BPF verifier support for ST instructions in convert_ctx_access() which will help new -mcpu=v4 clang flag to start emitting them - Add ARM32 USDT support to libbpf - Improve bpftool's visual program dump which produces the control flow graph in a DOT format by adding C source inline annotations Protocols: - IPv4: Allow adding to IPv4 address a 'protocol' tag. Such value indicates the provenance of the IP address - IPv6: optimize route lookup, dropping unneeded R/W lock acquisition - Add the handshake upcall mechanism, allowing the user-space to implement generic TLS handshake on kernel's behalf - Bridge: support per-{Port, VLAN} neighbor suppression, increasing resilience to nodes failures - SCTP: add support for Fair Capacity and Weighted Fair Queueing schedulers - MPTCP: delay first subflow allocation up to its first usage. This will allow for later better LSM interaction - xfrm: Remove inner/outer modes from input/output path. These are not needed anymore - WiFi: - reduced neighbor report (RNR) handling for AP mode - HW timestamping support - support for randomized auth/deauth TA for PASN privacy - per-link debugfs for multi-link - TC offload support for mac80211 drivers - mac80211 mesh fast-xmit and fast-rx support - enable Wi-Fi 7 (EHT) mesh support Netfilter: - Add nf_tables 'brouting' support, to force a packet to be routed instead of being bridged - Update bridge netfilter and ovs conntrack helpers to handle IPv6 Jumbo packets properly, i.e. fetch the packet length from hop-by-hop extension header. This is needed for BIT TCP support - The iptables 32bit compat interface isn't compiled in by default anymore - Move ip(6)tables builtin icmp matches to the udptcp one. This has the advantage that icmp/icmpv6 match doesn't load the iptables/ip6tables modules anymore when iptables-nft is used - Extended netlink error report for netdevice in flowtables and netdev/chains. Allow for incrementally add/delete devices to netdev basechain. Allow to create netdev chain without device Driver API: - Remove redundant Device Control Error Reporting Enable, as PCI core has already error reporting enabled at enumeration time - Move Multicast DB netlink handlers to core, allowing devices other then bridge to use them - Allow the page_pool to directly recycle the pages from safely localized NAPI - Implement lockless TX queue stop/wake combo macros, allowing for further code de-duplication and sanitization - Add YNL support for user headers and struct attrs - Add partial YNL specification for devlink - Add partial YNL specification for ethtool - Add tc-mqprio and tc-taprio support for preemptible traffic classes - Add tx push buf len param to ethtool, specifies the maximum number of bytes of a transmitted packet a driver can push directly to the underlying device - Add basic LED support for switch/phy - Add NAPI documentation, stop relaying on external links - Convert dsa_master_ioctl() to netdev notifier. This is a preparatory work to make the hardware timestamping layer selectable by user space - Add transceiver support and improve the error messages for CAN-FD controllers New hardware / drivers: - Ethernet: - AMD/Pensando core device support - MediaTek MT7981 SoC - MediaTek MT7988 SoC - Broadcom BCM53134 embedded switch - Texas Instruments CPSW9G ethernet switch - Qualcomm EMAC3 DWMAC ethernet - StarFive JH7110 SoC - NXP CBTX ethernet PHY - WiFi: - Apple M1 Pro/Max devices - RealTek rtl8710bu/rtl8188gu - RealTek rtl8822bs, rtl8822cs and rtl8821cs SDIO chipset - Bluetooth: - Realtek RTL8821CS, RTL8851B, RTL8852BS - Mediatek MT7663, MT7922 - NXP w8997 - Actions Semi ATS2851 - QTI WCN6855 - Marvell 88W8997 - Can: - STMicroelectronics bxcan stm32f429 Drivers: - Ethernet NICs: - Intel (1G, icg): - add tracking and reporting of QBV config errors - add support for configuring max SDU for each Tx queue - Intel (100G, ice): - refactor mailbox overflow detection to support Scalable IOV - GNSS interface optimization - Intel (i40e): - support XDP multi-buffer - nVidia/Mellanox: - add the support for linux bridge multicast offload - enable TC offload for egress and engress MACVLAN over bond - add support for VxLAN GBP encap/decap flows offload - extend packet offload to fully support libreswan - support tunnel mode in mlx5 IPsec packet offload - extend XDP multi-buffer support - support MACsec VLAN offload - add support for dynamic msix vectors allocation - drop RX page_cache and fully use page_pool - implement thermal zone to report NIC temperature - Netronome/Corigine: - add support for multi-zone conntrack offload - Solarflare/Xilinx: - support offloading TC VLAN push/pop actions to the MAE - support TC decap rules - support unicast PTP - Other NICs: - Broadcom (bnxt): enforce software based freq adjustments only on shared PHC NIC - RealTek (r8169): refactor to addess ASPM issues during NAPI poll - Micrel (lan8841): add support for PTP_PF_PEROUT - Cadence (macb): enable PTP unicast - Engleder (tsnep): add XDP socket zero-copy support - virtio-net: implement exact header length guest feature - veth: add page_pool support for page recycling - vxlan: add MDB data path support - gve: add XDP support for GQI-QPL format - geneve: accept every ethertype - macvlan: allow some packets to bypass broadcast queue - mana: add support for jumbo frame - Ethernet high-speed switches: - Microchip (sparx5): Add support for TC flower templates - Ethernet embedded switches: - Broadcom (b54): - configure 6318 and 63268 RGMII ports - Marvell (mv88e6xxx): - faster C45 bus scan - Microchip: - lan966x: - add support for IS1 VCAP - better TX/RX from/to CPU performances - ksz9477: add ETS Qdisc support - ksz8: enhance static MAC table operations and error handling - sama7g5: add PTP capability - NXP (ocelot): - add support for external ports - add support for preemptible traffic classes - Texas Instruments: - add CPSWxG SGMII support for J7200 and J721E - Intel WiFi (iwlwifi): - preparation for Wi-Fi 7 EHT and multi-link support - EHT (Wi-Fi 7) sniffer support - hardware timestamping support for some devices/firwmares - TX beacon protection on newer hardware - Qualcomm 802.11ax WiFi (ath11k): - MU-MIMO parameters support - ack signal support for management packets - RealTek WiFi (rtw88): - SDIO bus support - better support for some SDIO devices (e.g. MAC address from efuse) - RealTek WiFi (rtw89): - HW scan support for 8852b - better support for 6 GHz scanning - support for various newer firmware APIs - framework firmware backwards compatibility - MediaTek WiFi (mt76): - P2P support - mesh A-MSDU support - EHT (Wi-Fi 7) support - coredump support" * tag 'net-next-6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2078 commits) net: phy: hide the PHYLIB_LEDS knob net: phy: marvell-88x2222: remove unnecessary (void*) conversions tcp/udp: Fix memleaks of sk and zerocopy skbs with TX timestamp. net: amd: Fix link leak when verifying config failed net: phy: marvell: Fix inconsistent indenting in led_blink_set lan966x: Don't use xdp_frame when action is XDP_TX tsnep: Add XDP socket zero-copy TX support tsnep: Add XDP socket zero-copy RX support tsnep: Move skb receive action to separate function tsnep: Add functions for queue enable/disable tsnep: Rework TX/RX queue initialization tsnep: Replace modulo operation with mask net: phy: dp83867: Add led_brightness_set support net: phy: Fix reading LED reg property drivers: nfc: nfcsim: remove return value check of `dev_dir` net: phy: dp83867: Remove unnecessary (void*) conversions net: ethtool: coalesce: try to make user settings stick twice net: mana: Check if netdev/napi_alloc_frag returns single page net: mana: Rename mana_refill_rxoob and remove some empty lines net: veth: add page_pool stats ...
Diffstat (limited to 'net/core/dev.c')
-rw-r--r--net/core/dev.c144
1 files changed, 91 insertions, 53 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 1488f700bf819..735096d42c1d1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -160,8 +160,6 @@ struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly; /* Taps */
static int netif_rx_internal(struct sk_buff *skb);
-static int call_netdevice_notifiers_info(unsigned long val,
- struct netdev_notifier_info *info);
static int call_netdevice_notifiers_extack(unsigned long val,
struct net_device *dev,
struct netlink_ext_ack *extack);
@@ -1919,8 +1917,8 @@ static void move_netdevice_notifiers_dev_net(struct net_device *dev,
* are as for raw_notifier_call_chain().
*/
-static int call_netdevice_notifiers_info(unsigned long val,
- struct netdev_notifier_info *info)
+int call_netdevice_notifiers_info(unsigned long val,
+ struct netdev_notifier_info *info)
{
struct net *net = dev_net(info->dev);
int ret;
@@ -2535,6 +2533,8 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
struct xps_map *map, *new_map;
unsigned int nr_ids;
+ WARN_ON_ONCE(index >= dev->num_tx_queues);
+
if (dev->num_tc) {
/* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
@@ -3075,7 +3075,7 @@ void __netif_schedule(struct Qdisc *q)
EXPORT_SYMBOL(__netif_schedule);
struct dev_kfree_skb_cb {
- enum skb_free_reason reason;
+ enum skb_drop_reason reason;
};
static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
@@ -3108,7 +3108,7 @@ void netif_tx_wake_queue(struct netdev_queue *dev_queue)
}
EXPORT_SYMBOL(netif_tx_wake_queue);
-void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
+void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
unsigned long flags;
@@ -3128,18 +3128,16 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}
-EXPORT_SYMBOL(__dev_kfree_skb_irq);
+EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
-void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
+void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
if (in_hardirq() || irqs_disabled())
- __dev_kfree_skb_irq(skb, reason);
- else if (unlikely(reason == SKB_REASON_DROPPED))
- kfree_skb(skb);
+ dev_kfree_skb_irq_reason(skb, reason);
else
- consume_skb(skb);
+ kfree_skb_reason(skb, reason);
}
-EXPORT_SYMBOL(__dev_kfree_skb_any);
+EXPORT_SYMBOL(dev_kfree_skb_any_reason);
/**
@@ -3317,8 +3315,7 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
skb->len - start, ~(__u32)0,
crc32c_csum_stub));
*(__le32 *)(skb->data + offset) = crc32c_csum;
- skb->ip_summed = CHECKSUM_NONE;
- skb->csum_not_inet = 0;
+ skb_reset_csum_not_inet(skb);
out:
return ret;
}
@@ -3736,25 +3733,25 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
* we add to pkt_len the headers size of all segments
*/
if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
- unsigned int hdr_len;
u16 gso_segs = shinfo->gso_segs;
+ unsigned int hdr_len;
/* mac layer + network layer */
- hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+ hdr_len = skb_transport_offset(skb);
/* + transport layer */
if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
const struct tcphdr *th;
struct tcphdr _tcphdr;
- th = skb_header_pointer(skb, skb_transport_offset(skb),
+ th = skb_header_pointer(skb, hdr_len,
sizeof(_tcphdr), &_tcphdr);
if (likely(th))
hdr_len += __tcp_hdrlen(th);
} else {
struct udphdr _udphdr;
- if (skb_header_pointer(skb, skb_transport_offset(skb),
+ if (skb_header_pointer(skb, hdr_len,
sizeof(_udphdr), &_udphdr))
hdr_len += sizeof(struct udphdr);
}
@@ -4361,7 +4358,12 @@ static inline void ____napi_schedule(struct softnet_data *sd,
}
list_add_tail(&napi->poll_list, &sd->poll_list);
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ WRITE_ONCE(napi->list_owner, smp_processor_id());
+ /* If not called from net_rx_action()
+ * we have to raise NET_RX_SOFTIRQ.
+ */
+ if (!sd->in_net_rx_action)
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
#ifdef CONFIG_RPS
@@ -4583,11 +4585,16 @@ static void trigger_rx_softirq(void *data)
}
/*
- * Check if this softnet_data structure is another cpu one
- * If yes, queue it to our IPI list and return 1
- * If no, return 0
+ * After we queued a packet into sd->input_pkt_queue,
+ * we need to make sure this queue is serviced soon.
+ *
+ * - If this is another cpu queue, link it to our rps_ipi_list,
+ * and make sure we will process rps_ipi_list from net_rx_action().
+ *
+ * - If this is our own queue, NAPI schedule our backlog.
+ * Note that this also raises NET_RX_SOFTIRQ.
*/
-static int napi_schedule_rps(struct softnet_data *sd)
+static void napi_schedule_rps(struct softnet_data *sd)
{
struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
@@ -4596,12 +4603,15 @@ static int napi_schedule_rps(struct softnet_data *sd)
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
- return 1;
+ /* If not called from net_rx_action() or napi_threaded_poll()
+ * we have to raise NET_RX_SOFTIRQ.
+ */
+ if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ return;
}
#endif /* CONFIG_RPS */
__napi_schedule_irqoff(&mysd->backlog);
- return 0;
}
#ifdef CONFIG_NET_FLOW_LIMIT
@@ -5021,16 +5031,17 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
clist = clist->next;
WARN_ON(refcount_read(&skb->users));
- if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
+ if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
trace_consume_skb(skb, net_tx_action);
else
trace_kfree_skb(skb, net_tx_action,
- SKB_DROP_REASON_NOT_SPECIFIED);
+ get_kfree_skb_cb(skb)->reason);
if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
else
- __kfree_skb_defer(skb);
+ __napi_kfree_skb(skb,
+ get_kfree_skb_cb(skb)->reason);
}
}
@@ -6059,6 +6070,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
list_del_init(&n->poll_list);
local_irq_restore(flags);
}
+ WRITE_ONCE(n->list_owner, -1);
val = READ_ONCE(n->state);
do {
@@ -6374,6 +6386,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
+ napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
@@ -6585,9 +6598,31 @@ static int napi_thread_wait(struct napi_struct *napi)
return -1;
}
+static void skb_defer_free_flush(struct softnet_data *sd)
+{
+ struct sk_buff *skb, *next;
+
+ /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
+ if (!READ_ONCE(sd->defer_list))
+ return;
+
+ spin_lock(&sd->defer_lock);
+ skb = sd->defer_list;
+ sd->defer_list = NULL;
+ sd->defer_count = 0;
+ spin_unlock(&sd->defer_lock);
+
+ while (skb != NULL) {
+ next = skb->next;
+ napi_consume_skb(skb, 1);
+ skb = next;
+ }
+}
+
static int napi_threaded_poll(void *data)
{
struct napi_struct *napi = data;
+ struct softnet_data *sd;
void *have;
while (!napi_thread_wait(napi)) {
@@ -6595,11 +6630,21 @@ static int napi_threaded_poll(void *data)
bool repoll = false;
local_bh_disable();
+ sd = this_cpu_ptr(&softnet_data);
+ sd->in_napi_threaded_poll = true;
have = netpoll_poll_lock(napi);
__napi_poll(napi, &repoll);
netpoll_poll_unlock(have);
+ sd->in_napi_threaded_poll = false;
+ barrier();
+
+ if (sd_has_rps_ipi_waiting(sd)) {
+ local_irq_disable();
+ net_rps_action_and_irq_enable(sd);
+ }
+ skb_defer_free_flush(sd);
local_bh_enable();
if (!repoll)
@@ -6611,27 +6656,6 @@ static int napi_threaded_poll(void *data)
return 0;
}
-static void skb_defer_free_flush(struct softnet_data *sd)
-{
- struct sk_buff *skb, *next;
-
- /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
- if (!READ_ONCE(sd->defer_list))
- return;
-
- spin_lock_irq(&sd->defer_lock);
- skb = sd->defer_list;
- sd->defer_list = NULL;
- sd->defer_count = 0;
- spin_unlock_irq(&sd->defer_lock);
-
- while (skb != NULL) {
- next = skb->next;
- napi_consume_skb(skb, 1);
- skb = next;
- }
-}
-
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -6641,6 +6665,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
LIST_HEAD(list);
LIST_HEAD(repoll);
+start:
+ sd->in_net_rx_action = true;
local_irq_disable();
list_splice_init(&sd->poll_list, &list);
local_irq_enable();
@@ -6651,8 +6677,18 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
skb_defer_free_flush(sd);
if (list_empty(&list)) {
- if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
- goto end;
+ if (list_empty(&repoll)) {
+ sd->in_net_rx_action = false;
+ barrier();
+ /* We need to check if ____napi_schedule()
+ * had refilled poll_list while
+ * sd->in_net_rx_action was true.
+ */
+ if (!list_empty(&sd->poll_list))
+ goto start;
+ if (!sd_has_rps_ipi_waiting(sd))
+ goto end;
+ }
break;
}
@@ -6677,6 +6713,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
list_splice(&list, &sd->poll_list);
if (!list_empty(&sd->poll_list))
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ else
+ sd->in_net_rx_action = false;
net_rps_action_and_irq_enable(sd);
end:;